Skip to content

Commit

Permalink
[cherry pick] add warning for dataloader incompatible upgrade (#33514)
Browse files Browse the repository at this point in the history
* add warning log for DataLoader output format incompatible upgrade. test=develop

* add unittest. test=develop

* fix ci coverage. test=develop

* fix ci coverage. test=develop
  • Loading branch information
heavengate committed Jun 15, 2021
1 parent 0079e0b commit bbedca4
Show file tree
Hide file tree
Showing 2 changed files with 96 additions and 0 deletions.
43 changes: 43 additions & 0 deletions python/paddle/fluid/dataloader/fetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,18 +12,51 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from ..log_helper import get_logger

from collections.abc import Sequence


class _DatasetFetcher(object):
def __init__(self, dataset, auto_collate_batch, collate_fn, drop_last):
self.dataset = dataset
self.auto_collate_batch = auto_collate_batch
self.collate_fn = collate_fn
self.drop_last = drop_last
self._is_warning_logged = False

def fetch(self, batch_indices):
raise NotImplementedError("'fetch' not implement for class {}".format(
self.__class__.__name__))

def _log_warning(self):
warn_str = "Detect dataset only contains single fileds, return format " \
"changed since Paddle 2.1. In Paddle <= 2.0, DataLoader add " \
"a list surround output data(e.g. return [data]), and in " \
"Paddle >= 2.1, DataLoader return the single filed directly " \
"(e.g. return data). For example, in following code: \n\n"
warn_str += \
"import numpy as np\n" \
"from paddle.io import DataLoader, Dataset\n\n" \
"class RandomDataset(Dataset):\n" \
" def __getitem__(self, idx):\n" \
" data = np.random.random((2, 3)).astype('float32')\n\n" \
" return data\n\n" \
" def __len__(self):\n" \
" return 10\n\n" \
"dataset = RandomDataset()\n" \
"loader = DataLoader(dataset, batch_size=1)\n" \
"data = next(loader())\n\n"

warn_str += "In Paddle <= 2.0, data is in format '[Tensor(shape=(1, 2, 3), " \
"dtype=float32)]', and in Paddle >= 2.1, data is in format" \
" 'Tensor(shape=(1, 2, 3), dtype=float32)'\n"

logger = get_logger(
"DataLoader", logging.INFO, fmt='%(levelname)s: %(message)s')
logger.warning(warn_str)


class _IterableDatasetFetcher(_DatasetFetcher):
def __init__(self, dataset, auto_collate_batch, collate_fn, drop_last):
Expand All @@ -40,9 +73,14 @@ def fetch(self, batch_indices):
data.append(next(self.dataset_iter))
except StopIteration:
break

if len(data) == 0 or (self.drop_last and
len(data) < len(batch_indices)):
raise StopIteration
if not isinstance(data[0],
Sequence) and not self._is_warning_logged:
self._log_warning()
self._is_warning_logged = True
else:
data = next(self.dataset_iter)

Expand All @@ -59,6 +97,11 @@ def __init__(self, dataset, auto_collate_batch, collate_fn, drop_last):
def fetch(self, batch_indices):
if self.auto_collate_batch:
data = [self.dataset[idx] for idx in batch_indices]

if not isinstance(data[0],
Sequence) and not self._is_warning_logged:
self._log_warning()
self._is_warning_logged = True
else:
data = self.dataset[batch_indices]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,59 @@ def test_main(self):
self.run_main(num_workers)


class SingleFieldDataset(Dataset):
    """A map-style dataset whose every sample is a single float32 array.

    Exercises the single-field return-format path of DataLoader.
    """

    def __init__(self, sample_num):
        # Number of samples reported by __len__.
        self.sample_num = sample_num

    def __len__(self):
        return self.sample_num

    def __getitem__(self, idx):
        # The index is ignored: each access yields a fresh random sample
        # of shape (2, 3); callers rely only on shape and dtype.
        return np.random.random(size=(2, 3)).astype('float32')


class TestSingleFieldDataset(unittest.TestCase):
    """Verify DataLoader yields a bare Tensor (not a one-element list)
    when each dataset sample is a single field."""

    def init_dataset(self):
        # Overridden by subclasses to swap in other dataset flavors.
        self.sample_num = 16
        self.dataset = SingleFieldDataset(self.sample_num)

    def run_main(self, num_workers):
        paddle.static.default_startup_program().random_seed = 1
        paddle.static.default_main_program().random_seed = 1
        place = paddle.CPUPlace()
        with fluid.dygraph.guard(place):
            self.init_dataset()
            loader = DataLoader(
                self.dataset,
                places=place,
                num_workers=num_workers,
                batch_size=2,
                drop_last=True)

            # Since Paddle 2.1 a single-field sample comes back as a
            # Tensor directly rather than wrapped in a list.
            for batch in loader():
                assert isinstance(batch, paddle.Tensor)
                assert batch.shape == [2, 2, 3]

    def test_main(self):
        # Cover both the synchronous and the multi-process loading paths.
        for workers in [0, 2]:
            self.run_main(workers)


class SingleFieldIterableDataset(IterableDataset):
    """An iterable-style dataset emitting single-field float32 samples."""

    def __init__(self, sample_num):
        # Number of samples produced by one pass of __iter__.
        self.sample_num = sample_num

    def __iter__(self):
        # Each yielded sample is a bare (2, 3) array — no list wrapper.
        for _ in range(self.sample_num):
            yield np.random.random(size=(2, 3)).astype('float32')


class TestSingleFieldIterableDataset(TestSingleFieldDataset):
    """Run the single-field format checks against an iterable-style dataset."""

    def init_dataset(self):
        count = 16
        self.sample_num = count
        self.dataset = SingleFieldIterableDataset(count)


class TestDataLoaderGenerateStates(unittest.TestCase):
def setUp(self):
self.inputs = [(0, 1), (0, 2), (1, 3)]
Expand Down

0 comments on commit bbedca4

Please sign in to comment.