From 041ad5a98a8ef656e8db98fd874d51868acb76f2 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 18 May 2021 10:43:40 +0000 Subject: [PATCH 1/4] add warning log for DataLoader output format incompatible upgrade. test=develop --- python/paddle/fluid/dataloader/fetcher.py | 43 +++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/python/paddle/fluid/dataloader/fetcher.py b/python/paddle/fluid/dataloader/fetcher.py index 41e12fbc68ec1..05382b04dc457 100644 --- a/python/paddle/fluid/dataloader/fetcher.py +++ b/python/paddle/fluid/dataloader/fetcher.py @@ -12,6 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +import logging +from ..log_helper import get_logger + +from collections.abc import Sequence + class _DatasetFetcher(object): def __init__(self, dataset, auto_collate_batch, collate_fn, drop_last): @@ -19,11 +24,39 @@ def __init__(self, dataset, auto_collate_batch, collate_fn, drop_last): self.auto_collate_batch = auto_collate_batch self.collate_fn = collate_fn self.drop_last = drop_last + self._is_warning_logged = False def fetch(self, batch_indices): raise NotImplementedError("'fetch' not implement for class {}".format( self.__class__.__name__)) + def _log_warning(self): + warn_str = "Detect dataset only contains single fileds, return format " \ + "changed since Paddle 2.1. In Paddle <= 2.0, DataLoader add " \ + "a list surround output data(e.g. return [data]), and in " \ + "Paddle >= 2.1, DataLoader return the single filed directly " \ + "(e.g. return data). 
For example, in following code: \n\n" + warn_str += \ + "import numpy as np\n" \ + "from paddle.io import DataLoader, Dataset\n\n" \ + "class RandomDataset(Dataset):\n" \ + " def __getitem__(self, idx):\n" \ + " data = np.random.random((2, 3)).astype('float32')\n\n" \ + " return data\n\n" \ + " def __len__(self):\n" \ + " return 10\n\n" \ + "dataset = RandomDataset()\n" \ + "loader = DataLoader(dataset, batch_size=1)\n" \ + "data = next(loader())\n\n" + + warn_str += "In Paddle <= 2.0, data is in format '[Tensor(shape=(1, 2, 3), " \ + "dtype=float32)]', and in Paddle >= 2.1, data is in format" \ + " 'Tensor(shape=(1, 2, 3), dtype=float32)'\n" + + logger = get_logger( + "DataLoader", logging.INFO, fmt='%(levelname)s: %(message)s') + logger.warning(warn_str) + class _IterableDatasetFetcher(_DatasetFetcher): def __init__(self, dataset, auto_collate_batch, collate_fn, drop_last): @@ -40,9 +73,14 @@ def fetch(self, batch_indices): data.append(next(self.dataset_iter)) except StopIteration: break + if len(data) == 0 or (self.drop_last and len(data) < len(batch_indices)): raise StopIteration + if not isinstance(data[0], + Sequence) and not self._is_warning_logged: + self._log_warning() + self._is_warning_logged = True else: data = next(self.dataset_iter) @@ -59,6 +97,11 @@ def __init__(self, dataset, auto_collate_batch, collate_fn, drop_last): def fetch(self, batch_indices): if self.auto_collate_batch: data = [self.dataset[idx] for idx in batch_indices] + + if not isinstance(data[0], + Sequence) and not self._is_warning_logged: + self._log_warning() + self._is_warning_logged = True else: data = self.dataset[batch_indices] From da8f9089080ac2c3f6c8d9a63e7e339c98940e5f Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Wed, 19 May 2021 07:11:55 +0000 Subject: [PATCH 2/4] add unittest. 
test=develop --- .../test_multiprocess_dataloader_dataset.py | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py index 4c69d003d80f8..9eadd6ba37599 100755 --- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py +++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py @@ -330,6 +330,41 @@ def test_main(self): self.run_main(num_workers) +class SingleFieldDataset(Dataset): + def __init__(self, sample_num): + self.sample_num = sample_num + + def __len__(self): + return self.sample_num + + def __getitem__(self, idx): + return np.random.random((2, 3)).astype('float32') + + +class TestSingleFieldDataset(unittest.TestCase): + def run_main(self, num_workers): + paddle.static.default_startup_program().random_seed = 1 + paddle.static.default_main_program().random_seed = 1 + place = paddle.CPUPlace() + with fluid.dygraph.guard(place): + dataset = SingleFieldDataset(16) + assert len(dataset) == 16 + dataloader = DataLoader( + dataset, + places=place, + num_workers=num_workers, + batch_size=2, + drop_last=True) + + for i, data in enumerate(dataloader()): + assert isinstance(data, paddle.Tensor) + assert data.shape == [2, 2, 3] + + def test_main(self): + for num_workers in [0, 2]: + self.run_main(num_workers) + + class TestDataLoaderGenerateStates(unittest.TestCase): def setUp(self): self.inputs = [(0, 1), (0, 2), (1, 3)] From 3146129f09012ed6f11596fe7689481b53d82275 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Fri, 11 Jun 2021 02:43:56 +0000 Subject: [PATCH 3/4] fix ci coverage. 
test=develop --- .../test_multiprocess_dataloader_dataset.py | 25 ++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py index 9eadd6ba37599..ec5f7c9cec7c1 100755 --- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py +++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py @@ -342,15 +342,19 @@ def __getitem__(self, idx): class TestSingleFieldDataset(unittest.TestCase): + def init_dataset(self): + self.sample_num = 16 + self.dataset = SingleFieldDataset(self.sample_num) + def run_main(self, num_workers): paddle.static.default_startup_program().random_seed = 1 paddle.static.default_main_program().random_seed = 1 place = paddle.CPUPlace() with fluid.dygraph.guard(place): - dataset = SingleFieldDataset(16) - assert len(dataset) == 16 + self.init_dataset() + assert len(self.dataset) == self.sample_num dataloader = DataLoader( - dataset, + self.dataset, places=place, num_workers=num_workers, batch_size=2, @@ -365,6 +369,21 @@ def test_main(self): self.run_main(num_workers) +class SingleFieldIterableDataset(IterableDataset): + def __init__(self, sample_num): + self.sample_num = sample_num + + def __iter__(self): + for _ in range(self.sample_num): + yield np.random.random((2, 3)).astype('float32') + + +class TestSingleFieldIterableDataset(unittest.TestCase): + def init_dataset(self): + self.sample_num = 16 + self.dataset = SingleFieldIterableDataset(self.sample_num) + + class TestDataLoaderGenerateStates(unittest.TestCase): def setUp(self): self.inputs = [(0, 1), (0, 2), (1, 3)] From 83baba43c9907563067fa66f75e5670e61ca0313 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Fri, 11 Jun 2021 05:56:09 +0000 Subject: [PATCH 4/4] fix ci coverage. 
test=develop --- .../tests/unittests/test_multiprocess_dataloader_dataset.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py index ec5f7c9cec7c1..30e70a77c369c 100755 --- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py +++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py @@ -352,7 +352,6 @@ def run_main(self, num_workers): place = paddle.CPUPlace() with fluid.dygraph.guard(place): self.init_dataset() - assert len(self.dataset) == self.sample_num dataloader = DataLoader( self.dataset, places=place, @@ -378,7 +377,7 @@ def __iter__(self): yield np.random.random((2, 3)).astype('float32') -class TestSingleFieldIterableDataset(unittest.TestCase): +class TestSingleFieldIterableDataset(TestSingleFieldDataset): def init_dataset(self): self.sample_num = 16 self.dataset = SingleFieldIterableDataset(self.sample_num)