Lightning-Universe · tchaton · May 7, 2021 · Apr 28, 2021 · Apr 29, 2021 · Apr 29, 2021
@@ -20,20 +20,15 @@
 from pytorch_lightning.utilities import rank_zero_warn
 
 from flash.core.model import Task
-from flash.data.process import ProcessState, Serializer
+from flash.data.data_source import LabelsState
+from flash.data.process import Serializer
 
 
 def binary_cross_entropy_with_logits(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
     """Calls BCE with logits and cast the target one_hot (y) encoding to floating point precision."""
     return F.binary_cross_entropy_with_logits(x, y.float())
 
 
-@dataclass(unsafe_hash=True, frozen=True)
-class ClassificationState(ProcessState):
-
-    labels: Optional[List[str]]
-
-
 class ClassificationTask(Task):
 
     def __init__(
@@ -140,15 +135,16 @@ class Labels(Classes):
     def __init__(self, labels: Optional[List[str]] = None, multi_label: bool = False, threshold: float = 0.5):
         super().__init__(multi_label=multi_label, threshold=threshold)
         self._labels = labels
-        self.set_state(ClassificationState(labels))
+        if labels is not None:
+            self.set_state(LabelsState(labels))
 
     def serialize(self, sample: Any) -> Union[int, List[int], str, List[str]]:
         labels = None
 
         if self._labels is not None:
             labels = self._labels
         else:
-            state = self.get_state(ClassificationState)
+            state = self.get_state(LabelsState)
             if state is not None:
                 labels = state.labels
 

@@ -30,6 +30,7 @@
 from flash.core.schedulers import _SCHEDULERS_REGISTRY
 from flash.core.utils import get_callable_dict
 from flash.data.data_pipeline import DataPipeline
+from flash.data.data_source import DataSource, DefaultDataSource
 from flash.data.process import Postprocess, Preprocess, Serializer, SerializerMapping
 
 
@@ -110,7 +111,8 @@ def step(self, batch: Any, batch_idx: int) -> Any:
         """
         The training/validation/test step. Override for custom behavior.
         """
-        x, y = batch
+        x, y = batch['input'], batch['target']
+        # x, y = batch
         y_hat = self(x)
         output = {"y_hat": y_hat}
         losses = {name: l_fn(y_hat, y) for name, l_fn in self.loss_fn.items()}
@@ -154,6 +156,7 @@ def test_step(self, batch: Any, batch_idx: int) -> None:
     def predict(
         self,
         x: Any,
+        data_source: Union[str, DefaultDataSource, DataSource] = DefaultDataSource.FILES,
         data_pipeline: Optional[DataPipeline] = None,
     ) -> Any:
         """
@@ -171,7 +174,13 @@ def predict(
 
         data_pipeline = self.build_data_pipeline(data_pipeline)
 
-        x = [x for x in data_pipeline._generate_auto_dataset(x, running_stage)]
+        if str(data_source) == data_source:
+            data_source = DefaultDataSource(data_source)
+
+        if not isinstance(data_source, DataSource):
+            data_source = data_pipeline._preprocess_pipeline.data_source_of_type(data_source.as_type())()
+
+        x = [x for x in data_source.generate_dataset(x, running_stage, data_pipeline)]
         x = data_pipeline.worker_preprocessor(running_stage)(x)
         # switch to self.device when #7188 merge in Lightning
         x = self.transfer_batch_to_device(x, next(self.parameters()).device)
@@ -181,6 +190,7 @@ def predict(
         return predictions
 
     def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: int = 0) -> Any:
+        batch = batch['input']
         if isinstance(batch, tuple):
             batch = batch[0]
         elif isinstance(batch, list):

@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from inspect import signature
-from typing import Any, Callable, Iterable, Iterator, Optional, TYPE_CHECKING
+from typing import Any, Callable, Generic, Iterable, Iterator, Optional, Sequence, TYPE_CHECKING, TypeVar
 
 import torch
 from pytorch_lightning.trainer.states import RunningStage
@@ -25,9 +25,12 @@
 
 if TYPE_CHECKING:
     from flash.data.data_pipeline import DataPipeline
+    from flash.data.data_source import DataSource
 
+DATA_TYPE = TypeVar('DATA_TYPE')
 
-class BaseAutoDataset:
+
+class BaseAutoDataset(Generic[DATA_TYPE]):
 
     DATASET_KEY = "dataset"
     """
@@ -38,45 +41,41 @@ class BaseAutoDataset:
 
     def __init__(
         self,
-        data: Any,
-        load_data: Optional[Callable] = None,
-        load_sample: Optional[Callable] = None,
+        data: DATA_TYPE,
+        data_source: 'DataSource',
+        running_stage: RunningStage,
         data_pipeline: Optional['DataPipeline'] = None,
-        running_stage: Optional[RunningStage] = None
     ) -> None:
         super().__init__()
 
-        if load_data or load_sample:
-            if data_pipeline:
-                rank_zero_warn(
-                    "``datapipeline`` is specified but load_sample and/or load_data are also specified. "
-                    "Won't use datapipeline"
-                )
-        # initial states
-        self._load_data_called = False
-        self._running_stage = None
-
         self.data = data
+        self.data_source = data_source
         self.data_pipeline = data_pipeline
-        self.load_data = load_data
-        self.load_sample = load_sample
 
-        # trigger the setup only if `running_stage` is provided
+        self._running_stage = None
         self.running_stage = running_stage
 
     @property
-    def running_stage(self) -> Optional[RunningStage]:
+    def running_stage(self) -> RunningStage:
         return self._running_stage
 
     @running_stage.setter
     def running_stage(self, running_stage: RunningStage) -> None:
-        if self._running_stage != running_stage or (not self._running_stage):
-            self._running_stage = running_stage
-            self._load_data_context = CurrentRunningStageFuncContext(self._running_stage, "load_data", self.preprocess)
-            self._load_sample_context = CurrentRunningStageFuncContext(
-                self._running_stage, "load_sample", self.preprocess
+        from flash.data.data_source import DataSource  # HACK: local import to avoid a circular import. TODO: find a better solution.
+
+        self._running_stage = running_stage
+
+        self._load_sample_context = CurrentRunningStageFuncContext(self.running_stage, "load_sample", self.data_source)
+
+        self.load_sample = getattr(
+            self.data_source,
+            self.data_pipeline._resolve_function_hierarchy(
+                'load_sample',
+                self.data_source,
+                self.running_stage,
+                DataSource,
             )
-            self._setup(running_stage)
+        )
 
     @property
     def preprocess(self) -> Optional[Preprocess]:
@@ -89,90 +88,33 @@ def control_flow_callback(self) -> Optional[ControlFlow]:
         if preprocess is not None:
             return ControlFlow(preprocess.callbacks)
 
-    def _call_load_data(self, data: Any) -> Iterable:
-        parameters = signature(self.load_data).parameters
-        if len(parameters) > 1 and self.DATASET_KEY in parameters:
-            return self.load_data(data, self)
-        else:
-            return self.load_data(data)
-
     def _call_load_sample(self, sample: Any) -> Any:
-        parameters = signature(self.load_sample).parameters
-        if len(parameters) > 1 and self.DATASET_KEY in parameters:
-            return self.load_sample(sample, self)
-        else:
-            return self.load_sample(sample)
-
-    def _setup(self, stage: Optional[RunningStage]) -> None:
-        assert not stage or _STAGES_PREFIX[stage] in _STAGES_PREFIX_VALUES
-        previous_load_data = self.load_data.__code__ if self.load_data else None
-
-        if self._running_stage and self.data_pipeline and (not self.load_data or not self.load_sample) and stage:
-            self.load_data = getattr(
-                self.preprocess,
-                self.data_pipeline._resolve_function_hierarchy('load_data', self.preprocess, stage, Preprocess)
-            )
-            self.load_sample = getattr(
-                self.preprocess,
-                self.data_pipeline._resolve_function_hierarchy('load_sample', self.preprocess, stage, Preprocess)
-            )
-        if self.load_data and (previous_load_data != self.load_data.__code__ or not self._load_data_called):
-            if previous_load_data:
-                rank_zero_warn(
-                    "The load_data function of the Autogenerated Dataset changed. "
-                    "This is not expected! Preloading Data again to ensure compatibility. This may take some time."
-                )
-            self.setup()
-            self._load_data_called = True
-
-    def setup(self):
-        raise NotImplementedError
-
+        if self.load_sample:
+            with self._load_sample_context:
+                parameters = signature(self.load_sample).parameters
+                if len(parameters) > 1 and self.DATASET_KEY in parameters:
+                    sample = self.load_sample(sample, self)
+                else:
+                    sample = self.load_sample(sample)
+                if self.control_flow_callback:
+                    self.control_flow_callback.on_load_sample(sample, self.running_stage)
+        return sample
 
-class AutoDataset(BaseAutoDataset, Dataset):
 
-    def setup(self):
-        with self._load_data_context:
-            self.preprocessed_data = self._call_load_data(self.data)
+class AutoDataset(BaseAutoDataset[Sequence[Any]], Dataset):
 
     def __getitem__(self, index: int) -> Any:
-        if not self.load_sample and not self.load_data:
-            raise RuntimeError("`__getitem__` for `load_sample` and `load_data` could not be inferred.")
-        if self.load_sample:
-            with self._load_sample_context:
-                data: Any = self._call_load_sample(self.preprocessed_data[index])
-                if self.control_flow_callback:
-                    self.control_flow_callback.on_load_sample(data, self.running_stage)
-                return data
-        return self.preprocessed_data[index]
+        return self._call_load_sample(self.data[index])
 
     def __len__(self) -> int:
-        if not self.load_sample and not self.load_data:
-            raise RuntimeError("`__len__` for `load_sample` and `load_data` could not be inferred.")
-        return len(self.preprocessed_data)
+        return len(self.data)
 
 
-class IterableAutoDataset(BaseAutoDataset, IterableDataset):
-
-    def setup(self):
-        with self._load_data_context:
-            self.dataset = self._call_load_data(self.data)
-            self.dataset_iter = None
+class IterableAutoDataset(BaseAutoDataset[Iterable[Any]], IterableDataset):
 
     def __iter__(self):
-        self.dataset_iter = iter(self.dataset)
+        self.data_iter = iter(self.data)
         return self
 
     def __next__(self) -> Any:
-        if not self.load_sample and not self.load_data:
-            raise RuntimeError("`__getitem__` for `load_sample` and `load_data` could not be inferred.")
-
-        data = next(self.dataset_iter)
-
-        if self.load_sample:
-            with self._load_sample_context:
-                data: Any = self._call_load_sample(data)
-                if self.control_flow_callback:
-                    self.control_flow_callback.on_load_sample(data, self.running_stage)
-                return data
-        return data
+        return self._call_load_sample(next(self.data_iter))
@@ -190,9 +190,6 @@ def enable(self):
         yield
         self.enabled = False
 
-    def attach_to_datamodule(self, datamodule) -> None:
-        datamodule.data_fetcher = self
-
     def attach_to_preprocess(self, preprocess: 'flash.data.process.Preprocess') -> None:
         preprocess.add_callbacks([self])
         self._preprocess = preprocess