feat: store window size and forecast horizon in dataset (#794)

### Summary of Changes Window size and forecast horizon are no longer specified in the input conversion but in the time series dataset. The intuition is that a dataset describes 1. what to predict (target, forecast horizon), 2. with what (time, features, window size). --------- Co-authored-by: megalinter-bot <129584137+megalinter-bot@users.noreply.github.com>
Safe-DS · May 21, 2024 · f07bc5a · f07bc5a
1 parent e0cd47b
commit f07bc5a
Show file tree

Hide file tree

Showing 38 changed files with 352 additions and 276 deletions.
diff --git a/docs/tutorials/classification.ipynb b/docs/tutorials/classification.ipynb
@@ -113,7 +113,7 @@
    "source": [
     "extra_names = [\"id\", \"name\", \"ticket\", \"cabin\", \"port_embarked\", \"age\", \"fare\"]\n",
     "\n",
-    "train_tabular_dataset = transformed_table.to_tabular_dataset(\"survived\", extra_names)"
+    "train_tabular_dataset = transformed_table.to_tabular_dataset(\"survived\", extra_names=extra_names)"
    ],
    "metadata": {
     "collapsed": false
@@ -185,7 +185,7 @@
     "encoder = OneHotEncoder().fit(test_table, [\"sex\"])\n",
     "testing_table = encoder.transform(testing_table)\n",
     "\n",
-    "test_tabular_dataset = testing_table.to_tabular_dataset(\"survived\", extra_names)\n",
+    "test_tabular_dataset = testing_table.to_tabular_dataset(\"survived\", extra_names=extra_names)\n",
     "fitted_model.accuracy(test_tabular_dataset)\n"
    ],
    "metadata": {

diff --git a/docs/tutorials/regression.ipynb b/docs/tutorials/regression.ipynb
@@ -32,8 +32,8 @@
    "metadata": {
     "collapsed": false
    },
-   "outputs": [],
-   "execution_count": null
+   "execution_count": null,
+   "outputs": []
   },
   {
    "cell_type": "markdown",
@@ -55,8 +55,8 @@
    "metadata": {
     "collapsed": false
    },
-   "outputs": [],
-   "execution_count": null
+   "execution_count": null,
+   "outputs": []
   },
   {
    "cell_type": "markdown",
@@ -70,13 +70,13 @@
    "source": [
     "extra_names = [\"id\"]\n",
     "\n",
-    "train_tabular_dataset = train_table.to_tabular_dataset(\"price\", extra_names)\n"
+    "train_tabular_dataset = train_table.to_tabular_dataset(\"price\", extra_names=extra_names)\n"
    ],
    "metadata": {
     "collapsed": false
    },
-   "outputs": [],
-   "execution_count": null
+   "execution_count": null,
+   "outputs": []
   },
   {
    "cell_type": "markdown",
@@ -96,8 +96,8 @@
    "metadata": {
     "collapsed": false
    },
-   "outputs": [],
-   "execution_count": null
+   "execution_count": null,
+   "outputs": []
   },
   {
    "cell_type": "markdown",
@@ -120,8 +120,8 @@
    "metadata": {
     "collapsed": false
    },
-   "outputs": [],
-   "execution_count": null
+   "execution_count": null,
+   "outputs": []
   },
   {
    "cell_type": "markdown",
@@ -135,15 +135,15 @@
   {
    "cell_type": "code",
    "source": [
-    "test_tabular_dataset = testing_table.to_tabular_dataset(\"price\", extra_names)\n",
+    "test_tabular_dataset = testing_table.to_tabular_dataset(\"price\", extra_names=extra_names)\n",
     "\n",
     "fitted_model.mean_absolute_error(test_tabular_dataset)\n"
    ],
    "metadata": {
     "collapsed": false
    },
-   "outputs": [],
-   "execution_count": null
+   "execution_count": null,
+   "outputs": []
   }
  ],
  "metadata": {

diff --git a/src/safeds/data/labeled/containers/_dataset.py b/src/safeds/data/labeled/containers/_dataset.py
@@ -1,9 +1,13 @@
 from __future__ import annotations
 
 from abc import ABC, abstractmethod
+from typing import Generic, TypeVar
 
+In_co = TypeVar("In_co", covariant=True)
+Out_co = TypeVar("Out_co", covariant=True)
 
-class Dataset(ABC):
+
+class Dataset(Generic[In_co, Out_co], ABC):
     """A dataset is used as input to machine learning models."""
 
     # ------------------------------------------------------------------------------------------------------------------

diff --git a/src/safeds/data/labeled/containers/_image_dataset.py b/src/safeds/data/labeled/containers/_image_dataset.py
@@ -3,7 +3,7 @@
 import copy
 import sys
 import warnings
-from typing import TYPE_CHECKING, Generic, TypeVar
+from typing import TYPE_CHECKING, TypeVar
 
 from safeds._config import _get_device, _init_default_device
 from safeds._utils import _structural_hash
@@ -27,10 +27,10 @@
 if TYPE_CHECKING:
     from torch import Tensor
 
-T = TypeVar("T", Column, Table, ImageList)
+Out_co = TypeVar("Out_co", Column, ImageList, Table, covariant=True)
 
 
-class ImageDataset(Generic[T], Dataset):
+class ImageDataset(Dataset[ImageList, Out_co]):
     """
     A Dataset for ImageLists as input and ImageLists, Tables or Columns as output.
 
@@ -46,7 +46,7 @@ class ImageDataset(Generic[T], Dataset):
         weather the data should be shuffled after each epoch of training
     """
 
-    def __init__(self, input_data: ImageList, output_data: T, batch_size: int = 1, shuffle: bool = False) -> None:
+    def __init__(self, input_data: ImageList, output_data: Out_co, batch_size: int = 1, shuffle: bool = False) -> None:
         import torch
 
         _init_default_device()
@@ -207,7 +207,7 @@ def get_input(self) -> ImageList:
         """
         return self._sort_image_list_with_shuffle_tensor_indices(self._input)
 
-    def get_output(self) -> T:
+    def get_output(self) -> Out_co:
         """
         Get the output data of this dataset.
 
@@ -280,7 +280,7 @@ def _get_batch(self, batch_number: int, batch_size: int | None = None) -> tuple[
             output_tensor = self._output._tensor[self._shuffle_tensor_indices[batch_size * batch_number : max_index]]
         return input_tensor, output_tensor
 
-    def shuffle(self) -> ImageDataset[T]:
+    def shuffle(self) -> ImageDataset[Out_co]:
         """
         Return a new `ImageDataset` with shuffled data.
 
@@ -295,7 +295,7 @@ def shuffle(self) -> ImageDataset[T]:
 
         _init_default_device()
 
-        im_dataset: ImageDataset[T] = copy.copy(self)
+        im_dataset: ImageDataset[Out_co] = copy.copy(self)
         im_dataset._shuffle_tensor_indices = torch.randperm(len(self))
         im_dataset._next_batch_index = 0
         return im_dataset

diff --git a/src/safeds/data/labeled/containers/_tabular_dataset.py b/src/safeds/data/labeled/containers/_tabular_dataset.py
@@ -5,6 +5,7 @@
 
 from safeds._config import _get_device, _init_default_device
 from safeds._utils import _structural_hash
+from safeds.data.tabular.containers import Column, Table
 
 from ._dataset import Dataset
 
@@ -15,10 +16,8 @@
     from torch.utils.data import DataLoader
     from torch.utils.data import Dataset as TorchDataset
 
-    from safeds.data.tabular.containers import Column, Table
 
-
-class TabularDataset(Dataset):
+class TabularDataset(Dataset[Table, Column]):
     """
     A dataset containing tabular data. It can be used to train machine learning models.
 
@@ -37,7 +36,7 @@ class TabularDataset(Dataset):
     data:
         The data.
     target_name:
-        Name of the target column.
+        The name of the target column.
     extra_names:
         Names of the columns that are neither features nor target. If None, no extra columns are used, i.e. all but
         the target column are used as features.
@@ -72,6 +71,7 @@ def __init__(
         self,
         data: Table | Mapping[str, Sequence[Any]],
         target_name: str,
+        *,
         extra_names: list[str] | None = None,
     ):
         from safeds.data.tabular.containers import Table

diff --git a/src/safeds/data/labeled/containers/_time_series_dataset.py b/src/safeds/data/labeled/containers/_time_series_dataset.py
@@ -6,6 +6,9 @@
 from safeds._config import _get_device, _init_default_device
 from safeds._utils import _structural_hash
 from safeds._validation import _check_bounds, _ClosedBound
+from safeds.data.tabular.containers import Column, Table
+
+from ._dataset import Dataset
 
 if TYPE_CHECKING:
     from collections.abc import Mapping, Sequence
@@ -14,10 +17,8 @@
     from torch.utils.data import DataLoader
     from torch.utils.data import Dataset as TorchDataset
 
-    from safeds.data.tabular.containers import Column, Table
-
 
-class TimeSeriesDataset:
+class TimeSeriesDataset(Dataset[Table, Column]):
     """
     A time series dataset maps feature and time columns to a target column.
 
@@ -28,12 +29,16 @@ class TimeSeriesDataset:
     data:
         The data.
     target_name:
-        Name of the target column.
+        The name of the target column.
     time_name:
-        Name of the time column.
+        The name of the time column.
+    window_size:
+        The number of consecutive samples to use as input for prediction.
     extra_names:
         Names of the columns that are neither features nor target. If None, no extra columns are used, i.e. all but
         the target column are used as features.
+    forecast_horizon:
+        The number of time steps to predict into the future.
 
     Raises
     ------
@@ -51,7 +56,8 @@ class TimeSeriesDataset:
     ...     {"id": [1, 2, 3], "feature": [4, 5, 6], "target": [1, 2, 3], "error":[0,0,1]},
     ...     target_name="target",
     ...     time_name = "id",
-    ...     extra_names=["error"]
+    ...     window_size=1,
+    ...     extra_names=["error"],
     ... )
     """
 
@@ -63,7 +69,10 @@ def __init__(
         data: Table | Mapping[str, Sequence[Any]],
         target_name: str,
         time_name: str,
+        window_size: int,
+        *,
         extra_names: list[str] | None = None,
+        forecast_horizon: int = 1,
     ):
         from safeds.data.tabular.containers import Table
 
@@ -90,6 +99,8 @@ def __init__(
         self._features: Table = data.remove_columns_except(feature_names)
         self._target: Column = data.get_column(target_name)
         self._time: Column = data.get_column(time_name)
+        self._window_size: int = window_size
+        self._forecast_horizon: int = forecast_horizon
         self._extras: Table = data.remove_columns_except(extra_names)
 
     def __eq__(self, other: object) -> bool:
@@ -104,7 +115,9 @@ def __eq__(self, other: object) -> bool:
         if not isinstance(other, TimeSeriesDataset):
             return NotImplemented
         return (self is other) or (
-            self.target == other.target
+            self._window_size == other._window_size
+            and self._forecast_horizon == other._forecast_horizon
+            and self.target == other.target
             and self.features == other.features
             and self.extras == other.extras
             and self.time == other.time
@@ -119,7 +132,14 @@ def __hash__(self) -> int:
         hash:
             The hash value.
         """
-        return _structural_hash(self.target, self.features, self.extras, self.time)
+        return _structural_hash(
+            self.target,
+            self.features,
+            self.extras,
+            self.time,
+            self._window_size,
+            self._forecast_horizon,
+        )
 
     def __sizeof__(self) -> int:
         """
@@ -135,6 +155,8 @@ def __sizeof__(self) -> int:
             + sys.getsizeof(self._features)
             + sys.getsizeof(self.extras)
             + sys.getsizeof(self._time)
+            + sys.getsizeof(self._window_size)
+            + sys.getsizeof(self._forecast_horizon)
         )
 
     # ------------------------------------------------------------------------------------------------------------------
@@ -156,6 +178,16 @@ def time(self) -> Column:
         """The time column of the time series dataset."""
         return self._time
 
+    @property
+    def window_size(self) -> int:
+        """The number of consecutive samples to use as input for prediction."""
+        return self._window_size
+
+    @property
+    def forecast_horizon(self) -> int:
+        """The number of time steps to predict into the future."""
+        return self._forecast_horizon
+
     @property
     def extras(self) -> Table:
         """

diff --git a/src/safeds/data/tabular/containers/_column.py b/src/safeds/data/tabular/containers/_column.py
@@ -565,7 +565,7 @@ def rename(self, new_name: str) -> Column[T_co]:
         """
         Return a new column with a new name.
 
-        The original column is not modified.
+        **Note:** The original column is not modified.
 
         Parameters
         ----------
@@ -601,7 +601,7 @@ def transform(
         """
         Return a new column with values transformed by the transformer.
 
-        The original column is not modified.
+        **Note:** The original column is not modified.
 
         Parameters
         ----------
@@ -968,6 +968,13 @@ def missing_value_ratio(self) -> float:
         -------
         missing_value_ratio:
             The ratio of missing values in the column.
+
+        Examples
+        --------
+        >>> from safeds.data.tabular.containers import Column
+        >>> column = Column("test", [1, None, 3, None])
+        >>> column.missing_value_ratio()
+        0.5
         """
         if self.row_count == 0:
             return 1.0  # All values are missing (since there are none)