From f07bc5a51c737ce47fb1c90e6ca68b837cdd6989 Mon Sep 17 00:00:00 2001 From: Lars Reimann Date: Tue, 21 May 2024 10:57:41 +0200 Subject: [PATCH] feat: store window size and forecast horizon in dataset (#794) ### Summary of Changes Window size and forecast horizon are no longer specified in the input conversion but in the time series dataset. The intuition is that a dataset describes 1. what to predict (target, forecast horizon), 2. with what (time, features, window size). --------- Co-authored-by: megalinter-bot <129584137+megalinter-bot@users.noreply.github.com> --- docs/tutorials/classification.ipynb | 4 +- docs/tutorials/regression.ipynb | 28 +++---- .../data/labeled/containers/_dataset.py | 6 +- .../data/labeled/containers/_image_dataset.py | 14 ++-- .../labeled/containers/_tabular_dataset.py | 8 +- .../containers/_time_series_dataset.py | 48 ++++++++++-- src/safeds/data/tabular/containers/_column.py | 11 ++- src/safeds/data/tabular/containers/_table.py | 40 +++++++--- .../data/tabular/plotting/_table_plotter.py | 2 +- src/safeds/ml/classical/regression/_arima.py | 1 + src/safeds/ml/nn/_model.py | 2 - .../ml/nn/converters/_input_converter.py | 7 +- .../nn/converters/_input_converter_image.py | 9 +-- .../_input_converter_image_to_column.py | 13 ++-- .../_input_converter_image_to_image.py | 3 +- .../_input_converter_image_to_table.py | 12 +-- .../nn/converters/_input_converter_table.py | 8 +- .../_input_converter_time_series.py | 40 +++------- .../containers/_tabular_dataset/test_eq.py | 12 +-- .../containers/_tabular_dataset/test_hash.py | 12 +-- .../_tabular_dataset/test_into_dataloader.py | 2 +- .../_tabular_dataset/test_sizeof.py | 2 +- .../_tabular_dataset/test_to_table.py | 2 +- .../_time_series_dataset/test_eq.py | 74 +++++++++++++++---- .../_time_series_dataset/test_extras.py | 2 + .../_time_series_dataset/test_features.py | 2 + .../_time_series_dataset/test_hash.py | 64 ++++++++++++---- .../_time_series_dataset/test_init.py | 10 ++- 
.../test_into_dataloader.py | 30 ++++++-- .../_time_series_dataset/test_repr_html.py | 6 +- .../_time_series_dataset/test_sizeof.py | 4 +- .../_time_series_dataset/test_target.py | 1 + .../_time_series_dataset/test_time.py | 1 + .../_time_series_dataset/test_to_table.py | 12 +-- .../classical/regression/test_arima_model.py | 12 +-- .../test_input_converter_image_2.py | 36 +-------- .../test_input_converter_time_series.py | 67 ++++------------- tests/safeds/ml/nn/test_lstm_workflow.py | 21 +++++- 38 files changed, 352 insertions(+), 276 deletions(-) diff --git a/docs/tutorials/classification.ipynb b/docs/tutorials/classification.ipynb index 2f80999dc..778880c99 100644 --- a/docs/tutorials/classification.ipynb +++ b/docs/tutorials/classification.ipynb @@ -113,7 +113,7 @@ "source": [ "extra_names = [\"id\", \"name\", \"ticket\", \"cabin\", \"port_embarked\", \"age\", \"fare\"]\n", "\n", - "train_tabular_dataset = transformed_table.to_tabular_dataset(\"survived\", extra_names)" + "train_tabular_dataset = transformed_table.to_tabular_dataset(\"survived\", extra_names=extra_names)" ], "metadata": { "collapsed": false @@ -185,7 +185,7 @@ "encoder = OneHotEncoder().fit(test_table, [\"sex\"])\n", "testing_table = encoder.transform(testing_table)\n", "\n", - "test_tabular_dataset = testing_table.to_tabular_dataset(\"survived\", extra_names)\n", + "test_tabular_dataset = testing_table.to_tabular_dataset(\"survived\", extra_names=extra_names)\n", "fitted_model.accuracy(test_tabular_dataset)\n" ], "metadata": { diff --git a/docs/tutorials/regression.ipynb b/docs/tutorials/regression.ipynb index bf2904182..60d64c93c 100644 --- a/docs/tutorials/regression.ipynb +++ b/docs/tutorials/regression.ipynb @@ -32,8 +32,8 @@ "metadata": { "collapsed": false }, - "outputs": [], - "execution_count": null + "execution_count": null, + "outputs": [] }, { "cell_type": "markdown", @@ -55,8 +55,8 @@ "metadata": { "collapsed": false }, - "outputs": [], - "execution_count": null + 
"execution_count": null, + "outputs": [] }, { "cell_type": "markdown", @@ -70,13 +70,13 @@ "source": [ "extra_names = [\"id\"]\n", "\n", - "train_tabular_dataset = train_table.to_tabular_dataset(\"price\", extra_names)\n" + "train_tabular_dataset = train_table.to_tabular_dataset(\"price\", extra_names=extra_names)\n" ], "metadata": { "collapsed": false }, - "outputs": [], - "execution_count": null + "execution_count": null, + "outputs": [] }, { "cell_type": "markdown", @@ -96,8 +96,8 @@ "metadata": { "collapsed": false }, - "outputs": [], - "execution_count": null + "execution_count": null, + "outputs": [] }, { "cell_type": "markdown", @@ -120,8 +120,8 @@ "metadata": { "collapsed": false }, - "outputs": [], - "execution_count": null + "execution_count": null, + "outputs": [] }, { "cell_type": "markdown", @@ -135,15 +135,15 @@ { "cell_type": "code", "source": [ - "test_tabular_dataset = testing_table.to_tabular_dataset(\"price\", extra_names)\n", + "test_tabular_dataset = testing_table.to_tabular_dataset(\"price\", extra_names=extra_names)\n", "\n", "fitted_model.mean_absolute_error(test_tabular_dataset)\n" ], "metadata": { "collapsed": false }, - "outputs": [], - "execution_count": null + "execution_count": null, + "outputs": [] } ], "metadata": { diff --git a/src/safeds/data/labeled/containers/_dataset.py b/src/safeds/data/labeled/containers/_dataset.py index 28f8cba6b..38e9d7fbd 100644 --- a/src/safeds/data/labeled/containers/_dataset.py +++ b/src/safeds/data/labeled/containers/_dataset.py @@ -1,9 +1,13 @@ from __future__ import annotations from abc import ABC, abstractmethod +from typing import Generic, TypeVar +In_co = TypeVar("In_co", covariant=True) +Out_co = TypeVar("Out_co", covariant=True) -class Dataset(ABC): + +class Dataset(Generic[In_co, Out_co], ABC): """A dataset is used as input to machine learning models.""" # ------------------------------------------------------------------------------------------------------------------ diff --git 
a/src/safeds/data/labeled/containers/_image_dataset.py b/src/safeds/data/labeled/containers/_image_dataset.py index c185e47a7..61eebbf4e 100644 --- a/src/safeds/data/labeled/containers/_image_dataset.py +++ b/src/safeds/data/labeled/containers/_image_dataset.py @@ -3,7 +3,7 @@ import copy import sys import warnings -from typing import TYPE_CHECKING, Generic, TypeVar +from typing import TYPE_CHECKING, TypeVar from safeds._config import _get_device, _init_default_device from safeds._utils import _structural_hash @@ -27,10 +27,10 @@ if TYPE_CHECKING: from torch import Tensor -T = TypeVar("T", Column, Table, ImageList) +Out_co = TypeVar("Out_co", Column, ImageList, Table, covariant=True) -class ImageDataset(Generic[T], Dataset): +class ImageDataset(Dataset[ImageList, Out_co]): """ A Dataset for ImageLists as input and ImageLists, Tables or Columns as output. @@ -46,7 +46,7 @@ class ImageDataset(Generic[T], Dataset): weather the data should be shuffled after each epoch of training """ - def __init__(self, input_data: ImageList, output_data: T, batch_size: int = 1, shuffle: bool = False) -> None: + def __init__(self, input_data: ImageList, output_data: Out_co, batch_size: int = 1, shuffle: bool = False) -> None: import torch _init_default_device() @@ -207,7 +207,7 @@ def get_input(self) -> ImageList: """ return self._sort_image_list_with_shuffle_tensor_indices(self._input) - def get_output(self) -> T: + def get_output(self) -> Out_co: """ Get the output data of this dataset. @@ -280,7 +280,7 @@ def _get_batch(self, batch_number: int, batch_size: int | None = None) -> tuple[ output_tensor = self._output._tensor[self._shuffle_tensor_indices[batch_size * batch_number : max_index]] return input_tensor, output_tensor - def shuffle(self) -> ImageDataset[T]: + def shuffle(self) -> ImageDataset[Out_co]: """ Return a new `ImageDataset` with shuffled data. 
@@ -295,7 +295,7 @@ def shuffle(self) -> ImageDataset[T]: _init_default_device() - im_dataset: ImageDataset[T] = copy.copy(self) + im_dataset: ImageDataset[Out_co] = copy.copy(self) im_dataset._shuffle_tensor_indices = torch.randperm(len(self)) im_dataset._next_batch_index = 0 return im_dataset diff --git a/src/safeds/data/labeled/containers/_tabular_dataset.py b/src/safeds/data/labeled/containers/_tabular_dataset.py index dc81919fd..e95c182ed 100644 --- a/src/safeds/data/labeled/containers/_tabular_dataset.py +++ b/src/safeds/data/labeled/containers/_tabular_dataset.py @@ -5,6 +5,7 @@ from safeds._config import _get_device, _init_default_device from safeds._utils import _structural_hash +from safeds.data.tabular.containers import Column, Table from ._dataset import Dataset @@ -15,10 +16,8 @@ from torch.utils.data import DataLoader from torch.utils.data import Dataset as TorchDataset - from safeds.data.tabular.containers import Column, Table - -class TabularDataset(Dataset): +class TabularDataset(Dataset[Table, Column]): """ A dataset containing tabular data. It can be used to train machine learning models. @@ -37,7 +36,7 @@ class TabularDataset(Dataset): data: The data. target_name: - Name of the target column. + The name of the target column. extra_names: Names of the columns that are neither features nor target. If None, no extra columns are used, i.e. all but the target column are used as features. 
@@ -72,6 +71,7 @@ def __init__( self, data: Table | Mapping[str, Sequence[Any]], target_name: str, + *, extra_names: list[str] | None = None, ): from safeds.data.tabular.containers import Table diff --git a/src/safeds/data/labeled/containers/_time_series_dataset.py b/src/safeds/data/labeled/containers/_time_series_dataset.py index 4d5751bba..d2e284619 100644 --- a/src/safeds/data/labeled/containers/_time_series_dataset.py +++ b/src/safeds/data/labeled/containers/_time_series_dataset.py @@ -6,6 +6,9 @@ from safeds._config import _get_device, _init_default_device from safeds._utils import _structural_hash from safeds._validation import _check_bounds, _ClosedBound +from safeds.data.tabular.containers import Column, Table + +from ._dataset import Dataset if TYPE_CHECKING: from collections.abc import Mapping, Sequence @@ -14,10 +17,8 @@ from torch.utils.data import DataLoader from torch.utils.data import Dataset as TorchDataset - from safeds.data.tabular.containers import Column, Table - -class TimeSeriesDataset: +class TimeSeriesDataset(Dataset[Table, Column]): """ A time series dataset maps feature and time columns to a target column. @@ -28,12 +29,16 @@ class TimeSeriesDataset: data: The data. target_name: - Name of the target column. + The name of the target column. time_name: - Name of the time column. + The name of the time column. + window_size: + The number of consecutive samples to use as input for prediction. extra_names: Names of the columns that are neither features nor target. If None, no extra columns are used, i.e. all but the target column are used as features. + forecast_horizon: + The number of time steps to predict into the future. Raises ------ @@ -51,7 +56,8 @@ class TimeSeriesDataset: ... {"id": [1, 2, 3], "feature": [4, 5, 6], "target": [1, 2, 3], "error":[0,0,1]}, ... target_name="target", ... time_name = "id", - ... extra_names=["error"] + ... window_size=1, + ... extra_names=["error"], ... 
) """ @@ -63,7 +69,10 @@ def __init__( data: Table | Mapping[str, Sequence[Any]], target_name: str, time_name: str, + window_size: int, + *, extra_names: list[str] | None = None, + forecast_horizon: int = 1, ): from safeds.data.tabular.containers import Table @@ -90,6 +99,8 @@ def __init__( self._features: Table = data.remove_columns_except(feature_names) self._target: Column = data.get_column(target_name) self._time: Column = data.get_column(time_name) + self._window_size: int = window_size + self._forecast_horizon: int = forecast_horizon self._extras: Table = data.remove_columns_except(extra_names) def __eq__(self, other: object) -> bool: @@ -104,7 +115,9 @@ def __eq__(self, other: object) -> bool: if not isinstance(other, TimeSeriesDataset): return NotImplemented return (self is other) or ( - self.target == other.target + self._window_size == other._window_size + and self._forecast_horizon == other._forecast_horizon + and self.target == other.target and self.features == other.features and self.extras == other.extras and self.time == other.time @@ -119,7 +132,14 @@ def __hash__(self) -> int: hash: The hash value. 
""" - return _structural_hash(self.target, self.features, self.extras, self.time) + return _structural_hash( + self.target, + self.features, + self.extras, + self.time, + self._window_size, + self._forecast_horizon, + ) def __sizeof__(self) -> int: """ @@ -135,6 +155,8 @@ def __sizeof__(self) -> int: + sys.getsizeof(self._features) + sys.getsizeof(self.extras) + sys.getsizeof(self._time) + + sys.getsizeof(self._window_size) + + sys.getsizeof(self._forecast_horizon) ) # ------------------------------------------------------------------------------------------------------------------ @@ -156,6 +178,16 @@ def time(self) -> Column: """The time column of the time series dataset.""" return self._time + @property + def window_size(self) -> int: + """The number of consecutive samples to use as input for prediction.""" + return self._window_size + + @property + def forecast_horizon(self) -> int: + """The number of time steps to predict into the future.""" + return self._forecast_horizon + @property def extras(self) -> Table: """ diff --git a/src/safeds/data/tabular/containers/_column.py b/src/safeds/data/tabular/containers/_column.py index e94c1e2e0..475fd0097 100644 --- a/src/safeds/data/tabular/containers/_column.py +++ b/src/safeds/data/tabular/containers/_column.py @@ -565,7 +565,7 @@ def rename(self, new_name: str) -> Column[T_co]: """ Return a new column with a new name. - The original column is not modified. + **Note:** The original column is not modified. Parameters ---------- @@ -601,7 +601,7 @@ def transform( """ Return a new column with values transformed by the transformer. - The original column is not modified. + **Note:** The original column is not modified. Parameters ---------- @@ -968,6 +968,13 @@ def missing_value_ratio(self) -> float: ------- missing_value_ratio: The ratio of missing values in the column. 
+ + Examples + -------- + >>> from safeds.data.tabular.containers import Column + >>> column = Column("test", [1, None, 3, None]) + >>> column.missing_value_ratio() + 0.5 """ if self.row_count == 0: return 1.0 # All values are missing (since there are none) diff --git a/src/safeds/data/tabular/containers/_table.py b/src/safeds/data/tabular/containers/_table.py index 2cb20bdaf..68a541974 100644 --- a/src/safeds/data/tabular/containers/_table.py +++ b/src/safeds/data/tabular/containers/_table.py @@ -8,7 +8,6 @@ from safeds._utils._random import _get_random_seed from safeds._validation import _check_bounds, _check_columns_exist, _ClosedBound, _normalize_and_check_file_path from safeds._validation._check_columns_dont_exist import _check_columns_dont_exist -from safeds.data.labeled.containers import TabularDataset, TimeSeriesDataset from safeds.data.tabular.plotting import TablePlotter from safeds.data.tabular.typing._polars_data_type import _PolarsDataType from safeds.data.tabular.typing._polars_schema import _PolarsSchema @@ -30,6 +29,7 @@ from torch import Tensor from torch.utils.data import DataLoader, Dataset + from safeds.data.labeled.containers import TabularDataset, TimeSeriesDataset from safeds.data.tabular.transformation import ( InvertibleTableTransformer, TableTransformer, @@ -161,7 +161,7 @@ def from_csv_file(path: str | Path, *, separator: str = ",") -> Table: path = _normalize_and_check_file_path(path, ".csv", [".csv"], check_if_file_exists=True) - return Table._from_polars_lazy_frame(pl.scan_csv(path, separator=separator)) + return Table._from_polars_lazy_frame(pl.scan_csv(path, separator=separator, raise_if_empty=False)) @staticmethod def from_dict(data: dict[str, list[Any]]) -> Table: @@ -1919,7 +1919,7 @@ def to_parquet_file(self, path: str | Path) -> None: self._lazy_frame.sink_parquet(path) - def to_tabular_dataset(self, target_name: str, extra_names: list[str] | None = None) -> TabularDataset: + def to_tabular_dataset(self, target_name: str, *, 
extra_names: list[str] | None = None) -> TabularDataset: """ Return a new `TabularDataset` with columns marked as a target, feature, or extra. @@ -1934,7 +1934,7 @@ def to_tabular_dataset(self, target_name: str, extra_names: list[str] | None = N Parameters ---------- target_name: - Name of the target column. + The name of the target column. extra_names: Names of the columns that are neither feature nor target. If None, no extra columns are used, i.e. all but the target column are used as features. @@ -1963,13 +1963,22 @@ def to_tabular_dataset(self, target_name: str, extra_names: list[str] | None = N ... ) >>> dataset = table.to_tabular_dataset(target_name="amount_bought", extra_names=["item"]) """ - return TabularDataset(self, target_name, extra_names) + from safeds.data.labeled.containers import TabularDataset # circular import + + return TabularDataset( + self, + target_name=target_name, + extra_names=extra_names, + ) def to_time_series_dataset( self, target_name: str, time_name: str, + window_size: int, + *, extra_names: list[str] | None = None, + forecast_horizon: int = 1, ) -> TimeSeriesDataset: """ Return a new `TimeSeriesDataset` with columns marked as a target column, time or feature columns. @@ -1979,12 +1988,16 @@ def to_time_series_dataset( Parameters ---------- target_name: - Name of the target column. + The name of the target column. time_name: - Name of the time column. + The name of the time column. + window_size: + The number of consecutive samples to use as input for prediction. extra_names: Names of the columns that are neither features nor target. If None, no extra columns are used, i.e. all but the target column are used as features. + forecast_horizon: + The number of time steps to predict into the future. 
Returns ------- @@ -2002,11 +2015,18 @@ def to_time_series_dataset( -------- >>> from safeds.data.tabular.containers import Table >>> table = Table({"day": [0, 1, 2], "price": [1.10, 1.19, 1.79], "amount_bought": [74, 72, 51]}) - >>> dataset = table.to_time_series_dataset(target_name="amount_bought", time_name= "day") + >>> dataset = table.to_time_series_dataset(target_name="amount_bought", time_name= "day", window_size=2) """ - from safeds.data.labeled.containers import TimeSeriesDataset + from safeds.data.labeled.containers import TimeSeriesDataset # circular import - return TimeSeriesDataset(self, target_name, time_name, extra_names) + return TimeSeriesDataset( + self, + target_name=target_name, + time_name=time_name, + window_size=window_size, + extra_names=extra_names, + forecast_horizon=forecast_horizon, + ) # ------------------------------------------------------------------------------------------------------------------ # Dataframe interchange protocol diff --git a/src/safeds/data/tabular/plotting/_table_plotter.py b/src/safeds/data/tabular/plotting/_table_plotter.py index 5526dda48..a2a22a301 100644 --- a/src/safeds/data/tabular/plotting/_table_plotter.py +++ b/src/safeds/data/tabular/plotting/_table_plotter.py @@ -91,7 +91,7 @@ def correlation_heatmap(self) -> Image: Examples -------- >>> from safeds.data.tabular.containers import Table - >>> table = Table.from_dict({"temperature": [10, 15, 20, 25, 30], "sales": [54, 74, 90, 206, 210]}) + >>> table = Table({"temperature": [10, 15, 20, 25, 30], "sales": [54, 74, 90, 206, 210]}) >>> image = table.plot.correlation_heatmap() """ # TODO: implement using matplotlib and polars diff --git a/src/safeds/ml/classical/regression/_arima.py b/src/safeds/ml/classical/regression/_arima.py index 7944b2571..117e4018b 100644 --- a/src/safeds/ml/classical/regression/_arima.py +++ b/src/safeds/ml/classical/regression/_arima.py @@ -151,6 +151,7 @@ def predict(self, time_series: TimeSeriesDataset) -> TimeSeriesDataset: 
target_name=time_series.target.name + " " + "forecasted", time_name=time_series.time.name, extra_names=time_series.extras.column_names, + window_size=1, ) def plot_predictions(self, test_series: TimeSeriesDataset) -> Image: diff --git a/src/safeds/ml/nn/_model.py b/src/safeds/ml/nn/_model.py index 6247b22a4..210f73e6a 100644 --- a/src/safeds/ml/nn/_model.py +++ b/src/safeds/ml/nn/_model.py @@ -299,7 +299,6 @@ def predict(self, test_data: IPT) -> IFT: return self._input_conversion._data_conversion_output( test_data, torch.cat(predictions, dim=0), - **self._input_conversion._get_output_configuration(), ) @property @@ -603,7 +602,6 @@ def predict(self, test_data: IPT) -> IFT: return self._input_conversion._data_conversion_output( test_data, torch.cat(predictions, dim=0), - **self._input_conversion._get_output_configuration(), ) @property diff --git a/src/safeds/ml/nn/converters/_input_converter.py b/src/safeds/ml/nn/converters/_input_converter.py index 595fbe688..1a6c2972e 100644 --- a/src/safeds/ml/nn/converters/_input_converter.py +++ b/src/safeds/ml/nn/converters/_input_converter.py @@ -1,7 +1,7 @@ from __future__ import annotations from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Any, Generic, TypeVar +from typing import TYPE_CHECKING, Generic, TypeVar from safeds.data.image.containers import ImageList from safeds.data.labeled.containers import ImageDataset, TabularDataset, TimeSeriesDataset @@ -37,13 +37,10 @@ def _data_conversion_fit( def _data_conversion_predict(self, input_data: PT, batch_size: int) -> DataLoader | _SingleSizeImageList: ... @abstractmethod - def _data_conversion_output(self, input_data: PT, output_data: Tensor, **kwargs: Any) -> FT: ... + def _data_conversion_output(self, input_data: PT, output_data: Tensor) -> FT: ... @abstractmethod def _is_fit_data_valid(self, input_data: FT) -> bool: ... @abstractmethod def _is_predict_data_valid(self, input_data: PT) -> bool: ... 
- - @abstractmethod - def _get_output_configuration(self) -> dict[str, Any]: ... diff --git a/src/safeds/ml/nn/converters/_input_converter_image.py b/src/safeds/ml/nn/converters/_input_converter_image.py index 795241e2b..7f319441e 100644 --- a/src/safeds/ml/nn/converters/_input_converter_image.py +++ b/src/safeds/ml/nn/converters/_input_converter_image.py @@ -2,7 +2,7 @@ import sys from abc import ABC -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING from safeds._utils import _structural_hash from safeds.data.image.containers import ImageList @@ -107,10 +107,3 @@ def _is_fit_data_valid(self, input_data: ImageDataset) -> bool: def _is_predict_data_valid(self, input_data: ImageList) -> bool: return isinstance(input_data, _SingleSizeImageList) and input_data.sizes[0] == self._input_size - - def _get_output_configuration(self) -> dict[str, Any]: - return { - "column_names": self._column_names, - "column_name": self._column_name, - "one_hot_encoder": self._one_hot_encoder, - } diff --git a/src/safeds/ml/nn/converters/_input_converter_image_to_column.py b/src/safeds/ml/nn/converters/_input_converter_image_to_column.py index b3fc6e95a..da03c03a0 100644 --- a/src/safeds/ml/nn/converters/_input_converter_image_to_column.py +++ b/src/safeds/ml/nn/converters/_input_converter_image_to_column.py @@ -1,13 +1,12 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING from safeds._config import _init_default_device from safeds.data.image.containers._single_size_image_list import _SingleSizeImageList from safeds.data.labeled.containers import ImageDataset from safeds.data.labeled.containers._image_dataset import _ColumnAsTensor from safeds.data.tabular.containers import Column -from safeds.data.tabular.transformation import OneHotEncoder from ._input_converter_image import _InputConversionImage @@ -15,6 +14,7 @@ from torch import Tensor from safeds.data.image.containers import ImageList + from 
safeds.data.tabular.transformation import OneHotEncoder class InputConversionImageToColumn(_InputConversionImage): @@ -22,7 +22,6 @@ def _data_conversion_output( self, input_data: ImageList, output_data: Tensor, - **kwargs: Any, ) -> ImageDataset[Column]: import torch @@ -30,16 +29,16 @@ def _data_conversion_output( if not isinstance(input_data, _SingleSizeImageList): raise ValueError("The given input ImageList contains images of different sizes.") # noqa: TRY004 - if "column_name" not in kwargs or not isinstance(kwargs.get("column_name"), str): + if self._column_name is None: raise ValueError( "The column_name is not set. The data can only be converted if the column_name is provided as `str` in the kwargs.", ) - if "one_hot_encoder" not in kwargs or not isinstance(kwargs.get("one_hot_encoder"), OneHotEncoder): + if self._one_hot_encoder is None: raise ValueError( "The one_hot_encoder is not set. The data can only be converted if the one_hot_encoder is provided as `OneHotEncoder` in the kwargs.", ) - one_hot_encoder: OneHotEncoder = kwargs["one_hot_encoder"] - column_name: str = kwargs["column_name"] + one_hot_encoder: OneHotEncoder = self._one_hot_encoder + column_name: str = self._column_name output = torch.zeros(len(input_data), len(one_hot_encoder._get_names_of_added_columns())) output[torch.arange(len(input_data)), output_data] = 1 diff --git a/src/safeds/ml/nn/converters/_input_converter_image_to_image.py b/src/safeds/ml/nn/converters/_input_converter_image_to_image.py index 59687cc43..4ba04faef 100644 --- a/src/safeds/ml/nn/converters/_input_converter_image_to_image.py +++ b/src/safeds/ml/nn/converters/_input_converter_image_to_image.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING from safeds._config import _init_default_device from safeds.data.image.containers import ImageList @@ -18,7 +18,6 @@ def _data_conversion_output( self, input_data: ImageList, output_data: Tensor, - 
**_kwargs: Any, ) -> ImageDataset[ImageList]: import torch diff --git a/src/safeds/ml/nn/converters/_input_converter_image_to_table.py b/src/safeds/ml/nn/converters/_input_converter_image_to_table.py index c8f8ef3e7..fe5ce6ad4 100644 --- a/src/safeds/ml/nn/converters/_input_converter_image_to_table.py +++ b/src/safeds/ml/nn/converters/_input_converter_image_to_table.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING from safeds._config import _init_default_device from safeds.data.image.containers._single_size_image_list import _SingleSizeImageList @@ -17,22 +17,18 @@ class InputConversionImageToTable(_InputConversionImage): - def _data_conversion_output(self, input_data: ImageList, output_data: Tensor, **kwargs: Any) -> ImageDataset[Table]: + def _data_conversion_output(self, input_data: ImageList, output_data: Tensor) -> ImageDataset[Table]: import torch _init_default_device() if not isinstance(input_data, _SingleSizeImageList): raise ValueError("The given input ImageList contains images of different sizes.") # noqa: TRY004 - if ( - "column_names" not in kwargs - or not isinstance(kwargs.get("column_names"), list) - and all(isinstance(element, str) for element in kwargs["column_names"]) - ): + if self._column_names is None: raise ValueError( "The column_names are not set. 
The data can only be converted if the column_names are provided as `list[str]` in the kwargs.", ) - column_names: list[str] = kwargs["column_names"] + column_names: list[str] = self._column_names output = torch.zeros(len(input_data), len(column_names)) output[torch.arange(len(input_data)), output_data] = 1 diff --git a/src/safeds/ml/nn/converters/_input_converter_table.py b/src/safeds/ml/nn/converters/_input_converter_table.py index e8b912a94..153eace53 100644 --- a/src/safeds/ml/nn/converters/_input_converter_table.py +++ b/src/safeds/ml/nn/converters/_input_converter_table.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING from safeds.data.labeled.containers import TabularDataset from safeds.data.tabular.containers import Column, Table @@ -22,7 +22,6 @@ class InputConversionTable(InputConversion[TabularDataset, Table]): def __init__(self, *, prediction_name: str = "prediction") -> None: self._target_name = "" - self._time_name = "" self._feature_names: list[str] = [] self._first = True self._prediction_name = prediction_name # TODO: use target name, override existing column @@ -40,7 +39,7 @@ def _data_conversion_fit(self, input_data: TabularDataset, batch_size: int, num_ def _data_conversion_predict(self, input_data: Table, batch_size: int) -> DataLoader: return input_data._into_dataloader(batch_size) - def _data_conversion_output(self, input_data: Table, output_data: Tensor, **_kwargs: Any) -> TabularDataset: + def _data_conversion_output(self, input_data: Table, output_data: Tensor) -> TabularDataset: return input_data.add_columns([Column(self._prediction_name, output_data.tolist())]).to_tabular_dataset( self._prediction_name, ) @@ -54,6 +53,3 @@ def _is_fit_data_valid(self, input_data: TabularDataset) -> bool: def _is_predict_data_valid(self, input_data: Table) -> bool: return (sorted(input_data.column_names)).__eq__(sorted(self._feature_names)) - - def _get_output_configuration(self) 
-> dict[str, Any]: - return {} diff --git a/src/safeds/ml/nn/converters/_input_converter_time_series.py b/src/safeds/ml/nn/converters/_input_converter_time_series.py index 050a8cb1c..745c0622a 100644 --- a/src/safeds/ml/nn/converters/_input_converter_time_series.py +++ b/src/safeds/ml/nn/converters/_input_converter_time_series.py @@ -1,7 +1,7 @@ from __future__ import annotations import sys -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING from safeds._utils import _structural_hash from safeds.data.labeled.containers import TimeSeriesDataset @@ -15,26 +15,15 @@ class InputConversionTimeSeries(InputConversion[TimeSeriesDataset, TimeSeriesDataset]): - """ - The input conversion for a neural network, defines the input parameters for the neural network. - - Parameters - ---------- - window_size: - The size of the created windows - forecast_horizon: - The forecast horizon defines the future lag of the predicted values - """ + """The input conversion for a neural network, defines the input parameters for the neural network.""" def __init__( self, - window_size: int, - forecast_horizon: int, *, prediction_name: str = "prediction_nn", ) -> None: - self._window_size = window_size - self._forecast_horizon = forecast_horizon + self._window_size = 0 + self._forecast_horizon = 0 self._first = True self._target_name: str = "" self._time_name: str = "" @@ -84,20 +73,9 @@ def _data_conversion_output( self, input_data: TimeSeriesDataset, output_data: Tensor, - **kwargs: Any, ) -> TimeSeriesDataset: - if "window_size" not in kwargs or not isinstance(kwargs.get("window_size"), int): - raise ValueError( - "The window_size is not set. " - "The data can only be converted if the window_size is provided as `int` in the kwargs.", - ) - if "forecast_horizon" not in kwargs or not isinstance(kwargs.get("forecast_horizon"), int): - raise ValueError( - "The forecast_horizon is not set. 
" - "The data can only be converted if the forecast_horizon is provided as `int` in the kwargs.", - ) - window_size: int = kwargs["window_size"] - forecast_horizon: int = kwargs["forecast_horizon"] + window_size: int = self._window_size + forecast_horizon: int = self._forecast_horizon input_data_table = input_data.to_table() input_data_table = input_data_table.slice_rows(start=window_size + forecast_horizon) @@ -107,10 +85,13 @@ def _data_conversion_output( target_name=self._prediction_name, time_name=input_data.time.name, extra_names=input_data.extras.column_names, + window_size=window_size, ) def _is_fit_data_valid(self, input_data: TimeSeriesDataset) -> bool: if self._first: + self._window_size = input_data.window_size + self._forecast_horizon = input_data.forecast_horizon self._time_name = input_data.time.name self._feature_names = input_data.features.column_names self._target_name = input_data.target.name @@ -123,6 +104,3 @@ def _is_fit_data_valid(self, input_data: TimeSeriesDataset) -> bool: def _is_predict_data_valid(self, input_data: TimeSeriesDataset) -> bool: return self._is_fit_data_valid(input_data) - - def _get_output_configuration(self) -> dict[str, Any]: - return {"window_size": self._window_size, "forecast_horizon": self._forecast_horizon} diff --git a/tests/safeds/data/labeled/containers/_tabular_dataset/test_eq.py b/tests/safeds/data/labeled/containers/_tabular_dataset/test_eq.py index 4826bd846..e9f3236c2 100644 --- a/tests/safeds/data/labeled/containers/_tabular_dataset/test_eq.py +++ b/tests/safeds/data/labeled/containers/_tabular_dataset/test_eq.py @@ -19,13 +19,13 @@ True, ), ( - TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "b", ["c"]), - TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "c", ["b"]), + TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "b", extra_names=["c"]), + TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "c", extra_names=["b"]), False, ), ( - 
TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "b", ["c"]), - TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6], "d": [7, 8, 9]}, "b", ["d"]), + TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "b", extra_names=["c"]), + TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6], "d": [7, 8, 9]}, "b", extra_names=["d"]), False, ), ( @@ -39,8 +39,8 @@ False, ), ( - TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "b", ["c"]), - TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "b", ["a"]), + TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "b", extra_names=["c"]), + TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "b", extra_names=["a"]), False, ), ], diff --git a/tests/safeds/data/labeled/containers/_tabular_dataset/test_hash.py b/tests/safeds/data/labeled/containers/_tabular_dataset/test_hash.py index e86e5197f..c40d6c43a 100644 --- a/tests/safeds/data/labeled/containers/_tabular_dataset/test_hash.py +++ b/tests/safeds/data/labeled/containers/_tabular_dataset/test_hash.py @@ -32,12 +32,12 @@ def test_should_return_same_hash_for_equal_tabular_datasets(table1: TabularDatas ("table1", "table2"), [ ( - TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "b", ["c"]), - TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "c", ["b"]), + TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "b", extra_names=["c"]), + TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "c", extra_names=["b"]), ), ( - TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "b", ["c"]), - TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6], "d": [7, 8, 9]}, "b", ["d"]), + TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "b", extra_names=["c"]), + TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6], "d": [7, 8, 9]}, "b", extra_names=["d"]), ), ( TabularDataset( @@ -47,8 +47,8 @@ def test_should_return_same_hash_for_equal_tabular_datasets(table1: 
TabularDatas TabularDataset({"a": ["1", "2", "3"], "b": [4, 5, 6]}, "b"), ), ( - TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "b", ["c"]), - TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "b", ["a"]), + TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "b", extra_names=["c"]), + TabularDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "b", extra_names=["a"]), ), ], ids=[ diff --git a/tests/safeds/data/labeled/containers/_tabular_dataset/test_into_dataloader.py b/tests/safeds/data/labeled/containers/_tabular_dataset/test_into_dataloader.py index 512cc1b14..43329ec8f 100644 --- a/tests/safeds/data/labeled/containers/_tabular_dataset/test_into_dataloader.py +++ b/tests/safeds/data/labeled/containers/_tabular_dataset/test_into_dataloader.py @@ -33,7 +33,7 @@ def test_should_create_dataloader( device: Device, ) -> None: configure_test_with_device(device) - tabular_dataset = Table.from_dict(data).to_tabular_dataset(target_name, extra_names) + tabular_dataset = Table.from_dict(data).to_tabular_dataset(target_name, extra_names=extra_names) data_loader = tabular_dataset._into_dataloader_with_classes(1, 2) batch = next(iter(data_loader)) assert batch[0].device == _get_device() diff --git a/tests/safeds/data/labeled/containers/_tabular_dataset/test_sizeof.py b/tests/safeds/data/labeled/containers/_tabular_dataset/test_sizeof.py index a7097deec..1edcc1163 100644 --- a/tests/safeds/data/labeled/containers/_tabular_dataset/test_sizeof.py +++ b/tests/safeds/data/labeled/containers/_tabular_dataset/test_sizeof.py @@ -23,7 +23,7 @@ "target": [1, 3, 2], }, "target", - ["other"], + extra_names=["other"], ), ], ids=["normal", "table_with_extra_column"], diff --git a/tests/safeds/data/labeled/containers/_tabular_dataset/test_to_table.py b/tests/safeds/data/labeled/containers/_tabular_dataset/test_to_table.py index 71e9d5db8..17cb64b3c 100644 --- a/tests/safeds/data/labeled/containers/_tabular_dataset/test_to_table.py +++ 
b/tests/safeds/data/labeled/containers/_tabular_dataset/test_to_table.py @@ -32,7 +32,7 @@ "target": [1, 3, 2], }, "target", - ["other"], + extra_names=["other"], ), Table( { diff --git a/tests/safeds/data/labeled/containers/_time_series_dataset/test_eq.py b/tests/safeds/data/labeled/containers/_time_series_dataset/test_eq.py index d14a1b92d..754011bac 100644 --- a/tests/safeds/data/labeled/containers/_time_series_dataset/test_eq.py +++ b/tests/safeds/data/labeled/containers/_time_series_dataset/test_eq.py @@ -9,38 +9,74 @@ ("table1", "table2", "expected"), [ ( - TimeSeriesDataset({"a": [], "b": [], "c": []}, "b", "c"), - TimeSeriesDataset({"a": [], "b": [], "c": []}, "b", "c"), + TimeSeriesDataset({"a": [], "b": [], "c": []}, "b", "c", window_size=1), + TimeSeriesDataset({"a": [], "b": [], "c": []}, "b", "c", window_size=1), True, ), ( - TimeSeriesDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [4, 5, 6]}, "b", "c"), - TimeSeriesDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [4, 5, 6]}, "b", "c"), + TimeSeriesDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [4, 5, 6]}, "b", "c", window_size=1), + TimeSeriesDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [4, 5, 6]}, "b", "c", window_size=1), True, ), ( - TimeSeriesDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "b", "a", ["c"]), - TimeSeriesDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "c", "a", ["b"]), + TimeSeriesDataset( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, + "b", + "a", + window_size=1, + extra_names=["c"], + ), + TimeSeriesDataset( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, + "c", + "a", + window_size=1, + extra_names=["b"], + ), False, ), ( - TimeSeriesDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "b", "a", ["c"]), - TimeSeriesDataset({"a": [1, 2, 3], "b": [4, 5, 6], "d": [7, 8, 9]}, "b", "a", ["d"]), + TimeSeriesDataset( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, + "b", + "a", + window_size=1, + extra_names=["c"], + ), + TimeSeriesDataset( + {"a": 
[1, 2, 3], "b": [4, 5, 6], "d": [7, 8, 9]}, + "b", + "a", + window_size=1, + extra_names=["d"], + ), False, ), ( - TimeSeriesDataset({"a": [1, 2, 3], "b": [4, 5, 6]}, "b", "a"), - TimeSeriesDataset({"a": [1, 1, 3], "b": [4, 5, 6]}, "b", "a"), + TimeSeriesDataset({"a": [1, 2, 3], "b": [4, 5, 6]}, "b", "a", window_size=1), + TimeSeriesDataset({"a": [1, 1, 3], "b": [4, 5, 6]}, "b", "a", window_size=1), False, ), ( - TimeSeriesDataset({"a": [1, 2, 3], "b": [4, 5, 6]}, "b", "a"), - TimeSeriesDataset({"a": ["1", "2", "3"], "b": [4, 5, 6]}, "b", "a"), + TimeSeriesDataset({"a": [1, 2, 3], "b": [4, 5, 6]}, "b", "a", window_size=1), + TimeSeriesDataset({"a": ["1", "2", "3"], "b": [4, 5, 6]}, "b", "a", window_size=1), False, ), ( - TimeSeriesDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "b", "a", ["c"]), - TimeSeriesDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "b", "c", ["a"]), + TimeSeriesDataset( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, + "b", + "a", + window_size=1, + extra_names=["c"], + ), + TimeSeriesDataset( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, + "b", + "c", + window_size=1, + extra_names=["a"], + ), False, ), ], @@ -65,8 +101,14 @@ def test_should_return_whether_two_tabular_datasets_are_equal( @pytest.mark.parametrize( ("table", "other"), [ - (TimeSeriesDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0, 0, 0]}, "b", "c"), None), - (TimeSeriesDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0, 0, 0]}, "b", "c"), Table()), + ( + TimeSeriesDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0, 0, 0]}, "b", "c", window_size=1), + None, + ), + ( + TimeSeriesDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0, 0, 0]}, "b", "c", window_size=1), + Table(), + ), ], ids=[ "TabularDataset vs. 
None", diff --git a/tests/safeds/data/labeled/containers/_time_series_dataset/test_extras.py b/tests/safeds/data/labeled/containers/_time_series_dataset/test_extras.py index bd93075d6..16c21d6fe 100644 --- a/tests/safeds/data/labeled/containers/_time_series_dataset/test_extras.py +++ b/tests/safeds/data/labeled/containers/_time_series_dataset/test_extras.py @@ -16,6 +16,7 @@ }, target_name="T", time_name="C", + window_size=1, ), Table(), ), @@ -29,6 +30,7 @@ }, target_name="T", time_name="B", + window_size=1, extra_names=["A", "C"], ), Table({"A": [1, 4], "C": [3, 6]}), diff --git a/tests/safeds/data/labeled/containers/_time_series_dataset/test_features.py b/tests/safeds/data/labeled/containers/_time_series_dataset/test_features.py index dcc55c06c..6734ab0ee 100644 --- a/tests/safeds/data/labeled/containers/_time_series_dataset/test_features.py +++ b/tests/safeds/data/labeled/containers/_time_series_dataset/test_features.py @@ -16,6 +16,7 @@ }, target_name="T", time_name="C", + window_size=1, ), Table({"A": [1, 4], "B": [2, 5]}), ), @@ -30,6 +31,7 @@ }, target_name="T", time_name="time", + window_size=1, extra_names=["B"], ), Table({"A": [1, 4], "C": [3, 6]}), diff --git a/tests/safeds/data/labeled/containers/_time_series_dataset/test_hash.py b/tests/safeds/data/labeled/containers/_time_series_dataset/test_hash.py index 5df6d0170..d6f06e3b2 100644 --- a/tests/safeds/data/labeled/containers/_time_series_dataset/test_hash.py +++ b/tests/safeds/data/labeled/containers/_time_series_dataset/test_hash.py @@ -6,16 +6,16 @@ ("table1", "table2"), [ ( - TimeSeriesDataset({"a": [], "b": []}, "b", "a"), - TimeSeriesDataset({"a": [], "b": []}, "b", "a"), + TimeSeriesDataset({"a": [], "b": []}, "b", "a", window_size=1), + TimeSeriesDataset({"a": [], "b": []}, "b", "a", window_size=1), ), ( - TimeSeriesDataset({"a": [1, 2, 3], "b": [4, 5, 6]}, "b", "a"), - TimeSeriesDataset({"a": [1, 2, 3], "b": [4, 5, 6]}, "b", "a"), + TimeSeriesDataset({"a": [1, 2, 3], "b": [4, 5, 6]}, "b", 
"a", window_size=1), + TimeSeriesDataset({"a": [1, 2, 3], "b": [4, 5, 6]}, "b", "a", window_size=1), ), ( - TimeSeriesDataset({"a": [1, 2, 3], "b": [4, 5, 6]}, "b", "a"), - TimeSeriesDataset({"a": [1, 1, 3], "b": [4, 5, 6]}, "b", "a"), + TimeSeriesDataset({"a": [1, 2, 3], "b": [4, 5, 6]}, "b", "a", window_size=1), + TimeSeriesDataset({"a": [1, 1, 3], "b": [4, 5, 6]}, "b", "a", window_size=1), ), ], ids=[ @@ -35,20 +35,56 @@ def test_should_return_same_hash_for_equal_tabular_datasets( ("table1", "table2"), [ ( - TimeSeriesDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "b", "a", ["c"]), - TimeSeriesDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "c", "a", ["b"]), + TimeSeriesDataset( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, + "b", + "a", + window_size=1, + extra_names=["c"], + ), + TimeSeriesDataset( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, + "c", + "a", + window_size=1, + extra_names=["b"], + ), ), ( - TimeSeriesDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "b", "a", ["c"]), - TimeSeriesDataset({"a": [1, 2, 3], "b": [4, 5, 6], "d": [7, 8, 9]}, "b", "a", ["d"]), + TimeSeriesDataset( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, + "b", + "a", + window_size=1, + extra_names=["c"], + ), + TimeSeriesDataset( + {"a": [1, 2, 3], "b": [4, 5, 6], "d": [7, 8, 9]}, + "b", + "a", + window_size=1, + extra_names=["d"], + ), ), ( - TimeSeriesDataset({"a": [1, 2, 3], "b": [4, 5, 6]}, "b", "a"), - TimeSeriesDataset({"a": ["1", "2", "3"], "b": [4, 5, 6]}, "b", "a"), + TimeSeriesDataset({"a": [1, 2, 3], "b": [4, 5, 6]}, "b", "a", window_size=1), + TimeSeriesDataset({"a": ["1", "2", "3"], "b": [4, 5, 6]}, "b", "a", window_size=1), ), ( - TimeSeriesDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "b", "a", ["c"]), - TimeSeriesDataset({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, "b", "c", ["a"]), + TimeSeriesDataset( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, + "b", + "a", + window_size=1, + 
extra_names=["c"], + ), + TimeSeriesDataset( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, + "b", + "c", + window_size=1, + extra_names=["a"], + ), ), ], ids=[ diff --git a/tests/safeds/data/labeled/containers/_time_series_dataset/test_init.py b/tests/safeds/data/labeled/containers/_time_series_dataset/test_init.py index c4542f8ea..752ba44dc 100644 --- a/tests/safeds/data/labeled/containers/_time_series_dataset/test_init.py +++ b/tests/safeds/data/labeled/containers/_time_series_dataset/test_init.py @@ -131,7 +131,7 @@ def test_should_raise_error( error_msg: str | None, ) -> None: with pytest.raises(error, match=error_msg): - TimeSeriesDataset(data, target_name=target_name, time_name=time_name, extra_names=extra_names) + TimeSeriesDataset(data, target_name=target_name, time_name=time_name, window_size=1, extra_names=extra_names) @pytest.mark.parametrize( @@ -231,7 +231,13 @@ def test_should_create_a_tabular_dataset( time_name: str, extra_names: list[str] | None, ) -> None: - tabular_dataset = TimeSeriesDataset(data, target_name=target_name, time_name=time_name, extra_names=extra_names) + tabular_dataset = TimeSeriesDataset( + data, + target_name=target_name, + time_name=time_name, + window_size=1, + extra_names=extra_names, + ) if not isinstance(data, Table): data = Table(data) diff --git a/tests/safeds/data/labeled/containers/_time_series_dataset/test_into_dataloader.py b/tests/safeds/data/labeled/containers/_time_series_dataset/test_into_dataloader.py index 6437363ac..cbdbd0d3c 100644 --- a/tests/safeds/data/labeled/containers/_time_series_dataset/test_into_dataloader.py +++ b/tests/safeds/data/labeled/containers/_time_series_dataset/test_into_dataloader.py @@ -37,7 +37,12 @@ def test_should_create_dataloader( device: Device, ) -> None: configure_test_with_device(device) - tabular_dataset = Table.from_dict(data).to_time_series_dataset(target_name, time_name, extra_names) + tabular_dataset = Table.from_dict(data).to_time_series_dataset( + target_name, + 
time_name, + window_size=1, + extra_names=extra_names, + ) data_loader = tabular_dataset._into_dataloader_with_window(1, 1, 1) batch = next(iter(data_loader)) assert batch[0].device == _get_device() @@ -73,7 +78,12 @@ def test_should_create_dataloader_predict( device: Device, ) -> None: configure_test_with_device(device) - tabular_dataset = Table.from_dict(data).to_time_series_dataset(target_name, time_name, extra_names) + tabular_dataset = Table.from_dict(data).to_time_series_dataset( + target_name, + time_name, + window_size=1, + extra_names=extra_names, + ) data_loader = tabular_dataset._into_dataloader_with_window_predict(1, 1, 1) batch = next(iter(data_loader)) assert batch[0].device == _get_device() @@ -91,7 +101,7 @@ def test_should_create_dataloader_predict( "C": [3, 6], "T": [0, 1], }, - ).to_time_series_dataset("T", "B"), + ).to_time_series_dataset(target_name="T", time_name="B", window_size=1), 1, 2, ValueError, @@ -105,7 +115,7 @@ def test_should_create_dataloader_predict( "C": [3, 6], "T": [0, 1], }, - ).to_time_series_dataset("T", "B"), + ).to_time_series_dataset(target_name="T", time_name="B", window_size=1), 1, 0, OutOfBoundsError, @@ -119,7 +129,7 @@ def test_should_create_dataloader_predict( "C": [3, 6], "T": [0, 1], }, - ).to_time_series_dataset("T", "B"), + ).to_time_series_dataset(target_name="T", time_name="B", window_size=1), 0, 1, OutOfBoundsError, @@ -157,7 +167,7 @@ def test_should_create_dataloader_invalid( "C": [3, 6], "T": [0, 1], }, - ).to_time_series_dataset("T", "B"), + ).to_time_series_dataset(target_name="T", time_name="B", window_size=1), 1, 2, ValueError, @@ -171,7 +181,7 @@ def test_should_create_dataloader_invalid( "C": [3, 6], "T": [0, 1], }, - ).to_time_series_dataset("T", "B"), + ).to_time_series_dataset(target_name="T", time_name="B", window_size=1), 1, 0, OutOfBoundsError, @@ -185,7 +195,11 @@ def test_should_create_dataloader_invalid( "C": [3, 6], "T": [0, 1], }, - ).to_time_series_dataset("T", "B"), + 
).to_time_series_dataset( + target_name="T", + time_name="B", + window_size=1, + ), 0, 1, OutOfBoundsError, diff --git a/tests/safeds/data/labeled/containers/_time_series_dataset/test_repr_html.py b/tests/safeds/data/labeled/containers/_time_series_dataset/test_repr_html.py index e1c40de42..ac224cde6 100644 --- a/tests/safeds/data/labeled/containers/_time_series_dataset/test_repr_html.py +++ b/tests/safeds/data/labeled/containers/_time_series_dataset/test_repr_html.py @@ -7,7 +7,7 @@ @pytest.mark.parametrize( "tabular_dataset", [ - TimeSeriesDataset({"a": [1, 2], "b": [3, 4]}, target_name="b", time_name="a"), + TimeSeriesDataset({"a": [1, 2], "b": [3, 4]}, target_name="b", time_name="a", window_size=1), ], ids=[ "non-empty", @@ -21,7 +21,7 @@ def test_should_contain_tabular_dataset_element(tabular_dataset: TimeSeriesDatas @pytest.mark.parametrize( "tabular_dataset", [ - TimeSeriesDataset({"a": [1, 2], "b": [3, 4]}, target_name="b", time_name="a"), + TimeSeriesDataset({"a": [1, 2], "b": [3, 4]}, target_name="b", time_name="a", window_size=1), ], ids=[ "non-empty", @@ -35,7 +35,7 @@ def test_should_contain_th_element_for_each_column_name(tabular_dataset: TimeSer @pytest.mark.parametrize( "tabular_dataset", [ - TimeSeriesDataset({"a": [1, 2], "b": [3, 4]}, target_name="b", time_name="a"), + TimeSeriesDataset({"a": [1, 2], "b": [3, 4]}, target_name="b", time_name="a", window_size=1), ], ids=[ "non-empty", diff --git a/tests/safeds/data/labeled/containers/_time_series_dataset/test_sizeof.py b/tests/safeds/data/labeled/containers/_time_series_dataset/test_sizeof.py index 461f27a79..314da3782 100644 --- a/tests/safeds/data/labeled/containers/_time_series_dataset/test_sizeof.py +++ b/tests/safeds/data/labeled/containers/_time_series_dataset/test_sizeof.py @@ -16,6 +16,7 @@ }, "target", "time", + window_size=1, ), TimeSeriesDataset( { @@ -27,7 +28,8 @@ }, "target", "time", - ["other"], + window_size=1, + extra_names=["other"], ), ], ids=["normal", 
"table_with_extra_column"], diff --git a/tests/safeds/data/labeled/containers/_time_series_dataset/test_target.py b/tests/safeds/data/labeled/containers/_time_series_dataset/test_target.py index d4c189f71..019998a75 100644 --- a/tests/safeds/data/labeled/containers/_time_series_dataset/test_target.py +++ b/tests/safeds/data/labeled/containers/_time_series_dataset/test_target.py @@ -16,6 +16,7 @@ }, target_name="T", time_name="A", + window_size=1, ), Column("T", [0, 1]), ), diff --git a/tests/safeds/data/labeled/containers/_time_series_dataset/test_time.py b/tests/safeds/data/labeled/containers/_time_series_dataset/test_time.py index 57f1655e9..051c13878 100644 --- a/tests/safeds/data/labeled/containers/_time_series_dataset/test_time.py +++ b/tests/safeds/data/labeled/containers/_time_series_dataset/test_time.py @@ -16,6 +16,7 @@ }, target_name="T", time_name="A", + window_size=1, ), Column("A", [1, 4]), ), diff --git a/tests/safeds/data/labeled/containers/_time_series_dataset/test_to_table.py b/tests/safeds/data/labeled/containers/_time_series_dataset/test_to_table.py index acdc10da3..3d3f4644d 100644 --- a/tests/safeds/data/labeled/containers/_time_series_dataset/test_to_table.py +++ b/tests/safeds/data/labeled/containers/_time_series_dataset/test_to_table.py @@ -13,8 +13,9 @@ "feature_2": [6, 12, 9], "target": [1, 3, 2], }, - "target", - "feature_1", + target_name="target", + time_name="feature_1", + window_size=1, ), Table( { @@ -32,9 +33,10 @@ "other": [3, 9, 12], "target": [1, 3, 2], }, - "target", - "feature_1", - ["other"], + target_name="target", + time_name="feature_1", + window_size=1, + extra_names=["other"], ), Table( { diff --git a/tests/safeds/ml/classical/regression/test_arima_model.py b/tests/safeds/ml/classical/regression/test_arima_model.py index 5a317e07a..bdd4e4c64 100644 --- a/tests/safeds/ml/classical/regression/test_arima_model.py +++ b/tests/safeds/ml/classical/regression/test_arima_model.py @@ -22,8 +22,8 @@ def test_arima_model() -> None: 
) train_ts, test_ts = time_series.split_rows(0.8) model = ArimaModelRegressor() - trained_model = model.fit(train_ts.to_time_series_dataset("value", "date")) - trained_model.predict(test_ts.to_time_series_dataset("value", "date")) + trained_model = model.fit(train_ts.to_time_series_dataset("value", "date", window_size=1)) + trained_model.predict(test_ts.to_time_series_dataset("value", "date", window_size=1)) # suggest it ran through assert True @@ -33,6 +33,7 @@ def create_test_data() -> TimeSeriesDataset: {"time": [1, 2, 3, 4, 5, 6, 7, 8, 9], "value": [1, 2, 3, 4, 5, 6, 7, 8, 9]}, time_name="time", target_name="value", + window_size=1, ) @@ -45,6 +46,7 @@ def create_test_data_with_feature() -> TimeSeriesDataset: }, time_name="time", target_name="value", + window_size=1, ) @@ -89,7 +91,7 @@ def test_should_succeed_on_valid_data_plot() -> None: "feat2": [3, 6], "target": ["0", 1], }, - ).to_time_series_dataset(target_name="target", time_name="id"), + ).to_time_series_dataset(target_name="target", time_name="id", window_size=1), NonNumericColumnError, r"Tried to do a numerical operation on one or multiple non-numerical columns: \ntarget", ), @@ -101,7 +103,7 @@ def test_should_succeed_on_valid_data_plot() -> None: "feat2": [3, 6], "target": [None, 1], }, - ).to_time_series_dataset(target_name="target", time_name="id"), + ).to_time_series_dataset(target_name="target", time_name="id", window_size=1), MissingValuesColumnError, r"Tried to do an operation on one or multiple columns containing missing values: \ntarget\nYou can use the Imputer to replace the missing values based on different strategies.\nIf you want toremove the missing values entirely you can use the method `TimeSeries.remove_rows_with_missing_values`.", ), @@ -113,7 +115,7 @@ def test_should_succeed_on_valid_data_plot() -> None: "feat2": [], "target": [], }, - ).to_time_series_dataset(target_name="target", time_name="id"), + ).to_time_series_dataset(target_name="target", time_name="id", window_size=1), 
DatasetMissesDataError, r"Dataset contains no rows", ), diff --git a/tests/safeds/ml/nn/converters/test_input_converter_image_2.py b/tests/safeds/ml/nn/converters/test_input_converter_image_2.py index 5ea2f4828..191adc45e 100644 --- a/tests/safeds/ml/nn/converters/test_input_converter_image_2.py +++ b/tests/safeds/ml/nn/converters/test_input_converter_image_2.py @@ -6,7 +6,6 @@ from safeds.data.image.containers._single_size_image_list import _SingleSizeImageList from safeds.data.image.typing import ImageSize from safeds.data.tabular.containers import Table -from safeds.data.tabular.transformation import OneHotEncoder from safeds.ml.nn.converters import ( InputConversionImageToColumn, InputConversionImageToImage, @@ -17,26 +16,19 @@ class TestDataConversionImage: @pytest.mark.parametrize( - ("input_conversion", "kwargs"), + "input_conversion", [ - ( - InputConversionImageToColumn(ImageSize(1, 1, 1)), - {"column_name": "a", "one_hot_encoder": OneHotEncoder()}, - ), - (InputConversionImageToTable(ImageSize(1, 1, 1)), {"column_names": ["a"]}), - (InputConversionImageToImage(ImageSize(1, 1, 1)), {}), + InputConversionImageToColumn(ImageSize(1, 1, 1)), ], ) def test_should_raise_if_input_data_is_multi_size( self, input_conversion: _InputConversionImage, - kwargs: dict, ) -> None: with pytest.raises(ValueError, match=r"The given input ImageList contains images of different sizes."): input_conversion._data_conversion_output( input_data=_MultiSizeImageList(), output_data=torch.empty(1), - **kwargs, ) class TestEq: @@ -110,30 +102,6 @@ def test_should_size_be_greater_than_normal_object( assert sys.getsizeof(output_conversion_image) > sys.getsizeof(object()) -class TestInputConversionImageToColumn: - def test_should_raise_if_column_name_not_set(self) -> None: - with pytest.raises( - ValueError, - match=r"The column_name is not set. 
The data can only be converted if the column_name is provided as `str` in the kwargs.", - ): - InputConversionImageToColumn(ImageSize(1, 1, 1))._data_conversion_output( - input_data=_SingleSizeImageList(), - output_data=torch.empty(1), - one_hot_encoder=OneHotEncoder(), - ) - - def test_should_raise_if_one_hot_encoder_not_set(self) -> None: - with pytest.raises( - ValueError, - match=r"The one_hot_encoder is not set. The data can only be converted if the one_hot_encoder is provided as `OneHotEncoder` in the kwargs.", - ): - InputConversionImageToColumn(ImageSize(1, 1, 1))._data_conversion_output( - input_data=_SingleSizeImageList(), - output_data=torch.empty(1), - column_name="column_name", - ) - - class TestInputConversionImageToTable: def test_should_raise_if_column_names_not_set(self) -> None: with pytest.raises( diff --git a/tests/safeds/ml/nn/converters/test_input_converter_time_series.py b/tests/safeds/ml/nn/converters/test_input_converter_time_series.py index d0e46a8a9..a7c7cebbb 100644 --- a/tests/safeds/ml/nn/converters/test_input_converter_time_series.py +++ b/tests/safeds/ml/nn/converters/test_input_converter_time_series.py @@ -1,7 +1,6 @@ import sys import pytest -import torch from safeds.data.tabular.containers import Table from safeds.ml.nn import ( NeuralNetworkRegressor, @@ -16,12 +15,13 @@ def test_should_raise_if_is_fitted_is_set_correctly_lstm() -> None: model = NeuralNetworkRegressor( - InputConversionTimeSeries(1, 1, prediction_name="predicted"), + InputConversionTimeSeries(prediction_name="predicted"), [LSTMLayer(input_size=2, output_size=1)], ) ts = Table.from_dict({"target": [1, 1, 1, 1], "time": [0, 0, 0, 0], "feat": [0, 0, 0, 0]}).to_time_series_dataset( - "target", - "time", + target_name="target", + time_name="time", + window_size=1, ) assert not model.is_fitted model = model.fit(ts) @@ -29,48 +29,11 @@ def test_should_raise_if_is_fitted_is_set_correctly_lstm() -> None: assert model.is_fitted -def test_get_output_config() -> None: - 
test_val = {"window_size": 1, "forecast_horizon": 1} - it = InputConversionTimeSeries(1, 1) - di = it._get_output_configuration() - assert di == test_val - - -def test_output_conversion_time_series() -> None: - ot = InputConversionTimeSeries(1, 1) - - with pytest.raises( - ValueError, - match=r"The window_size is not set. The data can only be converted if the window_size is provided as `int` in the kwargs.", - ): - ot._data_conversion_output( - input_data=Table({"a": [1], "c": [1], "b": [1]}).to_time_series_dataset("a", "b"), - output_data=torch.Tensor([0]), - win=2, - kappa=3, - ) - - -def test_output_conversion_time_series_2() -> None: - ot = InputConversionTimeSeries(1, 1) - - with pytest.raises( - ValueError, - match=r"The forecast_horizon is not set. The data can only be converted if the forecast_horizon is provided as `int` in the kwargs.", - ): - ot._data_conversion_output( - input_data=Table({"a": [1], "c": [1], "b": [1]}).to_time_series_dataset("a", "b"), - output_data=torch.Tensor([0]), - window_size=2, - kappa=3, - ) - - class TestEq: @pytest.mark.parametrize( ("output_conversion_ts1", "output_conversion_ts2"), [ - (InputConversionTimeSeries(1, 1), InputConversionTimeSeries(1, 1)), + (InputConversionTimeSeries(), InputConversionTimeSeries()), ], ) def test_should_be_equal( @@ -84,12 +47,12 @@ def test_should_be_equal( ("output_conversion_ts1", "output_conversion_ts2"), [ ( - InputConversionTimeSeries(1, 1), + InputConversionTimeSeries(), Table(), ), ( - InputConversionTimeSeries(1, 1, prediction_name="2"), - InputConversionTimeSeries(1, 1, prediction_name="1"), + InputConversionTimeSeries(prediction_name="2"), + InputConversionTimeSeries(prediction_name="1"), ), ], ) @@ -105,7 +68,7 @@ class TestHash: @pytest.mark.parametrize( ("output_conversion_ts1", "output_conversion_ts2"), [ - (InputConversionTimeSeries(1, 1), InputConversionTimeSeries(1, 1)), + (InputConversionTimeSeries(), InputConversionTimeSeries()), ], ) def test_hash_should_be_equal( @@ -116,9 
+79,9 @@ def test_hash_should_be_equal( assert hash(output_conversion_ts1) == hash(output_conversion_ts2) def test_hash_should_not_be_equal(self) -> None: - output_conversion_ts1 = InputConversionTimeSeries(1, 1, prediction_name="1") - output_conversion_ts2 = InputConversionTimeSeries(1, 1, prediction_name="2") - output_conversion_ts3 = InputConversionTimeSeries(1, 1, prediction_name="3") + output_conversion_ts1 = InputConversionTimeSeries(prediction_name="1") + output_conversion_ts2 = InputConversionTimeSeries(prediction_name="2") + output_conversion_ts3 = InputConversionTimeSeries(prediction_name="3") assert hash(output_conversion_ts1) != hash(output_conversion_ts3) assert hash(output_conversion_ts2) != hash(output_conversion_ts1) assert hash(output_conversion_ts3) != hash(output_conversion_ts2) @@ -128,9 +91,9 @@ class TestSizeOf: @pytest.mark.parametrize( "output_conversion_ts", [ - InputConversionTimeSeries(1, 1, prediction_name="1"), - InputConversionTimeSeries(1, 1, prediction_name="2"), - InputConversionTimeSeries(1, 1, prediction_name="3"), + InputConversionTimeSeries(prediction_name="1"), + InputConversionTimeSeries(prediction_name="2"), + InputConversionTimeSeries(prediction_name="3"), ], ) def test_should_size_be_greater_than_normal_object( diff --git a/tests/safeds/ml/nn/test_lstm_workflow.py b/tests/safeds/ml/nn/test_lstm_workflow.py index f0d19a8ae..85e396222 100644 --- a/tests/safeds/ml/nn/test_lstm_workflow.py +++ b/tests/safeds/ml/nn/test_lstm_workflow.py @@ -29,11 +29,26 @@ def test_lstm_model(device: Device) -> None: train_table, test_table = table.split_rows(0.8) model = NeuralNetworkRegressor( - InputConversionTimeSeries(window_size=7, forecast_horizon=12, prediction_name="predicted"), + InputConversionTimeSeries(prediction_name="predicted"), [ForwardLayer(input_size=7, output_size=256), LSTMLayer(input_size=256, output_size=1)], ) - trained_model = model.fit(train_table.to_time_series_dataset("value", "date"), epoch_size=1) + trained_model = 
model.fit( + train_table.to_time_series_dataset( + "value", + "date", + window_size=7, + forecast_horizon=12, + ), + epoch_size=1, + ) - trained_model.predict(test_table.to_time_series_dataset("value", "date")) + trained_model.predict( + test_table.to_time_series_dataset( + "value", + "date", + window_size=7, + forecast_horizon=12, + ), + ) assert trained_model._model is not None assert trained_model._model.state_dict()["_pytorch_layers.0._layer.weight"].device == _get_device()