Skip to content

Commit

Permalink
feat: store window size and forecast horizon in dataset (#794)
Browse files Browse the repository at this point in the history
### Summary of Changes

Window size and forecast horizon are no longer specified in the input
conversion but in the time series dataset.

The intuition is that a dataset describes
1. what to predict (target, forecast horizon),
2. with what (time, features, window size).

---------

Co-authored-by: megalinter-bot <129584137+megalinter-bot@users.noreply.github.com>
  • Loading branch information
lars-reimann and megalinter-bot authored May 21, 2024
1 parent e0cd47b commit f07bc5a
Show file tree
Hide file tree
Showing 38 changed files with 352 additions and 276 deletions.
4 changes: 2 additions & 2 deletions docs/tutorials/classification.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@
"source": [
"extra_names = [\"id\", \"name\", \"ticket\", \"cabin\", \"port_embarked\", \"age\", \"fare\"]\n",
"\n",
"train_tabular_dataset = transformed_table.to_tabular_dataset(\"survived\", extra_names)"
"train_tabular_dataset = transformed_table.to_tabular_dataset(\"survived\", extra_names=extra_names)"
],
"metadata": {
"collapsed": false
Expand Down Expand Up @@ -185,7 +185,7 @@
"encoder = OneHotEncoder().fit(test_table, [\"sex\"])\n",
"testing_table = encoder.transform(testing_table)\n",
"\n",
"test_tabular_dataset = testing_table.to_tabular_dataset(\"survived\", extra_names)\n",
"test_tabular_dataset = testing_table.to_tabular_dataset(\"survived\", extra_names=extra_names)\n",
"fitted_model.accuracy(test_tabular_dataset)\n"
],
"metadata": {
Expand Down
28 changes: 14 additions & 14 deletions docs/tutorials/regression.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@
"metadata": {
"collapsed": false
},
"outputs": [],
"execution_count": null
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
Expand All @@ -55,8 +55,8 @@
"metadata": {
"collapsed": false
},
"outputs": [],
"execution_count": null
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
Expand All @@ -70,13 +70,13 @@
"source": [
"extra_names = [\"id\"]\n",
"\n",
"train_tabular_dataset = train_table.to_tabular_dataset(\"price\", extra_names)\n"
"train_tabular_dataset = train_table.to_tabular_dataset(\"price\", extra_names=extra_names)\n"
],
"metadata": {
"collapsed": false
},
"outputs": [],
"execution_count": null
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
Expand All @@ -96,8 +96,8 @@
"metadata": {
"collapsed": false
},
"outputs": [],
"execution_count": null
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
Expand All @@ -120,8 +120,8 @@
"metadata": {
"collapsed": false
},
"outputs": [],
"execution_count": null
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
Expand All @@ -135,15 +135,15 @@
{
"cell_type": "code",
"source": [
"test_tabular_dataset = testing_table.to_tabular_dataset(\"price\", extra_names)\n",
"test_tabular_dataset = testing_table.to_tabular_dataset(\"price\", extra_names=extra_names)\n",
"\n",
"fitted_model.mean_absolute_error(test_tabular_dataset)\n"
],
"metadata": {
"collapsed": false
},
"outputs": [],
"execution_count": null
"execution_count": null,
"outputs": []
}
],
"metadata": {
Expand Down
6 changes: 5 additions & 1 deletion src/safeds/data/labeled/containers/_dataset.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
from __future__ import annotations

from abc import ABC, abstractmethod
from typing import Generic, TypeVar

In_co = TypeVar("In_co", covariant=True)
Out_co = TypeVar("Out_co", covariant=True)

class Dataset(ABC):

class Dataset(Generic[In_co, Out_co], ABC):
"""A dataset is used as input to machine learning models."""

# ------------------------------------------------------------------------------------------------------------------
Expand Down
14 changes: 7 additions & 7 deletions src/safeds/data/labeled/containers/_image_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import copy
import sys
import warnings
from typing import TYPE_CHECKING, Generic, TypeVar
from typing import TYPE_CHECKING, TypeVar

from safeds._config import _get_device, _init_default_device
from safeds._utils import _structural_hash
Expand All @@ -27,10 +27,10 @@
if TYPE_CHECKING:
from torch import Tensor

T = TypeVar("T", Column, Table, ImageList)
Out_co = TypeVar("Out_co", Column, ImageList, Table, covariant=True)


class ImageDataset(Generic[T], Dataset):
class ImageDataset(Dataset[ImageList, Out_co]):
"""
A Dataset for ImageLists as input and ImageLists, Tables or Columns as output.
Expand All @@ -46,7 +46,7 @@ class ImageDataset(Generic[T], Dataset):
whether the data should be shuffled after each epoch of training
"""

def __init__(self, input_data: ImageList, output_data: T, batch_size: int = 1, shuffle: bool = False) -> None:
def __init__(self, input_data: ImageList, output_data: Out_co, batch_size: int = 1, shuffle: bool = False) -> None:
import torch

_init_default_device()
Expand Down Expand Up @@ -207,7 +207,7 @@ def get_input(self) -> ImageList:
"""
return self._sort_image_list_with_shuffle_tensor_indices(self._input)

def get_output(self) -> T:
def get_output(self) -> Out_co:
"""
Get the output data of this dataset.
Expand Down Expand Up @@ -280,7 +280,7 @@ def _get_batch(self, batch_number: int, batch_size: int | None = None) -> tuple[
output_tensor = self._output._tensor[self._shuffle_tensor_indices[batch_size * batch_number : max_index]]
return input_tensor, output_tensor

def shuffle(self) -> ImageDataset[T]:
def shuffle(self) -> ImageDataset[Out_co]:
"""
Return a new `ImageDataset` with shuffled data.
Expand All @@ -295,7 +295,7 @@ def shuffle(self) -> ImageDataset[T]:

_init_default_device()

im_dataset: ImageDataset[T] = copy.copy(self)
im_dataset: ImageDataset[Out_co] = copy.copy(self)
im_dataset._shuffle_tensor_indices = torch.randperm(len(self))
im_dataset._next_batch_index = 0
return im_dataset
Expand Down
8 changes: 4 additions & 4 deletions src/safeds/data/labeled/containers/_tabular_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from safeds._config import _get_device, _init_default_device
from safeds._utils import _structural_hash
from safeds.data.tabular.containers import Column, Table

from ._dataset import Dataset

Expand All @@ -15,10 +16,8 @@
from torch.utils.data import DataLoader
from torch.utils.data import Dataset as TorchDataset

from safeds.data.tabular.containers import Column, Table


class TabularDataset(Dataset):
class TabularDataset(Dataset[Table, Column]):
"""
A dataset containing tabular data. It can be used to train machine learning models.
Expand All @@ -37,7 +36,7 @@ class TabularDataset(Dataset):
data:
The data.
target_name:
Name of the target column.
The name of the target column.
extra_names:
Names of the columns that are neither features nor target. If None, no extra columns are used, i.e. all but
the target column are used as features.
Expand Down Expand Up @@ -72,6 +71,7 @@ def __init__(
self,
data: Table | Mapping[str, Sequence[Any]],
target_name: str,
*,
extra_names: list[str] | None = None,
):
from safeds.data.tabular.containers import Table
Expand Down
48 changes: 40 additions & 8 deletions src/safeds/data/labeled/containers/_time_series_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
from safeds._config import _get_device, _init_default_device
from safeds._utils import _structural_hash
from safeds._validation import _check_bounds, _ClosedBound
from safeds.data.tabular.containers import Column, Table

from ._dataset import Dataset

if TYPE_CHECKING:
from collections.abc import Mapping, Sequence
Expand All @@ -14,10 +17,8 @@
from torch.utils.data import DataLoader
from torch.utils.data import Dataset as TorchDataset

from safeds.data.tabular.containers import Column, Table


class TimeSeriesDataset:
class TimeSeriesDataset(Dataset[Table, Column]):
"""
A time series dataset maps feature and time columns to a target column.
Expand All @@ -28,12 +29,16 @@ class TimeSeriesDataset:
data:
The data.
target_name:
Name of the target column.
The name of the target column.
time_name:
Name of the time column.
The name of the time column.
window_size:
The number of consecutive samples to use as input for prediction.
extra_names:
Names of the columns that are neither features nor target. If None, no extra columns are used, i.e. all but
the target column are used as features.
forecast_horizon:
The number of time steps to predict into the future.
Raises
------
Expand All @@ -51,7 +56,8 @@ class TimeSeriesDataset:
... {"id": [1, 2, 3], "feature": [4, 5, 6], "target": [1, 2, 3], "error":[0,0,1]},
... target_name="target",
... time_name = "id",
... extra_names=["error"]
... window_size=1,
... extra_names=["error"],
... )
"""

Expand All @@ -63,7 +69,10 @@ def __init__(
data: Table | Mapping[str, Sequence[Any]],
target_name: str,
time_name: str,
window_size: int,
*,
extra_names: list[str] | None = None,
forecast_horizon: int = 1,
):
from safeds.data.tabular.containers import Table

Expand All @@ -90,6 +99,8 @@ def __init__(
self._features: Table = data.remove_columns_except(feature_names)
self._target: Column = data.get_column(target_name)
self._time: Column = data.get_column(time_name)
self._window_size: int = window_size
self._forecast_horizon: int = forecast_horizon
self._extras: Table = data.remove_columns_except(extra_names)

def __eq__(self, other: object) -> bool:
Expand All @@ -104,7 +115,9 @@ def __eq__(self, other: object) -> bool:
if not isinstance(other, TimeSeriesDataset):
return NotImplemented
return (self is other) or (
self.target == other.target
self._window_size == other._window_size
and self._forecast_horizon == other._forecast_horizon
and self.target == other.target
and self.features == other.features
and self.extras == other.extras
and self.time == other.time
Expand All @@ -119,7 +132,14 @@ def __hash__(self) -> int:
hash:
The hash value.
"""
return _structural_hash(self.target, self.features, self.extras, self.time)
return _structural_hash(
self.target,
self.features,
self.extras,
self.time,
self._window_size,
self._forecast_horizon,
)

def __sizeof__(self) -> int:
"""
Expand All @@ -135,6 +155,8 @@ def __sizeof__(self) -> int:
+ sys.getsizeof(self._features)
+ sys.getsizeof(self.extras)
+ sys.getsizeof(self._time)
+ sys.getsizeof(self._window_size)
+ sys.getsizeof(self._forecast_horizon)
)

# ------------------------------------------------------------------------------------------------------------------
Expand All @@ -156,6 +178,16 @@ def time(self) -> Column:
"""The time column of the time series dataset."""
return self._time

@property
def window_size(self) -> int:
"""The number of consecutive samples to use as input for prediction."""
return self._window_size

@property
def forecast_horizon(self) -> int:
"""The number of time steps to predict into the future."""
return self._forecast_horizon

@property
def extras(self) -> Table:
"""
Expand Down
11 changes: 9 additions & 2 deletions src/safeds/data/tabular/containers/_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -565,7 +565,7 @@ def rename(self, new_name: str) -> Column[T_co]:
"""
Return a new column with a new name.
The original column is not modified.
**Note:** The original column is not modified.
Parameters
----------
Expand Down Expand Up @@ -601,7 +601,7 @@ def transform(
"""
Return a new column with values transformed by the transformer.
The original column is not modified.
**Note:** The original column is not modified.
Parameters
----------
Expand Down Expand Up @@ -968,6 +968,13 @@ def missing_value_ratio(self) -> float:
-------
missing_value_ratio:
The ratio of missing values in the column.
Examples
--------
>>> from safeds.data.tabular.containers import Column
>>> column = Column("test", [1, None, 3, None])
>>> column.missing_value_ratio()
0.5
"""
if self.row_count == 0:
return 1.0 # All values are missing (since there are none)
Expand Down
Loading

0 comments on commit f07bc5a

Please sign in to comment.