Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Added and improved errors and warnings in the table transformers #372

Merged
merged 8 commits into from
Jun 23, 2023
2 changes: 0 additions & 2 deletions src/safeds/data/tabular/containers/_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -627,10 +627,8 @@ def add_column(self, column: Column) -> Table:
------
DuplicateColumnNameError
If the new column already exists.

ColumnSizeError
If the size of the column does not match the amount of rows.

"""
if self.has_column(column.name):
raise DuplicateColumnNameError(column.name)
Expand Down
74 changes: 67 additions & 7 deletions src/safeds/data/tabular/transformation/_imputer.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import warnings
from typing import Any

import pandas as pd
Expand All @@ -8,7 +9,7 @@
from safeds.data.tabular.containers import Table
from safeds.data.tabular.transformation._table_transformer import TableTransformer
from safeds.data.tabular.typing import ImputerStrategy
from safeds.exceptions import TransformerNotFittedError, UnknownColumnNameError
from safeds.exceptions import NonNumericColumnError, TransformerNotFittedError, UnknownColumnNameError


class Imputer(TableTransformer):
Expand Down Expand Up @@ -75,7 +76,7 @@ def _augment_imputer(self, imputer: sk_SimpleImputer) -> None:
imputer.strategy = "median"

class Mode(ImputerStrategy):
"""An imputation strategy for imputing missing data with mode values."""
"""An imputation strategy for imputing missing data with mode values. The lowest value will be used if there are multiple values with the same highest count."""

def __str__(self) -> str:
return "Mode"
Expand Down Expand Up @@ -107,18 +108,59 @@ def fit(self, table: Table, column_names: list[str] | None) -> Imputer:
-------
fitted_transformer : TableTransformer
The fitted transformer.

Raises
------
UnknownColumnNameError
If column_names contain a column name that is missing in the table
ValueError
If the table contains 0 rows
NonNumericColumnError
If the strategy is set to either Mean or Median and the specified columns of the table contain non-numerical data
"""
if column_names is None:
column_names = table.column_names
else:
missing_columns = set(column_names) - set(table.column_names)
missing_columns = sorted(set(column_names) - set(table.column_names))
if len(missing_columns) > 0:
raise UnknownColumnNameError(list(missing_columns))
raise UnknownColumnNameError(missing_columns)

if table.number_of_rows == 0:
raise ValueError("The Imputer cannot be fitted because the table contains 0 rows")

if (isinstance(self._strategy, Imputer.Strategy.Mean | Imputer.Strategy.Median)) and table.keep_only_columns(
column_names,
).remove_columns_with_non_numerical_values().number_of_columns < len(
column_names,
):
raise NonNumericColumnError(
str(
sorted(
set(table.keep_only_columns(column_names).column_names)
- set(
table.keep_only_columns(column_names)
.remove_columns_with_non_numerical_values()
.column_names,
),
),
),
)
Comment on lines +128 to +147
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe raise the NonNumericColumnError before the ValueError, so that the order of the errors stays the same as in the other transformers.


if isinstance(self._strategy, Imputer.Strategy.Mode):
multiple_most_frequent = {}
for name in column_names:
if len(table.get_column(name).mode()) > 1:
raise IndexError("There are multiple most frequent values in a column given for the Imputer")
multiple_most_frequent[name] = table.get_column(name).mode()
if len(multiple_most_frequent) > 0:
warnings.warn(
(
"There are multiple most frequent values in a column given to the Imputer.\nThe lowest values"
" are being chosen in this cases. The following columns have multiple most frequent"
f" values:\n{multiple_most_frequent}"
),
UserWarning,
stacklevel=2,
)

wrapped_transformer = sk_SimpleImputer()
self._strategy._augment_imputer(wrapped_transformer)
Expand Down Expand Up @@ -151,15 +193,33 @@ def transform(self, table: Table) -> Table:
------
TransformerNotFittedError
If the transformer has not been fitted yet.
UnknownColumnNameError
If the input table does not contain all columns used to fit the transformer
ValueError
If the table contains 0 rows
"""
# Transformer has not been fitted yet
if self._wrapped_transformer is None or self._column_names is None:
raise TransformerNotFittedError

# Input table does not contain all columns used to fit the transformer
missing_columns = set(self._column_names) - set(table.column_names)
missing_columns = sorted(set(self._column_names) - set(table.column_names))
if len(missing_columns) > 0:
raise UnknownColumnNameError(list(missing_columns))
raise UnknownColumnNameError(missing_columns)

if table.number_of_rows == 0:
raise ValueError("The Imputer cannot transform the table because it contains 0 rows")

if table.keep_only_columns(self._column_names).remove_columns_with_missing_values().number_of_columns > 0:
warnings.warn(
(
"The columns"
f" {table.keep_only_columns(self._column_names).remove_columns_with_missing_values().column_names} have"
" no missing values, so the Imputer did not change these columns"
),
UserWarning,
stacklevel=2,
)

data = table._data.copy()
data[self._column_names] = pd.DataFrame(
Expand Down
70 changes: 65 additions & 5 deletions src/safeds/data/tabular/transformation/_label_encoder.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
from __future__ import annotations

import warnings

from sklearn.preprocessing import OrdinalEncoder as sk_OrdinalEncoder

from safeds.data.tabular.containers import Table
from safeds.data.tabular.transformation._table_transformer import (
InvertibleTableTransformer,
)
from safeds.exceptions import TransformerNotFittedError, UnknownColumnNameError
from safeds.exceptions import NonNumericColumnError, TransformerNotFittedError, UnknownColumnNameError


# noinspection PyProtectedMember
Expand Down Expand Up @@ -34,13 +36,35 @@ def fit(self, table: Table, column_names: list[str] | None) -> LabelEncoder:
-------
fitted_transformer : TableTransformer
The fitted transformer.

Raises
------
UnknownColumnNameError
If column_names contain a column name that is missing in the table
ValueError
If the table contains 0 rows
"""
if column_names is None:
column_names = table.column_names
else:
missing_columns = set(column_names) - set(table.column_names)
missing_columns = sorted(set(column_names) - set(table.column_names))
if len(missing_columns) > 0:
raise UnknownColumnNameError(list(missing_columns))
raise UnknownColumnNameError(missing_columns)

if table.number_of_rows == 0:
raise ValueError("The LabelEncoder cannot transform the table because it contains 0 rows")

if table.keep_only_columns(column_names).remove_columns_with_non_numerical_values().number_of_columns > 0:
warnings.warn(
(
"The columns"
f" {table.keep_only_columns(column_names).remove_columns_with_non_numerical_values().column_names} contain"
" numerical data. The LabelEncoder is designed to encode non-numerical values into numerical"
" values"
),
UserWarning,
stacklevel=2,
)

wrapped_transformer = sk_OrdinalEncoder()
wrapped_transformer.fit(table._data[column_names])
Expand Down Expand Up @@ -71,15 +95,22 @@ def transform(self, table: Table) -> Table:
------
TransformerNotFittedError
If the transformer has not been fitted yet.
UnknownColumnNameError
If the input table does not contain all columns used to fit the transformer
ValueError
If the table contains 0 rows
"""
# Transformer has not been fitted yet
if self._wrapped_transformer is None or self._column_names is None:
raise TransformerNotFittedError

# Input table does not contain all columns used to fit the transformer
missing_columns = set(self._column_names) - set(table.column_names)
missing_columns = sorted(set(self._column_names) - set(table.column_names))
if len(missing_columns) > 0:
raise UnknownColumnNameError(list(missing_columns))
raise UnknownColumnNameError(missing_columns)

if table.number_of_rows == 0:
raise ValueError("The LabelEncoder cannot transform the table because it contains 0 rows")

data = table._data.copy()
data.columns = table.column_names
Expand All @@ -106,11 +137,40 @@ def inverse_transform(self, transformed_table: Table) -> Table:
------
TransformerNotFittedError
If the transformer has not been fitted yet.
UnknownColumnNameError
If the input table does not contain all columns used to fit the transformer
NonNumericColumnError
If the specified columns of the input table contain non-numerical data
ValueError
If the table contains 0 rows
"""
# Transformer has not been fitted yet
if self._wrapped_transformer is None or self._column_names is None:
raise TransformerNotFittedError

missing_columns = sorted(set(self._column_names) - set(transformed_table.column_names))
if len(missing_columns) > 0:
raise UnknownColumnNameError(missing_columns)

if transformed_table.keep_only_columns(
self._column_names,
).remove_columns_with_non_numerical_values().number_of_columns < len(self._column_names):
raise NonNumericColumnError(
str(
sorted(
set(self._column_names)
- set(
transformed_table.keep_only_columns(self._column_names)
.remove_columns_with_non_numerical_values()
.column_names,
),
),
),
)

if transformed_table.number_of_rows == 0:
raise ValueError("The LabelEncoder cannot inverse transform the table because it contains 0 rows")

data = transformed_table._data.copy()
data.columns = transformed_table.column_names
data[self._column_names] = self._wrapped_transformer.inverse_transform(data[self._column_names])
Expand Down
Loading