Skip to content

Commit

Permalink
feat: remove OrdinalEncoder (#107)
Browse files Browse the repository at this point in the history
### Summary of Changes

The `OrdinalEncoder` was a bit of an outlier compared to the other
`Transformer` classes:

* It could only be applied to a single column instead of a list of
columns. Because of this, it was not possible to implement #61.
* Nothing was "learned" since the user had to specify the value order
explicitly. The `fit` step was completely unnecessary.

Therefore, I've removed the class `OrdinalEncoder`. Instead, the
`transform_column` method on a `Table` can be used. If we eventually find
this to be too cumbersome, we can implement a new method
`transform_column_into_ordered_labels` on `Table`.

---------

Co-authored-by: lars-reimann <lars-reimann@users.noreply.github.com>
  • Loading branch information
lars-reimann and lars-reimann authored Mar 28, 2023
1 parent fe68426 commit b92bba5
Show file tree
Hide file tree
Showing 7 changed files with 19 additions and 308 deletions.
75 changes: 19 additions & 56 deletions src/safeds/data/tabular/containers/_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,9 +109,7 @@ def from_json_file(path: str) -> Table:
except FileNotFoundError as exception:
raise FileNotFoundError(f'File "{path}" does not exist') from exception
except Exception as exception:
raise ValueError(
f'Could not read file from "{path}" as JSON'
) from exception
raise ValueError(f'Could not read file from "{path}" as JSON') from exception

@staticmethod
def from_columns(columns: list[Column]) -> Table:
Expand Down Expand Up @@ -143,9 +141,7 @@ def from_columns(columns: list[Column]) -> Table:
for column in columns:
if column._data.size != columns[0]._data.size:
raise ColumnLengthMismatchError(
"\n".join(
[f"{column.name}: {column._data.size}" for column in columns]
)
"\n".join([f"{column.name}: {column._data.size}" for column in columns])
)
dataframe[column.name] = column._data

Expand Down Expand Up @@ -193,9 +189,7 @@ def from_rows(rows: list[Row]) -> Table:
# ------------------------------------------------------------------------------------------------------------------

def __init__(self, data: typing.Iterable, schema: Optional[TableSchema] = None):
self._data: pd.Dataframe = (
data if isinstance(data, pd.DataFrame) else pd.DataFrame(data)
)
self._data: pd.Dataframe = data if isinstance(data, pd.DataFrame) else pd.DataFrame(data)
if schema is None:
if self.count_columns() == 0:
raise MissingSchemaError()
Expand Down Expand Up @@ -272,9 +266,7 @@ def get_column(self, column_name: str) -> Column:
if self._schema.has_column(column_name):
output_column = Column(
column_name,
self._data.iloc[
:, [self._schema._get_column_index_by_name(column_name)]
].squeeze(),
self._data.iloc[:, [self._schema._get_column_index_by_name(column_name)]].squeeze(),
self._schema.get_type_of_column(column_name),
)
return output_column
Expand Down Expand Up @@ -533,9 +525,7 @@ def add_rows(self, rows: Union[list[Row], Table]) -> Table:
for row in rows:
if self._schema != row.schema:
raise SchemaMismatchError()
result = pd.concat(
[result, *[row._data.to_frame().T for row in rows]]
).infer_objects()
result = pd.concat([result, *[row._data.to_frame().T for row in rows]]).infer_objects()
result.columns = self._schema.get_column_names()
return Table(result)

Expand Down Expand Up @@ -568,9 +558,7 @@ def drop_columns(self, column_names: list[str]) -> Table:
if len(invalid_columns) != 0:
raise UnknownColumnNameError(invalid_columns)
transformed_data = self._data.drop(labels=column_indices, axis="columns")
transformed_data.columns = list(
name for name in self._schema.get_column_names() if name not in column_names
)
transformed_data.columns = list(name for name in self._schema.get_column_names() if name not in column_names)
return Table(transformed_data)

def drop_columns_with_missing_values(self) -> Table:
Expand All @@ -582,9 +570,7 @@ def drop_columns_with_missing_values(self) -> Table:
table : Table
A table without the columns that contain missing values.
"""
return Table.from_columns(
[column for column in self.to_columns() if not column.has_missing_values()]
)
return Table.from_columns([column for column in self.to_columns() if not column.has_missing_values()])

def drop_columns_with_non_numerical_values(self) -> Table:
"""
Expand All @@ -596,9 +582,7 @@ def drop_columns_with_non_numerical_values(self) -> Table:
A table without the columns that contain non-numerical values.
"""
return Table.from_columns(
[column for column in self.to_columns() if column.type.is_numeric()]
)
return Table.from_columns([column for column in self.to_columns() if column.type.is_numeric()])

def drop_duplicate_rows(self) -> Table:
"""
Expand Down Expand Up @@ -642,9 +626,7 @@ def drop_rows_with_outliers(self) -> Table:
copy = self._data.copy(deep=True)

table_without_nonnumericals = self.drop_columns_with_non_numerical_values()
z_scores = np.absolute(
stats.zscore(table_without_nonnumericals._data, nan_policy="omit")
)
z_scores = np.absolute(stats.zscore(table_without_nonnumericals._data, nan_policy="omit"))
filter_ = ((z_scores < 3) | np.isnan(z_scores)).all(axis=1)

return Table(copy[filter_], self._schema)
Expand Down Expand Up @@ -699,9 +681,7 @@ def keep_only_columns(self, column_names: list[str]) -> Table:
if len(invalid_columns) != 0:
raise UnknownColumnNameError(invalid_columns)
transformed_data = self._data[column_indices]
transformed_data.columns = list(
name for name in self._schema.get_column_names() if name in column_names
)
transformed_data.columns = list(name for name in self._schema.get_column_names() if name in column_names)
return Table(transformed_data)

def rename_column(self, old_name: str, new_name: str) -> Table:
Expand Down Expand Up @@ -769,10 +749,7 @@ def replace_column(self, old_column_name: str, new_column: Column) -> Table:
if old_column_name not in self._schema.get_column_names():
raise UnknownColumnNameError([old_column_name])

if (
new_column.name in self._schema.get_column_names()
and new_column.name != old_column_name
):
if new_column.name in self._schema.get_column_names() and new_column.name != old_column_name:
raise DuplicateColumnNameError(new_column.name)

if self.count_rows() != new_column._data.size:
Expand Down Expand Up @@ -838,13 +815,7 @@ def slice(
if end is None:
end = self.count_rows()

if (
start < 0
or end < 0
or start >= self.count_rows()
or end > self.count_rows()
or end < start
):
if start < 0 or end < 0 or start >= self.count_rows() or end > self.count_rows() or end < start:
raise ValueError("the given index is out of bounds")

new_df = self._data.iloc[start:end:step]
Expand All @@ -853,9 +824,7 @@ def slice(

def sort_columns(
self,
comparator: Callable[[Column, Column], int] = lambda col1, col2: (
col1.name > col2.name
)
comparator: Callable[[Column, Column], int] = lambda col1, col2: (col1.name > col2.name)
- (col1.name < col2.name),
) -> Table:
"""
Expand Down Expand Up @@ -891,9 +860,9 @@ def sort_rows(self, comparator: Callable[[Row, Row], int]) -> Table:
The comparator is a function that takes two rows `row1` and `row2` and returns an integer:
* If `col1` should be ordered before `col2`, the function should return a negative number.
* If `col1` should be ordered after `col2`, the function should return a positive number.
* If the original order of `col1` and `col2` should be kept, the function should return 0.
* If `row1` should be ordered before `row2`, the function should return a negative number.
* If `row1` should be ordered after `row2`, the function should return a positive number.
* If the original order of `row1` and `row2` should be kept, the function should return 0.
Parameters
----------
Expand Down Expand Up @@ -933,9 +902,7 @@ def split(self, percentage_in_first: float) -> typing.Tuple[Table, Table]:
self.slice(round(percentage_in_first * self.count_rows())),
)

def transform_column(
self, name: str, transformer: Callable[[Row], typing.Any]
) -> Table:
def transform_column(self, name: str, transformer: Callable[[Row], typing.Any]) -> Table:
"""
Transform provided column by calling provided transformer.
Expand Down Expand Up @@ -1103,9 +1070,7 @@ def to_rows(self) -> list[Row]:
rows : list[Row]
List of rows.
"""
return [
Row(series_row, self._schema) for (_, series_row) in self._data.iterrows()
]
return [Row(series_row, self._schema) for (_, series_row) in self._data.iterrows()]

# ------------------------------------------------------------------------------------------------------------------
# Other
Expand All @@ -1123,7 +1088,5 @@ def _ipython_display_(self) -> DisplayHandle:
tmp = self._data.copy(deep=True)
tmp.columns = self.get_column_names()

with pd.option_context(
"display.max_rows", tmp.shape[0], "display.max_columns", tmp.shape[1]
):
with pd.option_context("display.max_rows", tmp.shape[0], "display.max_columns", tmp.shape[1]):
return display(tmp)
1 change: 0 additions & 1 deletion src/safeds/data/tabular/transformation/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from ._imputer import Imputer
from ._label_encoder import LabelEncoder
from ._one_hot_encoder import OneHotEncoder
from ._ordinal_encoder import OrdinalEncoder
144 changes: 0 additions & 144 deletions src/safeds/data/tabular/transformation/_ordinal_encoder.py

This file was deleted.

Empty file.

This file was deleted.

Loading

0 comments on commit b92bba5

Please sign in to comment.