Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: __repr__ method to Schema, Row, Column, SupervisedDataset & Table; summary() to Column & Table #333

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,432 changes: 888 additions & 544 deletions Runtime/safe-ds/poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions Runtime/safe-ds/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ python = "^3.10"
pandas = "^1.5.3"
scikit-learn = "^1.2.0"
seaborn = "^0.12.2"
ipython = "^8.8.0"
matplotlib = "^3.6.3"

[tool.poetry.dev-dependencies]
Expand Down
135 changes: 87 additions & 48 deletions Runtime/safe-ds/safe_ds/data/_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import numpy as np
import pandas as pd
from IPython.core.display_functions import DisplayHandle, display
from safe_ds.exceptions import (
ColumnLengthMismatchError,
ColumnSizeError,
Expand Down Expand Up @@ -73,24 +74,6 @@ def get_value(self, index: int) -> Any:

return self._data[index]

def idness(self) -> float:
    """
    Compute the idness of this column.

    The idness is the number of unique values divided by the number of rows.

    Returns
    -------
    idness: float
        The idness of the column

    Raises
    ------
    ColumnSizeError
        If this column is empty
    """
    values = self._data
    row_count = values.size
    # An empty column has no rows to divide by, so it is rejected up front.
    if row_count == 0:
        raise ColumnSizeError("> 0", "0")
    return values.nunique() / row_count

@property
def statistics(self) -> ColumnStatistics:
    """
    Provide a ColumnStatistics object built from this column.

    Returns
    -------
    statistics: ColumnStatistics
        A newly constructed ColumnStatistics wrapping this column
    """
    column_statistics = ColumnStatistics(self)
    return column_statistics
Expand Down Expand Up @@ -204,25 +187,6 @@ def has_missing_values(self) -> bool:
or (isinstance(value, Number) and np.isnan(value))
)

def stability(self) -> float:
    """
    Compute the stability of this column.

    The stability is the number of occurrences of the mode value divided by
    the number of non-null values.

    Returns
    -------
    stability: float
        Stability of this column

    Raises
    ------
    ColumnSizeError
        If this column is empty
    """
    values = self._data
    if values.size == 0:
        raise ColumnSizeError("> 0", "0")
    occurrences = values.value_counts()
    mode_occurrences = occurrences[self.statistics.mode()]
    # count() only counts non-null entries, unlike size.
    non_null_count = values.count()
    return mode_occurrences / non_null_count

def correlation_with(self, other_column: Column) -> float:
"""
Calculates Pearson correlation between this and another column, if both are numerical
Expand Down Expand Up @@ -270,6 +234,33 @@ def __eq__(self, other: object) -> bool:
def __hash__(self) -> int:
    """
    Compute a hash from the values stored in this column.

    The built-in ``hash()`` cannot be applied to the underlying data
    directly: pandas Series are mutable and raise TypeError when hashed.
    The per-element hashes produced by ``pd.util.hash_pandas_object`` are
    combined instead.

    Returns
    -------
    hash: int
        Hash of the column's values (the index is ignored)
    """
    # NOTE(review): assumes self._data is a pandas Series, as the sibling
    # methods' use of to_frame() suggests - confirm.
    # NOTE(review): __eq__ is not visible here; confirm that objects it
    # considers equal always hold equal values so the hashes stay consistent.
    element_hashes = pd.util.hash_pandas_object(self._data, index=False)
    return hash(tuple(element_hashes))

def __str__(self) -> str:
    """
    Build the string form of this column.

    Returns
    -------
    text: str
        String form of a single-column DataFrame labelled with the column name
    """
    frame = self._data.to_frame()
    # Label the single column with this column's name.
    frame.columns = [self.name]
    return str(frame)

def __repr__(self) -> str:
    """
    Build the repr of this column.

    Returns
    -------
    text: str
        Repr of a single-column DataFrame labelled with the column name
    """
    frame = self._data.to_frame()
    # Label the single column with this column's name.
    frame.columns = [self.name]
    return repr(frame)

def _ipython_display_(self) -> DisplayHandle:
    """
    Display this column as a single-column DataFrame, e.g. in Jupyter Notebooks.

    The pandas row and column display limits are temporarily set to the
    frame's own shape for the duration of the call.

    Returns
    -------
    output: DisplayHandle
        The value returned by IPython's ``display``
    """
    # NOTE(review): IPython's display() documents returning a handle only when
    # display_id is given - confirm the return annotation.
    frame = self._data.to_frame()
    frame.columns = [self.name]
    row_count, column_count = frame.shape

    with pd.option_context(
        "display.max_rows", row_count, "display.max_columns", column_count
    ):
        return display(frame)


class ColumnStatistics:
def __init__(self, column: Column):
Expand All @@ -286,11 +277,13 @@ def max(self) -> float:

Raises
------
TypeError
NonNumericColumnError
If the data contains non-numerical data.
"""
if not self.column._type.is_numeric():
raise TypeError("The column contains non numerical data.")
raise NonNumericColumnError(
f"{self.column.name} is of type {self.column._type}."
)
return self.column._data.max()

def min(self) -> float:
Expand All @@ -304,11 +297,13 @@ def min(self) -> float:

Raises
------
TypeError
NonNumericColumnError
If the data contains non-numerical data.
"""
if not self.column._type.is_numeric():
raise TypeError("The column contains non numerical data.")
raise NonNumericColumnError(
f"{self.column.name} is of type {self.column._type}."
)
return self.column._data.min()

def mean(self) -> float:
Expand All @@ -322,11 +317,13 @@ def mean(self) -> float:

Raises
------
TypeError
NonNumericColumnError
If the data contains non-numerical data.
"""
if not self.column._type.is_numeric():
raise TypeError("The column contains non numerical data.")
raise NonNumericColumnError(
f"{self.column.name} is of type {self.column._type}."
)
return self.column._data.mean()

def mode(self) -> Any:
Expand All @@ -351,11 +348,13 @@ def median(self) -> float:

Raises
------
TypeError
NonNumericColumnError
If the data contains non-numerical data.
"""
if not self.column._type.is_numeric():
raise TypeError("The column contains non numerical data.")
raise NonNumericColumnError(
f"{self.column.name} is of type {self.column._type}."
)
return self.column._data.median()

def sum(self) -> float:
Expand All @@ -369,7 +368,7 @@ def sum(self) -> float:

Raises
---
NonNumericalColumnError
NonNumericColumnError
If the data is non numerical

"""
Expand All @@ -391,7 +390,7 @@ def variance(self) -> float:

Raises
---
NonNumericalColumnError
NonNumericColumnError
If the data is non numerical

"""
Expand All @@ -414,7 +413,7 @@ def standard_deviation(self) -> float:

Raises
---
NonNumericalColumnError
NonNumericColumnError
If the data is non numerical

"""
Expand All @@ -423,3 +422,43 @@ def standard_deviation(self) -> float:
f"{self.column.name} is of type {self.column._type}."
)
return self.column._data.std()

def stability(self) -> float:
    """
    Calculate the stability of the wrapped column.

    The value is the number of occurrences of the mode value divided by the
    number of non-null values.

    Returns
    -------
    stability: float
        Stability of the column

    Raises
    ------
    ColumnSizeError
        If the column is empty
    """
    values = self.column._data
    if values.size == 0:
        raise ColumnSizeError("> 0", "0")
    # This object already is the column's statistics, so ask it for the mode
    # directly instead of building another one via self.column.statistics.
    mode_occurrences = values.value_counts()[self.mode()]
    # count() only counts non-null entries, unlike size.
    return mode_occurrences / values.count()

def idness(self) -> float:
    """
    Compute the idness of the wrapped column.

    The idness is the number of unique values divided by the number of rows.

    Returns
    -------
    idness: float
        The idness of the column

    Raises
    ------
    ColumnSizeError
        If the column is empty
    """
    values = self.column._data
    row_count = values.size
    # An empty column has no rows to divide by, so it is rejected up front.
    if row_count == 0:
        raise ColumnSizeError("> 0", "0")
    return values.nunique() / row_count
66 changes: 65 additions & 1 deletion Runtime/safe-ds/safe_ds/data/_row.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@
from typing import Any

import pandas as pd
from IPython.core.display_functions import DisplayHandle, display
from safe_ds.exceptions import UnknownColumnNameError

from ._column_type import ColumnType
from ._table_schema import TableSchema


Expand Down Expand Up @@ -35,7 +37,8 @@ def get_value(self, column_name: str) -> Any:

def has_column(self, column_name: str) -> bool:
"""
Returns if the row contains a given column
Alias for self.schema.has_column(column_name: str) -> bool.
Returns if the row contains a given column.

Parameters
----------
Expand All @@ -49,6 +52,40 @@ def has_column(self, column_name: str) -> bool:
"""
return self.schema.has_column(column_name)

def get_column_names(self) -> list[str]:
    """
    List the column names of this row.

    Delegates to self.schema.get_column_names().

    Returns
    -------
    column_names: list[str]
        the column names
    """
    column_names = self.schema.get_column_names()
    return column_names

def get_type_of_column(self, column_name: str) -> ColumnType:
    """
    Look up the type of the given column.

    Delegates to self.schema.get_type_of_column(column_name).

    Parameters
    ----------
    column_name : str
        The name of the column you want the type of

    Returns
    -------
    type: ColumnType
        The type of the column

    Raises
    ------
    ColumnNameError
        If the specified target column name doesn't exist
        (NOTE(review): this module imports UnknownColumnNameError, so that is
        presumably the exception actually raised by the schema - confirm)
    """
    column_type = self.schema.get_type_of_column(column_name)
    return column_type

def __eq__(self, other: typing.Any) -> bool:
if not isinstance(other, Row):
return NotImplemented
Expand All @@ -58,3 +95,30 @@ def __eq__(self, other: typing.Any) -> bool:

def __hash__(self) -> int:
    """
    Compute a hash from the values stored in this row.

    The built-in ``hash()`` cannot be applied to the underlying data
    directly: pandas Series are mutable and raise TypeError when hashed.
    The per-element hashes produced by ``pd.util.hash_pandas_object`` are
    combined instead.

    Returns
    -------
    hash: int
        Hash of the row's values (the index is ignored)
    """
    # NOTE(review): assumes self._data is a pandas Series, as the sibling
    # methods' use of to_frame() suggests - confirm.
    # NOTE(review): only the start of __eq__ is visible here; confirm that
    # rows it considers equal always hold equal values so the hashes agree.
    element_hashes = pd.util.hash_pandas_object(self._data, index=False)
    return hash(tuple(element_hashes))

def __str__(self) -> str:
    """
    Build the string form of this row.

    Returns
    -------
    text: str
        String form of a one-row DataFrame labelled with the row's column names
    """
    # Transpose so the values form one row instead of one column.
    frame = self._data.to_frame().T
    frame.columns = self.get_column_names()
    return str(frame)

def __repr__(self) -> str:
    """
    Build the repr of this row.

    Returns
    -------
    text: str
        Repr of a one-row DataFrame labelled with the row's column names
    """
    # Transpose so the values form one row instead of one column.
    frame = self._data.to_frame().T
    frame.columns = self.get_column_names()
    return repr(frame)

def _ipython_display_(self) -> DisplayHandle:
    """
    Display this row as a one-row DataFrame, e.g. in Jupyter Notebooks.

    The pandas row and column display limits are temporarily set to the
    frame's own shape for the duration of the call.

    Returns
    -------
    output: DisplayHandle
        The value returned by IPython's ``display``
    """
    # NOTE(review): IPython's display() documents returning a handle only when
    # display_id is given - confirm the return annotation.
    # Transpose so the values form one row instead of one column.
    frame = self._data.to_frame().T
    frame.columns = self.get_column_names()
    row_count, column_count = frame.shape

    with pd.option_context(
        "display.max_rows", row_count, "display.max_columns", column_count
    ):
        return display(frame)
26 changes: 26 additions & 0 deletions Runtime/safe-ds/safe_ds/data/_supervised_dataset.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from IPython.core.display_functions import DisplayHandle

from ._column import Column
from ._table import Table

Expand Down Expand Up @@ -26,3 +28,27 @@ def feature_vectors(self) -> Table:
@property
def target_values(self) -> Column:
return self._y

def __repr__(self) -> str:
    """
    Build the repr of this dataset.

    Returns
    -------
    text: str
        A header line naming the target column, followed by the repr of the
        features with the target column added
    """
    combined = self._X.add_column(self._y)
    header = "".join(("Target Column is '", self._y.name, "'\n"))
    return header + repr(combined)

def __str__(self) -> str:
    """
    Build the string form of this dataset.

    Returns
    -------
    text: str
        A header line naming the target column, followed by the string form
        of the features with the target column added
    """
    combined = self._X.add_column(self._y)
    header = "".join(("Target Column is '", self._y.name, "'\n"))
    return header + str(combined)

def _ipython_display_(self) -> DisplayHandle:
    """
    Display this dataset, e.g. in Jupyter Notebooks.

    Prints a header naming the target column, then delegates to the
    ``_ipython_display_`` of the features with the target column added.

    Returns
    -------
    output: DisplayHandle
        The value returned by the combined object's ``_ipython_display_``
    """
    combined = self._X.add_column(self._y)
    header = "".join(("Target Column is '", self._y.name, "'\n"))
    print(header)
    return combined._ipython_display_()
Loading