Skip to content

Commit

Permalink
feat: __repr__ method to Schema, Row, Column, SupervisedDataset & Tab…
Browse files Browse the repository at this point in the history
…le; summary() to Column & Table (#333)

Closes #319.

### Summary of Changes

- Added `__str__()`, `__repr__()` and `_ipython_display_()` methods to
the classes `Table`, `Column`, `Row`, `TableSchema`, and
`SupervisedDataset`.
- Added the method `summary()` to the `Table` class
- Changed the raised error in some of the ColumnStatistics methods from
`TypeError` to `NonNumericColumnError`
- Moved the methods `idness()` and `stability()` from the class `Column`
to `ColumnStatistics`
  - Refactored the test accordingly

### Testing Instructions

Create an instance of one of the classes mentioned above, then either pass it
to `print()` or display it in a Jupyter notebook (which uses
`_ipython_display_()`) to render it in a prettier way.

Co-authored-by: Marvin Walter <walter@zbmed.de>
Co-authored-by: GideonKoenig <GideonKoenig@users.noreply.github.com>
Co-authored-by: WinPlay02 <winplay02_gh@woberlaender.de>
Co-authored-by: SmiteDeluxe <smitedeluxe@gmail.com>
Co-authored-by: SmiteDeluxe <SmiteDeluxe@users.noreply.github.com>
  • Loading branch information
6 people authored Jan 27, 2023
1 parent 665686d commit 2c32eed
Show file tree
Hide file tree
Showing 17 changed files with 1,282 additions and 618 deletions.
1,432 changes: 888 additions & 544 deletions Runtime/safe-ds/poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions Runtime/safe-ds/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ python = "^3.10"
pandas = "^1.5.3"
scikit-learn = "^1.2.0"
seaborn = "^0.12.2"
ipython = "^8.8.0"
matplotlib = "^3.6.3"

[tool.poetry.dev-dependencies]
Expand Down
135 changes: 87 additions & 48 deletions Runtime/safe-ds/safe_ds/data/_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import numpy as np
import pandas as pd
from IPython.core.display_functions import DisplayHandle, display
from safe_ds.exceptions import (
ColumnLengthMismatchError,
ColumnSizeError,
Expand Down Expand Up @@ -73,24 +74,6 @@ def get_value(self, index: int) -> Any:

return self._data[index]

def idness(self) -> float:
    """
    Compute the idness of this column, i.e. the number of unique values divided by the number of rows.

    Returns
    -------
    idness: float
        The idness of the column

    Raises
    ------
    ColumnSizeError
        If this column is empty
    """
    row_count = self._data.size
    if row_count == 0:
        raise ColumnSizeError("> 0", "0")
    unique_count = self._data.nunique()
    return unique_count / row_count

@property
def statistics(self) -> ColumnStatistics:
    """
    Statistics accessor for this column.

    Returns
    -------
    statistics: ColumnStatistics
        A ColumnStatistics object wrapping this column. A new object is created on every access.
    """
    return ColumnStatistics(self)
Expand Down Expand Up @@ -204,25 +187,6 @@ def has_missing_values(self) -> bool:
or (isinstance(value, Number) and np.isnan(value))
)

def stability(self) -> float:
    """
    Compute the stability of this column.

    Stability is the number of occurrences of the mode value divided by the number of non-null values.

    Returns
    -------
    stability: float
        Stability of this column

    Raises
    ------
    ColumnSizeError
        If this column is empty
    """
    if self._data.size == 0:
        raise ColumnSizeError("> 0", "0")
    occurrences = self._data.value_counts()
    mode_value = self.statistics.mode()
    mode_count = occurrences[mode_value]
    non_null_count = self._data.count()
    return mode_count / non_null_count

def correlation_with(self, other_column: Column) -> float:
"""
Calculates Pearson correlation between this and another column, if both are numerical
Expand Down Expand Up @@ -270,6 +234,33 @@ def __eq__(self, other: object) -> bool:
def __hash__(self) -> int:
    """
    Compute a hash of this column from the values it holds.

    Returns
    -------
    hash: int
        A hash derived from the column's values. The column name and the index of the underlying
        series do not contribute to the hash.
    """
    # hash(self._data) cannot be used: pandas Series are mutable and raise TypeError when hashed.
    # Hash the individual values instead; index=False keeps the result independent of the row labels.
    # NOTE(review): assumes columns that compare equal via __eq__ hold identical values of the
    # same dtype - confirm against __eq__.
    value_hashes = pd.util.hash_pandas_object(self._data, index=False)
    return hash(tuple(value_hashes.tolist()))

def __str__(self) -> str:
    """
    Render this column as text.

    Returns
    -------
    text: str
        The string form of a one-column pandas DataFrame whose single column is labelled with this column's name.
    """
    frame = self._data.to_frame()
    frame.columns = [self.name]
    return str(frame)

def __repr__(self) -> str:
    """
    Build the representation of this column.

    Returns
    -------
    text: str
        The repr of a one-column pandas DataFrame whose single column is labelled with this column's name.
    """
    frame = self._data.to_frame()
    frame.columns = [self.name]
    return repr(frame)

def _ipython_display_(self) -> DisplayHandle | None:
    """
    Display this column as a one-column table in a Jupyter Notebook.

    Returns
    -------
    output: DisplayHandle | None
        The return value of IPython's `display`.
        NOTE(review): `display` only returns a DisplayHandle when a `display_id` is passed, which is
        not done here, so this is presumably always None - confirm against the IPython documentation.
    """
    tmp = self._data.to_frame()
    tmp.columns = [self.name]

    # Raise the pandas display limits to the frame's own shape while it is being displayed.
    with pd.option_context(
        "display.max_rows", tmp.shape[0], "display.max_columns", tmp.shape[1]
    ):
        return display(tmp)


class ColumnStatistics:
def __init__(self, column: Column):
Expand All @@ -286,11 +277,13 @@ def max(self) -> float:
Raises
------
TypeError
NonNumericColumnError
If the data contains non-numerical data.
"""
if not self.column._type.is_numeric():
raise TypeError("The column contains non numerical data.")
raise NonNumericColumnError(
f"{self.column.name} is of type {self.column._type}."
)
return self.column._data.max()

def min(self) -> float:
Expand All @@ -304,11 +297,13 @@ def min(self) -> float:
Raises
------
TypeError
NonNumericColumnError
If the data contains non-numerical data.
"""
if not self.column._type.is_numeric():
raise TypeError("The column contains non numerical data.")
raise NonNumericColumnError(
f"{self.column.name} is of type {self.column._type}."
)
return self.column._data.min()

def mean(self) -> float:
Expand All @@ -322,11 +317,13 @@ def mean(self) -> float:
Raises
------
TypeError
NonNumericColumnError
If the data contains non-numerical data.
"""
if not self.column._type.is_numeric():
raise TypeError("The column contains non numerical data.")
raise NonNumericColumnError(
f"{self.column.name} is of type {self.column._type}."
)
return self.column._data.mean()

def mode(self) -> Any:
Expand All @@ -351,11 +348,13 @@ def median(self) -> float:
Raises
------
TypeError
NonNumericColumnError
If the data contains non-numerical data.
"""
if not self.column._type.is_numeric():
raise TypeError("The column contains non numerical data.")
raise NonNumericColumnError(
f"{self.column.name} is of type {self.column._type}."
)
return self.column._data.median()

def sum(self) -> float:
Expand All @@ -369,7 +368,7 @@ def sum(self) -> float:
Raises
---
NonNumericalColumnError
NonNumericColumnError
If the data is non numerical
"""
Expand All @@ -391,7 +390,7 @@ def variance(self) -> float:
Raises
---
NonNumericalColumnError
NonNumericColumnError
If the data is non numerical
"""
Expand All @@ -414,7 +413,7 @@ def standard_deviation(self) -> float:
Raises
---
NonNumericalColumnError
NonNumericColumnError
If the data is non numerical
"""
Expand All @@ -423,3 +422,43 @@ def standard_deviation(self) -> float:
f"{self.column.name} is of type {self.column._type}."
)
return self.column._data.std()

def stability(self) -> float:
    """
    Compute the stability of the wrapped column.

    Stability is the number of occurrences of the mode value divided by the number of non-null values.

    Returns
    -------
    stability: float
        Stability of this column

    Raises
    ------
    ColumnSizeError
        If this column is empty
    """
    if self.column._data.size == 0:
        raise ColumnSizeError("> 0", "0")
    occurrences = self.column._data.value_counts()
    mode_value = self.column.statistics.mode()
    mode_count = occurrences[mode_value]
    non_null_count = self.column._data.count()
    return mode_count / non_null_count

def idness(self) -> float:
    """
    Compute the idness of the wrapped column, i.e. the number of unique values divided by the number of rows.

    Returns
    -------
    idness: float
        The idness of the column

    Raises
    ------
    ColumnSizeError
        If this column is empty
    """
    row_count = self.column._data.size
    if row_count == 0:
        raise ColumnSizeError("> 0", "0")
    unique_count = self.column._data.nunique()
    return unique_count / row_count
66 changes: 65 additions & 1 deletion Runtime/safe-ds/safe_ds/data/_row.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@
from typing import Any

import pandas as pd
from IPython.core.display_functions import DisplayHandle, display
from safe_ds.exceptions import UnknownColumnNameError

from ._column_type import ColumnType
from ._table_schema import TableSchema


Expand Down Expand Up @@ -35,7 +37,8 @@ def get_value(self, column_name: str) -> Any:

def has_column(self, column_name: str) -> bool:
"""
Returns if the row contains a given column
Alias for self.schema.has_column(column_name: str) -> bool.
Returns if the row contains a given column.
Parameters
----------
Expand All @@ -49,6 +52,40 @@ def has_column(self, column_name: str) -> bool:
"""
return self.schema.has_column(column_name)

def get_column_names(self) -> list[str]:
    """
    Return the column names stored in this row's schema.

    Shortcut for self.schema.get_column_names().

    Returns
    -------
    column_names: list[str]
        the column names
    """
    schema = self.schema
    return schema.get_column_names()

def get_type_of_column(self, column_name: str) -> ColumnType:
    """
    Return the type of the given column.

    Shortcut for self.schema.get_type_of_column(column_name).

    Parameters
    ----------
    column_name : str
        The name of the column you want the type of

    Returns
    -------
    type: ColumnType
        The type of the column

    Raises
    ------
    ColumnNameError
        If the specified target column name doesn't exist.
        NOTE(review): the error comes from the schema; this module only imports
        UnknownColumnNameError, so that is presumably the concrete type - confirm.
    """
    schema = self.schema
    return schema.get_type_of_column(column_name)

def __eq__(self, other: typing.Any) -> bool:
if not isinstance(other, Row):
return NotImplemented
Expand All @@ -58,3 +95,30 @@ def __eq__(self, other: typing.Any) -> bool:

def __hash__(self) -> int:
    """
    Compute a hash of this row from the values it holds.

    Returns
    -------
    hash: int
        A hash derived from the row's values. The index of the underlying series does not
        contribute to the hash.
    """
    # hash(self._data) cannot be used: pandas Series are mutable and raise TypeError when hashed.
    # Hash the individual values instead; index=False keeps the result independent of the labels.
    # NOTE(review): assumes rows that compare equal via __eq__ hold identical values of the
    # same dtype - confirm against __eq__.
    value_hashes = pd.util.hash_pandas_object(self._data, index=False)
    return hash(tuple(value_hashes.tolist()))

def __str__(self) -> str:
    """
    Render this row as text.

    Returns
    -------
    text: str
        The string form of a single-row pandas DataFrame whose columns are labelled with this row's column names.
    """
    frame = self._data.to_frame().transpose()
    frame.columns = self.get_column_names()
    return str(frame)

def __repr__(self) -> str:
    """
    Build the representation of this row.

    Returns
    -------
    text: str
        The repr of a single-row pandas DataFrame whose columns are labelled with this row's column names.
    """
    frame = self._data.to_frame().transpose()
    frame.columns = self.get_column_names()
    return repr(frame)

def _ipython_display_(self) -> DisplayHandle | None:
    """
    Display this row as a single-row table in a Jupyter Notebook.

    Returns
    -------
    output: DisplayHandle | None
        The return value of IPython's `display`.
        NOTE(review): `display` only returns a DisplayHandle when a `display_id` is passed, which is
        not done here, so this is presumably always None - confirm against the IPython documentation.
    """
    # The row's series becomes a one-column frame; transposing it yields a single row.
    tmp = self._data.to_frame().T
    tmp.columns = self.get_column_names()

    # Raise the pandas display limits to the frame's own shape while it is being displayed.
    with pd.option_context(
        "display.max_rows", tmp.shape[0], "display.max_columns", tmp.shape[1]
    ):
        return display(tmp)
26 changes: 26 additions & 0 deletions Runtime/safe-ds/safe_ds/data/_supervised_dataset.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from IPython.core.display_functions import DisplayHandle

from ._column import Column
from ._table import Table

Expand Down Expand Up @@ -26,3 +28,27 @@ def feature_vectors(self) -> Table:
@property
def target_values(self) -> Column:
    """
    The target column of this dataset.

    Returns
    -------
    target_values: Column
        The column stored in `self._y`.
    """
    return self._y

def __repr__(self) -> str:
    """
    Build the representation of this dataset.

    Returns
    -------
    text: str
        A header line naming the target column, followed by the repr of the object returned by
        adding the target column to the feature table.
    """
    combined = self._X.add_column(self._y)
    target_name = self._y.name
    return "Target Column is '" + target_name + "'\n" + repr(combined)

def __str__(self) -> str:
    """
    Render this dataset as text.

    Returns
    -------
    text: str
        A header line naming the target column, followed by the string form of the object returned by
        adding the target column to the feature table.
    """
    combined = self._X.add_column(self._y)
    target_name = self._y.name
    return "Target Column is '" + target_name + "'\n" + str(combined)

def _ipython_display_(self) -> DisplayHandle:
    """
    Display this SupervisedDataset in a Jupyter Notebook.

    Prints a header line naming the target column, then delegates to the `_ipython_display_`
    method of the object returned by adding the target column to the feature table.

    Returns
    -------
    output: DisplayHandle
        Whatever the delegated `_ipython_display_` call returns.
        NOTE(review): that method is not visible here; if it returns the result of IPython's
        `display` without a `display_id`, this is presumably None - confirm.
    """
    tmp = self._X.add_column(self._y)
    header_info = "Target Column is '" + self._y.name + "'\n"
    # header_info already ends with a newline, so print() leaves a blank line after the header.
    print(header_info)
    return tmp._ipython_display_()
Loading

0 comments on commit 2c32eed

Please sign in to comment.