FEA/MAINT Add column-wise transforms & refactor TableVectorizer (#902)
Co-authored-by: Théo Jolivet <57430673+TheooJ@users.noreply.github.com>
jeromedockes and TheooJ authored May 28, 2024
1 parent 42b5f90 commit 5b30ddd
Showing 48 changed files with 5,220 additions and 2,517 deletions.
8 changes: 8 additions & 0 deletions CHANGES.rst
@@ -14,6 +14,14 @@ It is currently undergoing fast development and backward compatibility is not ensured.

Major changes
-------------
* The :class:`TableVectorizer` now consistently applies the same transformation
  across different calls to `transform`. There have also been some breaking
  changes to its functionality: (i) all transformations are now applied
  independently to each column, i.e. it no longer performs multivariate
  transformations; (ii) in ``specific_transformers`` the same column may not be
  used twice (i.e. passed through 2 different transformers; see the sketch below).
  :pr:`902` by :user:`Jérôme Dockès <jeromedockes>`.

* Added the :class:`MultiAggJoiner` that allows augmenting a main table with
  multiple auxiliary tables. :pr:`876` by :user:`Théo Jolivet <TheooJ>`.

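A minimal sketch of the column-wise behaviour described in the changelog entry above. The sample data, the column names, and the use of scikit-learn's OrdinalEncoder are illustrative assumptions, not part of this commit; only the ``specific_transformers`` parameter and the per-column application come from the entry itself.

import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from skrub import TableVectorizer

df = pd.DataFrame(
    {"city": ["Paris", "London", "Paris"], "temperature": [12.5, 10.0, 14.0]}
)

# Every column is transformed independently; a column listed in
# specific_transformers may appear in only one (transformer, columns) entry.
vectorizer = TableVectorizer(specific_transformers=[(OrdinalEncoder(), ["city"])])
out = vectorizer.fit_transform(df)
print(out.columns)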
11 changes: 10 additions & 1 deletion doc/api.rst
@@ -83,6 +83,7 @@ This page lists all available functions and classes of `skrub`.
GapEncoder
MinHashEncoder
SimilarityEncoder
ToCategorical

.. raw:: html

@@ -98,10 +99,18 @@ This page lists all available functions and classes of `skrub`.

.. autosummary::
:toctree: generated/
:template: function.rst
:template: class.rst
:nosignatures:
:caption: Converting datetime columns in a table

ToDatetime


.. autosummary::
:toctree: generated/
:template: function.rst
:nosignatures:

to_datetime

.. raw:: html
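The newly documented ``ToDatetime`` class and ``to_datetime`` function convert string columns to datetimes. A rough usage sketch, assuming a pandas DataFrame with an ISO-formatted "date" column (the data is made up for illustration):

import pandas as pd
from skrub import ToDatetime, to_datetime

df = pd.DataFrame({"date": ["2024-05-27", "2024-05-28"], "value": [1, 2]})

# to_datetime converts the columns of the dataframe that can be parsed as dates
converted = to_datetime(df)
print(converted.dtypes)

# ToDatetime is the transformer counterpart; in line with the column-wise
# refactor it is applied to a single column here (an assumption, not verified).
date_col = ToDatetime().fit_transform(df["date"])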
226 changes: 168 additions & 58 deletions examples/01_encodings.py

Large diffs are not rendered by default.

10 changes: 5 additions & 5 deletions examples/03_datetime_encoder.py
@@ -88,12 +88,12 @@

encoder = make_column_transformer(
(OneHotEncoder(handle_unknown="ignore"), ["city"]),
(DatetimeEncoder(add_day_of_the_week=True, resolution="minute"), ["date.utc"]),
(DatetimeEncoder(add_weekday=True, resolution="minute"), "date.utc"),
remainder="drop",
)

X_enc = encoder.fit_transform(X)
pprint(encoder.get_feature_names_out())
# pprint(encoder.get_feature_names_out())

###############################################################################
# We see that the encoder is working as expected: the ``"date.utc"`` column has
@@ -119,7 +119,7 @@
# Here, for example, we want it to extract the day of the week.

table_vec = TableVectorizer(
datetime_transformer=DatetimeEncoder(add_day_of_the_week=True),
datetime_transformer=DatetimeEncoder(add_weekday=True),
).fit(X)
pprint(table_vec.get_feature_names_out())

@@ -257,7 +257,7 @@
from sklearn.inspection import permutation_importance

table_vec = TableVectorizer(
datetime_transformer=DatetimeEncoder(add_day_of_the_week=True),
datetime_transformer=DatetimeEncoder(add_weekday=True),
)

# In this case, we don't use a pipeline, because we want to compute the
@@ -280,8 +280,8 @@
y="importances", x="feature_names", title="Feature Importances", figsize=(12, 9)
)
plt.tight_layout()
plt.show()

###############################################################################
# We can see that the total seconds since Epoch and the hour of the day
# are the most important features, which seems reasonable.
#
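As the diff above shows, the ``add_day_of_the_week`` flag has been renamed to ``add_weekday``. A compact sketch of the renamed flag, assuming a pandas DataFrame with a single datetime column named "date" (the data is illustrative):

import pandas as pd
from skrub import DatetimeEncoder, TableVectorizer

X = pd.DataFrame({"date": pd.to_datetime(["2024-05-27 10:00", "2024-05-28 15:30"])})

# add_weekday=True adds a weekday feature (named "date_weekday" here, matching
# the "*_weekday" naming visible elsewhere in this diff)
table_vec = TableVectorizer(
    datetime_transformer=DatetimeEncoder(add_weekday=True),
).fit(X)
print(table_vec.get_feature_names_out())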
22 changes: 11 additions & 11 deletions examples/08_join_aggregation.py
@@ -88,9 +88,8 @@


table_vectorizer = TableVectorizer(
datetime_transformer=DatetimeEncoder(add_day_of_the_week=True)
datetime_transformer=DatetimeEncoder(add_weekday=True)
)
table_vectorizer.set_output(transform="pandas")
X_date_encoded = table_vectorizer.fit_transform(X)
X_date_encoded.head()

@@ -103,19 +102,19 @@


def make_barplot(x, y, title):
fig, ax = plt.subplots(layout="constrained")
norm = plt.Normalize(y.min(), y.max())
cmap = plt.get_cmap("magma")

sns.barplot(x=x, y=y, palette=cmap(norm(y)))
plt.title(title)
plt.xticks(rotation=30)
plt.ylabel(None)
plt.tight_layout()
sns.barplot(x=x, y=y, palette=cmap(norm(y)), ax=ax)
ax.set_title(title)
ax.set_xticks(ax.get_xticks(), labels=ax.get_xticklabels(), rotation=30)
ax.set_ylabel(None)


# 0 is Monday, 6 is Sunday

daily_volume = X_date_encoded["timestamp_day_of_week"].value_counts().sort_index()
daily_volume = X_date_encoded["timestamp_weekday"].value_counts().sort_index()

make_barplot(
x=daily_volume.index,
@@ -287,9 +286,10 @@ def baseline_r2(X, y, train_idx, test_idx):

# we only keep the 5 out of 10 last results
# because the initial size of the train set is rather small
sns.boxplot(results.tail(5), palette="magma")
plt.ylabel("R2 score")
plt.title("Hyper parameters grid-search results")
fig, ax = plt.subplots(layout="constrained")
sns.boxplot(results.tail(5), palette="magma", ax=ax)
ax.set_ylabel("R2 score")
ax.set_title("Hyper parameters grid-search results")
plt.tight_layout()

###############################################################################
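The removal of ``table_vectorizer.set_output(transform="pandas")`` in this file reflects that, after the refactor, the TableVectorizer returns a dataframe (here pandas, matching the input) without any ``set_output`` call. A minimal sketch with made-up data:

import pandas as pd
from skrub import TableVectorizer

X = pd.DataFrame({"city": ["Paris", "London"], "value": [1.0, 2.0]})

# No set_output(transform="pandas") needed: the output is already a DataFrame
out = TableVectorizer().fit_transform(X)
print(type(out))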
6 changes: 5 additions & 1 deletion skrub/__init__.py
@@ -5,7 +5,7 @@

from ._agg_joiner import AggJoiner, AggTarget
from ._check_dependencies import check_dependencies
from ._datetime_encoder import DatetimeEncoder, to_datetime
from ._datetime_encoder import DatetimeEncoder
from ._deduplicate import compute_ngram_distance, deduplicate
from ._fuzzy_join import fuzzy_join
from ._gap_encoder import GapEncoder
@@ -16,6 +16,8 @@
from ._select_cols import DropCols, SelectCols
from ._similarity_encoder import SimilarityEncoder
from ._table_vectorizer import TableVectorizer
from ._to_categorical import ToCategorical
from ._to_datetime import ToDatetime, to_datetime

check_dependencies()

@@ -25,6 +27,7 @@

__all__ = [
"DatetimeEncoder",
"ToDatetime",
"Joiner",
"fuzzy_join",
"GapEncoder",
@@ -34,6 +37,7 @@
"TableVectorizer",
"deduplicate",
"compute_ngram_distance",
"ToCategorical",
"to_datetime",
"AggJoiner",
"MultiAggJoiner",
184 changes: 184 additions & 0 deletions skrub/_check_input.py
@@ -0,0 +1,184 @@
import warnings

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted

from . import _dataframe as sbd
from . import _join_utils, _utils
from ._dispatch import dispatch

__all__ = ["CheckInputDataFrame"]


def _column_names_to_strings(column_names):
    non_string = [c for c in column_names if not isinstance(c, str)]
    if not non_string:
        return column_names
    warnings.warn(
        f"Some column names are not strings: {non_string}. All column names"
        " must be strings; converting to strings."
    )
    return list(map(str, column_names))


def _deduplicated_column_names(column_names):
    duplicates = _utils.get_duplicates(column_names)
    if not duplicates:
        return column_names
    warnings.warn(
        f"Found duplicated column names: {duplicates}. Please make sure column names"
        " are unique. Renaming columns that have duplicated names."
    )
    return _join_utils.pick_column_names(column_names)


def _cleaned_column_names(column_names):
    return _deduplicated_column_names(_column_names_to_strings(column_names))


@dispatch
def _check_not_pandas_sparse(df):
    pass


@_check_not_pandas_sparse.specialize("pandas")
def _check_not_pandas_sparse_pandas(df):
    import pandas as pd

    sparse_cols = [
        col for col in df.columns if isinstance(df[col].dtype, pd.SparseDtype)
    ]
    if sparse_cols:
        raise TypeError(
            f"Columns {sparse_cols} are sparse Pandas series, but dense "
            "data is required. Use ``df[col].sparse.to_dense()`` to convert "
            "a series from sparse to dense."
        )


def _check_is_dataframe(df):
    if not sbd.is_dataframe(df):
        raise TypeError(
            "Only pandas and polars DataFrames are supported. Cannot handle X of"
            f" type: {type(df)}."
        )


def _collect_lazyframe(df):
    if not sbd.is_lazyframe(df):
        return df
    warnings.warn(
        "At the moment, skrub only works on eager DataFrames, calling collect()."
    )
    return sbd.collect(df)


class CheckInputDataFrame(TransformerMixin, BaseEstimator):
    """Check the dataframe entering a skrub pipeline.

    This transformer ensures that:

    - The input is a dataframe.
        - Numpy arrays are converted to pandas dataframes with a warning.
        - The dataframe library is the same during ``fit`` and ``transform``,
          e.g. fitting on a polars dataframe and then transforming a pandas
          dataframe is not allowed.
        - A TypeError is raised otherwise.
    - Column names are unique strings.
        - Non-strings are cast to strings.
        - A random suffix is added to duplicated names.
        - If either of these operations is needed, a warning is emitted.
        - Only applies to pandas; polars column names are always unique strings.
    - The input is not sparse.
        - A TypeError is raised otherwise.
    - The input is not a ``LazyFrame``.
        - A ``LazyFrame`` is ``collect``ed with a warning.
    - The column names are the same during ``fit`` and ``transform``.
        - A ValueError is raised otherwise.

    Attributes
    ----------
    module_name_ : str
        The name of the dataframe module, 'polars' or 'pandas'.
    feature_names_in_ : list
        The column names of the input (before cleaning).
    n_features_in_ : int
        The number of input columns.
    feature_names_out_ : list of str
        The column names after converting to string and deduplication.
    """

    def fit(self, X, y=None):
        self.fit_transform(X, y)
        return self

    def fit_transform(self, X, y=None):
        del y
        X = self._handle_array(X)
        _check_is_dataframe(X)
        self.module_name_ = sbd.dataframe_module_name(X)
        # TODO check schema (including dtypes) not just names.
        # Need to decide how strict we should be about types
        column_names = sbd.column_names(X)
        self.feature_names_in_ = column_names
        self.n_features_in_ = len(column_names)
        self.feature_names_out_ = _cleaned_column_names(column_names)
        if sbd.column_names(X) != self.feature_names_out_:
            X = sbd.set_column_names(X, self.feature_names_out_)
        _check_not_pandas_sparse(X)
        X = _collect_lazyframe(X)
        return X

    def transform(self, X):
        check_is_fitted(self, "module_name_")
        X = self._handle_array(X)
        _check_is_dataframe(X)
        module_name = sbd.dataframe_module_name(X)
        if module_name != self.module_name_:
            raise TypeError(
                f"Pipeline was fitted to a {self.module_name_} dataframe "
                f"but is being applied to a {module_name} dataframe. "
                "This is likely to produce errors and is not supported."
            )
        column_names = sbd.column_names(X)
        if column_names != self.feature_names_in_:
            import difflib

            diff = "\n".join(
                difflib.Differ().compare(self.feature_names_in_, column_names)
            )
            message = (
                f"Columns of dataframes passed to fit() and transform() differ:\n{diff}"
            )
            raise ValueError(message)
        if sbd.column_names(X) != self.feature_names_out_:
            X = sbd.set_column_names(X, self.feature_names_out_)
        _check_not_pandas_sparse(X)
        X = _collect_lazyframe(X)
        return X

    def _handle_array(self, X):
        if not isinstance(X, np.ndarray):
            return X
        if X.ndim != 2:
            raise ValueError(
                "Input should be a DataFrame. Found an array with incompatible shape:"
                f" {X.shape}."
            )
        warnings.warn(
            "Only pandas and polars DataFrames are supported, but input is a Numpy"
            " array. Please convert Numpy arrays to DataFrames before passing them to"
            " skrub transformers. Converting to pandas DataFrame with columns"
            " ['0', '1', …]."
        )
        import pandas as pd

        columns = list(map(str, range(X.shape[1])))
        X = pd.DataFrame(X, columns=columns)
        return X

    # set_output api compatibility

    def get_feature_names_out(self):
        return self.feature_names_out_
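A rough illustration of the checks implemented above. The import goes through the private module added by this commit, so it is shown only to make the behaviour concrete, not as a documented entry point.

import numpy as np
import pandas as pd
from skrub._check_input import CheckInputDataFrame

# A 2-D numpy array is accepted, but converted to a pandas DataFrame
# with string column names ['0', '1'] and a warning.
out = CheckInputDataFrame().fit_transform(np.array([[1, 2], [3, 4]]))
print(list(out.columns))

# Duplicated column names are renamed (again with a warning) so that the
# output column names are unique.
df = pd.DataFrame([[1, 2]], columns=["a", "a"])
print(CheckInputDataFrame().fit_transform(df).columns)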
