diff --git a/docs/source/api.rst b/docs/source/api.rst index b88fe7317e..926a51b22b 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -137,6 +137,9 @@ Feature and Label Encoding (Dask-based Multi-GPU) .. autoclass:: cuml.dask.preprocessing.LabelBinarizer :members: + .. autoclass:: cuml.dask.preprocessing.LabelEncoder.LabelEncoder + :members: + .. autoclass:: cuml.dask.preprocessing.OneHotEncoder :members: diff --git a/python/cuml/dask/preprocessing/LabelEncoder.py b/python/cuml/dask/preprocessing/LabelEncoder.py index 29fb0570ca..de0b1a1fdf 100644 --- a/python/cuml/dask/preprocessing/LabelEncoder.py +++ b/python/cuml/dask/preprocessing/LabelEncoder.py @@ -29,7 +29,7 @@ class LabelEncoder(BaseEstimator, DelayedTransformMixin, DelayedInverseTransformMixin): """ - An nvcategory based implementation of ordinal label encoding + A cuDF-based implementation of ordinal label encoding Parameters ---------- @@ -43,78 +43,80 @@ class LabelEncoder(BaseEstimator, -------- Converting a categorical implementation to a numerical one - >>> from dask_cuda import LocalCUDACluster - >>> from dask.distributed import Client - >>> import cudf - >>> import dask_cudf - >>> from cuml.dask.preprocessing import LabelEncoder - - >>> import pandas as pd - >>> pd.set_option('display.max_colwidth', 2000) - - >>> cluster = LocalCUDACluster(threads_per_worker=1) - >>> client = Client(cluster) - >>> df = cudf.DataFrame({'num_col':[10, 20, 30, 30, 30], - ... 
'cat_col':['a','b','c','a','a']}) - >>> ddf = dask_cudf.from_cudf(df, npartitions=2) - - >>> # There are two functionally equivalent ways to do this - >>> le = LabelEncoder() - >>> le.fit(ddf.cat_col) # le = le.fit(data.category) also works - - >>> encoded = le.transform(ddf.cat_col) - >>> print(encoded.compute()) - 0 0 - 1 1 - 2 2 - 3 0 - 4 0 - dtype: uint8 - - >>> # This method is preferred - >>> le = LabelEncoder() - >>> encoded = le.fit_transform(ddf.cat_col) - >>> print(encoded.compute()) - 0 0 - 1 1 - 2 2 - 3 0 - 4 0 - dtype: uint8 - - >>> # We can assign this to a new column - >>> ddf = ddf.assign(encoded=encoded.values) - >>> print(ddf.compute()) - num_col cat_col encoded - 0 10 a 0 - 1 20 b 1 - 2 30 c 2 - 3 30 a 0 - 4 30 a 0 - >>> # We can also encode more data - >>> test_data = cudf.Series(['c', 'a']) - >>> encoded = le.transform(dask_cudf.from_cudf(test_data, - ... npartitions=2)) - >>> print(encoded.compute()) - 0 2 - 1 0 - dtype: uint8 - - >>> # After train, ordinal label can be inverse_transform() back to - >>> # string labels - >>> ord_label = cudf.Series([0, 0, 1, 2, 1]) - >>> ord_label = le.inverse_transform( - ... dask_cudf.from_cudf(ord_label,npartitions=2)) - - >>> print(ord_label.compute()) - 0 a - 1 a - 2 b - 0 c - 1 b - dtype: object - >>> client.close() - >>> cluster.close() + .. code-block:: python + + >>> from dask_cuda import LocalCUDACluster + >>> from dask.distributed import Client + >>> import cudf + >>> import dask_cudf + >>> from cuml.dask.preprocessing import LabelEncoder + + >>> import pandas as pd + >>> pd.set_option('display.max_colwidth', 2000) + + >>> cluster = LocalCUDACluster(threads_per_worker=1) + >>> client = Client(cluster) + >>> df = cudf.DataFrame({'num_col':[10, 20, 30, 30, 30], + ... 
'cat_col':['a','b','c','a','a']}) + >>> ddf = dask_cudf.from_cudf(df, npartitions=2) + + >>> # There are two functionally equivalent ways to do this + >>> le = LabelEncoder() + >>> le.fit(ddf.cat_col) # le = le.fit(ddf.cat_col) also works + + >>> encoded = le.transform(ddf.cat_col) + >>> print(encoded.compute()) + 0 0 + 1 1 + 2 2 + 3 0 + 4 0 + dtype: uint8 + + >>> # This method is preferred + >>> le = LabelEncoder() + >>> encoded = le.fit_transform(ddf.cat_col) + >>> print(encoded.compute()) + 0 0 + 1 1 + 2 2 + 3 0 + 4 0 + dtype: uint8 + + >>> # We can assign this to a new column + >>> ddf = ddf.assign(encoded=encoded.values) + >>> print(ddf.compute()) + num_col cat_col encoded + 0 10 a 0 + 1 20 b 1 + 2 30 c 2 + 3 30 a 0 + 4 30 a 0 + >>> # We can also encode more data + >>> test_data = cudf.Series(['c', 'a']) + >>> encoded = le.transform(dask_cudf.from_cudf(test_data, + ... npartitions=2)) + >>> print(encoded.compute()) + 0 2 + 1 0 + dtype: uint8 + + >>> # After fitting, ordinal labels can be converted back to + >>> # string labels with inverse_transform() + >>> ord_label = cudf.Series([0, 0, 1, 2, 1]) + >>> ord_label = le.inverse_transform( + ... dask_cudf.from_cudf(ord_label,npartitions=2)) + + >>> print(ord_label.compute()) + 0 a + 1 a + 2 b + 0 c + 1 b + dtype: object + >>> client.close() + >>> cluster.close() """ def __init__(self, *, client=None, verbose=False, **kwargs): @@ -124,7 +126,7 @@ def __init__(self, *, client=None, verbose=False, **kwargs): def fit(self, y): """ - Fit a LabelEncoder (nvcategory) instance to a set of categories + Fit a LabelEncoder instance to a set of categories Parameters ---------- @@ -138,7 +140,7 @@ def fit(self, y): A fitted instance of itself to allow method chaining Notes - -------- + ----- Number of unique classes will be collected at the client. It'll consume memory proportional to the number of unique classes. """