Add Dask LabelEncoder to the documentation (rapidsai#5023)

This will close rapidsai#4931

Authors:
  - Nick Becker (https://github.com/beckernick)

Approvers:
  - Dante Gama Dessavre (https://github.com/dantegd)

URL: rapidsai#5023
jakirkham · Dec 18, 2022 · 87f593e · 87f593e
1 parent 28e748d
commit 87f593e
Show file tree

Hide file tree

Showing 2 changed files with 80 additions and 75 deletions.
diff --git a/docs/source/api.rst b/docs/source/api.rst
@@ -137,6 +137,9 @@ Feature and Label Encoding (Dask-based Multi-GPU)
  .. autoclass:: cuml.dask.preprocessing.LabelBinarizer
     :members:
 
+ .. autoclass:: cuml.dask.preprocessing.LabelEncoder.LabelEncoder
+    :members:
+
  .. autoclass:: cuml.dask.preprocessing.OneHotEncoder
     :members:
 

diff --git a/python/cuml/dask/preprocessing/LabelEncoder.py b/python/cuml/dask/preprocessing/LabelEncoder.py
@@ -29,7 +29,7 @@ class LabelEncoder(BaseEstimator,
                    DelayedTransformMixin,
                    DelayedInverseTransformMixin):
     """
-    An nvcategory based implementation of ordinal label encoding
+    A cuDF-based implementation of ordinal label encoding
 
     Parameters
     ----------
@@ -43,78 +43,80 @@ class LabelEncoder(BaseEstimator,
     --------
     Converting a categorical implementation to a numerical one
 
-    >>> from dask_cuda import LocalCUDACluster
-    >>> from dask.distributed import Client
-    >>> import cudf
-    >>> import dask_cudf
-    >>> from cuml.dask.preprocessing import LabelEncoder
-
-    >>> import pandas as pd
-    >>> pd.set_option('display.max_colwidth', 2000)
-
-    >>> cluster = LocalCUDACluster(threads_per_worker=1)
-    >>> client = Client(cluster)
-    >>> df = cudf.DataFrame({'num_col':[10, 20, 30, 30, 30],
-    ...                    'cat_col':['a','b','c','a','a']})
-    >>> ddf = dask_cudf.from_cudf(df, npartitions=2)
-
-    >>> # There are two functionally equivalent ways to do this
-    >>> le = LabelEncoder()
-    >>> le.fit(ddf.cat_col)  # le = le.fit(data.category) also works
-    <cuml.dask.preprocessing.LabelEncoder.LabelEncoder object at 0x...>
-    >>> encoded = le.transform(ddf.cat_col)
-    >>> print(encoded.compute())
-    0    0
-    1    1
-    2    2
-    3    0
-    4    0
-    dtype: uint8
-
-    >>> # This method is preferred
-    >>> le = LabelEncoder()
-    >>> encoded = le.fit_transform(ddf.cat_col)
-    >>> print(encoded.compute())
-    0    0
-    1    1
-    2    2
-    3    0
-    4    0
-    dtype: uint8
-
-    >>> # We can assign this to a new column
-    >>> ddf = ddf.assign(encoded=encoded.values)
-    >>> print(ddf.compute())
-    num_col cat_col  encoded
-    0       10       a        0
-    1       20       b        1
-    2       30       c        2
-    3       30       a        0
-    4       30       a        0
-    >>> # We can also encode more data
-    >>> test_data = cudf.Series(['c', 'a'])
-    >>> encoded = le.transform(dask_cudf.from_cudf(test_data,
-    ...                                            npartitions=2))
-    >>> print(encoded.compute())
-    0    2
-    1    0
-    dtype: uint8
-
-    >>> # After train, ordinal label can be inverse_transform() back to
-    >>> # string labels
-    >>> ord_label = cudf.Series([0, 0, 1, 2, 1])
-    >>> ord_label = le.inverse_transform(
-    ...    dask_cudf.from_cudf(ord_label,npartitions=2))
-
-    >>> print(ord_label.compute())
-    0    a
-    1    a
-    2    b
-    0    c
-    1    b
-    dtype: object
-    >>> client.close()
-    >>> cluster.close()
+    .. code-block:: python
+
+        >>> from dask_cuda import LocalCUDACluster
+        >>> from dask.distributed import Client
+        >>> import cudf
+        >>> import dask_cudf
+        >>> from cuml.dask.preprocessing import LabelEncoder
+
+        >>> import pandas as pd
+        >>> pd.set_option('display.max_colwidth', 2000)
+
+        >>> cluster = LocalCUDACluster(threads_per_worker=1)
+        >>> client = Client(cluster)
+        >>> df = cudf.DataFrame({'num_col':[10, 20, 30, 30, 30],
+        ...                    'cat_col':['a','b','c','a','a']})
+        >>> ddf = dask_cudf.from_cudf(df, npartitions=2)
+
+        >>> # There are two functionally equivalent ways to do this
+        >>> le = LabelEncoder()
+        >>> le.fit(ddf.cat_col)  # le = le.fit(data.category) also works
+        <cuml.dask.preprocessing.LabelEncoder.LabelEncoder object at 0x...>
+        >>> encoded = le.transform(ddf.cat_col)
+        >>> print(encoded.compute())
+        0    0
+        1    1
+        2    2
+        3    0
+        4    0
+        dtype: uint8
+
+        >>> # This method is preferred
+        >>> le = LabelEncoder()
+        >>> encoded = le.fit_transform(ddf.cat_col)
+        >>> print(encoded.compute())
+        0    0
+        1    1
+        2    2
+        3    0
+        4    0
+        dtype: uint8
+
+        >>> # We can assign this to a new column
+        >>> ddf = ddf.assign(encoded=encoded.values)
+        >>> print(ddf.compute())
+        num_col cat_col  encoded
+        0       10       a        0
+        1       20       b        1
+        2       30       c        2
+        3       30       a        0
+        4       30       a        0
+        >>> # We can also encode more data
+        >>> test_data = cudf.Series(['c', 'a'])
+        >>> encoded = le.transform(dask_cudf.from_cudf(test_data,
+        ...                                            npartitions=2))
+        >>> print(encoded.compute())
+        0    2
+        1    0
+        dtype: uint8
+
+        >>> # After train, ordinal label can be inverse_transform() back to
+        >>> # string labels
+        >>> ord_label = cudf.Series([0, 0, 1, 2, 1])
+        >>> ord_label = le.inverse_transform(
+        ...    dask_cudf.from_cudf(ord_label,npartitions=2))
+
+        >>> print(ord_label.compute())
+        0    a
+        1    a
+        2    b
+        0    c
+        1    b
+        dtype: object
+        >>> client.close()
+        >>> cluster.close()
 
     """
     def __init__(self, *, client=None, verbose=False, **kwargs):
@@ -124,7 +126,7 @@ def __init__(self, *, client=None, verbose=False, **kwargs):
 
     def fit(self, y):
         """
-        Fit a LabelEncoder (nvcategory) instance to a set of categories
+        Fit a LabelEncoder instance to a set of categories
 
         Parameters
         ----------
@@ -138,7 +140,7 @@ def fit(self, y):
             A fitted instance of itself to allow method chaining
 
         Notes
-        --------
+        -----
         Number of unique classes will be collected at the client. It'll
         consume memory proportional to the number of unique classes.
         """