COMPAT/API: DataFrame.categorize missing values

Closes dask#1565. For compatibility with pandas-dev/pandas#10929, where it was decided that `pd.Categorical(['a', np.nan], categories=['a', np.nan])` should raise a `FutureWarning`. Now we just drop missing values before computing the distincts for the categories.
TomAugspurger · Sep 24, 2016 · 0ebee19 · 0ebee19
1 parent 32ad1a0
commit 0ebee19
Show file tree

Hide file tree

Showing 3 changed files with 13 additions and 1 deletion.
diff --git a/dask/dataframe/categorical.py b/dask/dataframe/categorical.py
@@ -31,7 +31,7 @@ def categorize(df, columns=None, **kwargs):
     if not isinstance(columns, (list, tuple)):
         columns = [columns]
 
-    distincts = [df[col].drop_duplicates() for col in columns]
+    distincts = [df[col].dropna().drop_duplicates() for col in columns]
     values = compute(*distincts, **kwargs)
 
     func = partial(_categorize_block, categories=dict(zip(columns, values)))

diff --git a/dask/dataframe/tests/test_categorical.py b/dask/dataframe/tests/test_categorical.py
@@ -1,3 +1,5 @@
+import warnings
+
 import pandas as pd
 import pandas.util.testing as tm
 import pytest
@@ -78,3 +80,11 @@ def test_categories():
 
     df3 = dd.categorical._categorize(categories, df2)
     tm.assert_frame_equal(df, df3)
+
+
+def test_categorize_nan():
+    df = dd.from_pandas(pd.DataFrame({"A": ['a', 'b', 'a', float('nan')]}),
+                        npartitions=2)
+    with warnings.catch_warnings(record=True) as record:
+        df.categorize().compute()
+    assert len(record) == 0
diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
@@ -9,6 +9,8 @@ DataFrame
 - Return a series when functions given to ``dataframe.map_partitions`` return
   scalars (:pr:`1514`)
 - Fix type size inference for series (:pr:`1513`)
+- ``dataframe.DataFrame.categorize`` no longer includes missing values
+  in the ``categories``. This is for compatibility with a `pandas change <https://github.com/pydata/pandas/pull/10929>`_ (:pr:`1565`)
 - Fix head parser error in ``dataframe.read_csv`` when some lines have quotes
   (:pr:`1495`)
 - Add ``dataframe.reduction`` and ``series.reduction`` methods to apply generic