BUG: Have object dtype for empty Categorical.categories (pandas-dev#1…

…7249) * BUG: Have object dtype for empty Categorical ctor Previously we had a `Float64Index`, which is inconsistent with, e.g., the regular Index constructor. * TST: Update tests in multi for new return Previously these relied worked around the return type by wrapping list-likes in `np.array` and relying on that to cast to float. These workarounds are no longer nescessary. * TST: Update union_categorical tests This relied on `NaN` being a float and empty being a float. Not a necessary test anymore. * TST: set object dtype
jowens · Sep 20, 2017 · c148dd2 · c148dd2
1 parent 0bbda54
commit c148dd2
Show file tree

Hide file tree

Showing 6 changed files with 25 additions and 16 deletions.
diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
@@ -386,6 +386,9 @@ Numeric
 Categorical
 ^^^^^^^^^^^
 - Bug in :func:`Series.isin` when called with a categorical (:issue`16639`)
+- Bug in the categorical constructor with empty values and categories causing
+  the ``.categories`` to be an empty ``Float64Index`` rather than an empty
+  ``Index`` with object dtype (:issue:`17248`)
 
 
 Other

diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
@@ -290,7 +290,10 @@ def __init__(self, values, categories=None, ordered=False, fastpath=False):
                 # On list with NaNs, int values will be converted to float. Use
                 # "object" dtype to prevent this. In the end objects will be
                 # casted to int/... in the category assignment step.
-                dtype = 'object' if isna(values).any() else None
+                if len(values) == 0 or isna(values).any():
+                    dtype = 'object'
+                else:
+                    dtype = None
                 values = _sanitize_array(values, None, dtype=dtype)
 
         if categories is None:

diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py
@@ -776,7 +776,7 @@ def test_from_arrays_empty(self):
             arrays = [[]] * N
             names = list('ABC')[:N]
             result = MultiIndex.from_arrays(arrays=arrays, names=names)
-            expected = MultiIndex(levels=[np.array([])] * N, labels=[[]] * N,
+            expected = MultiIndex(levels=[[]] * N, labels=[[]] * N,
                                   names=names)
             tm.assert_index_equal(result, expected)
 
@@ -829,7 +829,7 @@ def test_from_product_empty(self):
 
         # 1 level
         result = MultiIndex.from_product([[]], names=['A'])
-        expected = pd.Float64Index([], name='A')
+        expected = pd.Index([], name='A')
         tm.assert_index_equal(result, expected)
 
         # 2 levels
@@ -838,7 +838,7 @@ def test_from_product_empty(self):
         names = ['A', 'B']
         for first, second in zip(l1, l2):
             result = MultiIndex.from_product([first, second], names=names)
-            expected = MultiIndex(levels=[np.array(first), np.array(second)],
+            expected = MultiIndex(levels=[first, second],
                                   labels=[[], []], names=names)
             tm.assert_index_equal(result, expected)
 
@@ -847,8 +847,7 @@ def test_from_product_empty(self):
         for N in range(4):
             lvl2 = lrange(N)
             result = MultiIndex.from_product([[], lvl2, []], names=names)
-            expected = MultiIndex(levels=[np.array(A)
-                                          for A in [[], lvl2, []]],
+            expected = MultiIndex(levels=[[], lvl2, []],
                                   labels=[[], [], []], names=names)
             tm.assert_index_equal(result, expected)
 

diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py
@@ -680,7 +680,7 @@ def test_concat_categorical_empty(self):
         tm.assert_series_equal(s1.append(s2, ignore_index=True), s2)
 
         s1 = pd.Series([], dtype='category')
-        s2 = pd.Series([])
+        s2 = pd.Series([], dtype='object')
 
         # different dtype => not-category
         tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2)

diff --git a/pandas/tests/reshape/test_union_categoricals.py b/pandas/tests/reshape/test_union_categoricals.py
@@ -107,17 +107,11 @@ def test_union_categoricals_empty(self):
         exp = Categorical([])
         tm.assert_categorical_equal(res, exp)
 
-        res = union_categoricals([pd.Categorical([]),
-                                  pd.Categorical([1.0])])
-        exp = Categorical([1.0])
+        res = union_categoricals([Categorical([]),
+                                  Categorical(['1'])])
+        exp = Categorical(['1'])
         tm.assert_categorical_equal(res, exp)
 
-        # to make dtype equal
-        nanc = pd.Categorical(np.array([np.nan], dtype=np.float64))
-        res = union_categoricals([nanc,
-                                  pd.Categorical([])])
-        tm.assert_categorical_equal(res, nanc)
-
     def test_union_categorical_same_category(self):
         # check fastpath
         c1 = Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4])

diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
@@ -112,6 +112,16 @@ def test_setitem_listlike(self):
         result = c.codes[np.array([100000]).astype(np.int64)]
         tm.assert_numpy_array_equal(result, np.array([5], dtype='int8'))
 
+    def test_constructor_empty(self):
+        # GH 17248
+        c = Categorical([])
+        expected = Index([])
+        tm.assert_index_equal(c.categories, expected)
+
+        c = Categorical([], categories=[1, 2, 3])
+        expected = pd.Int64Index([1, 2, 3])
+        tm.assert_index_equal(c.categories, expected)
+
     def test_constructor_unsortable(self):
 
         # it works!