Skip to content

Commit

Permalink
BUG: fix dtype of all-NaN categories and MultiIndex levels (pandas-de…
Browse files Browse the repository at this point in the history
  • Loading branch information
toobaz authored and No-Stream committed Nov 28, 2017
1 parent 7410239 commit b945703
Show file tree
Hide file tree
Showing 6 changed files with 37 additions and 14 deletions.
9 changes: 9 additions & 0 deletions asv_bench/benchmarks/categoricals.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ def setup(self):
self.datetimes = pd.Series(pd.date_range(
'1995-01-01 00:00:00', periods=10000, freq='s'))

self.values_some_nan = list(np.tile(self.categories + [np.nan], N))
self.values_all_nan = [np.nan] * len(self.values)

# Benchmark (asv `time_*` method): concatenate `self.s` with itself.
# NOTE(review): `self.s` is created in `setup`, outside this hunk — presumably a categorical Series; confirm.
def time_concat(self):
concat([self.s, self.s])

Expand All @@ -46,6 +49,12 @@ def time_constructor_datetimes_with_nat(self):
t.iloc[-1] = pd.NaT
Categorical(t)

# Benchmark (asv `time_*` method): construct a Categorical from a plain list
# that mixes NaN with real values. `setup` builds `values_some_nan` by tiling
# `self.categories + [np.nan]` N times, so NaN recurs after each run of categories.
def time_constructor_with_nan(self):
Categorical(self.values_some_nan)

# Benchmark (asv `time_*` method): construct a Categorical from a plain list
# containing only NaN. `setup` builds `values_all_nan` as
# `[np.nan] * len(self.values)`, so no non-missing value is available for inference.
def time_constructor_all_nan(self):
Categorical(self.values_all_nan)


class Categoricals2(object):
goal_time = 0.2
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.22.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ Other API Changes
^^^^^^^^^^^^^^^^^

- ``NaT`` division with :class:`datetime.timedelta` will now return ``NaN`` instead of raising (:issue:`17876`)
- All-NaN levels in a ``MultiIndex`` are now assigned ``float`` rather than ``object`` dtype, consistent with flat indexes (:issue:`17929`)
- :class:`Timestamp` will no longer silently ignore unused or invalid `tz` or `tzinfo` arguments (:issue:`17690`)
- :class:`CacheableOffset` and :class:`WeekDay` are no longer available in the `tseries.offsets` module (:issue:`17830`)
-
Expand Down
19 changes: 15 additions & 4 deletions pandas/core/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,10 @@ def __init__(self, values, categories=None, ordered=None, dtype=None,
self._dtype = dtype
return

# null_mask indicates missing values we want to exclude from inference.
# This means: only missing values in list-likes (not arrays/ndframes).
null_mask = np.array(False)

# sanitize input
if is_categorical_dtype(values):

Expand Down Expand Up @@ -316,13 +320,14 @@ def __init__(self, values, categories=None, ordered=None, dtype=None,
if not isinstance(values, np.ndarray):
values = _convert_to_list_like(values)
from pandas.core.series import _sanitize_array
# For a list containing NaNs, int values would be converted to float. Use
# "object" dtype to prevent this. In the end the objects will be
# cast to int/... in the category assignment step.
if len(values) == 0 or isna(values).any():
# By convention, empty lists result in object dtype:
if len(values) == 0:
sanitize_dtype = 'object'
else:
sanitize_dtype = None
null_mask = isna(values)
if null_mask.any():
values = [values[idx] for idx in np.where(~null_mask)[0]]
values = _sanitize_array(values, None, dtype=sanitize_dtype)

if dtype.categories is None:
Expand Down Expand Up @@ -370,6 +375,12 @@ def __init__(self, values, categories=None, ordered=None, dtype=None,
"mean to use\n'Categorical.from_codes(codes, "
"categories)'?", RuntimeWarning, stacklevel=2)

if null_mask.any():
# Reinsert -1 placeholders for previously removed missing values
full_codes = - np.ones(null_mask.shape, dtype=codes.dtype)
full_codes[~null_mask] = codes
codes = full_codes

self._dtype = dtype
self._codes = coerce_indexer_dtype(codes, dtype.categories)

Expand Down
13 changes: 7 additions & 6 deletions pandas/tests/indexes/test_multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -970,12 +970,13 @@ def test_get_level_values_na(self):

arrays = [[np.nan, np.nan, np.nan], ['a', np.nan, 1]]
index = pd.MultiIndex.from_arrays(arrays)
values = index.get_level_values(0)
expected = np.array([np.nan, np.nan, np.nan])
tm.assert_numpy_array_equal(values.values.astype(float), expected)
values = index.get_level_values(1)
expected = np.array(['a', np.nan, 1], dtype=object)
tm.assert_numpy_array_equal(values.values, expected)
result = index.get_level_values(0)
expected = pd.Index([np.nan, np.nan, np.nan])
tm.assert_index_equal(result, expected)

result = index.get_level_values(1)
expected = pd.Index(['a', np.nan, 1])
tm.assert_index_equal(result, expected)

arrays = [['a', 'b', 'b'], pd.DatetimeIndex([0, 1, pd.NaT])]
index = pd.MultiIndex.from_arrays(arrays)
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/reshape/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -648,7 +648,7 @@ def test_concat_categorical_coercion_nan(self):
s1 = pd.Series([np.nan, np.nan], dtype='category')
s2 = pd.Series([np.nan, np.nan])

exp = pd.Series([np.nan, np.nan, np.nan, np.nan], dtype=object)
exp = pd.Series([np.nan, np.nan, np.nan, np.nan])
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
Expand Down
7 changes: 4 additions & 3 deletions pandas/tests/reshape/test_union_categoricals.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,8 @@ def test_union_categoricals_nan(self):
tm.assert_categorical_equal(res, exp)

# all NaN
res = union_categoricals([pd.Categorical([np.nan, np.nan]),
res = union_categoricals([pd.Categorical(np.array([np.nan, np.nan],
dtype=object)),
pd.Categorical(['X'])])
exp = Categorical([np.nan, np.nan, 'X'])
tm.assert_categorical_equal(res, exp)
Expand Down Expand Up @@ -250,7 +251,7 @@ def test_union_categoricals_sort(self):
c1 = Categorical([np.nan])
c2 = Categorical([np.nan])
result = union_categoricals([c1, c2], sort_categories=True)
expected = Categorical([np.nan, np.nan], categories=[])
expected = Categorical([np.nan, np.nan])
tm.assert_categorical_equal(result, expected)

c1 = Categorical([])
Expand Down Expand Up @@ -299,7 +300,7 @@ def test_union_categoricals_sort_false(self):
c1 = Categorical([np.nan])
c2 = Categorical([np.nan])
result = union_categoricals([c1, c2], sort_categories=False)
expected = Categorical([np.nan, np.nan], categories=[])
expected = Categorical([np.nan, np.nan])
tm.assert_categorical_equal(result, expected)

c1 = Categorical([])
Expand Down

0 comments on commit b945703

Please sign in to comment.