Skip to content

Commit

Permalink
BUG in merging categorical dates
Browse files Browse the repository at this point in the history
closes pandas-dev#16900

Author: Dave Willmer <dave.willmer@gmail.com>

This patch had conflicts when merged, resolved by
Committer: Jeff Reback <jeff@reback.net>

Closes pandas-dev#16986 from dwillmer/cat_fix and squashes the following commits:

1ea1977 [Dave Willmer] Minor tweaks + comment
21a35a0 [Dave Willmer] Merge branch 'cat_fix' of https://github.com/dwillmer/pandas into cat_fix
04d5404 [Dave Willmer] Update tests
3cc5c24 [Dave Willmer] Merge branch 'master' into cat_fix
5e8e23b [Dave Willmer] Add whatsnew item
b82d117 [Dave Willmer] Lint fixes
a81933d [Dave Willmer] Remove unused import
218da66 [Dave Willmer] Generic solution to categorical problem
48e7163 [Dave Willmer] Test inner join
8843c10 [Dave Willmer] Fix TypeError when merging categorical dates
  • Loading branch information
dwillmer authored and alanbato committed Nov 10, 2017
1 parent a8238c2 commit 99769f1
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 9 deletions.
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v0.21.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -224,8 +224,9 @@ Sparse

Reshaping
^^^^^^^^^
- Joining/Merging with a non unique ``PeriodIndex`` raised a TypeError (:issue:`16871`)
- Joining/Merging with a non unique ``PeriodIndex`` raised a ``TypeError`` (:issue:`16871`)
- Bug in :func:`crosstab` where non-aligned series of integers were cast to float (:issue:`17005`)
- Bug where merging with categorical dtypes with datetimelikes incorrectly raised a ``TypeError`` (:issue:`16900`)
- Bug when using :func:`isin` on a large object series and large comparison array (:issue:`16012`)
- Fixes regression from 0.20, :func:`Series.aggregate` and :func:`DataFrame.aggregate` allow dictionaries as return values again (:issue:`16741`)

Expand Down
25 changes: 18 additions & 7 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -878,7 +878,7 @@ def _get_merge_keys(self):
return left_keys, right_keys, join_names

def _maybe_coerce_merge_keys(self):
# we have valid mergee's but we may have to further
# we have valid mergees but we may have to further
# coerce these if they are originally incompatible types
#
# for example if these are categorical, but are not dtype_equal
Expand All @@ -890,12 +890,16 @@ def _maybe_coerce_merge_keys(self):
if (len(lk) and not len(rk)) or (not len(lk) and len(rk)):
continue

lk_is_cat = is_categorical_dtype(lk)
rk_is_cat = is_categorical_dtype(rk)

# if either left or right is a categorical
# then they must match exactly in categories & ordered
if is_categorical_dtype(lk) and is_categorical_dtype(rk):
if lk_is_cat and rk_is_cat:
if lk.is_dtype_equal(rk):
continue
elif is_categorical_dtype(lk) or is_categorical_dtype(rk):

elif lk_is_cat or rk_is_cat:
pass

elif is_dtype_equal(lk.dtype, rk.dtype):
Expand All @@ -905,7 +909,7 @@ def _maybe_coerce_merge_keys(self):
# kinds to proceed, eg. int64 and int8
# further if we are object, but we infer to
# the same, then proceed
if (is_numeric_dtype(lk) and is_numeric_dtype(rk)):
if is_numeric_dtype(lk) and is_numeric_dtype(rk):
if lk.dtype.kind == rk.dtype.kind:
continue

Expand All @@ -914,13 +918,20 @@ def _maybe_coerce_merge_keys(self):
continue

# Houston, we have a problem!
# let's coerce to object
# let's coerce to object if the dtypes aren't
# categorical, otherwise coerce to the category
# dtype. If we coerced categories to object,
# then we would lose type information on some
# columns, and end up trying to merge
# incompatible dtypes. See GH 16900.
if name in self.left.columns:
typ = lk.categories.dtype if lk_is_cat else object
self.left = self.left.assign(
**{name: self.left[name].astype(object)})
**{name: self.left[name].astype(typ)})
if name in self.right.columns:
typ = rk.categories.dtype if rk_is_cat else object
self.right = self.right.assign(
**{name: self.right[name].astype(object)})
**{name: self.right[name].astype(typ)})

def _validate_specification(self):
# Hm, any way to make this logic less complicated??
Expand Down
36 changes: 35 additions & 1 deletion pandas/tests/reshape/test_merge.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# pylint: disable=E1103

import pytest
from datetime import datetime
from datetime import datetime, date
from numpy.random import randn
from numpy import nan
import numpy as np
Expand Down Expand Up @@ -1515,6 +1515,40 @@ def test_self_join_multiple_categories(self):

assert_frame_equal(result, df)

def test_dtype_on_categorical_dates(self):
    """Check outer and inner merges on categorical date key columns.

    Both inputs carry a ``date`` column of ``datetime.date`` values cast
    to ``category``; the expected merged frames hold ``pd.Timestamp``
    values in that column.
    """
    # GH 16900
    # dates should not be coerced to ints

    def _frame_with_categorical_dates(rows, value_col):
        # Build a two-column frame and turn its 'date' column into a
        # categorical, mirroring the setup that triggered the bug.
        frame = pd.DataFrame(rows, columns=['date', value_col])
        frame['date'] = frame['date'].astype('category')
        return frame

    left = _frame_with_categorical_dates(
        [[date(2001, 1, 1), 1.1],
         [date(2001, 1, 2), 1.3]],
        'num2',
    )
    right = _frame_with_categorical_dates(
        [[date(2001, 1, 1), 1.3],
         [date(2001, 1, 3), 1.4]],
        'num4',
    )

    merged_columns = ['date', 'num2', 'num4']

    # (join type, expected merged frame), checked in this order.
    cases = [
        ('outer', pd.DataFrame([
            [pd.Timestamp('2001-01-01'), 1.1, 1.3],
            [pd.Timestamp('2001-01-02'), 1.3, np.nan],
            [pd.Timestamp('2001-01-03'), np.nan, 1.4]],
            columns=merged_columns)),
        ('inner', pd.DataFrame(
            [[pd.Timestamp('2001-01-01'), 1.1, 1.3]],
            columns=merged_columns)),
    ]

    for how, expected in cases:
        result = pd.merge(left, right, how=how, on=['date'])
        assert_frame_equal(result, expected)


@pytest.fixture
def left_df():
Expand Down

0 comments on commit 99769f1

Please sign in to comment.