From a2d03d4a63147e2f56615852814de4d2f77c373c Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Fri, 21 Jul 2017 12:38:11 +0200 Subject: [PATCH] BUG: do not cast ints to floats if inputs of crosstab are not aligned (#17011) closes #17005 --- doc/source/whatsnew/v0.21.0.txt | 2 +- pandas/core/reshape/pivot.py | 31 +++++++++++++++++++----------- pandas/tests/reshape/test_pivot.py | 16 +++++++++++++++ 3 files changed, 37 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 5146bd35dff30..9aead6ab2386c 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -225,10 +225,10 @@ Sparse Reshaping ^^^^^^^^^ - Joining/Merging with a non unique ``PeriodIndex`` raised a TypeError (:issue:`16871`) +- Bug in :func:`crosstab` where non-aligned series of integers were cast to float (:issue:`17005`) - Bug when using :func:`isin` on a large object series and large comparison array (:issue:`16012`) - Fixes regression from 0.20, :func:`Series.aggregate` and :func:`DataFrame.aggregate` allow dictionaries as return values again (:issue:`16741`) - Numeric ^^^^^^^ - Bug in ``.clip()`` with ``axis=1`` and a list-like for ``threshold`` is passed; previously this raised ``ValueError`` (:issue:`15390`) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 0581ec7484c49..fbb7e6f970309 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -6,6 +6,7 @@ from pandas import Series, DataFrame, MultiIndex, Index from pandas.core.groupby import Grouper from pandas.core.reshape.util import cartesian_product +from pandas.core.index import _get_combined_index from pandas.compat import range, lrange, zip from pandas import compat import pandas.core.common as com @@ -493,6 +494,13 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None, rownames = _get_names(index, rownames, prefix='row') colnames = _get_names(columns, colnames, prefix='col') 
+ obs_idxes = [obj.index for objs in (index, columns) for obj in objs + if hasattr(obj, 'index')] + if obs_idxes: + common_idx = _get_combined_index(obs_idxes, intersect=True) + else: + common_idx = None + data = {} data.update(zip(rownames, index)) data.update(zip(colnames, columns)) @@ -503,20 +511,21 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None, if values is not None and aggfunc is None: raise ValueError("values cannot be used without an aggfunc.") + df = DataFrame(data, index=common_idx) if values is None: - df = DataFrame(data) df['__dummy__'] = 0 - table = df.pivot_table('__dummy__', index=rownames, columns=colnames, - aggfunc=len, margins=margins, - margins_name=margins_name, dropna=dropna) - table = table.fillna(0).astype(np.int64) - + kwargs = {'aggfunc': len, 'fill_value': 0} else: - data['__dummy__'] = values - df = DataFrame(data) - table = df.pivot_table('__dummy__', index=rownames, columns=colnames, - aggfunc=aggfunc, margins=margins, - margins_name=margins_name, dropna=dropna) + df['__dummy__'] = values + kwargs = {'aggfunc': aggfunc} + + table = df.pivot_table('__dummy__', index=rownames, columns=colnames, + margins=margins, margins_name=margins_name, + dropna=dropna, **kwargs) + + # GH 17013: + if values is None and margins: + table = table.fillna(0).astype(np.int64) # Post-process if normalize is not False: diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 9881ab72f3ef5..ff9f35b0253b0 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1058,6 +1058,22 @@ def test_crosstab_ndarray(self): assert result.index.name == 'row_0' assert result.columns.name == 'col_0' + def test_crosstab_non_aligned(self): + # GH 17005 + a = pd.Series([0, 1, 1], index=['a', 'b', 'c']) + b = pd.Series([3, 4, 3, 4, 3], index=['a', 'b', 'c', 'd', 'f']) + c = np.array([3, 4, 3]) + + expected = pd.DataFrame([[1, 0], [1, 1]], + index=Index([0, 1], name='row_0'), + 
columns=Index([3, 4], name='col_0')) + + result = crosstab(a, b) + tm.assert_frame_equal(result, expected) + + result = crosstab(a, c) + tm.assert_frame_equal(result, expected) + def test_crosstab_margins(self): a = np.random.randint(0, 7, size=100) b = np.random.randint(0, 3, size=100)