-
-
Notifications
You must be signed in to change notification settings - Fork 18.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
BUG: DataFrame reductions inconsistent with Series counterparts #37827
Changes from all commits
9358552
bba4256
4a34b4a
2174ef6
c8cd631
5a0749c
034a124
647104f
61fd8d9
2132ea0
470f7d2
1a90a02
15381ce
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -284,6 +284,63 @@ of columns could result in a larger :class:`Series` result. See (:issue:`37799`) | |
In [6]: df[["B", "C"]].all(bool_only=True) | ||
|
||
|
||
Other :class:`DataFrame` reductions with ``numeric_only=None`` will also avoid | ||
this pathological behavior (:issue:`37827`): | ||
|
||
.. ipython:: python | ||
|
||
df = pd.DataFrame({"A": [0, 1, 2], "B": ["a", "b", "c"]}, dtype=object) | ||
|
||
|
||
*Previous behavior*: | ||
|
||
.. code-block:: ipython | ||
|
||
In [3]: df.mean() | ||
Out[3]: Series([], dtype: float64) | ||
|
||
In [4]: df[["A"]].mean() | ||
Out[4]: | ||
A 1.0 | ||
dtype: float64 | ||
|
||
*New behavior*: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. needs to be an ipython block & remove the prompts |
||
|
||
.. ipython:: python | ||
|
||
df.mean() | ||
|
||
df[["A"]].mean() | ||
|
||
Moreover, :class:`DataFrame` reductions with ``numeric_only=None`` will now be | ||
consistent with their :class:`Series` counterparts. In particular, for | ||
reductions where the :class:`Series` method raises ``TypeError``, the | ||
:class:`DataFrame` reduction will now consider that column non-numeric | ||
instead of casting to NumPy which may have different semantics (:issue:`36076`, | ||
:issue:`28949`, :issue:`21020`). | ||
|
||
.. ipython:: python | ||
|
||
ser = pd.Series([0, 1], dtype="category", name="A") | ||
df = ser.to_frame() | ||
|
||
|
||
*Previous behavior*: | ||
|
||
.. code-block:: ipython | ||
|
||
In [5]: df.any() | ||
Out[5]: | ||
A True | ||
dtype: bool | ||
|
||
*New behavior*: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. make an ipython block & remove the prompts |
||
|
||
.. ipython:: python | ||
|
||
df.any() | ||
|
||
|
||
.. _whatsnew_120.api_breaking.python: | ||
|
||
Increased minimum version for Python | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -464,7 +464,9 @@ def _split(self) -> List["Block"]: | |
new_blocks.append(nb) | ||
return new_blocks | ||
|
||
def split_and_operate(self, mask, f, inplace: bool) -> List["Block"]: | ||
def split_and_operate( | ||
self, mask, f, inplace: bool, ignore_failures: bool = False | ||
) -> List["Block"]: | ||
""" | ||
split the block per-column, and apply the callable f | ||
per-column, return a new block for each. Handle | ||
|
@@ -474,7 +476,8 @@ def split_and_operate(self, mask, f, inplace: bool) -> List["Block"]: | |
---------- | ||
mask : 2-d boolean mask | ||
f : callable accepting (1d-mask, 1d values, indexer) | ||
inplace : boolean | ||
inplace : bool | ||
ignore_failures : bool, default False | ||
|
||
Returns | ||
------- | ||
|
@@ -513,8 +516,16 @@ def make_a_block(nv, ref_loc): | |
v = new_values[i] | ||
|
||
# need a new block | ||
if m.any(): | ||
nv = f(m, v, i) | ||
if m.any() or m.size == 0: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you add a comment here on what is going on |
||
# Apply our function; we may ignore_failures if this is a | ||
# reduction that is dropping nuisance columns GH#37827 | ||
try: | ||
nv = f(m, v, i) | ||
except TypeError: | ||
if ignore_failures: | ||
continue | ||
else: | ||
raise | ||
else: | ||
nv = v if inplace else v.copy() | ||
|
||
|
@@ -2459,7 +2470,9 @@ def mask_func(mask, values, inplace): | |
values = values.reshape(1, -1) | ||
return func(values) | ||
|
||
return self.split_and_operate(None, mask_func, False) | ||
return self.split_and_operate( | ||
None, mask_func, False, ignore_failures=ignore_failures | ||
) | ||
|
||
try: | ||
res = func(values) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,6 +12,7 @@ | |
from pandas import ( | ||
Categorical, | ||
DataFrame, | ||
Index, | ||
MultiIndex, | ||
Series, | ||
Timestamp, | ||
|
@@ -1083,10 +1084,12 @@ def test_any_all_bool_only(self): | |
pytest.param(np.any, {"A": Series([0, 1], dtype="m8[ns]")}, True), | ||
pytest.param(np.all, {"A": Series([1, 2], dtype="m8[ns]")}, True), | ||
pytest.param(np.any, {"A": Series([1, 2], dtype="m8[ns]")}, True), | ||
(np.all, {"A": Series([0, 1], dtype="category")}, False), | ||
(np.any, {"A": Series([0, 1], dtype="category")}, True), | ||
# np.all on Categorical raises, so the reduction drops the | ||
# column, so all is being done on an empty Series, so is True | ||
(np.all, {"A": Series([0, 1], dtype="category")}, True), | ||
(np.any, {"A": Series([0, 1], dtype="category")}, False), | ||
(np.all, {"A": Series([1, 2], dtype="category")}, True), | ||
(np.any, {"A": Series([1, 2], dtype="category")}, True), | ||
(np.any, {"A": Series([1, 2], dtype="category")}, False), | ||
# Mix GH#21484 | ||
pytest.param( | ||
np.all, | ||
|
@@ -1308,6 +1311,114 @@ def test_frame_any_with_timedelta(self): | |
tm.assert_series_equal(result, expected) | ||
|
||
|
||
class TestNuisanceColumns: | ||
@pytest.mark.parametrize("method", ["any", "all"]) | ||
def test_any_all_categorical_dtype_nuisance_column(self, method): | ||
# GH#36076 DataFrame should match Series behavior | ||
ser = Series([0, 1], dtype="category", name="A") | ||
df = ser.to_frame() | ||
|
||
# Double-check the Series behavior is to raise | ||
with pytest.raises(TypeError, match="does not implement reduction"): | ||
getattr(ser, method)() | ||
|
||
with pytest.raises(TypeError, match="does not implement reduction"): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. blank line here |
||
getattr(np, method)(ser) | ||
|
||
with pytest.raises(TypeError, match="does not implement reduction"): | ||
getattr(df, method)(bool_only=False) | ||
|
||
# With bool_only=None, operating on this column raises and is ignored, | ||
# so we expect an empty result. | ||
result = getattr(df, method)(bool_only=None) | ||
expected = Series([], index=Index([]), dtype=bool) | ||
tm.assert_series_equal(result, expected) | ||
|
||
result = getattr(np, method)(df, axis=0) | ||
tm.assert_series_equal(result, expected) | ||
|
||
def test_median_categorical_dtype_nuisance_column(self): | ||
# GH#21020 DataFrame.median should match Series.median | ||
df = DataFrame({"A": Categorical([1, 2, 2, 2, 3])}) | ||
ser = df["A"] | ||
|
||
# Double-check the Series behavior is to raise | ||
with pytest.raises(TypeError, match="does not implement reduction"): | ||
ser.median() | ||
|
||
with pytest.raises(TypeError, match="does not implement reduction"): | ||
df.median(numeric_only=False) | ||
|
||
result = df.median() | ||
expected = Series([], index=Index([]), dtype=np.float64) | ||
tm.assert_series_equal(result, expected) | ||
|
||
# same thing, but with an additional non-categorical column | ||
df["B"] = df["A"].astype(int) | ||
|
||
with pytest.raises(TypeError, match="does not implement reduction"): | ||
df.median(numeric_only=False) | ||
|
||
result = df.median() | ||
expected = Series([2.0], index=["B"]) | ||
tm.assert_series_equal(result, expected) | ||
|
||
# TODO: np.median(df, axis=0) gives np.array([2.0, 2.0]) instead | ||
# of expected.values | ||
|
||
@pytest.mark.parametrize("method", ["min", "max"]) | ||
def test_min_max_categorical_dtype_non_ordered_nuisance_column(self, method): | ||
# GH#28949 DataFrame.min should behave like Series.min | ||
cat = Categorical(["a", "b", "c", "b"], ordered=False) | ||
ser = Series(cat) | ||
df = ser.to_frame("A") | ||
|
||
# Double-check the Series behavior | ||
with pytest.raises(TypeError, match="is not ordered for operation"): | ||
getattr(ser, method)() | ||
|
||
with pytest.raises(TypeError, match="is not ordered for operation"): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. blank line |
||
getattr(np, method)(ser) | ||
|
||
with pytest.raises(TypeError, match="is not ordered for operation"): | ||
getattr(df, method)(numeric_only=False) | ||
|
||
result = getattr(df, method)() | ||
expected = Series([], index=Index([]), dtype=np.float64) | ||
tm.assert_series_equal(result, expected) | ||
|
||
result = getattr(np, method)(df) | ||
tm.assert_series_equal(result, expected) | ||
|
||
# same thing, but with an additional non-categorical column | ||
df["B"] = df["A"].astype(object) | ||
result = getattr(df, method)() | ||
if method == "min": | ||
expected = Series(["a"], index=["B"]) | ||
else: | ||
expected = Series(["c"], index=["B"]) | ||
tm.assert_series_equal(result, expected) | ||
|
||
result = getattr(np, method)(df) | ||
tm.assert_series_equal(result, expected) | ||
|
||
def test_reduction_object_block_splits_nuisance_columns(self): | ||
# GH#37827 | ||
df = DataFrame({"A": [0, 1, 2], "B": ["a", "b", "c"]}, dtype=object) | ||
|
||
# We should only exclude "B", not "A" | ||
result = df.mean() | ||
expected = Series([1.0], index=["A"]) | ||
tm.assert_series_equal(result, expected) | ||
|
||
# Same behavior but heterogeneous dtype | ||
df["C"] = df["A"].astype(int) + 4 | ||
|
||
result = df.mean() | ||
expected = Series([1.0, 5.0], index=["A", "C"]) | ||
tm.assert_series_equal(result, expected) | ||
|
||
|
||
def test_sum_timedelta64_skipna_false(): | ||
# GH#17235 | ||
arr = np.arange(8).astype(np.int64).view("m8[s]").reshape(4, 2) | ||
|
@@ -1352,6 +1463,6 @@ def test_minmax_extensionarray(method, numeric_only): | |
df = DataFrame({"Int64": ser}) | ||
result = getattr(df, method)(numeric_only=numeric_only) | ||
expected = Series( | ||
[getattr(int64_info, method)], index=pd.Index(["Int64"], dtype="object") | ||
[getattr(int64_info, method)], index=Index(["Int64"], dtype="object") | ||
) | ||
tm.assert_series_equal(result, expected) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can you make sure to list all the issues you are closing the OP (3?)