Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Groupby dropped nan groups from result when grouping over single column #36842

Merged
merged 26 commits into from
Nov 4, 2020
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
b384c51
Fix dropped nas with one group column and dropna=False
phofl Oct 3, 2020
08c3088
Add whatsnew
phofl Oct 3, 2020
47649aa
Fix failing test
phofl Oct 4, 2020
c93590b
Address review
phofl Oct 6, 2020
c61ce7a
Address review comments
phofl Oct 6, 2020
ab333e4
Merge branch 'master' of https://github.com/pandas-dev/pandas into 35646
phofl Oct 7, 2020
dfa522a
Address review comments
phofl Oct 7, 2020
2e3e1bf
Change type hint
phofl Oct 7, 2020
d067280
Change type annotation
phofl Oct 7, 2020
5b5b673
Fix type hint to index
phofl Oct 9, 2020
b0a0372
Fix type hint because Index can not be imported
phofl Oct 9, 2020
721d3d6
Merge branch 'master' of https://github.com/pandas-dev/pandas into 35646
phofl Oct 9, 2020
3100511
Change import order
phofl Oct 9, 2020
6ca4324
Fix type hints
phofl Oct 10, 2020
06ce333
Merge branch 'master' of https://github.com/pandas-dev/pandas into 35646
phofl Oct 10, 2020
602d557
Merge branch 'master' of https://github.com/pandas-dev/pandas into 35646
phofl Oct 10, 2020
9d1c760
Merge branch 'master' of https://github.com/pandas-dev/pandas into 35646
phofl Oct 11, 2020
7e809da
Merge branch 'master' of https://github.com/pandas-dev/pandas into 35646
phofl Oct 14, 2020
551bec6
Merge branch 'master' of https://github.com/pandas-dev/pandas into 35646
phofl Oct 15, 2020
749161d
Merge branch 'master' of https://github.com/pandas-dev/pandas into 35646
phofl Oct 29, 2020
528fe0d
Fix pattern
phofl Oct 30, 2020
0d95d58
Merge branch 'master' of https://github.com/pandas-dev/pandas into 35646
phofl Oct 30, 2020
6dbd632
Merge branch 'master' into 35646
jreback Oct 31, 2020
9d7c403
Merge branch 'master' of https://github.com/pandas-dev/pandas into 35646
phofl Oct 31, 2020
00fe075
Merge branch '35646' of https://github.com/phofl/pandas into 35646
phofl Oct 31, 2020
f5b25cb
Add lost whatsnew
phofl Oct 31, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -410,6 +410,7 @@ Groupby/resample/rolling
- Bug in :meth:`Rolling.sum()` returned wrong values when dtypes were mixed between float and integer and axis was equal to one (:issue:`20649`, :issue:`35596`)
- Bug in :meth:`Rolling.count` returned ``np.nan`` with :class:`pandas.api.indexers.FixedForwardWindowIndexer` as window, ``min_periods=0`` and only missing values in window (:issue:`35579`)
- Bug where :class:`pandas.core.window.Rolling` produces incorrect window sizes when using a ``PeriodIndex`` (:issue:`34225`)
- Bug in :meth:`DataFrame.groupby` dropped ``nan`` groups from result with ``dropna=False`` when grouping over a single column (:issue:`35646`, :issue:`35542`)

Reshaping
^^^^^^^^^
Expand Down
5 changes: 4 additions & 1 deletion pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -902,7 +902,8 @@ def indices_fast(ndarray index, const int64_t[:] labels, list keys,
val = keys[j][sorted_labels[j][i - 1]]
PyTuple_SET_ITEM(tup, j, val)
Py_INCREF(val)

if len(tup) == 1:
tup = tup[0]
phofl marked this conversation as resolved.
Show resolved Hide resolved
result[tup] = index[start:i]
start = i
cur = lab
Expand All @@ -912,6 +913,8 @@ def indices_fast(ndarray index, const int64_t[:] labels, list keys,
val = keys[j][sorted_labels[j][n - 1]]
PyTuple_SET_ITEM(tup, j, val)
Py_INCREF(val)
if len(tup) == 1:
tup = tup[0]
result[tup] = index[start:]

return result
Expand Down
9 changes: 3 additions & 6 deletions pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,12 +217,9 @@ def apply(self, f: F, data: FrameOrSeries, axis: int = 0):
@cache_readonly
def indices(self):
""" dict {group name -> group indices} """
if len(self.groupings) == 1:
return self.groupings[0].indices
else:
codes_list = [ping.codes for ping in self.groupings]
keys = [ping.group_index for ping in self.groupings]
return get_indexer_dict(codes_list, keys)
codes_list = [ping.codes for ping in self.groupings]
keys = [ping.group_index for ping in self.groupings]
return get_indexer_dict(codes_list, keys)

@property
def codes(self) -> List[np.ndarray]:
Expand Down
2 changes: 2 additions & 0 deletions pandas/core/sorting.py
Original file line number Diff line number Diff line change
Expand Up @@ -527,6 +527,8 @@ def get_indexer_dict(label_list, keys):
shape = [len(x) for x in keys]

group_index = get_group_index(label_list, shape, sort=True, xnull=True)
if np.all(group_index == -1):
phofl marked this conversation as resolved.
Show resolved Hide resolved
return {}
ngroups = (
((group_index.size and group_index.max()) + 1)
if is_int64_overflow_possible(shape)
Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1297,6 +1297,13 @@ def test_groupby_nat_exclude():
grouped.get_group(pd.NaT)


# Regression test: grouping by two columns whose values are all NaN must let
# ``.indices`` be read without raising, and the expected mapping is empty.
def test_groupby_two_group_keys_all_nan():
# GH #36842: Grouping over two group keys shouldn't raise an error
# Both key columns ("a" and "b") contain only NaN; "c" is just payload data.
df = pd.DataFrame({"a": [np.nan, np.nan], "b": [np.nan, np.nan], "c": [1, 2]})
# No ``dropna`` argument is passed to ``groupby`` in this test.
result = df.groupby(["a", "b"]).indices
# With no valid key combination there is nothing to index.
assert result == {}


def test_groupby_2d_malformed():
d = DataFrame(index=range(2))
d["group"] = ["g1", "g2"]
Expand Down
26 changes: 25 additions & 1 deletion pandas/tests/groupby/test_groupby_dropna.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import pytest

import pandas as pd
import pandas.testing as tm
import pandas._testing as tm


@pytest.mark.parametrize(
Expand Down Expand Up @@ -336,3 +336,27 @@ def test_groupby_apply_with_dropna_for_multi_index(dropna, data, selected_data,

expected = pd.DataFrame(selected_data, index=mi)
tm.assert_frame_equal(result, expected)


# Regression test: with ``dropna=False`` and a single grouping column, the NaN
# group must appear both in ``.indices`` and in the aggregated result.
def test_groupby_nan_included():
# GH 35646, GH 35542
# Rows 1 and 4 have a missing group label; rows 0/2 are "g1" and row 3 is "g2".
data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]}
df = pd.DataFrame(data)
grouped = df.groupby("group", dropna=False)
result = grouped.indices
dtype = "int64"
# Expected positional indices per group label, NaN group last.
expected = {
"g1": np.array([0, 2], dtype=dtype),
"g2": np.array([3], dtype=dtype),
np.nan: np.array([1, 4], dtype=dtype),
}
# Values are compared pairwise in iteration order, so this relies on both
# dicts yielding their entries in the same order (g1, g2, NaN).
for result_values, expected_values in zip(result.values(), expected.values()):
tm.assert_numpy_array_equal(result_values, expected_values)
# NaN never compares equal to itself, so the third key is checked with
# ``np.isnan`` and only the first two keys are compared by equality.
assert np.isnan(list(result.keys())[2])
assert list(result.keys())[0:2] == ["g1", "g2"]

# Aggregation must keep the NaN group too: mean of B is (0+2)/2 for "g1",
# 3 for "g2" and (1+4)/2 for the NaN group.
result = grouped.mean()
expected = pd.DataFrame(
{"B": [1.0, 3.0, 2.5]}, index=pd.Index(["g1", "g2", np.nan], name="group")
)
tm.assert_frame_equal(result, expected)