Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Groupby dropped nan groups from result when grouping over single column #36842

Merged
merged 26 commits into from
Nov 4, 2020
Merged
Show file tree
Hide file tree
Changes from 23 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
b384c51
Fix dropped nas with one group column and dropna=False
phofl Oct 3, 2020
08c3088
Add whatsnew
phofl Oct 3, 2020
47649aa
Fix failing test
phofl Oct 4, 2020
c93590b
Address review
phofl Oct 6, 2020
c61ce7a
Address review comments
phofl Oct 6, 2020
ab333e4
Merge branch 'master' of https://github.com/pandas-dev/pandas into 35646
phofl Oct 7, 2020
dfa522a
Address review comments
phofl Oct 7, 2020
2e3e1bf
Change type hint
phofl Oct 7, 2020
d067280
Change type annotation
phofl Oct 7, 2020
5b5b673
Fix type hint to index
phofl Oct 9, 2020
b0a0372
Fix type hint because Index can not be imported
phofl Oct 9, 2020
721d3d6
Merge branch 'master' of https://github.com/pandas-dev/pandas into 35646
phofl Oct 9, 2020
3100511
Change import order
phofl Oct 9, 2020
6ca4324
Fix type hints
phofl Oct 10, 2020
06ce333
Merge branch 'master' of https://github.com/pandas-dev/pandas into 35646
phofl Oct 10, 2020
602d557
Merge branch 'master' of https://github.com/pandas-dev/pandas into 35646
phofl Oct 10, 2020
9d1c760
Merge branch 'master' of https://github.com/pandas-dev/pandas into 35646
phofl Oct 11, 2020
7e809da
Merge branch 'master' of https://github.com/pandas-dev/pandas into 35646
phofl Oct 14, 2020
551bec6
Merge branch 'master' of https://github.com/pandas-dev/pandas into 35646
phofl Oct 15, 2020
749161d
Merge branch 'master' of https://github.com/pandas-dev/pandas into 35646
phofl Oct 29, 2020
528fe0d
Fix pattern
phofl Oct 30, 2020
0d95d58
Merge branch 'master' of https://github.com/pandas-dev/pandas into 35646
phofl Oct 30, 2020
6dbd632
Merge branch 'master' into 35646
jreback Oct 31, 2020
9d7c403
Merge branch 'master' of https://github.com/pandas-dev/pandas into 35646
phofl Oct 31, 2020
00fe075
Merge branch '35646' of https://github.com/phofl/pandas into 35646
phofl Oct 31, 2020
f5b25cb
Add lost whatsnew
phofl Oct 31, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -515,6 +515,7 @@ Groupby/resample/rolling
- Bug in :meth:`DataFrameGroupBy.rolling` returned wrong values with timeaware window containing ``NaN``. Raises ``ValueError`` because windows are not monotonic now (:issue:`34617`)
- Bug in :meth:`Rolling.__iter__` where a ``ValueError`` was not raised when ``min_periods`` was larger than ``window`` (:issue:`37156`)
- Using :meth:`Rolling.var()` instead of :meth:`Rolling.std()` avoids numerical issues for :meth:`Rolling.corr()` when :meth:`Rolling.var()` is still within floating point precision while :meth:`Rolling.std()` is not (:issue:`31286`)
- Bug in :meth:`DataFrame.groupby` where ``nan`` groups were dropped from the result with ``dropna=False`` when grouping over a single column (:issue:`35646`, :issue:`35542`)
- Bug in :meth:`Rolling.median` and :meth:`Rolling.quantile` returned wrong values for :class:`BaseIndexer` subclasses with non-monotonic starting or ending points for windows (:issue:`37153`)

Reshaping
Expand Down
29 changes: 18 additions & 11 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -896,21 +896,28 @@ def indices_fast(ndarray index, const int64_t[:] labels, list keys,

if lab != cur:
if lab != -1:
tup = PyTuple_New(k)
for j in range(k):
val = keys[j][sorted_labels[j][i - 1]]
PyTuple_SET_ITEM(tup, j, val)
Py_INCREF(val)

if k == 1:
# When k = 1 we do not want to return a tuple as key
tup = keys[0][sorted_labels[0][i - 1]]
else:
tup = PyTuple_New(k)
for j in range(k):
val = keys[j][sorted_labels[j][i - 1]]
PyTuple_SET_ITEM(tup, j, val)
Py_INCREF(val)
result[tup] = index[start:i]
start = i
cur = lab

tup = PyTuple_New(k)
for j in range(k):
val = keys[j][sorted_labels[j][n - 1]]
PyTuple_SET_ITEM(tup, j, val)
Py_INCREF(val)
if k == 1:
# When k = 1 we do not want to return a tuple as key
tup = keys[0][sorted_labels[0][n - 1]]
else:
tup = PyTuple_New(k)
for j in range(k):
val = keys[j][sorted_labels[j][n - 1]]
PyTuple_SET_ITEM(tup, j, val)
Py_INCREF(val)
result[tup] = index[start:]

return result
Expand Down
9 changes: 3 additions & 6 deletions pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,12 +229,9 @@ def apply(self, f: F, data: FrameOrSeries, axis: int = 0):
@cache_readonly
def indices(self):
""" dict {group name -> group indices} """
if len(self.groupings) == 1:
return self.groupings[0].indices
else:
codes_list = [ping.codes for ping in self.groupings]
keys = [ping.group_index for ping in self.groupings]
return get_indexer_dict(codes_list, keys)
codes_list = [ping.codes for ping in self.groupings]
keys = [ping.group_index for ping in self.groupings]
return get_indexer_dict(codes_list, keys)

@property
def codes(self) -> List[np.ndarray]:
Expand Down
11 changes: 9 additions & 2 deletions pandas/core/sorting.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
TYPE_CHECKING,
Callable,
DefaultDict,
Dict,
Iterable,
List,
Optional,
Expand Down Expand Up @@ -528,16 +529,22 @@ def get_flattened_list(
return [tuple(array) for array in arrays.values()]


def get_indexer_dict(label_list, keys):
def get_indexer_dict(
label_list: List[np.ndarray], keys: List["Index"]
) -> Dict[Union[str, Tuple], np.ndarray]:
phofl marked this conversation as resolved.
Show resolved Hide resolved
"""
Returns
-------
dict
dict:
Labels mapped to indexers.
"""
shape = [len(x) for x in keys]

group_index = get_group_index(label_list, shape, sort=True, xnull=True)
if np.all(group_index == -1):
phofl marked this conversation as resolved.
Show resolved Hide resolved
# When all keys are nan and dropna=True, indices_fast can't handle this
# and the return is empty anyway
return {}
ngroups = (
((group_index.size and group_index.max()) + 1)
if is_int64_overflow_possible(shape)
Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1298,6 +1298,13 @@ def test_groupby_nat_exclude():
grouped.get_group(pd.NaT)


def test_groupby_two_group_keys_all_nan():
    # GH #36842: Grouping over two group keys shouldn't raise an error
    # Every row is NaN in both key columns and no ``dropna`` argument is
    # passed, so no group is expected to survive: ``indices`` must come back
    # as an empty dict instead of raising.
    df = DataFrame({"a": [np.nan, np.nan], "b": [np.nan, np.nan], "c": [1, 2]})
    result = df.groupby(["a", "b"]).indices
    assert result == {}


def test_groupby_2d_malformed():
d = DataFrame(index=range(2))
d["group"] = ["g1", "g2"]
Expand Down
20 changes: 19 additions & 1 deletion pandas/tests/groupby/test_groupby_dropna.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import pytest

import pandas as pd
import pandas.testing as tm
import pandas._testing as tm


@pytest.mark.parametrize(
Expand Down Expand Up @@ -335,3 +335,21 @@ def test_groupby_apply_with_dropna_for_multi_index(dropna, data, selected_data,

expected = pd.DataFrame(selected_data, index=mi)
tm.assert_frame_equal(result, expected)


def test_groupby_nan_included():
    # GH 35646
    # Grouping over a single column with dropna=False must keep an entry for
    # the NaN group in ``indices``.
    data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]}
    df = pd.DataFrame(data)
    grouped = df.groupby("group", dropna=False)
    result = grouped.indices
    # Use the platform index integer instead of a fixed "int64" so the
    # dtype-checked comparison below does not depend on pointer width
    # (identical to int64 on 64-bit builds).
    # NOTE(review): assumes the indexers are intp arrays - confirm on 32-bit.
    dtype = np.intp
    expected = {
        "g1": np.array([0, 2], dtype=dtype),
        "g2": np.array([3], dtype=dtype),
        np.nan: np.array([1, 4], dtype=dtype),
    }
    # zip() stops at the shorter input, so pin the number of groups first;
    # otherwise a missing group would simply be skipped by the loop.
    assert len(result) == len(expected)
    for result_values, expected_values in zip(result.values(), expected.values()):
        tm.assert_numpy_array_equal(result_values, expected_values)
    # NaN never compares equal to itself, so the NaN key is checked with
    # np.isnan and only the remaining keys are compared by equality.
    result_keys = list(result.keys())
    assert np.isnan(result_keys[2])
    assert result_keys[0:2] == ["g1", "g2"]
15 changes: 15 additions & 0 deletions pandas/tests/window/test_rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -1087,3 +1087,18 @@ def test_rolling_corr_timedelta_index(index, window):
result = x.rolling(window).corr(y)
expected = Series([np.nan, np.nan, 1, 1, 1], index=index)
tm.assert_almost_equal(result, expected)


def test_groupby_rolling_nan_included():
    # GH 35542
    # Rolling over a groupby created with dropna=False must keep the rows
    # whose group key is NaN.
    data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]}
    df = DataFrame(data)
    # Window of 1 with min_periods=1: each rolling mean is the row's own "B"
    # value as a float, so only the grouping/ordering is under test.
    result = df.groupby("group", dropna=False).rolling(1, min_periods=1).mean()
    expected = DataFrame(
        {"B": [0.0, 2.0, 3.0, 1.0, 4.0]},
        # Index entries are (group key, original row label); rows are ordered
        # g1, g2, then the NaN group last.
        index=pd.MultiIndex.from_tuples(
            [("g1", 0), ("g1", 2), ("g2", 3), (np.nan, 1), (np.nan, 4)],
            names=["group", None],
        ),
    )
    tm.assert_frame_equal(result, expected)
tm.assert_frame_equal(result, expected)