From 7dc4edb2e2e945ea028d1e923732b30ac2bed92c Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sat, 3 Nov 2018 06:56:52 -0700 Subject: [PATCH] ENH: Add FrozenList.union and .difference (#23394) Re-attempt of gh-15506. Closes gh-15475. --- doc/source/groupby.rst | 10 +++++++ doc/source/whatsnew/v0.24.0.txt | 5 ++-- pandas/core/indexes/frozen.py | 42 +++++++++++++++++++++++++---- pandas/tests/indexes/test_frozen.py | 23 +++++++++++++--- 4 files changed, 69 insertions(+), 11 deletions(-) diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index fecc336049a40b..0a896bac0f2d72 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -125,6 +125,16 @@ We could naturally group by either the ``A`` or ``B`` columns, or both: grouped = df.groupby('A') grouped = df.groupby(['A', 'B']) +.. versionadded:: 0.24 + +If we also have a MultiIndex on columns ``A`` and ``B``, we can group by all +but the specified columns + +.. ipython:: python + + df2 = df.set_index(['A', 'B']) + grouped = df2.groupby(level=df2.index.names.difference(['B'])) + These will split the DataFrame on its index (rows). We could also split by the columns: diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 64cc098ccaa94d..b19003e1c12846 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -13,10 +13,9 @@ v0.24.0 (Month XX, 2018) New features ~~~~~~~~~~~~ - :func:`merge` now directly allows merge between objects of type ``DataFrame`` and named ``Series``, without the need to convert the ``Series`` object into a ``DataFrame`` beforehand (:issue:`21220`) - - - ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling append to existing workbooks when using the ``openpyxl`` engine (:issue:`3441`) - +- ``FrozenList`` has gained the ``.union()`` and ``.difference()`` methods. This functionality greatly simplifies groupby's that rely on explicitly excluding certain columns. 
See :ref:`Splitting an object into groups +<groupby.split>` for more information (:issue:`15475`, :issue:`15506`) - :func:`DataFrame.to_parquet` now accepts ``index`` as an argument, allowing the user to override the engine's default behavior to include or omit the dataframe's indexes from the resulting Parquet file. (:issue:`20768`) diff --git a/pandas/core/indexes/frozen.py b/pandas/core/indexes/frozen.py index 4f782e22c2370d..3ac4a2bf31a7e2 100644 --- a/pandas/core/indexes/frozen.py +++ b/pandas/core/indexes/frozen.py @@ -23,15 +23,47 @@ class FrozenList(PandasObject, list): because it's technically non-hashable, will be used for lookups, appropriately, etc. """ - # Sidenote: This has to be of type list, otherwise it messes up PyTables - # typechecks + # Side note: This has to be of type list. Otherwise, + # it messes up PyTables type checks. - def __add__(self, other): + def union(self, other): + """ + Returns a FrozenList with other concatenated to the end of self. + + Parameters + ---------- + other : array-like + The array-like whose elements we are concatenating. + + Returns + ------- + union : FrozenList + The FrozenList formed by concatenating other to the end of self. + """ if isinstance(other, tuple): other = list(other) - return self.__class__(super(FrozenList, self).__add__(other)) + return type(self)(super(FrozenList, self).__add__(other)) + + def difference(self, other): + """ + Returns a FrozenList with elements from other removed from self. + + Parameters + ---------- + other : array-like + The array-like whose elements we are removing from self. + + Returns + ------- + diff : FrozenList + The collection difference between self and other. 
+ """ + other = set(other) + temp = [x for x in self if x not in other] + return type(self)(temp) - __iadd__ = __add__ + # TODO: Consider deprecating these in favor of `union` (xref gh-15506) + __add__ = __iadd__ = union # Python 2 compat def __getslice__(self, i, j): diff --git a/pandas/tests/indexes/test_frozen.py b/pandas/tests/indexes/test_frozen.py index e62329dec98463..db9f875b77b8a9 100644 --- a/pandas/tests/indexes/test_frozen.py +++ b/pandas/tests/indexes/test_frozen.py @@ -11,7 +11,7 @@ class TestFrozenList(CheckImmutable, CheckStringMixin): mutable_methods = ('extend', 'pop', 'remove', 'insert') unicode_container = FrozenList([u("\u05d0"), u("\u05d1"), "c"]) - def setup_method(self, method): + def setup_method(self, _): self.lst = [1, 2, 3, 4, 5] self.container = FrozenList(self.lst) self.klass = FrozenList @@ -25,13 +25,30 @@ def test_add(self): expected = FrozenList([1, 2, 3] + self.lst) self.check_result(result, expected) - def test_inplace(self): + def test_iadd(self): q = r = self.container + q += [5] self.check_result(q, self.lst + [5]) - # other shouldn't be mutated + + # Other shouldn't be mutated. self.check_result(r, self.lst) + def test_union(self): + result = self.container.union((1, 2, 3)) + expected = FrozenList(self.lst + [1, 2, 3]) + self.check_result(result, expected) + + def test_difference(self): + result = self.container.difference([2]) + expected = FrozenList([1, 3, 4, 5]) + self.check_result(result, expected) + + def test_difference_dupe(self): + result = FrozenList([1, 2, 3, 2]).difference([2]) + expected = FrozenList([1, 3]) + self.check_result(result, expected) + class TestFrozenNDArray(CheckImmutable, CheckStringMixin): mutable_methods = ('put', 'itemset', 'fill')