From 090a75840220090507be98b8a42b3437817cc496 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 7 Jun 2020 15:27:18 -0500 Subject: [PATCH 01/35] ENH: Optionally disallow duplicate labels --- doc/source/user_guide/duplicates.rst | 165 ++++++++++++ doc/source/user_guide/index.rst | 1 + doc/source/whatsnew/v1.1.0.rst | 41 +++ pandas/core/frame.py | 24 +- pandas/core/generic.py | 21 ++ pandas/core/indexes/base.py | 20 ++ pandas/core/series.py | 32 ++- pandas/errors/__init__.py | 13 + pandas/tests/test_duplicate_labels.py | 357 ++++++++++++++++++++++++++ 9 files changed, 669 insertions(+), 5 deletions(-) create mode 100644 doc/source/user_guide/duplicates.rst create mode 100644 pandas/tests/test_duplicate_labels.py diff --git a/doc/source/user_guide/duplicates.rst b/doc/source/user_guide/duplicates.rst new file mode 100644 index 0000000000000..7f83df25f9cbb --- /dev/null +++ b/doc/source/user_guide/duplicates.rst @@ -0,0 +1,165 @@ +.. _duplicates: + +**************** +Duplicate Labels +**************** + +:class:`Index` objects are not required to be unique; you can have duplicate row +or column labels. This may be a bit confusing at first. If you're familiar with +SQL, you know that row labels are similar to a primary key on a table, and you +would never want duplicates in a SQL table. But one of pandas' roles is to clean +messy, real-world data before it goes to some downstream system. And real-world +data has duplicates, even in fields that are supposed to be unique. + +This section describes how duplicate labels change the behavior of certain +operations, and how to prevent duplicates from arising during operations, or to +detect them if they do. + +.. ipython:: python + + import pandas as pd + import numpy as np + +Consequences of Duplicate Labels +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Some pandas methods (:meth:`Series.reindex` for example) just don't work with +duplicates present. The output can't be determined, and so pandas raises. + +.. 
ipython:: python + :okexcept: + + s1 = pd.Series([0, 1, 2], index=['a', 'b', 'b']) + s1.reindex(['a', 'b', 'c']) + +Other methods, like indexing, can give very surprising results. Typically +indexing with a scalar will *reduce dimensionality*. Slicing a ``DataFrame`` +with a scalar will return a ``Series``. Slicing a ``Series`` with a scalar will +return a scalar. But with duplicates, this isn't the case. + +.. ipython:: python + + df1 = pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=['A', 'A', 'B']) + df1 + +We have duplicates in the columns. If we slice ``'B'``, we get back a ``Series`` + +.. ipython:: python + + df1['B'] # a series + +But slicing ``'A'`` returns a ``DataFrame`` + + +.. ipython:: python + + df1['A'] # a DataFrame + +This applies to row labels as well + +.. ipython:: python + + df2 = pd.DataFrame({"A": [0, 1, 2]}, index=['a', 'a', 'b']) + df2 + df2.loc['b', 'A'] # a scalar + df2.loc['a', 'A'] # a Series + +Duplicate Label Detection +~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can check with an :class:`Index` (storing the row or column labels) is +unique with :attr:`Index.is_unique`: + +.. ipython:: python + + df2 + df2.index.is_unique + df2.columns.is_unique + +.. note:: + + Checking whether an index is unique is somewhat expensive for large datasets. + Pandas does cache this result, so re-checking on the same index is very fast. + +:meth:`Index.duplicated` will return a boolean ndarray indicating whether a +label is a repeat. + +.. ipython:: python + + df2.index.duplicated() + +Which can be used as a boolean filter to drop duplicate rows. + +.. ipython:: python + + df2.loc[~df2.index.duplicated(), :] + +If you need additional logic to handle duplicate labels, rather than just +dropping the repeats, using :meth:`~DataFrame.groupby` on the index is a common +trick. For example, we'll resolve duplicates by taking the average of all rows +with the same label. + +.. ipython:: python + + df2.groupby(level=0).mean() + +.. 
_duplicates.disallow: + +Disallowing Duplicate Labels +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. versionadded:: 1.1.0 + +As noted above, handling duplicates is an important feature when reading in raw +data. That said, you may want to avoid introducing duplicates as part of a data +processing pipeline (from methods like :meth:`pandas.concat`, +:meth:`~DataFrame.rename`, etc.). Both :class:`Series` and :class:`DataFrame` +can be created with the argument ``allows_duplicate_labels=False`` to *disallow* +duplicate labels (the default is to allow them). If there are duplicate labels, +an exception will be raised. + +.. ipython:: python + :okexcept: + + pd.Series([0, 1, 2], index=['a', 'b', 'b'], allows_duplicate_labels=False) + +This applies to both row and column labels for a :class:`DataFrame` + +.. ipython:: python + :okexcept: + + pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=["A", "B", "C"], + allows_duplicate_labels=False) + +This attribute can be checked with :attr:`~DataFrame.allows_duplicate_labels`, +which indicates whether that object can have duplicate labels. + +.. ipython:: python + + df = pd.DataFrame({"A": [0, 1, 2, 3]}, + index=['x', 'y', 'X', 'Y'], + allows_duplicate_labels=False) + df + df.allows_duplicate_labels + +Performing an operation that introduces duplicate labels on a ``Series`` or +``DataFrame`` that disallows duplicates will raise an +:class:`errors.DuplicateLabelError`. + +.. ipython:: python + :okexcept: + + df.rename(str.upper) + +Duplicate Label Propagation +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In general, disallowing duplicates is "sticky". It's preserved through +operations. + +.. 
ipython:: python + :okexcept: + + s1 = pd.Series(0, index=['a', 'b'], allows_duplicate_labels=False) + s1 + s1.head().rename({"a": "b"}) diff --git a/doc/source/user_guide/index.rst b/doc/source/user_guide/index.rst index 8226e72779588..2fc9e066e6712 100644 --- a/doc/source/user_guide/index.rst +++ b/doc/source/user_guide/index.rst @@ -33,6 +33,7 @@ Further information on any specific method can be obtained in the reshaping text missing_data + duplicates categorical integer_na boolean diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 2243790a663df..d43ad35b43619 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -203,6 +203,47 @@ For example: pd.to_datetime(tz_strs, format='%Y-%m-%d %H:%M:%S %z', utc=True) pd.to_datetime(tz_strs, format='%Y-%m-%d %H:%M:%S %z') +.. _whatsnew_110.duplicate_labels: + +Optionally disallow duplicate labels +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:class:`Series` and :class:`DataFrame` can now be created with ``allows_duplicate_labels=False`` flag to +control whether the index or columns can contain duplicate labels. This can be used to prevent accidental +introduction of duplicate labels, which can affect downstream operations. + +By default, duplicates continue to be allowed + +.. ipython:: python + + pd.Series([1, 2], index=['a', 'a']) + +.. ipython:: python + :okexcept: + + pd.Series([1, 2], index=['a', 'a'], allows_duplicate_labels=False) + +Pandas will propagate the ``allows_duplicate_labels`` property through many operations. + +.. ipython:: python + :okexcept: + + a = pd.Series([1, 2], index=['a', 'b'], allows_duplicate_labels=False) + a + # An operation introducing duplicates + a.reindex(['a', 'b', 'a']) + +.. warning:: + + This is an experimental feature. Currently, many methods fail to + propagate the ``allows_duplicate_labels`` value. 
In future versions + it is expected that every method taking or returning one or more + DataFrame or Series objects will propage ``allows_duplicate_labels`` + +See :ref:`duplicates` for more. + + + .. _whatsnew_110.grouper_resample_origin: Grouper and resample now supports the arguments origin and offset diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5f8ab8966c1f0..efd826664425f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -361,6 +361,23 @@ class DataFrame(NDFrame): Data type to force. Only a single dtype is allowed. If None, infer. copy : bool, default False Copy data from inputs. Only affects DataFrame / 2d ndarray input. + allows_duplicate_labels : bool, default True + Whether to allow duplicate row or column labels in this DataFrame. + By default, duplicate labels are permitted. Setting this to ``False`` + will cause an :class:`errors.DuplicateLabelError` to be raised when + `index` or `columns` are not unique, or when any subsequent operation + on this DataFrame introduces duplicates. See :ref:`duplicates.disallow` + for more. + + .. versionadded:: 1.1.0 + + .. warning:: + + This is an experimental feature. Currently, many methods fail to + propagate the ``allows_duplicate_labels`` value. 
In future versions + it is expected that every method taking or returning one or more + DataFrame or Series objects will propage ``allows_duplicate_labels`` + See Also -------- @@ -437,6 +454,7 @@ def __init__( columns: Optional[Axes] = None, dtype: Optional[Dtype] = None, copy: bool = False, + allows_duplicate_labels=True, ): if data is None: data = {} @@ -449,7 +467,9 @@ def __init__( if isinstance(data, BlockManager): if index is None and columns is None and dtype is None and copy is False: # GH#33357 fastpath - NDFrame.__init__(self, data) + NDFrame.__init__( + self, data, allows_duplicate_labels=allows_duplicate_labels + ) return mgr = self._init_mgr( @@ -535,7 +555,7 @@ def __init__( else: raise ValueError("DataFrame constructor not properly called!") - NDFrame.__init__(self, mgr) + NDFrame.__init__(self, mgr, allows_duplicate_labels=allows_duplicate_labels) # ---------------------------------------------------------------------- diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 714a332be2196..e66914258f160 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -202,6 +202,7 @@ def __init__( self, data: BlockManager, copy: bool = False, + allows_duplicate_labels: bool = True, attrs: Optional[Mapping[Optional[Hashable], Any]] = None, ): # copy kwarg is retained for mypy compat, is not used @@ -214,6 +215,7 @@ def __init__( else: attrs = dict(attrs) object.__setattr__(self, "_attrs", attrs) + object.__setattr__(self, "allows_duplicate_labels", allows_duplicate_labels) @classmethod def _init_mgr(cls, mgr, axes, dtype=None, copy: bool = False) -> BlockManager: @@ -252,6 +254,22 @@ def attrs(self) -> Dict[Optional[Hashable], Any]: def attrs(self, value: Mapping[Optional[Hashable], Any]) -> None: self._attrs = dict(value) + @property + def allows_duplicate_labels(self) -> bool: + """ + Whether this object allows duplicate labels. 
+ """ + return self._allows_duplicate_labels + + @allows_duplicate_labels.setter + def allows_duplicate_labels(self, value: bool): + value = bool(value) + if not value: + for ax in self.axes: + ax._maybe_check_unique() + + self._allows_duplicate_labels = value + @classmethod def _validate_dtype(cls, dtype): """ validate the passed dtype """ @@ -5198,10 +5216,13 @@ def __finalize__( if isinstance(other, NDFrame): for name in other.attrs: self.attrs[name] = other.attrs[name] + + self.allows_duplicate_labels = other.allows_duplicate_labels # For subclasses using _metadata. for name in self._metadata: assert isinstance(name, str) object.__setattr__(self, name, getattr(other, name, None)) + return self def __getattr__(self, name: str): diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 746fd140e48a1..82d80f2564fa9 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -476,6 +476,26 @@ def _simple_new(cls, values, name: Label = None): def _constructor(self): return type(self) + def _maybe_check_unique(self): + from pandas.errors import DuplicateLabelError + + if not self.is_unique: + # TODO: position, value, not too large. + msg = """Index has duplicates.""" + duplicates = self._format_duplicate_message() + msg += "\n{}".format(duplicates) + + raise DuplicateLabelError(msg) + + def _format_duplicate_message(self): + from pandas import Series + + duplicates = self[self.duplicated(keep="first")].unique() + assert len(duplicates) + + out = Series(np.arange(len(self))).groupby(self).agg(list)[duplicates] + return out.rename_axis("label").to_frame(name="positions") + # -------------------------------------------------------------------- # Index Internals Methods diff --git a/pandas/core/series.py b/pandas/core/series.py index ef47e52151961..a2a4fd85fff8a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -178,6 +178,21 @@ class Series(base.IndexOpsMixin, generic.NDFrame): The name to give to the Series. 
copy : bool, default False Copy input data. + allows_duplicate_labels : bool, default True + Whether to allow duplicate labels in this Series. By default, + duplicate labels are permitted. Setting this to ``False`` will + cause an :class:`errors.DuplicateLabelError` to be raised when + `index` is not unique, or any subsequent operation on this Series + introduces duplicates. See :ref:`duplicates.disallow` for more. + + .. versionadded:: 1.1.0 + + .. warning:: + + This is an experimental feature. Currently, many methods fail to + propagate the ``allows_duplicate_labels`` value. In future versions + it is expected that every method taking or returning one or more + DataFrame or Series objects will propage ``allows_duplicate_labels`` """ _typ = "series" @@ -204,7 +219,14 @@ class Series(base.IndexOpsMixin, generic.NDFrame): # Constructors def __init__( - self, data=None, index=None, dtype=None, name=None, copy=False, fastpath=False + self, + data=None, + index=None, + dtype=None, + name=None, + copy=False, + allows_duplicate_labels=True, + fastpath=False, ): if ( @@ -214,7 +236,9 @@ def __init__( and copy is False ): # GH#33357 called with just the SingleBlockManager - NDFrame.__init__(self, data) + NDFrame.__init__( + self, data, allows_duplicate_labels=allows_duplicate_labels + ) self.name = name return @@ -333,7 +357,9 @@ def __init__( data = SingleBlockManager.from_array(data, index) - generic.NDFrame.__init__(self, data) + generic.NDFrame.__init__( + self, data, allows_duplicate_labels=allows_duplicate_labels + ) self.name = name self._set_axis(0, index, fastpath=True) diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 4c4ce9df85543..716b278eb15a6 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -200,3 +200,16 @@ class NumbaUtilError(Exception): """ Error raised for unsupported Numba engine routines. 
""" + + +class DuplicateLabelError(ValueError): + """ + Error raised when an operation would introduce duplicate labels. + Examples + -------- + >>> s = pd.Series([0, 1, 2], index=['a', 'b', 'c'], allows_duplicate_labels=False) + >>> s.reindex(['a', 'a', 'b']) + Traceback (most recent call last): + ... + DuplicateLabelError: Duplicate labels are not allowed on this pandas object. + """ diff --git a/pandas/tests/test_duplicate_labels.py b/pandas/tests/test_duplicate_labels.py new file mode 100644 index 0000000000000..7ec315cb89b41 --- /dev/null +++ b/pandas/tests/test_duplicate_labels.py @@ -0,0 +1,357 @@ +"""Tests dealing with the NDFrame.allows_duplicate_labels.""" +import operator + +import numpy as np +import pytest + +import pandas.errors + +import pandas as pd + +not_implemented = pytest.mark.xfail(reason="Not implemented.") + +# ---------------------------------------------------------------------------- +# Preservation + + +class TestPreserves: + @pytest.mark.parametrize( + "cls, data", + [ + (pd.Series, np.array([])), + (pd.Series, [1, 2]), + (pd.DataFrame, {}), + (pd.DataFrame, {"A": [1, 2]}), + ], + ) + def test_construction_ok(self, cls, data): + result = cls(data) + assert result.allows_duplicate_labels is True + + result = cls(data, allows_duplicate_labels=False) + assert result.allows_duplicate_labels is False + + @pytest.mark.parametrize( + "func", + [ + operator.itemgetter(["a"]), + operator.methodcaller("add", 1), + operator.methodcaller("rename", str.upper), + operator.methodcaller("rename", "name"), + pytest.param(operator.methodcaller("abs"), marks=not_implemented), + # TODO: test np.abs + ], + ) + def test_preserved_series(self, func): + s = pd.Series([0, 1], index=["a", "b"], allows_duplicate_labels=False) + assert func(s).allows_duplicate_labels is False + + @pytest.mark.parametrize( + "other", [pd.Series(0, index=["a", "b", "c"]), pd.Series(0, index=["a", "b"])] + ) + # TODO: frame + @not_implemented + def test_align(self, other): + s = 
pd.Series([0, 
1], index=["a", "b"], allows_duplicate_labels=False) + a, b = s.align(other) + assert a.allows_duplicate_labels is False + assert b.allows_duplicate_labels is False + + def test_preserved_frame(self): + df = pd.DataFrame( + {"A": [1, 2], "B": [3, 4]}, index=["a", "b"], allows_duplicate_labels=False + ) + assert df.loc[["a"]].allows_duplicate_labels is False + assert df.loc[:, ["A", "B"]].allows_duplicate_labels is False + + @not_implemented + def test_to_frame(self): + s = pd.Series(dtype=float, allows_duplicate_labels=False) + assert s.to_frame().allows_duplicate_labels is False + + @pytest.mark.parametrize("func", ["add", "sub"]) + @pytest.mark.parametrize( + "frame", [False, pytest.param(True, marks=not_implemented)] + ) + @pytest.mark.parametrize("other", [1, pd.Series([1, 2], name="A")]) + def test_binops(self, func, other, frame): + df = pd.Series( + [1, 2], name="A", index=["a", "b"], allows_duplicate_labels=False + ) + if frame: + df = df.to_frame() + if isinstance(other, pd.Series) and frame: + other = other.to_frame() + func = operator.methodcaller(func, other) + assert df.allows_duplicate_labels is False + assert func(df).allows_duplicate_labels is False + + @not_implemented + def test_preserve_getitem(self): + df = pd.DataFrame({"A": [1, 2]}, allows_duplicate_labels=False) + assert df[["A"]].allows_duplicate_labels is False + assert df["A"].allows_duplicate_labels is False + assert df.loc[0].allows_duplicate_labels is False + assert df.loc[[0]].allows_duplicate_labels is False + assert df.loc[0, ["A"]].allows_duplicate_labels is False + + @pytest.mark.xfail(resason="Unclear behavior.") + def test_ndframe_getitem_caching_issue(self): + # NDFrame.__getitem__ will cache the first df['A']. May need to + # invalidate that cache? Update the cached entries? 
+ df = pd.DataFrame({"A": [0]}, allows_duplicate_labels=False) + assert df["A"].allows_duplicate_labels is False + df.allows_duplicate_labels = True + assert df["A"].allows_duplicate_labels is True + + @pytest.mark.parametrize( + "objs, kwargs", + [ + # Series + ( + [ + pd.Series(1, index=["a", "b"], allows_duplicate_labels=False), + pd.Series(2, index=["c", "d"], allows_duplicate_labels=False), + ], + {}, + ), + ( + [ + pd.Series(1, index=["a", "b"], allows_duplicate_labels=False), + pd.Series(2, index=["a", "b"], allows_duplicate_labels=False), + ], + {"ignore_index": True}, + ), + ( + [ + pd.Series(1, index=["a", "b"], allows_duplicate_labels=False), + pd.Series(2, index=["a", "b"], allows_duplicate_labels=False), + ], + {"axis": 1}, + ), + # Frame + ( + [ + pd.DataFrame( + {"A": [1, 2]}, index=["a", "b"], allows_duplicate_labels=False + ), + pd.DataFrame( + {"A": [1, 2]}, index=["c", "d"], allows_duplicate_labels=False + ), + ], + {}, + ), + ( + [ + pd.DataFrame( + {"A": [1, 2]}, index=["a", "b"], allows_duplicate_labels=False + ), + pd.DataFrame( + {"A": [1, 2]}, index=["a", "b"], allows_duplicate_labels=False + ), + ], + {"ignore_index": True}, + ), + ( + [ + pd.DataFrame( + {"A": [1, 2]}, index=["a", "b"], allows_duplicate_labels=False + ), + pd.DataFrame( + {"B": [1, 2]}, index=["a", "b"], allows_duplicate_labels=False + ), + ], + {"axis": 1}, + ), + # Series / Frame + ( + [ + pd.DataFrame( + {"A": [1, 2]}, index=["a", "b"], allows_duplicate_labels=False + ), + pd.Series( + [1, 2], + index=["a", "b"], + name="B", + allows_duplicate_labels=False, + ), + ], + {"axis": 1}, + ), + ], + ) + @not_implemented + def test_concat(self, objs, kwargs): + result = pd.concat(objs, **kwargs) + assert result.allows_duplicate_labels is False + + @pytest.mark.parametrize( + "left, right, kwargs, expected", + [ + # false false false + pytest.param( + pd.DataFrame( + {"A": [0, 1]}, index=["a", "b"], allows_duplicate_labels=False + ), + pd.DataFrame( + {"B": [0, 1]}, 
index=["a", "d"], allows_duplicate_labels=False + ), + dict(left_index=True, right_index=True), + False, + marks=not_implemented, + ), + # false true false + pytest.param( + pd.DataFrame( + {"A": [0, 1]}, index=["a", "b"], allows_duplicate_labels=False + ), + pd.DataFrame({"B": [0, 1]}, index=["a", "d"]), + dict(left_index=True, right_index=True), + False, + marks=not_implemented, + ), + # true true true + ( + pd.DataFrame({"A": [0, 1]}, index=["a", "b"]), + pd.DataFrame({"B": [0, 1]}, index=["a", "d"]), + dict(left_index=True, right_index=True), + True, + ), + ], + ) + def test_merge(self, left, right, kwargs, expected): + result = pd.merge(left, right, **kwargs) + assert result.allows_duplicate_labels is expected + + @not_implemented + def test_groupby(self): + # XXX: This is under tested + # TODO: + # - apply + # - transform + # - Should passing a grouper that disallows duplicates propagate? + # i.e. df.groupby(pd.Series([0, 1], allows_duplicate_labels=False))? + df = pd.DataFrame({"A": [1, 2, 3]}, allows_duplicate_labels=False) + result = df.groupby([0, 0, 1]).agg("count") + assert result.allows_duplicate_labels is False + + @pytest.mark.parametrize("frame", [True, False]) + @not_implemented + def test_window(self, frame): + df = pd.Series( + 1, + index=pd.date_range("2000", periods=12), + name="A", + allows_duplicate_labels=False, + ) + if frame: + df = df.to_frame() + assert df.rolling(3).mean().allows_duplicate_labels is False + assert df.ewm(3).mean().allows_duplicate_labels is False + assert df.expanding(3).mean().allows_duplicate_labels is False + + +# ---------------------------------------------------------------------------- +# Raises + + +class TestRaises: + @pytest.mark.parametrize( + "cls, axes", + [ + (pd.Series, {"index": ["a", "a"], "dtype": float}), + (pd.DataFrame, {"index": ["a", "a"]}), + (pd.DataFrame, {"index": ["a", "a"], "columns": ["b", "b"]}), + (pd.DataFrame, {"columns": ["b", "b"]}), + ], + ) + def 
test_construction_with_duplicates(self, cls, axes): + result = cls(**axes) + assert result.allows_duplicate_labels is True + + with pytest.raises(pandas.errors.DuplicateLabelError): + cls(**axes, allows_duplicate_labels=False) + + @pytest.mark.parametrize( + "data", + [ + pd.Series(index=[0, 0], dtype=float), + pd.DataFrame(index=[0, 0]), + pd.DataFrame(columns=[0, 0]), + ], + ) + def test_setting_allows_duplicate_labels_raises(self, data): + with pytest.raises(pandas.errors.DuplicateLabelError): + data.allows_duplicate_labels = False + + assert data.allows_duplicate_labels is True + + @pytest.mark.parametrize( + "func", [operator.methodcaller("append", pd.Series(0, index=["a", "b"]))] + ) + @not_implemented + def test_series_raises(self, func): + s = pd.Series([0, 1], index=["a", "b"], allows_duplicate_labels=False) + with pytest.raises(pandas.errors.DuplicateLabelError): + func(s) + + @pytest.mark.parametrize( + "getter, target", + [ + (operator.itemgetter(["A", "A"]), None), + # loc + (operator.itemgetter(["a", "a"]), "loc"), + pytest.param( + operator.itemgetter(("a", ["A", "A"])), "loc", marks=not_implemented + ), + pytest.param( + operator.itemgetter((["a", "a"], "A")), "loc", marks=not_implemented + ), + # iloc + (operator.itemgetter([0, 0]), "iloc"), + pytest.param( + operator.itemgetter((0, [0, 0])), "iloc", marks=not_implemented + ), + pytest.param( + operator.itemgetter(([0, 0], 0)), "iloc", marks=not_implemented + ), + ], + ) + def test_getitem_raises(self, getter, target): + df = pd.DataFrame( + {"A": [1, 2], "B": [3, 4]}, index=["a", "b"], allows_duplicate_labels=False + ) + if target: + # df, df.loc, or df.iloc + target = getattr(df, target) + else: + target = df + + with pytest.raises(pandas.errors.DuplicateLabelError): + getter(target) + + @pytest.mark.parametrize( + "objs, kwargs", + [ + ( + [ + pd.Series(1, index=[0, 1], name="a", allows_duplicate_labels=False), + pd.Series(2, index=[0, 1], name="a", allows_duplicate_labels=False), + ], + 
{"axis": 1}, + ) + ], + ) + @not_implemented + def test_concat_raises(self, objs, kwargs): + with pytest.raises(pandas.errors.DuplicateLabelError): + pd.concat(objs, **kwargs) + + @not_implemented + def test_merge_raises(self): + a = pd.DataFrame( + {"A": [0, 1, 2]}, index=["a", "b", "c"], allows_duplicate_labels=False + ) + b = pd.DataFrame({"B": [0, 1, 2]}, index=["a", "b", "b"]) + with pytest.raises(pandas.errors.DuplicateLabelError): + pd.merge(a, b, left_index=True, right_index=True) From 0962d1963cd2f9f348983a19dd1d68a51065a7be Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 8 Jun 2020 10:22:30 -0500 Subject: [PATCH 02/35] fixup memory_usage test --- pandas/core/generic.py | 3 ++- pandas/tests/base/test_misc.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e66914258f160..9f16369d60fa9 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -268,7 +268,8 @@ def allows_duplicate_labels(self, value: bool): for ax in self.axes: ax._maybe_check_unique() - self._allows_duplicate_labels = value + # avoid `can_hold_identifiers` check. 
+ object.__setattr__(self, "_allows_duplicate_labels", value) @classmethod def _validate_dtype(cls, dtype): diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index 527f806483d94..c835d6501a77f 100644 --- a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -116,6 +116,7 @@ def test_ndarray_compat_properties(index_or_series_obj): @pytest.mark.skipif(PYPY, reason="not relevant for PyPy") def test_memory_usage(index_or_series_obj): obj = index_or_series_obj + res = obj.memory_usage() res_deep = obj.memory_usage(deep=True) From 8e0089ce9124ba99ba1d1447ea6bc91c60bb852a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 8 Jun 2020 11:10:33 -0500 Subject: [PATCH 03/35] pickle --- pandas/core/generic.py | 4 ++++ pandas/tests/io/test_pickle.py | 8 ++++++++ 2 files changed, 12 insertions(+) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 9f16369d60fa9..8055c9fac4462 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1866,6 +1866,10 @@ def __setstate__(self, state): if typ is not None: attrs = state.get("_attrs", {}) object.__setattr__(self, "_attrs", attrs) + allows_duplicate_labels = state.get("_allows_duplicate_labels", True) + object.__setattr__( + self, "_allows_duplicate_labels", allows_duplicate_labels + ) # set in the order of internal names # to avoid definitional recursion diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 42b4ea5ad9aac..fbe598d36b9ff 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -509,3 +509,11 @@ def test_read_pickle_with_subclass(): tm.assert_series_equal(result[0], expected[0]) assert isinstance(result[1], MyTz) + + +def test_allows_duplicate_labels(): + s = pd.Series(dtype=float, allows_duplicate_labels=False) + tm.round_trip_pickle(s) + + df = pd.DataFrame(allows_duplicate_labels=False) + tm.round_trip_pickle(df) From b9873dbd0dd94bc70c4436dd2007dec6f5f7138e Mon Sep 17 00:00:00 2001 
From: Tom Augspurger Date: Tue, 9 Jun 2020 21:14:50 -0500 Subject: [PATCH 04/35] lint --- pandas/core/frame.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5f371026b3747..eedc18fae8018 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -377,7 +377,6 @@ class DataFrame(NDFrame): it is expected that every method taking or returning one or more DataFrame or Series objects will propage ``allows_duplicate_labels`` - See Also -------- DataFrame.from_records : Constructor from tuples, also record arrays. From d326ed5a0c7c0a3271daf9d5a5194d64d5af32b1 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 9 Jun 2020 21:16:05 -0500 Subject: [PATCH 05/35] doc --- doc/source/whatsnew/v1.1.0.rst | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 5ebaff99edc91..a990c5580d440 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -216,34 +216,32 @@ By default, duplicates continue to be allowed .. ipython:: python - pd.Series([1, 2], index=['a', 'a']) + pd.Series([1, 2], index=['a', 'a']) .. ipython:: python - :okexcept: + :okexcept: - pd.Series([1, 2], index=['a', 'a'], allows_duplicate_labels=False) + pd.Series([1, 2], index=['a', 'a'], allows_duplicate_labels=False) Pandas will propagate the ``allows_duplicate_labels`` property through many operations. .. ipython:: python - :okexcept: + :okexcept: - a = pd.Series([1, 2], index=['a', 'b'], allows_duplicate_labels=False) - a - # An operation introducing duplicates - a.reindex(['a', 'b', 'a']) + a = pd.Series([1, 2], index=['a', 'b'], allows_duplicate_labels=False) + a + # An operation introducing duplicates + a.reindex(['a', 'b', 'a']) .. warning:: This is an experimental feature. Currently, many methods fail to propagate the ``allows_duplicate_labels`` value. 
In future versions it is expected that every method taking or returning one or more - DataFrame or Series objects will propage ``allows_duplicate_labels`` + DataFrame or Series objects will propagate ``allows_duplicate_labels``. See :ref:`duplicates` for more. - - .. _whatsnew_110.grouper_resample_origin: Grouper and resample now supports the arguments origin and offset From fba3536c2dc30d7f1ee192063c6dce1e8ad20f73 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 9 Jun 2020 21:20:57 -0500 Subject: [PATCH 06/35] fixups --- doc/source/user_guide/duplicates.rst | 11 +++++++++-- pandas/core/frame.py | 2 +- pandas/core/series.py | 2 +- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/doc/source/user_guide/duplicates.rst b/doc/source/user_guide/duplicates.rst index 7f83df25f9cbb..b65a43268fb9b 100644 --- a/doc/source/user_guide/duplicates.rst +++ b/doc/source/user_guide/duplicates.rst @@ -67,7 +67,7 @@ This applies to row labels as well Duplicate Label Detection ~~~~~~~~~~~~~~~~~~~~~~~~~ -You can check with an :class:`Index` (storing the row or column labels) is +You can check whether an :class:`Index` (storing the row or column labels) is unique with :attr:`Index.is_unique`: .. ipython:: python @@ -131,7 +131,7 @@ This applies to both row and column labels for a :class:`DataFrame` pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=["A", "B", "C"], allows_duplicate_labels=False) -This attribute can be checked with :attr:`~DataFrame.allows_duplicate_labels`, +This attribute can be checked or set with :attr:`~DataFrame.allows_duplicate_labels`, which indicates whether that object can have duplicate labels. .. ipython:: python @@ -163,3 +163,10 @@ operations. s1 = pd.Series(0, index=['a', 'b'], allows_duplicate_labels=False) s1 s1.head().rename({"a": "b"}) + +.. warning:: + + This is an experimental feature. Currently, many methods fail to + propagate the ``allows_duplicate_labels`` value. 
In future versions + it is expected that every method taking or returning one or more + DataFrame or Series objects will propagate ``allows_duplicate_labels``. diff --git a/pandas/core/frame.py b/pandas/core/frame.py index eedc18fae8018..00f5e418de06f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -375,7 +375,7 @@ class DataFrame(NDFrame): This is an experimental feature. Currently, many methods fail to propagate the ``allows_duplicate_labels`` value. In future versions it is expected that every method taking or returning one or more - DataFrame or Series objects will propage ``allows_duplicate_labels`` + DataFrame or Series objects will propagate ``allows_duplicate_labels``. See Also -------- diff --git a/pandas/core/series.py b/pandas/core/series.py index 26a9ab0543a37..3bc950ac95b06 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -192,7 +192,7 @@ class Series(base.IndexOpsMixin, generic.NDFrame): This is an experimental feature. Currently, many methods fail to propagate the ``allows_duplicate_labels`` value. In future versions it is expected that every method taking or returning one or more - DataFrame or Series objects will propage ``allows_duplicate_labels`` + DataFrame or Series objects will propagate ``allows_duplicate_labels``. 
""" _typ = "series" From f1e59326cef40dad578ca3c0d8133ce0e88470cd Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 17 Jun 2020 16:15:34 -0500 Subject: [PATCH 07/35] handle concat --- pandas/core/generic.py | 4 ++++ pandas/tests/test_duplicate_labels.py | 3 --- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 933b4d5b9586c..fdce1747178a1 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5235,6 +5235,10 @@ def __finalize__( assert isinstance(name, str) object.__setattr__(self, name, getattr(other, name, None)) + if method == "concat": + allows_duplicate_labels = all(x.allows_duplicate_labels for x in other.objs) + self.allows_duplicate_labels = allows_duplicate_labels + return self def __getattr__(self, name: str): diff --git a/pandas/tests/test_duplicate_labels.py b/pandas/tests/test_duplicate_labels.py index 7ec315cb89b41..cbeaf5482e50f 100644 --- a/pandas/tests/test_duplicate_labels.py +++ b/pandas/tests/test_duplicate_labels.py @@ -180,7 +180,6 @@ def test_ndframe_getitem_caching_issue(self): ), ], ) - @not_implemented def test_concat(self, objs, kwargs): result = pd.concat(objs, **kwargs) assert result.allows_duplicate_labels is False @@ -289,7 +288,6 @@ def test_setting_allows_duplicate_labels_raises(self, data): @pytest.mark.parametrize( "func", [operator.methodcaller("append", pd.Series(0, index=["a", "b"]))] ) - @not_implemented def test_series_raises(self, func): s = pd.Series([0, 1], index=["a", "b"], allows_duplicate_labels=False) with pytest.raises(pandas.errors.DuplicateLabelError): @@ -342,7 +340,6 @@ def test_getitem_raises(self, getter, target): ) ], ) - @not_implemented def test_concat_raises(self, objs, kwargs): with pytest.raises(pandas.errors.DuplicateLabelError): pd.concat(objs, **kwargs) From 3dad6d58568b44b8025eba2f11c1b2fde208cf9f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 18 Jun 2020 10:13:19 -0500 Subject: [PATCH 08/35] handle mi --- 
pandas/core/indexes/base.py | 4 +- pandas/tests/test_duplicate_labels.py | 60 +++++++++++++++++++++++---- 2 files changed, 55 insertions(+), 9 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index a1147b952aba9..6e6767226c12d 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -491,7 +491,9 @@ def _format_duplicate_message(self): assert len(duplicates) out = Series(np.arange(len(self))).groupby(self).agg(list)[duplicates] - return out.rename_axis("label").to_frame(name="positions") + if self.nlevels == 1: + out = out.rename_axis("label") + return out.to_frame(name="positions") # -------------------------------------------------------------------- # Index Internals Methods diff --git a/pandas/tests/test_duplicate_labels.py b/pandas/tests/test_duplicate_labels.py index cbeaf5482e50f..2dcc489850d5a 100644 --- a/pandas/tests/test_duplicate_labels.py +++ b/pandas/tests/test_duplicate_labels.py @@ -4,9 +4,8 @@ import numpy as np import pytest -import pandas.errors - import pandas as pd +import pandas._testing as tm not_implemented = pytest.mark.xfail(reason="Not implemented.") @@ -268,7 +267,7 @@ def test_construction_with_duplicates(self, cls, axes): result = cls(**axes) assert result.allows_duplicate_labels is True - with pytest.raises(pandas.errors.DuplicateLabelError): + with pytest.raises(pd.errors.DuplicateLabelError): cls(**axes, allows_duplicate_labels=False) @pytest.mark.parametrize( @@ -280,7 +279,7 @@ def test_construction_with_duplicates(self, cls, axes): ], ) def test_setting_allows_duplicate_labels_raises(self, data): - with pytest.raises(pandas.errors.DuplicateLabelError): + with pytest.raises(pd.errors.DuplicateLabelError): data.allows_duplicate_labels = False assert data.allows_duplicate_labels is True @@ -290,7 +289,7 @@ def test_setting_allows_duplicate_labels_raises(self, data): ) def test_series_raises(self, func): s = pd.Series([0, 1], index=["a", "b"], allows_duplicate_labels=False) - 
with pytest.raises(pandas.errors.DuplicateLabelError): + with pytest.raises(pd.errors.DuplicateLabelError): func(s) @pytest.mark.parametrize( @@ -325,7 +324,7 @@ def test_getitem_raises(self, getter, target): else: target = df - with pytest.raises(pandas.errors.DuplicateLabelError): + with pytest.raises(pd.errors.DuplicateLabelError): getter(target) @pytest.mark.parametrize( @@ -341,7 +340,7 @@ def test_getitem_raises(self, getter, target): ], ) def test_concat_raises(self, objs, kwargs): - with pytest.raises(pandas.errors.DuplicateLabelError): + with pytest.raises(pd.errors.DuplicateLabelError): pd.concat(objs, **kwargs) @not_implemented @@ -350,5 +349,50 @@ def test_merge_raises(self): {"A": [0, 1, 2]}, index=["a", "b", "c"], allows_duplicate_labels=False ) b = pd.DataFrame({"B": [0, 1, 2]}, index=["a", "b", "b"]) - with pytest.raises(pandas.errors.DuplicateLabelError): + with pytest.raises(pd.errors.DuplicateLabelError): pd.merge(a, b, left_index=True, right_index=True) + + +@pytest.mark.parametrize( + "idx", + [ + pd.Index([1, 1]), + pd.Index(["a", "a"]), + pd.Index([1.1, 1.1]), + pd.PeriodIndex([pd.Period("2000", "D")] * 2), + pd.DatetimeIndex([pd.Timestamp("2000")] * 2), + pd.TimedeltaIndex([pd.Timedelta("1D")] * 2), + pd.CategoricalIndex(["a", "a"]), + pd.IntervalIndex([pd.Interval(0, 1)] * 2), + pd.MultiIndex.from_tuples([("a", 1), ("a", 1)]), + ], + ids=lambda x: type(x).__name__, +) +def test_raises_basic(idx): + with pytest.raises(pd.errors.DuplicateLabelError): + pd.Series(1, index=idx, allows_duplicate_labels=False) + + with pytest.raises(pd.errors.DuplicateLabelError): + pd.DataFrame({"A": [1, 1]}, index=idx, allows_duplicate_labels=False) + + with pytest.raises(pd.errors.DuplicateLabelError): + pd.DataFrame([[1, 2]], columns=idx, allows_duplicate_labels=False) + + +def test_format_duplicate_labels_message(): + idx = pd.Index(["a", "b", "a", "b", "c"]) + result = idx._format_duplicate_message() + expected = pd.DataFrame( + {"positions": [[0, 2], [1, 
3]]}, index=pd.Index(["a", "b"], name="label") + ) + tm.assert_frame_equal(result, expected) + + +def test_format_duplicate_labels_message_multi(): + idx = pd.MultiIndex.from_product([["A"], ["a", "b", "a", "b", "c"]]) + result = idx._format_duplicate_message() + expected = pd.DataFrame( + {"positions": [[0, 2], [1, 3]]}, + index=pd.MultiIndex.from_product([["A"], ["a", "b"]]), + ) + tm.assert_frame_equal(result, expected) From e81327da1da3ef538d842a65481ee0e46ebcd25c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 18 Jun 2020 10:16:58 -0500 Subject: [PATCH 09/35] note on setting --- doc/source/user_guide/duplicates.rst | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/doc/source/user_guide/duplicates.rst b/doc/source/user_guide/duplicates.rst index b65a43268fb9b..056933c6372f6 100644 --- a/doc/source/user_guide/duplicates.rst +++ b/doc/source/user_guide/duplicates.rst @@ -142,6 +142,17 @@ which indicates whether that object can have duplicate labels. df df.allows_duplicate_labels +When processing raw, messy data you might initially read in the messy data +(which potentially has duplicate labels), deduplicate, and then disallow duplicates +going forward, to ensure that your data pipeline doesn't introduce duplicates. + + +.. code-block:: python + + >>> raw = pd.read_csv("...") + >>> deduplicated = raw.groupby(level=0).first() # remove duplicates + >>> dedupcliated.allows_duplicate_labels = False # disallow going forward + Performing an operation that introduces duplicate labels on a ``Series`` or ``DataFrame`` that disallows duplicates will raise an :class:`errors.DuplicateLabelError`. @@ -151,6 +162,9 @@ Performing an operation that introduces duplicate labels on a ``Series`` or df.rename(str.upper) +This error message contains the labels that are duplicated, and the numeric positions +of all the duplicates (including the "original") in the ``Series`` or ``DataFrame``. 
+ Duplicate Label Propagation ^^^^^^^^^^^^^^^^^^^^^^^^^^^ From 3254b8b7d345487f5b634fd20534ca2f1ae9419d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 18 Jun 2020 10:45:49 -0500 Subject: [PATCH 10/35] fixup --- doc/source/user_guide/duplicates.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/duplicates.rst b/doc/source/user_guide/duplicates.rst index 056933c6372f6..108707e079aff 100644 --- a/doc/source/user_guide/duplicates.rst +++ b/doc/source/user_guide/duplicates.rst @@ -151,7 +151,7 @@ going forward, to ensure that your data pipeline doesn't introduce duplicates. >>> raw = pd.read_csv("...") >>> deduplicated = raw.groupby(level=0).first() # remove duplicates - >>> dedupcliated.allows_duplicate_labels = False # disallow going forward + >>> deduplicated.allows_duplicate_labels = False # disallow going forward Performing an operation that introduces duplicate labels on a ``Series`` or ``DataFrame`` that disallows duplicates will raise an @@ -163,7 +163,7 @@ Performing an operation that introduces duplicate labels on a ``Series`` or df.rename(str.upper) This error message contains the labels that are duplicated, and the numeric positions -of all the duplicates (including the "original") in the ``Series`` or ``DataFrame``. 
+of all the duplicates (including the "original") in the ``Series`` or ``DataFrame`` Duplicate Label Propagation ^^^^^^^^^^^^^^^^^^^^^^^^^^^ From 04b6322c425fdc4455f72dbae5be33da2bb27885 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 22 Jun 2020 11:02:28 -0500 Subject: [PATCH 11/35] fix import, docs --- doc/source/reference/general_utility_functions.rst | 1 + pandas/core/indexes/base.py | 4 +--- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/doc/source/reference/general_utility_functions.rst b/doc/source/reference/general_utility_functions.rst index 72a84217323ab..cca0aaf353e28 100644 --- a/doc/source/reference/general_utility_functions.rst +++ b/doc/source/reference/general_utility_functions.rst @@ -37,6 +37,7 @@ Exceptions and warnings errors.AccessorRegistrationWarning errors.DtypeWarning + errors.DuplicateLabelError errors.EmptyDataError errors.InvalidIndexError errors.MergeError diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 6e6767226c12d..3069bbbfa8be8 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -16,7 +16,7 @@ from pandas._typing import DtypeObj, Label from pandas.compat import set_function_name from pandas.compat.numpy import function as nv -from pandas.errors import InvalidIndexError +from pandas.errors import DuplicateLabelError, InvalidIndexError from pandas.util._decorators import Appender, Substitution, cache_readonly, doc from pandas.core.dtypes import concat as _concat @@ -474,8 +474,6 @@ def _constructor(self): return type(self) def _maybe_check_unique(self): - from pandas.errors import DuplicateLabelError - if not self.is_unique: # TODO: position, value, not too large. 
msg = """Index has duplicates.""" From fdcdb31c7a8a7038098b7754debce31e122a6f84 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 22 Jun 2020 11:06:07 -0500 Subject: [PATCH 12/35] handle insert --- pandas/core/frame.py | 5 +++++ pandas/tests/test_duplicate_labels.py | 6 ++++++ 2 files changed, 11 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a93894f706966..9a3e525afcfba 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3585,6 +3585,11 @@ def insert(self, loc, column, value, allow_duplicates=False) -> None: value : int, Series, or array-like allow_duplicates : bool, optional """ + if allow_duplicates and not self.allows_duplicate_labels: + raise ValueError( + "Cannot specify 'allow_duplicates=True' when " + "'self.allows_dpulicate_labels' is False." + ) self._ensure_valid_index(value) value = self._sanitize_column(column, value, broadcast=False) self._mgr.insert(loc, column, value, allow_duplicates=allow_duplicates) diff --git a/pandas/tests/test_duplicate_labels.py b/pandas/tests/test_duplicate_labels.py index 2dcc489850d5a..8ce5171757fc9 100644 --- a/pandas/tests/test_duplicate_labels.py +++ b/pandas/tests/test_duplicate_labels.py @@ -396,3 +396,9 @@ def test_format_duplicate_labels_message_multi(): index=pd.MultiIndex.from_product([["A"], ["a", "b"]]), ) tm.assert_frame_equal(result, expected) + + +def test_dataframe_insert_raises(): + df = pd.DataFrame({"A": [1, 2]}, allows_duplicate_labels=False) + with pytest.raises(ValueError, match="Cannot specify"): + df.insert(0, "A", [3, 4], allow_duplicates=True) From 7d71326a092c088d7b9cbfcb47f2f52bc300a6fa Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 29 Jun 2020 16:44:17 -0500 Subject: [PATCH 13/35] wip inplace --- pandas/core/frame.py | 4 +++- pandas/core/generic.py | 8 ++++++++ pandas/tests/test_duplicate_labels.py | 25 +++++++++++++++++++++++++ 3 files changed, 36 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py 
index c749a4ef03f66..5ec52c74b67bd 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3591,7 +3591,7 @@ def insert(self, loc, column, value, allow_duplicates=False) -> None: if allow_duplicates and not self.allows_duplicate_labels: raise ValueError( "Cannot specify 'allow_duplicates=True' when " - "'self.allows_dpulicate_labels' is False." + "'self.allows_duplicate_labels' is False." ) self._ensure_valid_index(value) value = self._sanitize_column(column, value, broadcast=False) @@ -4487,6 +4487,7 @@ def set_index( 4 16 10 2014 31 """ inplace = validate_bool_kwarg(inplace, "inplace") + self._check_inplace_and_allows_duplicate_labels(inplace) if not isinstance(keys, list): keys = [keys] @@ -4732,6 +4733,7 @@ class max type monkey mammal NaN jump """ inplace = validate_bool_kwarg(inplace, "inplace") + self._check_inplace_and_allows_duplicate_labels(inplace) if inplace: new_obj = self else: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 2fb7f9752c74d..76b6b19674e43 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -571,6 +571,7 @@ def set_axis(self, labels, axis: Axis = 0, inplace: bool = False): -------- %(klass)s.rename_axis : Alter the name of the index%(see_also_sub)s. """ + self._check_inplace_and_allows_duplicate_labels(inplace) if inplace: setattr(self, self._get_axis_name(axis), labels) else: @@ -3734,6 +3735,13 @@ def __delitem__(self, key) -> None: # ---------------------------------------------------------------------- # Unsorted + def _check_inplace_and_allows_duplicate_labels(self, inplace): + if inplace and not self.allows_duplicate_labels: + raise ValueError( + "Cannot specify 'inplace=True' when " + "'self.allows_duplicate_labels' is False." + ) + def get(self, key, default=None): """ Get item from object for given key (ex: DataFrame column). 
diff --git a/pandas/tests/test_duplicate_labels.py b/pandas/tests/test_duplicate_labels.py index 8ce5171757fc9..a8dea8af1e430 100644 --- a/pandas/tests/test_duplicate_labels.py +++ b/pandas/tests/test_duplicate_labels.py @@ -402,3 +402,28 @@ def test_dataframe_insert_raises(): df = pd.DataFrame({"A": [1, 2]}, allows_duplicate_labels=False) with pytest.raises(ValueError, match="Cannot specify"): df.insert(0, "A", [3, 4], allow_duplicates=True) + + +def test_inplace_raises(): + df = pd.DataFrame({"A": [0, 0], "B": [1, 2]}, allows_duplicate_labels=False) + s = df["A"] + s.allows_duplicate_labels = False + msg = "Cannot specify" + + with pytest.raises(ValueError, match=msg): + df.set_index("A", inplace=True) + + with pytest.raises(ValueError, match=msg): + df.set_axis(["A", "B"], inplace=True) + + with pytest.raises(ValueError, match=msg): + s.set_axis(["A", "B"], inplace=True) + + with pytest.raises(ValueError, match=msg): + df.set_axis(["A", "B"], inplace=True) + + with pytest.raises(ValueError, match=msg): + df.reset_index(inplace=True) + + with pytest.raises(ValueError, match=msg): + df.rename(lambda x: x, inplace=True) From 3fa067d76eab50af84178ed2fd1416bb518c88e1 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 30 Jun 2020 13:39:41 -0500 Subject: [PATCH 14/35] tests for inplace duplicates --- doc/source/user_guide/duplicates.rst | 2 +- pandas/tests/test_duplicate_labels.py | 31 ++++++++++++--------------- 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/doc/source/user_guide/duplicates.rst b/doc/source/user_guide/duplicates.rst index 108707e079aff..59c875c8edb6f 100644 --- a/doc/source/user_guide/duplicates.rst +++ b/doc/source/user_guide/duplicates.rst @@ -82,7 +82,7 @@ unique with :attr:`Index.is_unique`: Pandas does cache this result, so re-checking on the same index is very fast. :meth:`Index.duplicated` will return a boolean ndarray indicating whether a -label is a repeat. +label is repeated. .. 
ipython:: python diff --git a/pandas/tests/test_duplicate_labels.py b/pandas/tests/test_duplicate_labels.py index a8dea8af1e430..4a492cb4b1134 100644 --- a/pandas/tests/test_duplicate_labels.py +++ b/pandas/tests/test_duplicate_labels.py @@ -404,26 +404,23 @@ def test_dataframe_insert_raises(): df.insert(0, "A", [3, 4], allow_duplicates=True) -def test_inplace_raises(): +@pytest.mark.parametrize( + "method, frame_only", + [ + (operator.methodcaller("set_index", "A", inplace=True), True), + (operator.methodcaller("set_axis", ["A", "B"], inplace=True), False), + (operator.methodcaller("reset_index", inplace=True), True), + (operator.methodcaller("rename", lambda x: x, inplace=True), False), + ], +) +def test_inplace_raises(method, frame_only): df = pd.DataFrame({"A": [0, 0], "B": [1, 2]}, allows_duplicate_labels=False) s = df["A"] s.allows_duplicate_labels = False msg = "Cannot specify" with pytest.raises(ValueError, match=msg): - df.set_index("A", inplace=True) - - with pytest.raises(ValueError, match=msg): - df.set_axis(["A", "B"], inplace=True) - - with pytest.raises(ValueError, match=msg): - s.set_axis(["A", "B"], inplace=True) - - with pytest.raises(ValueError, match=msg): - df.set_axis(["A", "B"], inplace=True) - - with pytest.raises(ValueError, match=msg): - df.reset_index(inplace=True) - - with pytest.raises(ValueError, match=msg): - df.rename(lambda x: x, inplace=True) + method(df) + if not frame_only: + with pytest.raises(ValueError, match=msg): + method(s) From 91ca7a1e86031b70084cfdb4daaf187a000f7771 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 30 Jun 2020 13:40:02 -0500 Subject: [PATCH 15/35] move to generic --- pandas/tests/{ => generic}/test_duplicate_labels.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename pandas/tests/{ => generic}/test_duplicate_labels.py (100%) diff --git a/pandas/tests/test_duplicate_labels.py b/pandas/tests/generic/test_duplicate_labels.py similarity index 100% rename from 
pandas/tests/test_duplicate_labels.py rename to pandas/tests/generic/test_duplicate_labels.py From 097dd1c9b8c67078d04a9c42af107a00e760e581 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 30 Jun 2020 15:55:49 -0500 Subject: [PATCH 16/35] fixup --- pandas/core/generic.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 76b6b19674e43..62c12f883f1bb 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -572,6 +572,9 @@ def set_axis(self, labels, axis: Axis = 0, inplace: bool = False): %(klass)s.rename_axis : Alter the name of the index%(see_also_sub)s. """ self._check_inplace_and_allows_duplicate_labels(inplace) + return self._set_axis_nocheck(labels, axis, inplace) + + def _set_axis_nocheck(self, labels, axis: Axis, inplace: bool): if inplace: setattr(self, self._get_axis_name(axis), labels) else: @@ -941,6 +944,7 @@ def rename( else: index = mapper + self._check_inplace_and_allows_duplicate_labels(inplace) result = self if inplace else self.copy(deep=copy) for axis_no, replacements in enumerate((index, columns)): @@ -965,7 +969,7 @@ def rename( raise KeyError(f"{missing_labels} not found in axis") new_index = ax._transform_index(f, level) - result.set_axis(new_index, axis=axis_no, inplace=True) + result._set_axis_nocheck(new_index, axis=axis_no, inplace=True) result._clear_item_cache() if inplace: From 824863479b68a0fada0fcba983c06de289e11f87 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 30 Jun 2020 16:06:58 -0500 Subject: [PATCH 17/35] add note --- pandas/core/generic.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 62c12f883f1bb..d14aa78532a7c 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -575,6 +575,7 @@ def set_axis(self, labels, axis: Axis = 0, inplace: bool = False): return self._set_axis_nocheck(labels, axis, inplace) def _set_axis_nocheck(self, labels, axis: Axis, 
inplace: bool): + # NDFrame.rename with inplace=False calls set_axis(inplace=True) on a copy. if inplace: setattr(self, self._get_axis_name(axis), labels) else: From aff930317bc8a179e410b70fa6bc1aefbde9d5e9 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 5 Aug 2020 08:53:24 -0500 Subject: [PATCH 18/35] update --- doc/source/reference/frame.rst | 2 ++ doc/source/reference/series.rst | 2 ++ doc/source/user_guide/duplicates.rst | 19 +++++++++++- doc/source/whatsnew/v1.1.0.rst | 38 ------------------------ doc/source/whatsnew/v1.2.0.rst | 43 ++++++++++++++++++++++++++-- pandas/core/generic.py | 27 +++++++++++++++++ pandas/tests/frame/test_api.py | 16 +++++++++++ pandas/tests/series/test_api.py | 16 +++++++++++ 8 files changed, 122 insertions(+), 41 deletions(-) diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index e3dfb552651a0..dd9223856afce 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -37,6 +37,8 @@ Attributes and underlying data DataFrame.shape DataFrame.memory_usage DataFrame.empty + DataFrame.allows_duplicate_labels + DataFrame.set_flags Conversion ~~~~~~~~~~ diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index 3b595ba5ab206..3550a26fa99cd 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -39,6 +39,8 @@ Attributes Series.empty Series.dtypes Series.name + Series.allows_duplicate_labels + Series.set_flags Conversion ---------- diff --git a/doc/source/user_guide/duplicates.rst b/doc/source/user_guide/duplicates.rst index 59c875c8edb6f..145700b02bc16 100644 --- a/doc/source/user_guide/duplicates.rst +++ b/doc/source/user_guide/duplicates.rst @@ -142,6 +142,22 @@ which indicates whether that object can have duplicate labels. df df.allows_duplicate_labels +:meth:`DataFrame.set_flags` can be used to return a new ``DataFrame`` with attributes +like ``allows_duplicate_labels`` set to some value + +.. 
ipython:: python + + df2 = df.set_flags(allows_duplicate_labels=True) + df2.allows_duplicate_labels + +Or the property can just be set directly on the same object + + +.. ipython:: python + + df2.allows_duplicate_labels = False + df2.allows_duplicate_labels + When processing raw, messy data you might initially read in the messy data (which potentially has duplicate labels), deduplicate, and then disallow duplicates going forward, to ensure that your data pipeline doesn't introduce duplicates. @@ -153,7 +169,8 @@ going forward, to ensure that your data pipeline doesn't introduce duplicates. >>> deduplicated = raw.groupby(level=0).first() # remove duplicates >>> deduplicated.allows_duplicate_labels = False # disallow going forward -Performing an operation that introduces duplicate labels on a ``Series`` or +Setting ``allows_duplicate_labels=True`` on a ``Series`` or ``DataFrame`` with duplicate +labels or performing an operation that introduces duplicate labels on a ``Series`` or ``DataFrame`` that disallows duplicates will raise an :class:`errors.DuplicateLabelError`. diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index f002b46d2bafa..b532f3fad3a5e 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -210,44 +210,6 @@ For example: pd.to_datetime(tz_strs, format='%Y-%m-%d %H:%M:%S %z', utc=True) pd.to_datetime(tz_strs, format='%Y-%m-%d %H:%M:%S %z') -.. _whatsnew_110.duplicate_labels: - -Optionally disallow duplicate labels -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -:class:`Series` and :class:`DataFrame` can now be created with ``allows_duplicate_labels=False`` flag to -control whether the index or columns can contain duplicate labels. This can be used to prevent accidental -introduction of duplicate labels, which can affect downstream operations. - -By default, duplicates continue to be allowed - -.. ipython:: python - - pd.Series([1, 2], index=['a', 'a']) - -.. 
ipython:: python - :okexcept: - - pd.Series([1, 2], index=['a', 'a'], allows_duplicate_labels=False) - -Pandas will propagate the ``allows_duplicate_labels`` property through many operations. - -.. ipython:: python - :okexcept: - - a = pd.Series([1, 2], index=['a', 'b'], allows_duplicate_labels=False) - a - # An operation introducing duplicates - a.reindex(['a', 'b', 'a']) - -.. warning:: - - This is an experimental feature. Currently, many methods fail to - propagate the ``allows_duplicate_labels`` value. In future versions - it is expected that every method taking or returning one or more - DataFrame or Series objects will propagate ``allows_duplicate_labels``. - -See :ref:`duplicates` for more. .. _whatsnew_110.grouper_resample_origin: diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index b16ca0a80c5b4..3290c7255e320 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -13,12 +13,51 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ +.. _whatsnew_120.duplicate_labels: + +Optionally disallow duplicate labels +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:class:`Series` and :class:`DataFrame` can now be created with ``allows_duplicate_labels=False`` flag to +control whether the index or columns can contain duplicate labels (:issue:`28394`). This can be used to +prevent accidental introduction of duplicate labels, which can affect downstream operations. + +By default, duplicates continue to be allowed + +.. ipython:: python + + pd.Series([1, 2], index=['a', 'a']) + +.. ipython:: python + :okexcept: + + pd.Series([1, 2], index=['a', 'a'], allows_duplicate_labels=False) + +Pandas will propagate the ``allows_duplicate_labels`` property through many operations. + +.. ipython:: python + :okexcept: + + a = pd.Series([1, 2], index=['a', 'b'], allows_duplicate_labels=False) + a + # An operation introducing duplicates + a.reindex(['a', 'b', 'a']) + +.. warning:: + + This is an experimental feature. 
Currently, many methods fail to + propagate the ``allows_duplicate_labels`` value. In future versions + it is expected that every method taking or returning one or more + DataFrame or Series objects will propagate ``allows_duplicate_labels``. + +See :ref:`duplicates` for more. + .. _whatsnew_120.enhancements.other: Other enhancements ^^^^^^^^^^^^^^^^^^ -- +- Added :meth:`~DataFrame.set_flags` for setting table-wide flags on a ``Series`` or ``DataFrame`` (:issue:`28394`) - @@ -165,4 +204,4 @@ Other .. _whatsnew_120.contributors: Contributors -~~~~~~~~~~~~ \ No newline at end of file +~~~~~~~~~~~~ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index dd21d4f6ed699..b6f032b751d53 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -230,6 +230,7 @@ def _init_mgr(cls, mgr, axes, dtype=None, copy: bool = False) -> BlockManager: return mgr # ---------------------------------------------------------------------- + # attrs and flags @property def attrs(self) -> Dict[Optional[Hashable], Any]: @@ -265,6 +266,32 @@ def allows_duplicate_labels(self, value: bool): # avoid `can_hold_identifiers` check. object.__setattr__(self, "_allows_duplicate_labels", value) + def set_flags(self, *, allows_duplicate_labels: Optional[bool] = None) -> FrameOrSeries: + """ + Set global attributes on a copy of this object. + + This method is intended to be used in method chains. + + Parameters + ---------- + allows_duplicate_labels: + Whether the returned object allows duplicate labels. + + Returns + ------- + Series or DataFrame + The same type as the caller. 
+ + See Also + -------- + DataFrame.attrs + DataFrame.allows_duplicate_labels + """ + df = self.copy() + if allows_duplicate_labels is not None: + df.allows_duplicate_labels = allows_duplicate_labels + return df + @classmethod def _validate_dtype(cls, dtype): """ validate the passed dtype """ diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 2b79fc8cd3406..e04cc01a8989b 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -553,6 +553,22 @@ def test_attrs(self): result = df.rename(columns=str) assert result.attrs == {"version": 1} + @pytest.mark.parametrize("allows_duplicate_labels", [True, False, None]) + def test_set_flags(self, allows_duplicate_labels): + df = pd.DataFrame({"A": [1, 2]}) + result = df.set_flags(allows_duplicate_labels=allows_duplicate_labels) + if allows_duplicate_labels is None: + # We don't update when it's not provided + assert result.allows_duplicate_labels is True + else: + assert result.allows_duplicate_labels is allows_duplicate_labels + + # We made a copy + assert df is not result + # We didn't mutate df + assert df.allows_duplicate_labels is True + tm.assert_frame_equal(result, df) + def test_cache_on_copy(self): # GH 31784 _item_cache not cleared on copy causes incorrect reads after updates df = DataFrame({"a": [1]}) diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index b174eb0e42776..3cfd6c28f9c34 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -522,6 +522,22 @@ def test_attrs(self): result = s + 1 assert result.attrs == {"version": 1} + @pytest.mark.parametrize("allows_duplicate_labels", [True, False, None]) + def test_set_flags(self, allows_duplicate_labels): + df = pd.Series([1, 2]) + result = df.set_flags(allows_duplicate_labels=allows_duplicate_labels) + if allows_duplicate_labels is None: + # We don't update when it's not provided + assert result.allows_duplicate_labels is True + else: + assert 
result.allows_duplicate_labels is allows_duplicate_labels + + # We made a copy + assert df is not result + # We didn't mutate df + assert df.allows_duplicate_labels is True + tm.assert_series_equal(result, df) + class TestCategoricalSeries: @pytest.mark.parametrize( From 64334caf228cc9fd1aba2a86e07d2eeb1c9e9d20 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 5 Aug 2020 08:53:28 -0500 Subject: [PATCH 19/35] update --- pandas/core/generic.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b6f032b751d53..5be33d6081aab 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -266,7 +266,9 @@ def allows_duplicate_labels(self, value: bool): # avoid `can_hold_identifiers` check. object.__setattr__(self, "_allows_duplicate_labels", value) - def set_flags(self, *, allows_duplicate_labels: Optional[bool] = None) -> FrameOrSeries: + def set_flags( + self, *, allows_duplicate_labels: Optional[bool] = None + ) -> FrameOrSeries: """ Set global attributes on a copy of this object. From bef80bd6d0f93e7334873ca4c14ff234bd1b5626 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 5 Aug 2020 08:54:05 -0500 Subject: [PATCH 20/35] update --- doc/source/whatsnew/v1.1.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index b532f3fad3a5e..a49b29d691692 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -210,7 +210,6 @@ For example: pd.to_datetime(tz_strs, format='%Y-%m-%d %H:%M:%S %z', utc=True) pd.to_datetime(tz_strs, format='%Y-%m-%d %H:%M:%S %z') - .. 
_whatsnew_110.grouper_resample_origin: Grouper and resample now supports the arguments origin and offset From 7d09c8bc60148977f29af64f4cc474e4df99469f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 5 Aug 2020 10:08:50 -0500 Subject: [PATCH 21/35] fixup docs --- pandas/core/generic.py | 40 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 5be33d6081aab..153b563f29cfb 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -253,6 +253,31 @@ def attrs(self, value: Mapping[Optional[Hashable], Any]) -> None: def allows_duplicate_labels(self) -> bool: """ Whether this object allows duplicate labels. + + Setting ``allows_duplicate_labels=False`` ensures that the + index (and columns of a DataFrame) are unique. Most methods + that accept and return a Series or DataFrame will propagate + the value of ``allows_duplicate_labels``. + + See :ref:`duplicates` for more. + + See Also + -------- + DataFrame.attrs : Set global metadata on this object. + DataFrame.set_flags : Set global flags on this object. + + Examples + -------- + >>> df = pd.DataFrame({"A": [1, 2]}, index=['a', 'a']) + >>> df.allows_duplicate_labels + True + >>> df.allows_duplicate_labels = False + Traceback (most recent call last): + ... + pandas.errors.DuplicateLabelError: Index has duplicates. + positions + label + a [0, 1] """ return self._allows_duplicate_labels @@ -276,7 +301,7 @@ def set_flags( Parameters ---------- - allows_duplicate_labels: + allows_duplicate_labels : bool, optional Whether the returned object allows duplicate labels. Returns @@ -286,8 +311,17 @@ def set_flags( See Also -------- - DataFrame.attrs - DataFrame.allows_duplicate_labels + DataFrame.attrs : Set global metadata on this object. + DataFrame.allows_duplicate_labels : If this object allows duplicate labels. 
+ + Examples + -------- + >>> df = pd.DataFrame({"A": [1, 2]}) + >>> df.allows_duplicate_labels + True + >>> df2 = df.set_flags(allows_duplicate_labels=False) + >>> df2.allows_duplicate_labels + False """ df = self.copy() if allows_duplicate_labels is not None: From 674cb97302bfa3ea68ff677e5e323181fdc7d126 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 5 Aug 2020 10:12:22 -0500 Subject: [PATCH 22/35] typing --- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 153b563f29cfb..b9be08575be7f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -292,7 +292,7 @@ def allows_duplicate_labels(self, value: bool): object.__setattr__(self, "_allows_duplicate_labels", value) def set_flags( - self, *, allows_duplicate_labels: Optional[bool] = None + self: FrameOrSeries, *, allows_duplicate_labels: Optional[bool] = None ) -> FrameOrSeries: """ Set global attributes on a copy of this object. From cc80b02ba7414684749139ddec0a97b3dd50a749 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 6 Aug 2020 06:58:33 -0500 Subject: [PATCH 23/35] fixed typo --- pandas/tests/generic/test_duplicate_labels.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/generic/test_duplicate_labels.py b/pandas/tests/generic/test_duplicate_labels.py index 4a492cb4b1134..3b737a004f32a 100644 --- a/pandas/tests/generic/test_duplicate_labels.py +++ b/pandas/tests/generic/test_duplicate_labels.py @@ -94,7 +94,7 @@ def test_preserve_getitem(self): assert df.loc[[0]].allows_duplicate_labels is False assert df.loc[0, ["A"]].allows_duplicate_labels is False - @pytest.mark.xfail(resason="Unclear behavior.") + @pytest.mark.xfail(reason="Unclear behavior.") def test_ndframe_getitem_caching_issue(self): # NDFrame.__getitem__ will cache the first df['A']. May need to # invalidate that cache? Update the cached entries? 
From 4ced35143cc8809b24ddb2594c424ad0397b9ce9 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 21 Aug 2020 10:41:28 -0500 Subject: [PATCH 24/35] todo --- pandas/_testing.py | 6 + pandas/core/_flags.py | 99 ++++++++++ pandas/core/frame.py | 9 +- pandas/core/generic.py | 83 +++----- pandas/core/series.py | 13 +- pandas/errors/__init__.py | 4 +- pandas/tests/frame/test_api.py | 19 +- pandas/tests/generic/test_duplicate_labels.py | 182 ++++++++++-------- pandas/tests/generic/test_generic.py | 10 + pandas/tests/series/test_api.py | 18 +- pandas/tests/test_flags.py | 26 +++ 11 files changed, 308 insertions(+), 161 deletions(-) create mode 100644 pandas/core/_flags.py create mode 100644 pandas/tests/test_flags.py diff --git a/pandas/_testing.py b/pandas/_testing.py index ef6232fa6d575..42ffd50fec764 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -1431,6 +1431,7 @@ def assert_frame_equal( check_categorical=True, check_like=False, check_freq=True, + check_flags=True, rtol=1.0e-5, atol=1.0e-8, obj="DataFrame", @@ -1492,6 +1493,8 @@ def assert_frame_equal( (same as in columns) - same labels must be with the same data. check_freq : bool, default True Whether to check the `freq` attribute on a DatetimeIndex or TimedeltaIndex. + check_flags : bool, default True + Whether to check the `flags` attribute. rtol : float, default 1e-5 Relative tolerance. Only used when check_exact is False. @@ -1565,6 +1568,9 @@ def assert_frame_equal( if check_like: left, right = left.reindex_like(right), right + if check_flags: + assert left.flags == right.flags + # index comparison assert_index_equal( left.index, diff --git a/pandas/core/_flags.py b/pandas/core/_flags.py new file mode 100644 index 0000000000000..2158e9f8d66ff --- /dev/null +++ b/pandas/core/_flags.py @@ -0,0 +1,99 @@ +import weakref + + +class Flags: + """ + Flags that apply to pandas objects. 
+ + Parameters + ---------- + obj : Series or DataFrame + The object these flags are associated with + allows_duplicate_labels : bool + Whether the object allows duplicate labels + + Notes + ----- + Attributes can be set in two ways + + >>> df = pd.DataFrame() + >>> df.flags + + >>> df.flags.allows_duplicate_labels = False + >>> df.flags + + + >>> df.flags['allows_duplicate_labels'] = True + >>> df.flags + + """ + + _keys = {"allows_duplicate_labels"} + + def __init__(self, obj, *, allows_duplicate_labels): + self._allows_duplicate_labels = allows_duplicate_labels + self._obj = weakref.ref(obj) + + @property + def allows_duplicate_labels(self) -> bool: + """ + Whether this object allows duplicate labels. + + Setting ``allows_duplicate_labels=False`` ensures that the + index (and columns of a DataFrame) are unique. Most methods + that accept and return a Series or DataFrame will propagate + the value of ``allows_duplicate_labels``. + + See :ref:`duplicates` for more. + + See Also + -------- + DataFrame.attrs : Set global metadata on this object. + DataFrame.set_flags : Set global flags on this object. + + Examples + -------- + >>> df = pd.DataFrame({"A": [1, 2]}, index=['a', 'a']) + >>> df.allows_duplicate_labels + True + >>> df.allows_duplicate_labels = False + Traceback (most recent call last): + ... + pandas.errors.DuplicateLabelError: Index has duplicates. + positions + label + a [0, 1] + """ + return self._allows_duplicate_labels + + @allows_duplicate_labels.setter + def allows_duplicate_labels(self, value: bool): + value = bool(value) + obj = self._obj() + if obj is None: + raise ValueError("This flags object has been deleted.") + + if not value: + for ax in obj.axes: + ax._maybe_check_unique() + + self._allows_duplicate_labels = value + + def __getitem__(self, key): + if key not in self._keys: + raise KeyError(key) + + return getattr(self, key) + + def __setitem__(self, key, value): + if key not in self._keys: + raise ValueError(f"Unknown flag {key}. 
Must be one of {self._keys}") + setattr(self, key, value) + + def __repr__(self): + return f"" + + def __eq__(self, other): + if isinstance(other, type(self)): + return self.allows_duplicate_labels == other.allows_duplicate_labels + return False diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ca27b46c6ecf1..b98727c1c87c7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -461,7 +461,6 @@ def __init__( columns: Optional[Axes] = None, dtype: Optional[Dtype] = None, copy: bool = False, - allows_duplicate_labels=True, ): if data is None: data = {} @@ -475,7 +474,7 @@ def __init__( if index is None and columns is None and dtype is None and copy is False: # GH#33357 fastpath NDFrame.__init__( - self, data, allows_duplicate_labels=allows_duplicate_labels + self, data, ) return @@ -580,7 +579,7 @@ def __init__( values, index, columns, dtype=values.dtype, copy=False ) - NDFrame.__init__(self, mgr, allows_duplicate_labels=allows_duplicate_labels) + NDFrame.__init__(self, mgr) # ---------------------------------------------------------------------- @@ -3673,10 +3672,10 @@ def insert(self, loc, column, value, allow_duplicates=False) -> None: value : int, Series, or array-like allow_duplicates : bool, optional """ - if allow_duplicates and not self.allows_duplicate_labels: + if allow_duplicates and not self.flags.allows_duplicate_labels: raise ValueError( "Cannot specify 'allow_duplicates=True' when " - "'self.allows_duplicate_labels' is False." + "'self.flags.allows_duplicate_labels' is False." 
) self._ensure_valid_index(value) value = self._sanitize_column(column, value, broadcast=False) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c6e252c161250..73c8ba5edbca1 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -89,6 +89,7 @@ import pandas as pd from pandas.core import missing, nanops +from pandas.core._flags import Flags import pandas.core.algorithms as algos from pandas.core.base import PandasObject, SelectionMixin import pandas.core.common as com @@ -199,7 +200,6 @@ def __init__( self, data: BlockManager, copy: bool = False, - allows_duplicate_labels: bool = True, attrs: Optional[Mapping[Optional[Hashable], Any]] = None, ): # copy kwarg is retained for mypy compat, is not used @@ -212,7 +212,7 @@ def __init__( else: attrs = dict(attrs) object.__setattr__(self, "_attrs", attrs) - object.__setattr__(self, "allows_duplicate_labels", allows_duplicate_labels) + object.__setattr__(self, "_flags", Flags(self, allows_duplicate_labels=True)) @classmethod def _init_mgr(cls, mgr, axes, dtype=None, copy: bool = False) -> BlockManager: @@ -253,54 +253,17 @@ def attrs(self, value: Mapping[Optional[Hashable], Any]) -> None: self._attrs = dict(value) @property - def allows_duplicate_labels(self) -> bool: - """ - Whether this object allows duplicate labels. - - Setting ``allows_duplicate_labels=False`` ensures that the - index (and columns of a DataFrame) are unique. Most methods - that accept and return a Series or DataFrame will propagate - the value of ``allows_duplicate_labels``. - - See :ref:`duplicates` for more. - - See Also - -------- - DataFrame.attrs : Set global metadata on this object. - DataFrame.set_flags : Set global flags on this object. - - Examples - -------- - >>> df = pd.DataFrame({"A": [1, 2]}, index=['a', 'a']) - >>> df.allows_duplicate_labels - True - >>> df.allows_duplicate_labels = False - Traceback (most recent call last): - ... - pandas.errors.DuplicateLabelError: Index has duplicates. 
- positions - label - a [0, 1] - """ - return self._allows_duplicate_labels - - @allows_duplicate_labels.setter - def allows_duplicate_labels(self, value: bool): - value = bool(value) - if not value: - for ax in self.axes: - ax._maybe_check_unique() - - # avoid `can_hold_identifiers` check. - object.__setattr__(self, "_allows_duplicate_labels", value) + def flags(self) -> Flags: + return self._flags def set_flags( - self: FrameOrSeries, *, allows_duplicate_labels: Optional[bool] = None + self: FrameOrSeries, + *, + copy: bool = False, + allows_duplicate_labels: Optional[bool] = None, ) -> FrameOrSeries: """ - Set global attributes on a copy of this object. - - This method is intended to be used in method chains. + Return a new object with updated flags. Parameters ---------- @@ -312,6 +275,18 @@ def set_flags( Series or DataFrame The same type as the caller. + Notes + ----- + This method returns a new object that's a view on the same data + as the input. Mutating the input or the output will be reflected + in the other. + + This method is intended to be used in method chains. + + "Flags" differ from "metadata". Flags reflect properties of the + pandas object (the Series or DataFrame). Metadata refer to properties + of the dataset, and should be stored in :attr:`DataFrame.attrs`. + See Also -------- DataFrame.attrs : Set global metadata on this object. 
@@ -326,9 +301,9 @@ def set_flags( >>> df2.allows_duplicate_labels False """ - df = self.copy() + df = self.copy(deep=copy) if allows_duplicate_labels is not None: - df.allows_duplicate_labels = allows_duplicate_labels + df.flags["allows_duplicate_labels"] = allows_duplicate_labels return df @classmethod @@ -3889,10 +3864,10 @@ def __delitem__(self, key) -> None: # Unsorted def _check_inplace_and_allows_duplicate_labels(self, inplace): - if inplace and not self.allows_duplicate_labels: + if inplace and not self.flags.allows_duplicate_labels: raise ValueError( "Cannot specify 'inplace=True' when " - "'self.allows_duplicate_labels' is False." + "'self.flags.allows_duplicate_labels' is False." ) def get(self, key, default=None): @@ -5298,15 +5273,17 @@ def __finalize__( for name in other.attrs: self.attrs[name] = other.attrs[name] - self.allows_duplicate_labels = other.allows_duplicate_labels + self.flags.allows_duplicate_labels = other.flags.allows_duplicate_labels # For subclasses using _metadata. 
for name in self._metadata: assert isinstance(name, str) object.__setattr__(self, name, getattr(other, name, None)) if method == "concat": - allows_duplicate_labels = all(x.allows_duplicate_labels for x in other.objs) - self.allows_duplicate_labels = allows_duplicate_labels + allows_duplicate_labels = all( + x.flags.allows_duplicate_labels for x in other.objs + ) + self.flags.allows_duplicate_labels = allows_duplicate_labels return self diff --git a/pandas/core/series.py b/pandas/core/series.py index 3614c74d1916e..00be7494654ff 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -216,14 +216,7 @@ class Series(base.IndexOpsMixin, generic.NDFrame): # Constructors def __init__( - self, - data=None, - index=None, - dtype=None, - name=None, - copy=False, - allows_duplicate_labels=True, - fastpath=False, + self, data=None, index=None, dtype=None, name=None, copy=False, fastpath=False, ): if ( @@ -234,7 +227,7 @@ def __init__( ): # GH#33357 called with just the SingleBlockManager NDFrame.__init__( - self, data, allows_duplicate_labels=allows_duplicate_labels + self, data, ) self.name = name return @@ -355,7 +348,7 @@ def __init__( data = SingleBlockManager.from_array(data, index) generic.NDFrame.__init__( - self, data, allows_duplicate_labels=allows_duplicate_labels + self, data, ) self.name = name self._set_axis(0, index, fastpath=True) diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 397ab1440b8d0..7891671e449b7 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -210,7 +210,9 @@ class DuplicateLabelError(ValueError): Examples -------- - >>> s = pd.Series([0, 1, 2], index=['a', 'b', 'c'], allows_duplicate_labels=False) + >>> s = pd.Series([0, 1, 2], index=['a', 'b', 'c']).set_flags( + ... allows_duplicate_labels=False + ... ) >>> s.reindex(['a', 'a', 'b']) Traceback (most recent call last): ... 
diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 6bd55c02887b8..f5dcabdd5cbfd 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -557,15 +557,26 @@ def test_set_flags(self, allows_duplicate_labels): result = df.set_flags(allows_duplicate_labels=allows_duplicate_labels) if allows_duplicate_labels is None: # We don't update when it's not provided - assert result.allows_duplicate_labels is True + assert result.flags.allows_duplicate_labels is True else: - assert result.allows_duplicate_labels is allows_duplicate_labels + assert result.flags.allows_duplicate_labels is allows_duplicate_labels # We made a copy assert df is not result + # We didn't mutate df - assert df.allows_duplicate_labels is True - tm.assert_frame_equal(result, df) + assert df.flags.allows_duplicate_labels is True + + # But we didn't copy data + result.iloc[0, 0] = 0 + assert df.iloc[0, 0] == 0 + + # Now we do copy. + result = df.set_flags( + copy=True, allows_duplicate_labels=allows_duplicate_labels + ) + result.iloc[0, 0] = 10 + assert df.iloc[0, 0] == 0 def test_cache_on_copy(self): # GH 31784 _item_cache not cleared on copy causes incorrect reads after updates diff --git a/pandas/tests/generic/test_duplicate_labels.py b/pandas/tests/generic/test_duplicate_labels.py index 3b737a004f32a..b71c67af3c018 100644 --- a/pandas/tests/generic/test_duplicate_labels.py +++ b/pandas/tests/generic/test_duplicate_labels.py @@ -25,10 +25,10 @@ class TestPreserves: ) def test_construction_ok(self, cls, data): result = cls(data) - assert result.allows_duplicate_labels is True + assert result.flags.allows_duplicate_labels is True - result = cls(data, allows_duplicate_labels=False) - assert result.allows_duplicate_labels is False + result = cls(data).set_flags(allows_duplicate_labels=False) + assert result.flags.allows_duplicate_labels is False @pytest.mark.parametrize( "func", @@ -42,8 +42,8 @@ def test_construction_ok(self, cls, data): ], ) def 
test_preserved_series(self, func): - s = pd.Series([0, 1], index=["a", "b"], allows_duplicate_labels=False) - assert func(s).allows_duplicate_labels is False + s = pd.Series([0, 1], index=["a", "b"]).set_flags(allows_duplicate_labels=False) + assert func(s).flags.allows_duplicate_labels is False @pytest.mark.parametrize( "other", [pd.Series(0, index=["a", "b", "c"]), pd.Series(0, index=["a", "b"])] @@ -51,22 +51,22 @@ def test_preserved_series(self, func): # TODO: frame @not_implemented def test_align(self, other): - s = pd.Series([0, 1], index=["a", "b"], allows_duplicate_labels=False) + s = pd.Series([0, 1], index=["a", "b"]).set_flags(allows_duplicate_labels=False) a, b = s.align(other) - assert a.allows_duplicate_labels is False - assert b.allows_duplicate_labels is False + assert a.flags.allows_duplicate_labels is False + assert b.flags.allows_duplicate_labels is False def test_preserved_frame(self): - df = pd.DataFrame( - {"A": [1, 2], "B": [3, 4]}, index=["a", "b"], allows_duplicate_labels=False + df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}, index=["a", "b"]).set_flags( + allows_duplicate_labels=False ) - assert df.loc[["a"]].allows_duplicate_labels is False - assert df.loc[:, ["A", "B"]].allows_duplicate_labels is False + assert df.loc[["a"]].flags.allows_duplicate_labels is False + assert df.loc[:, ["A", "B"]].flags.allows_duplicate_labels is False @not_implemented def test_to_frame(self): - s = pd.Series(dtype=float, allows_duplicate_labels=False) - assert s.to_frame().allows_duplicate_labels is False + s = pd.Series(dtype=float).set_flags(allows_duplicate_labels=False) + assert s.to_frame().flags.allows_duplicate_labels is False @pytest.mark.parametrize("func", ["add", "sub"]) @pytest.mark.parametrize( @@ -74,34 +74,34 @@ def test_to_frame(self): ) @pytest.mark.parametrize("other", [1, pd.Series([1, 2], name="A")]) def test_binops(self, func, other, frame): - df = pd.Series( - [1, 2], name="A", index=["a", "b"], allows_duplicate_labels=False + df = 
pd.Series([1, 2], name="A", index=["a", "b"]).set_flags( + allows_duplicate_labels=False ) if frame: df = df.to_frame() if isinstance(other, pd.Series) and frame: other = other.to_frame() func = operator.methodcaller(func, other) - assert df.allows_duplicate_labels is False - assert func(df).allows_duplicate_labels is False + assert df.flags.allows_duplicate_labels is False + assert func(df).flags.allows_duplicate_labels is False @not_implemented def test_preserve_getitem(self): - df = pd.DataFrame({"A": [1, 2]}, allows_duplicate_labels=False) - assert df[["A"]].allows_duplicate_labels is False - assert df["A"].allows_duplicate_labels is False - assert df.loc[0].allows_duplicate_labels is False - assert df.loc[[0]].allows_duplicate_labels is False - assert df.loc[0, ["A"]].allows_duplicate_labels is False + df = pd.DataFrame({"A": [1, 2]}).set_flags(allows_duplicate_labels=False) + assert df[["A"]].flags.allows_duplicate_labels is False + assert df["A"].flags.allows_duplicate_labels is False + assert df.loc[0].flags.allows_duplicate_labels is False + assert df.loc[[0]].flags.allows_duplicate_labels is False + assert df.loc[0, ["A"]].flags.allows_duplicate_labels is False @pytest.mark.xfail(reason="Unclear behavior.") def test_ndframe_getitem_caching_issue(self): # NDFrame.__getitem__ will cache the first df['A']. May need to # invalidate that cache? Update the cached entries? 
- df = pd.DataFrame({"A": [0]}, allows_duplicate_labels=False) - assert df["A"].allows_duplicate_labels is False - df.allows_duplicate_labels = True - assert df["A"].allows_duplicate_labels is True + df = pd.DataFrame({"A": [0]}).set_flags(allows_duplicate_labels=False) + assert df["A"].flags.allows_duplicate_labels is False + df.flags.allows_duplicate_labels = True + assert df["A"].flags.allows_duplicate_labels is True @pytest.mark.parametrize( "objs, kwargs", @@ -109,55 +109,67 @@ def test_ndframe_getitem_caching_issue(self): # Series ( [ - pd.Series(1, index=["a", "b"], allows_duplicate_labels=False), - pd.Series(2, index=["c", "d"], allows_duplicate_labels=False), + pd.Series(1, index=["a", "b"]).set_flags( + allows_duplicate_labels=False + ), + pd.Series(2, index=["c", "d"]).set_flags( + allows_duplicate_labels=False + ), ], {}, ), ( [ - pd.Series(1, index=["a", "b"], allows_duplicate_labels=False), - pd.Series(2, index=["a", "b"], allows_duplicate_labels=False), + pd.Series(1, index=["a", "b"]).set_flags( + allows_duplicate_labels=False + ), + pd.Series(2, index=["a", "b"]).set_flags( + allows_duplicate_labels=False + ), ], {"ignore_index": True}, ), ( [ - pd.Series(1, index=["a", "b"], allows_duplicate_labels=False), - pd.Series(2, index=["a", "b"], allows_duplicate_labels=False), + pd.Series(1, index=["a", "b"]).set_flags( + allows_duplicate_labels=False + ), + pd.Series(2, index=["a", "b"]).set_flags( + allows_duplicate_labels=False + ), ], {"axis": 1}, ), # Frame ( [ - pd.DataFrame( - {"A": [1, 2]}, index=["a", "b"], allows_duplicate_labels=False + pd.DataFrame({"A": [1, 2]}, index=["a", "b"]).set_flags( + allows_duplicate_labels=False ), - pd.DataFrame( - {"A": [1, 2]}, index=["c", "d"], allows_duplicate_labels=False + pd.DataFrame({"A": [1, 2]}, index=["c", "d"]).set_flags( + allows_duplicate_labels=False ), ], {}, ), ( [ - pd.DataFrame( - {"A": [1, 2]}, index=["a", "b"], allows_duplicate_labels=False + pd.DataFrame({"A": [1, 2]}, index=["a", 
"b"]).set_flags( + allows_duplicate_labels=False ), - pd.DataFrame( - {"A": [1, 2]}, index=["a", "b"], allows_duplicate_labels=False + pd.DataFrame({"A": [1, 2]}, index=["a", "b"]).set_flags( + allows_duplicate_labels=False ), ], {"ignore_index": True}, ), ( [ - pd.DataFrame( - {"A": [1, 2]}, index=["a", "b"], allows_duplicate_labels=False + pd.DataFrame({"A": [1, 2]}, index=["a", "b"]).set_flags( + allows_duplicate_labels=False ), - pd.DataFrame( - {"B": [1, 2]}, index=["a", "b"], allows_duplicate_labels=False + pd.DataFrame({"B": [1, 2]}, index=["a", "b"]).set_flags( + allows_duplicate_labels=False ), ], {"axis": 1}, @@ -165,13 +177,10 @@ def test_ndframe_getitem_caching_issue(self): # Series / Frame ( [ - pd.DataFrame( - {"A": [1, 2]}, index=["a", "b"], allows_duplicate_labels=False + pd.DataFrame({"A": [1, 2]}, index=["a", "b"]).set_flags( + allows_duplicate_labels=False ), - pd.Series( - [1, 2], - index=["a", "b"], - name="B", + pd.Series([1, 2], index=["a", "b"], name="B",).set_flags( allows_duplicate_labels=False, ), ], @@ -181,18 +190,18 @@ def test_ndframe_getitem_caching_issue(self): ) def test_concat(self, objs, kwargs): result = pd.concat(objs, **kwargs) - assert result.allows_duplicate_labels is False + assert result.flags.allows_duplicate_labels is False @pytest.mark.parametrize( "left, right, kwargs, expected", [ # false false false pytest.param( - pd.DataFrame( - {"A": [0, 1]}, index=["a", "b"], allows_duplicate_labels=False + pd.DataFrame({"A": [0, 1]}, index=["a", "b"]).set_flags( + allows_duplicate_labels=False ), - pd.DataFrame( - {"B": [0, 1]}, index=["a", "d"], allows_duplicate_labels=False + pd.DataFrame({"B": [0, 1]}, index=["a", "d"]).set_flags( + allows_duplicate_labels=False ), dict(left_index=True, right_index=True), False, @@ -200,8 +209,8 @@ def test_concat(self, objs, kwargs): ), # false true false pytest.param( - pd.DataFrame( - {"A": [0, 1]}, index=["a", "b"], allows_duplicate_labels=False + pd.DataFrame({"A": [0, 1]}, index=["a", 
"b"]).set_flags( + allows_duplicate_labels=False ), pd.DataFrame({"B": [0, 1]}, index=["a", "d"]), dict(left_index=True, right_index=True), @@ -219,7 +228,7 @@ def test_concat(self, objs, kwargs): ) def test_merge(self, left, right, kwargs, expected): result = pd.merge(left, right, **kwargs) - assert result.allows_duplicate_labels is expected + assert result.flags.allows_duplicate_labels is expected @not_implemented def test_groupby(self): @@ -228,10 +237,9 @@ def test_groupby(self): # - apply # - transform # - Should passing a grouper that disallows duplicates propagate? - # i.e. df.groupby(pd.Series([0, 1], allows_duplicate_labels=False))? - df = pd.DataFrame({"A": [1, 2, 3]}, allows_duplicate_labels=False) + df = pd.DataFrame({"A": [1, 2, 3]}).set_flags(allows_duplicate_labels=False) result = df.groupby([0, 0, 1]).agg("count") - assert result.allows_duplicate_labels is False + assert result.flags.allows_duplicate_labels is False @pytest.mark.parametrize("frame", [True, False]) @not_implemented @@ -244,9 +252,9 @@ def test_window(self, frame): ) if frame: df = df.to_frame() - assert df.rolling(3).mean().allows_duplicate_labels is False - assert df.ewm(3).mean().allows_duplicate_labels is False - assert df.expanding(3).mean().allows_duplicate_labels is False + assert df.rolling(3).mean().flags.allows_duplicate_labels is False + assert df.ewm(3).mean().flags.allows_duplicate_labels is False + assert df.expanding(3).mean().flags.allows_duplicate_labels is False # ---------------------------------------------------------------------------- @@ -263,12 +271,12 @@ class TestRaises: (pd.DataFrame, {"columns": ["b", "b"]}), ], ) - def test_construction_with_duplicates(self, cls, axes): + def test_set_flags_with_duplicates(self, cls, axes): result = cls(**axes) - assert result.allows_duplicate_labels is True + assert result.flags.allows_duplicate_labels is True with pytest.raises(pd.errors.DuplicateLabelError): - cls(**axes, allows_duplicate_labels=False) + 
cls(**axes).set_flags(allows_duplicate_labels=False) @pytest.mark.parametrize( "data", @@ -280,15 +288,15 @@ def test_construction_with_duplicates(self, cls, axes): ) def test_setting_allows_duplicate_labels_raises(self, data): with pytest.raises(pd.errors.DuplicateLabelError): - data.allows_duplicate_labels = False + data.flags.allows_duplicate_labels = False - assert data.allows_duplicate_labels is True + assert data.flags.allows_duplicate_labels is True @pytest.mark.parametrize( "func", [operator.methodcaller("append", pd.Series(0, index=["a", "b"]))] ) def test_series_raises(self, func): - s = pd.Series([0, 1], index=["a", "b"], allows_duplicate_labels=False) + s = pd.Series([0, 1], index=["a", "b"]).set_flags(allows_duplicate_labels=False) with pytest.raises(pd.errors.DuplicateLabelError): func(s) @@ -315,8 +323,8 @@ def test_series_raises(self, func): ], ) def test_getitem_raises(self, getter, target): - df = pd.DataFrame( - {"A": [1, 2], "B": [3, 4]}, index=["a", "b"], allows_duplicate_labels=False + df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}, index=["a", "b"]).set_flags( + allows_duplicate_labels=False ) if target: # df, df.loc, or df.iloc @@ -332,8 +340,12 @@ def test_getitem_raises(self, getter, target): [ ( [ - pd.Series(1, index=[0, 1], name="a", allows_duplicate_labels=False), - pd.Series(2, index=[0, 1], name="a", allows_duplicate_labels=False), + pd.Series(1, index=[0, 1], name="a").set_flags( + allows_duplicate_labels=False + ), + pd.Series(2, index=[0, 1], name="a").set_flags( + allows_duplicate_labels=False + ), ], {"axis": 1}, ) @@ -345,8 +357,8 @@ def test_concat_raises(self, objs, kwargs): @not_implemented def test_merge_raises(self): - a = pd.DataFrame( - {"A": [0, 1, 2]}, index=["a", "b", "c"], allows_duplicate_labels=False + a = pd.DataFrame({"A": [0, 1, 2]}, index=["a", "b", "c"]).set_flags( + allows_duplicate_labels=False ) b = pd.DataFrame({"B": [0, 1, 2]}, index=["a", "b", "b"]) with pytest.raises(pd.errors.DuplicateLabelError): @@ 
-370,13 +382,13 @@ def test_merge_raises(self): ) def test_raises_basic(idx): with pytest.raises(pd.errors.DuplicateLabelError): - pd.Series(1, index=idx, allows_duplicate_labels=False) + pd.Series(1, index=idx).set_flags(allows_duplicate_labels=False) with pytest.raises(pd.errors.DuplicateLabelError): - pd.DataFrame({"A": [1, 1]}, index=idx, allows_duplicate_labels=False) + pd.DataFrame({"A": [1, 1]}, index=idx).set_flags(allows_duplicate_labels=False) with pytest.raises(pd.errors.DuplicateLabelError): - pd.DataFrame([[1, 2]], columns=idx, allows_duplicate_labels=False) + pd.DataFrame([[1, 2]], columns=idx).set_flags(allows_duplicate_labels=False) def test_format_duplicate_labels_message(): @@ -399,7 +411,7 @@ def test_format_duplicate_labels_message_multi(): def test_dataframe_insert_raises(): - df = pd.DataFrame({"A": [1, 2]}, allows_duplicate_labels=False) + df = pd.DataFrame({"A": [1, 2]}).set_flags(allows_duplicate_labels=False) with pytest.raises(ValueError, match="Cannot specify"): df.insert(0, "A", [3, 4], allow_duplicates=True) @@ -414,9 +426,11 @@ def test_dataframe_insert_raises(): ], ) def test_inplace_raises(method, frame_only): - df = pd.DataFrame({"A": [0, 0], "B": [1, 2]}, allows_duplicate_labels=False) + df = pd.DataFrame({"A": [0, 0], "B": [1, 2]}).set_flags( + allows_duplicate_labels=False + ) s = df["A"] - s.allows_duplicate_labels = False + s.flags.allows_duplicate_labels = False msg = "Cannot specify" with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py index 5e66925a38ec6..cd61223e3d0c0 100644 --- a/pandas/tests/generic/test_generic.py +++ b/pandas/tests/generic/test_generic.py @@ -887,3 +887,13 @@ def test_axis_numbers_deprecated(self, box): obj = box(dtype=object) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): obj._AXIS_NUMBERS + + def test_flags_identity(self): + s = pd.Series([1, 2]) + assert s.flags is s.flags + df = s.to_frame() + 
assert df.flags is df.flags + assert s.flags is not df.flags + + df2 = df.copy() + assert df2.flags is not df.flags diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index 3cfd6c28f9c34..2fe331fe7b758 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -528,15 +528,25 @@ def test_set_flags(self, allows_duplicate_labels): result = df.set_flags(allows_duplicate_labels=allows_duplicate_labels) if allows_duplicate_labels is None: # We don't update when it's not provided - assert result.allows_duplicate_labels is True + assert result.flags.allows_duplicate_labels is True else: - assert result.allows_duplicate_labels is allows_duplicate_labels + assert result.flags.allows_duplicate_labels is allows_duplicate_labels # We made a copy assert df is not result # We didn't mutate df - assert df.allows_duplicate_labels is True - tm.assert_series_equal(result, df) + assert df.flags.allows_duplicate_labels is True + + # But we didn't copy data + result.iloc[0] = 0 + assert df.iloc[0] == 0 + + # Now we do copy. 
+ result = df.set_flags( + copy=True, allows_duplicate_labels=allows_duplicate_labels + ) + result.iloc[0] = 10 + assert df.iloc[0] == 0 class TestCategoricalSeries: diff --git a/pandas/tests/test_flags.py b/pandas/tests/test_flags.py new file mode 100644 index 0000000000000..28411e56748cd --- /dev/null +++ b/pandas/tests/test_flags.py @@ -0,0 +1,26 @@ +import pandas as pd + + +class TestFlags: + def test_equality(self): + a = pd.DataFrame().set_flags(allows_duplicate_labels=True).flags + b = pd.DataFrame().set_flags(allows_duplicate_labels=False).flags + + assert a == a + assert b == b + assert a != b + assert a != 2 + + def test_set(self): + df = pd.DataFrame().set_flags(allows_duplicate_labels=True) + a = df.flags + a.allows_duplicate_labels = False + assert a.allows_duplicate_labels is False + a["allows_duplicate_labels"] = True + assert a.allows_duplicate_labels is True + + def test_repr(self): + a = repr(pd.DataFrame({"A"}).set_flags(allows_duplicate_labels=True).flags) + assert a == "" + a = repr(pd.DataFrame({"A"}).set_flags(allows_duplicate_labels=False).flags) + assert a == "" From 4f7c350d9bda6b36ee476a4771f86e855a28d091 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 24 Aug 2020 10:36:56 -0500 Subject: [PATCH 25/35] flags --- doc/source/reference/frame.rst | 16 ++++- doc/source/reference/series.rst | 15 ++++- doc/source/user_guide/duplicates.rst | 30 +++++---- doc/source/whatsnew/v1.2.0.rst | 9 ++- pandas/__init__.py | 1 + pandas/core/_flags.py | 18 ++++- pandas/core/api.py | 1 + pandas/core/flags.py | 99 ++++++++++++++++++++++++++++ pandas/core/frame.py | 16 ----- pandas/core/generic.py | 54 +++++++++++++-- pandas/core/series.py | 15 ----- pandas/tests/api/test_api.py | 1 + pandas/tests/generic/test_generic.py | 14 ++-- 13 files changed, 225 insertions(+), 64 deletions(-) create mode 100644 pandas/core/flags.py diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index 75c75064ffb6a..9a1ebc8d670dc 100644 --- 
a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -37,7 +37,6 @@ Attributes and underlying data DataFrame.shape DataFrame.memory_usage DataFrame.empty - DataFrame.allows_duplicate_labels DataFrame.set_flags Conversion @@ -278,6 +277,21 @@ Time Series-related DataFrame.tz_convert DataFrame.tz_localize +.. _api.frame.flags: + +Flags +~~~~~ + +Flags refer to attributes of the pandas object. Properties of the dataset (like +the date it was recorded, the URL it was accessed from, etc.) should be stored +in :attr:`DataFrame.attrs`. + +.. autosummary:: + :toctree: api/ + + Flags + + .. _api.frame.metadata: Metadata diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index 3daac4c4a8738..5131d35334693 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -39,7 +39,7 @@ Attributes Series.empty Series.dtypes Series.name - Series.allows_duplicate_labels + Series.flags Series.set_flags Conversion @@ -529,6 +529,19 @@ Sparse-dtype specific methods and attributes are provided under the Series.sparse.from_coo Series.sparse.to_coo +.. _api.series.flags: + +Flags +~~~~~ + +Flags refer to attributes of the pandas object. Properties of the dataset (like +the date it was recorded, the URL it was accessed from, etc.) should be stored +in :attr:`Series.attrs`. + +.. autosummary:: + :toctree: api/ + + Flags .. _api.series.metadata: diff --git a/doc/source/user_guide/duplicates.rst b/doc/source/user_guide/duplicates.rst index 145700b02bc16..0d27ae48f0d09 100644 --- a/doc/source/user_guide/duplicates.rst +++ b/doc/source/user_guide/duplicates.rst @@ -121,26 +121,29 @@ an exception will be raised. .. ipython:: python :okexcept: - pd.Series([0, 1, 2], index=['a', 'b', 'b'], allows_duplicate_labels=False) + pd.Series([0, 1, 2], index=['a', 'b', 'b']).set_flags(allows_duplicate_labels=False) This applies to both row and column labels for a :class:`DataFrame` .. 
ipython:: python :okexcept: - pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=["A", "B", "C"], - allows_duplicate_labels=False) + pd.DataFrame( + [[0, 1, 2], [3, 4, 5]], columns=["A", "B", "C"], + ).set_flags(allows_duplicate_labels=False) -This attribute can be checked or set with :attr:`~DataFrame.allows_duplicate_labels`, +This attribute can be checked or set with :attr:`~DataFrame.flags.allows_duplicate_labels`, which indicates whether that object can have duplicate labels. .. ipython:: python - df = pd.DataFrame({"A": [0, 1, 2, 3]}, - index=['x', 'y', 'X', 'Y'], - allows_duplicate_labels=False) + df = ( + pd.DataFrame({"A": [0, 1, 2, 3]}, + index=['x', 'y', 'X', 'Y']) + .set_flags(allows_duplicate_labels=False) + ) df - df.allows_duplicate_labels + df.flags.allows_duplicate_labels :meth:`DataFrame.set_flags` can be used to return a new ``DataFrame`` with attributes like ``allows_duplicate_labels`` set to some value @@ -148,15 +151,16 @@ like ``allows_duplicate_labels`` set to some value .. ipython:: python df2 = df.set_flags(allows_duplicate_labels=True) - df2.allows_duplicate_labels + df2.flags.allows_duplicate_labels +The new ``DataFrame`` returned is a view on the same data as the old ``DataFrame``. Or the property can just be set directly on the same object .. ipython:: python - df2.allows_duplicate_labels = False - df2.allows_duplicate_labels + df2.flags.allows_duplicate_labels = False + df2.flags.allows_duplicate_labels When processing raw, messy data you might initially read in the messy data (which potentially has duplicate labels), deduplicate, and then disallow duplicates @@ -167,7 +171,7 @@ going forward, to ensure that your data pipeline doesn't introduce duplicates. 
>>> raw = pd.read_csv("...") >>> deduplicated = raw.groupby(level=0).first() # remove duplicates - >>> deduplicated.allows_duplicate_labels = False # disallow going forward + >>> deduplicated.flags.allows_duplicate_labels = False # disallow going forward Setting ``allows_duplicate_labels=True`` on a ``Series`` or ``DataFrame`` with duplicate labels or performing an operation that introduces duplicate labels on a ``Series`` or @@ -191,7 +195,7 @@ operations. .. ipython:: python :okexcept: - s1 = pd.Series(0, index=['a', 'b'], allows_duplicate_labels=False) + s1 = pd.Series(0, index=['a', 'b']).set_flags(allows_duplicate_labels=False) s1 s1.head().rename({"a": "b"}) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 4b568772bafc4..4b4a55b66db38 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -31,14 +31,14 @@ By default, duplicates continue to be allowed .. ipython:: python :okexcept: - pd.Series([1, 2], index=['a', 'a'], allows_duplicate_labels=False) + pd.Series([1, 2], index=['a', 'a']).set_flags(allows_duplicate_labels=False) Pandas will propagate the ``allows_duplicate_labels`` property through many operations. .. ipython:: python :okexcept: - a = pd.Series([1, 2], index=['a', 'b'], allows_duplicate_labels=False) + a = pd.Series([1, 2], index=['a', 'b']).set_flags(allows_duplicate_labels=False) a # An operation introducing duplicates a.reindex(['a', 'b', 'a']) @@ -52,6 +52,11 @@ Pandas will propagate the ``allows_duplicate_labels`` property through many oper See :ref:`duplicates` for more. +The ``allows_duplicate_labels`` flag is stored in the new :attr:`DataFrame.flags` +attribute. This stores global attributes that apply to the *pandas object*. This +differs from :attr:`DataFrame.attrs`, which stores information that applies to +the dataset. 
+ Passing arguments to fsspec backends ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pandas/__init__.py b/pandas/__init__.py index 36576da74c75d..2737bcd8f9ccf 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -100,6 +100,7 @@ to_datetime, to_timedelta, # misc + Flags, Grouper, factorize, unique, diff --git a/pandas/core/_flags.py b/pandas/core/_flags.py index 2158e9f8d66ff..abbb918d8d4e5 100644 --- a/pandas/core/_flags.py +++ b/pandas/core/_flags.py @@ -5,12 +5,26 @@ class Flags: """ Flags that apply to pandas objects. + .. versionadded:: 1.2.0 + Parameters ---------- obj : Series or DataFrame The object these flags are associated with - allows_duplicate_labels : bool - Whether the object allows duplicate labels + allows_duplicate_labels : bool, default True + Whether to allow duplicate labels in this object. By default, + duplicate labels are permitted. Setting this to ``False`` will + cause an :class:`errors.DuplicateLabelError` to be raised when + `index` (or columns for DataFrame) is not unique, or any + subsequent operation on introduces duplicates. + See :ref:`duplicates.disallow` for more. + + .. warning:: + + This is an experimental feature. Currently, many methods fail to + propagate the ``allows_duplicate_labels`` value. In future versions + it is expected that every method taking or returning one or more + DataFrame or Series objects will propagate ``allows_duplicate_labels``. 
Notes ----- diff --git a/pandas/core/api.py b/pandas/core/api.py index b0b65f9d0be34..348e9206d6e19 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -26,6 +26,7 @@ ) from pandas.core.arrays.string_ import StringDtype from pandas.core.construction import array +from pandas.core.flags import Flags from pandas.core.groupby import Grouper, NamedAgg from pandas.core.indexes.api import ( CategoricalIndex, diff --git a/pandas/core/flags.py b/pandas/core/flags.py new file mode 100644 index 0000000000000..2158e9f8d66ff --- /dev/null +++ b/pandas/core/flags.py @@ -0,0 +1,99 @@ +import weakref + + +class Flags: + """ + Flags that apply to pandas objects. + + Parameters + ---------- + obj : Series or DataFrame + The object these flags are associated with + allows_duplicate_labels : bool + Whether the object allows duplicate labels + + Notes + ----- + Attributes can be set in two ways + + >>> df = pd.DataFrame() + >>> df.flags + <Flags(allows_duplicate_labels=True)> + >>> df.flags.allows_duplicate_labels = False + >>> df.flags + <Flags(allows_duplicate_labels=False)> + + >>> df.flags['allows_duplicate_labels'] = True + >>> df.flags + <Flags(allows_duplicate_labels=True)> + """ + + _keys = {"allows_duplicate_labels"} + + def __init__(self, obj, *, allows_duplicate_labels): + self._allows_duplicate_labels = allows_duplicate_labels + self._obj = weakref.ref(obj) + + @property + def allows_duplicate_labels(self) -> bool: + """ + Whether this object allows duplicate labels. + + Setting ``allows_duplicate_labels=False`` ensures that the + index (and columns of a DataFrame) are unique. Most methods + that accept and return a Series or DataFrame will propagate + the value of ``allows_duplicate_labels``. + + See :ref:`duplicates` for more. + + See Also + -------- + DataFrame.attrs : Set global metadata on this object. + DataFrame.set_flags : Set global flags on this object. 
+ + Examples + -------- + >>> df = pd.DataFrame({"A": [1, 2]}, index=['a', 'a']) + >>> df.allows_duplicate_labels + True + >>> df.allows_duplicate_labels = False + Traceback (most recent call last): + ... 
+ pandas.errors.DuplicateLabelError: Index has duplicates. + positions + label + a [0, 1] + """ + return self._allows_duplicate_labels + + @allows_duplicate_labels.setter + def allows_duplicate_labels(self, value: bool): + value = bool(value) + obj = self._obj() + if obj is None: + raise ValueError("This flags object has been deleted.") + + if not value: + for ax in obj.axes: + ax._maybe_check_unique() + + self._allows_duplicate_labels = value + + def __getitem__(self, key): + if key not in self._keys: + raise KeyError(key) + + return getattr(self, key) + + def __setitem__(self, key, value): + if key not in self._keys: + raise ValueError(f"Unknown flag {key}. Must be one of {self._keys}") + setattr(self, key, value) + + def __repr__(self): + return f"<Flags(allows_duplicate_labels={self.allows_duplicate_labels})>" + + def __eq__(self, other): + if isinstance(other, type(self)): + return self.allows_duplicate_labels == other.allows_duplicate_labels + return False diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b98727c1c87c7..810c828ed404e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -369,22 +369,6 @@ class DataFrame(NDFrame): Data type to force. Only a single dtype is allowed. If None, infer. copy : bool, default False Copy data from inputs. Only affects DataFrame / 2d ndarray input. - allows_duplicate_labels : bool, default True - Whether to allow duplicate row or column labels in this DataFrame. - By default, duplicate labels are permitted. Setting this to ``False`` - will cause an :class:`errors.DuplicateLabelError` to be raised when - `index` or `columns` are not unique, or when any subsequent operation - on this DataFrame introduces duplicates. See :ref:`duplicates.disallow` - for more. - - .. versionadded:: 1.1.0 - - .. warning:: - - This is an experimental feature. Currently, many methods fail to - propagate the ``allows_duplicate_labels`` value. 
In future versions - it is expected that every method taking or returning one or more - DataFrame or Series objects will propagate ``allows_duplicate_labels``. See Also -------- diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 73c8ba5edbca1..89e34a7e7291d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -89,11 +89,11 @@ import pandas as pd from pandas.core import missing, nanops -from pandas.core._flags import Flags import pandas.core.algorithms as algos from pandas.core.base import PandasObject, SelectionMixin import pandas.core.common as com from pandas.core.construction import create_series_with_explicit_dtype +from pandas.core.flags import Flags from pandas.core.indexes.api import Index, MultiIndex, RangeIndex, ensure_index from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.period import Period, PeriodIndex @@ -238,11 +238,15 @@ def _init_mgr(cls, mgr, axes, dtype=None, copy: bool = False) -> BlockManager: @property def attrs(self) -> Dict[Optional[Hashable], Any]: """ - Dictionary of global attributes on this object. + Dictionary of global attributes of this dataset. .. warning:: attrs is experimental and may change without warning. + + See Also + -------- + DataFrame.flags """ if self._attrs is None: self._attrs = {} @@ -254,6 +258,42 @@ def attrs(self, value: Mapping[Optional[Hashable], Any]) -> None: @property def flags(self) -> Flags: + """ + Get the properties associated with this pandas object. + + The available flags are + + * :attr:`Flags.allows_duplicate_labels` + + Notes + ----- + "Flags" differ from "metadata". Flags reflect properties of the + pandas object (the Series or DataFrame). Metadata refer to properties + of the dataset, and should be stored in :attr:`DataFrame.attrs`. 
+ + See Also + -------- + Flags + DataFrame.attrs + + Examples + -------- + >>> df = pd.DataFrame({"A": [1, 2]}) + >>> df.flags + <Flags(allows_duplicate_labels=True)> + + Flags can be read or set using ``.`` + + >>> df.flags.allows_duplicate_labels + True + >>> df.flags.allows_duplicate_labels = False + + Or by slicing with a key + + >>> df.flags["allows_duplicate_labels"] + False + >>> df.flags["allows_duplicate_labels"] = True + """ return self._flags def set_flags( @@ -278,7 +318,7 @@ def set_flags( Notes ----- This method returns a new object that's a view on the same data - as the input. Mutating the input or the output will be reflected + as the input. Mutating the input or the output values will be reflected in the other. This method is intended to be used in method chains. @@ -289,16 +329,16 @@ def set_flags( See Also -------- - DataFrame.attrs : Set global metadata on this object. - DataFrame.allows_duplicate_labels : If this object allows duplicate labels. + DataFrame.attrs : Global metadata applying to this dataset. + DataFrame.flags : Global flags applying to this object. Examples -------- >>> df = pd.DataFrame({"A": [1, 2]}) - >>> df.allows_duplicate_labels + >>> df.flags.allows_duplicate_labels True >>> df2 = df.set_flags(allows_duplicate_labels=False) - >>> df2.allows_duplicate_labels + >>> df2.flags.allows_duplicate_labels False """ df = self.copy(deep=copy) diff --git a/pandas/core/series.py b/pandas/core/series.py index 0509f57fc381f..28b670c08bc83 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -175,21 +175,6 @@ class Series(base.IndexOpsMixin, generic.NDFrame): The name to give to the Series. copy : bool, default False Copy input data. - allows_duplicate_labels : bool, default True - Whether to allow duplicate labels in this Series. By default, - duplicate labels are permitted. 
Setting this to ``False`` will - cause an :class:`errors.DuplicateLabelError` to be raised when - `index` is not unique, or any subsequent operation on this Series - introduces duplicates. 
See :ref:`duplicates.disallow` for more. - - .. versionadded:: 1.1.0 - - .. warning:: - - This is an experimental feature. Currently, many methods fail to - propagate the ``allows_duplicate_labels`` value. In future versions - it is expected that every method taking or returning one or more - DataFrame or Series objects will propagate ``allows_duplicate_labels``. """ _typ = "series" diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 1d25336cd3b70..54da13c3c620b 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -61,6 +61,7 @@ class TestPDApi(Base): "ExcelFile", "ExcelWriter", "Float64Index", + "Flags", "Grouper", "HDFStore", "Index", diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py index cd61223e3d0c0..23bb673586768 100644 --- a/pandas/tests/generic/test_generic.py +++ b/pandas/tests/generic/test_generic.py @@ -888,12 +888,12 @@ def test_axis_numbers_deprecated(self, box): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): obj._AXIS_NUMBERS - def test_flags_identity(self): + @pytest.mark.parametrize("as_frame", [True, False]) + def test_flags_identity(self, as_frame): s = pd.Series([1, 2]) - assert s.flags is s.flags - df = s.to_frame() - assert df.flags is df.flags - assert s.flags is not df.flags + if as_frame: + s = s.to_frame() - df2 = df.copy() - assert df2.flags is not df.flags + assert s.flags is s.flags + s2 = s.copy() + assert s2.flags is not s.flags From ecb97d54273d814527b453b81cdc50a4eff97610 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 24 Aug 2020 10:38:20 -0500 Subject: [PATCH 26/35] rm _flags --- pandas/core/_flags.py | 113 ------------------------------------------ pandas/core/flags.py | 18 ++++++- 2 files changed, 16 insertions(+), 115 deletions(-) delete mode 100644 pandas/core/_flags.py diff --git a/pandas/core/_flags.py b/pandas/core/_flags.py deleted file mode 100644 index abbb918d8d4e5..0000000000000 --- 
a/pandas/core/_flags.py +++ /dev/null @@ -1,113 +0,0 @@ -import weakref - - -class Flags: - """ - Flags that apply to pandas objects. - - .. versionadded:: 1.2.0 - - Parameters - ---------- - obj : Series or DataFrame - The object these flags are associated with - allows_duplicate_labels : bool, default True - Whether to allow duplicate labels in this object. By default, - duplicate labels are permitted. Setting this to ``False`` will - cause an :class:`errors.DuplicateLabelError` to be raised when - `index` (or columns for DataFrame) is not unique, or any - subsequent operation on introduces duplicates. - See :ref:`duplicates.disallow` for more. - - .. warning:: - - This is an experimental feature. Currently, many methods fail to - propagate the ``allows_duplicate_labels`` value. In future versions - it is expected that every method taking or returning one or more - DataFrame or Series objects will propagate ``allows_duplicate_labels``. - - Notes - ----- - Attributes can be set in two ways - - >>> df = pd.DataFrame() - >>> df.flags - <Flags(allows_duplicate_labels=True)> - >>> df.flags.allows_duplicate_labels = False - >>> df.flags - <Flags(allows_duplicate_labels=False)> - - >>> df.flags['allows_duplicate_labels'] = True - >>> df.flags - <Flags(allows_duplicate_labels=True)> - """ - - _keys = {"allows_duplicate_labels"} - - def __init__(self, obj, *, allows_duplicate_labels): - self._allows_duplicate_labels = allows_duplicate_labels - self._obj = weakref.ref(obj) - - @property - def allows_duplicate_labels(self) -> bool: - """ - Whether this object allows duplicate labels. - - Setting ``allows_duplicate_labels=False`` ensures that the - index (and columns of a DataFrame) are unique. Most methods - that accept and return a Series or DataFrame will propagate - the value of ``allows_duplicate_labels``. - - See :ref:`duplicates` for more. - - See Also - -------- - DataFrame.attrs : Set global metadata on this object. 
- DataFrame.set_flags : Set global flags on this object. 
- - Examples - -------- - >>> df = pd.DataFrame({"A": [1, 2]}, index=['a', 'a']) - >>> df.allows_duplicate_labels - True - >>> df.allows_duplicate_labels = False - Traceback (most recent call last): - ... - pandas.errors.DuplicateLabelError: Index has duplicates. - positions - label - a [0, 1] - """ - return self._allows_duplicate_labels - - @allows_duplicate_labels.setter - def allows_duplicate_labels(self, value: bool): - value = bool(value) - obj = self._obj() - if obj is None: - raise ValueError("This flags object has been deleted.") - - if not value: - for ax in obj.axes: - ax._maybe_check_unique() - - self._allows_duplicate_labels = value - - def __getitem__(self, key): - if key not in self._keys: - raise KeyError(key) - - return getattr(self, key) - - def __setitem__(self, key, value): - if key not in self._keys: - raise ValueError(f"Unknown flag {key}. Must be one of {self._keys}") - setattr(self, key, value) - - def __repr__(self): - return f"<Flags(allows_duplicate_labels={self.allows_duplicate_labels})>" - - def __eq__(self, other): - if isinstance(other, type(self)): - return self.allows_duplicate_labels == other.allows_duplicate_labels - return False diff --git a/pandas/core/flags.py b/pandas/core/flags.py index 2158e9f8d66ff..abbb918d8d4e5 100644 --- a/pandas/core/flags.py +++ b/pandas/core/flags.py @@ -5,12 +5,26 @@ class Flags: """ Flags that apply to pandas objects. + .. versionadded:: 1.2.0 + Parameters ---------- obj : Series or DataFrame The object these flags are associated with - allows_duplicate_labels : bool - Whether the object allows duplicate labels + allows_duplicate_labels : bool, default True + Whether to allow duplicate labels in this object. By default, + duplicate labels are permitted. Setting this to ``False`` will + cause an :class:`errors.DuplicateLabelError` to be raised when + `index` (or columns for DataFrame) is not unique, or any + subsequent operation on introduces duplicates. 
+ See :ref:`duplicates.disallow` for more. + + .. warning:: + + This is an experimental feature. 
Currently, many methods fail to + propagate the ``allows_duplicate_labels`` value. In future versions + it is expected that every method taking or returning one or more + DataFrame or Series objects will propagate ``allows_duplicate_labels``. Notes ----- From d1a81fb3d2bd68992d006d913d88e9937bcffcc1 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 24 Aug 2020 13:34:58 -0500 Subject: [PATCH 27/35] fixups --- doc/source/user_guide/duplicates.rst | 11 +++++++---- doc/source/whatsnew/v1.2.0.rst | 5 ++++- pandas/core/generic.py | 13 ++++++------- pandas/tests/base/test_misc.py | 2 +- pandas/tests/io/test_pickle.py | 4 ++-- 5 files changed, 20 insertions(+), 15 deletions(-) diff --git a/doc/source/user_guide/duplicates.rst b/doc/source/user_guide/duplicates.rst index 0d27ae48f0d09..c9d007be9fd90 100644 --- a/doc/source/user_guide/duplicates.rst +++ b/doc/source/user_guide/duplicates.rst @@ -121,7 +121,10 @@ an exception will be raised. .. ipython:: python :okexcept: - pd.Series([0, 1, 2], index=['a', 'b', 'b']).set_flags(allows_duplicate_labels=False) + pd.Series( + [0, 1, 2], + index=['a', 'b', 'b'] + ).set_flags(allows_duplicate_labels=False) This applies to both row and column labels for a :class:`DataFrame` @@ -138,9 +141,9 @@ which indicates whether that object can have duplicate labels. .. ipython:: python df = ( - pd.DataFrame({"A": [0, 1, 2, 3]}, - index=['x', 'y', 'X', 'Y']) - .set_flags(allows_duplicate_labels=False) + pd.DataFrame({"A": [0, 1, 2, 3]}, + index=['x', 'y', 'X', 'Y']) + .set_flags(allows_duplicate_labels=False) ) df df.flags.allows_duplicate_labels diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 4b4a55b66db38..bf6a5f4579e9f 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -38,7 +38,10 @@ Pandas will propagate the ``allows_duplicate_labels`` property through many oper .. 
ipython:: python :okexcept: - a = pd.Series([1, 2], index=['a', 'b']).set_flags(allows_duplicate_labels=False) + a = ( + pd.Series([1, 2], index=['a', 'b']) + .set_flags(allows_duplicate_labels=False) + ) a # An operation introducing duplicates a.reindex(['a', 'b', 'a']) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 89e34a7e7291d..e1f7f65157133 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -315,6 +315,11 @@ def set_flags( Series or DataFrame The same type as the caller. + See Also + -------- + DataFrame.attrs : Global metadata applying to this dataset. + DataFrame.flags : Global flags applying to this object. + Notes ----- This method returns a new object that's a view on the same data @@ -327,11 +332,6 @@ def set_flags( pandas object (the Series or DataFrame). Metadata refer to properties of the dataset, and should be stored in :attr:`DataFrame.attrs`. - See Also - -------- - DataFrame.attrs : Global metadata applying to this dataset. - DataFrame.flags : Global flags applying to this object. 
- Examples -------- >>> df = pd.DataFrame({"A": [1, 2]}) @@ -1944,9 +1944,8 @@ def __setstate__(self, state): if typ is not None: attrs = state.get("_attrs", {}) object.__setattr__(self, "_attrs", attrs) - allows_duplicate_labels = state.get("_allows_duplicate_labels", True) object.__setattr__( - self, "_allows_duplicate_labels", allows_duplicate_labels + self, "_flags", Flags(self, allows_duplicate_labels=True) ) # set in the order of internal names diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index d4e32840cc03b..9523fba953ad0 100644 --- a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -99,7 +99,7 @@ def test_ndarray_compat_properties(index_or_series_obj): assert getattr(obj, p, None) is not None # deprecated properties - for p in ["flags", "strides", "itemsize", "base", "data"]: + for p in ["strides", "itemsize", "base", "data"]: assert not hasattr(obj, p) msg = "can only convert an array of size 1 to a Python scalar" diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index ed1caa8140e15..cc68a161f3f1b 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -489,8 +489,8 @@ def test_read_pickle_with_subclass(): def test_allows_duplicate_labels(): - s = pd.Series(dtype=float, allows_duplicate_labels=False) + s = pd.Series(dtype=float).set_flags(allows_duplicate_labels=False) tm.round_trip_pickle(s) - df = pd.DataFrame(allows_duplicate_labels=False) + df = pd.DataFrame().set_flags(allows_duplicate_labels=False) tm.round_trip_pickle(df) From 74a4eb8ad982ee8fc12f9b3351a63db7383f4884 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 24 Aug 2020 17:33:09 -0500 Subject: [PATCH 28/35] lint --- doc/source/user_guide/duplicates.rst | 2 +- doc/source/whatsnew/v1.2.0.rst | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/user_guide/duplicates.rst b/doc/source/user_guide/duplicates.rst index c9d007be9fd90..74eab71476c58 100644 
--- a/doc/source/user_guide/duplicates.rst +++ b/doc/source/user_guide/duplicates.rst @@ -132,7 +132,7 @@ This applies to both row and column labels for a :class:`DataFrame` :okexcept: pd.DataFrame( - [[0, 1, 2], [3, 4, 5]], columns=["A", "B", "C"], + [[0, 1, 2], [3, 4, 5]], columns=["A", "B", "C"], ).set_flags(allows_duplicate_labels=False) This attribute can be checked or set with :attr:`~DataFrame.flags.allows_duplicate_labels`, diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index bf6a5f4579e9f..8d48056171a23 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -39,8 +39,8 @@ Pandas will propagate the ``allows_duplicate_labels`` property through many oper :okexcept: a = ( - pd.Series([1, 2], index=['a', 'b']) - .set_flags(allows_duplicate_labels=False) + pd.Series([1, 2], index=['a', 'b']) + .set_flags(allows_duplicate_labels=False) ) a # An operation introducing duplicates From 50042e10c33c9d8d61297652f44f0992ca338f2e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 24 Aug 2020 20:15:17 -0500 Subject: [PATCH 29/35] lint --- pandas/core/generic.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e1f7f65157133..b2fc5c17ded4b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -265,17 +265,17 @@ def flags(self) -> Flags: * :attr:`Flags.allows_duplicate_labels` + See Also + -------- + Flags + DataFrame.attrs + Notes ----- "Flags" differ from "metadata". Flags reflect properties of the pandas object (the Series or DataFrame). Metadata refer to properties of the dataset, and should be stored in :attr:`DataFrame.attrs`. 
- See Also - -------- - Flags - DataFrame.attrs - Examples -------- >>> df = pd.DataFrame({"A": [1, 2]}) From a3360277c74048d9f5c642d9ffd9a5ca1780cb9a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 27 Aug 2020 07:01:38 -0500 Subject: [PATCH 30/35] Pickle --- pandas/tests/generic/test_duplicate_labels.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/tests/generic/test_duplicate_labels.py b/pandas/tests/generic/test_duplicate_labels.py index b71c67af3c018..e30f306bfbf2f 100644 --- a/pandas/tests/generic/test_duplicate_labels.py +++ b/pandas/tests/generic/test_duplicate_labels.py @@ -438,3 +438,13 @@ def test_inplace_raises(method, frame_only): if not frame_only: with pytest.raises(ValueError, match=msg): method(s) + + +def test_pickle(): + a = pd.Series([1, 2]).set_flags(allows_duplicate_labels=False) + b = tm.round_trip_pickle(a) + tm.assert_series_equal(a, b) + + a = a.to_frame() + b = tm.round_trip_pickle(a) + tm.assert_frame_equal(a, b) From 60c853cffd8d13f260b921c28ac12fe2ae5e860e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 27 Aug 2020 07:06:19 -0500 Subject: [PATCH 31/35] Doc --- doc/source/user_guide/duplicates.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/user_guide/duplicates.rst b/doc/source/user_guide/duplicates.rst index 74eab71476c58..c153c8b6096a6 100644 --- a/doc/source/user_guide/duplicates.rst +++ b/doc/source/user_guide/duplicates.rst @@ -114,9 +114,9 @@ As noted above, handling duplicates is an important feature when reading in raw data. That said, you may want to avoid introducing duplicates as part of a data processing pipeline (from methods like :meth:`pandas.concat`, :meth:`~DataFrame.rename`, etc.). Both :class:`Series` and :class:`DataFrame` -can be created with the argument ``allows_duplicate_labels=False`` to *disallow* -duplicate labels (the default is to allow them). If there are duplicate labels, -an exception will be raised. 
+can *disallow* duplicate labels by calling ``.set_flags(allows_duplicate_labels=False)`` +(the default is to allow them). If there are duplicate labels, an exception +will be raised. .. ipython:: python :okexcept: From cb5d4f2e923890c8161ea0e226cbdebd34b237f9 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 27 Aug 2020 07:18:03 -0500 Subject: [PATCH 32/35] test flags --- pandas/_testing.py | 11 ++++++++++- pandas/tests/util/test_assert_frame_equal.py | 15 +++++++++++++++ pandas/tests/util/test_assert_series_equal.py | 15 +++++++++++++++ 3 files changed, 40 insertions(+), 1 deletion(-) diff --git a/pandas/_testing.py b/pandas/_testing.py index 42ffd50fec764..6c111a9f110d4 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -1227,6 +1227,7 @@ def assert_series_equal( check_categorical=True, check_category_order=True, check_freq=True, + check_flags=True, rtol=1.0e-5, atol=1.0e-8, obj="Series", @@ -1273,6 +1274,11 @@ def assert_series_equal( .. versionadded:: 1.0.2 check_freq : bool, default True Whether to check the `freq` attribute on a DatetimeIndex or TimedeltaIndex. + check_flags : bool, default True + Whether to check the `flags` attribute. + + .. versionadded:: 1.2.0 + rtol : float, default 1e-5 Relative tolerance. Only used when check_exact is False. 
@@ -1309,6 +1315,9 @@ def assert_series_equal( msg2 = f"{len(right)}, {right.index}" raise_assert_detail(obj, "Series length are different", msg1, msg2) + if check_flags: + assert left.flags == right.flags, f"{repr(left.flags)} != {repr(right.flags)}" + # index comparison assert_index_equal( left.index, @@ -1569,7 +1578,7 @@ def assert_frame_equal( left, right = left.reindex_like(right), right if check_flags: - assert left.flags == right.flags + assert left.flags == right.flags, f"{repr(left.flags)} != {repr(right.flags)}" # index comparison assert_index_equal( diff --git a/pandas/tests/util/test_assert_frame_equal.py b/pandas/tests/util/test_assert_frame_equal.py index 3aa3c64923b14..5174ff005b5fb 100644 --- a/pandas/tests/util/test_assert_frame_equal.py +++ b/pandas/tests/util/test_assert_frame_equal.py @@ -268,3 +268,18 @@ def test_assert_frame_equal_ignore_extension_dtype_mismatch(right_dtype): left = pd.DataFrame({"a": [1, 2, 3]}, dtype="Int64") right = pd.DataFrame({"a": [1, 2, 3]}, dtype=right_dtype) tm.assert_frame_equal(left, right, check_dtype=False) + + +def test_allows_duplicate_labels(): + left = pd.DataFrame() + right = pd.DataFrame().set_flags(allows_duplicate_labels=False) + tm.assert_frame_equal(left, left) + tm.assert_frame_equal(right, right) + tm.assert_frame_equal(left, right, check_flags=False) + tm.assert_frame_equal(right, left, check_flags=False) + + with pytest.raises(AssertionError, match=" Date: Fri, 28 Aug 2020 09:04:58 -0500 Subject: [PATCH 33/35] pickle --- pandas/core/generic.py | 11 ++++++----- pandas/tests/generic/test_duplicate_labels.py | 2 +- pandas/tests/io/test_pickle.py | 8 -------- 3 files changed, 7 insertions(+), 14 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index bc7dee6e29ff3..69a27e654de9a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -183,6 +183,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): "_metadata", "__array_struct__", 
"__array_interface__", + "_flags", ] _internal_names_set: Set[str] = set(_internal_names) _accessors: Set[str] = set() @@ -1929,11 +1930,11 @@ def __getstate__(self) -> Dict[str, Any]: _typ=self._typ, _metadata=self._metadata, attrs=self.attrs, + _flags={k: self.flags[k] for k in self.flags._keys}, **meta, ) def __setstate__(self, state): - if isinstance(state, BlockManager): self._mgr = state elif isinstance(state, dict): @@ -1944,9 +1945,8 @@ def __setstate__(self, state): if typ is not None: attrs = state.get("_attrs", {}) object.__setattr__(self, "_attrs", attrs) - object.__setattr__( - self, "_flags", Flags(self, allows_duplicate_labels=True) - ) + flags = state.get("_flags", dict(allows_duplicate_labels=True)) + object.__setattr__(self, "_flags", Flags(self, **flags)) # set in the order of internal names # to avoid definitional recursion @@ -1954,12 +1954,13 @@ def __setstate__(self, state): # defined meta = set(self._internal_names + self._metadata) for k in list(meta): - if k in state: + if k in state and k != "_flags": v = state[k] object.__setattr__(self, k, v) for k, v in state.items(): if k not in meta: + print(k) object.__setattr__(self, k, v) else: diff --git a/pandas/tests/generic/test_duplicate_labels.py b/pandas/tests/generic/test_duplicate_labels.py index e30f306bfbf2f..97468e1f10a8b 100644 --- a/pandas/tests/generic/test_duplicate_labels.py +++ b/pandas/tests/generic/test_duplicate_labels.py @@ -445,6 +445,6 @@ def test_pickle(): b = tm.round_trip_pickle(a) tm.assert_series_equal(a, b) - a = a.to_frame() + a = pd.DataFrame({"A": []}).set_flags(allows_duplicate_labels=False) b = tm.round_trip_pickle(a) tm.assert_frame_equal(a, b) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index cc68a161f3f1b..6331113ab8945 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -486,11 +486,3 @@ def test_read_pickle_with_subclass(): tm.assert_series_equal(result[0], expected[0]) assert 
isinstance(result[1], MyTz) - - -def test_allows_duplicate_labels(): - s = pd.Series(dtype=float).set_flags(allows_duplicate_labels=False) - tm.round_trip_pickle(s) - - df = pd.DataFrame().set_flags(allows_duplicate_labels=False) - tm.round_trip_pickle(df) From 3a5074106c84884b0058a1e8936673bc93ae3eb0 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 28 Aug 2020 16:13:08 -0500 Subject: [PATCH 34/35] remove debug --- pandas/core/generic.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 69a27e654de9a..643bb8e851aac 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1960,7 +1960,6 @@ def __setstate__(self, state): for k, v in state.items(): if k not in meta: - print(k) object.__setattr__(self, k, v) else: From bf23fdad7035707f49bd571cf5b9d4d682c9d01f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 1 Sep 2020 08:55:10 -0500 Subject: [PATCH 35/35] fixups --- doc/source/user_guide/duplicates.rst | 2 +- pandas/core/flags.py | 2 +- pandas/core/indexes/base.py | 28 +++++++++++++++++++++++++++- pandas/errors/__init__.py | 2 +- pandas/tests/test_flags.py | 22 ++++++++++++++++++++++ 5 files changed, 52 insertions(+), 4 deletions(-) diff --git a/doc/source/user_guide/duplicates.rst b/doc/source/user_guide/duplicates.rst index c153c8b6096a6..b65822fab2b23 100644 --- a/doc/source/user_guide/duplicates.rst +++ b/doc/source/user_guide/duplicates.rst @@ -108,7 +108,7 @@ with the same label. Disallowing Duplicate Labels ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. versionadded:: 1.1.0 +.. versionadded:: 1.2.0 As noted above, handling duplicates is an important feature when reading in raw data. 
That said, you may want to avoid introducing duplicates as part of a data diff --git a/pandas/core/flags.py b/pandas/core/flags.py index abbb918d8d4e5..15966d8ddce2a 100644 --- a/pandas/core/flags.py +++ b/pandas/core/flags.py @@ -85,7 +85,7 @@ def allows_duplicate_labels(self, value: bool): value = bool(value) obj = self._obj() if obj is None: - raise ValueError("This flags object has been deleted.") + raise ValueError("This flag's object has been deleted.") if not value: for ax in obj.axes: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index eae72ee006fa5..e5c4cd4cfd28e 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -484,8 +484,19 @@ def _constructor(self): return type(self) def _maybe_check_unique(self): + """ + Check that an Index has no duplicates. + + This is typically only called via + `NDFrame.flags.allows_duplicate_labels.setter` when it's set to + False (duplicates aren't allowed). + + Raises + ------ + DuplicateLabelError + When the index is not unique. + """ if not self.is_unique: - # TODO: position, value, not too large. msg = """Index has duplicates.""" duplicates = self._format_duplicate_message() msg += "\n{}".format(duplicates) @@ -493,6 +504,21 @@ def _maybe_check_unique(self): raise DuplicateLabelError(msg) def _format_duplicate_message(self): + """ + Construct the DataFrame for a DuplicateLabelError. + + This returns a DataFrame indicating the labels and positions + of duplicates in an index. This should only be called when it's + already known that duplicates are present. 
+ + Examples + -------- + >>> idx = pd.Index(['a', 'b', 'a']) + >>> idx._format_duplicate_message() + positions + label + a [0, 2] + """ from pandas import Series duplicates = self[self.duplicated(keep="first")].unique() diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 7891671e449b7..15389ca2c3e61 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -206,7 +206,7 @@ class DuplicateLabelError(ValueError): """ Error raised when an operation would introduce duplicate labels. - .. versionadded:: 1.1.0 + .. versionadded:: 1.2.0 Examples -------- diff --git a/pandas/tests/test_flags.py b/pandas/tests/test_flags.py index 28411e56748cd..f6e3ae4980afb 100644 --- a/pandas/tests/test_flags.py +++ b/pandas/tests/test_flags.py @@ -1,3 +1,5 @@ +import pytest + import pandas as pd @@ -24,3 +26,23 @@ def test_repr(self): assert a == "<Flags(allows_duplicate_labels=True)>" a = repr(pd.DataFrame({"A"}).set_flags(allows_duplicate_labels=False).flags) assert a == "<Flags(allows_duplicate_labels=False)>" + + def test_obj_ref(self): + df = pd.DataFrame() + flags = df.flags + del df + with pytest.raises(ValueError, match="object has been deleted"): + flags.allows_duplicate_labels = True + + def test_getitem(self): + df = pd.DataFrame() + flags = df.flags + assert flags["allows_duplicate_labels"] is True + flags["allows_duplicate_labels"] = False + assert flags["allows_duplicate_labels"] is False + + with pytest.raises(KeyError): + flags["a"] + + with pytest.raises(ValueError): + flags["a"] = 10