From d5ffb1fc9653a47e5426121fefeccdf2be9e8c46 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Fri, 1 Dec 2017 11:45:19 -0500 Subject: [PATCH] Support merging DataFrames on a combo of columns and index levels (GH 14355) (#17484) --- doc/source/merging.rst | 68 ++- doc/source/whatsnew/v0.22.0.txt | 31 ++ pandas/core/frame.py | 37 +- pandas/core/generic.py | 310 ++++++++++++- pandas/core/groupby.py | 11 +- pandas/core/reshape/merge.py | 62 ++- .../generic/test_label_or_level_utils.py | 431 ++++++++++++++++++ pandas/tests/groupby/test_index_as_string.py | 2 +- pandas/tests/reshape/test_merge.py | 9 + .../reshape/test_merge_index_as_string.py | 215 +++++++++ 10 files changed, 1138 insertions(+), 38 deletions(-) create mode 100644 pandas/tests/generic/test_label_or_level_utils.py create mode 100644 pandas/tests/reshape/test_merge_index_as_string.py diff --git a/doc/source/merging.rst b/doc/source/merging.rst index 7d981b815d01b..86d2ec2254057 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -518,14 +518,16 @@ standard database join operations between DataFrame objects: - ``left``: A DataFrame object - ``right``: Another DataFrame object -- ``on``: Columns (names) to join on. Must be found in both the left and - right DataFrame objects. If not passed and ``left_index`` and +- ``on``: Column or index level names to join on. Must be found in both the left + and right DataFrame objects. If not passed and ``left_index`` and ``right_index`` are ``False``, the intersection of the columns in the DataFrames will be inferred to be the join keys -- ``left_on``: Columns from the left DataFrame to use as keys. Can either be - column names or arrays with length equal to the length of the DataFrame -- ``right_on``: Columns from the right DataFrame to use as keys. Can either be - column names or arrays with length equal to the length of the DataFrame +- ``left_on``: Columns or index levels from the left DataFrame to use as + keys. Can either be column names, index level names, or arrays with length + equal to the length of the DataFrame +- ``right_on``: Columns or index levels from the right DataFrame to use as + keys. Can either be column names, index level names, or arrays with length + equal to the length of the DataFrame - ``left_index``: If ``True``, use the index (row labels) from the left DataFrame as its join key(s). In the case of a DataFrame with a MultiIndex (hierarchical), the number of levels must match the number of join keys @@ -563,6 +565,10 @@ standard database join operations between DataFrame objects: .. versionadded:: 0.21.0 +.. note:: + + Support for specifying index levels as the ``on``, ``left_on``, and + ``right_on`` parameters was added in version 0.22.0. The return type will be the same as ``left``. If ``left`` is a ``DataFrame`` and ``right`` is a subclass of DataFrame, the return type will still be @@ -1121,6 +1127,56 @@ This is not Implemented via ``join`` at-the-moment, however it can be done using labels=['left', 'right'], vertical=False); plt.close('all'); +.. _merging.merge_on_columns_and_levels: + +Merging on a combination of columns and index levels +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. versionadded:: 0.22 + +Strings passed as the ``on``, ``left_on``, and ``right_on`` parameters +may refer to either column names or index level names. This enables merging +``DataFrame`` instances on a combination of index levels and columns without +resetting indexes. + +.. 
ipython:: python
+
+   left_index = pd.Index(['K0', 'K0', 'K1', 'K2'], name='key1')
+
+   left = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
+                        'B': ['B0', 'B1', 'B2', 'B3'],
+                        'key2': ['K0', 'K1', 'K0', 'K1']},
+                       index=left_index)
+
+   right_index = pd.Index(['K0', 'K1', 'K2', 'K2'], name='key1')
+
+   right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'],
+                         'D': ['D0', 'D1', 'D2', 'D3'],
+                         'key2': ['K0', 'K0', 'K0', 'K1']},
+                        index=right_index)
+
+   result = left.merge(right, on=['key1', 'key2'])
+
+.. ipython:: python
+   :suppress:
+
+   @savefig merge_on_index_and_column.png
+   p.plot([left, right], result,
+          labels=['left', 'right'], vertical=False);
+   plt.close('all');
+
+.. note::
+
+   When DataFrames are merged on a string that matches an index level in both
+   frames, the index level is preserved as an index level in the resulting
+   DataFrame.
+
+.. note::
+
+   If a string matches both a column name and an index level name, then a
+   warning is issued and the column takes precedence. This will result in an
+   ambiguity error in a future version.
+
 Overlapping value columns
 ~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt
index d43d5bec7175f..55e88d2e50919 100644
--- a/doc/source/whatsnew/v0.22.0.txt
+++ b/doc/source/whatsnew/v0.22.0.txt
@@ -32,6 +32,37 @@ The :func:`get_dummies` now accepts a ``dtype`` argument, which specifies a dtyp
 
     pd.get_dummies(df, columns=['c'], dtype=bool).dtypes
 
+.. _whatsnew_0220.enhancements.merge_on_columns_and_levels:
+
+Merging on a combination of columns and index levels
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Strings passed to :meth:`DataFrame.merge` as the ``on``, ``left_on``, and ``right_on``
+parameters may now refer to either column names or index level names.
+This enables merging ``DataFrame`` instances on a combination of index levels
+and columns without resetting indexes. See the :ref:`Merge on columns and
+levels <merging.merge_on_columns_and_levels>` documentation section.
+(:issue:`14355`)
+
+.. ipython:: python
+
+    left_index = pd.Index(['K0', 'K0', 'K1', 'K2'], name='key1')
+
+    left = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
+                         'B': ['B0', 'B1', 'B2', 'B3'],
+                         'key2': ['K0', 'K1', 'K0', 'K1']},
+                        index=left_index)
+
+    right_index = pd.Index(['K0', 'K1', 'K2', 'K2'], name='key1')
+
+    right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'],
+                          'D': ['D0', 'D1', 'D2', 'D3'],
+                          'key2': ['K0', 'K0', 'K0', 'K1']},
+                         index=right_index)
+
+    left.merge(right, on=['key1', 'key2'])
+
+
 .. _whatsnew_0220.enhancements.other:
 
 Other Enhancements
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index d3561f8a0eadf..ff42e39d9dbdd 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -148,16 +148,17 @@
     * inner: use intersection of keys from both frames, similar to a SQL inner
       join; preserve the order of the left keys
 on : label or list
-    Field names to join on. Must be found in both DataFrames. If on is
-    None and not merging on indexes, then it merges on the intersection of
-    the columns by default.
+    Column or index level names to join on. These must be found in both
+    DataFrames. If `on` is None and not merging on indexes then this defaults
+    to the intersection of the columns in both DataFrames.
 left_on : label or list, or array-like
-    Field names to join on in left DataFrame. Can be a vector or list of
-    vectors of the length of the DataFrame to use a particular vector as
-    the join key instead of columns
+    Column or index level names to join on in the left DataFrame.
Can also + be an array or list of arrays of the length of the left DataFrame. + These arrays are treated as if they are columns. right_on : label or list, or array-like - Field names to join on in right DataFrame or vector/list of vectors per - left_on docs + Column or index level names to join on in the right DataFrame. Can also + be an array or list of arrays of the length of the right DataFrame. + These arrays are treated as if they are columns. left_index : boolean, default False Use the index from the left DataFrame as the join key(s). If it is a MultiIndex, the number of keys in the other DataFrame (either the index @@ -196,6 +197,11 @@ .. versionadded:: 0.21.0 +Notes +----- +Support for specifying index levels as the `on`, `left_on`, and +`right_on` parameters was added in version 0.22.0 + Examples -------- @@ -5214,12 +5220,12 @@ def join(self, other, on=None, how='left', lsuffix='', rsuffix='', Index should be similar to one of the columns in this one. If a Series is passed, its name attribute must be set, and that will be used as the column name in the resulting joined DataFrame - on : column name, tuple/list of column names, or array-like - Column(s) in the caller to join on the index in other, - otherwise joins index-on-index. If multiples - columns given, the passed DataFrame must have a MultiIndex. Can - pass an array as the join key if not already contained in the - calling DataFrame. Like an Excel VLOOKUP operation + on : name, tuple/list of names, or array-like + Column or index level name(s) in the caller to join on the index + in `other`, otherwise joins index-on-index. If multiple + values given, the `other` DataFrame must have a MultiIndex. Can + pass an array as the join key if it is not already contained in + the calling DataFrame. Like an Excel VLOOKUP operation how : {'left', 'right', 'outer', 'inner'}, default: 'left' How to handle the operation of the two objects. @@ -5244,6 +5250,9 @@ def join(self, other, on=None, how='left', lsuffix='', rsuffix='', on, lsuffix, and rsuffix options are not supported when passing a list of DataFrame objects + Support for specifying index levels as the `on` parameter was added + in version 0.22.0 + Examples -------- >>> caller = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'], diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 54b0089335b19..83fd36f0a864f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -27,6 +27,7 @@ is_re_compilable, pandas_dtype) from pandas.core.dtypes.cast import maybe_promote, maybe_upcast_putmask +from pandas.core.dtypes.inference import is_hashable from pandas.core.dtypes.missing import isna, notna from pandas.core.dtypes.generic import ABCSeries, ABCPanel, ABCDataFrame from pandas.core.common import (_count_not_none, @@ -36,7 +37,7 @@ from pandas.core.base import PandasObject, SelectionMixin from pandas.core.index import (Index, MultiIndex, _ensure_index, - InvalidIndexError) + InvalidIndexError, RangeIndex) import pandas.core.indexing as indexing from pandas.core.indexing import maybe_convert_indices from pandas.core.indexes.datetimes import DatetimeIndex @@ -1038,6 +1039,313 @@ def equals(self, other): return False return self._data.equals(other._data) + # ------------------------------------------------------------------------- + # Label or Level Combination Helpers + # + # A collection of helper methods for DataFrame/Series operations that + # accept a combination of column/index labels and levels. 
All such + # operations should utilize/extend these methods when possible so that we + # have consistent precedence and validation logic throughout the library. + + def _is_level_reference(self, key, axis=0): + """ + Test whether a key is a level reference for a given axis. + + To be considered a level reference, `key` must be a string that: + - (axis=0): Matches the name of an index level and does NOT match + a column label. + - (axis=1): Matches the name of a column level and does NOT match + an index label. + + Parameters + ---------- + key: str + Potential level name for the given axis + axis: int, default 0 + Axis that levels are associated with (0 for index, 1 for columns) + + Returns + ------- + is_level: bool + """ + axis = self._get_axis_number(axis) + + if self.ndim > 2: + raise NotImplementedError( + "_is_level_reference is not implemented for {type}" + .format(type=type(self))) + + return (key is not None and + is_hashable(key) and + key in self.axes[axis].names and + not self._is_label_reference(key, axis=axis)) + + def _is_label_reference(self, key, axis=0): + """ + Test whether a key is a label reference for a given axis. + + To be considered a label reference, `key` must be a string that: + - (axis=0): Matches a column label + - (axis=1): Matches an index label + + Parameters + ---------- + key: str + Potential label name + axis: int, default 0 + Axis perpendicular to the axis that labels are associated with + (0 means search for column labels, 1 means search for index labels) + + Returns + ------- + is_label: bool + """ + axis = self._get_axis_number(axis) + other_axes = [ax for ax in range(self._AXIS_LEN) if ax != axis] + + if self.ndim > 2: + raise NotImplementedError( + "_is_label_reference is not implemented for {type}" + .format(type=type(self))) + + return (key is not None and + is_hashable(key) and + any(key in self.axes[ax] for ax in other_axes)) + + def _is_label_or_level_reference(self, key, axis=0): + """ + Test whether a key is a label or level reference for a given axis. + + To be considered either a label or a level reference, `key` must be a + string that: + - (axis=0): Matches a column label or an index level + - (axis=1): Matches an index label or a column level + + Parameters + ---------- + key: str + Potential label or level name + axis: int, default 0 + Axis that levels are associated with (0 for index, 1 for columns) + + Returns + ------- + is_label_or_level: bool + """ + + if self.ndim > 2: + raise NotImplementedError( + "_is_label_or_level_reference is not implemented for {type}" + .format(type=type(self))) + + return (self._is_level_reference(key, axis=axis) or + self._is_label_reference(key, axis=axis)) + + def _check_label_or_level_ambiguity(self, key, axis=0): + """ + Check whether `key` matches both a level of the input `axis` and a + label of the other axis and raise a ``FutureWarning`` if this is the + case. + + Note: This method will be altered to raise an ambiguity exception in + a future version. + + Parameters + ---------- + key: str or object + label or level name + + axis: int, default 0 + Axis that levels are associated with (0 for index, 1 for columns) + + Returns + ------- + ambiguous: bool + + Raises + ------ + FutureWarning + if `key` is ambiguous. 
This will become an ambiguity error in a + future version + """ + + axis = self._get_axis_number(axis) + other_axes = [ax for ax in range(self._AXIS_LEN) if ax != axis] + + if self.ndim > 2: + raise NotImplementedError( + "_check_label_or_level_ambiguity is not implemented for {type}" + .format(type=type(self))) + + if (key is not None and + is_hashable(key) and + key in self.axes[axis].names and + any(key in self.axes[ax] for ax in other_axes)): + + # Build an informative and grammatical warning + level_article, level_type = (('an', 'index') + if axis == 0 else + ('a', 'column')) + + label_article, label_type = (('a', 'column') + if axis == 0 else + ('an', 'index')) + + msg = ("'{key}' is both {level_article} {level_type} level and " + "{label_article} {label_type} label.\n" + "Defaulting to {label_type}, but this will raise an " + "ambiguity error in a future version" + ).format(key=key, + level_article=level_article, + level_type=level_type, + label_article=label_article, + label_type=label_type) + + warnings.warn(msg, FutureWarning, stacklevel=2) + return True + else: + return False + + def _get_label_or_level_values(self, key, axis=0): + """ + Return a 1-D array of values associated with `key`, a label or level + from the given `axis`. + + Retrieval logic: + - (axis=0): Return column values if `key` matches a column label. + Otherwise return index level values if `key` matches an index + level. + - (axis=1): Return row values if `key` matches an index label. + Otherwise return column level values if 'key' matches a column + level + + Parameters + ---------- + key: str + Label or level name. + axis: int, default 0 + Axis that levels are associated with (0 for index, 1 for columns) + + Returns + ------- + values: np.ndarray + + Raises + ------ + KeyError + if `key` matches neither a label nor a level + ValueError + if `key` matches multiple labels + """ + + axis = self._get_axis_number(axis) + other_axes = [ax for ax in range(self._AXIS_LEN) if ax != axis] + + if self.ndim > 2: + raise NotImplementedError( + "_get_label_or_level_values is not implemented for {type}" + .format(type=type(self))) + + if self._is_label_reference(key, axis=axis): + self._check_label_or_level_ambiguity(key, axis=axis) + values = self.xs(key, axis=other_axes[0])._values + elif self._is_level_reference(key, axis=axis): + values = self.axes[axis].get_level_values(key)._values + else: + raise KeyError(key) + + # Check for duplicates + if values.ndim > 1: + label_axis_name = 'column' if axis == 0 else 'index' + raise ValueError(("The {label_axis_name} label '{key}' " + "is not unique") + .format(key=key, + label_axis_name=label_axis_name)) + + return values + + def _drop_labels_or_levels(self, keys, axis=0): + """ + Drop labels and/or levels for the given `axis`. + + For each key in `keys`: + - (axis=0): If key matches a column label then drop the column. + Otherwise if key matches an index level then drop the level. + - (axis=1): If key matches an index label then drop the row. + Otherwise if key matches a column level then drop the level. 
+ + Parameters + ---------- + keys: str or list of str + labels or levels to drop + axis: int, default 0 + Axis that levels are associated with (0 for index, 1 for columns) + + Returns + ------- + dropped: DataFrame + + Raises + ------ + ValueError + if any `keys` match neither a label nor a level + """ + + axis = self._get_axis_number(axis) + + if self.ndim > 2: + raise NotImplementedError( + "_drop_labels_or_levels is not implemented for {type}" + .format(type=type(self))) + + # Validate keys + keys = com._maybe_make_list(keys) + invalid_keys = [k for k in keys if not + self._is_label_or_level_reference(k, axis=axis)] + + if invalid_keys: + raise ValueError(("The following keys are not valid labels or " + "levels for axis {axis}: {invalid_keys}") + .format(axis=axis, + invalid_keys=invalid_keys)) + + # Compute levels and labels to drop + levels_to_drop = [k for k in keys + if self._is_level_reference(k, axis=axis)] + + labels_to_drop = [k for k in keys + if not self._is_level_reference(k, axis=axis)] + + # Perform copy upfront and then use inplace operations below. + # This ensures that we always perform exactly one copy. + # ``copy`` and/or ``inplace`` options could be added in the future. + dropped = self.copy() + + if axis == 0: + # Handle dropping index levels + if levels_to_drop: + dropped.reset_index(levels_to_drop, drop=True, inplace=True) + + # Handle dropping columns labels + if labels_to_drop: + dropped.drop(labels_to_drop, axis=1, inplace=True) + else: + # Handle dropping column levels + if levels_to_drop: + if isinstance(dropped.columns, MultiIndex): + # Drop the specified levels from the MultiIndex + dropped.columns = dropped.columns.droplevel(levels_to_drop) + else: + # Drop the last level of Index by replacing with + # a RangeIndex + dropped.columns = RangeIndex(dropped.columns.size) + + # Handle dropping index labels + if labels_to_drop: + dropped.drop(labels_to_drop, axis=0, inplace=True) + + return dropped + # ---------------------------------------------------------------------- # Iteration diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 6052b373ca0ea..a5d8cc254cd93 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2913,16 +2913,11 @@ def is_in_obj(gpr): elif is_in_axis(gpr): # df.groupby('name') if gpr in obj: - if validate and gpr in obj.index.names: - warnings.warn( - ("'%s' is both a column name and an index level.\n" - "Defaulting to column but " - "this will raise an ambiguity error in a " - "future version") % gpr, - FutureWarning, stacklevel=5) + if validate: + obj._check_label_or_level_ambiguity(gpr) in_axis, name, gpr = True, gpr, obj[gpr] exclusions.append(name) - elif gpr in obj.index.names: + elif obj._is_level_reference(gpr): in_axis, name, level, gpr = False, None, gpr, None else: raise KeyError(gpr) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 56ca913dbcddb..bad7088a126cf 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -587,6 +587,8 @@ def get_result(self): self._maybe_add_join_keys(result, left_indexer, right_indexer) + self._maybe_restore_index_levels(result) + return result def _indicator_pre_merge(self, left, right): @@ -629,6 +631,39 @@ def _indicator_post_merge(self, result): axis=1) return result + def _maybe_restore_index_levels(self, result): + """ + Restore index levels specified as `on` parameters + + Here we check for cases where `self.left_on` and `self.right_on` pairs + each reference an index level in their respective DataFrames. 
The + joined columns corresponding to these pairs are then restored to the + index of `result`. + + **Note:** This method has side effects. It modifies `result` in-place + + Parameters + ---------- + result: DataFrame + merge result + + Returns + ------- + None + """ + names_to_restore = [] + for name, left_key, right_key in zip(self.join_names, + self.left_on, + self.right_on): + if (self.orig_left._is_level_reference(left_key) and + self.orig_right._is_level_reference(right_key) and + name not in result.index.names): + + names_to_restore.append(name) + + if names_to_restore: + result.set_index(names_to_restore, inplace=True) + def _maybe_add_join_keys(self, result, left_indexer, right_indexer): left_has_missing = None @@ -698,8 +733,17 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): else: key_col = Index(lvals).where(~mask, rvals) - if name in result: + if result._is_label_reference(name): result[name] = key_col + elif result._is_level_reference(name): + if isinstance(result.index, MultiIndex): + idx_list = [result.index.get_level_values(level_name) + if level_name != name else key_col + for level_name in result.index.names] + + result.set_index(idx_list, inplace=True) + else: + result.index = Index(key_col, name=name) else: result.insert(i, name or 'key_{i}'.format(i=i), key_col) @@ -796,7 +840,8 @@ def _get_merge_keys(self): join_names.append(None) # what to do? else: if rk is not None: - right_keys.append(right[rk]._values) + right_keys.append( + right._get_label_or_level_values(rk)) join_names.append(rk) else: # work-around for merge_asof(right_index=True) @@ -805,7 +850,8 @@ def _get_merge_keys(self): else: if not is_rkey(rk): if rk is not None: - right_keys.append(right[rk]._values) + right_keys.append( + right._get_label_or_level_values(rk)) else: # work-around for merge_asof(right_index=True) right_keys.append(right.index) @@ -818,7 +864,7 @@ def _get_merge_keys(self): else: right_keys.append(rk) if lk is not None: - left_keys.append(left[lk]._values) + left_keys.append(left._get_label_or_level_values(lk)) join_names.append(lk) else: # work-around for merge_asof(left_index=True) @@ -830,7 +876,7 @@ def _get_merge_keys(self): left_keys.append(k) join_names.append(None) else: - left_keys.append(left[k]._values) + left_keys.append(left._get_label_or_level_values(k)) join_names.append(k) if isinstance(self.right.index, MultiIndex): right_keys = [lev._values.take(lab) @@ -844,7 +890,7 @@ def _get_merge_keys(self): right_keys.append(k) join_names.append(None) else: - right_keys.append(right[k]._values) + right_keys.append(right._get_label_or_level_values(k)) join_names.append(k) if isinstance(self.left.index, MultiIndex): left_keys = [lev._values.take(lab) @@ -854,10 +900,10 @@ def _get_merge_keys(self): left_keys = [self.left.index.values] if left_drop: - self.left = self.left.drop(left_drop, axis=1) + self.left = self.left._drop_labels_or_levels(left_drop) if right_drop: - self.right = self.right.drop(right_drop, axis=1) + self.right = self.right._drop_labels_or_levels(right_drop) return left_keys, right_keys, join_names diff --git a/pandas/tests/generic/test_label_or_level_utils.py b/pandas/tests/generic/test_label_or_level_utils.py new file mode 100644 index 0000000000000..456cb48020500 --- /dev/null +++ b/pandas/tests/generic/test_label_or_level_utils.py @@ -0,0 +1,431 @@ +import pytest +import pandas as pd +import pandas.util.testing as tm +from pandas.core.dtypes.missing import array_equivalent + + +# Fixtures +# ======== +@pytest.fixture +def df(): + 
"""DataFrame with columns 'L1', 'L2', and 'L3' """ + return pd.DataFrame({'L1': [1, 2, 3], + 'L2': [11, 12, 13], + 'L3': ['A', 'B', 'C']}) + + +@pytest.fixture(params=[[], ['L1'], ['L1', 'L2'], ['L1', 'L2', 'L3']]) +def df_levels(request, df): + """DataFrame with columns or index levels 'L1', 'L2', and 'L3' """ + levels = request.param + + if levels: + df = df.set_index(levels) + + return df + + +@pytest.fixture +def df_ambig(df): + """DataFrame with levels 'L1' and 'L2' and labels 'L1' and 'L3' """ + df = df.set_index(['L1', 'L2']) + + df['L1'] = df['L3'] + + return df + + +@pytest.fixture +def df_duplabels(df): + """DataFrame with level 'L1' and labels 'L2', 'L3', and 'L2' """ + df = df.set_index(['L1']) + df = pd.concat([df, df['L2']], axis=1) + + return df + + +@pytest.fixture +def panel(): + with tm.assert_produces_warning(DeprecationWarning, + check_stacklevel=False): + return pd.Panel() + + +# Test is label/level reference +# ============================= +def get_labels_levels(df_levels): + expected_labels = list(df_levels.columns) + expected_levels = [name for name in df_levels.index.names + if name is not None] + return expected_labels, expected_levels + + +def assert_label_reference(frame, labels, axis): + for label in labels: + assert frame._is_label_reference(label, axis=axis) + assert not frame._is_level_reference(label, axis=axis) + assert frame._is_label_or_level_reference(label, axis=axis) + + +def assert_level_reference(frame, levels, axis): + for level in levels: + assert frame._is_level_reference(level, axis=axis) + assert not frame._is_label_reference(level, axis=axis) + assert frame._is_label_or_level_reference(level, axis=axis) + + +# DataFrame +# --------- +@pytest.mark.parametrize('axis', [0, 1]) +def test_is_level_or_label_reference_df_simple(df_levels, axis): + + # Compute expected labels and levels + expected_labels, expected_levels = get_labels_levels(df_levels) + + # Transpose frame if axis == 1 + if axis == 1: + df_levels = df_levels.T + + # Perform checks + assert_level_reference(df_levels, expected_levels, axis=axis) + assert_label_reference(df_levels, expected_labels, axis=axis) + + +@pytest.mark.parametrize('axis', [0, 1]) +def test_is_level_reference_df_ambig(df_ambig, axis): + + # Transpose frame if axis == 1 + if axis == 1: + df_ambig = df_ambig.T + + # df has both an on-axis level and off-axis label named L1 + # Therefore L1 should reference the label, not the level + assert_label_reference(df_ambig, ['L1'], axis=axis) + + # df has an on-axis level named L2 and it is not ambiguous + # Therefore L2 is an level reference + assert_level_reference(df_ambig, ['L2'], axis=axis) + + # df has a column named L3 and it not an level reference + assert_label_reference(df_ambig, ['L3'], axis=axis) + + +# Series +# ------ +def test_is_level_reference_series_simple_axis0(df): + + # Make series with L1 as index + s = df.set_index('L1').L2 + assert_level_reference(s, ['L1'], axis=0) + assert not s._is_level_reference('L2') + + # Make series with L1 and L2 as index + s = df.set_index(['L1', 'L2']).L3 + assert_level_reference(s, ['L1', 'L2'], axis=0) + assert not s._is_level_reference('L3') + + +def test_is_level_reference_series_axis1_error(df): + + # Make series with L1 as index + s = df.set_index('L1').L2 + + with tm.assert_raises_regex(ValueError, "No axis named 1"): + s._is_level_reference('L1', axis=1) + + +# Panel +# ----- +def test_is_level_reference_panel_error(panel): + msg = ("_is_level_reference is not implemented for {type}" + .format(type=type(panel))) + + 
with tm.assert_raises_regex(NotImplementedError, msg): + panel._is_level_reference('L1', axis=0) + + +def test_is_label_reference_panel_error(panel): + msg = ("_is_label_reference is not implemented for {type}" + .format(type=type(panel))) + + with tm.assert_raises_regex(NotImplementedError, msg): + panel._is_label_reference('L1', axis=0) + + +def test_is_label_or_level_reference_panel_error(panel): + msg = ("_is_label_or_level_reference is not implemented for {type}" + .format(type=type(panel))) + + with tm.assert_raises_regex(NotImplementedError, msg): + panel._is_label_or_level_reference('L1', axis=0) + + +# Test _check_label_or_level_ambiguity_df +# ======================================= + +# DataFrame +# --------- +@pytest.mark.parametrize('axis', [0, 1]) +def test_check_label_or_level_ambiguity_df(df_ambig, axis): + + # Transpose frame if axis == 1 + if axis == 1: + df_ambig = df_ambig.T + + # df_ambig has both an on-axis level and off-axis label named L1 + # Therefore L1 is ambiguous + with tm.assert_produces_warning(FutureWarning, + clear=True, + check_stacklevel=False) as w: + + assert df_ambig._check_label_or_level_ambiguity('L1', axis=axis) + warning_msg = w[0].message.args[0] + if axis == 0: + assert warning_msg.startswith("'L1' is both an index level " + "and a column label") + else: + assert warning_msg.startswith("'L1' is both a column level " + "and an index label") + + # df_ambig has an on-axis level named L2 and it is not ambiguous + # No warning should be raised + with tm.assert_produces_warning(None): + assert not df_ambig._check_label_or_level_ambiguity('L2', axis=axis) + + # df_ambig has an off-axis label named L3 and it is not ambiguous + with tm.assert_produces_warning(None): + assert not df_ambig._is_level_reference('L3', axis=axis) + + +# Series +# ------ +def test_check_label_or_level_ambiguity_series(df): + + # A series has no columns and therefore references are never ambiguous + + # Make series with L1 as index + s = df.set_index('L1').L2 + with tm.assert_produces_warning(None): + assert not s._check_label_or_level_ambiguity('L1', axis=0) + assert not s._check_label_or_level_ambiguity('L2', axis=0) + + # Make series with L1 and L2 as index + s = df.set_index(['L1', 'L2']).L3 + with tm.assert_produces_warning(None): + assert not s._check_label_or_level_ambiguity('L1', axis=0) + assert not s._check_label_or_level_ambiguity('L2', axis=0) + assert not s._check_label_or_level_ambiguity('L3', axis=0) + + +def test_check_label_or_level_ambiguity_series_axis1_error(df): + + # Make series with L1 as index + s = df.set_index('L1').L2 + + with tm.assert_raises_regex(ValueError, "No axis named 1"): + s._check_label_or_level_ambiguity('L1', axis=1) + + +# Panel +# ----- +def test_check_label_or_level_ambiguity_panel_error(panel): + msg = ("_check_label_or_level_ambiguity is not implemented for {type}" + .format(type=type(panel))) + + with tm.assert_raises_regex(NotImplementedError, msg): + panel._check_label_or_level_ambiguity('L1', axis=0) + + +# Test _get_label_or_level_values +# =============================== +def assert_label_values(frame, labels, axis): + for label in labels: + if axis == 0: + expected = frame[label]._values + else: + expected = frame.loc[label]._values + + result = frame._get_label_or_level_values(label, axis=axis) + assert array_equivalent(expected, result) + + +def assert_level_values(frame, levels, axis): + for level in levels: + if axis == 0: + expected = frame.index.get_level_values(level=level)._values + else: + expected = (frame.columns + 
.get_level_values(level=level) + ._values) + + result = frame._get_label_or_level_values(level, axis=axis) + assert array_equivalent(expected, result) + + +# DataFrame +# --------- +@pytest.mark.parametrize('axis', [0, 1]) +def test_get_label_or_level_values_df_simple(df_levels, axis): + + # Compute expected labels and levels + expected_labels, expected_levels = get_labels_levels(df_levels) + + # Transpose frame if axis == 1 + if axis == 1: + df_levels = df_levels.T + + # Perform checks + assert_label_values(df_levels, expected_labels, axis=axis) + assert_level_values(df_levels, expected_levels, axis=axis) + + +@pytest.mark.parametrize('axis', [0, 1]) +def test_get_label_or_level_values_df_ambig(df_ambig, axis): + + # Transpose frame if axis == 1 + if axis == 1: + df_ambig = df_ambig.T + + # df has both an on-axis level and off-axis label named L1 + # Therefore L1 is ambiguous but will default to label + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + assert_label_values(df_ambig, ['L1'], axis=axis) + + # df has an on-axis level named L2 and it is not ambiguous + with tm.assert_produces_warning(None): + assert_level_values(df_ambig, ['L2'], axis=axis) + + # df has an off-axis label named L3 and it is not ambiguous + with tm.assert_produces_warning(None): + assert_label_values(df_ambig, ['L3'], axis=axis) + + +@pytest.mark.parametrize('axis', [0, 1]) +def test_get_label_or_level_values_df_duplabels(df_duplabels, axis): + + # Transpose frame if axis == 1 + if axis == 1: + df_duplabels = df_duplabels.T + + # df has unambiguous level 'L1' + assert_level_values(df_duplabels, ['L1'], axis=axis) + + # df has unique label 'L3' + assert_label_values(df_duplabels, ['L3'], axis=axis) + + # df has duplicate labels 'L2' + if axis == 0: + expected_msg = "The column label 'L2' is not unique" + else: + expected_msg = "The index label 'L2' is not unique" + + with tm.assert_raises_regex(ValueError, expected_msg): + assert_label_values(df_duplabels, ['L2'], axis=axis) + + +# Series +# ------ +def test_get_label_or_level_values_series_axis0(df): + + # Make series with L1 as index + s = df.set_index('L1').L2 + assert_level_values(s, ['L1'], axis=0) + + # Make series with L1 and L2 as index + s = df.set_index(['L1', 'L2']).L3 + assert_level_values(s, ['L1', 'L2'], axis=0) + + +def test_get_label_or_level_values_series_axis1_error(df): + + # Make series with L1 as index + s = df.set_index('L1').L2 + + with tm.assert_raises_regex(ValueError, "No axis named 1"): + s._get_label_or_level_values('L1', axis=1) + + +# Panel +# ----- +def test_get_label_or_level_values_panel_error(panel): + msg = ("_get_label_or_level_values is not implemented for {type}" + .format(type=type(panel))) + + with tm.assert_raises_regex(NotImplementedError, msg): + panel._get_label_or_level_values('L1', axis=0) + + +# Test _drop_labels_or_levels +# =========================== +def assert_labels_dropped(frame, labels, axis): + for label in labels: + df_dropped = frame._drop_labels_or_levels(label, axis=axis) + + if axis == 0: + assert label in frame.columns + assert label not in df_dropped.columns + else: + assert label in frame.index + assert label not in df_dropped.index + + +def assert_levels_dropped(frame, levels, axis): + for level in levels: + df_dropped = frame._drop_labels_or_levels(level, axis=axis) + + if axis == 0: + assert level in frame.index.names + assert level not in df_dropped.index.names + else: + assert level in frame.columns.names + assert level not in df_dropped.columns.names + + +# DataFrame 
+# --------- +@pytest.mark.parametrize('axis', [0, 1]) +def test_drop_labels_or_levels_df(df_levels, axis): + + # Compute expected labels and levels + expected_labels, expected_levels = get_labels_levels(df_levels) + + # Transpose frame if axis == 1 + if axis == 1: + df_levels = df_levels.T + + # Perform checks + assert_labels_dropped(df_levels, expected_labels, axis=axis) + assert_levels_dropped(df_levels, expected_levels, axis=axis) + + with tm.assert_raises_regex(ValueError, "not valid labels or levels"): + df_levels._drop_labels_or_levels('L4', axis=axis) + + +# Series +# ------ +def test_drop_labels_or_levels_series(df): + + # Make series with L1 as index + s = df.set_index('L1').L2 + assert_levels_dropped(s, ['L1'], axis=0) + + with tm.assert_raises_regex(ValueError, "not valid labels or levels"): + s._drop_labels_or_levels('L4', axis=0) + + # Make series with L1 and L2 as index + s = df.set_index(['L1', 'L2']).L3 + assert_levels_dropped(s, ['L1', 'L2'], axis=0) + + with tm.assert_raises_regex(ValueError, "not valid labels or levels"): + s._drop_labels_or_levels('L4', axis=0) + + +# Panel +# ----- +def test_drop_labels_or_levels_panel_error(panel): + msg = ("_drop_labels_or_levels is not implemented for {type}" + .format(type=type(panel))) + + with tm.assert_raises_regex(NotImplementedError, msg): + panel._drop_labels_or_levels('L1', axis=0) diff --git a/pandas/tests/groupby/test_index_as_string.py b/pandas/tests/groupby/test_index_as_string.py index 3b6e15036cfe2..cee78eab3a636 100644 --- a/pandas/tests/groupby/test_index_as_string.py +++ b/pandas/tests/groupby/test_index_as_string.py @@ -108,7 +108,7 @@ def test_grouper_column_index_level_precedence(frame, assert_frame_equal(result, expected) - # Grouping with level Grouper should produce a difference result but + # Grouping with level Grouper should produce a different result but # still no warning with tm.assert_produces_warning(False): not_expected = frame.groupby(level_groupers).mean() diff --git a/pandas/tests/reshape/test_merge.py b/pandas/tests/reshape/test_merge.py index b76951e8c2ac2..cd0701e3864fc 100644 --- a/pandas/tests/reshape/test_merge.py +++ b/pandas/tests/reshape/test_merge.py @@ -72,6 +72,15 @@ def test_merge_common(self): exp = merge(self.df, self.df2, on=['key1', 'key2']) tm.assert_frame_equal(joined, exp) + def test_merge_index_as_on_arg(self): + # GH14355 + + left = self.df.set_index('key1') + right = self.df2.set_index('key1') + result = merge(left, right, on='key1') + expected = merge(self.df, self.df2, on='key1').set_index('key1') + assert_frame_equal(result, expected) + def test_merge_index_singlekey_right_vs_left(self): left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'], 'v1': np.random.randn(7)}) diff --git a/pandas/tests/reshape/test_merge_index_as_string.py b/pandas/tests/reshape/test_merge_index_as_string.py new file mode 100644 index 0000000000000..4c638f8e441fa --- /dev/null +++ b/pandas/tests/reshape/test_merge_index_as_string.py @@ -0,0 +1,215 @@ +import numpy as np +import pytest + +from pandas import DataFrame +from pandas.util import testing as tm +from pandas.util.testing import assert_frame_equal + + +@pytest.fixture +def df1(): + return DataFrame(dict( + outer=[1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4], + inner=[1, 2, 3, 1, 2, 3, 4, 1, 2, 1, 2], + v1=np.linspace(0, 1, 11))) + + +@pytest.fixture +def df2(): + return DataFrame(dict( + outer=[1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3], + inner=[1, 2, 2, 3, 3, 4, 2, 3, 1, 1, 2, 3], + v2=np.linspace(10, 11, 12))) + + +@pytest.fixture(params=[[], 
['outer'], ['outer', 'inner']]) +def left_df(request, df1): + """ Construct left test DataFrame with specified levels + (any of 'outer', 'inner', and 'v1')""" + levels = request.param + if levels: + df1 = df1.set_index(levels) + + return df1 + + +@pytest.fixture(params=[[], ['outer'], ['outer', 'inner']]) +def right_df(request, df2): + """ Construct right test DataFrame with specified levels + (any of 'outer', 'inner', and 'v2')""" + levels = request.param + + if levels: + df2 = df2.set_index(levels) + + return df2 + + +def compute_expected(df_left, df_right, + on=None, left_on=None, right_on=None, how=None): + """ + Compute the expected merge result for the test case. + + This method computes the expected result of merging two DataFrames on + a combination of their columns and index levels. It does so by + explicitly dropping/resetting their named index levels, performing a + merge on their columns, and then finally restoring the appropriate + index in the result. + + Parameters + ---------- + df_left : DataFrame + The left DataFrame (may have zero or more named index levels) + df_right : DataFrame + The right DataFrame (may have zero or more named index levels) + on : list of str + The on parameter to the merge operation + left_on : list of str + The left_on parameter to the merge operation + right_on : list of str + The right_on parameter to the merge operation + how : str + The how parameter to the merge operation + + Returns + ------- + DataFrame + The expected merge result + """ + + # Handle on param if specified + if on is not None: + left_on, right_on = on, on + + # Compute input named index levels + left_levels = [n for n in df_left.index.names if n is not None] + right_levels = [n for n in df_right.index.names if n is not None] + + # Compute output named index levels + output_levels = [i for i in left_on + if i in right_levels and i in left_levels] + + # Drop index levels that aren't involved in the merge + drop_left = [n for n in left_levels if n not in left_on] + if drop_left: + df_left = df_left.reset_index(drop_left, drop=True) + + drop_right = [n for n in right_levels if n not in right_on] + if drop_right: + df_right = df_right.reset_index(drop_right, drop=True) + + # Convert remaining index levels to columns + reset_left = [n for n in left_levels if n in left_on] + if reset_left: + df_left = df_left.reset_index(level=reset_left) + + reset_right = [n for n in right_levels if n in right_on] + if reset_right: + df_right = df_right.reset_index(level=reset_right) + + # Perform merge + expected = df_left.merge(df_right, + left_on=left_on, + right_on=right_on, + how=how) + + # Restore index levels + if output_levels: + expected = expected.set_index(output_levels) + + return expected + + +@pytest.mark.parametrize('on,how', + [(['outer'], 'inner'), + (['inner'], 'left'), + (['outer', 'inner'], 'right'), + (['inner', 'outer'], 'outer')]) +def test_merge_indexes_and_columns_on(left_df, right_df, on, how): + + # Construct expected result + expected = compute_expected(left_df, right_df, on=on, how=how) + + # Perform merge + result = left_df.merge(right_df, on=on, how=how) + assert_frame_equal(result, expected, check_like=True) + + +@pytest.mark.parametrize('left_on,right_on,how', + [(['outer'], ['outer'], 'inner'), + (['inner'], ['inner'], 'right'), + (['outer', 'inner'], ['outer', 'inner'], 'left'), + (['inner', 'outer'], ['inner', 'outer'], 'outer')]) +def test_merge_indexes_and_columns_lefton_righton( + left_df, right_df, left_on, right_on, how): + + # Construct expected result + 
expected = compute_expected(left_df, right_df, + left_on=left_on, + right_on=right_on, + how=how) + + # Perform merge + result = left_df.merge(right_df, + left_on=left_on, right_on=right_on, how=how) + assert_frame_equal(result, expected, check_like=True) + + +@pytest.mark.parametrize('left_index', + ['inner', ['inner', 'outer']]) +@pytest.mark.parametrize('how', + ['inner', 'left', 'right', 'outer']) +def test_join_indexes_and_columns_on(df1, df2, left_index, how): + + # Construct left_df + left_df = df1.set_index(left_index) + + # Construct right_df + right_df = df2.set_index(['outer', 'inner']) + + # Result + expected = (left_df.reset_index() + .join(right_df, on=['outer', 'inner'], how=how, + lsuffix='_x', rsuffix='_y') + .set_index(left_index)) + + # Perform join + result = left_df.join(right_df, on=['outer', 'inner'], how=how, + lsuffix='_x', rsuffix='_y') + + assert_frame_equal(result, expected, check_like=True) + + +def test_merge_index_column_precedence(df1, df2): + + # Construct left_df with both an index and a column named 'outer'. + # We make this 'outer' column equal to the 'inner' column so that we + # can verify that the correct values are used by the merge operation + left_df = df1.set_index('outer') + left_df['outer'] = left_df['inner'] + + # Construct right_df with an index level named 'outer' + right_df = df2.set_index('outer') + + # Construct expected result. + # The 'outer' column from left_df is chosen and the resulting + # frame has no index levels + expected = (left_df.reset_index(level='outer', drop=True) + .merge(right_df.reset_index(), on=['outer', 'inner'])) + + # Merge left_df and right_df on 'outer' and 'inner' + # 'outer' for left_df should refer to the 'outer' column, not the + # 'outer' index level and a FutureWarning should be raised + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = left_df.merge(right_df, on=['outer', 'inner']) + + # Check results + assert_frame_equal(result, expected) + + # Perform the same using the left_on and right_on parameters + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = left_df.merge(right_df, + left_on=['outer', 'inner'], + right_on=['outer', 'inner']) + + assert_frame_equal(result, expected)
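
The column-over-level precedence exercised by ``test_merge_index_column_precedence`` above can be illustrated with a minimal sketch (the frames below are illustrative and not taken from the patch): when a string passed to ``on`` names both a column and an index level, the column supplies the join keys and a ``FutureWarning`` is emitted.

.. ipython:: python

   import pandas as pd

   left = pd.DataFrame({'key': ['K0', 'K1'], 'A': [1, 2]},
                       index=pd.Index(['K1', 'K0'], name='key'))
   right = pd.DataFrame({'key': ['K0', 'K1'], 'B': [3, 4]})

   # 'key' is both a column and an index level in left; the column takes
   # precedence, so the join uses left['key'] and a FutureWarning is raised
   left.merge(right, on='key')

The same precedence applies when such a name is passed via ``left_on``/``right_on``.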