BUG: DataFrame.merge(suffixes=) does not respect None (#24819)

pandas-dev · Feb 6, 2019 · 09633b8 · 09633b8
1 parent 776530c
commit 09633b8
Show file tree

Hide file tree

Showing 4 changed files with 91 additions and 10 deletions.
diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
@@ -181,6 +181,7 @@ Groupby/Resample/Rolling
 Reshaping
 ^^^^^^^^^
 
+- Bug in :func:`pandas.merge` adds a string of ``None`` if ``None`` is assigned in suffixes instead of remain the column name as-is (:issue:`24782`).
 - Bug in :func:`merge` when merging by index name would sometimes result in an incorrectly numbered index (:issue:`24212`)
 - :func:`to_records` now accepts dtypes to its `column_dtypes` parameter (:issue:`24895`)
 -

diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
@@ -1971,16 +1971,28 @@ def items_overlap_with_suffix(left, lsuffix, right, rsuffix):
             raise ValueError('columns overlap but no suffix specified: '
                              '{rename}'.format(rename=to_rename))
 
-        def lrenamer(x):
-            if x in to_rename:
-                return '{x}{lsuffix}'.format(x=x, lsuffix=lsuffix)
-            return x
+        def renamer(x, suffix):
+            """Rename the left and right indices.
+
+            If there is overlap, and suffix is not None, add
+            suffix, otherwise, leave it as-is.
+
+            Parameters
+            ----------
+            x : original column name
+            suffix : str or None
 
-        def rrenamer(x):
-            if x in to_rename:
-                return '{x}{rsuffix}'.format(x=x, rsuffix=rsuffix)
+            Returns
+            -------
+            x : renamed column name
+            """
+            if x in to_rename and suffix is not None:
+                return '{x}{suffix}'.format(x=x, suffix=suffix)
             return x
 
+        lrenamer = partial(renamer, suffix=lsuffix)
+        rrenamer = partial(renamer, suffix=rsuffix)
+
         return (_transform_index(left, lrenamer),
                 _transform_index(right, rrenamer))
 

diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
@@ -159,9 +159,15 @@ def merge_ordered(left, right, on=None,
         left DataFrame
     fill_method : {'ffill', None}, default None
         Interpolation method for data
-    suffixes : 2-length sequence (tuple, list, ...)
-        Suffix to apply to overlapping column names in the left and right
-        side, respectively
+    suffixes : Sequence, default is ("_x", "_y")
+        A length-2 sequence where each element is optionally a string
+        indicating the suffix to add to overlapping column names in
+        `left` and `right` respectively. Pass a value of `None` instead
+        of a string to indicate that the column name from `left` or
+        `right` should be left as-is, with no suffix. At least one of the
+        values must not be None.
+
+        .. versionchanged:: 0.25.0
     how : {'left', 'right', 'outer', 'inner'}, default 'outer'
         * left: use only keys from left frame (SQL: left outer join)
         * right: use only keys from right frame (SQL: right outer join)

diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
@@ -1526,3 +1526,65 @@ def test_merge_series(on, left_on, right_on, left_index, right_index, nm):
         with pytest.raises(ValueError, match=msg):
             result = pd.merge(a, b, on=on, left_on=left_on, right_on=right_on,
                               left_index=left_index, right_index=right_index)
+
+
+@pytest.mark.parametrize("col1, col2, kwargs, expected_cols", [
+    (0, 0, dict(suffixes=("", "_dup")), ["0", "0_dup"]),
+    (0, 0, dict(suffixes=(None, "_dup")), [0, "0_dup"]),
+    (0, 0, dict(suffixes=("_x", "_y")), ["0_x", "0_y"]),
+    ("a", 0, dict(suffixes=(None, "_y")), ["a", 0]),
+    (0.0, 0.0, dict(suffixes=("_x", None)), ["0.0_x", 0.0]),
+    ("b", "b", dict(suffixes=(None, "_y")), ["b", "b_y"]),
+    ("a", "a", dict(suffixes=("_x", None)), ["a_x", "a"]),
+    ("a", "b", dict(suffixes=("_x", None)), ["a", "b"]),
+    ("a", "a", dict(suffixes=[None, "_x"]), ["a", "a_x"]),
+    (0, 0, dict(suffixes=["_a", None]), ["0_a", 0]),
+    ("a", "a", dict(), ["a_x", "a_y"]),
+    (0, 0, dict(), ["0_x", "0_y"])
+])
+def test_merge_suffix(col1, col2, kwargs, expected_cols):
+    # issue: 24782
+    a = pd.DataFrame({col1: [1, 2, 3]})
+    b = pd.DataFrame({col2: [4, 5, 6]})
+
+    expected = pd.DataFrame([[1, 4], [2, 5], [3, 6]],
+                            columns=expected_cols)
+
+    result = a.merge(b, left_index=True, right_index=True, **kwargs)
+    tm.assert_frame_equal(result, expected)
+
+    result = pd.merge(a, b, left_index=True, right_index=True, **kwargs)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("col1, col2, suffixes", [
+    ("a", "a", [None, None]),
+    ("a", "a", (None, None)),
+    ("a", "a", ("", None)),
+    (0, 0, [None, None]),
+    (0, 0, (None, ""))
+])
+def test_merge_suffix_error(col1, col2, suffixes):
+    # issue: 24782
+    a = pd.DataFrame({col1: [1, 2, 3]})
+    b = pd.DataFrame({col2: [3, 4, 5]})
+
+    # TODO: might reconsider current raise behaviour, see issue 24782
+    msg = "columns overlap but no suffix specified"
+    with pytest.raises(ValueError, match=msg):
+        pd.merge(a, b, left_index=True, right_index=True, suffixes=suffixes)
+
+
+@pytest.mark.parametrize("col1, col2, suffixes", [
+    ("a", "a", None),
+    (0, 0, None)
+])
+def test_merge_suffix_none_error(col1, col2, suffixes):
+    # issue: 24782
+    a = pd.DataFrame({col1: [1, 2, 3]})
+    b = pd.DataFrame({col2: [3, 4, 5]})
+
+    # TODO: might reconsider current raise behaviour, see GH24782
+    msg = "iterable"
+    with pytest.raises(TypeError, match=msg):
+        pd.merge(a, b, left_index=True, right_index=True, suffixes=suffixes)