ERR: Raise on duplicate names in read_csv (#27175)

pandas-dev · Jul 2, 2019 · e145443 · e145443
1 parent 7ec7c9e
commit e145443
Show file tree

Hide file tree

Showing 6 changed files with 15 additions and 24 deletions.
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
@@ -108,8 +108,7 @@ header : int or list of ints, default ``'infer'``
   line of data rather than the first line of the file.
 names : array-like, default ``None``
   List of column names to use. If file contains no header row, then you should
-  explicitly pass ``header=None``. Duplicates in this list will cause
-  a ``UserWarning`` to be issued.
+  explicitly pass ``header=None``. Duplicates in this list are not allowed.
 index_col : int, str, sequence of int / str, or False, default ``None``
   Column(s) to use as the row labels of the ``DataFrame``, either given as
   string name or column index. If a sequence of int / str is given, a

diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst
@@ -218,7 +218,7 @@ contained the values ``[0, 3]``.
 **New behavior**:
 
 .. ipython:: python
-   :okwarning:
+   :okexcept:
 
    pd.read_csv(StringIO(data), names=names)
 

diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
@@ -567,6 +567,7 @@ Other API changes
 - Using an unsupported version of Beautiful Soup 4 will now raise an ``ImportError`` instead of a ``ValueError`` (:issue:`27063`)
 - :meth:`Series.to_excel` and :meth:`DataFrame.to_excel` will now raise a ``ValueError`` when saving timezone aware data. (:issue:`27008`, :issue:`7056`)
 - :meth:`DataFrame.to_hdf` and :meth:`Series.to_hdf` will now raise a ``NotImplementedError`` when saving a :class:`MultiIndex` with extension data types for a ``fixed`` format. (:issue:`7775`)
+- Passing duplicate ``names`` in :meth:`read_csv` will now raise a ``ValueError`` (:issue:`17346`)
 
 .. _whatsnew_0250.deprecations:
 

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -99,8 +99,8 @@
     data rather than the first line of the file.
 names : array-like, optional
     List of column names to use. If file contains no header row, then you
-    should explicitly pass ``header=None``. Duplicates in this list will cause
-    a ``UserWarning`` to be issued.
+    should explicitly pass ``header=None``. Duplicates in this list are not
+    allowed.
 index_col : int, str, sequence of int / str, or False, default ``None``
   Column(s) to use as the row labels of the ``DataFrame``, either given as
   string name or column index. If a sequence of int / str is given, a
@@ -394,10 +394,7 @@ def _validate_names(names):
 
     if names is not None:
         if len(names) != len(set(names)):
-            msg = ("Duplicate names specified. This "
-                   "will raise an error in the future.")
-            warnings.warn(msg, UserWarning, stacklevel=3)
-
+            raise ValueError('Duplicate names are not allowed.')
     return names
 
 

diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py
@@ -424,18 +424,17 @@ def test_empty_with_dup_column_pass_dtype_by_indexes(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
-def test_empty_with_dup_column_pass_dtype_by_indexes_warn(all_parsers):
+def test_empty_with_dup_column_pass_dtype_by_indexes_raises(all_parsers):
     # see gh-9424
     parser = all_parsers
     expected = concat([Series([], name="one", dtype="u1"),
                        Series([], name="one.1", dtype="f")], axis=1)
     expected.index = expected.index.astype(object)
 
-    with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
+    with pytest.raises(ValueError, match='Duplicate names'):
         data = ""
-        result = parser.read_csv(StringIO(data), names=["one", "one"],
-                                 dtype={0: "u1", 1: "f"})
-        tm.assert_frame_equal(result, expected)
+        parser.read_csv(StringIO(data), names=["one", "one"],
+                        dtype={0: "u1", 1: "f"})
 
 
 def test_raise_on_passed_int_dtype_with_nas(all_parsers):

diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py
@@ -37,17 +37,13 @@ def test_basic_names(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
-def test_basic_names_warn(all_parsers):
+def test_basic_names_raise(all_parsers):
     # See gh-7160
     parser = all_parsers
 
     data = "0,1,2\n3,4,5"
-    expected = DataFrame([[0, 1, 2], [3, 4, 5]],
-                         columns=["a", "b", "a.1"])
-
-    with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
-        result = parser.read_csv(StringIO(data), names=["a", "b", "a"])
-        tm.assert_frame_equal(result, expected)
+    with pytest.raises(ValueError, match='Duplicate names'):
+        parser.read_csv(StringIO(data), names=["a", "b", "a"])
 
 
 @pytest.mark.parametrize("data,expected", [
@@ -90,9 +86,8 @@ def test_thorough_mangle_names(all_parsers, data, names, expected):
     # see gh-17095
     parser = all_parsers
 
-    with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
-        result = parser.read_csv(StringIO(data), names=names)
-        tm.assert_frame_equal(result, expected)
+    with pytest.raises(ValueError, match='Duplicate names'):
+        parser.read_csv(StringIO(data), names=names)
 
 
 def test_mangled_unnamed_placeholders(all_parsers):