Skip to content

Commit

Permalink
ERR: Raise on duplicate names in read_csv (#27175)
Browse files Browse the repository at this point in the history
  • Loading branch information
mroeschke authored and jreback committed Jul 2, 2019
1 parent 7ec7c9e commit e145443
Show file tree
Hide file tree
Showing 6 changed files with 15 additions and 24 deletions.
3 changes: 1 addition & 2 deletions doc/source/user_guide/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,7 @@ header : int or list of ints, default ``'infer'``
line of data rather than the first line of the file.
names : array-like, default ``None``
List of column names to use. If file contains no header row, then you should
explicitly pass ``header=None``. Duplicates in this list will cause
a ``UserWarning`` to be issued.
explicitly pass ``header=None``. Duplicates in this list are not allowed.
index_col : int, str, sequence of int / str, or False, default ``None``
Column(s) to use as the row labels of the ``DataFrame``, either given as
string name or column index. If a sequence of int / str is given, a
Expand Down
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.19.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ contained the values ``[0, 3]``.
**New behavior**:

.. ipython:: python
:okwarning:
:okexcept:
pd.read_csv(StringIO(data), names=names)
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -567,6 +567,7 @@ Other API changes
- Using an unsupported version of Beautiful Soup 4 will now raise an ``ImportError`` instead of a ``ValueError`` (:issue:`27063`)
- :meth:`Series.to_excel` and :meth:`DataFrame.to_excel` will now raise a ``ValueError`` when saving timezone aware data. (:issue:`27008`, :issue:`7056`)
- :meth:`DataFrame.to_hdf` and :meth:`Series.to_hdf` will now raise a ``NotImplementedError`` when saving a :class:`MultiIndex` with extension data types for a ``fixed`` format. (:issue:`7775`)
- Passing duplicate ``names`` in :meth:`read_csv` will now raise a ``ValueError`` (:issue:`17346`)

.. _whatsnew_0250.deprecations:

Expand Down
9 changes: 3 additions & 6 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,8 @@
data rather than the first line of the file.
names : array-like, optional
List of column names to use. If file contains no header row, then you
should explicitly pass ``header=None``. Duplicates in this list will cause
a ``UserWarning`` to be issued.
should explicitly pass ``header=None``. Duplicates in this list are not
allowed.
index_col : int, str, sequence of int / str, or False, default ``None``
Column(s) to use as the row labels of the ``DataFrame``, either given as
string name or column index. If a sequence of int / str is given, a
Expand Down Expand Up @@ -394,10 +394,7 @@ def _validate_names(names):

if names is not None:
if len(names) != len(set(names)):
msg = ("Duplicate names specified. This "
"will raise an error in the future.")
warnings.warn(msg, UserWarning, stacklevel=3)

raise ValueError('Duplicate names are not allowed.')
return names


Expand Down
9 changes: 4 additions & 5 deletions pandas/tests/io/parser/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,18 +424,17 @@ def test_empty_with_dup_column_pass_dtype_by_indexes(all_parsers):
tm.assert_frame_equal(result, expected)


def test_empty_with_dup_column_pass_dtype_by_indexes_warn(all_parsers):
def test_empty_with_dup_column_pass_dtype_by_indexes_raises(all_parsers):
# see gh-9424
parser = all_parsers
expected = concat([Series([], name="one", dtype="u1"),
Series([], name="one.1", dtype="f")], axis=1)
expected.index = expected.index.astype(object)

with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
with pytest.raises(ValueError, match='Duplicate names'):
data = ""
result = parser.read_csv(StringIO(data), names=["one", "one"],
dtype={0: "u1", 1: "f"})
tm.assert_frame_equal(result, expected)
parser.read_csv(StringIO(data), names=["one", "one"],
dtype={0: "u1", 1: "f"})


def test_raise_on_passed_int_dtype_with_nas(all_parsers):
Expand Down
15 changes: 5 additions & 10 deletions pandas/tests/io/parser/test_mangle_dupes.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,17 +37,13 @@ def test_basic_names(all_parsers):
tm.assert_frame_equal(result, expected)


def test_basic_names_warn(all_parsers):
def test_basic_names_raise(all_parsers):
# See gh-7160
parser = all_parsers

data = "0,1,2\n3,4,5"
expected = DataFrame([[0, 1, 2], [3, 4, 5]],
columns=["a", "b", "a.1"])

with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
result = parser.read_csv(StringIO(data), names=["a", "b", "a"])
tm.assert_frame_equal(result, expected)
with pytest.raises(ValueError, match='Duplicate names'):
parser.read_csv(StringIO(data), names=["a", "b", "a"])


@pytest.mark.parametrize("data,expected", [
Expand Down Expand Up @@ -90,9 +86,8 @@ def test_thorough_mangle_names(all_parsers, data, names, expected):
# see gh-17095
parser = all_parsers

with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
result = parser.read_csv(StringIO(data), names=names)
tm.assert_frame_equal(result, expected)
with pytest.raises(ValueError, match='Duplicate names'):
parser.read_csv(StringIO(data), names=names)


def test_mangled_unnamed_placeholders(all_parsers):
Expand Down

0 comments on commit e145443

Please sign in to comment.