Skip to content

Commit

Permalink
API: Warn about dups in names for read_csv (#17346)
Browse files Browse the repository at this point in the history
xref gh-17095.
  • Loading branch information
gfyoung authored and jreback committed Sep 24, 2017
1 parent ae16bf9 commit 1f51271
Show file tree
Hide file tree
Showing 6 changed files with 74 additions and 33 deletions.
4 changes: 2 additions & 2 deletions doc/source/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -113,8 +113,8 @@ header : int or list of ints, default ``'infer'``
rather than the first line of the file.
names : array-like, default ``None``
List of column names to use. If file contains no header row, then you should
explicitly pass ``header=None``. Duplicates in this list are not allowed unless
``mangle_dupe_cols=True``, which is the default.
explicitly pass ``header=None``. Duplicates in this list will cause
a ``UserWarning`` to be issued.
index_col : int or sequence or ``False``, default ``None``
Column to use as the row labels of the DataFrame. If a sequence is given, a
MultiIndex is used. If you have a malformed file with delimiters at the end of
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.21.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -467,6 +467,7 @@ Other API Changes
- The Categorical constructor no longer accepts a scalar for the ``categories`` keyword. (:issue:`16022`)
- Accessing a non-existent attribute on a closed :class:`~pandas.HDFStore` will now
raise an ``AttributeError`` rather than a ``ClosedFileError`` (:issue:`16301`)
- :func:`read_csv` now issues a ``UserWarning`` if the ``names`` parameter contains duplicates (:issue:`17095`)
- :func:`read_csv` now treats ``'null'`` strings as missing values by default (:issue:`16471`)
- :func:`read_csv` now treats ``'n/a'`` strings as missing values by default (:issue:`16078`)
- :class:`pandas.HDFStore`'s string representation is now faster and less detailed. For the previous behavior, use ``pandas.HDFStore.info()``. (:issue:`16503`).
Expand Down
33 changes: 31 additions & 2 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,8 +84,8 @@
rather than the first line of the file.
names : array-like, default None
List of column names to use. If file contains no header row, then you
should explicitly pass header=None. Duplicates in this list are not
allowed unless mangle_dupe_cols=True, which is the default.
should explicitly pass header=None. Duplicates in this list will cause
a ``UserWarning`` to be issued.
index_col : int or sequence or False, default None
Column to use as the row labels of the DataFrame. If a sequence is given, a
MultiIndex is used. If you have a malformed file with delimiters at the end
Expand Down Expand Up @@ -385,6 +385,32 @@ def _validate_integer(name, val, min_val=0):
return val


def _validate_names(names):
    """
    Warn when the `names` argument repeats a column label.

    The argument is only inspected, never modified. A single warning
    is emitted if at least one entry occurs more than once.

    Parameters
    ----------
    names : array-like or None
        Column labels requested for the output DataFrame.

    Returns
    -------
    names : array-like or None
        The same object that was passed in.
    """
    # Nothing to inspect when no names were supplied.
    if names is None:
        return names

    total = len(names)
    distinct = len(set(names))

    # A set drops repeats, so a smaller set means duplicates exist.
    if total != distinct:
        warnings.warn("Duplicate names specified. This "
                      "will raise an error in the future.",
                      UserWarning, stacklevel=3)

    return names


def _read(filepath_or_buffer, kwds):
"""Generic reader of line files."""
encoding = kwds.get('encoding', None)
Expand All @@ -407,6 +433,9 @@ def _read(filepath_or_buffer, kwds):
chunksize = _validate_integer('chunksize', kwds.get('chunksize', None), 1)
nrows = _validate_integer('nrows', kwds.get('nrows', None))

# Check for duplicates in names.
_validate_names(kwds.get("names", None))

# Create the parser.
parser = TextFileReader(filepath_or_buffer, **kwds)

Expand Down
14 changes: 0 additions & 14 deletions pandas/tests/io/parser/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1357,20 +1357,6 @@ def test_euro_decimal_format(self):
assert df2['Number2'].dtype == float
assert df2['Number3'].dtype == float

def test_read_duplicate_names(self):
# See gh-7160
data = "a,b,a\n0,1,2\n3,4,5"
df = self.read_csv(StringIO(data))
expected = DataFrame([[0, 1, 2], [3, 4, 5]],
columns=['a', 'b', 'a.1'])
tm.assert_frame_equal(df, expected)

data = "0,1,2\n3,4,5"
df = self.read_csv(StringIO(data), names=["a", "b", "a"])
expected = DataFrame([[0, 1, 2], [3, 4, 5]],
columns=['a', 'b', 'a.1'])
tm.assert_frame_equal(df, expected)

def test_inf_parsing(self):
data = """\
,A
Expand Down
9 changes: 5 additions & 4 deletions pandas/tests/io/parser/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,10 +204,11 @@ def test_empty_with_dup_column_pass_dtype_by_indexes(self):
result = self.read_csv(StringIO(data), dtype={0: 'u1', 1: 'f'})
tm.assert_frame_equal(result, expected, check_index_type=False)

data = ''
result = self.read_csv(StringIO(data), names=['one', 'one'],
dtype={0: 'u1', 1: 'f'})
tm.assert_frame_equal(result, expected, check_index_type=False)
with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
data = ''
result = self.read_csv(StringIO(data), names=['one', 'one'],
dtype={0: 'u1', 1: 'f'})
tm.assert_frame_equal(result, expected, check_index_type=False)

def test_raise_on_passed_int_dtype_with_nas(self):
# see gh-2631
Expand Down
46 changes: 35 additions & 11 deletions pandas/tests/io/parser/mangle_dupes.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
"""

from pandas.compat import StringIO
from pandas import DataFrame

import pandas.util.testing as tm


class DupeColumnTests(object):
Expand All @@ -25,6 +28,21 @@ def test_basic(self):
mangle_dupe_cols=True)
assert list(df.columns) == expected

def test_basic_names(self):
# See gh-7160
data = "a,b,a\n0,1,2\n3,4,5"
expected = DataFrame([[0, 1, 2], [3, 4, 5]],
columns=["a", "b", "a.1"])

df = self.read_csv(StringIO(data))
tm.assert_frame_equal(df, expected)

with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
data = "0,1,2\n3,4,5"
df = self.read_csv(StringIO(data),
names=["a", "b", "a"])
tm.assert_frame_equal(df, expected)

def test_thorough_mangle_columns(self):
# see gh-17060
data = "a,a,a.1\n1,2,3"
Expand All @@ -45,20 +63,26 @@ def test_thorough_mangle_names(self):
# see gh-17095
data = "a,b,b\n1,2,3"
names = ["a.1", "a.1", "a.1.1"]
df = self.read_csv(StringIO(data), sep=",", names=names,
mangle_dupe_cols=True)
assert list(df.columns) == ["a.1", "a.1.1", "a.1.1.1"]

with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
df = self.read_csv(StringIO(data), sep=",", names=names,
mangle_dupe_cols=True)
assert list(df.columns) == ["a.1", "a.1.1", "a.1.1.1"]

data = "a,b,c,d,e,f\n1,2,3,4,5,6"
names = ["a", "a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"]
df = self.read_csv(StringIO(data), sep=",", names=names,
mangle_dupe_cols=True)
assert list(df.columns) == ["a", "a.1", "a.1.1", "a.1.1.1",
"a.1.1.1.1", "a.1.1.1.1.1"]

with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
df = self.read_csv(StringIO(data), sep=",", names=names,
mangle_dupe_cols=True)
assert list(df.columns) == ["a", "a.1", "a.1.1", "a.1.1.1",
"a.1.1.1.1", "a.1.1.1.1.1"]

data = "a,b,c,d,e,f,g\n1,2,3,4,5,6,7"
names = ["a", "a", "a.3", "a.1", "a.2", "a", "a"]
df = self.read_csv(StringIO(data), sep=",", names=names,
mangle_dupe_cols=True)
assert list(df.columns) == ["a", "a.1", "a.3", "a.1.1",
"a.2", "a.2.1", "a.3.1"]

with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
df = self.read_csv(StringIO(data), sep=",", names=names,
mangle_dupe_cols=True)
assert list(df.columns) == ["a", "a.1", "a.3", "a.1.1",
"a.2", "a.2.1", "a.3.1"]

0 comments on commit 1f51271

Please sign in to comment.