Skip to content

Commit

Permalink
ENH: Better error message if usecols doesn't match columns (pandas-de…
Browse files Browse the repository at this point in the history
  • Loading branch information
AaronCritchley authored and jreback committed Dec 3, 2017
1 parent 0e16818 commit 7a3f81a
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 11 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.22.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ Other Enhancements
- Improved wording of ``ValueError`` raised in :func:`to_datetime` when ``unit=`` is passed with a non-convertible value (:issue:`14350`)
- :func:`Series.fillna` now accepts a Series or a dict as a ``value`` for a categorical dtype (:issue:`17033`)
- :func:`pandas.read_clipboard` updated to use qtpy, falling back to PyQt5 and then PyQt4, adding compatibility with Python3 and multiple python-qt bindings (:issue:`17722`)
- Improved wording of ``ValueError`` raised in :func:`read_csv` when the ``usecols`` argument cannot match all columns. (:issue:`17301`)

.. _whatsnew_0220.api_breaking:

Expand Down
42 changes: 39 additions & 3 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1141,6 +1141,38 @@ def _evaluate_usecols(usecols, names):
return usecols


def _validate_usecols_names(usecols, names):
    """
    Validate that all usecols are present in a given list of names.

    If any are absent, raise a ValueError that shows which usecols
    are missing.

    Parameters
    ----------
    usecols : iterable of usecols
        The columns to validate are present in names.
    names : iterable of names
        The column names to check against.

    Returns
    -------
    usecols : iterable of usecols
        The `usecols` parameter if the validation succeeds.

    Raises
    ------
    ValueError
        If any entry of `usecols` is not found in `names`. The error
        message lists the missing entries.
    """
    # Collect every missing column up front so a single error can report
    # them all, rather than failing on the first mismatch.
    missing = [c for c in usecols if c not in names]
    if missing:
        raise ValueError(
            "Usecols do not match columns, "
            "columns expected but not found: {missing}".format(missing=missing)
        )

    return usecols


def _validate_skipfooter_arg(skipfooter):
"""
Validate the 'skipfooter' parameter.
Expand Down Expand Up @@ -1753,14 +1785,14 @@ def __init__(self, src, **kwds):
# GH 14671
if (self.usecols_dtype == 'string' and
not set(usecols).issubset(self.orig_names)):
raise ValueError("Usecols do not match names.")
_validate_usecols_names(usecols, self.orig_names)

if len(self.names) > len(usecols):
self.names = [n for i, n in enumerate(self.names)
if (i in usecols or n in usecols)]

if len(self.names) < len(usecols):
raise ValueError("Usecols do not match names.")
_validate_usecols_names(usecols, self.names)

self._set_noconvert_columns()

Expand Down Expand Up @@ -2532,9 +2564,13 @@ def _handle_usecols(self, columns, usecols_key):
raise ValueError("If using multiple headers, usecols must "
"be integers.")
col_indices = []

for col in self.usecols:
if isinstance(col, string_types):
col_indices.append(usecols_key.index(col))
try:
col_indices.append(usecols_key.index(col))
except ValueError:
_validate_usecols_names(self.usecols, usecols_key)
else:
col_indices.append(col)
else:
Expand Down
21 changes: 13 additions & 8 deletions pandas/tests/io/parser/usecols.py
Original file line number Diff line number Diff line change
Expand Up @@ -480,10 +480,10 @@ def test_raise_on_usecols_names_mismatch(self):
# GH 14671
data = 'a,b,c,d\n1,2,3,4\n5,6,7,8'

if self.engine == 'c':
msg = 'Usecols do not match names'
else:
msg = 'is not in list'
msg = (
"Usecols do not match columns, "
"columns expected but not found: {missing}"
)

usecols = ['a', 'b', 'c', 'd']
df = self.read_csv(StringIO(data), usecols=usecols)
Expand All @@ -492,11 +492,16 @@ def test_raise_on_usecols_names_mismatch(self):
tm.assert_frame_equal(df, expected)

usecols = ['a', 'b', 'c', 'f']
with tm.assert_raises_regex(ValueError, msg):
with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")):
self.read_csv(StringIO(data), usecols=usecols)

usecols = ['a', 'b', 'f']
with tm.assert_raises_regex(ValueError, msg):
with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")):
self.read_csv(StringIO(data), usecols=usecols)

usecols = ['a', 'b', 'f', 'g']
with tm.assert_raises_regex(
ValueError, msg.format(missing="\[('f', 'g'|'g', 'f')\]")):
self.read_csv(StringIO(data), usecols=usecols)

names = ['A', 'B', 'C', 'D']
Expand All @@ -520,9 +525,9 @@ def test_raise_on_usecols_names_mismatch(self):
# tm.assert_frame_equal(df, expected)

usecols = ['A', 'B', 'C', 'f']
with tm.assert_raises_regex(ValueError, msg):
with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")):
self.read_csv(StringIO(data), header=0, names=names,
usecols=usecols)
usecols = ['A', 'B', 'f']
with tm.assert_raises_regex(ValueError, msg):
with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")):
self.read_csv(StringIO(data), names=names, usecols=usecols)

0 comments on commit 7a3f81a

Please sign in to comment.