Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Better error message if usecols doesn't match columns #17310

Merged
Merged
42 changes: 39 additions & 3 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1065,6 +1065,38 @@ def _evaluate_usecols(usecols, names):
return usecols


def _validate_usecols(usecols, names):
"""
Validates that all usecols are present in a given
list of names. If not, raise a ValueError that
shows what usecols are missing.

Parameters
----------
usecols : iterable of usecols
The columns to validate are present in names.
names : iterable of names
The column names to check against.

Returns
-------
usecols : iterable of usecols
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

there is a validate_usecols_arg function, is there a reason you are creating a new one?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, was going from @gfyoung's suggestion.
We could extend the existing function, at the moment it's only taking usecols as an arg so would be extending it's arguments as well as logic.
I've not checked if every call to validate_usecols_arg would have a names argument to pass through, so may need to default it? Although I would think we'd always have column names to check against, right?

Let me know if this is a 'must-have' for you and I'll implement.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it certainly can be an optional argument
having 2 functions do similar things is confusing

The `usecols` parameter if the validation succeeds.

Raises
------
ValueError : Columns were missing. Error message will list them.
"""
missing = [c for c in usecols if c not in names]
if len(missing) > 0:
raise ValueError(
"Usecols do not match columns, "
"columns expected but not found: {missing}".format(missing=missing)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you might need to ', '.join(missing) here, not sure.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Python 3.6.3 |Anaconda, Inc.| (default, Nov  8 2017, 18:10:31) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
>>> l = [1, 'foo', [2,3]]
>>> 'Formatted l: {}'.format(l)
"Formatted l: [1, 'foo', [2, 3]]"

if you'd prefer a different error message, I'd be happy to use join though.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

that’s fine i guess format is pretty smart about this ok!

)

return usecols


def _validate_skipfooter_arg(skipfooter):
"""
Validate the 'skipfooter' parameter.
Expand Down Expand Up @@ -1662,14 +1694,14 @@ def __init__(self, src, **kwds):
# GH 14671
if (self.usecols_dtype == 'string' and
not set(usecols).issubset(self.orig_names)):
raise ValueError("Usecols do not match names.")
_validate_usecols(usecols, self.orig_names)

if len(self.names) > len(usecols):
self.names = [n for i, n in enumerate(self.names)
if (i in usecols or n in usecols)]

if len(self.names) < len(usecols):
raise ValueError("Usecols do not match names.")
_validate_usecols(usecols, self.names)

self._set_noconvert_columns()

Expand Down Expand Up @@ -2442,9 +2474,13 @@ def _handle_usecols(self, columns, usecols_key):
raise ValueError("If using multiple headers, usecols must "
"be integers.")
col_indices = []

for col in self.usecols:
if isinstance(col, string_types):
col_indices.append(usecols_key.index(col))
try:
col_indices.append(usecols_key.index(col))
except ValueError:
_validate_usecols(self.usecols, usecols_key)
else:
col_indices.append(col)
else:
Expand Down
16 changes: 8 additions & 8 deletions pandas/tests/io/parser/usecols.py
Original file line number Diff line number Diff line change
Expand Up @@ -480,10 +480,10 @@ def test_raise_on_usecols_names_mismatch(self):
# GH 14671
data = 'a,b,c,d\n1,2,3,4\n5,6,7,8'

if self.engine == 'c':
msg = 'Usecols do not match names'
else:
msg = 'is not in list'
msg = (
"Usecols do not match columns, "
"columns expected but not found: {missing}"
)

usecols = ['a', 'b', 'c', 'd']
df = self.read_csv(StringIO(data), usecols=usecols)
Expand All @@ -492,11 +492,11 @@ def test_raise_on_usecols_names_mismatch(self):
tm.assert_frame_equal(df, expected)

usecols = ['a', 'b', 'c', 'f']
with tm.assert_raises_regex(ValueError, msg):
with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add a test where you have 2 missing cols?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, will do this and add the whatsnew entry tomorrow, thanks!

self.read_csv(StringIO(data), usecols=usecols)

usecols = ['a', 'b', 'f']
with tm.assert_raises_regex(ValueError, msg):
with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")):
self.read_csv(StringIO(data), usecols=usecols)

names = ['A', 'B', 'C', 'D']
Expand All @@ -520,9 +520,9 @@ def test_raise_on_usecols_names_mismatch(self):
# tm.assert_frame_equal(df, expected)

usecols = ['A', 'B', 'C', 'f']
with tm.assert_raises_regex(ValueError, msg):
with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")):
self.read_csv(StringIO(data), header=0, names=names,
usecols=usecols)
usecols = ['A', 'B', 'f']
with tm.assert_raises_regex(ValueError, msg):
with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")):
self.read_csv(StringIO(data), names=names, usecols=usecols)