-
-
Notifications
You must be signed in to change notification settings - Fork 18.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ENH: Better error message if usecols doesn't match columns #17310
Changes from 8 commits
b0e102a
15d4786
841a6cc
dced1b7
5bf89a8
ba93833
8a06cee
1afb4c1
5a8a852
2209eae
93185f5
5dfccdb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1137,6 +1137,38 @@ def _evaluate_usecols(usecols, names): | |
return usecols | ||
|
||
|
||
def _validate_usecols_names(usecols, names): | ||
""" | ||
Validates that all usecols are present in a given | ||
list of names. If not, raise a ValueError that | ||
shows what usecols are missing. | ||
|
||
Parameters | ||
---------- | ||
usecols : iterable of usecols | ||
The columns to validate are present in names. | ||
names : iterable of names | ||
The column names to check against. | ||
|
||
Returns | ||
------- | ||
usecols : iterable of usecols | ||
The `usecols` parameter if the validation succeeds. | ||
|
||
Raises | ||
------ | ||
ValueError : Columns were missing. Error message will list them. | ||
""" | ||
missing = [c for c in usecols if c not in names] | ||
if len(missing) > 0: | ||
raise ValueError( | ||
"Usecols do not match columns, " | ||
"columns expected but not found: {missing}".format(missing=missing) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you might need to ', '.join(missing) here, not sure. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Python 3.6.3 |Anaconda, Inc.| (default, Nov 8 2017, 18:10:31)
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
>>> l = [1, 'foo', [2,3]]
>>> 'Formatted l: {}'.format(l)
"Formatted l: [1, 'foo', [2, 3]]" if you'd prefer a different error message, I'd be happy to use join though. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. that’s fine i guess format is pretty smart about this ok! |
||
) | ||
|
||
return usecols | ||
|
||
|
||
def _validate_skipfooter_arg(skipfooter): | ||
""" | ||
Validate the 'skipfooter' parameter. | ||
|
@@ -1749,14 +1781,14 @@ def __init__(self, src, **kwds): | |
# GH 14671 | ||
if (self.usecols_dtype == 'string' and | ||
not set(usecols).issubset(self.orig_names)): | ||
raise ValueError("Usecols do not match names.") | ||
_validate_usecols_names(usecols, self.orig_names) | ||
|
||
if len(self.names) > len(usecols): | ||
self.names = [n for i, n in enumerate(self.names) | ||
if (i in usecols or n in usecols)] | ||
|
||
if len(self.names) < len(usecols): | ||
raise ValueError("Usecols do not match names.") | ||
_validate_usecols_names(usecols, self.names) | ||
|
||
self._set_noconvert_columns() | ||
|
||
|
@@ -2528,9 +2560,13 @@ def _handle_usecols(self, columns, usecols_key): | |
raise ValueError("If using multiple headers, usecols must " | ||
"be integers.") | ||
col_indices = [] | ||
|
||
for col in self.usecols: | ||
if isinstance(col, string_types): | ||
col_indices.append(usecols_key.index(col)) | ||
try: | ||
col_indices.append(usecols_key.index(col)) | ||
except ValueError: | ||
_validate_usecols_names(self.usecols, usecols_key) | ||
else: | ||
col_indices.append(col) | ||
else: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -480,10 +480,10 @@ def test_raise_on_usecols_names_mismatch(self): | |
# GH 14671 | ||
data = 'a,b,c,d\n1,2,3,4\n5,6,7,8' | ||
|
||
if self.engine == 'c': | ||
msg = 'Usecols do not match names' | ||
else: | ||
msg = 'is not in list' | ||
msg = ( | ||
"Usecols do not match columns, " | ||
"columns expected but not found: {missing}" | ||
) | ||
|
||
usecols = ['a', 'b', 'c', 'd'] | ||
df = self.read_csv(StringIO(data), usecols=usecols) | ||
|
@@ -492,11 +492,11 @@ def test_raise_on_usecols_names_mismatch(self): | |
tm.assert_frame_equal(df, expected) | ||
|
||
usecols = ['a', 'b', 'c', 'f'] | ||
with tm.assert_raises_regex(ValueError, msg): | ||
with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you add a test where you have 2 missing cols? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yep, will do this and add the whatsnew entry tomorrow, thanks! |
||
self.read_csv(StringIO(data), usecols=usecols) | ||
|
||
usecols = ['a', 'b', 'f'] | ||
with tm.assert_raises_regex(ValueError, msg): | ||
with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")): | ||
self.read_csv(StringIO(data), usecols=usecols) | ||
|
||
names = ['A', 'B', 'C', 'D'] | ||
|
@@ -520,9 +520,9 @@ def test_raise_on_usecols_names_mismatch(self): | |
# tm.assert_frame_equal(df, expected) | ||
|
||
usecols = ['A', 'B', 'C', 'f'] | ||
with tm.assert_raises_regex(ValueError, msg): | ||
with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")): | ||
self.read_csv(StringIO(data), header=0, names=names, | ||
usecols=usecols) | ||
usecols = ['A', 'B', 'f'] | ||
with tm.assert_raises_regex(ValueError, msg): | ||
with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")): | ||
self.read_csv(StringIO(data), names=names, usecols=usecols) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
there is a validate_usecols_arg function, is there a reason you are creating a new one?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sure, was going from @gfyoung's suggestion.
We could extend the existing function, at the moment it's only taking
usecols
as an arg so would be extending it's arguments as well as logic.I've not checked if every call to
validate_usecols_arg
would have anames
argument to pass through, so may need to default it? Although I would think we'd always have column names to check against, right?Let me know if this is a 'must-have' for you and I'll implement.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
it certainly can be an optional argument
having 2 functions do similar things is confusing