Skip to content

Commit

Permalink
ERRR: Raise error in usecols when column doesn't exist but length mat…
Browse files Browse the repository at this point in the history
…ches (#16460)

* gh-14671 Check if usecols with type string contains a subset of names, if not throws an error

* tests added for gh-14671, expected behavior of simultaneous use of usecols and names unclear so these tests are commented out

* Review comments

(cherry picked from commit 50a62c1)
  • Loading branch information
bpraggastis authored and TomAugspurger committed Jun 4, 2017
1 parent 7286bc7 commit 2c43917
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 0 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.20.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ I/O
^^^

- Bug in :func:`read_csv` when ``comment`` is passed in a space delimited text file (:issue:`16472`)
- Bug in :func:`read_csv` not raising an exception with nonexistent columns in ``usecols`` when it had the correct length (:issue:`14671`)
- Bug that would force importing of the clipboard routines unnecessarily, potentially causing an import error on startup (:issue:`16288`)
- Bug that raised ``IndexError`` when HTML-rendering an empty ``DataFrame`` (:issue:`15953`)
- Bug in :func:`read_csv` in which tarfile object inputs were raising an error in Python 2.x for the C engine (:issue:`16530`)
Expand Down
6 changes: 6 additions & 0 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1643,6 +1643,12 @@ def __init__(self, src, **kwds):

if self.usecols:
usecols = _evaluate_usecols(self.usecols, self.orig_names)

# GH 14671
if (self.usecols_dtype == 'string' and
not set(usecols).issubset(self.orig_names)):
raise ValueError("Usecols do not match names.")

if len(self.names) > len(usecols):
self.names = [n for i, n in enumerate(self.names)
if (i in usecols or n in usecols)]
Expand Down
51 changes: 51 additions & 0 deletions pandas/tests/io/parser/usecols.py
Original file line number Diff line number Diff line change
Expand Up @@ -475,3 +475,54 @@ def test_uneven_length_cols(self):
'C': [3, 5, 4, 3, 3, 7]})
df = self.read_csv(StringIO(data), usecols=usecols)
tm.assert_frame_equal(df, expected)

def test_raise_on_usecols_names_mismatch(self):
# GH 14671
data = 'a,b,c,d\n1,2,3,4\n5,6,7,8'

if self.engine == 'c':
msg = 'Usecols do not match names'
else:
msg = 'is not in list'

usecols = ['a', 'b', 'c', 'd']
df = self.read_csv(StringIO(data), usecols=usecols)
expected = DataFrame({'a': [1, 5], 'b': [2, 6], 'c': [3, 7],
'd': [4, 8]})
tm.assert_frame_equal(df, expected)

usecols = ['a', 'b', 'c', 'f']
with tm.assert_raises_regex(ValueError, msg):
self.read_csv(StringIO(data), usecols=usecols)

usecols = ['a', 'b', 'f']
with tm.assert_raises_regex(ValueError, msg):
self.read_csv(StringIO(data), usecols=usecols)

names = ['A', 'B', 'C', 'D']

df = self.read_csv(StringIO(data), header=0, names=names)
expected = DataFrame({'A': [1, 5], 'B': [2, 6], 'C': [3, 7],
'D': [4, 8]})
tm.assert_frame_equal(df, expected)

# TODO: https://github.com/pandas-dev/pandas/issues/16469
# usecols = ['A','C']
# df = self.read_csv(StringIO(data), header=0, names=names,
# usecols=usecols)
# expected = DataFrame({'A': [1,5], 'C': [3,7]})
# tm.assert_frame_equal(df, expected)
#
# usecols = [0,2]
# df = self.read_csv(StringIO(data), header=0, names=names,
# usecols=usecols)
# expected = DataFrame({'A': [1,5], 'C': [3,7]})
# tm.assert_frame_equal(df, expected)

usecols = ['A', 'B', 'C', 'f']
with tm.assert_raises_regex(ValueError, msg):
self.read_csv(StringIO(data), header=0, names=names,
usecols=usecols)
usecols = ['A', 'B', 'f']
with tm.assert_raises_regex(ValueError, msg):
self.read_csv(StringIO(data), names=names, usecols=usecols)

0 comments on commit 2c43917

Please sign in to comment.