From 2c43917d025357477c4a28bd9c5c34097b91576e Mon Sep 17 00:00:00 2001 From: bpraggastis Date: Sun, 4 Jun 2017 03:47:14 -0700 Subject: [PATCH] ERR: Raise error in usecols when column doesn't exist but length matches (#16460) * gh-14671 Check if usecols with type string contains a subset of names, if not throws an error * tests added for gh-14671, expected behavior of simultaneous use of usecols and names unclear so these tests are commented out * Review comments (cherry picked from commit 50a62c17c16d24b8a20be9ef281a86bf589144f2) --- doc/source/whatsnew/v0.20.2.txt | 1 + pandas/io/parsers.py | 6 ++++ pandas/tests/io/parser/usecols.py | 51 +++++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+) diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index b58f82f5533a3..0c2bc8e79e4f7 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -71,6 +71,7 @@ I/O ^^^ - Bug in :func:`read_csv` when ``comment`` is passed in a space delimited text file (:issue:`16472`) +- Bug in :func:`read_csv` not raising an exception with nonexistent columns in ``usecols`` when it had the correct length (:issue:`14671`) - Bug that would force importing of the clipboard routines unnecessarily, potentially causing an import error on startup (:issue:`16288`) - Bug that raised ``IndexError`` when HTML-rendering an empty ``DataFrame`` (:issue:`15953`) - Bug in :func:`read_csv` in which tarfile object inputs were raising an error in Python 2.x for the C engine (:issue:`16530`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index bc65903e64cd9..8064191282250 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1643,6 +1643,12 @@ def __init__(self, src, **kwds): if self.usecols: usecols = _evaluate_usecols(self.usecols, self.orig_names) + + # GH 14671 + if (self.usecols_dtype == 'string' and + not set(usecols).issubset(self.orig_names)): + raise ValueError("Usecols do not match names.") + if len(self.names) > 
len(usecols): self.names = [n for i, n in enumerate(self.names) if (i in usecols or n in usecols)] diff --git a/pandas/tests/io/parser/usecols.py b/pandas/tests/io/parser/usecols.py index 8761d1ccd3da4..f582e5037ca07 100644 --- a/pandas/tests/io/parser/usecols.py +++ b/pandas/tests/io/parser/usecols.py @@ -475,3 +475,54 @@ def test_uneven_length_cols(self): 'C': [3, 5, 4, 3, 3, 7]}) df = self.read_csv(StringIO(data), usecols=usecols) tm.assert_frame_equal(df, expected) + + def test_raise_on_usecols_names_mismatch(self): + # GH 14671 + data = 'a,b,c,d\n1,2,3,4\n5,6,7,8' + + if self.engine == 'c': + msg = 'Usecols do not match names' + else: + msg = 'is not in list' + + usecols = ['a', 'b', 'c', 'd'] + df = self.read_csv(StringIO(data), usecols=usecols) + expected = DataFrame({'a': [1, 5], 'b': [2, 6], 'c': [3, 7], + 'd': [4, 8]}) + tm.assert_frame_equal(df, expected) + + usecols = ['a', 'b', 'c', 'f'] + with tm.assert_raises_regex(ValueError, msg): + self.read_csv(StringIO(data), usecols=usecols) + + usecols = ['a', 'b', 'f'] + with tm.assert_raises_regex(ValueError, msg): + self.read_csv(StringIO(data), usecols=usecols) + + names = ['A', 'B', 'C', 'D'] + + df = self.read_csv(StringIO(data), header=0, names=names) + expected = DataFrame({'A': [1, 5], 'B': [2, 6], 'C': [3, 7], + 'D': [4, 8]}) + tm.assert_frame_equal(df, expected) + + # TODO: https://github.com/pandas-dev/pandas/issues/16469 + # usecols = ['A','C'] + # df = self.read_csv(StringIO(data), header=0, names=names, + # usecols=usecols) + # expected = DataFrame({'A': [1,5], 'C': [3,7]}) + # tm.assert_frame_equal(df, expected) + # + # usecols = [0,2] + # df = self.read_csv(StringIO(data), header=0, names=names, + # usecols=usecols) + # expected = DataFrame({'A': [1,5], 'C': [3,7]}) + # tm.assert_frame_equal(df, expected) + + usecols = ['A', 'B', 'C', 'f'] + with tm.assert_raises_regex(ValueError, msg): + self.read_csv(StringIO(data), header=0, names=names, + usecols=usecols) + usecols = ['A', 'B', 'f'] + with 
tm.assert_raises_regex(ValueError, msg): + self.read_csv(StringIO(data), names=names, usecols=usecols)