-
-
Notifications
You must be signed in to change notification settings - Fork 17.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ENH: Allow usecols to accept callable (GH14154) #14234
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -90,13 +90,18 @@ | |
MultiIndex is used. If you have a malformed file with delimiters at the end | ||
of each line, you might consider index_col=False to force pandas to _not_ | ||
use the first column as the index (row names) | ||
usecols : array-like, default None | ||
Return a subset of the columns. All elements in this array must either | ||
usecols : array-like or callable, default None | ||
Return a subset of the columns. If array-like, all elements must either | ||
be positional (i.e. integer indices into the document columns) or strings | ||
that correspond to column names provided either by the user in `names` or | ||
inferred from the document header row(s). For example, a valid `usecols` | ||
parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Using this parameter | ||
results in much faster parsing time and lower memory usage. | ||
inferred from the document header row(s). For example, a valid array-like | ||
`usecols` parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. | ||
|
||
If callable, the callable function will be evaluated against the column | ||
names, returning names where the callable function evaluates to True. An | ||
example of a valid callable argument would be ``lambda x: x.upper() in | ||
['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster | ||
parsing time and lower memory usage. | ||
as_recarray : boolean, default False | ||
DEPRECATED: this argument will be removed in a future version. Please call | ||
`pd.read_csv(...).to_records()` instead. | ||
|
@@ -977,17 +982,33 @@ def _is_index_col(col): | |
return col is not None and col is not False | ||
|
||
|
||
def _evaluate_usecols(usecols, names): | ||
""" | ||
Check whether or not the 'usecols' parameter | ||
is a callable. If so, enumerates the 'names' | ||
parameter and returns a set of indices for | ||
each entry in 'names' that evaluates to True. | ||
If not a callable, returns 'usecols'. | ||
""" | ||
if callable(usecols): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Review comment: can you give a mini-docstring here? |
||
return set([i for i, name in enumerate(names) | ||
if usecols(name)]) | ||
return usecols | ||
|
||
|
||
def _validate_usecols_arg(usecols): | ||
""" | ||
Check whether or not the 'usecols' parameter | ||
contains all integers (column selection by index) | ||
or strings (column by name). Raises a ValueError | ||
if that is not the case. | ||
contains all integers (column selection by index), | ||
strings (column by name) or is a callable. Raises | ||
a ValueError if that is not the case. | ||
""" | ||
msg = ("The elements of 'usecols' must " | ||
"either be all strings, all unicode, or all integers") | ||
msg = ("'usecols' must either be all strings, all unicode, " | ||
"all integers or a callable") | ||
|
||
if usecols is not None: | ||
if callable(usecols): | ||
return usecols | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Review comment: you can just in-line. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Reply: Took a look but don't think that would work. The problem seems to be the names argument that's a requirement for […]. Unless I'm missing something, I think I'd have to modify […]. |
||
usecols_dtype = lib.infer_dtype(usecols) | ||
if usecols_dtype not in ('empty', 'integer', | ||
'string', 'unicode'): | ||
|
@@ -1499,11 +1520,12 @@ def __init__(self, src, **kwds): | |
self.orig_names = self.names[:] | ||
|
||
if self.usecols: | ||
if len(self.names) > len(self.usecols): | ||
usecols = _evaluate_usecols(self.usecols, self.orig_names) | ||
if len(self.names) > len(usecols): | ||
self.names = [n for i, n in enumerate(self.names) | ||
if (i in self.usecols or n in self.usecols)] | ||
if (i in usecols or n in usecols)] | ||
|
||
if len(self.names) < len(self.usecols): | ||
if len(self.names) < len(usecols): | ||
raise ValueError("Usecols do not match names.") | ||
|
||
self._set_noconvert_columns() | ||
|
@@ -1665,9 +1687,10 @@ def read(self, nrows=None): | |
|
||
def _filter_usecols(self, names): | ||
# hackish | ||
if self.usecols is not None and len(names) != len(self.usecols): | ||
usecols = _evaluate_usecols(self.usecols, names) | ||
if usecols is not None and len(names) != len(usecols): | ||
names = [name for i, name in enumerate(names) | ||
if i in self.usecols or name in self.usecols] | ||
if i in usecols or name in usecols] | ||
return names | ||
|
||
def _get_index_names(self): | ||
|
@@ -2291,7 +2314,9 @@ def _handle_usecols(self, columns, usecols_key): | |
usecols_key is used if there are string usecols. | ||
""" | ||
if self.usecols is not None: | ||
if any([isinstance(col, string_types) for col in self.usecols]): | ||
if callable(self.usecols): | ||
col_indices = _evaluate_usecols(self.usecols, usecols_key) | ||
elif any([isinstance(u, string_types) for u in self.usecols]): | ||
if len(columns) > 1: | ||
raise ValueError("If using multiple headers, usecols must " | ||
"be integers.") | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,8 +23,9 @@ def test_raise_on_mixed_dtype_usecols(self): | |
1000,2000,3000 | ||
4000,5000,6000 | ||
""" | ||
msg = ("The elements of 'usecols' must " | ||
"either be all strings, all unicode, or all integers") | ||
|
||
msg = ("'usecols' must either be all strings, all unicode, " | ||
"all integers or a callable") | ||
usecols = [0, 'b', 2] | ||
|
||
with tm.assertRaisesRegexp(ValueError, msg): | ||
|
@@ -302,8 +303,8 @@ def test_usecols_with_mixed_encoding_strings(self): | |
3.568935038,7,False,a | ||
''' | ||
|
||
msg = ("The elements of 'usecols' must " | ||
"either be all strings, all unicode, or all integers") | ||
msg = ("'usecols' must either be all strings, all unicode, " | ||
"all integers or a callable") | ||
|
||
with tm.assertRaisesRegexp(ValueError, msg): | ||
self.read_csv(StringIO(s), usecols=[u'AAA', b'BBB']) | ||
|
@@ -366,3 +367,31 @@ def test_np_array_usecols(self): | |
expected = DataFrame([[1, 2]], columns=usecols) | ||
result = self.read_csv(StringIO(data), usecols=usecols) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
def test_callable_usecols(self): | ||
# See gh-14154 | ||
s = '''AaA,bBb,CCC,ddd | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Review comment: can you add the issue number as a comment? |
||
0.056674973,8,True,a | ||
2.613230982,2,False,b | ||
3.568935038,7,False,a | ||
''' | ||
|
||
data = { | ||
'AaA': { | ||
0: 0.056674972999999997, | ||
1: 2.6132309819999997, | ||
2: 3.5689350380000002 | ||
}, | ||
'bBb': {0: 8, 1: 2, 2: 7}, | ||
'ddd': {0: 'a', 1: 'b', 2: 'a'} | ||
} | ||
expected = DataFrame(data) | ||
df = self.read_csv(StringIO(s), usecols=lambda x: | ||
x.upper() in ['AAA', 'BBB', 'DDD']) | ||
tm.assert_frame_equal(df, expected) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Review comment: Can you add a test that […] |
||
|
||
# Check that a callable returning only False returns | ||
# an empty DataFrame | ||
expected = DataFrame() | ||
df = self.read_csv(StringIO(s), usecols=lambda x: False) | ||
tm.assert_frame_equal(df, expected) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you add, before the example, some explanation of how the callable is used? (I suppose it is called on each individual column name and should return True or False?)
And maybe also use double backticks (``) around the code example, to better indicate that the "in ['AAA', 'BBB', 'DDD']" still belongs with the "lambda x: x.upper()".