Skip to content

Commit

Permalink
BUG: Properly parse unicode usecols names in CSV (pandas-dev#24856)
Browse files Browse the repository at this point in the history
  • Loading branch information
gfyoung authored and Pingviinituutti committed Feb 28, 2019
1 parent 721cd3f commit 7f5a90a
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 11 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1708,6 +1708,7 @@ I/O
^^^

- Bug in :func:`read_csv` in which a column specified with ``CategoricalDtype`` of boolean categories was not being correctly coerced from string values to booleans (:issue:`20498`)
- Bug in :func:`read_csv` in which unicode column names passed to ``usecols`` were not being properly recognized with Python 2.x (:issue:`13253`)
- Bug in :meth:`DataFrame.to_sql` when writing timezone aware data (``datetime64[ns, tz]`` dtype) would raise a ``TypeError`` (:issue:`9086`)
- Bug in :meth:`DataFrame.to_sql` where a naive :class:`DatetimeIndex` would be written as ``TIMESTAMP WITH TIMEZONE`` type in supported databases, e.g. PostgreSQL (:issue:`23510`)
- Bug in :func:`read_excel` when ``parse_cols`` is specified with an empty dataset (:issue:`9208`)
Expand Down
29 changes: 21 additions & 8 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1296,15 +1296,28 @@ def _validate_usecols_arg(usecols):
if usecols is not None:
if callable(usecols):
return usecols, None
# GH20529, ensure is iterable container but not string.
elif not is_list_like(usecols):

if not is_list_like(usecols):
# see gh-20529
#
# Ensure it is an iterable container but not a string.
raise ValueError(msg)
else:
usecols_dtype = lib.infer_dtype(usecols, skipna=False)
if usecols_dtype not in ('empty', 'integer',
'string', 'unicode'):
raise ValueError(msg)
return set(usecols), usecols_dtype

usecols_dtype = lib.infer_dtype(usecols, skipna=False)

if usecols_dtype not in ("empty", "integer",
"string", "unicode"):
raise ValueError(msg)

usecols = set(usecols)

if usecols_dtype == "unicode":
# see gh-13253
#
# Python 2.x compatibility
usecols = {col.encode("utf-8") for col in usecols}

return usecols, usecols_dtype
return usecols, None


Expand Down
5 changes: 2 additions & 3 deletions pandas/tests/io/parser/test_usecols.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import pytest

from pandas._libs.tslib import Timestamp
from pandas.compat import PY2, StringIO
from pandas.compat import StringIO

from pandas import DataFrame, Index
import pandas.util.testing as tm
Expand Down Expand Up @@ -387,8 +387,7 @@ def test_usecols_with_mixed_encoding_strings(all_parsers, usecols):

@pytest.mark.parametrize("usecols", [
["あああ", "いい"],
pytest.param([u"あああ", u"いい"], marks=pytest.mark.skipif(
PY2, reason="Buggy behavior: see gh-13253"))
[u"あああ", u"いい"]
])
def test_usecols_with_multi_byte_characters(all_parsers, usecols):
data = """あああ,いい,ううう,ええええ
Expand Down

0 comments on commit 7f5a90a

Please sign in to comment.