BUG: Properly parse unicode usecols names in CSV (pandas-dev#24856)

Pingviinituutti · Feb 28, 2019 · 7f5a90a · 7f5a90a
1 parent 721cd3f
commit 7f5a90a
Show file tree

Hide file tree

Showing 3 changed files with 24 additions and 11 deletions.
diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
@@ -1708,6 +1708,7 @@ I/O
 ^^^
 
 - Bug in :func:`read_csv` in which a column specified with ``CategoricalDtype`` of boolean categories was not being correctly coerced from string values to booleans (:issue:`20498`)
+- Bug in :func:`read_csv` in which unicode column names were not being properly recognized with Python 2.x (:issue:`13253`)
 - Bug in :meth:`DataFrame.to_sql` when writing timezone aware data (``datetime64[ns, tz]`` dtype) would raise a ``TypeError`` (:issue:`9086`)
 - Bug in :meth:`DataFrame.to_sql` where a naive :class:`DatetimeIndex` would be written as ``TIMESTAMP WITH TIMEZONE`` type in supported databases, e.g. PostgreSQL (:issue:`23510`)
 - Bug in :meth:`read_excel()` when ``parse_cols`` is specified with an empty dataset (:issue:`9208`)

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -1296,15 +1296,28 @@ def _validate_usecols_arg(usecols):
     if usecols is not None:
         if callable(usecols):
             return usecols, None
-        # GH20529, ensure is iterable container but not string.
-        elif not is_list_like(usecols):
+
+        if not is_list_like(usecols):
+            # see gh-20529
+            #
+            # Ensure it is iterable container but not string.
             raise ValueError(msg)
-        else:
-            usecols_dtype = lib.infer_dtype(usecols, skipna=False)
-            if usecols_dtype not in ('empty', 'integer',
-                                     'string', 'unicode'):
-                raise ValueError(msg)
-        return set(usecols), usecols_dtype
+
+        usecols_dtype = lib.infer_dtype(usecols, skipna=False)
+
+        if usecols_dtype not in ("empty", "integer",
+                                 "string", "unicode"):
+            raise ValueError(msg)
+
+        usecols = set(usecols)
+
+        if usecols_dtype == "unicode":
+            # see gh-13253
+            #
+            # Python 2.x compatibility
+            usecols = {col.encode("utf-8") for col in usecols}
+
+        return usecols, usecols_dtype
     return usecols, None
 
 

diff --git a/pandas/tests/io/parser/test_usecols.py b/pandas/tests/io/parser/test_usecols.py
@@ -9,7 +9,7 @@
 import pytest
 
 from pandas._libs.tslib import Timestamp
-from pandas.compat import PY2, StringIO
+from pandas.compat import StringIO
 
 from pandas import DataFrame, Index
 import pandas.util.testing as tm
@@ -387,8 +387,7 @@ def test_usecols_with_mixed_encoding_strings(all_parsers, usecols):
 
 @pytest.mark.parametrize("usecols", [
     ["あああ", "いい"],
-    pytest.param([u"あああ", u"いい"], marks=pytest.mark.skipif(
-        PY2, reason="Buggy behavior: see gh-13253"))
+    [u"あああ", u"いい"]
 ])
 def test_usecols_with_multi_byte_characters(all_parsers, usecols):
     data = """あああ,いい,ううう,ええええ