Merge pull request #10577 from santegoeds/bugfix/csv_reader-empty-data-with-dtype-args

Fixed a bug where `read_csv` ignored the `dtype` argument when the input was empty.
jreback committed Jul 18, 2015
2 parents 3089006 + 904aaea commit 061c506
Showing 3 changed files with 77 additions and 6 deletions.
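
For context, the behavior this commit targets can be sketched as follows (adapted from the `test_empty_pass_dtype` case added below; an illustration of the fixed behavior, not the exact reproduction from the issue):

```python
# Sketch of the fixed behavior: a header-only CSV should still honor the
# dtype mapping passed to read_csv instead of silently dropping it.
from io import StringIO

import pandas as pd

data = 'one,two'  # header only, no data rows

result = pd.read_csv(StringIO(data), dtype={'one': 'u1'})

# With this fix the empty frame carries the requested dtypes:
# 'one' -> uint8 (as requested), 'two' -> object (the default).
print(result.dtypes)
```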
doc/source/whatsnew/v0.17.0.txt (3 changes: 2 additions & 1 deletion)
```diff
@@ -368,7 +368,8 @@ Bug Fixes
 - Bug in ``DatetimeIndex`` and ``PeriodIndex.value_counts`` resets name from its result, but retains in result's ``Index``. (:issue:`10150`)
 - Bug in `pandas.concat` with ``axis=0`` when column is of dtype ``category`` (:issue:`10177`)
 - Bug in ``read_msgpack`` where input type is not always checked (:issue:`10369`)
-- Bug in `pandas.read_csv` with ``index_col=False`` or with ``index_col=['a', 'b']`` (:issue:`10413`, :issue:`10467`)
+- Bug in `pandas.read_csv` with kwargs ``index_col=False``, ``index_col=['a', 'b']`` or ``dtype``
+  (:issue:`10413`, :issue:`10467`, :issue:`10577`)
 - Bug in `Series.from_csv` with ``header`` kwarg not setting the ``Series.name`` or the ``Series.index.name`` (:issue:`10483`)
 - Bug in `groupby.var` which caused variance to be inaccurate for small float values (:issue:`10448`)
 - Bug in ``Series.plot(kind='hist')`` Y Label not informative (:issue:`10485`)
```
pandas/io/parsers.py (22 changes: 17 additions & 5 deletions)
```diff
@@ -1170,7 +1170,8 @@ def read(self, nrows=None):
             if nrows is None:
                 return _get_empty_meta(self.orig_names,
                                        self.index_col,
-                                       self.index_names)
+                                       self.index_names,
+                                       dtype=self.kwds.get('dtype'))
             else:
                 raise
@@ -2219,19 +2220,30 @@ def _clean_index_names(columns, index_col):
     return index_names, columns, index_col
 
 
-def _get_empty_meta(columns, index_col, index_names):
+def _get_empty_meta(columns, index_col, index_names, dtype=None):
     columns = list(columns)
 
+    if dtype is None:
+        dtype = {}
+    else:
+        # Convert column indexes to column names.
+        dtype = dict((columns[k] if com.is_integer(k) else k, v)
+                     for k, v in compat.iteritems(dtype))
+
     if index_col is None or index_col is False:
         index = Index([])
     else:
         index_col = list(index_col)
-        index = MultiIndex.from_arrays([[]] * len(index_col), names=index_names)
+        index = [np.empty(0, dtype=dtype.get(index_name, np.object))
+                 for index_name in index_names]
+        index = MultiIndex.from_arrays(index, names=index_names)
         index_col.sort()
         for i, n in enumerate(index_col):
             columns.pop(n-i)
 
-    return index, columns, {}
+    col_dict = dict((col_name, np.empty(0, dtype=dtype.get(col_name, np.object)))
+                    for col_name in columns)
+
+    return index, columns, col_dict
 
 
 def _floatify_na_values(na_values):
```
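
The core of the change is in `_get_empty_meta`: the user's `dtype` mapping may be keyed by column name or by column position, so integer keys are first normalized to names, and each column (and index level) of the empty result then gets a zero-length array of the requested dtype, defaulting to `object`. A standalone sketch of that normalization step, using an illustrative helper name that is not part of the pandas codebase:

```python
import numpy as np


def normalize_dtype_keys(columns, dtype):
    # Illustrative helper (not in pandas): map positional dtype keys to
    # column names, mirroring the dict comprehension added above.
    return {columns[k] if isinstance(k, int) else k: v
            for k, v in dtype.items()}


columns = ['one', 'two', 'three']
dtype = {'one': 'u1', 1: 'f8'}   # mixes a name key and a positional key

normalized = normalize_dtype_keys(columns, dtype)
# -> {'one': 'u1', 'two': 'f8'}

# Each column of the empty result then gets a zero-length array of its
# requested dtype, falling back to object when none was supplied for it.
col_dict = {name: np.empty(0, dtype=normalized.get(name, object))
            for name in columns}
```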
pandas/io/tests/test_parsers.py (58 changes: 58 additions & 0 deletions)
```diff
@@ -3540,6 +3540,64 @@ def test_pass_dtype(self):
         self.assertEqual(result['one'].dtype, 'u1')
         self.assertEqual(result['two'].dtype, 'S1')
 
+    def test_empty_pass_dtype(self):
+        data = 'one,two'
+        result = self.read_csv(StringIO(data), dtype={'one': 'u1'})
+
+        expected = DataFrame({'one': np.empty(0, dtype='u1'),
+                              'two': np.empty(0, dtype=np.object)})
+        tm.assert_frame_equal(result, expected)
+
+    def test_empty_with_index_pass_dtype(self):
+        data = 'one,two'
+        result = self.read_csv(StringIO(data), index_col=['one'],
+                               dtype={'one': 'u1', 1: 'f'})
+
+        expected = DataFrame({'two': np.empty(0, dtype='f')},
+                             index=Index([], dtype='u1', name='one'))
+        tm.assert_frame_equal(result, expected)
+
+    def test_empty_with_multiindex_pass_dtype(self):
+        data = 'one,two,three'
+        result = self.read_csv(StringIO(data), index_col=['one', 'two'],
+                               dtype={'one': 'u1', 1: 'f8'})
+
+        expected = DataFrame({'three': np.empty(0, dtype=np.object)}, index=MultiIndex.from_arrays(
+            [np.empty(0, dtype='u1'), np.empty(0, dtype='O')],
+            names=['one', 'two'])
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_empty_with_mangled_column_pass_dtype_by_names(self):
+        data = 'one,one'
+        result = self.read_csv(StringIO(data), dtype={'one': 'u1', 'one.1': 'f'})
+
+        expected = DataFrame({'one': np.empty(0, dtype='u1'), 'one.1': np.empty(0, dtype='f')})
+        tm.assert_frame_equal(result, expected)
+
+    def test_empty_with_mangled_column_pass_dtype_by_indexes(self):
+        data = 'one,one'
+        result = self.read_csv(StringIO(data), dtype={0: 'u1', 1: 'f'})
+
+        expected = DataFrame({'one': np.empty(0, dtype='u1'), 'one.1': np.empty(0, dtype='f')})
+        tm.assert_frame_equal(result, expected)
+
+    def test_empty_with_dup_column_pass_dtype_by_names(self):
+        data = 'one,one'
+        result = self.read_csv(StringIO(data), mangle_dupe_cols=False, dtype={'one': 'u1'})
+        expected = pd.concat([Series([], name='one', dtype='u1')] * 2, axis=1)
+        tm.assert_frame_equal(result, expected)
+
+    def test_empty_with_dup_column_pass_dtype_by_indexes(self):
+        ### FIXME in GH9424
+        raise nose.SkipTest("GH 9424; known failure read_csv with duplicate columns")
+
+        data = 'one,one'
+        result = self.read_csv(StringIO(data), mangle_dupe_cols=False, dtype={0: 'u1', 1: 'f'})
+        expected = pd.concat([Series([], name='one', dtype='u1'),
+                              Series([], name='one', dtype='f')], axis=1)
+        tm.assert_frame_equal(result, expected)
+
     def test_usecols_dtypes(self):
         data = """\
 1,2,3
```
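
To exercise the new cases locally, the suite's use of `nose.SkipTest` suggests a nose-based runner; a sketch of invoking it from Python (the target path is the test module touched by this commit):

```python
# Run the parser test module under nose; roughly equivalent to
# `nosetests pandas/io/tests/test_parsers.py -v` on the command line.
import nose

nose.run(argv=['nosetests', 'pandas/io/tests/test_parsers.py', '-v'])
```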
