Skip to content

Commit

Permalink
BUG: pd.read_csv uses dtype arg with empty input
Browse files Browse the repository at this point in the history
  • Loading branch information
santegoeds committed Jul 17, 2015
1 parent b061055 commit 904aaea
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 6 deletions.
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v0.17.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -357,7 +357,8 @@ Bug Fixes
- Bug in ``DatetimeIndex.value_counts`` and ``PeriodIndex.value_counts`` where the name was reset on the result but retained in the result's ``Index``. (:issue:`10150`)
- Bug in `pandas.concat` with ``axis=0`` when column is of dtype ``category`` (:issue:`10177`)
- Bug in ``read_msgpack`` where input type is not always checked (:issue:`10369`)
- Bug in `pandas.read_csv` with ``index_col=False`` or with ``index_col=['a', 'b']`` (:issue:`10413`, :issue:`10467`)
- Bug in `pandas.read_csv` with kwargs ``index_col=False``, ``index_col=['a', 'b']`` or ``dtype``
(:issue:`10413`, :issue:`10467`, :issue:`10577`)
- Bug in `Series.from_csv` with ``header`` kwarg not setting the ``Series.name`` or the ``Series.index.name`` (:issue:`10483`)
- Bug in `groupby.var` which caused variance to be inaccurate for small float values (:issue:`10448`)
- Bug in ``Series.plot(kind='hist')`` where the Y label was not informative (:issue:`10485`)
Expand Down
22 changes: 17 additions & 5 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1170,7 +1170,8 @@ def read(self, nrows=None):
if nrows is None:
return _get_empty_meta(self.orig_names,
self.index_col,
self.index_names)
self.index_names,
dtype=self.kwds.get('dtype'))
else:
raise

Expand Down Expand Up @@ -2219,19 +2220,30 @@ def _clean_index_names(columns, index_col):
return index_names, columns, index_col


def _get_empty_meta(columns, index_col, index_names, dtype=None):
    """
    Build the index, column names and column data for an empty result.

    Parameters
    ----------
    columns : sequence
        All column names, including those that end up in the index.
    index_col : None, False or sequence of int
        Positions (into `columns`) of the columns that form the index.
        ``None`` or ``False`` means no column is used as the index.
    index_names : sequence
        Names of the index levels; only read when `index_col` is given.
    dtype : dict, dtype or None, default None
        Either a mapping of column name or integer column position to
        dtype, or a single dtype applied to every column.  Columns without
        an explicit dtype are created with ``object`` dtype.

    Returns
    -------
    index : Index or MultiIndex
        Empty index; one empty array per entry of `index_names` when
        `index_col` is given.
    columns : list
        Column names with the index columns removed.
    col_dict : dict
        Maps each remaining column name to an empty array of its dtype.
    """
    columns = list(columns)

    if isinstance(dtype, dict):
        default_dtype = object
        # Convert column indexes to column names.  This has to happen
        # before the index columns are popped from ``columns`` below,
        # otherwise the positions would no longer line up.
        dtype = dict((columns[k] if com.is_integer(k) else k, v)
                     for k, v in compat.iteritems(dtype))
    else:
        # A single (non-dict) dtype applies to every column; ``None``
        # falls back to object.
        default_dtype = object if dtype is None else dtype
        dtype = {}

    if index_col is None or index_col is False:
        index = Index([])
    else:
        # Work on a sorted copy so the caller's list is left untouched.
        index_col = sorted(index_col)
        index_arrays = [np.empty(0, dtype=dtype.get(index_name,
                                                    default_dtype))
                        for index_name in index_names]
        index = MultiIndex.from_arrays(index_arrays, names=index_names)
        # Each pop shifts the remaining positions left by one, hence the
        # ``n - i`` correction on the ascending positions.
        for i, n in enumerate(index_col):
            columns.pop(n - i)

    col_dict = dict((col_name,
                     np.empty(0, dtype=dtype.get(col_name, default_dtype)))
                    for col_name in columns)

    return index, columns, col_dict


def _floatify_na_values(na_values):
Expand Down
58 changes: 58 additions & 0 deletions pandas/io/tests/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3540,6 +3540,64 @@ def test_pass_dtype(self):
self.assertEqual(result['one'].dtype, 'u1')
self.assertEqual(result['two'].dtype, 'S1')

def test_empty_pass_dtype(self):
    """Header-only input with a ``dtype`` given for one of two columns.

    The column named in ``dtype`` is expected to come back as ``u1``,
    the other one as object.
    """
    header_only = 'one,two'
    expected = DataFrame({'one': np.empty(0, dtype='u1'),
                          'two': np.empty(0, dtype=np.object)})

    result = self.read_csv(StringIO(header_only), dtype={'one': 'u1'})
    tm.assert_frame_equal(result, expected)

def test_empty_with_index_pass_dtype(self):
    """Header-only input with one index column and a mixed-key ``dtype``.

    ``dtype`` addresses the index column by name and the data column by
    position; both dtypes are expected to show up in the result.
    """
    header_only = 'one,two'
    dtype_spec = {'one': 'u1', 1: 'f'}
    result = self.read_csv(StringIO(header_only), index_col=['one'],
                           dtype=dtype_spec)

    expected_index = Index([], dtype='u1', name='one')
    expected = DataFrame({'two': np.empty(0, dtype='f')},
                         index=expected_index)
    tm.assert_frame_equal(result, expected)

def test_empty_with_multiindex_pass_dtype(self):
    """Header-only input with two index columns and a ``dtype`` mapping.

    The expected frame has an empty two-level index named
    ``['one', 'two']`` and a single object column ``'three'``.
    """
    header_only = 'one,two,three'
    dtype_spec = {'one': 'u1', 1: 'f8'}
    result = self.read_csv(StringIO(header_only),
                           index_col=['one', 'two'],
                           dtype=dtype_spec)

    level_arrays = [np.empty(0, dtype='u1'), np.empty(0, dtype='O')]
    expected_index = MultiIndex.from_arrays(level_arrays,
                                            names=['one', 'two'])
    expected = DataFrame({'three': np.empty(0, dtype=np.object)},
                         index=expected_index)
    tm.assert_frame_equal(result, expected)

def test_empty_with_mangled_column_pass_dtype_by_names(self):
    """Duplicate header names, with ``dtype`` keyed by the mangled names.

    The result is expected to hold columns ``'one'`` and ``'one.1'``
    with the dtypes given for those names.
    """
    header_only = 'one,one'
    dtype_spec = {'one': 'u1', 'one.1': 'f'}
    result = self.read_csv(StringIO(header_only), dtype=dtype_spec)

    expected = DataFrame({'one': np.empty(0, dtype='u1'),
                          'one.1': np.empty(0, dtype='f')})
    tm.assert_frame_equal(result, expected)

def test_empty_with_mangled_column_pass_dtype_by_indexes(self):
    """Duplicate header names, with ``dtype`` keyed by column position.

    The result is expected to hold columns ``'one'`` and ``'one.1'``
    with the dtypes given for positions 0 and 1.
    """
    header_only = 'one,one'
    dtype_spec = {0: 'u1', 1: 'f'}
    result = self.read_csv(StringIO(header_only), dtype=dtype_spec)

    expected = DataFrame({'one': np.empty(0, dtype='u1'),
                          'one.1': np.empty(0, dtype='f')})
    tm.assert_frame_equal(result, expected)

def test_empty_with_dup_column_pass_dtype_by_names(self):
    """Duplicate header names with ``mangle_dupe_cols=False``.

    A single ``dtype`` entry for ``'one'`` is expected to apply to both
    columns of that name.
    """
    header_only = 'one,one'
    result = self.read_csv(StringIO(header_only), mangle_dupe_cols=False,
                           dtype={'one': 'u1'})

    empty_column = Series([], name='one', dtype='u1')
    expected = pd.concat([empty_column, empty_column], axis=1)
    tm.assert_frame_equal(result, expected)

def test_empty_with_dup_column_pass_dtype_by_indexes(self):
    """Duplicate header names, unmangled, with ``dtype`` keyed by position.

    Currently skipped unconditionally; the statements after the ``raise``
    are kept for when the skip is lifted.
    """
    # FIXME in GH9424
    raise nose.SkipTest("GH 9424; known failure read_csv with duplicate columns")

    header_only = 'one,one'
    dtype_spec = {0: 'u1', 1: 'f'}
    result = self.read_csv(StringIO(header_only), mangle_dupe_cols=False,
                           dtype=dtype_spec)

    first = Series([], name='one', dtype='u1')
    second = Series([], name='one', dtype='f')
    expected = pd.concat([first, second], axis=1)
    tm.assert_frame_equal(result, expected)

def test_usecols_dtypes(self):
data = """\
1,2,3
Expand Down

0 comments on commit 904aaea

Please sign in to comment.