From b8e36ac55a719dc57758ef88c40a7a3fde6c9c9f Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Thu, 24 Aug 2017 17:06:11 +0100 Subject: [PATCH] BUG: Set index when reading stata file Ensures index is set when requested when reading Stata dta file closes #16342 --- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/io/stata.py | 11 ++++++++--- pandas/tests/io/test_stata.py | 8 ++++++++ 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index fcadd26156b1d4..45e53a87ba8175 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -369,6 +369,7 @@ I/O - Bug in :func:`read_csv` when called with ``low_memory=False`` in which a CSV with at least one column > 2GB in size would incorrectly raise a ``MemoryError`` (:issue:`16798`). - Bug in :func:`read_csv` when called with a single-element list ``header`` would return a ``DataFrame`` of all NaN values (:issue:`7757`) - Bug in :func:`read_stata` where value labels could not be read when using an iterator (:issue:`16923`) +- Bug in :func:`read_stata` where the index was not set (:issue:`16342`) - Bug in :func:`read_html` where import check fails when run in multiple threads (:issue:`16928`) Plotting diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 253ed03c25db94..fac506f3cb6c40 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1486,6 +1486,8 @@ def read(self, nrows=None, convert_dates=None, columns = self._columns if order_categoricals is None: order_categoricals = self._order_categoricals + if index is None: + index = self._index if nrows is None: nrows = self.nobs @@ -1526,7 +1528,7 @@ def read(self, nrows=None, convert_dates=None, if len(data) == 0: data = DataFrame(columns=self.varlist, index=index) else: - data = DataFrame.from_records(data, index=index) + data = DataFrame.from_records(data) data.columns = self.varlist # If index is not specified, use actual row number rather than @@ -1553,7 
+1555,7 @@ def read(self, nrows=None, convert_dates=None, cols_ = np.where(self.dtyplist)[0] # Convert columns (if needed) to match input type - index = data.index + ix = data.index requires_type_conversion = False data_formatted = [] for i in cols_: @@ -1563,7 +1565,7 @@ def read(self, nrows=None, convert_dates=None, if dtype != np.dtype(object) and dtype != self.dtyplist[i]: requires_type_conversion = True data_formatted.append( - (col, Series(data[col], index, self.dtyplist[i]))) + (col, Series(data[col], ix, self.dtyplist[i]))) else: data_formatted.append((col, data[col])) if requires_type_conversion: @@ -1606,6 +1608,9 @@ def read(self, nrows=None, convert_dates=None, if convert: data = DataFrame.from_items(retyped_data) + if index is not None: + data = data.set_index(data.pop(index)) + return data def _do_convert_missing(self, data, convert_missing): diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index a414928d318c42..e7f67fd9ac8c14 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -1309,3 +1309,11 @@ def test_value_labels_iterator(self, write_index): dta_iter = pd.read_stata(path, iterator=True) value_labels = dta_iter.value_labels() assert value_labels == {'A': {0: 'A', 1: 'B', 2: 'C', 3: 'E'}} + + def test_set_index(self): + df = tm.makeDataFrame() + df.index.name = 'index' + with tm.ensure_clean() as path: + df.to_stata(path) + reread = pd.read_stata(path, index='index') + tm.assert_frame_equal(df, reread)