BUG: Ensure chunksize is set if not provided
Remove error message incorrectly added
Fix new issues identified by mypy
Add test to ensure conversion of large ints is correct

closes pandas-dev#37280
bashtage committed Oct 21, 2020
1 parent 6ac3765 commit 5ad63c0
Showing 3 changed files with 61 additions and 43 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.4.rst
@@ -22,6 +22,7 @@ Fixed regressions
 - Fixed regression in :class:`RollingGroupby` causing a segmentation fault with Index of dtype object (:issue:`36727`)
 - Fixed regression in :meth:`DataFrame.resample(...).apply(...)` raised ``AttributeError`` when input was a :class:`DataFrame` and only a :class:`Series` was evaluated (:issue:`36951`)
 - Fixed regression in :class:`PeriodDtype` comparing both equal and unequal to its string representation (:issue:`37265`)
+- Fixed regression in :class:`StataReader` which required ``chunksize`` to be manually set when using an iterator to read a dataset (:issue:`37280`)

 .. ---------------------------------------------------------------------------
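The user-visible effect of the fix: a StataReader used as an iterator no longer needs an explicit chunksize. A minimal sketch of the now-working pattern (not part of the commit), assuming a valid Stata file at the hypothetical path data.dta:

from pandas.io.stata import StataReader

# Before this commit, next() on a reader created without chunksize raised
# ValueError; with the fix, chunksize defaults to 1 and each chunk is a
# one-row DataFrame.
with StataReader("data.dta") as reader:
    first_row = next(reader)
    print(first_row.shape)  # (1, number_of_columns)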
83 changes: 43 additions & 40 deletions pandas/io/stata.py
@@ -469,7 +469,7 @@ class PossiblePrecisionLoss(Warning):


 precision_loss_doc = """
-Column converted from %s to %s, and some data are outside of the lossless
+Column converted from {0} to {1}, and some data are outside of the lossless
 conversion range. This may result in a loss of precision in the saved data.
 """

@@ -543,7 +543,7 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame:
     object in a DataFrame.
     """
     ws = ""
-    # original, if small, if large
+    # original, if small, if large
     conversion_data = (
         (np.bool_, np.int8, np.int8),
         (np.uint8, np.int8, np.int16),
@@ -563,7 +563,7 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame:
                     dtype = c_data[1]
                 else:
                     dtype = c_data[2]
-                if c_data[2] == np.float64:  # Warn if necessary
+                if c_data[2] == np.int64:  # Warn if necessary
                     if data[col].max() >= 2 ** 53:
                         ws = precision_loss_doc.format("uint64", "float64")
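The 2 ** 53 threshold above is where float64 stops representing every integer exactly: its significand holds 53 bits, so larger integers can silently collide after conversion. A standalone check (a sketch, not from the commit):

# Integers up to 2**53 round-trip through float64 exactly;
# 2**53 + 1 is the first positive integer that cannot be represented.
assert float(2 ** 53) == 2 ** 53
assert float(2 ** 53 + 1) == float(2 ** 53)  # precision silently lost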

@@ -627,12 +627,12 @@ def __init__(self, catarray: Series, encoding: str = "latin-1"):
         self.value_labels = list(zip(np.arange(len(categories)), categories))
         self.value_labels.sort(key=lambda x: x[0])
         self.text_len = 0
-        self.off: List[int] = []
-        self.val: List[int] = []
         self.txt: List[bytes] = []
         self.n = 0

         # Compute lengths and setup lists of offsets and labels
+        offsets: List[int] = []
+        values: List[int] = []
         for vl in self.value_labels:
             category = vl[1]
             if not isinstance(category, str):
@@ -642,9 +642,9 @@ def __init__(self, catarray: Series, encoding: str = "latin-1"):
                     ValueLabelTypeMismatch,
                 )
             category = category.encode(encoding)
-            self.off.append(self.text_len)
+            offsets.append(self.text_len)
             self.text_len += len(category) + 1  # +1 for the padding
-            self.val.append(vl[0])
+            values.append(vl[0])
             self.txt.append(category)
             self.n += 1

@@ -655,8 +655,8 @@ def __init__(self, catarray: Series, encoding: str = "latin-1"):
             )

         # Ensure int32
-        self.off = np.array(self.off, dtype=np.int32)
-        self.val = np.array(self.val, dtype=np.int32)
+        self.off = np.array(offsets, dtype=np.int32)
+        self.val = np.array(values, dtype=np.int32)

         # Total length
         self.len = 4 + 4 + 4 * self.n + 4 * self.n + self.text_len
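The refactor above accumulates offsets and values in plain Python lists and converts them to int32 arrays only once at the end, so self.off and self.val each keep a single type over their lifetime instead of being rebound from List[int] to ndarray (the sort of redefinition mypy flags). A minimal sketch of the same idiom (not from the commit), with hypothetical (value, text) pairs:

from typing import List
import numpy as np

labels = [(0, b"low"), (1, b"high")]  # hypothetical value/label pairs
offsets: List[int] = []
values: List[int] = []
text_len = 0
for value, text in labels:
    offsets.append(text_len)
    text_len += len(text) + 1  # +1 for the padding byte
    values.append(value)

# One conversion at the end; the list variables are never rebound to arrays.
off = np.array(offsets, dtype=np.int32)
val = np.array(values, dtype=np.int32)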
@@ -868,23 +868,23 @@ def __init__(self):
         # with a label, but the underlying variable is -127 to 100
         # we're going to drop the label and cast to int
         self.DTYPE_MAP = dict(
-            list(zip(range(1, 245), ["a" + str(i) for i in range(1, 245)]))
+            list(zip(range(1, 245), [np.dtype("a" + str(i)) for i in range(1, 245)]))
             + [
-                (251, np.int8),
-                (252, np.int16),
-                (253, np.int32),
-                (254, np.float32),
-                (255, np.float64),
+                (251, np.dtype(np.int8)),
+                (252, np.dtype(np.int16)),
+                (253, np.dtype(np.int32)),
+                (254, np.dtype(np.float32)),
+                (255, np.dtype(np.float64)),
             ]
         )
         self.DTYPE_MAP_XML = dict(
             [
-                (32768, np.uint8),  # Keys to GSO
-                (65526, np.float64),
-                (65527, np.float32),
-                (65528, np.int32),
-                (65529, np.int16),
-                (65530, np.int8),
+                (32768, np.dtype(np.uint8)),  # Keys to GSO
+                (65526, np.dtype(np.float64)),
+                (65527, np.dtype(np.float32)),
+                (65528, np.dtype(np.int32)),
+                (65529, np.dtype(np.int16)),
+                (65530, np.dtype(np.int8)),
             ]
         )
         # error: Argument 1 to "list" has incompatible type "str";
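Wrapping every entry in np.dtype makes both mappings uniformly int-to-np.dtype for mypy, while lookups behave the same because NumPy defines equality between a dtype instance and the bare scalar type. A small check of that equivalence (a sketch, not from the commit):

import numpy as np

assert np.dtype(np.int8) == np.int8           # equality with the bare type still holds
assert isinstance(np.dtype(np.int8), np.dtype)
assert not isinstance(np.int8, np.dtype)      # the bare name is a scalar type, not a dtype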
@@ -1045,10 +1045,12 @@ def __init__(
         self._order_categoricals = order_categoricals
         self._encoding = ""
         self._chunksize = chunksize
-        if self._chunksize is not None and (
-            not isinstance(chunksize, int) or chunksize <= 0
-        ):
-            raise ValueError("chunksize must be a positive integer when set.")
+        self._using_iterator = False
+        if self._chunksize is None:
+            self._chunksize = 1
+        else:
+            if not isinstance(chunksize, int) or chunksize <= 0:
+                raise ValueError("chunksize must be a positive integer when set.")

         # State variables for the file
         self._has_string_data = False
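Because _chunksize now always holds a positive integer (1 by default), downstream code no longer needs None checks, and an explicit value keeps its old meaning. A usage sketch (not from the commit), with a hypothetical file path and per-chunk handler:

import pandas as pd

# chunksize must be a positive integer; each yielded chunk has at most 1000 rows.
with pd.read_stata("data.dta", chunksize=1000) as reader:
    for chunk in reader:
        process(chunk)  # hypothetical per-chunk handler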
@@ -1057,7 +1059,7 @@ def __init__(
         self._column_selector_set = False
         self._value_labels_read = False
         self._data_read = False
-        self._dtype = None
+        self._dtype: Optional[np.dtype] = None
         self._lines_read = 0

         self._native_byteorder = _set_endianness(sys.byteorder)
@@ -1193,7 +1195,7 @@ def _read_new_header(self) -> None:
     # Get data type information, works for versions 117-119.
     def _get_dtypes(
         self, seek_vartypes: int
-    ) -> Tuple[List[Union[int, str]], List[Union[int, np.dtype]]]:
+    ) -> Tuple[List[Union[int, str]], List[Union[str, np.dtype]]]:

         self.path_or_buf.seek(seek_vartypes)
         raw_typlist = [
@@ -1518,11 +1520,8 @@ def _read_strls(self) -> None:
                 self.GSO[str(v_o)] = decoded_va

     def __next__(self) -> DataFrame:
-        if self._chunksize is None:
-            raise ValueError(
-                "chunksize must be set to a positive integer to use as an iterator."
-            )
-        return self.read(nrows=self._chunksize or 1)
+        self._using_iterator = True
+        return self.read(nrows=self._chunksize)

     def get_chunk(self, size: Optional[int] = None) -> DataFrame:
         """
@@ -1690,11 +1689,15 @@ def any_startswith(x: str) -> bool:
             convert = False
             for col in data:
                 dtype = data[col].dtype
-                if dtype in (np.float16, np.float32):
-                    dtype = np.float64
+                if dtype in (np.dtype(np.float16), np.dtype(np.float32)):
+                    dtype = np.dtype(np.float64)
                     convert = True
-                elif dtype in (np.int8, np.int16, np.int32):
-                    dtype = np.int64
+                elif dtype in (
+                    np.dtype(np.int8),
+                    np.dtype(np.int16),
+                    np.dtype(np.int32),
+                ):
+                    dtype = np.dtype(np.int64)
                     convert = True
                 retyped_data.append((col, data[col].astype(dtype)))
             if convert:
@@ -1806,14 +1809,14 @@ def _do_convert_categoricals(
             keys = np.array(list(vl.keys()))
             column = data[col]
             key_matches = column.isin(keys)
-            if self._chunksize is not None and key_matches.all():
-                initial_categories = keys
+            if self._using_iterator and key_matches.all():
+                initial_categories: Optional[np.ndarray] = keys
                 # If all categories are in the keys and we are iterating,
                 # use the same keys for all chunks. If some are missing
                 # value labels, then we will fall back to the categories
                 # varying across chunks.
             else:
-                if self._chunksize is not None:
+                if self._using_iterator:
                     # warn is using an iterator
                     warnings.warn(
                         categorical_conversion_warning, CategoricalConversionWarning
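The switch from checking self._chunksize is not None to the new _using_iterator flag is the heart of the fix here: since _chunksize is now always set, the old test would have classified every read as iterator-driven. When genuinely iterating and every code has a value label, the reader pins the categories to the full label set so that chunks stay concatenable. A sketch of why identical categories matter (not from the commit, labels hypothetical):

import pandas as pd

# Chunks whose categoricals share one category set concatenate losslessly ...
a = pd.Series(pd.Categorical(["low"], categories=["low", "high"]))
b = pd.Series(pd.Categorical(["high"], categories=["low", "high"]))
assert pd.concat([a, b], ignore_index=True).dtype == a.dtype

# ... while differing category sets degrade to object dtype on concat.
c = pd.Series(pd.Categorical(["high"], categories=["high"]))
assert pd.concat([a, c], ignore_index=True).dtype == object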
@@ -2024,7 +2027,7 @@ def _convert_datetime_to_stata_type(fmt: str) -> np.dtype:
         "ty",
         "%ty",
     ]:
-        return np.float64  # Stata expects doubles for SIFs
+        return np.dtype(np.float64)  # Stata expects doubles for SIFs
     else:
         raise NotImplementedError(f"Format {fmt} not implemented")

20 changes: 17 additions & 3 deletions pandas/tests/io/test_stata.py
@@ -1966,9 +1966,6 @@ def test_iterator_errors(dirpath):
         StataReader(dta_file, chunksize=0)
     with pytest.raises(ValueError, match="chunksize must be a positive"):
         StataReader(dta_file, chunksize="apple")
-    with pytest.raises(ValueError, match="chunksize must be set to a positive"):
-        with StataReader(dta_file) as reader:
-            reader.__next__()


def test_iterator_value_labels():
@@ -1983,3 +1980,20 @@ def test_iterator_value_labels():
         for i in range(2):
             tm.assert_index_equal(chunk.dtypes[i].categories, expected)
         tm.assert_frame_equal(chunk, df.iloc[j * 100 : (j + 1) * 100])
+
+
+def test_precision_loss():
+    df = DataFrame(
+        [[sum(2 ** i for i in range(60)), sum(2 ** i for i in range(52))]],
+        columns=["big", "little"],
+    )
+    with tm.ensure_clean() as path:
+        with tm.assert_produces_warning(
+            PossiblePrecisionLoss, match="Column converted from int64 to float64"
+        ):
+            df.to_stata(path, write_index=False)
+        reread = read_stata(path)
+        expected_dt = Series([np.float64, np.float64], index=["big", "little"])
+        tm.assert_series_equal(reread.dtypes, expected_dt)
+        assert reread.loc[0, "little"] == df.loc[0, "little"]
+        assert reread.loc[0, "big"] == float(df.loc[0, "big"])
