Skip to content

Commit

Permalink
REGR: NA-values in ctors with string dtype (#21366)
Browse files Browse the repository at this point in the history
  • Loading branch information
TomAugspurger authored and jreback committed Jun 8, 2018
1 parent 93be27d commit 636dd01
Show file tree
Hide file tree
Showing 9 changed files with 112 additions and 15 deletions.
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.23.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ and bug fixes. We recommend that all users upgrade to this version.
:local:
:backlinks: none


.. _whatsnew_0231.fixed_regressions:

Fixed Regressions
Expand All @@ -29,6 +28,7 @@ Fixed Regressions
- Bug in :meth:`~DataFrame.to_csv` causes encoding error when compression and encoding are specified (:issue:`21241`, :issue:`21118`)
- Bug preventing pandas from being importable with -OO optimization (:issue:`21071`)
- Bug in :meth:`Categorical.fillna` incorrectly raising a ``TypeError`` when the individual categories are iterable and `value` is an iterable (:issue:`21097`, :issue:`19788`)
- Fixed regression in constructors coercing NA values like ``None`` to strings when passing ``dtype=str`` (:issue:`21083`)
- Regression in :func:`pivot_table` where an ordered ``Categorical`` with missing
values for the pivot's ``index`` would give a mis-aligned result (:issue:`21133`)

Expand Down
11 changes: 11 additions & 0 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,3 +159,14 @@ def tz_aware_fixture(request):
Fixture for trying explicit timezones: {0}
"""
return request.param


@pytest.fixture(params=[str, 'str', 'U'])
def string_dtype(request):
    """Provide each spelling of the string dtype, one per test run.

    The fixture is parametrized over:

    * the builtin ``str``
    * the string ``'str'``
    * the string ``'U'``
    """
    dtype_spec = request.param
    return dtype_spec
42 changes: 42 additions & 0 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -1227,3 +1227,45 @@ def construct_1d_object_array_from_listlike(values):
result = np.empty(len(values), dtype='object')
result[:] = values
return result


def construct_1d_ndarray_preserving_na(values, dtype=None, copy=False):
    """
    Construct a new ndarray, coercing `values` to `dtype`, preserving NA.

    Parameters
    ----------
    values : Sequence
    dtype : numpy.dtype, str or type, optional
        Any dtype specifier NumPy accepts (e.g. ``np.dtype('str')``,
        ``'str'``, ``str`` or ``'U'``).
    copy : bool, default False
        Note that copies may still be made with ``copy=False`` if casting
        is required.

    Returns
    -------
    arr : ndarray
        Has the requested dtype, except when a string dtype (kind ``'U'``
        or ``'S'``) is requested: then an object-dtype array is returned
        whose NA positions hold the original NA objects from `values`
        instead of their string representation.

    Examples
    --------
    >>> np.array([1.0, 2.0, None], dtype='str')
    array(['1.0', '2.0', 'None'], dtype='<U4')

    >>> construct_1d_ndarray_preserving_na([1.0, 2.0, None], dtype='str')
    array(['1.0', '2.0', None], dtype=object)
    """
    if copy:
        subarr = np.array(values, dtype=dtype, copy=True)
    else:
        # ``np.asarray`` copies only when it has to.  Spelling this as
        # ``np.array(..., copy=False)`` raises on NumPy >= 2 whenever a
        # copy is unavoidable, which contradicts the documented contract.
        subarr = np.asarray(values, dtype=dtype)

    # Inspect the dtype of the constructed array rather than ``dtype.kind``
    # so that non-``np.dtype`` specifiers such as ``str`` or ``'U'`` work.
    if dtype is not None and subarr.dtype.kind in ("U", "S"):
        # GH-21083
        # We can't just return np.array(subarr, dtype='str') since
        # NumPy will convert the non-string objects into strings,
        # including NA values. So we have to go
        # string -> object -> update NA, which requires an
        # additional pass over the data.
        na_values = isna(values)
        subarr2 = subarr.astype(object)
        subarr2[na_values] = np.asarray(values, dtype=object)[na_values]
        subarr = subarr2

    return subarr
4 changes: 3 additions & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
maybe_convert_platform,
maybe_cast_to_datetime, maybe_castable,
construct_1d_arraylike_from_scalar,
construct_1d_ndarray_preserving_na,
construct_1d_object_array_from_listlike)
from pandas.core.dtypes.missing import (
isna,
Expand Down Expand Up @@ -4074,7 +4075,8 @@ def _try_cast(arr, take_fast_path):
isinstance(subarr, np.ndarray))):
subarr = construct_1d_object_array_from_listlike(subarr)
elif not is_extension_type(subarr):
subarr = np.array(subarr, dtype=dtype, copy=copy)
subarr = construct_1d_ndarray_preserving_na(subarr, dtype,
copy=copy)
except (ValueError, TypeError):
if is_categorical_dtype(dtype):
# We *do* allow casting to categorical, since we know
Expand Down
13 changes: 13 additions & 0 deletions pandas/tests/dtypes/test_cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
maybe_convert_scalar,
find_common_type,
construct_1d_object_array_from_listlike,
construct_1d_ndarray_preserving_na,
construct_1d_arraylike_from_scalar)
from pandas.core.dtypes.dtypes import (
CategoricalDtype,
Expand Down Expand Up @@ -440,3 +441,15 @@ def test_cast_1d_arraylike_from_scalar_categorical(self):
tm.assert_categorical_equal(result, expected,
check_category_order=True,
check_dtype=True)


@pytest.mark.parametrize('values, dtype, expected', [
    # no dtype requested: plain conversion
    ([1, 2, 3], None, np.array([1, 2, 3])),
    (np.array([1, 2, 3]), None, np.array([1, 2, 3])),
    (['1', '2', None], None, np.array(['1', '2', None])),
    # string dtype requested: None must not be stringified
    (['1', '2', None], np.dtype('str'), np.array(['1', '2', None])),
    ([1, 2, None], np.dtype('str'), np.array(['1', '2', None])),
])
def test_construct_1d_ndarray_preserving_na(values, dtype, expected):
    """Check the constructed array against ``expected`` for each case."""
    tm.assert_numpy_array_equal(
        construct_1d_ndarray_preserving_na(values, dtype=dtype),
        expected,
    )
11 changes: 11 additions & 0 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,17 @@ def test_constructor_complex_dtypes(self):
assert a.dtype == df.a.dtype
assert b.dtype == df.b.dtype

def test_constructor_dtype_str_na_values(self, string_dtype):
    """NA entries stay NA when a DataFrame is built with a string dtype."""
    # https://github.com/pandas-dev/pandas/issues/21083
    with_none = DataFrame({'A': ['x', None]}, dtype=string_dtype)
    tm.assert_frame_equal(with_none.isna(),
                          DataFrame({"A": [False, True]}))
    # The original None object itself is kept, not the text 'None'.
    assert with_none.iloc[1, 0] is None

    with_nan = DataFrame({'A': ['x', np.nan]}, dtype=string_dtype)
    assert np.isnan(with_nan.iloc[1, 0])

def test_constructor_rec(self):
rec = self.frame.to_records(index=False)

Expand Down
16 changes: 10 additions & 6 deletions pandas/tests/frame/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -794,22 +794,26 @@ def test_arg_for_errors_in_astype(self):

@pytest.mark.parametrize('input_vals', [
([1, 2]),
([1.0, 2.0, np.nan]),
(['1', '2']),
(list(pd.date_range('1/1/2011', periods=2, freq='H'))),
(list(pd.date_range('1/1/2011', periods=2, freq='H',
tz='US/Eastern'))),
([pd.Interval(left=0, right=5)]),
])
def test_constructor_list_str(self, input_vals):
def test_constructor_list_str(self, input_vals, string_dtype):
# GH 16605
# Ensure that data elements are converted to strings when
# dtype is str, 'str', or 'U'

for dtype in ['str', str, 'U']:
result = DataFrame({'A': input_vals}, dtype=dtype)
expected = DataFrame({'A': input_vals}).astype({'A': dtype})
assert_frame_equal(result, expected)
result = DataFrame({'A': input_vals}, dtype=string_dtype)
expected = DataFrame({'A': input_vals}).astype({'A': string_dtype})
assert_frame_equal(result, expected)

def test_constructor_list_str_na(self, string_dtype):
    """Floats are stringified under a string dtype while None is kept."""
    built = DataFrame({"A": [1.0, 2.0, None]}, dtype=string_dtype)
    wanted = DataFrame({"A": ['1.0', '2.0', None]}, dtype=object)
    assert_frame_equal(built, wanted)


class TestDataFrameDatetimeWithTZ(TestData):
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/series/test_analytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -1829,7 +1829,7 @@ def test_mode_str_obj(self, dropna, expected1, expected2, expected3):

data = ['foo', 'bar', 'bar', np.nan, np.nan, np.nan]

s = Series(data, dtype=str)
s = Series(data, dtype=object).astype(str)
result = s.mode(dropna)
expected3 = Series(expected3, dtype=str)
tm.assert_series_equal(result, expected3)
Expand Down
26 changes: 20 additions & 6 deletions pandas/tests/series/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,17 @@ def test_constructor_no_data_index_order(self):
result = pd.Series(index=['b', 'a', 'c'])
assert result.index.tolist() == ['b', 'a', 'c']

def test_constructor_dtype_str_na_values(self, string_dtype):
    """NA entries stay NA when a Series is built with a string dtype."""
    # https://github.com/pandas-dev/pandas/issues/21083
    with_none = Series(['x', None], dtype=string_dtype)
    tm.assert_series_equal(with_none.isna(), Series([False, True]))
    # The original None object itself is kept, not the text 'None'.
    assert with_none.iloc[1] is None

    with_nan = Series(['x', np.nan], dtype=string_dtype)
    assert np.isnan(with_nan.iloc[1])

def test_constructor_series(self):
index1 = ['d', 'b', 'a', 'c']
index2 = sorted(index1)
Expand Down Expand Up @@ -164,22 +175,25 @@ def test_constructor_list_like(self):

@pytest.mark.parametrize('input_vals', [
([1, 2]),
([1.0, 2.0, np.nan]),
(['1', '2']),
(list(pd.date_range('1/1/2011', periods=2, freq='H'))),
(list(pd.date_range('1/1/2011', periods=2, freq='H',
tz='US/Eastern'))),
([pd.Interval(left=0, right=5)]),
])
def test_constructor_list_str(self, input_vals):
def test_constructor_list_str(self, input_vals, string_dtype):
# GH 16605
# Ensure that data elements from a list are converted to strings
# when dtype is str, 'str', or 'U'
result = Series(input_vals, dtype=string_dtype)
expected = Series(input_vals).astype(string_dtype)
assert_series_equal(result, expected)

for dtype in ['str', str, 'U']:
result = Series(input_vals, dtype=dtype)
expected = Series(input_vals).astype(dtype)
assert_series_equal(result, expected)
def test_constructor_list_str_na(self, string_dtype):
    """Floats are stringified under a string dtype while NaN is kept."""
    built = Series([1.0, 2.0, np.nan], dtype=string_dtype)
    wanted = Series(['1.0', '2.0', np.nan], dtype=object)
    assert_series_equal(built, wanted)
    # The missing entry is still a float NaN, not the text 'nan'.
    assert np.isnan(built[2])

def test_constructor_generator(self):
gen = (i for i in range(10))
Expand Down

0 comments on commit 636dd01

Please sign in to comment.