From 31e0743a96480100bc7d016834af0925ad4b0f63 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 28 Dec 2020 16:02:35 -0800 Subject: [PATCH] REF: implement nested_data_to_arrays (#38757) --- pandas/core/arrays/categorical.py | 26 +++++++----- pandas/core/frame.py | 27 ++++--------- pandas/core/internals/construction.py | 40 ++++++++++++++++++- .../tests/series/apply/test_series_apply.py | 8 +++- 4 files changed, 69 insertions(+), 32 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 940c56340f75e..1bb5556663c29 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2,7 +2,17 @@ from functools import partial import operator from shutil import get_terminal_size -from typing import Dict, Hashable, List, Sequence, Type, TypeVar, Union, cast +from typing import ( + TYPE_CHECKING, + Dict, + Hashable, + List, + Sequence, + Type, + TypeVar, + Union, + cast, +) from warnings import warn import numpy as np @@ -58,6 +68,10 @@ from pandas.io.formats import console +if TYPE_CHECKING: + from pandas import Index + + CategoricalT = TypeVar("CategoricalT", bound="Categorical") @@ -1708,13 +1722,7 @@ def fillna(self, value=None, method=None, limit=None): mask = self.isna() new_codes = self._validate_setitem_value(value) - - if isinstance(value, (np.ndarray, Categorical)): - # We get ndarray or Categorical if called via Series.fillna, - # where it will unwrap another aligned Series before getting here - codes[mask] = new_codes[mask] - else: - codes[mask] = new_codes + np.putmask(codes, mask, new_codes) return self._from_backing_data(codes) @@ -2510,7 +2518,7 @@ def _delegate_method(self, name, *args, **kwargs): # utility routines -def _get_codes_for_values(values, categories) -> np.ndarray: +def _get_codes_for_values(values, categories: "Index") -> np.ndarray: """ utility routine to turn values into codes given the specified categories diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9097ac13192c9..42b3046b0ffe9 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -112,7 +112,6 @@ is_integer_dtype, is_iterator, is_list_like, - is_named_tuple, is_object_dtype, is_scalar, is_sequence, @@ -129,7 +128,7 @@ transform, ) from pandas.core.arraylike import OpsMixin -from pandas.core.arrays import Categorical, ExtensionArray +from pandas.core.arrays import ExtensionArray from pandas.core.arrays.sparse import SparseFrameAccessor from pandas.core.construction import extract_array, sanitize_masked_array from pandas.core.generic import NDFrame, _shared_docs @@ -147,13 +146,14 @@ from pandas.core.internals.construction import ( arrays_to_mgr, dataclasses_to_dicts, - get_names_from_index, init_dict, init_ndarray, masked_rec_array_to_mgr, + nested_data_to_arrays, reorder_arrays, sanitize_index, to_arrays, + treat_as_nested, ) from pandas.core.reshape.melt import melt from pandas.core.series import Series @@ -565,27 +565,16 @@ def __init__( mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) # For data is list-like, or Iterable (will consume into list) - elif isinstance(data, abc.Iterable) and not isinstance(data, (str, bytes)): + elif is_list_like(data): if not isinstance(data, (abc.Sequence, ExtensionArray)): data = list(data) if len(data) > 0: if is_dataclass(data[0]): data = dataclasses_to_dicts(data) - if is_list_like(data[0]) and getattr(data[0], "ndim", 1) == 1: - if is_named_tuple(data[0]) and columns is None: - columns = data[0]._fields - arrays, columns = to_arrays(data, columns, dtype=dtype) - columns = ensure_index(columns) - - # set the index - if index is None: - if isinstance(data[0], Series): - index = get_names_from_index(data) - elif isinstance(data[0], Categorical): - index = ibase.default_index(len(data[0])) - else: - index = ibase.default_index(len(data)) - + if treat_as_nested(data): + arrays, columns, index = nested_data_to_arrays( + data, columns, index, dtype + ) mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype) else: mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 9e0d20e3de4e4..d9db728f66754 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -27,6 +27,7 @@ is_extension_array_dtype, is_integer_dtype, is_list_like, + is_named_tuple, is_object_dtype, ) from pandas.core.dtypes.generic import ( @@ -106,7 +107,7 @@ def masked_rec_array_to_mgr( # essentially process a record array then fill it fdata = ma.getdata(data) if index is None: - index = get_names_from_index(fdata) + index = _get_names_from_index(fdata) if index is None: index = ibase.default_index(len(data)) index = ensure_index(index) @@ -286,6 +287,41 @@ def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None): return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype) +def nested_data_to_arrays( + data: Sequence, + columns: Optional[Index], + index: Optional[Index], + dtype: Optional[DtypeObj], +): + """ + Convert a single sequence of arrays to multiple arrays. + """ + # By the time we get here we have already checked treat_as_nested(data) + + if is_named_tuple(data[0]) and columns is None: + columns = data[0]._fields + + arrays, columns = to_arrays(data, columns, dtype=dtype) + columns = ensure_index(columns) + + if index is None: + if isinstance(data[0], ABCSeries): + index = _get_names_from_index(data) + elif isinstance(data[0], Categorical): + index = ibase.default_index(len(data[0])) + else: + index = ibase.default_index(len(data)) + + return arrays, columns, index + + +def treat_as_nested(data) -> bool: + """ + Check if we should use nested_data_to_arrays. + """ + return len(data) > 0 and is_list_like(data[0]) and getattr(data[0], "ndim", 1) == 1 + + # --------------------------------------------------------------------- @@ -432,7 +468,7 @@ def reorder_arrays(arrays, arr_columns, columns): return arrays, arr_columns -def get_names_from_index(data): +def _get_names_from_index(data): has_some_name = any(getattr(s, "name", None) is not None for s in data) if not has_some_name: return ibase.default_index(len(data)) diff --git a/pandas/tests/series/apply/test_series_apply.py b/pandas/tests/series/apply/test_series_apply.py index 02121772bf1c7..5935d0c81af88 100644 --- a/pandas/tests/series/apply/test_series_apply.py +++ b/pandas/tests/series/apply/test_series_apply.py @@ -778,10 +778,14 @@ def test_map_missing_mixed(self, vals, mapping, exp): ), ], ) - def test_apply_series_on_date_time_index_aware_series(self, dti, exp): + @pytest.mark.parametrize("aware", [True, False]) + def test_apply_series_on_date_time_index_aware_series(self, dti, exp, aware): # GH 25959 # Calling apply on a localized time series should not cause an error - index = dti.tz_localize("UTC").index + if aware: + index = dti.tz_localize("UTC").index + else: + index = dti.index result = Series(index).apply(lambda x: Series([1, 2])) tm.assert_frame_equal(result, exp)