Skip to content

Commit

Permalink
REF: implement nested_data_to_arrays (#38757)
Browse files Browse the repository at this point in the history
  • Loading branch information
jbrockmendel authored Dec 29, 2020
1 parent 8ff5c42 commit 31e0743
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 32 deletions.
26 changes: 17 additions & 9 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,17 @@
from functools import partial
import operator
from shutil import get_terminal_size
from typing import Dict, Hashable, List, Sequence, Type, TypeVar, Union, cast
from typing import (
TYPE_CHECKING,
Dict,
Hashable,
List,
Sequence,
Type,
TypeVar,
Union,
cast,
)
from warnings import warn

import numpy as np
Expand Down Expand Up @@ -58,6 +68,10 @@

from pandas.io.formats import console

if TYPE_CHECKING:
from pandas import Index


CategoricalT = TypeVar("CategoricalT", bound="Categorical")


Expand Down Expand Up @@ -1708,13 +1722,7 @@ def fillna(self, value=None, method=None, limit=None):
mask = self.isna()

new_codes = self._validate_setitem_value(value)

if isinstance(value, (np.ndarray, Categorical)):
# We get ndarray or Categorical if called via Series.fillna,
# where it will unwrap another aligned Series before getting here
codes[mask] = new_codes[mask]
else:
codes[mask] = new_codes
np.putmask(codes, mask, new_codes)

return self._from_backing_data(codes)

Expand Down Expand Up @@ -2510,7 +2518,7 @@ def _delegate_method(self, name, *args, **kwargs):
# utility routines


def _get_codes_for_values(values, categories) -> np.ndarray:
def _get_codes_for_values(values, categories: "Index") -> np.ndarray:
"""
utility routine to turn values into codes given the specified categories
Expand Down
27 changes: 8 additions & 19 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,6 @@
is_integer_dtype,
is_iterator,
is_list_like,
is_named_tuple,
is_object_dtype,
is_scalar,
is_sequence,
Expand All @@ -129,7 +128,7 @@
transform,
)
from pandas.core.arraylike import OpsMixin
from pandas.core.arrays import Categorical, ExtensionArray
from pandas.core.arrays import ExtensionArray
from pandas.core.arrays.sparse import SparseFrameAccessor
from pandas.core.construction import extract_array, sanitize_masked_array
from pandas.core.generic import NDFrame, _shared_docs
Expand All @@ -147,13 +146,14 @@
from pandas.core.internals.construction import (
arrays_to_mgr,
dataclasses_to_dicts,
get_names_from_index,
init_dict,
init_ndarray,
masked_rec_array_to_mgr,
nested_data_to_arrays,
reorder_arrays,
sanitize_index,
to_arrays,
treat_as_nested,
)
from pandas.core.reshape.melt import melt
from pandas.core.series import Series
Expand Down Expand Up @@ -565,27 +565,16 @@ def __init__(
mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)

# For data is list-like, or Iterable (will consume into list)
elif isinstance(data, abc.Iterable) and not isinstance(data, (str, bytes)):
elif is_list_like(data):
if not isinstance(data, (abc.Sequence, ExtensionArray)):
data = list(data)
if len(data) > 0:
if is_dataclass(data[0]):
data = dataclasses_to_dicts(data)
if is_list_like(data[0]) and getattr(data[0], "ndim", 1) == 1:
if is_named_tuple(data[0]) and columns is None:
columns = data[0]._fields
arrays, columns = to_arrays(data, columns, dtype=dtype)
columns = ensure_index(columns)

# set the index
if index is None:
if isinstance(data[0], Series):
index = get_names_from_index(data)
elif isinstance(data[0], Categorical):
index = ibase.default_index(len(data[0]))
else:
index = ibase.default_index(len(data))

if treat_as_nested(data):
arrays, columns, index = nested_data_to_arrays(
data, columns, index, dtype
)
mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
else:
mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
Expand Down
40 changes: 38 additions & 2 deletions pandas/core/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
is_extension_array_dtype,
is_integer_dtype,
is_list_like,
is_named_tuple,
is_object_dtype,
)
from pandas.core.dtypes.generic import (
Expand Down Expand Up @@ -106,7 +107,7 @@ def masked_rec_array_to_mgr(
# essentially process a record array then fill it
fdata = ma.getdata(data)
if index is None:
index = get_names_from_index(fdata)
index = _get_names_from_index(fdata)
if index is None:
index = ibase.default_index(len(data))
index = ensure_index(index)
Expand Down Expand Up @@ -286,6 +287,41 @@ def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None):
return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)


def nested_data_to_arrays(
    data: Sequence,
    columns: Optional[Index],
    index: Optional[Index],
    dtype: Optional[DtypeObj],
):
    """
    Convert a sequence of nested row-like objects into per-column arrays.

    Parameters
    ----------
    data : Sequence
        Sequence of 1-dimensional list-likes (rows); the caller has already
        verified ``treat_as_nested(data)``, so ``data[0]`` exists.
    columns : Index or None
        Column labels, if known; inferred otherwise.
    index : Index or None
        Row labels, if known; inferred otherwise.
    dtype : DtypeObj or None
        Dtype to coerce the resulting arrays to, if any.

    Returns
    -------
    tuple of (arrays, columns, index)
    """
    # A namedtuple's fields double as column labels when none were given.
    if columns is None and is_named_tuple(data[0]):
        columns = data[0]._fields

    arrays, columns = to_arrays(data, columns, dtype=dtype)
    columns = ensure_index(columns)

    if index is None:
        first = data[0]
        if isinstance(first, ABCSeries):
            # Rows are Series: use their names as the index.
            index = _get_names_from_index(data)
        elif isinstance(first, Categorical):
            # A single Categorical row determines the index length.
            index = ibase.default_index(len(first))
        else:
            index = ibase.default_index(len(data))

    return arrays, columns, index


def treat_as_nested(data) -> bool:
    """
    Check whether ``data`` should be routed through nested_data_to_arrays,
    i.e. it is a non-empty sequence whose first element is a 1-dimensional
    list-like (a "row").
    """
    if len(data) == 0:
        return False
    first = data[0]
    return is_list_like(first) and getattr(first, "ndim", 1) == 1


# ---------------------------------------------------------------------


Expand Down Expand Up @@ -432,7 +468,7 @@ def reorder_arrays(arrays, arr_columns, columns):
return arrays, arr_columns


def get_names_from_index(data):
def _get_names_from_index(data):
has_some_name = any(getattr(s, "name", None) is not None for s in data)
if not has_some_name:
return ibase.default_index(len(data))
Expand Down
8 changes: 6 additions & 2 deletions pandas/tests/series/apply/test_series_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -778,10 +778,14 @@ def test_map_missing_mixed(self, vals, mapping, exp):
),
],
)
def test_apply_series_on_date_time_index_aware_series(self, dti, exp):
@pytest.mark.parametrize("aware", [True, False])
def test_apply_series_on_date_time_index_aware_series(self, dti, exp, aware):
# GH 25959
# Calling apply on a localized time series should not cause an error
index = dti.tz_localize("UTC").index
if aware:
index = dti.tz_localize("UTC").index
else:
index = dti.index
result = Series(index).apply(lambda x: Series([1, 2]))
tm.assert_frame_equal(result, exp)

Expand Down

0 comments on commit 31e0743

Please sign in to comment.