Skip to content

Commit

Permalink
REF: implement nested_data_to_arrays (#38757)
Browse files Browse the repository at this point in the history
  • Loading branch information
jbrockmendel authored Dec 29, 2020
1 parent 8ff5c42 commit 31e0743
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 32 deletions.
26 changes: 17 additions & 9 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,17 @@
from functools import partial
import operator
from shutil import get_terminal_size
from typing import Dict, Hashable, List, Sequence, Type, TypeVar, Union, cast
from typing import (
TYPE_CHECKING,
Dict,
Hashable,
List,
Sequence,
Type,
TypeVar,
Union,
cast,
)
from warnings import warn

import numpy as np
Expand Down Expand Up @@ -58,6 +68,10 @@

from pandas.io.formats import console

if TYPE_CHECKING:
from pandas import Index


CategoricalT = TypeVar("CategoricalT", bound="Categorical")


Expand Down Expand Up @@ -1708,13 +1722,7 @@ def fillna(self, value=None, method=None, limit=None):
mask = self.isna()

new_codes = self._validate_setitem_value(value)

if isinstance(value, (np.ndarray, Categorical)):
# We get ndarray or Categorical if called via Series.fillna,
# where it will unwrap another aligned Series before getting here
codes[mask] = new_codes[mask]
else:
codes[mask] = new_codes
np.putmask(codes, mask, new_codes)

return self._from_backing_data(codes)

Expand Down Expand Up @@ -2510,7 +2518,7 @@ def _delegate_method(self, name, *args, **kwargs):
# utility routines


def _get_codes_for_values(values, categories) -> np.ndarray:
def _get_codes_for_values(values, categories: "Index") -> np.ndarray:
"""
utility routine to turn values into codes given the specified categories
Expand Down
27 changes: 8 additions & 19 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,6 @@
is_integer_dtype,
is_iterator,
is_list_like,
is_named_tuple,
is_object_dtype,
is_scalar,
is_sequence,
Expand All @@ -129,7 +128,7 @@
transform,
)
from pandas.core.arraylike import OpsMixin
from pandas.core.arrays import Categorical, ExtensionArray
from pandas.core.arrays import ExtensionArray
from pandas.core.arrays.sparse import SparseFrameAccessor
from pandas.core.construction import extract_array, sanitize_masked_array
from pandas.core.generic import NDFrame, _shared_docs
Expand All @@ -147,13 +146,14 @@
from pandas.core.internals.construction import (
arrays_to_mgr,
dataclasses_to_dicts,
get_names_from_index,
init_dict,
init_ndarray,
masked_rec_array_to_mgr,
nested_data_to_arrays,
reorder_arrays,
sanitize_index,
to_arrays,
treat_as_nested,
)
from pandas.core.reshape.melt import melt
from pandas.core.series import Series
Expand Down Expand Up @@ -565,27 +565,16 @@ def __init__(
mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)

# For data is list-like, or Iterable (will consume into list)
elif isinstance(data, abc.Iterable) and not isinstance(data, (str, bytes)):
elif is_list_like(data):
if not isinstance(data, (abc.Sequence, ExtensionArray)):
data = list(data)
if len(data) > 0:
if is_dataclass(data[0]):
data = dataclasses_to_dicts(data)
if is_list_like(data[0]) and getattr(data[0], "ndim", 1) == 1:
if is_named_tuple(data[0]) and columns is None:
columns = data[0]._fields
arrays, columns = to_arrays(data, columns, dtype=dtype)
columns = ensure_index(columns)

# set the index
if index is None:
if isinstance(data[0], Series):
index = get_names_from_index(data)
elif isinstance(data[0], Categorical):
index = ibase.default_index(len(data[0]))
else:
index = ibase.default_index(len(data))

if treat_as_nested(data):
arrays, columns, index = nested_data_to_arrays(
data, columns, index, dtype
)
mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
else:
mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
Expand Down
40 changes: 38 additions & 2 deletions pandas/core/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
is_extension_array_dtype,
is_integer_dtype,
is_list_like,
is_named_tuple,
is_object_dtype,
)
from pandas.core.dtypes.generic import (
Expand Down Expand Up @@ -106,7 +107,7 @@ def masked_rec_array_to_mgr(
# essentially process a record array then fill it
fdata = ma.getdata(data)
if index is None:
index = get_names_from_index(fdata)
index = _get_names_from_index(fdata)
if index is None:
index = ibase.default_index(len(data))
index = ensure_index(index)
Expand Down Expand Up @@ -286,6 +287,41 @@ def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None):
return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)


def nested_data_to_arrays(
    data: Sequence,
    columns: Optional[Index],
    index: Optional[Index],
    dtype: Optional[DtypeObj],
):
    """
    Convert a sequence of nested row-like objects into per-column arrays.

    Parameters
    ----------
    data : Sequence
        Sequence of 1-dimensional list-likes (rows); the caller has already
        verified ``treat_as_nested(data)``, so ``data[0]`` exists.
    columns : Index or None
        Column labels, if known; inferred otherwise.
    index : Index or None
        Row labels, if known; inferred otherwise.
    dtype : DtypeObj or None
        Dtype to coerce the resulting arrays to, if any.

    Returns
    -------
    tuple of (arrays, columns, index)
    """
    # A namedtuple's fields double as column labels when none were given.
    if columns is None and is_named_tuple(data[0]):
        columns = data[0]._fields

    arrays, columns = to_arrays(data, columns, dtype=dtype)
    columns = ensure_index(columns)

    if index is None:
        first = data[0]
        if isinstance(first, ABCSeries):
            # Rows are Series: use their names as the index.
            index = _get_names_from_index(data)
        elif isinstance(first, Categorical):
            # A single Categorical row determines the index length.
            index = ibase.default_index(len(first))
        else:
            index = ibase.default_index(len(data))

    return arrays, columns, index


def treat_as_nested(data) -> bool:
    """
    Check whether ``data`` should be routed through nested_data_to_arrays,
    i.e. it is a non-empty sequence whose first element is a 1-dimensional
    list-like (a "row").
    """
    if len(data) == 0:
        return False
    first = data[0]
    return is_list_like(first) and getattr(first, "ndim", 1) == 1


# ---------------------------------------------------------------------


Expand Down Expand Up @@ -432,7 +468,7 @@ def reorder_arrays(arrays, arr_columns, columns):
return arrays, arr_columns


def get_names_from_index(data):
def _get_names_from_index(data):
has_some_name = any(getattr(s, "name", None) is not None for s in data)
if not has_some_name:
return ibase.default_index(len(data))
Expand Down
8 changes: 6 additions & 2 deletions pandas/tests/series/apply/test_series_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -778,10 +778,14 @@ def test_map_missing_mixed(self, vals, mapping, exp):
),
],
)
def test_apply_series_on_date_time_index_aware_series(self, dti, exp):
@pytest.mark.parametrize("aware", [True, False])
def test_apply_series_on_date_time_index_aware_series(self, dti, exp, aware):
# GH 25959
# Calling apply on a localized time series should not cause an error
index = dti.tz_localize("UTC").index
if aware:
index = dti.tz_localize("UTC").index
else:
index = dti.index
result = Series(index).apply(lambda x: Series([1, 2]))
tm.assert_frame_equal(result, exp)

Expand Down

0 comments on commit 31e0743

Please sign in to comment.