From e47dbdfad7e210431534c5c661cb320a73d0694f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 11 Dec 2020 15:03:19 -0800 Subject: [PATCH] REF: implement sanitize_masked_array (#38398) --- pandas/core/construction.py | 22 +++++++++++++++------- pandas/core/frame.py | 11 ++--------- pandas/core/internals/construction.py | 15 ++++++++------- 3 files changed, 25 insertions(+), 23 deletions(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index ad18cb8dd5bc6..66ebdc2763617 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -419,6 +419,20 @@ def ensure_wrapped_if_datetimelike(arr): return arr +def sanitize_masked_array(data: ma.MaskedArray) -> np.ndarray: + """ + Convert numpy MaskedArray to ensure mask is softened. + """ + mask = ma.getmaskarray(data) + if mask.any(): + data, fill_value = maybe_upcast(data, copy=True) + data.soften_mask() # set hardmask False if it was True + data[mask] = fill_value + else: + data = data.copy() + return data + + def sanitize_array( data, index: Optional[Index], @@ -432,13 +446,7 @@ def sanitize_array( """ if isinstance(data, ma.MaskedArray): - mask = ma.getmaskarray(data) - if mask.any(): - data, fill_value = maybe_upcast(data, copy=True) - data.soften_mask() # set hardmask False if it was True - data[mask] = fill_value - else: - data = data.copy() + data = sanitize_masked_array(data) # extract ndarray or ExtensionArray, ensure we have no PandasArray data = extract_array(data, extract_numpy=True) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9cad404f2ce82..66399f2b9a5e4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -87,7 +87,6 @@ maybe_convert_platform, maybe_downcast_to_dtype, maybe_infer_to_datetimelike, - maybe_upcast, validate_numeric_casting, ) from pandas.core.dtypes.common import ( @@ -126,7 +125,7 @@ from pandas.core.arraylike import OpsMixin from pandas.core.arrays import Categorical, ExtensionArray from pandas.core.arrays.sparse import SparseFrameAccessor -from pandas.core.construction import extract_array +from pandas.core.construction import extract_array, sanitize_masked_array from pandas.core.generic import NDFrame, _shared_docs from pandas.core.indexes import base as ibase from pandas.core.indexes.api import ( @@ -535,13 +534,7 @@ def __init__( # a masked array else: - mask = ma.getmaskarray(data) - if mask.any(): - data, fill_value = maybe_upcast(data, copy=True) - data.soften_mask() # set hardmask False if it was True - data[mask] = fill_value - else: - data = data.copy() + data = sanitize_masked_array(data) mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) elif isinstance(data, (np.ndarray, Series, Index)): diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 3c5216b65a70b..562d92580ba22 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -53,6 +53,8 @@ ) if TYPE_CHECKING: + from numpy.ma.mrecords import MaskedRecords + from pandas import Series # --------------------------------------------------------------------- @@ -96,13 +98,12 @@ def arrays_to_mgr( def masked_rec_array_to_mgr( - data, index, columns, dtype: Optional[DtypeObj], copy: bool + data: "MaskedRecords", index, columns, dtype: Optional[DtypeObj], copy: bool ): """ Extract from a masked rec array and create the manager. """ # essentially process a record array then fill it - fill_value = data.fill_value fdata = ma.getdata(data) if index is None: index = get_names_from_index(fdata) @@ -116,11 +117,11 @@ def masked_rec_array_to_mgr( # fill if needed new_arrays = [] - for fv, arr, col in zip(fill_value, arrays, arr_columns): - # TODO: numpy docs suggest fv must be scalar, but could it be - # non-scalar for object dtype? - assert lib.is_scalar(fv), fv - mask = ma.getmaskarray(data[col]) + for col in arr_columns: + arr = data[col] + fv = arr.fill_value + + mask = ma.getmaskarray(arr) if mask.any(): arr, fv = maybe_upcast(arr, fill_value=fv, copy=True) arr[mask] = fv