REF: implement sanitize_masked_array (#38398)

pandas-dev · Dec 11, 2020 · e47dbdf · e47dbdf
1 parent 5a7514c
commit e47dbdf
Show file tree

Hide file tree

Showing 3 changed files with 25 additions and 23 deletions.
diff --git a/pandas/core/construction.py b/pandas/core/construction.py
@@ -419,6 +419,20 @@ def ensure_wrapped_if_datetimelike(arr):
     return arr
 
 
+def sanitize_masked_array(data: ma.MaskedArray) -> np.ndarray:
+    """
+    Copy a numpy MaskedArray, overwriting any masked entries with a fill value.
+    """
+    mask = ma.getmaskarray(data)
+    if mask.any():
+        data, fill_value = maybe_upcast(data, copy=True)
+        data.soften_mask()  # un-harden mask so masked entries are writable
+        data[mask] = fill_value
+    else:
+        data = data.copy()
+    return data
+
+
 def sanitize_array(
     data,
     index: Optional[Index],
@@ -432,13 +446,7 @@ def sanitize_array(
     """
 
     if isinstance(data, ma.MaskedArray):
-        mask = ma.getmaskarray(data)
-        if mask.any():
-            data, fill_value = maybe_upcast(data, copy=True)
-            data.soften_mask()  # set hardmask False if it was True
-            data[mask] = fill_value
-        else:
-            data = data.copy()
+        data = sanitize_masked_array(data)
 
     # extract ndarray or ExtensionArray, ensure we have no PandasArray
     data = extract_array(data, extract_numpy=True)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -87,7 +87,6 @@
     maybe_convert_platform,
     maybe_downcast_to_dtype,
     maybe_infer_to_datetimelike,
-    maybe_upcast,
     validate_numeric_casting,
 )
 from pandas.core.dtypes.common import (
@@ -126,7 +125,7 @@
 from pandas.core.arraylike import OpsMixin
 from pandas.core.arrays import Categorical, ExtensionArray
 from pandas.core.arrays.sparse import SparseFrameAccessor
-from pandas.core.construction import extract_array
+from pandas.core.construction import extract_array, sanitize_masked_array
 from pandas.core.generic import NDFrame, _shared_docs
 from pandas.core.indexes import base as ibase
 from pandas.core.indexes.api import (
@@ -535,13 +534,7 @@ def __init__(
 
             # a masked array
             else:
-                mask = ma.getmaskarray(data)
-                if mask.any():
-                    data, fill_value = maybe_upcast(data, copy=True)
-                    data.soften_mask()  # set hardmask False if it was True
-                    data[mask] = fill_value
-                else:
-                    data = data.copy()
+                data = sanitize_masked_array(data)
                 mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
 
         elif isinstance(data, (np.ndarray, Series, Index)):

diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
@@ -53,6 +53,8 @@
 )
 
 if TYPE_CHECKING:
+    from numpy.ma.mrecords import MaskedRecords
+
     from pandas import Series
 
 # ---------------------------------------------------------------------
@@ -96,13 +98,12 @@ def arrays_to_mgr(
 
 
 def masked_rec_array_to_mgr(
-    data, index, columns, dtype: Optional[DtypeObj], copy: bool
+    data: "MaskedRecords", index, columns, dtype: Optional[DtypeObj], copy: bool
 ):
     """
     Extract from a masked rec array and create the manager.
     """
     # essentially process a record array then fill it
-    fill_value = data.fill_value
     fdata = ma.getdata(data)
     if index is None:
         index = get_names_from_index(fdata)
@@ -116,11 +117,11 @@ def masked_rec_array_to_mgr(
 
     # fill if needed
     new_arrays = []
-    for fv, arr, col in zip(fill_value, arrays, arr_columns):
-        # TODO: numpy docs suggest fv must be scalar, but could it be
-        #  non-scalar for object dtype?
-        assert lib.is_scalar(fv), fv
-        mask = ma.getmaskarray(data[col])
+    for col in arr_columns:
+        arr = data[col]
+        fv = arr.fill_value
+
+        mask = ma.getmaskarray(arr)
         if mask.any():
             arr, fv = maybe_upcast(arr, fill_value=fv, copy=True)
             arr[mask] = fv