Skip to content

Commit

Permalink
Combine UnsignedIntegerCoder and CFMaskCoder (#9274)
Browse files Browse the repository at this point in the history
* Fix small typo in docstring

* Combine CF Unsigned and Mask handling

* Replace UnsignedIntegerCoder tests with CFMaskCoder usage

* Fix dtype type annotation

* Fix when unsigned serialization warning is expected in tests

* Small refactor of CFMaskCoder decoding

* Add CF encoder tests for _Unsigned=false cases

* Remove UnsignedIntegerCoder from api docs

---------

Co-authored-by: Kai Mühlbauer <kai.muehlbauer@uni-bonn.de>
Co-authored-by: Deepak Cherian <dcherian@users.noreply.github.com>
  • Loading branch information
3 people authored Aug 20, 2024
1 parent 40291ad commit 4ab0679
Show file tree
Hide file tree
Showing 5 changed files with 226 additions and 120 deletions.
1 change: 0 additions & 1 deletion doc/api-hidden.rst
Original file line number Diff line number Diff line change
Expand Up @@ -684,7 +684,6 @@

conventions.decode_cf_variables

coding.variables.UnsignedIntegerCoder
coding.variables.CFMaskCoder
coding.variables.CFScaleOffsetCoder

Expand Down
239 changes: 131 additions & 108 deletions xarray/coding/variables.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,7 @@ def _is_time_like(units):


def _check_fill_values(attrs, name, dtype):
""" "Check _FillValue and missing_value if available.
"""Check _FillValue and missing_value if available.
Return dictionary with raw fill values and set with encoded fill values.
Expand Down Expand Up @@ -298,18 +298,87 @@ def _check_fill_values(attrs, name, dtype):
return raw_fill_dict, encoded_fill_values


def _convert_unsigned_fill_value(
    name: T_Name,
    data: Any,
    unsigned: str,
    raw_fill_value: Any,
    encoded_fill_values: set,
) -> Any:
    """Reinterpret integer data according to an ``_Unsigned`` attribute value.

    Parameters
    ----------
    name
        Variable name; only used in the warning message.
    data
        Array-like object exposing ``dtype``.
    unsigned
        Value of the ``_Unsigned`` attribute. Only ``"true"`` (for signed
        integer data) and ``"false"`` (for unsigned integer data) trigger a
        conversion; any other combination returns ``data`` untouched.
    raw_fill_value
        Fill value as found on the variable, or ``None``. When given, it is
        removed from ``encoded_fill_values`` and its converted counterpart is
        added instead.
    encoded_fill_values
        Set of fill values, **modified in place**. ``set.remove`` is used, so
        ``raw_fill_value`` has to be a member of the set.

    Returns
    -------
    ``data`` passed through ``lazy_elemwise_func`` with an ``np.asarray`` cast
    to the same-width integer dtype of opposite signedness, or ``data``
    unchanged when no conversion applies.

    Warns
    -----
    SerializationWarning
        If ``data`` is neither a signed nor an unsigned integer array.
    """
    source_dtype = data.dtype
    has_fill = raw_fill_value is not None

    if source_dtype.kind == "i":
        if unsigned != "true":
            return data
        unsigned_dtype = np.dtype(f"u{source_dtype.itemsize}")
        if has_fill:
            signed_fill = np.array(raw_fill_value, dtype=source_dtype)
            encoded_fill_values.remove(raw_fill_value)
            # reinterpret the bits through a view; a value cast of a negative
            # fill to the unsigned dtype could raise OverflowError
            encoded_fill_values.add(signed_fill.view(unsigned_dtype).item())
        to_unsigned = partial(np.asarray, dtype=unsigned_dtype)
        return lazy_elemwise_func(data, to_unsigned, unsigned_dtype)

    if source_dtype.kind == "u":
        if unsigned != "false":
            return data
        signed_dtype = np.dtype(f"i{source_dtype.itemsize}")
        to_signed = partial(np.asarray, dtype=signed_dtype)
        data = lazy_elemwise_func(data, to_signed, signed_dtype)
        if has_fill:
            signed_fill = signed_dtype.type(raw_fill_value)
            encoded_fill_values.remove(raw_fill_value)
            encoded_fill_values.add(signed_fill)
        return data

    # not an integer array: the attribute is meaningless here
    warnings.warn(
        f"variable {name!r} has _Unsigned attribute but is not "
        "of integer type. Ignoring attribute.",
        SerializationWarning,
        stacklevel=3,
    )
    return data


def _encode_unsigned_fill_value(
    name: T_Name,
    fill_value: Any,
    encoded_dtype: np.dtype,
) -> Any:
    """Express ``fill_value`` in the on-disk integer dtype ``encoded_dtype``.

    Parameters
    ----------
    name
        Variable name; only used in the warning message.
    fill_value
        Fill value supplied by the user. Objects with an ``item`` method
        (e.g. numpy scalars) are first converted with ``.item()``.
    encoded_dtype
        Integer dtype the variable is stored as.

    Returns
    -------
    ``encoded_dtype.type(fill_value)`` when that conversion succeeds.
    If it raises ``OverflowError``, the fill value is instead built in the
    same-width dtype of opposite signedness and reinterpreted bit-for-bit as
    ``encoded_dtype``; the result of that path is a native Python scalar.

    Warns
    -----
    SerializationWarning
        When the fallback path is taken.
    """
    try:
        if hasattr(fill_value, "item"):
            # go through a native Python integer so that an out-of-range value
            # is detected instead of being silently cast between numpy types
            fill_value = fill_value.item()
        # succeeds when the provided fill value fits the on-disk type
        return encoded_dtype.type(fill_value)
    except OverflowError:
        stored_as_signed = encoded_dtype.kind == "i"
        encoded_kind_str = "signed" if stored_as_signed else "unsigned"
        warnings.warn(
            f"variable {name!r} will be stored as {encoded_kind_str} integers "
            f"but _FillValue attribute can't be represented as a "
            f"{encoded_kind_str} integer.",
            SerializationWarning,
            stacklevel=3,
        )
        # the fill was most likely given in the in-memory dtype; build it
        # there and view it as the on-disk dtype to match the CF standard
        in_memory_kind = "u" if stored_as_signed else "i"
        in_memory_dtype = np.dtype(f"{in_memory_kind}{encoded_dtype.itemsize}")
        # a view (not a value cast) avoids a second OverflowError
        in_memory_fill = np.array(fill_value, dtype=in_memory_dtype)
        return in_memory_fill.view(encoded_dtype).item()


class CFMaskCoder(VariableCoder):
"""Mask or unmask fill values according to CF conventions."""

def encode(self, variable: Variable, name: T_Name = None):
dims, data, attrs, encoding = unpack_for_encoding(variable)

dtype = np.dtype(encoding.get("dtype", data.dtype))
# from netCDF best practices
# https://docs.unidata.ucar.edu/nug/current/best_practices.html#bp_Unsigned-Data
# "_Unsigned = "true" to indicate that
# integer data should be treated as unsigned"
has_unsigned = encoding.get("_Unsigned") is not None
fv = encoding.get("_FillValue")
mv = encoding.get("missing_value")
# to properly handle _FillValue/missing_value below [a], [b]
# we need to check if unsigned data is written as signed data
unsigned = encoding.get("_Unsigned") is not None
fill_value = None

fv_exists = fv is not None
mv_exists = mv is not None
Expand All @@ -324,23 +393,28 @@ def encode(self, variable: Variable, name: T_Name = None):

if fv_exists:
# Ensure _FillValue is cast to same dtype as data's
# [a] need to skip this if _Unsigned is available
if not unsigned:
encoding["_FillValue"] = dtype.type(fv)
encoding["_FillValue"] = (
_encode_unsigned_fill_value(name, fv, dtype)
if has_unsigned
else dtype.type(fv)
)
fill_value = pop_to(encoding, attrs, "_FillValue", name=name)

if mv_exists:
# try to use _FillValue, if it exists to align both values
# or use missing_value and ensure it's cast to same dtype as data's
# [b] need to provide mv verbatim if _Unsigned is available
encoding["missing_value"] = attrs.get(
"_FillValue",
(dtype.type(mv) if not unsigned else mv),
(
_encode_unsigned_fill_value(name, mv, dtype)
if has_unsigned
else dtype.type(mv)
),
)
fill_value = pop_to(encoding, attrs, "missing_value", name=name)

# apply fillna
if not pd.isnull(fill_value):
if fill_value is not None and not pd.isnull(fill_value):
# special case DateTime to properly handle NaT
if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu":
data = duck_array_ops.where(
Expand All @@ -349,46 +423,63 @@ def encode(self, variable: Variable, name: T_Name = None):
else:
data = duck_array_ops.fillna(data, fill_value)

if fill_value is not None and has_unsigned:
pop_to(encoding, attrs, "_Unsigned")
# XXX: Is this actually needed? Doesn't the backend handle this?
data = duck_array_ops.astype(duck_array_ops.around(data), dtype)
attrs["_FillValue"] = fill_value

return Variable(dims, data, attrs, encoding, fastpath=True)

def decode(self, variable: Variable, name: T_Name = None):
raw_fill_dict, encoded_fill_values = _check_fill_values(
variable.attrs, name, variable.dtype
)
if "_Unsigned" not in variable.attrs and not raw_fill_dict:
return variable

if raw_fill_dict:
dims, data, attrs, encoding = unpack_for_decoding(variable)
[
safe_setitem(encoding, attr, value, name=name)
for attr, value in raw_fill_dict.items()
]

if encoded_fill_values:
# special case DateTime to properly handle NaT
dtype: np.typing.DTypeLike
decoded_fill_value: Any
if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu":
dtype, decoded_fill_value = np.int64, np.iinfo(np.int64).min
dims, data, attrs, encoding = unpack_for_decoding(variable)

# Even if _Unsigned is used, retain on-disk _FillValue
[
safe_setitem(encoding, attr, value, name=name)
for attr, value in raw_fill_dict.items()
]

if "_Unsigned" in attrs:
unsigned = pop_to(attrs, encoding, "_Unsigned")
data = _convert_unsigned_fill_value(
name,
data,
unsigned,
raw_fill_dict.get("_FillValue"),
encoded_fill_values,
)

if encoded_fill_values:
# special case DateTime to properly handle NaT
dtype: np.typing.DTypeLike
decoded_fill_value: Any
if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu":
dtype, decoded_fill_value = np.int64, np.iinfo(np.int64).min
else:
if "scale_factor" not in attrs and "add_offset" not in attrs:
dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype)
else:
if "scale_factor" not in attrs and "add_offset" not in attrs:
dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype)
else:
dtype, decoded_fill_value = (
_choose_float_dtype(data.dtype, attrs),
np.nan,
)
dtype, decoded_fill_value = (
_choose_float_dtype(data.dtype, attrs),
np.nan,
)

transform = partial(
_apply_mask,
encoded_fill_values=encoded_fill_values,
decoded_fill_value=decoded_fill_value,
dtype=dtype,
)
data = lazy_elemwise_func(data, transform, dtype)
transform = partial(
_apply_mask,
encoded_fill_values=encoded_fill_values,
decoded_fill_value=decoded_fill_value,
dtype=dtype,
)
data = lazy_elemwise_func(data, transform, dtype)

return Variable(dims, data, attrs, encoding, fastpath=True)
else:
return variable
return Variable(dims, data, attrs, encoding, fastpath=True)


def _scale_offset_decoding(data, scale_factor, add_offset, dtype: np.typing.DTypeLike):
Expand Down Expand Up @@ -506,74 +597,6 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
return variable


class UnsignedIntegerCoder(VariableCoder):
    """Encode/decode integer variables carrying an ``_Unsigned`` attribute."""

    def encode(self, variable: Variable, name: T_Name = None) -> Variable:
        """Store the variable as signed integers when ``_Unsigned == "true"``.

        Variables whose encoding does not have ``_Unsigned`` set to ``"true"``
        are returned unchanged. Otherwise ``_Unsigned`` is moved from the
        encoding to the attributes, an existing ``_FillValue`` attribute is
        converted to the signed on-disk dtype, and the data is rounded and
        cast to that dtype.
        """
        # from netCDF best practices
        # https://docs.unidata.ucar.edu/nug/current/best_practices.html#bp_Unsigned-Data
        #     "_Unsigned = "true" to indicate that
        #      integer data should be treated as unsigned"
        if variable.encoding.get("_Unsigned", "false") != "true":
            return variable

        dims, data, attrs, encoding = unpack_for_encoding(variable)
        pop_to(encoding, attrs, "_Unsigned")

        # the on-disk type is needed here: take it from the encoding, else
        # fall back to a signed int with the same item size as the data
        on_disk_dtype = np.dtype(encoding.get("dtype", f"i{data.dtype.itemsize}"))

        if "_FillValue" in attrs:
            try:
                # the user provided the on-disk signed fill
                converted_fill = on_disk_dtype.type(attrs["_FillValue"])
            except OverflowError:
                # the user provided the in-memory unsigned fill: build it as
                # unsigned and view it as signed to avoid another OverflowError
                in_memory_dtype = np.dtype(f"u{on_disk_dtype.itemsize}")
                unsigned_fill = np.array(attrs["_FillValue"], dtype=in_memory_dtype)
                converted_fill = unsigned_fill.view(on_disk_dtype).item()
            attrs["_FillValue"] = converted_fill

        rounded = duck_array_ops.around(data)
        data = duck_array_ops.astype(rounded, on_disk_dtype)
        return Variable(dims, data, attrs, encoding, fastpath=True)

    def decode(self, variable: Variable, name: T_Name = None) -> Variable:
        """Reinterpret integer data according to its ``_Unsigned`` attribute.

        Variables without an ``_Unsigned`` attribute are returned unchanged.
        Otherwise the attribute is moved to the encoding; signed data flagged
        ``"true"`` is cast to unsigned and unsigned data flagged ``"false"``
        is cast to signed, with an existing ``_FillValue`` attribute
        converted as well. Non-integer data only triggers a
        ``SerializationWarning``.
        """
        if "_Unsigned" not in variable.attrs:
            return variable

        dims, data, attrs, encoding = unpack_for_decoding(variable)
        flag = pop_to(attrs, encoding, "_Unsigned")
        kind = data.dtype.kind

        if kind == "i":
            if flag == "true":
                target_dtype = np.dtype(f"u{data.dtype.itemsize}")
                cast = partial(np.asarray, dtype=target_dtype)
                if "_FillValue" in attrs:
                    signed_fill = np.array(attrs["_FillValue"], dtype=data.dtype)
                    # view instead of a value cast to prevent OverflowError
                    attrs["_FillValue"] = signed_fill.view(target_dtype).item()
                data = lazy_elemwise_func(data, cast, target_dtype)
        elif kind == "u":
            if flag == "false":
                target_dtype = np.dtype(f"i{data.dtype.itemsize}")
                cast = partial(np.asarray, dtype=target_dtype)
                data = lazy_elemwise_func(data, cast, target_dtype)
                if "_FillValue" in attrs:
                    attrs["_FillValue"] = target_dtype.type(attrs["_FillValue"])
        else:
            warnings.warn(
                f"variable {name!r} has _Unsigned attribute but is not "
                "of integer type. Ignoring attribute.",
                SerializationWarning,
                stacklevel=3,
            )

        return Variable(dims, data, attrs, encoding, fastpath=True)


class DefaultFillvalueCoder(VariableCoder):
"""Encode default _FillValue if needed."""

Expand Down
2 changes: 0 additions & 2 deletions xarray/conventions.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,6 @@ def encode_cf_variable(
times.CFTimedeltaCoder(),
variables.CFScaleOffsetCoder(),
variables.CFMaskCoder(),
variables.UnsignedIntegerCoder(),
variables.NativeEnumCoder(),
variables.NonStringCoder(),
variables.DefaultFillvalueCoder(),
Expand Down Expand Up @@ -279,7 +278,6 @@ def decode_cf_variable(

if mask_and_scale:
for coder in [
variables.UnsignedIntegerCoder(),
variables.CFMaskCoder(),
variables.CFScaleOffsetCoder(),
]:
Expand Down
Loading

0 comments on commit 4ab0679

Please sign in to comment.