Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Combine UnsignedIntegerCoder and CFMaskCoder #9274

Merged
merged 13 commits into from
Aug 20, 2024
Merged
242 changes: 134 additions & 108 deletions xarray/coding/variables.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,7 @@ def _is_time_like(units):


def _check_fill_values(attrs, name, dtype):
""" "Check _FillValue and missing_value if available.
"""Check _FillValue and missing_value if available.

Return dictionary with raw fill values and set with encoded fill values.

Expand Down Expand Up @@ -298,18 +298,61 @@ def _check_fill_values(attrs, name, dtype):
return raw_fill_dict, encoded_fill_values


def _convert_unsigned_fill_value(
name: T_Name,
data: Any,
unsigned: str,
raw_fill_value: Any,
encoded_fill_values: set,
) -> Any:
if data.dtype.kind == "i":
if unsigned == "true":
unsigned_dtype = np.dtype(f"u{data.dtype.itemsize}")
transform = partial(np.asarray, dtype=unsigned_dtype)
if raw_fill_value is not None:
new_fill = np.array(raw_fill_value, dtype=data.dtype)
encoded_fill_values.remove(raw_fill_value)
# use view here to prevent OverflowError
encoded_fill_values.add(new_fill.view(unsigned_dtype).item())
data = lazy_elemwise_func(data, transform, unsigned_dtype)
elif data.dtype.kind == "u":
if unsigned == "false":
signed_dtype = np.dtype(f"i{data.dtype.itemsize}")
transform = partial(np.asarray, dtype=signed_dtype)
data = lazy_elemwise_func(data, transform, signed_dtype)
if raw_fill_value is not None:
new_fill = signed_dtype.type(raw_fill_value)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This block didn't use @kmuehlbauer's trick for handling overflow by using .view. I'm wondering if I need to use that here, but no tests hit it. I've never actually seen _Unsigned == "false" in the wild.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I change that here then I think I might be able to shrink this function and do things as "old dtype" and "new dtype" rather than signed versus unsigned.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This block didn't use @kmuehlbauer's trick for handling overflow by using .view. I'm wondering if I need to use that here, but no tests hit it. I've never actually seen _Unsigned == "false" in the wild.

That was requested at some point in time for a specific use case. I'll try to dig it up.

Copy link
Contributor Author

@djhoese djhoese Jul 26, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK looking at the tests, it looks like this case is not tested (but I'm triple checking). This set of ifs says that the data on-disk is unsigned, but _Unsigned is false which means the user wants signed data in-memory. The tests for _Unsigned="false" have signed data on-disk and in-memory so no casting/conversion happens.

Edit: Scratch that. test_backends doesn't test it; test_coding does, but it never uses _FillValue. Let's see what I can do.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

xref #4966

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, I thought I was being smart by adding tests for the _Unsigned: "false" case, and now the results are confusing. It made me realize I wasn't handling this case in the encoding step, but it also seems that either it was never handled or I have the wrong impression of how that configuration is supposed to work. The tests added in that PR (@kmuehlbauer) don't include a _FillValue, so I've tried adding one to the backend tests, but now the non-NC4 backends are complaining about converting uint8 to int8. It seems this coercion was added to solve #4014.

My assumption is that if _Unsigned: "false" then the data is saved as uint8 and _FillValue should be uint8. But again, I'm not sure why my new tests are even trying to get to int8. I'll do some more testing later tonight hopefully. I'm not sure if it is better to spend a ton of time getting this small functionality working "as expected" or leave it undefined/untested as it is right now.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So my main concern, and all of your handling in the decode pipeline for casting fill values, turns out not to be an issue anymore, because the raw fill values are already numpy scalars (np.uint8) when they get loaded from the file. Or at least they are for the NetCDF4 cases. numpy is perfectly happy casting uint8 to int8 and back when the values are already numpy scalars. If my new test cases make sense then I think this is fine.

encoded_fill_values.remove(raw_fill_value)
encoded_fill_values.add(new_fill)
else:
warnings.warn(
f"variable {name!r} has _Unsigned attribute but is not "
"of integer type. Ignoring attribute.",
SerializationWarning,
stacklevel=3,
)
return data


class CFMaskCoder(VariableCoder):
"""Mask or unmask fill values according to CF conventions."""

def encode(self, variable: Variable, name: T_Name = None):
dims, data, attrs, encoding = unpack_for_encoding(variable)

dtype = np.dtype(encoding.get("dtype", data.dtype))
# from netCDF best practices
# https://docs.unidata.ucar.edu/nug/current/best_practices.html#bp_Unsigned-Data
# "_Unsigned = "true" to indicate that
# integer data should be treated as unsigned"
is_unsigned = encoding.get("_Unsigned", "false") == "true"
# only used for _Unsigned cases
signed_dtype = np.dtype(
encoding.get("dtype", f"i{dtype.itemsize}" if is_unsigned else dtype)
)
fv = encoding.get("_FillValue")
mv = encoding.get("missing_value")
# to properly handle _FillValue/missing_value below [a], [b]
# we need to check if unsigned data is written as signed data
unsigned = encoding.get("_Unsigned") is not None
fill_value = None

fv_exists = fv is not None
mv_exists = mv is not None
Expand All @@ -324,23 +367,28 @@ def encode(self, variable: Variable, name: T_Name = None):

if fv_exists:
# Ensure _FillValue is cast to same dtype as data's
# [a] need to skip this if _Unsigned is available
if not unsigned:
encoding["_FillValue"] = dtype.type(fv)
encoding["_FillValue"] = (
self._encode_unsigned_fill_value(name, fv, signed_dtype)
if is_unsigned
else dtype.type(fv)
)
fill_value = pop_to(encoding, attrs, "_FillValue", name=name)

if mv_exists:
# try to use _FillValue, if it exists to align both values
# or use missing_value and ensure it's cast to same dtype as data's
# [b] need to provide mv verbatim if _Unsigned is available
encoding["missing_value"] = attrs.get(
"_FillValue",
(dtype.type(mv) if not unsigned else mv),
(
self._encode_unsigned_fill_value(name, mv, signed_dtype)
if is_unsigned
else dtype.type(mv)
),
)
fill_value = pop_to(encoding, attrs, "missing_value", name=name)

# apply fillna
if not pd.isnull(fill_value):
if fill_value is not None and not pd.isnull(fill_value):
# special case DateTime to properly handle NaT
if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu":
data = duck_array_ops.where(
Expand All @@ -349,46 +397,92 @@ def encode(self, variable: Variable, name: T_Name = None):
else:
data = duck_array_ops.fillna(data, fill_value)

if fill_value is not None and is_unsigned:
pop_to(encoding, attrs, "_Unsigned")
# XXX: Is this actually needed? Doesn't the backend handle this?
data = duck_array_ops.astype(duck_array_ops.around(data), signed_dtype)
attrs["_FillValue"] = fill_value

return Variable(dims, data, attrs, encoding, fastpath=True)

def _encode_unsigned_fill_value(
self,
name: T_Name,
fill_value: Any,
signed_dtype: np.dtype,
) -> Any:
try:
# user provided the on-disk signed fill
if hasattr(fill_value, "item"):
# if numpy type, convert to python native integer to determine overflow
# otherwise numpy unsigned ints will silently cast to the signed counterpart
fill_value = fill_value.item()
new_fill = signed_dtype.type(fill_value)
except OverflowError:
warnings.warn(
f"variable {name!r} will be stored as signed integers "
f"but _FillValue attribute can't be represented as a "
f"signed integer.",
SerializationWarning,
stacklevel=3,
)
# user provided the in-memory unsigned fill, convert to signed type
unsigned_dtype = np.dtype(f"u{signed_dtype.itemsize}")
# use view here to prevent OverflowError
new_fill = (
np.array(fill_value, dtype=unsigned_dtype).view(signed_dtype).item()
)
return new_fill

def decode(self, variable: Variable, name: T_Name = None):
raw_fill_dict, encoded_fill_values = _check_fill_values(
variable.attrs, name, variable.dtype
)
if "_Unsigned" not in variable.attrs and not raw_fill_dict:
return variable

if raw_fill_dict:
dims, data, attrs, encoding = unpack_for_decoding(variable)
[
safe_setitem(encoding, attr, value, name=name)
for attr, value in raw_fill_dict.items()
]

if encoded_fill_values:
# special case DateTime to properly handle NaT
dtype: np.typing.DTypeLike
decoded_fill_value: Any
if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu":
dtype, decoded_fill_value = np.int64, np.iinfo(np.int64).min
dims, data, attrs, encoding = unpack_for_decoding(variable)

# Even if _Unsigned is used, retain on-disk _FillValue
[
safe_setitem(encoding, attr, value, name=name)
for attr, value in raw_fill_dict.items()
]

if "_Unsigned" in attrs:
dcherian marked this conversation as resolved.
Show resolved Hide resolved
unsigned = pop_to(attrs, encoding, "_Unsigned")
data = _convert_unsigned_fill_value(
name,
data,
unsigned,
raw_fill_dict.get("_FillValue"),
encoded_fill_values,
)

if encoded_fill_values:
# special case DateTime to properly handle NaT
dtype: np.typing.DTypeLike
decoded_fill_value: Any
if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu":
dtype, decoded_fill_value = np.int64, np.iinfo(np.int64).min
else:
if "scale_factor" not in attrs and "add_offset" not in attrs:
dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype)
else:
if "scale_factor" not in attrs and "add_offset" not in attrs:
dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype)
else:
dtype, decoded_fill_value = (
_choose_float_dtype(data.dtype, attrs),
np.nan,
)
dtype, decoded_fill_value = (
_choose_float_dtype(data.dtype, attrs),
np.nan,
)

transform = partial(
_apply_mask,
encoded_fill_values=encoded_fill_values,
decoded_fill_value=decoded_fill_value,
dtype=dtype,
)
data = lazy_elemwise_func(data, transform, dtype)
transform = partial(
_apply_mask,
encoded_fill_values=encoded_fill_values,
decoded_fill_value=decoded_fill_value,
dtype=dtype,
)
data = lazy_elemwise_func(data, transform, dtype)

return Variable(dims, data, attrs, encoding, fastpath=True)
else:
return variable
return Variable(dims, data, attrs, encoding, fastpath=True)


def _scale_offset_decoding(data, scale_factor, add_offset, dtype: np.typing.DTypeLike):
Expand Down Expand Up @@ -506,74 +600,6 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
return variable


class UnsignedIntegerCoder(VariableCoder):
    """Convert integer data according to the netCDF ``_Unsigned`` attribute.

    ``encode`` stores data flagged with ``_Unsigned = "true"`` using a signed
    dtype; ``decode`` casts the data (and ``_FillValue``) to the signedness
    requested by the ``_Unsigned`` attribute.
    """

    def encode(self, variable: Variable, name: T_Name = None) -> Variable:
        """Cast data and ``_FillValue`` to the signed on-disk dtype.

        The variable is returned untouched unless its encoding has
        ``_Unsigned == "true"``.
        """
        # from netCDF best practices
        # https://docs.unidata.ucar.edu/nug/current/best_practices.html#bp_Unsigned-Data
        #     "_Unsigned = "true" to indicate that
        #      integer data should be treated as unsigned"
        if variable.encoding.get("_Unsigned", "false") != "true":
            return variable

        dims, data, attrs, encoding = unpack_for_encoding(variable)

        pop_to(encoding, attrs, "_Unsigned")
        # we need the on-disk type here
        # trying to get it from encoding, resort to an int with the same
        # precision as data.dtype if not available
        signed_dtype = np.dtype(encoding.get("dtype", f"i{data.dtype.itemsize}"))
        if "_FillValue" in attrs:
            raw_fill = attrs["_FillValue"]
            try:
                # user provided the on-disk signed fill
                new_fill = signed_dtype.type(raw_fill)
            except OverflowError:
                # user provided the in-memory unsigned fill, convert to signed type
                unsigned_dtype = np.dtype(f"u{signed_dtype.itemsize}")
                # use view here to prevent OverflowError
                new_fill = (
                    np.array(raw_fill, dtype=unsigned_dtype).view(signed_dtype).item()
                )
            attrs["_FillValue"] = new_fill
        data = duck_array_ops.astype(duck_array_ops.around(data), signed_dtype)

        return Variable(dims, data, attrs, encoding, fastpath=True)

    def decode(self, variable: Variable, name: T_Name = None) -> Variable:
        """Cast data and ``_FillValue`` to the signedness named by ``_Unsigned``.

        The variable is returned untouched when it has no ``_Unsigned``
        attribute. A ``SerializationWarning`` is emitted (and the data left
        as is) when the attribute is present on non-integer data.
        """
        if "_Unsigned" not in variable.attrs:
            return variable

        dims, data, attrs, encoding = unpack_for_decoding(variable)
        unsigned = pop_to(attrs, encoding, "_Unsigned")
        kind = data.dtype.kind

        if kind == "i":
            if unsigned == "true":
                unsigned_dtype = np.dtype(f"u{data.dtype.itemsize}")
                transform = partial(np.asarray, dtype=unsigned_dtype)
                if "_FillValue" in attrs:
                    new_fill = np.array(attrs["_FillValue"], dtype=data.dtype)
                    # use view here to prevent OverflowError
                    attrs["_FillValue"] = new_fill.view(unsigned_dtype).item()
                data = lazy_elemwise_func(data, transform, unsigned_dtype)
        elif kind == "u":
            if unsigned == "false":
                # remember the on-disk dtype before ``data`` is rebound below
                unsigned_dtype = data.dtype
                signed_dtype = np.dtype(f"i{unsigned_dtype.itemsize}")
                transform = partial(np.asarray, dtype=signed_dtype)
                data = lazy_elemwise_func(data, transform, signed_dtype)
                if "_FillValue" in attrs:
                    raw_fill = attrs["_FillValue"]
                    try:
                        new_fill = signed_dtype.type(raw_fill)
                    except OverflowError:
                        # A plain Python int above the signed maximum cannot
                        # be converted directly; reinterpret the unsigned bit
                        # pattern instead, mirroring the signed -> unsigned
                        # branch above. ``[()]`` keeps a numpy scalar so the
                        # result type matches the non-overflow path.
                        new_fill = np.array(raw_fill, dtype=unsigned_dtype).view(
                            signed_dtype
                        )[()]
                    attrs["_FillValue"] = new_fill
        else:
            warnings.warn(
                f"variable {name!r} has _Unsigned attribute but is not "
                "of integer type. Ignoring attribute.",
                SerializationWarning,
                stacklevel=3,
            )

        return Variable(dims, data, attrs, encoding, fastpath=True)


class DefaultFillvalueCoder(VariableCoder):
"""Encode default _FillValue if needed."""

Expand Down
2 changes: 0 additions & 2 deletions xarray/conventions.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,6 @@ def encode_cf_variable(
times.CFTimedeltaCoder(),
variables.CFScaleOffsetCoder(),
variables.CFMaskCoder(),
variables.UnsignedIntegerCoder(),
variables.NativeEnumCoder(),
variables.NonStringCoder(),
variables.DefaultFillvalueCoder(),
Expand Down Expand Up @@ -279,7 +278,6 @@ def decode_cf_variable(

if mask_and_scale:
for coder in [
variables.UnsignedIntegerCoder(),
variables.CFMaskCoder(),
variables.CFScaleOffsetCoder(),
]:
Expand Down
Loading
Loading