diff --git a/doc/api-hidden.rst b/doc/api-hidden.rst index c38b4f2d11c..1900c208532 100644 --- a/doc/api-hidden.rst +++ b/doc/api-hidden.rst @@ -684,7 +684,6 @@ conventions.decode_cf_variables - coding.variables.UnsignedIntegerCoder coding.variables.CFMaskCoder coding.variables.CFScaleOffsetCoder diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 441ddfe7bfd..74916886026 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -261,7 +261,7 @@ def _is_time_like(units): def _check_fill_values(attrs, name, dtype): - """ "Check _FillValue and missing_value if available. + """Check _FillValue and missing_value if available. Return dictionary with raw fill values and set with encoded fill values. @@ -298,6 +298,72 @@ def _check_fill_values(attrs, name, dtype): return raw_fill_dict, encoded_fill_values +def _convert_unsigned_fill_value( + name: T_Name, + data: Any, + unsigned: str, + raw_fill_value: Any, + encoded_fill_values: set, +) -> Any: + if data.dtype.kind == "i": + if unsigned == "true": + unsigned_dtype = np.dtype(f"u{data.dtype.itemsize}") + transform = partial(np.asarray, dtype=unsigned_dtype) + if raw_fill_value is not None: + new_fill = np.array(raw_fill_value, dtype=data.dtype) + encoded_fill_values.remove(raw_fill_value) + # use view here to prevent OverflowError + encoded_fill_values.add(new_fill.view(unsigned_dtype).item()) + data = lazy_elemwise_func(data, transform, unsigned_dtype) + elif data.dtype.kind == "u": + if unsigned == "false": + signed_dtype = np.dtype(f"i{data.dtype.itemsize}") + transform = partial(np.asarray, dtype=signed_dtype) + data = lazy_elemwise_func(data, transform, signed_dtype) + if raw_fill_value is not None: + new_fill = signed_dtype.type(raw_fill_value) + encoded_fill_values.remove(raw_fill_value) + encoded_fill_values.add(new_fill) + else: + warnings.warn( + f"variable {name!r} has _Unsigned attribute but is not " + "of integer type. Ignoring attribute.", + SerializationWarning, + stacklevel=3, + ) + return data + + +def _encode_unsigned_fill_value( + name: T_Name, + fill_value: Any, + encoded_dtype: np.dtype, +) -> Any: + try: + if hasattr(fill_value, "item"): + # if numpy type, convert to python native integer to determine overflow + # otherwise numpy unsigned ints will silently cast to the signed counterpart + fill_value = fill_value.item() + # passes if provided fill value fits in encoded on-disk type + new_fill = encoded_dtype.type(fill_value) + except OverflowError: + encoded_kind_str = "signed" if encoded_dtype.kind == "i" else "unsigned" + warnings.warn( + f"variable {name!r} will be stored as {encoded_kind_str} integers " + f"but _FillValue attribute can't be represented as a " + f"{encoded_kind_str} integer.", + SerializationWarning, + stacklevel=3, + ) + # user probably provided the fill as the in-memory dtype, + # convert to on-disk type to match CF standard + orig_kind = "u" if encoded_dtype.kind == "i" else "i" + orig_dtype = np.dtype(f"{orig_kind}{encoded_dtype.itemsize}") + # use view here to prevent OverflowError + new_fill = np.array(fill_value, dtype=orig_dtype).view(encoded_dtype).item() + return new_fill + + class CFMaskCoder(VariableCoder): """Mask or unmask fill values according to CF conventions.""" @@ -305,11 +371,14 @@ def encode(self, variable: Variable, name: T_Name = None): dims, data, attrs, encoding = unpack_for_encoding(variable) dtype = np.dtype(encoding.get("dtype", data.dtype)) + # from netCDF best practices + # https://docs.unidata.ucar.edu/nug/current/best_practices.html#bp_Unsigned-Data + # "_Unsigned = "true" to indicate that + # integer data should be treated as unsigned" + has_unsigned = encoding.get("_Unsigned") is not None fv = encoding.get("_FillValue") mv = encoding.get("missing_value") - # to properly handle _FillValue/missing_value below [a], [b] - # we need to check if unsigned data is written as signed data - unsigned = encoding.get("_Unsigned") is not None + fill_value = None fv_exists = fv is not None mv_exists = mv is not None @@ -324,23 +393,28 @@ def encode(self, variable: Variable, name: T_Name = None): if fv_exists: # Ensure _FillValue is cast to same dtype as data's - # [a] need to skip this if _Unsigned is available - if not unsigned: - encoding["_FillValue"] = dtype.type(fv) + encoding["_FillValue"] = ( + _encode_unsigned_fill_value(name, fv, dtype) + if has_unsigned + else dtype.type(fv) + ) fill_value = pop_to(encoding, attrs, "_FillValue", name=name) if mv_exists: # try to use _FillValue, if it exists to align both values # or use missing_value and ensure it's cast to same dtype as data's - # [b] need to provide mv verbatim if _Unsigned is available encoding["missing_value"] = attrs.get( "_FillValue", - (dtype.type(mv) if not unsigned else mv), + ( + _encode_unsigned_fill_value(name, mv, dtype) + if has_unsigned + else dtype.type(mv) + ), ) fill_value = pop_to(encoding, attrs, "missing_value", name=name) # apply fillna - if not pd.isnull(fill_value): + if fill_value is not None and not pd.isnull(fill_value): # special case DateTime to properly handle NaT if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu": data = duck_array_ops.where( @@ -349,46 +423,63 @@ def encode(self, variable: Variable, name: T_Name = None): else: data = duck_array_ops.fillna(data, fill_value) + if fill_value is not None and has_unsigned: + pop_to(encoding, attrs, "_Unsigned") + # XXX: Is this actually needed? Doesn't the backend handle this? + data = duck_array_ops.astype(duck_array_ops.around(data), dtype) + attrs["_FillValue"] = fill_value + return Variable(dims, data, attrs, encoding, fastpath=True) def decode(self, variable: Variable, name: T_Name = None): raw_fill_dict, encoded_fill_values = _check_fill_values( variable.attrs, name, variable.dtype ) + if "_Unsigned" not in variable.attrs and not raw_fill_dict: + return variable - if raw_fill_dict: - dims, data, attrs, encoding = unpack_for_decoding(variable) - [ - safe_setitem(encoding, attr, value, name=name) - for attr, value in raw_fill_dict.items() - ] - - if encoded_fill_values: - # special case DateTime to properly handle NaT - dtype: np.typing.DTypeLike - decoded_fill_value: Any - if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu": - dtype, decoded_fill_value = np.int64, np.iinfo(np.int64).min + dims, data, attrs, encoding = unpack_for_decoding(variable) + + # Even if _Unsigned is use, retain on-disk _FillValue + [ + safe_setitem(encoding, attr, value, name=name) + for attr, value in raw_fill_dict.items() + ] + + if "_Unsigned" in attrs: + unsigned = pop_to(attrs, encoding, "_Unsigned") + data = _convert_unsigned_fill_value( + name, + data, + unsigned, + raw_fill_dict.get("_FillValue"), + encoded_fill_values, + ) + + if encoded_fill_values: + # special case DateTime to properly handle NaT + dtype: np.typing.DTypeLike + decoded_fill_value: Any + if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu": + dtype, decoded_fill_value = np.int64, np.iinfo(np.int64).min + else: + if "scale_factor" not in attrs and "add_offset" not in attrs: + dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype) else: - if "scale_factor" not in attrs and "add_offset" not in attrs: - dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype) - else: - dtype, decoded_fill_value = ( - _choose_float_dtype(data.dtype, attrs), - np.nan, - ) + dtype, decoded_fill_value = ( + _choose_float_dtype(data.dtype, attrs), + np.nan, + ) - transform = partial( - _apply_mask, - encoded_fill_values=encoded_fill_values, - decoded_fill_value=decoded_fill_value, - dtype=dtype, - ) - data = lazy_elemwise_func(data, transform, dtype) + transform = partial( + _apply_mask, + encoded_fill_values=encoded_fill_values, + decoded_fill_value=decoded_fill_value, + dtype=dtype, + ) + data = lazy_elemwise_func(data, transform, dtype) - return Variable(dims, data, attrs, encoding, fastpath=True) - else: - return variable + return Variable(dims, data, attrs, encoding, fastpath=True) def _scale_offset_decoding(data, scale_factor, add_offset, dtype: np.typing.DTypeLike): @@ -506,74 +597,6 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable: return variable -class UnsignedIntegerCoder(VariableCoder): - def encode(self, variable: Variable, name: T_Name = None) -> Variable: - # from netCDF best practices - # https://docs.unidata.ucar.edu/nug/current/best_practices.html#bp_Unsigned-Data - # "_Unsigned = "true" to indicate that - # integer data should be treated as unsigned" - if variable.encoding.get("_Unsigned", "false") == "true": - dims, data, attrs, encoding = unpack_for_encoding(variable) - - pop_to(encoding, attrs, "_Unsigned") - # we need the on-disk type here - # trying to get it from encoding, resort to an int with the same precision as data.dtype if not available - signed_dtype = np.dtype(encoding.get("dtype", f"i{data.dtype.itemsize}")) - if "_FillValue" in attrs: - try: - # user provided the on-disk signed fill - new_fill = signed_dtype.type(attrs["_FillValue"]) - except OverflowError: - # user provided the in-memory unsigned fill, convert to signed type - unsigned_dtype = np.dtype(f"u{signed_dtype.itemsize}") - # use view here to prevent OverflowError - new_fill = ( - np.array(attrs["_FillValue"], dtype=unsigned_dtype) - .view(signed_dtype) - .item() - ) - attrs["_FillValue"] = new_fill - data = duck_array_ops.astype(duck_array_ops.around(data), signed_dtype) - - return Variable(dims, data, attrs, encoding, fastpath=True) - else: - return variable - - def decode(self, variable: Variable, name: T_Name = None) -> Variable: - if "_Unsigned" in variable.attrs: - dims, data, attrs, encoding = unpack_for_decoding(variable) - unsigned = pop_to(attrs, encoding, "_Unsigned") - - if data.dtype.kind == "i": - if unsigned == "true": - unsigned_dtype = np.dtype(f"u{data.dtype.itemsize}") - transform = partial(np.asarray, dtype=unsigned_dtype) - if "_FillValue" in attrs: - new_fill = np.array(attrs["_FillValue"], dtype=data.dtype) - # use view here to prevent OverflowError - attrs["_FillValue"] = new_fill.view(unsigned_dtype).item() - data = lazy_elemwise_func(data, transform, unsigned_dtype) - elif data.dtype.kind == "u": - if unsigned == "false": - signed_dtype = np.dtype(f"i{data.dtype.itemsize}") - transform = partial(np.asarray, dtype=signed_dtype) - data = lazy_elemwise_func(data, transform, signed_dtype) - if "_FillValue" in attrs: - new_fill = signed_dtype.type(attrs["_FillValue"]) - attrs["_FillValue"] = new_fill - else: - warnings.warn( - f"variable {name!r} has _Unsigned attribute but is not " - "of integer type. Ignoring attribute.", - SerializationWarning, - stacklevel=3, - ) - - return Variable(dims, data, attrs, encoding, fastpath=True) - else: - return variable - - class DefaultFillvalueCoder(VariableCoder): """Encode default _FillValue if needed.""" diff --git a/xarray/conventions.py b/xarray/conventions.py index d572b215d2d..18a81938225 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -187,7 +187,6 @@ def encode_cf_variable( times.CFTimedeltaCoder(), variables.CFScaleOffsetCoder(), variables.CFMaskCoder(), - variables.UnsignedIntegerCoder(), variables.NativeEnumCoder(), variables.NonStringCoder(), variables.DefaultFillvalueCoder(), @@ -279,7 +278,6 @@ def decode_cf_variable( if mask_and_scale: for coder in [ - variables.UnsignedIntegerCoder(), variables.CFMaskCoder(), variables.CFScaleOffsetCoder(), ]: diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index bbe48663c1f..c755924f583 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -54,6 +54,7 @@ from xarray.conventions import encode_dataset_coordinates from xarray.core import indexing from xarray.core.options import set_options +from xarray.core.utils import module_available from xarray.namedarray.pycompat import array_type from xarray.tests import ( assert_allclose, @@ -166,7 +167,7 @@ def create_encoded_masked_and_scaled_data(dtype: np.dtype) -> Dataset: def create_unsigned_masked_scaled_data(dtype: np.dtype) -> Dataset: encoding = { - "_FillValue": 255, + "_FillValue": -1, "_Unsigned": "true", "dtype": "i1", "add_offset": dtype.type(10), @@ -242,6 +243,32 @@ def create_encoded_signed_masked_scaled_data(dtype: np.dtype) -> Dataset: return Dataset({"x": ("t", sb, attributes)}) +def create_unsigned_false_masked_scaled_data(dtype: np.dtype) -> Dataset: + encoding = { + "_FillValue": 255, + "_Unsigned": "false", + "dtype": "u1", + "add_offset": dtype.type(10), + "scale_factor": dtype.type(0.1), + } + x = np.array([-1.0, 10.1, 22.7, np.nan], dtype=dtype) + return Dataset({"x": ("t", x, {}, encoding)}) + + +def create_encoded_unsigned_false_masked_scaled_data(dtype: np.dtype) -> Dataset: + # These are values as written to the file: the _FillValue will + # be represented in the unsigned form. + attributes = { + "_FillValue": 255, + "_Unsigned": "false", + "add_offset": dtype.type(10), + "scale_factor": dtype.type(0.1), + } + # Create unsigned data corresponding to [-110, 1, 127, 255] signed + sb = np.asarray([146, 1, 127, 255], dtype="u1") + return Dataset({"x": ("t", sb, attributes)}) + + def create_boolean_data() -> Dataset: attributes = {"units": "-"} return Dataset({"x": ("t", [True, False, False, True], attributes)}) @@ -890,6 +917,10 @@ def test_roundtrip_empty_vlen_string_array(self) -> None: create_signed_masked_scaled_data, create_encoded_signed_masked_scaled_data, ), + ( + create_unsigned_false_masked_scaled_data, + create_encoded_unsigned_false_masked_scaled_data, + ), (create_masked_and_scaled_data, create_encoded_masked_and_scaled_data), ], ) @@ -899,9 +930,21 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn, dtype) -> None: pytest.skip("float32 will be treated as float64 in zarr") decoded = decoded_fn(dtype) encoded = encoded_fn(dtype) + if decoded["x"].encoding["dtype"] == "u1" and not ( + self.engine == "netcdf4" + and self.file_format is None + or self.file_format == "NETCDF4" + ): + pytest.skip("uint8 data can't be written to non-NetCDF4 data") + with self.roundtrip(decoded) as actual: for k in decoded.variables: assert decoded.variables[k].dtype == actual.variables[k].dtype + # CF _FillValue is always on-disk type + assert ( + decoded.variables[k].encoding["_FillValue"] + == actual.variables[k].encoding["_FillValue"] + ) assert_allclose(decoded, actual, decode_bytes=False) with self.roundtrip(decoded, open_kwargs=dict(decode_cf=False)) as actual: @@ -909,11 +952,21 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn, dtype) -> None: # encode. Is that something we want to test for? for k in encoded.variables: assert encoded.variables[k].dtype == actual.variables[k].dtype + # CF _FillValue is always on-disk type + assert ( + decoded.variables[k].encoding["_FillValue"] + == actual.variables[k].attrs["_FillValue"] + ) assert_allclose(encoded, actual, decode_bytes=False) with self.roundtrip(encoded, open_kwargs=dict(decode_cf=False)) as actual: for k in encoded.variables: assert encoded.variables[k].dtype == actual.variables[k].dtype + # CF _FillValue is always on-disk type + assert ( + encoded.variables[k].attrs["_FillValue"] + == actual.variables[k].attrs["_FillValue"] + ) assert_allclose(encoded, actual, decode_bytes=False) # make sure roundtrip encoding didn't change the @@ -925,11 +978,33 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn, dtype) -> None: assert decoded.variables[k].dtype == actual.variables[k].dtype assert_allclose(decoded, actual, decode_bytes=False) - @pytest.mark.parametrize("fillvalue", [np.int8(-1), np.uint8(255), -1, 255]) - def test_roundtrip_unsigned(self, fillvalue): + @pytest.mark.parametrize( + ("fill_value", "exp_fill_warning"), + [ + (np.int8(-1), False), + (np.uint8(255), True), + (-1, False), + (255, True), + ], + ) + def test_roundtrip_unsigned(self, fill_value, exp_fill_warning): + @contextlib.contextmanager + def _roundtrip_with_warnings(*args, **kwargs): + is_np2 = module_available("numpy", minversion="2.0.0.dev0") + if exp_fill_warning and is_np2: + warn_checker: contextlib.AbstractContextManager = pytest.warns( + SerializationWarning, + match="_FillValue attribute can't be represented", + ) + else: + warn_checker = contextlib.nullcontext() + with warn_checker: + with self.roundtrip(*args, **kwargs) as actual: + yield actual + # regression/numpy2 test for encoding = { - "_FillValue": fillvalue, + "_FillValue": fill_value, "_Unsigned": "true", "dtype": "i1", } @@ -937,21 +1012,32 @@ def test_roundtrip_unsigned(self, fillvalue): decoded = Dataset({"x": ("t", x, {}, encoding)}) attributes = { - "_FillValue": fillvalue, + "_FillValue": fill_value, "_Unsigned": "true", } # Create unsigned data corresponding to [0, 1, 127, 128, 255] unsigned sb = np.asarray([0, 1, 127, -128, -2, -1], dtype="i1") encoded = Dataset({"x": ("t", sb, attributes)}) + unsigned_dtype = np.dtype(f"u{sb.dtype.itemsize}") - with self.roundtrip(decoded) as actual: + with _roundtrip_with_warnings(decoded) as actual: for k in decoded.variables: assert decoded.variables[k].dtype == actual.variables[k].dtype + exp_fv = decoded.variables[k].encoding["_FillValue"] + if exp_fill_warning: + exp_fv = np.array(exp_fv, dtype=unsigned_dtype).view(sb.dtype) + assert exp_fv == actual.variables[k].encoding["_FillValue"] assert_allclose(decoded, actual, decode_bytes=False) - with self.roundtrip(decoded, open_kwargs=dict(decode_cf=False)) as actual: + with _roundtrip_with_warnings( + decoded, open_kwargs=dict(decode_cf=False) + ) as actual: for k in encoded.variables: assert encoded.variables[k].dtype == actual.variables[k].dtype + exp_fv = encoded.variables[k].attrs["_FillValue"] + if exp_fill_warning: + exp_fv = np.array(exp_fv, dtype=unsigned_dtype).view(sb.dtype) + assert exp_fv == actual.variables[k].attrs["_FillValue"] assert_allclose(encoded, actual, decode_bytes=False) @staticmethod diff --git a/xarray/tests/test_coding.py b/xarray/tests/test_coding.py index 6d81d6f5dc8..acb32504948 100644 --- a/xarray/tests/test_coding.py +++ b/xarray/tests/test_coding.py @@ -129,7 +129,7 @@ def test_decode_unsigned_from_signed(bits) -> None: encoded = xr.Variable( ("x",), original_values.astype(signed_dtype), attrs={"_Unsigned": "true"} ) - coder = variables.UnsignedIntegerCoder() + coder = variables.CFMaskCoder() decoded = coder.decode(encoded) assert decoded.dtype == unsigned_dtype assert decoded.values == original_values @@ -143,7 +143,7 @@ def test_decode_signed_from_unsigned(bits) -> None: encoded = xr.Variable( ("x",), original_values.astype(unsigned_dtype), attrs={"_Unsigned": "false"} ) - coder = variables.UnsignedIntegerCoder() + coder = variables.CFMaskCoder() decoded = coder.decode(encoded) assert decoded.dtype == signed_dtype assert decoded.values == original_values