From e1baa93ad669d65032b936b530b666aa9d689f2e Mon Sep 17 00:00:00 2001 From: David Hoese Date: Mon, 22 Jul 2024 14:11:02 -0500 Subject: [PATCH 1/8] Fix small typo in docstring --- xarray/coding/variables.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 8a3afe650f2..fb2fa241773 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -261,7 +261,7 @@ def _is_time_like(units): def _check_fill_values(attrs, name, dtype): - """ "Check _FillValue and missing_value if available. + """Check _FillValue and missing_value if available. Return dictionary with raw fill values and set with encoded fill values. From cae77aae964299bc3cf0be45a9cb9e3166534db2 Mon Sep 17 00:00:00 2001 From: David Hoese Date: Wed, 24 Jul 2024 14:23:55 -0500 Subject: [PATCH 2/8] Combine CF Unsigned and Mask handling --- xarray/coding/variables.py | 224 ++++++++++++++++++---------------- xarray/conventions.py | 2 - xarray/tests/test_backends.py | 61 +++++++-- 3 files changed, 171 insertions(+), 116 deletions(-) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index fb2fa241773..7653b4c4bf9 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -305,11 +305,18 @@ def encode(self, variable: Variable, name: T_Name = None): dims, data, attrs, encoding = unpack_for_encoding(variable) dtype = np.dtype(encoding.get("dtype", data.dtype)) + # from netCDF best practices + # https://docs.unidata.ucar.edu/nug/current/best_practices.html#bp_Unsigned-Data + # "_Unsigned = "true" to indicate that + # integer data should be treated as unsigned" + is_unsigned = encoding.get("_Unsigned", "false") == "true" + # only used for _Unsigned cases + signed_dtype = np.dtype( + encoding.get("dtype", f"i{dtype.itemsize}" if is_unsigned else dtype) + ) fv = encoding.get("_FillValue") mv = encoding.get("missing_value") - # to properly handle _FillValue/missing_value below [a], [b] - # we need to check if unsigned data is written as signed data - unsigned = encoding.get("_Unsigned") is not None + fill_value = None fv_exists = fv is not None mv_exists = mv is not None @@ -324,23 +331,28 @@ def encode(self, variable: Variable, name: T_Name = None): if fv_exists: # Ensure _FillValue is cast to same dtype as data's - # [a] need to skip this if _Unsigned is available - if not unsigned: - encoding["_FillValue"] = dtype.type(fv) + encoding["_FillValue"] = ( + self._encode_unsigned_fill_value(name, fv, signed_dtype) + if is_unsigned + else dtype.type(fv) + ) fill_value = pop_to(encoding, attrs, "_FillValue", name=name) if mv_exists: # try to use _FillValue, if it exists to align both values # or use missing_value and ensure it's cast to same dtype as data's - # [b] need to provide mv verbatim if _Unsigned is available encoding["missing_value"] = attrs.get( "_FillValue", - (dtype.type(mv) if not unsigned else mv), + ( + self._encode_unsigned_fill_value(name, fv, signed_dtype) + if is_unsigned + else dtype.type(mv) + ), ) fill_value = pop_to(encoding, attrs, "missing_value", name=name) # apply fillna - if not pd.isnull(fill_value): + if fill_value is not None and not pd.isnull(fill_value): # special case DateTime to properly handle NaT if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu": data = duck_array_ops.where( @@ -349,46 +361,112 @@ def encode(self, variable: Variable, name: T_Name = None): else: data = duck_array_ops.fillna(data, fill_value) + if fill_value is not None and is_unsigned: + pop_to(encoding, attrs, "_Unsigned") + # XXX: Is this actually needed? Doesn't the backend handle this? + data = duck_array_ops.astype(duck_array_ops.around(data), signed_dtype) + attrs["_FillValue"] = fill_value + return Variable(dims, data, attrs, encoding, fastpath=True) + def _encode_unsigned_fill_value( + self, name: T_Name, fill_value: Any, signed_dtype: np.typing.DTypeLike + ) -> Any: + try: + # user provided the on-disk signed fill + if hasattr(fill_value, "item"): + # if numpy type, convert to python native integer to determine overflow + # otherwise numpy unsigned ints will silently cast to the signed counterpart + fill_value = fill_value.item() + new_fill = signed_dtype.type(fill_value) + except OverflowError: + warnings.warn( + f"variable {name!r} will be stored as signed integers " + f"but _FillValue attribute can't be represented as a " + f"signed integer.", + SerializationWarning, + stacklevel=3, + ) + # user provided the in-memory unsigned fill, convert to signed type + unsigned_dtype = np.dtype(f"u{signed_dtype.itemsize}") + # use view here to prevent OverflowError + new_fill = ( + np.array(fill_value, dtype=unsigned_dtype).view(signed_dtype).item() + ) + return new_fill + def decode(self, variable: Variable, name: T_Name = None): raw_fill_dict, encoded_fill_values = _check_fill_values( variable.attrs, name, variable.dtype ) + if "_Unsigned" not in variable.attrs and not raw_fill_dict: + return variable - if raw_fill_dict: - dims, data, attrs, encoding = unpack_for_decoding(variable) - [ - safe_setitem(encoding, attr, value, name=name) - for attr, value in raw_fill_dict.items() - ] - - if encoded_fill_values: - # special case DateTime to properly handle NaT - dtype: np.typing.DTypeLike - decoded_fill_value: Any - if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu": - dtype, decoded_fill_value = np.int64, np.iinfo(np.int64).min - else: - if "scale_factor" not in attrs and "add_offset" not in attrs: - dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype) - else: - dtype, decoded_fill_value = ( - _choose_float_dtype(data.dtype, attrs), - np.nan, - ) + dims, data, attrs, encoding = unpack_for_decoding(variable) + + # dims, data, attrs, encoding = unpack_for_decoding(variable) + # Even if _Unsigned is use, retain on-disk _FillValue + [ + safe_setitem(encoding, attr, value, name=name) + for attr, value in raw_fill_dict.items() + ] - transform = partial( - _apply_mask, - encoded_fill_values=encoded_fill_values, - decoded_fill_value=decoded_fill_value, - dtype=dtype, + if "_Unsigned" in attrs: + unsigned = pop_to(attrs, encoding, "_Unsigned") + + if data.dtype.kind == "i": + if unsigned == "true": + unsigned_dtype = np.dtype(f"u{data.dtype.itemsize}") + transform = partial(np.asarray, dtype=unsigned_dtype) + if "_FillValue" in raw_fill_dict: + new_fill = np.array( + raw_fill_dict["_FillValue"], dtype=data.dtype + ) + encoded_fill_values.remove(raw_fill_dict["_FillValue"]) + # use view here to prevent OverflowError + encoded_fill_values.add(new_fill.view(unsigned_dtype).item()) + data = lazy_elemwise_func(data, transform, unsigned_dtype) + elif data.dtype.kind == "u": + if unsigned == "false": + signed_dtype = np.dtype(f"i{data.dtype.itemsize}") + transform = partial(np.asarray, dtype=signed_dtype) + data = lazy_elemwise_func(data, transform, signed_dtype) + if "_FillValue" in raw_fill_dict: + new_fill = signed_dtype.type(raw_fill_dict["_FillValue"]) + encoded_fill_values.remove(raw_fill_dict["_FillValue"]) + encoded_fill_values.add(new_fill) + else: + warnings.warn( + f"variable {name!r} has _Unsigned attribute but is not " + "of integer type. Ignoring attribute.", + SerializationWarning, + stacklevel=3, ) - data = lazy_elemwise_func(data, transform, dtype) - return Variable(dims, data, attrs, encoding, fastpath=True) - else: - return variable + if encoded_fill_values: + # special case DateTime to properly handle NaT + dtype: np.typing.DTypeLike + decoded_fill_value: Any + if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu": + dtype, decoded_fill_value = np.int64, np.iinfo(np.int64).min + else: + if "scale_factor" not in attrs and "add_offset" not in attrs: + dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype) + else: + dtype, decoded_fill_value = ( + _choose_float_dtype(data.dtype, attrs), + np.nan, + ) + + transform = partial( + _apply_mask, + encoded_fill_values=encoded_fill_values, + decoded_fill_value=decoded_fill_value, + dtype=dtype, + ) + data = lazy_elemwise_func(data, transform, dtype) + + return Variable(dims, data, attrs, encoding, fastpath=True) def _scale_offset_decoding(data, scale_factor, add_offset, dtype: np.typing.DTypeLike): @@ -506,74 +584,6 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable: return variable -class UnsignedIntegerCoder(VariableCoder): - def encode(self, variable: Variable, name: T_Name = None) -> Variable: - # from netCDF best practices - # https://docs.unidata.ucar.edu/nug/current/best_practices.html#bp_Unsigned-Data - # "_Unsigned = "true" to indicate that - # integer data should be treated as unsigned" - if variable.encoding.get("_Unsigned", "false") == "true": - dims, data, attrs, encoding = unpack_for_encoding(variable) - - pop_to(encoding, attrs, "_Unsigned") - # we need the on-disk type here - # trying to get it from encoding, resort to an int with the same precision as data.dtype if not available - signed_dtype = np.dtype(encoding.get("dtype", f"i{data.dtype.itemsize}")) - if "_FillValue" in attrs: - try: - # user provided the on-disk signed fill - new_fill = signed_dtype.type(attrs["_FillValue"]) - except OverflowError: - # user provided the in-memory unsigned fill, convert to signed type - unsigned_dtype = np.dtype(f"u{signed_dtype.itemsize}") - # use view here to prevent OverflowError - new_fill = ( - np.array(attrs["_FillValue"], dtype=unsigned_dtype) - .view(signed_dtype) - .item() - ) - attrs["_FillValue"] = new_fill - data = duck_array_ops.astype(duck_array_ops.around(data), signed_dtype) - - return Variable(dims, data, attrs, encoding, fastpath=True) - else: - return variable - - def decode(self, variable: Variable, name: T_Name = None) -> Variable: - if "_Unsigned" in variable.attrs: - dims, data, attrs, encoding = unpack_for_decoding(variable) - unsigned = pop_to(attrs, encoding, "_Unsigned") - - if data.dtype.kind == "i": - if unsigned == "true": - unsigned_dtype = np.dtype(f"u{data.dtype.itemsize}") - transform = partial(np.asarray, dtype=unsigned_dtype) - if "_FillValue" in attrs: - new_fill = np.array(attrs["_FillValue"], dtype=data.dtype) - # use view here to prevent OverflowError - attrs["_FillValue"] = new_fill.view(unsigned_dtype).item() - data = lazy_elemwise_func(data, transform, unsigned_dtype) - elif data.dtype.kind == "u": - if unsigned == "false": - signed_dtype = np.dtype(f"i{data.dtype.itemsize}") - transform = partial(np.asarray, dtype=signed_dtype) - data = lazy_elemwise_func(data, transform, signed_dtype) - if "_FillValue" in attrs: - new_fill = signed_dtype.type(attrs["_FillValue"]) - attrs["_FillValue"] = new_fill - else: - warnings.warn( - f"variable {name!r} has _Unsigned attribute but is not " - "of integer type. Ignoring attribute.", - SerializationWarning, - stacklevel=3, - ) - - return Variable(dims, data, attrs, encoding, fastpath=True) - else: - return variable - - class DefaultFillvalueCoder(VariableCoder): """Encode default _FillValue if needed.""" diff --git a/xarray/conventions.py b/xarray/conventions.py index d572b215d2d..18a81938225 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -187,7 +187,6 @@ def encode_cf_variable( times.CFTimedeltaCoder(), variables.CFScaleOffsetCoder(), variables.CFMaskCoder(), - variables.UnsignedIntegerCoder(), variables.NativeEnumCoder(), variables.NonStringCoder(), variables.DefaultFillvalueCoder(), @@ -279,7 +278,6 @@ def decode_cf_variable( if mask_and_scale: for coder in [ - variables.UnsignedIntegerCoder(), variables.CFMaskCoder(), variables.CFScaleOffsetCoder(), ]: diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 4fa8736427d..f8d2e23616e 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -166,7 +166,7 @@ def create_encoded_masked_and_scaled_data(dtype: np.dtype) -> Dataset: def create_unsigned_masked_scaled_data(dtype: np.dtype) -> Dataset: encoding = { - "_FillValue": 255, + "_FillValue": -1, "_Unsigned": "true", "dtype": "i1", "add_offset": dtype.type(10), @@ -902,6 +902,11 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn, dtype) -> None: with self.roundtrip(decoded) as actual: for k in decoded.variables: assert decoded.variables[k].dtype == actual.variables[k].dtype + # CF _FillValue is always on-disk type + assert ( + decoded.variables[k].encoding["_FillValue"] + == actual.variables[k].encoding["_FillValue"] + ) assert_allclose(decoded, actual, decode_bytes=False) with self.roundtrip(decoded, open_kwargs=dict(decode_cf=False)) as actual: @@ -909,11 +914,21 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn, dtype) -> None: # encode. Is that something we want to test for? for k in encoded.variables: assert encoded.variables[k].dtype == actual.variables[k].dtype + # CF _FillValue is always on-disk type + assert ( + decoded.variables[k].encoding["_FillValue"] + == actual.variables[k].attrs["_FillValue"] + ) assert_allclose(encoded, actual, decode_bytes=False) with self.roundtrip(encoded, open_kwargs=dict(decode_cf=False)) as actual: for k in encoded.variables: assert encoded.variables[k].dtype == actual.variables[k].dtype + # CF _FillValue is always on-disk type + assert ( + encoded.variables[k].attrs["_FillValue"] + == actual.variables[k].attrs["_FillValue"] + ) assert_allclose(encoded, actual, decode_bytes=False) # make sure roundtrip encoding didn't change the @@ -925,11 +940,32 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn, dtype) -> None: assert decoded.variables[k].dtype == actual.variables[k].dtype assert_allclose(decoded, actual, decode_bytes=False) - @pytest.mark.parametrize("fillvalue", [np.int8(-1), np.uint8(255), -1, 255]) - def test_roundtrip_unsigned(self, fillvalue): + @pytest.mark.parametrize( + ("fill_value", "exp_fill_warning"), + [ + (np.int8(-1), False), + (np.uint8(255), True), + (-1, False), + (255, True), + ], + ) + def test_roundtrip_unsigned(self, fill_value, exp_fill_warning): + @contextlib.contextmanager + def _roundtrip_with_warnings(*args, **kwargs): + if exp_fill_warning: + warn_checker = pytest.warns( + SerializationWarning, + match="_FillValue attribute can't be represented", + ) + else: + warn_checker = contextlib.nullcontext() + with warn_checker: + with self.roundtrip(*args, **kwargs) as actual: + yield actual + # regression/numpy2 test for encoding = { - "_FillValue": fillvalue, + "_FillValue": fill_value, "_Unsigned": "true", "dtype": "i1", } @@ -937,21 +973,32 @@ def test_roundtrip_unsigned(self, fillvalue): decoded = Dataset({"x": ("t", x, {}, encoding)}) attributes = { - "_FillValue": fillvalue, + "_FillValue": fill_value, "_Unsigned": "true", } # Create unsigned data corresponding to [0, 1, 127, 128, 255] unsigned sb = np.asarray([0, 1, 127, -128, -2, -1], dtype="i1") encoded = Dataset({"x": ("t", sb, attributes)}) + unsigned_dtype = np.dtype(f"u{sb.dtype.itemsize}") - with self.roundtrip(decoded) as actual: + with _roundtrip_with_warnings(decoded) as actual: for k in decoded.variables: assert decoded.variables[k].dtype == actual.variables[k].dtype + exp_fv = decoded.variables[k].encoding["_FillValue"] + if exp_fill_warning: + exp_fv = np.array(exp_fv, dtype=unsigned_dtype).view(sb.dtype) + assert exp_fv == actual.variables[k].encoding["_FillValue"] assert_allclose(decoded, actual, decode_bytes=False) - with self.roundtrip(decoded, open_kwargs=dict(decode_cf=False)) as actual: + with _roundtrip_with_warnings( + decoded, open_kwargs=dict(decode_cf=False) + ) as actual: for k in encoded.variables: assert encoded.variables[k].dtype == actual.variables[k].dtype + exp_fv = encoded.variables[k].attrs["_FillValue"] + if exp_fill_warning: + exp_fv = np.array(exp_fv, dtype=unsigned_dtype).view(sb.dtype) + assert exp_fv == actual.variables[k].attrs["_FillValue"] assert_allclose(encoded, actual, decode_bytes=False) @staticmethod From a37c5d0126ea19143b53d8c66a8ac2f80fa9464b Mon Sep 17 00:00:00 2001 From: David Hoese Date: Thu, 25 Jul 2024 21:16:51 -0500 Subject: [PATCH 3/8] Replace UnsignedIntegerCode tests with CFMaskCoder usage --- xarray/tests/test_coding.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_coding.py b/xarray/tests/test_coding.py index 6d81d6f5dc8..acb32504948 100644 --- a/xarray/tests/test_coding.py +++ b/xarray/tests/test_coding.py @@ -129,7 +129,7 @@ def test_decode_unsigned_from_signed(bits) -> None: encoded = xr.Variable( ("x",), original_values.astype(signed_dtype), attrs={"_Unsigned": "true"} ) - coder = variables.UnsignedIntegerCoder() + coder = variables.CFMaskCoder() decoded = coder.decode(encoded) assert decoded.dtype == unsigned_dtype assert decoded.values == original_values @@ -143,7 +143,7 @@ def test_decode_signed_from_unsigned(bits) -> None: encoded = xr.Variable( ("x",), original_values.astype(unsigned_dtype), attrs={"_Unsigned": "false"} ) - coder = variables.UnsignedIntegerCoder() + coder = variables.CFMaskCoder() decoded = coder.decode(encoded) assert decoded.dtype == signed_dtype assert decoded.values == original_values From 39b65c5ee5fc0e0cdbef1fb19532123770f0445e Mon Sep 17 00:00:00 2001 From: David Hoese Date: Fri, 26 Jul 2024 10:18:54 -0500 Subject: [PATCH 4/8] Fix dtype type annotation --- xarray/coding/variables.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 7653b4c4bf9..71d28a92c78 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -370,7 +370,10 @@ def encode(self, variable: Variable, name: T_Name = None): return Variable(dims, data, attrs, encoding, fastpath=True) def _encode_unsigned_fill_value( - self, name: T_Name, fill_value: Any, signed_dtype: np.typing.DTypeLike + self, + name: T_Name, + fill_value: Any, + signed_dtype: np.dtype, ) -> Any: try: # user provided the on-disk signed fill From b72ded93828860df74091bd39c9272c0a2ae94a4 Mon Sep 17 00:00:00 2001 From: David Hoese Date: Fri, 26 Jul 2024 10:45:54 -0500 Subject: [PATCH 5/8] Fix when unsigned serialization warning is expected in tests --- xarray/coding/variables.py | 1 - xarray/tests/test_backends.py | 6 ++++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 71d28a92c78..a7751ca37bf 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -407,7 +407,6 @@ def decode(self, variable: Variable, name: T_Name = None): dims, data, attrs, encoding = unpack_for_decoding(variable) - # dims, data, attrs, encoding = unpack_for_decoding(variable) # Even if _Unsigned is use, retain on-disk _FillValue [ safe_setitem(encoding, attr, value, name=name) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index f8d2e23616e..b0ad9b163ad 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -54,6 +54,7 @@ from xarray.conventions import encode_dataset_coordinates from xarray.core import indexing from xarray.core.options import set_options +from xarray.core.utils import module_available from xarray.namedarray.pycompat import array_type from xarray.tests import ( assert_allclose, @@ -952,8 +953,9 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn, dtype) -> None: def test_roundtrip_unsigned(self, fill_value, exp_fill_warning): @contextlib.contextmanager def _roundtrip_with_warnings(*args, **kwargs): - if exp_fill_warning: - warn_checker = pytest.warns( + is_np2 = module_available("numpy", minversion="2.0.0.dev0") + if exp_fill_warning and is_np2: + warn_checker: contextlib.AbstractContextManager = pytest.warns( SerializationWarning, match="_FillValue attribute can't be represented", ) From e6e71e2626504fafff6a430c4f3d11e643ca846b Mon Sep 17 00:00:00 2001 From: David Hoese Date: Fri, 26 Jul 2024 11:20:03 -0500 Subject: [PATCH 6/8] Small refactor of CFMaskCoder decoding --- xarray/coding/variables.py | 74 ++++++++++++++++++++++---------------- 1 file changed, 44 insertions(+), 30 deletions(-) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index a7751ca37bf..1c7e35ebd34 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -298,6 +298,42 @@ def _check_fill_values(attrs, name, dtype): return raw_fill_dict, encoded_fill_values +def _convert_unsigned_fill_value( + name: T_Name, + data: Any, + unsigned: str, + raw_fill_value: Any, + encoded_fill_values: set, +) -> Any: + if data.dtype.kind == "i": + if unsigned == "true": + unsigned_dtype = np.dtype(f"u{data.dtype.itemsize}") + transform = partial(np.asarray, dtype=unsigned_dtype) + if raw_fill_value is not None: + new_fill = np.array(raw_fill_value, dtype=data.dtype) + encoded_fill_values.remove(raw_fill_value) + # use view here to prevent OverflowError + encoded_fill_values.add(new_fill.view(unsigned_dtype).item()) + data = lazy_elemwise_func(data, transform, unsigned_dtype) + elif data.dtype.kind == "u": + if unsigned == "false": + signed_dtype = np.dtype(f"i{data.dtype.itemsize}") + transform = partial(np.asarray, dtype=signed_dtype) + data = lazy_elemwise_func(data, transform, signed_dtype) + if raw_fill_value is not None: + new_fill = signed_dtype.type(raw_fill_value) + encoded_fill_values.remove(raw_fill_value) + encoded_fill_values.add(new_fill) + else: + warnings.warn( + f"variable {name!r} has _Unsigned attribute but is not " + "of integer type. Ignoring attribute.", + SerializationWarning, + stacklevel=3, + ) + return data + + class CFMaskCoder(VariableCoder): """Mask or unmask fill values according to CF conventions.""" @@ -344,7 +380,7 @@ def encode(self, variable: Variable, name: T_Name = None): encoding["missing_value"] = attrs.get( "_FillValue", ( - self._encode_unsigned_fill_value(name, fv, signed_dtype) + self._encode_unsigned_fill_value(name, mv, signed_dtype) if is_unsigned else dtype.type(mv) ), @@ -415,35 +451,13 @@ def decode(self, variable: Variable, name: T_Name = None): if "_Unsigned" in attrs: unsigned = pop_to(attrs, encoding, "_Unsigned") - - if data.dtype.kind == "i": - if unsigned == "true": - unsigned_dtype = np.dtype(f"u{data.dtype.itemsize}") - transform = partial(np.asarray, dtype=unsigned_dtype) - if "_FillValue" in raw_fill_dict: - new_fill = np.array( - raw_fill_dict["_FillValue"], dtype=data.dtype - ) - encoded_fill_values.remove(raw_fill_dict["_FillValue"]) - # use view here to prevent OverflowError - encoded_fill_values.add(new_fill.view(unsigned_dtype).item()) - data = lazy_elemwise_func(data, transform, unsigned_dtype) - elif data.dtype.kind == "u": - if unsigned == "false": - signed_dtype = np.dtype(f"i{data.dtype.itemsize}") - transform = partial(np.asarray, dtype=signed_dtype) - data = lazy_elemwise_func(data, transform, signed_dtype) - if "_FillValue" in raw_fill_dict: - new_fill = signed_dtype.type(raw_fill_dict["_FillValue"]) - encoded_fill_values.remove(raw_fill_dict["_FillValue"]) - encoded_fill_values.add(new_fill) - else: - warnings.warn( - f"variable {name!r} has _Unsigned attribute but is not " - "of integer type. Ignoring attribute.", - SerializationWarning, - stacklevel=3, - ) + data = _convert_unsigned_fill_value( + name, + data, + unsigned, + raw_fill_dict.get("_FillValue"), + encoded_fill_values, + ) if encoded_fill_values: # special case DateTime to properly handle NaT From c9969184513c78696884d2b6e00bc9380fec8875 Mon Sep 17 00:00:00 2001 From: David Hoese Date: Fri, 26 Jul 2024 21:32:36 -0500 Subject: [PATCH 7/8] Add CF encoder tests for _Unsigned=false cases --- xarray/coding/variables.py | 77 +++++++++++++++++------------------ xarray/tests/test_backends.py | 37 +++++++++++++++++ 2 files changed, 74 insertions(+), 40 deletions(-) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 1c7e35ebd34..55c12029dbf 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -334,6 +334,36 @@ def _convert_unsigned_fill_value( return data +def _encode_unsigned_fill_value( + name: T_Name, + fill_value: Any, + encoded_dtype: np.dtype, +) -> Any: + try: + if hasattr(fill_value, "item"): + # if numpy type, convert to python native integer to determine overflow + # otherwise numpy unsigned ints will silently cast to the signed counterpart + fill_value = fill_value.item() + # passes if provided fill value fits in encoded on-disk type + new_fill = encoded_dtype.type(fill_value) + except OverflowError: + encoded_kind_str = "signed" if encoded_dtype.kind == "i" else "unsigned" + warnings.warn( + f"variable {name!r} will be stored as {encoded_kind_str} integers " + f"but _FillValue attribute can't be represented as a " + f"{encoded_kind_str} integer.", + SerializationWarning, + stacklevel=3, + ) + # user probably provided the fill as the in-memory dtype, + # convert to on-disk type to match CF standard + orig_kind = "u" if encoded_dtype.kind == "i" else "i" + orig_dtype = np.dtype(f"{orig_kind}{encoded_dtype.itemsize}") + # use view here to prevent OverflowError + new_fill = np.array(fill_value, dtype=orig_dtype).view(encoded_dtype).item() + return new_fill + + class CFMaskCoder(VariableCoder): """Mask or unmask fill values according to CF conventions.""" @@ -345,11 +375,7 @@ def encode(self, variable: Variable, name: T_Name = None): # https://docs.unidata.ucar.edu/nug/current/best_practices.html#bp_Unsigned-Data # "_Unsigned = "true" to indicate that # integer data should be treated as unsigned" - is_unsigned = encoding.get("_Unsigned", "false") == "true" - # only used for _Unsigned cases - signed_dtype = np.dtype( - encoding.get("dtype", f"i{dtype.itemsize}" if is_unsigned else dtype) - ) + has_unsigned = encoding.get("_Unsigned") is not None fv = encoding.get("_FillValue") mv = encoding.get("missing_value") fill_value = None @@ -368,8 +394,8 @@ def encode(self, variable: Variable, name: T_Name = None): if fv_exists: # Ensure _FillValue is cast to same dtype as data's encoding["_FillValue"] = ( - self._encode_unsigned_fill_value(name, fv, signed_dtype) - if is_unsigned + _encode_unsigned_fill_value(name, fv, dtype) + if has_unsigned else dtype.type(fv) ) fill_value = pop_to(encoding, attrs, "_FillValue", name=name) @@ -380,8 +406,8 @@ def encode(self, variable: Variable, name: T_Name = None): encoding["missing_value"] = attrs.get( "_FillValue", ( - self._encode_unsigned_fill_value(name, mv, signed_dtype) - if is_unsigned + _encode_unsigned_fill_value(name, mv, dtype) + if has_unsigned else dtype.type(mv) ), ) @@ -397,43 +423,14 @@ def encode(self, variable: Variable, name: T_Name = None): else: data = duck_array_ops.fillna(data, fill_value) - if fill_value is not None and is_unsigned: + if fill_value is not None and has_unsigned: pop_to(encoding, attrs, "_Unsigned") # XXX: Is this actually needed? Doesn't the backend handle this? - data = duck_array_ops.astype(duck_array_ops.around(data), signed_dtype) + data = duck_array_ops.astype(duck_array_ops.around(data), dtype) attrs["_FillValue"] = fill_value return Variable(dims, data, attrs, encoding, fastpath=True) - def _encode_unsigned_fill_value( - self, - name: T_Name, - fill_value: Any, - signed_dtype: np.dtype, - ) -> Any: - try: - # user provided the on-disk signed fill - if hasattr(fill_value, "item"): - # if numpy type, convert to python native integer to determine overflow - # otherwise numpy unsigned ints will silently cast to the signed counterpart - fill_value = fill_value.item() - new_fill = signed_dtype.type(fill_value) - except OverflowError: - warnings.warn( - f"variable {name!r} will be stored as signed integers " - f"but _FillValue attribute can't be represented as a " - f"signed integer.", - SerializationWarning, - stacklevel=3, - ) - # user provided the in-memory unsigned fill, convert to signed type - unsigned_dtype = np.dtype(f"u{signed_dtype.itemsize}") - # use view here to prevent OverflowError - new_fill = ( - np.array(fill_value, dtype=unsigned_dtype).view(signed_dtype).item() - ) - return new_fill - def decode(self, variable: Variable, name: T_Name = None): raw_fill_dict, encoded_fill_values = _check_fill_values( variable.attrs, name, variable.dtype diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index b0ad9b163ad..fa9fd4b5bc5 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -243,6 +243,32 @@ def create_encoded_signed_masked_scaled_data(dtype: np.dtype) -> Dataset: return Dataset({"x": ("t", sb, attributes)}) +def create_unsigned_false_masked_scaled_data(dtype: np.dtype) -> Dataset: + encoding = { + "_FillValue": 255, + "_Unsigned": "false", + "dtype": "u1", + "add_offset": dtype.type(10), + "scale_factor": dtype.type(0.1), + } + x = np.array([-1.0, 10.1, 22.7, np.nan], dtype=dtype) + return Dataset({"x": ("t", x, {}, encoding)}) + + +def create_encoded_unsigned_false_masked_scaled_data(dtype: np.dtype) -> Dataset: + # These are values as written to the file: the _FillValue will + # be represented in the unsigned form. + attributes = { + "_FillValue": 255, + "_Unsigned": "false", + "add_offset": dtype.type(10), + "scale_factor": dtype.type(0.1), + } + # Create unsigned data corresponding to [-110, 1, 127, 255] signed + sb = np.asarray([146, 1, 127, 255], dtype="u1") + return Dataset({"x": ("t", sb, attributes)}) + + def create_boolean_data() -> Dataset: attributes = {"units": "-"} return Dataset({"x": ("t", [True, False, False, True], attributes)}) @@ -891,6 +917,10 @@ def test_roundtrip_empty_vlen_string_array(self) -> None: create_signed_masked_scaled_data, create_encoded_signed_masked_scaled_data, ), + ( + create_unsigned_false_masked_scaled_data, + create_encoded_unsigned_false_masked_scaled_data, + ), (create_masked_and_scaled_data, create_encoded_masked_and_scaled_data), ], ) @@ -900,6 +930,13 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn, dtype) -> None: pytest.skip("float32 will be treated as float64 in zarr") decoded = decoded_fn(dtype) encoded = encoded_fn(dtype) + if decoded["x"].encoding["dtype"] == "u1" and not ( + self.engine == "netcdf4" + and self.file_format is None + or self.file_format == "NETCDF4" + ): + pytest.skip("uint8 data can't be written to non-NetCDF4 data") + with self.roundtrip(decoded) as actual: for k in decoded.variables: assert decoded.variables[k].dtype == actual.variables[k].dtype From bdc122ea8399b25899e6ab6e2893567dacc1f3fa Mon Sep 17 00:00:00 2001 From: David Hoese Date: Fri, 2 Aug 2024 10:52:53 -0500 Subject: [PATCH 8/8] Remove UnsignedIntegerCoder from api docs --- doc/api-hidden.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/api-hidden.rst b/doc/api-hidden.rst index c38b4f2d11c..1900c208532 100644 --- a/doc/api-hidden.rst +++ b/doc/api-hidden.rst @@ -684,7 +684,6 @@ conventions.decode_cf_variables - coding.variables.UnsignedIntegerCoder coding.variables.CFMaskCoder coding.variables.CFScaleOffsetCoder