From e1baa93ad669d65032b936b530b666aa9d689f2e Mon Sep 17 00:00:00 2001
From: David Hoese <david.hoese@ssec.wisc.edu>
Date: Mon, 22 Jul 2024 14:11:02 -0500
Subject: [PATCH 1/8] Fix small typo in docstring

---
 xarray/coding/variables.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py
index 8a3afe650f2..fb2fa241773 100644
--- a/xarray/coding/variables.py
+++ b/xarray/coding/variables.py
@@ -261,7 +261,7 @@ def _is_time_like(units):
 
 
 def _check_fill_values(attrs, name, dtype):
-    """ "Check _FillValue and missing_value if available.
+    """Check _FillValue and missing_value if available.
 
     Return dictionary with raw fill values and set with encoded fill values.
 

From cae77aae964299bc3cf0be45a9cb9e3166534db2 Mon Sep 17 00:00:00 2001
From: David Hoese <david.hoese@ssec.wisc.edu>
Date: Wed, 24 Jul 2024 14:23:55 -0500
Subject: [PATCH 2/8] Combine CF Unsigned and Mask handling

---
 xarray/coding/variables.py    | 224 ++++++++++++++++++----------------
 xarray/conventions.py         |   2 -
 xarray/tests/test_backends.py |  61 +++++++--
 3 files changed, 171 insertions(+), 116 deletions(-)

diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py
index fb2fa241773..7653b4c4bf9 100644
--- a/xarray/coding/variables.py
+++ b/xarray/coding/variables.py
@@ -305,11 +305,18 @@ def encode(self, variable: Variable, name: T_Name = None):
         dims, data, attrs, encoding = unpack_for_encoding(variable)
 
         dtype = np.dtype(encoding.get("dtype", data.dtype))
+        # from netCDF best practices
+        # https://docs.unidata.ucar.edu/nug/current/best_practices.html#bp_Unsigned-Data
+        #     "_Unsigned = "true" to indicate that
+        #      integer data should be treated as unsigned"
+        is_unsigned = encoding.get("_Unsigned", "false") == "true"
+        # only used for _Unsigned cases
+        signed_dtype = np.dtype(
+            encoding.get("dtype", f"i{dtype.itemsize}" if is_unsigned else dtype)
+        )
         fv = encoding.get("_FillValue")
         mv = encoding.get("missing_value")
-        # to properly handle _FillValue/missing_value below [a], [b]
-        # we need to check if unsigned data is written as signed data
-        unsigned = encoding.get("_Unsigned") is not None
+        fill_value = None
 
         fv_exists = fv is not None
         mv_exists = mv is not None
@@ -324,23 +331,28 @@ def encode(self, variable: Variable, name: T_Name = None):
 
         if fv_exists:
             # Ensure _FillValue is cast to same dtype as data's
-            # [a] need to skip this if _Unsigned is available
-            if not unsigned:
-                encoding["_FillValue"] = dtype.type(fv)
+            encoding["_FillValue"] = (
+                self._encode_unsigned_fill_value(name, fv, signed_dtype)
+                if is_unsigned
+                else dtype.type(fv)
+            )
             fill_value = pop_to(encoding, attrs, "_FillValue", name=name)
 
         if mv_exists:
             # try to use _FillValue, if it exists to align both values
             # or use missing_value and ensure it's cast to same dtype as data's
-            # [b] need to provide mv verbatim if _Unsigned is available
             encoding["missing_value"] = attrs.get(
                 "_FillValue",
-                (dtype.type(mv) if not unsigned else mv),
+                (
+                    self._encode_unsigned_fill_value(name, fv, signed_dtype)
+                    if is_unsigned
+                    else dtype.type(mv)
+                ),
             )
             fill_value = pop_to(encoding, attrs, "missing_value", name=name)
 
         # apply fillna
-        if not pd.isnull(fill_value):
+        if fill_value is not None and not pd.isnull(fill_value):
             # special case DateTime to properly handle NaT
             if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu":
                 data = duck_array_ops.where(
@@ -349,46 +361,112 @@ def encode(self, variable: Variable, name: T_Name = None):
             else:
                 data = duck_array_ops.fillna(data, fill_value)
 
+        if fill_value is not None and is_unsigned:
+            pop_to(encoding, attrs, "_Unsigned")
+            # XXX: Is this actually needed? Doesn't the backend handle this?
+            data = duck_array_ops.astype(duck_array_ops.around(data), signed_dtype)
+            attrs["_FillValue"] = fill_value
+
         return Variable(dims, data, attrs, encoding, fastpath=True)
 
+    def _encode_unsigned_fill_value(
+        self, name: T_Name, fill_value: Any, signed_dtype: np.typing.DTypeLike
+    ) -> Any:
+        try:
+            # user provided the on-disk signed fill
+            if hasattr(fill_value, "item"):
+                # if numpy type, convert to python native integer to determine overflow
+                # otherwise numpy unsigned ints will silently cast to the signed counterpart
+                fill_value = fill_value.item()
+            new_fill = signed_dtype.type(fill_value)
+        except OverflowError:
+            warnings.warn(
+                f"variable {name!r} will be stored as signed integers "
+                f"but _FillValue attribute can't be represented as a "
+                f"signed integer.",
+                SerializationWarning,
+                stacklevel=3,
+            )
+            # user provided the in-memory unsigned fill, convert to signed type
+            unsigned_dtype = np.dtype(f"u{signed_dtype.itemsize}")
+            # use view here to prevent OverflowError
+            new_fill = (
+                np.array(fill_value, dtype=unsigned_dtype).view(signed_dtype).item()
+            )
+        return new_fill
+
     def decode(self, variable: Variable, name: T_Name = None):
         raw_fill_dict, encoded_fill_values = _check_fill_values(
             variable.attrs, name, variable.dtype
         )
+        if "_Unsigned" not in variable.attrs and not raw_fill_dict:
+            return variable
 
-        if raw_fill_dict:
-            dims, data, attrs, encoding = unpack_for_decoding(variable)
-            [
-                safe_setitem(encoding, attr, value, name=name)
-                for attr, value in raw_fill_dict.items()
-            ]
-
-            if encoded_fill_values:
-                # special case DateTime to properly handle NaT
-                dtype: np.typing.DTypeLike
-                decoded_fill_value: Any
-                if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu":
-                    dtype, decoded_fill_value = np.int64, np.iinfo(np.int64).min
-                else:
-                    if "scale_factor" not in attrs and "add_offset" not in attrs:
-                        dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype)
-                    else:
-                        dtype, decoded_fill_value = (
-                            _choose_float_dtype(data.dtype, attrs),
-                            np.nan,
-                        )
+        dims, data, attrs, encoding = unpack_for_decoding(variable)
+
+        # dims, data, attrs, encoding = unpack_for_decoding(variable)
+        # Even if _Unsigned is use, retain on-disk _FillValue
+        [
+            safe_setitem(encoding, attr, value, name=name)
+            for attr, value in raw_fill_dict.items()
+        ]
 
-                transform = partial(
-                    _apply_mask,
-                    encoded_fill_values=encoded_fill_values,
-                    decoded_fill_value=decoded_fill_value,
-                    dtype=dtype,
+        if "_Unsigned" in attrs:
+            unsigned = pop_to(attrs, encoding, "_Unsigned")
+
+            if data.dtype.kind == "i":
+                if unsigned == "true":
+                    unsigned_dtype = np.dtype(f"u{data.dtype.itemsize}")
+                    transform = partial(np.asarray, dtype=unsigned_dtype)
+                    if "_FillValue" in raw_fill_dict:
+                        new_fill = np.array(
+                            raw_fill_dict["_FillValue"], dtype=data.dtype
+                        )
+                        encoded_fill_values.remove(raw_fill_dict["_FillValue"])
+                        # use view here to prevent OverflowError
+                        encoded_fill_values.add(new_fill.view(unsigned_dtype).item())
+                    data = lazy_elemwise_func(data, transform, unsigned_dtype)
+            elif data.dtype.kind == "u":
+                if unsigned == "false":
+                    signed_dtype = np.dtype(f"i{data.dtype.itemsize}")
+                    transform = partial(np.asarray, dtype=signed_dtype)
+                    data = lazy_elemwise_func(data, transform, signed_dtype)
+                    if "_FillValue" in raw_fill_dict:
+                        new_fill = signed_dtype.type(raw_fill_dict["_FillValue"])
+                        encoded_fill_values.remove(raw_fill_dict["_FillValue"])
+                        encoded_fill_values.add(new_fill)
+            else:
+                warnings.warn(
+                    f"variable {name!r} has _Unsigned attribute but is not "
+                    "of integer type. Ignoring attribute.",
+                    SerializationWarning,
+                    stacklevel=3,
                 )
-                data = lazy_elemwise_func(data, transform, dtype)
 
-            return Variable(dims, data, attrs, encoding, fastpath=True)
-        else:
-            return variable
+        if encoded_fill_values:
+            # special case DateTime to properly handle NaT
+            dtype: np.typing.DTypeLike
+            decoded_fill_value: Any
+            if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu":
+                dtype, decoded_fill_value = np.int64, np.iinfo(np.int64).min
+            else:
+                if "scale_factor" not in attrs and "add_offset" not in attrs:
+                    dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype)
+                else:
+                    dtype, decoded_fill_value = (
+                        _choose_float_dtype(data.dtype, attrs),
+                        np.nan,
+                    )
+
+            transform = partial(
+                _apply_mask,
+                encoded_fill_values=encoded_fill_values,
+                decoded_fill_value=decoded_fill_value,
+                dtype=dtype,
+            )
+            data = lazy_elemwise_func(data, transform, dtype)
+
+        return Variable(dims, data, attrs, encoding, fastpath=True)
 
 
 def _scale_offset_decoding(data, scale_factor, add_offset, dtype: np.typing.DTypeLike):
@@ -506,74 +584,6 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
             return variable
 
 
-class UnsignedIntegerCoder(VariableCoder):
-    def encode(self, variable: Variable, name: T_Name = None) -> Variable:
-        # from netCDF best practices
-        # https://docs.unidata.ucar.edu/nug/current/best_practices.html#bp_Unsigned-Data
-        #     "_Unsigned = "true" to indicate that
-        #      integer data should be treated as unsigned"
-        if variable.encoding.get("_Unsigned", "false") == "true":
-            dims, data, attrs, encoding = unpack_for_encoding(variable)
-
-            pop_to(encoding, attrs, "_Unsigned")
-            # we need the on-disk type here
-            # trying to get it from encoding, resort to an int with the same precision as data.dtype if not available
-            signed_dtype = np.dtype(encoding.get("dtype", f"i{data.dtype.itemsize}"))
-            if "_FillValue" in attrs:
-                try:
-                    # user provided the on-disk signed fill
-                    new_fill = signed_dtype.type(attrs["_FillValue"])
-                except OverflowError:
-                    # user provided the in-memory unsigned fill, convert to signed type
-                    unsigned_dtype = np.dtype(f"u{signed_dtype.itemsize}")
-                    # use view here to prevent OverflowError
-                    new_fill = (
-                        np.array(attrs["_FillValue"], dtype=unsigned_dtype)
-                        .view(signed_dtype)
-                        .item()
-                    )
-                attrs["_FillValue"] = new_fill
-            data = duck_array_ops.astype(duck_array_ops.around(data), signed_dtype)
-
-            return Variable(dims, data, attrs, encoding, fastpath=True)
-        else:
-            return variable
-
-    def decode(self, variable: Variable, name: T_Name = None) -> Variable:
-        if "_Unsigned" in variable.attrs:
-            dims, data, attrs, encoding = unpack_for_decoding(variable)
-            unsigned = pop_to(attrs, encoding, "_Unsigned")
-
-            if data.dtype.kind == "i":
-                if unsigned == "true":
-                    unsigned_dtype = np.dtype(f"u{data.dtype.itemsize}")
-                    transform = partial(np.asarray, dtype=unsigned_dtype)
-                    if "_FillValue" in attrs:
-                        new_fill = np.array(attrs["_FillValue"], dtype=data.dtype)
-                        # use view here to prevent OverflowError
-                        attrs["_FillValue"] = new_fill.view(unsigned_dtype).item()
-                    data = lazy_elemwise_func(data, transform, unsigned_dtype)
-            elif data.dtype.kind == "u":
-                if unsigned == "false":
-                    signed_dtype = np.dtype(f"i{data.dtype.itemsize}")
-                    transform = partial(np.asarray, dtype=signed_dtype)
-                    data = lazy_elemwise_func(data, transform, signed_dtype)
-                    if "_FillValue" in attrs:
-                        new_fill = signed_dtype.type(attrs["_FillValue"])
-                        attrs["_FillValue"] = new_fill
-            else:
-                warnings.warn(
-                    f"variable {name!r} has _Unsigned attribute but is not "
-                    "of integer type. Ignoring attribute.",
-                    SerializationWarning,
-                    stacklevel=3,
-                )
-
-            return Variable(dims, data, attrs, encoding, fastpath=True)
-        else:
-            return variable
-
-
 class DefaultFillvalueCoder(VariableCoder):
     """Encode default _FillValue if needed."""
 
diff --git a/xarray/conventions.py b/xarray/conventions.py
index d572b215d2d..18a81938225 100644
--- a/xarray/conventions.py
+++ b/xarray/conventions.py
@@ -187,7 +187,6 @@ def encode_cf_variable(
         times.CFTimedeltaCoder(),
         variables.CFScaleOffsetCoder(),
         variables.CFMaskCoder(),
-        variables.UnsignedIntegerCoder(),
         variables.NativeEnumCoder(),
         variables.NonStringCoder(),
         variables.DefaultFillvalueCoder(),
@@ -279,7 +278,6 @@ def decode_cf_variable(
 
     if mask_and_scale:
         for coder in [
-            variables.UnsignedIntegerCoder(),
             variables.CFMaskCoder(),
             variables.CFScaleOffsetCoder(),
         ]:
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index 4fa8736427d..f8d2e23616e 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -166,7 +166,7 @@ def create_encoded_masked_and_scaled_data(dtype: np.dtype) -> Dataset:
 
 def create_unsigned_masked_scaled_data(dtype: np.dtype) -> Dataset:
     encoding = {
-        "_FillValue": 255,
+        "_FillValue": -1,
         "_Unsigned": "true",
         "dtype": "i1",
         "add_offset": dtype.type(10),
@@ -902,6 +902,11 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn, dtype) -> None:
         with self.roundtrip(decoded) as actual:
             for k in decoded.variables:
                 assert decoded.variables[k].dtype == actual.variables[k].dtype
+                # CF _FillValue is always on-disk type
+                assert (
+                    decoded.variables[k].encoding["_FillValue"]
+                    == actual.variables[k].encoding["_FillValue"]
+                )
             assert_allclose(decoded, actual, decode_bytes=False)
 
         with self.roundtrip(decoded, open_kwargs=dict(decode_cf=False)) as actual:
@@ -909,11 +914,21 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn, dtype) -> None:
             # encode.  Is that something we want to test for?
             for k in encoded.variables:
                 assert encoded.variables[k].dtype == actual.variables[k].dtype
+                # CF _FillValue is always on-disk type
+                assert (
+                    decoded.variables[k].encoding["_FillValue"]
+                    == actual.variables[k].attrs["_FillValue"]
+                )
             assert_allclose(encoded, actual, decode_bytes=False)
 
         with self.roundtrip(encoded, open_kwargs=dict(decode_cf=False)) as actual:
             for k in encoded.variables:
                 assert encoded.variables[k].dtype == actual.variables[k].dtype
+                # CF _FillValue is always on-disk type
+                assert (
+                    encoded.variables[k].attrs["_FillValue"]
+                    == actual.variables[k].attrs["_FillValue"]
+                )
             assert_allclose(encoded, actual, decode_bytes=False)
 
         # make sure roundtrip encoding didn't change the
@@ -925,11 +940,32 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn, dtype) -> None:
                 assert decoded.variables[k].dtype == actual.variables[k].dtype
             assert_allclose(decoded, actual, decode_bytes=False)
 
-    @pytest.mark.parametrize("fillvalue", [np.int8(-1), np.uint8(255), -1, 255])
-    def test_roundtrip_unsigned(self, fillvalue):
+    @pytest.mark.parametrize(
+        ("fill_value", "exp_fill_warning"),
+        [
+            (np.int8(-1), False),
+            (np.uint8(255), True),
+            (-1, False),
+            (255, True),
+        ],
+    )
+    def test_roundtrip_unsigned(self, fill_value, exp_fill_warning):
+        @contextlib.contextmanager
+        def _roundtrip_with_warnings(*args, **kwargs):
+            if exp_fill_warning:
+                warn_checker = pytest.warns(
+                    SerializationWarning,
+                    match="_FillValue attribute can't be represented",
+                )
+            else:
+                warn_checker = contextlib.nullcontext()
+            with warn_checker:
+                with self.roundtrip(*args, **kwargs) as actual:
+                    yield actual
+
         # regression/numpy2 test for
         encoding = {
-            "_FillValue": fillvalue,
+            "_FillValue": fill_value,
             "_Unsigned": "true",
             "dtype": "i1",
         }
@@ -937,21 +973,32 @@ def test_roundtrip_unsigned(self, fillvalue):
         decoded = Dataset({"x": ("t", x, {}, encoding)})
 
         attributes = {
-            "_FillValue": fillvalue,
+            "_FillValue": fill_value,
             "_Unsigned": "true",
         }
         # Create unsigned data corresponding to [0, 1, 127, 128, 255] unsigned
         sb = np.asarray([0, 1, 127, -128, -2, -1], dtype="i1")
         encoded = Dataset({"x": ("t", sb, attributes)})
+        unsigned_dtype = np.dtype(f"u{sb.dtype.itemsize}")
 
-        with self.roundtrip(decoded) as actual:
+        with _roundtrip_with_warnings(decoded) as actual:
             for k in decoded.variables:
                 assert decoded.variables[k].dtype == actual.variables[k].dtype
+                exp_fv = decoded.variables[k].encoding["_FillValue"]
+                if exp_fill_warning:
+                    exp_fv = np.array(exp_fv, dtype=unsigned_dtype).view(sb.dtype)
+                assert exp_fv == actual.variables[k].encoding["_FillValue"]
             assert_allclose(decoded, actual, decode_bytes=False)
 
-        with self.roundtrip(decoded, open_kwargs=dict(decode_cf=False)) as actual:
+        with _roundtrip_with_warnings(
+            decoded, open_kwargs=dict(decode_cf=False)
+        ) as actual:
             for k in encoded.variables:
                 assert encoded.variables[k].dtype == actual.variables[k].dtype
+                exp_fv = encoded.variables[k].attrs["_FillValue"]
+                if exp_fill_warning:
+                    exp_fv = np.array(exp_fv, dtype=unsigned_dtype).view(sb.dtype)
+                assert exp_fv == actual.variables[k].attrs["_FillValue"]
             assert_allclose(encoded, actual, decode_bytes=False)
 
     @staticmethod

From a37c5d0126ea19143b53d8c66a8ac2f80fa9464b Mon Sep 17 00:00:00 2001
From: David Hoese <david.hoese@ssec.wisc.edu>
Date: Thu, 25 Jul 2024 21:16:51 -0500
Subject: [PATCH 3/8] Replace UnsignedIntegerCode tests with CFMaskCoder usage

---
 xarray/tests/test_coding.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/xarray/tests/test_coding.py b/xarray/tests/test_coding.py
index 6d81d6f5dc8..acb32504948 100644
--- a/xarray/tests/test_coding.py
+++ b/xarray/tests/test_coding.py
@@ -129,7 +129,7 @@ def test_decode_unsigned_from_signed(bits) -> None:
     encoded = xr.Variable(
         ("x",), original_values.astype(signed_dtype), attrs={"_Unsigned": "true"}
     )
-    coder = variables.UnsignedIntegerCoder()
+    coder = variables.CFMaskCoder()
     decoded = coder.decode(encoded)
     assert decoded.dtype == unsigned_dtype
     assert decoded.values == original_values
@@ -143,7 +143,7 @@ def test_decode_signed_from_unsigned(bits) -> None:
     encoded = xr.Variable(
         ("x",), original_values.astype(unsigned_dtype), attrs={"_Unsigned": "false"}
     )
-    coder = variables.UnsignedIntegerCoder()
+    coder = variables.CFMaskCoder()
     decoded = coder.decode(encoded)
     assert decoded.dtype == signed_dtype
     assert decoded.values == original_values

From 39b65c5ee5fc0e0cdbef1fb19532123770f0445e Mon Sep 17 00:00:00 2001
From: David Hoese <david.hoese@ssec.wisc.edu>
Date: Fri, 26 Jul 2024 10:18:54 -0500
Subject: [PATCH 4/8] Fix dtype type annotation

---
 xarray/coding/variables.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py
index 7653b4c4bf9..71d28a92c78 100644
--- a/xarray/coding/variables.py
+++ b/xarray/coding/variables.py
@@ -370,7 +370,10 @@ def encode(self, variable: Variable, name: T_Name = None):
         return Variable(dims, data, attrs, encoding, fastpath=True)
 
     def _encode_unsigned_fill_value(
-        self, name: T_Name, fill_value: Any, signed_dtype: np.typing.DTypeLike
+        self,
+        name: T_Name,
+        fill_value: Any,
+        signed_dtype: np.dtype,
     ) -> Any:
         try:
             # user provided the on-disk signed fill

From b72ded93828860df74091bd39c9272c0a2ae94a4 Mon Sep 17 00:00:00 2001
From: David Hoese <david.hoese@ssec.wisc.edu>
Date: Fri, 26 Jul 2024 10:45:54 -0500
Subject: [PATCH 5/8] Fix when unsigned serialization warning is expected in
 tests

---
 xarray/coding/variables.py    | 1 -
 xarray/tests/test_backends.py | 6 ++++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py
index 71d28a92c78..a7751ca37bf 100644
--- a/xarray/coding/variables.py
+++ b/xarray/coding/variables.py
@@ -407,7 +407,6 @@ def decode(self, variable: Variable, name: T_Name = None):
 
         dims, data, attrs, encoding = unpack_for_decoding(variable)
 
-        # dims, data, attrs, encoding = unpack_for_decoding(variable)
         # Even if _Unsigned is use, retain on-disk _FillValue
         [
             safe_setitem(encoding, attr, value, name=name)
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index f8d2e23616e..b0ad9b163ad 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -54,6 +54,7 @@
 from xarray.conventions import encode_dataset_coordinates
 from xarray.core import indexing
 from xarray.core.options import set_options
+from xarray.core.utils import module_available
 from xarray.namedarray.pycompat import array_type
 from xarray.tests import (
     assert_allclose,
@@ -952,8 +953,9 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn, dtype) -> None:
     def test_roundtrip_unsigned(self, fill_value, exp_fill_warning):
         @contextlib.contextmanager
         def _roundtrip_with_warnings(*args, **kwargs):
-            if exp_fill_warning:
-                warn_checker = pytest.warns(
+            is_np2 = module_available("numpy", minversion="2.0.0.dev0")
+            if exp_fill_warning and is_np2:
+                warn_checker: contextlib.AbstractContextManager = pytest.warns(
                     SerializationWarning,
                     match="_FillValue attribute can't be represented",
                 )

From e6e71e2626504fafff6a430c4f3d11e643ca846b Mon Sep 17 00:00:00 2001
From: David Hoese <david.hoese@ssec.wisc.edu>
Date: Fri, 26 Jul 2024 11:20:03 -0500
Subject: [PATCH 6/8] Small refactor of CFMaskCoder decoding

---
 xarray/coding/variables.py | 74 ++++++++++++++++++++++----------------
 1 file changed, 44 insertions(+), 30 deletions(-)

diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py
index a7751ca37bf..1c7e35ebd34 100644
--- a/xarray/coding/variables.py
+++ b/xarray/coding/variables.py
@@ -298,6 +298,42 @@ def _check_fill_values(attrs, name, dtype):
     return raw_fill_dict, encoded_fill_values
 
 
+def _convert_unsigned_fill_value(
+    name: T_Name,
+    data: Any,
+    unsigned: str,
+    raw_fill_value: Any,
+    encoded_fill_values: set,
+) -> Any:
+    if data.dtype.kind == "i":
+        if unsigned == "true":
+            unsigned_dtype = np.dtype(f"u{data.dtype.itemsize}")
+            transform = partial(np.asarray, dtype=unsigned_dtype)
+            if raw_fill_value is not None:
+                new_fill = np.array(raw_fill_value, dtype=data.dtype)
+                encoded_fill_values.remove(raw_fill_value)
+                # use view here to prevent OverflowError
+                encoded_fill_values.add(new_fill.view(unsigned_dtype).item())
+            data = lazy_elemwise_func(data, transform, unsigned_dtype)
+    elif data.dtype.kind == "u":
+        if unsigned == "false":
+            signed_dtype = np.dtype(f"i{data.dtype.itemsize}")
+            transform = partial(np.asarray, dtype=signed_dtype)
+            data = lazy_elemwise_func(data, transform, signed_dtype)
+            if raw_fill_value is not None:
+                new_fill = signed_dtype.type(raw_fill_value)
+                encoded_fill_values.remove(raw_fill_value)
+                encoded_fill_values.add(new_fill)
+    else:
+        warnings.warn(
+            f"variable {name!r} has _Unsigned attribute but is not "
+            "of integer type. Ignoring attribute.",
+            SerializationWarning,
+            stacklevel=3,
+        )
+    return data
+
+
 class CFMaskCoder(VariableCoder):
     """Mask or unmask fill values according to CF conventions."""
 
@@ -344,7 +380,7 @@ def encode(self, variable: Variable, name: T_Name = None):
             encoding["missing_value"] = attrs.get(
                 "_FillValue",
                 (
-                    self._encode_unsigned_fill_value(name, fv, signed_dtype)
+                    self._encode_unsigned_fill_value(name, mv, signed_dtype)
                     if is_unsigned
                     else dtype.type(mv)
                 ),
@@ -415,35 +451,13 @@ def decode(self, variable: Variable, name: T_Name = None):
 
         if "_Unsigned" in attrs:
             unsigned = pop_to(attrs, encoding, "_Unsigned")
-
-            if data.dtype.kind == "i":
-                if unsigned == "true":
-                    unsigned_dtype = np.dtype(f"u{data.dtype.itemsize}")
-                    transform = partial(np.asarray, dtype=unsigned_dtype)
-                    if "_FillValue" in raw_fill_dict:
-                        new_fill = np.array(
-                            raw_fill_dict["_FillValue"], dtype=data.dtype
-                        )
-                        encoded_fill_values.remove(raw_fill_dict["_FillValue"])
-                        # use view here to prevent OverflowError
-                        encoded_fill_values.add(new_fill.view(unsigned_dtype).item())
-                    data = lazy_elemwise_func(data, transform, unsigned_dtype)
-            elif data.dtype.kind == "u":
-                if unsigned == "false":
-                    signed_dtype = np.dtype(f"i{data.dtype.itemsize}")
-                    transform = partial(np.asarray, dtype=signed_dtype)
-                    data = lazy_elemwise_func(data, transform, signed_dtype)
-                    if "_FillValue" in raw_fill_dict:
-                        new_fill = signed_dtype.type(raw_fill_dict["_FillValue"])
-                        encoded_fill_values.remove(raw_fill_dict["_FillValue"])
-                        encoded_fill_values.add(new_fill)
-            else:
-                warnings.warn(
-                    f"variable {name!r} has _Unsigned attribute but is not "
-                    "of integer type. Ignoring attribute.",
-                    SerializationWarning,
-                    stacklevel=3,
-                )
+            data = _convert_unsigned_fill_value(
+                name,
+                data,
+                unsigned,
+                raw_fill_dict.get("_FillValue"),
+                encoded_fill_values,
+            )
 
         if encoded_fill_values:
             # special case DateTime to properly handle NaT

From c9969184513c78696884d2b6e00bc9380fec8875 Mon Sep 17 00:00:00 2001
From: David Hoese <david.hoese@ssec.wisc.edu>
Date: Fri, 26 Jul 2024 21:32:36 -0500
Subject: [PATCH 7/8] Add CF encoder tests for _Unsigned=false cases

---
 xarray/coding/variables.py    | 77 +++++++++++++++++------------------
 xarray/tests/test_backends.py | 37 +++++++++++++++++
 2 files changed, 74 insertions(+), 40 deletions(-)

diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py
index 1c7e35ebd34..55c12029dbf 100644
--- a/xarray/coding/variables.py
+++ b/xarray/coding/variables.py
@@ -334,6 +334,36 @@ def _convert_unsigned_fill_value(
     return data
 
 
+def _encode_unsigned_fill_value(
+    name: T_Name,
+    fill_value: Any,
+    encoded_dtype: np.dtype,
+) -> Any:
+    try:
+        if hasattr(fill_value, "item"):
+            # if numpy type, convert to python native integer to determine overflow
+            # otherwise numpy unsigned ints will silently cast to the signed counterpart
+            fill_value = fill_value.item()
+        # passes if provided fill value fits in encoded on-disk type
+        new_fill = encoded_dtype.type(fill_value)
+    except OverflowError:
+        encoded_kind_str = "signed" if encoded_dtype.kind == "i" else "unsigned"
+        warnings.warn(
+            f"variable {name!r} will be stored as {encoded_kind_str} integers "
+            f"but _FillValue attribute can't be represented as a "
+            f"{encoded_kind_str} integer.",
+            SerializationWarning,
+            stacklevel=3,
+        )
+        # user probably provided the fill as the in-memory dtype,
+        # convert to on-disk type to match CF standard
+        orig_kind = "u" if encoded_dtype.kind == "i" else "i"
+        orig_dtype = np.dtype(f"{orig_kind}{encoded_dtype.itemsize}")
+        # use view here to prevent OverflowError
+        new_fill = np.array(fill_value, dtype=orig_dtype).view(encoded_dtype).item()
+    return new_fill
+
+
 class CFMaskCoder(VariableCoder):
     """Mask or unmask fill values according to CF conventions."""
 
@@ -345,11 +375,7 @@ def encode(self, variable: Variable, name: T_Name = None):
         # https://docs.unidata.ucar.edu/nug/current/best_practices.html#bp_Unsigned-Data
         #     "_Unsigned = "true" to indicate that
         #      integer data should be treated as unsigned"
-        is_unsigned = encoding.get("_Unsigned", "false") == "true"
-        # only used for _Unsigned cases
-        signed_dtype = np.dtype(
-            encoding.get("dtype", f"i{dtype.itemsize}" if is_unsigned else dtype)
-        )
+        has_unsigned = encoding.get("_Unsigned") is not None
         fv = encoding.get("_FillValue")
         mv = encoding.get("missing_value")
         fill_value = None
@@ -368,8 +394,8 @@ def encode(self, variable: Variable, name: T_Name = None):
         if fv_exists:
             # Ensure _FillValue is cast to same dtype as data's
             encoding["_FillValue"] = (
-                self._encode_unsigned_fill_value(name, fv, signed_dtype)
-                if is_unsigned
+                _encode_unsigned_fill_value(name, fv, dtype)
+                if has_unsigned
                 else dtype.type(fv)
             )
             fill_value = pop_to(encoding, attrs, "_FillValue", name=name)
@@ -380,8 +406,8 @@ def encode(self, variable: Variable, name: T_Name = None):
             encoding["missing_value"] = attrs.get(
                 "_FillValue",
                 (
-                    self._encode_unsigned_fill_value(name, mv, signed_dtype)
-                    if is_unsigned
+                    _encode_unsigned_fill_value(name, mv, dtype)
+                    if has_unsigned
                     else dtype.type(mv)
                 ),
             )
@@ -397,43 +423,14 @@ def encode(self, variable: Variable, name: T_Name = None):
             else:
                 data = duck_array_ops.fillna(data, fill_value)
 
-        if fill_value is not None and is_unsigned:
+        if fill_value is not None and has_unsigned:
             pop_to(encoding, attrs, "_Unsigned")
             # XXX: Is this actually needed? Doesn't the backend handle this?
-            data = duck_array_ops.astype(duck_array_ops.around(data), signed_dtype)
+            data = duck_array_ops.astype(duck_array_ops.around(data), dtype)
             attrs["_FillValue"] = fill_value
 
         return Variable(dims, data, attrs, encoding, fastpath=True)
 
-    def _encode_unsigned_fill_value(
-        self,
-        name: T_Name,
-        fill_value: Any,
-        signed_dtype: np.dtype,
-    ) -> Any:
-        try:
-            # user provided the on-disk signed fill
-            if hasattr(fill_value, "item"):
-                # if numpy type, convert to python native integer to determine overflow
-                # otherwise numpy unsigned ints will silently cast to the signed counterpart
-                fill_value = fill_value.item()
-            new_fill = signed_dtype.type(fill_value)
-        except OverflowError:
-            warnings.warn(
-                f"variable {name!r} will be stored as signed integers "
-                f"but _FillValue attribute can't be represented as a "
-                f"signed integer.",
-                SerializationWarning,
-                stacklevel=3,
-            )
-            # user provided the in-memory unsigned fill, convert to signed type
-            unsigned_dtype = np.dtype(f"u{signed_dtype.itemsize}")
-            # use view here to prevent OverflowError
-            new_fill = (
-                np.array(fill_value, dtype=unsigned_dtype).view(signed_dtype).item()
-            )
-        return new_fill
-
     def decode(self, variable: Variable, name: T_Name = None):
         raw_fill_dict, encoded_fill_values = _check_fill_values(
             variable.attrs, name, variable.dtype
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index b0ad9b163ad..fa9fd4b5bc5 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -243,6 +243,32 @@ def create_encoded_signed_masked_scaled_data(dtype: np.dtype) -> Dataset:
     return Dataset({"x": ("t", sb, attributes)})
 
 
+def create_unsigned_false_masked_scaled_data(dtype: np.dtype) -> Dataset:
+    encoding = {
+        "_FillValue": 255,
+        "_Unsigned": "false",
+        "dtype": "u1",
+        "add_offset": dtype.type(10),
+        "scale_factor": dtype.type(0.1),
+    }
+    x = np.array([-1.0, 10.1, 22.7, np.nan], dtype=dtype)
+    return Dataset({"x": ("t", x, {}, encoding)})
+
+
+def create_encoded_unsigned_false_masked_scaled_data(dtype: np.dtype) -> Dataset:
+    # These are values as written to the file: the _FillValue will
+    # be represented in the unsigned form.
+    attributes = {
+        "_FillValue": 255,
+        "_Unsigned": "false",
+        "add_offset": dtype.type(10),
+        "scale_factor": dtype.type(0.1),
+    }
+    # Create unsigned data corresponding to [-110, 1, 127, 255] signed
+    sb = np.asarray([146, 1, 127, 255], dtype="u1")
+    return Dataset({"x": ("t", sb, attributes)})
+
+
 def create_boolean_data() -> Dataset:
     attributes = {"units": "-"}
     return Dataset({"x": ("t", [True, False, False, True], attributes)})
@@ -891,6 +917,10 @@ def test_roundtrip_empty_vlen_string_array(self) -> None:
                 create_signed_masked_scaled_data,
                 create_encoded_signed_masked_scaled_data,
             ),
+            (
+                create_unsigned_false_masked_scaled_data,
+                create_encoded_unsigned_false_masked_scaled_data,
+            ),
             (create_masked_and_scaled_data, create_encoded_masked_and_scaled_data),
         ],
     )
@@ -900,6 +930,13 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn, dtype) -> None:
             pytest.skip("float32 will be treated as float64 in zarr")
         decoded = decoded_fn(dtype)
         encoded = encoded_fn(dtype)
+        if decoded["x"].encoding["dtype"] == "u1" and not (
+            self.engine == "netcdf4"
+            and self.file_format is None
+            or self.file_format == "NETCDF4"
+        ):
+            pytest.skip("uint8 data can't be written to non-NetCDF4 data")
+
         with self.roundtrip(decoded) as actual:
             for k in decoded.variables:
                 assert decoded.variables[k].dtype == actual.variables[k].dtype

From bdc122ea8399b25899e6ab6e2893567dacc1f3fa Mon Sep 17 00:00:00 2001
From: David Hoese <david.hoese@ssec.wisc.edu>
Date: Fri, 2 Aug 2024 10:52:53 -0500
Subject: [PATCH 8/8] Remove UnsignedIntegerCoder from api docs

---
 doc/api-hidden.rst | 1 -
 1 file changed, 1 deletion(-)

diff --git a/doc/api-hidden.rst b/doc/api-hidden.rst
index c38b4f2d11c..1900c208532 100644
--- a/doc/api-hidden.rst
+++ b/doc/api-hidden.rst
@@ -684,7 +684,6 @@
 
    conventions.decode_cf_variables
 
-   coding.variables.UnsignedIntegerCoder
    coding.variables.CFMaskCoder
    coding.variables.CFScaleOffsetCoder