preserve vlen string dtypes, allow vlen string fill_values #7869

Merged · 5 commits · Nov 17, 2023
4 changes: 2 additions & 2 deletions ci/requirements/min-all-deps.yml
@@ -16,11 +16,11 @@ dependencies:
   - dask-core=2022.7
   - distributed=2022.7
   - flox=0.5
-  - h5netcdf=1.0
+  - h5netcdf=1.1
   # h5py and hdf5 tend to cause conflicts
   # for e.g. hdf5 1.12 conflicts with h5py=3.1
   # prioritize bumping other packages instead
-  - h5py=3.6
+  - h5py=3.7
   - hdf5=1.12
   - hypothesis
   - iris=3.2
5 changes: 4 additions & 1 deletion doc/whats-new.rst
@@ -32,20 +32,23 @@ New Features
   By `Sam Levang <https://github.com/slevang>`_.
 - Allow the usage of h5py drivers (eg: ros3) via h5netcdf (:pull:`8360`).
   By `Ezequiel Cimadevilla <https://github.com/zequihg50>`_.
+- Enable VLEN string fill_values, preserve VLEN string dtypes (:issue:`1647`, :issue:`7652`, :issue:`7868`, :pull:`7869`).
+  By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.
 
 Breaking changes
 ~~~~~~~~~~~~~~~~
 - drop support for `cdms2 <https://github.com/CDAT/cdms>`_. Please use
   `xcdat <https://github.com/xCDAT/xcdat>`_ instead (:pull:`8441`).
   By `Justus Magin <https://github.com/keewis>`_.
-
 - Following pandas, :py:meth:`infer_freq` will return ``"Y"``, ``"YS"``,
   ``"QE"``, ``"ME"``, ``"h"``, ``"min"``, ``"s"``, ``"ms"``, ``"us"``, or
   ``"ns"`` instead of ``"A"``, ``"AS"``, ``"Q"``, ``"M"``, ``"H"``, ``"T"``,
   ``"S"``, ``"L"``, ``"U"``, or ``"N"``. This is to be consistent with the
   deprecation of the latter frequency strings (:issue:`8394`, :pull:`8415`). By
   `Spencer Clark <https://github.com/spencerkclark>`_.
 - Bump minimum tested pint version to ``>=0.22``. By `Deepak Cherian <https://github.com/dcherian>`_.
+- Minimum supported versions for the following packages have changed: ``h5py >=3.7``, ``h5netcdf>=1.1``.
+  By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.
 
 Deprecations
 ~~~~~~~~~~~~
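A minimal sketch of what the new whats-new entry means in practice (file name and fill value are illustrative; before this PR the `to_netcdf` call raised `NotImplementedError` on the netCDF4 and h5netcdf backends):

```python
import numpy as np
import xarray as xr

# A variable-length string variable with a missing value.
values = np.array(["ab", "cdef", np.nan], dtype=object)
ds = xr.Dataset({"x": ("t", values)})

# Writing a vlen string dtype together with an explicit _FillValue now works.
ds.to_netcdf("strings.nc", encoding={"x": {"dtype": str, "_FillValue": "XXX"}})

# On read the fill value is masked back to NaN, matching the original data.
roundtripped = xr.open_dataset("strings.nc")
print(roundtripped["x"].values)  # ['ab' 'cdef' nan]
```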
9 changes: 0 additions & 9 deletions xarray/backends/h5netcdf_.py
@@ -271,15 +271,6 @@ def prepare_variable(
         dtype = _get_datatype(variable, raise_on_invalid_encoding=check_encoding)
 
         fillvalue = attrs.pop("_FillValue", None)
-        if dtype is str and fillvalue is not None:
-            raise NotImplementedError(
-                "h5netcdf does not yet support setting a fill value for "
-                "variable-length strings "
-                "(https://github.com/h5netcdf/h5netcdf/issues/37). "
-                f"Either remove '_FillValue' from encoding on variable {name!r} "
-                "or set {'dtype': 'S1'} in encoding to use the fixed width "
-                "NC_CHAR type."
-            )
 
         if dtype is str:
             dtype = h5py.special_dtype(vlen=str)
10 changes: 0 additions & 10 deletions xarray/backends/netCDF4_.py
@@ -490,16 +490,6 @@ def prepare_variable(
 
         fill_value = attrs.pop("_FillValue", None)
 
-        if datatype is str and fill_value is not None:
-            raise NotImplementedError(
-                "netCDF4 does not yet support setting a fill value for "
-                "variable-length strings "
-                "(https://github.com/Unidata/netcdf4-python/issues/730). "
-                f"Either remove '_FillValue' from encoding on variable {name!r} "
-                "or set {'dtype': 'S1'} in encoding to use the fixed width "
-                "NC_CHAR type."
-            )
-
         encoding = _extract_nc4_variable_encoding(
             variable, raise_on_invalid=check_encoding, unlimited_dims=unlimited_dims
         )
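Both backend guards are removed for the same reason: the underlying libraries can now accept a fill value for variable-length strings, so `prepare_variable` simply forwards `_FillValue`. A standalone sketch of what the backends now rely on, assuming a sufficiently recent netCDF4-python (file name illustrative):

```python
import netCDF4

# netCDF4-python accepts a fill value for a vlen (NC_STRING) variable;
# older releases rejected this (Unidata/netcdf4-python#730).
with netCDF4.Dataset("vlen_fill.nc", "w") as nc:
    nc.createDimension("t", 3)
    var = nc.createVariable("x", str, ("t",), fill_value="XXX")
    var[0] = "ab"
    var[1] = "cdef"
    # index 2 is left unset and reads back as the fill value "XXX"
```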
12 changes: 12 additions & 0 deletions xarray/coding/variables.py
@@ -562,3 +562,15 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable:
 
     def decode(self):
         raise NotImplementedError()
+
+
+class ObjectVLenStringCoder(VariableCoder):
+    def encode(self):
+        raise NotImplementedError
+
+    def decode(self, variable: Variable, name: T_Name = None) -> Variable:
+        if variable.dtype == object and variable.encoding.get("dtype", False) == str:
+            variable = variable.astype(variable.encoding["dtype"])
+            return variable
+        else:
+            return variable
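For clarity, a sketch of the new coder in isolation; the import paths reflect this PR's layout (internal API, may move in later releases):

```python
import numpy as np
from xarray.coding.variables import ObjectVLenStringCoder
from xarray.core.variable import Variable

# On read, a backend hands over an object-dtype array and records the
# requested dtype in the variable's encoding.
var = Variable(("x",), np.array(["foo", "barbaz"], dtype=object))
var.encoding["dtype"] = str

# decode() casts the object array back to a native numpy string dtype.
decoded = ObjectVLenStringCoder().decode(var)
print(decoded.dtype)  # <U6
```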
4 changes: 4 additions & 0 deletions xarray/conventions.py
@@ -265,6 +265,10 @@ def decode_cf_variable(
             var = strings.CharacterArrayCoder().decode(var, name=name)
         var = strings.EncodedStringCoder().decode(var)
 
+    if original_dtype == object:
+        var = variables.ObjectVLenStringCoder().decode(var)
+        original_dtype = var.dtype
+
     if mask_and_scale:
         for coder in [
             variables.UnsignedIntegerCoder(),
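The `original_dtype` refresh is what lets the restored dtype survive the rest of `decode_cf_variable`. The net effect, sketched with an illustrative file name (this mirrors the updated `test_encoding_kwarg_vlen_string` below):

```python
import xarray as xr

ds = xr.Dataset({"x": ["foo", "bar", "baz"]})
ds.to_netcdf("plain_strings.nc", encoding={"x": {"dtype": str}})

# Before this PR the variable came back as dtype=object; the new coder
# restores the native string dtype during decoding.
back = xr.open_dataset("plain_strings.nc")
print(back["x"].dtype)  # <U3, not object
```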
62 changes: 35 additions & 27 deletions xarray/tests/test_backends.py
@@ -864,12 +864,13 @@
         assert check_vlen_dtype(original["a"].dtype) == str
         with self.roundtrip(original) as actual:
             assert_identical(original, actual)
-            assert object == actual["a"].dtype
+            assert actual["a"].dtype == original["a"].dtype
-            # only check metadata for capable backends
-            # eg. NETCDF3 based backends do not roundtrip metadata
-            if actual["a"].dtype.metadata is not None:
-                assert check_vlen_dtype(actual["a"].dtype) == str
+            if np.issubdtype(actual["a"].dtype, object):
+                # only check metadata for capable backends
+                # eg. NETCDF3 based backends do not roundtrip metadata
+                if actual["a"].dtype.metadata is not None:
+                    assert check_vlen_dtype(actual["a"].dtype) == str
+            else:
+                assert actual["a"].dtype == np.dtype("<U1")
 
     @pytest.mark.parametrize(
         "decoded_fn, encoded_fn",
@@ -1374,32 +1375,39 @@
                 with self.open(tmp_file, group="data/2") as actual2:
                     assert_identical(data2, actual2)
 
-    def test_encoding_kwarg_vlen_string(self) -> None:
-        for input_strings in [[b"foo", b"bar", b"baz"], ["foo", "bar", "baz"]]:
-            original = Dataset({"x": input_strings})
-            expected = Dataset({"x": ["foo", "bar", "baz"]})
-            kwargs = dict(encoding={"x": {"dtype": str}})
-            with self.roundtrip(original, save_kwargs=kwargs) as actual:
-                assert actual["x"].encoding["dtype"] is str
-                assert_identical(actual, expected)
-
-    def test_roundtrip_string_with_fill_value_vlen(self) -> None:
+    @pytest.mark.parametrize(
+        "input_strings, is_bytes",
+        [
+            ([b"foo", b"bar", b"baz"], True),
+            (["foo", "bar", "baz"], False),
+            (["foó", "bár", "baź"], False),
+        ],
+    )
+    def test_encoding_kwarg_vlen_string(
+        self, input_strings: list[str], is_bytes: bool
+    ) -> None:
+        original = Dataset({"x": input_strings})
+
+        expected_string = ["foo", "bar", "baz"] if is_bytes else input_strings
+        expected = Dataset({"x": expected_string})
+        kwargs = dict(encoding={"x": {"dtype": str}})
+        with self.roundtrip(original, save_kwargs=kwargs) as actual:
+            assert actual["x"].encoding["dtype"] == "<U3"
+            assert actual["x"].dtype == "<U3"
+            assert_identical(actual, expected)
+
+    @pytest.mark.parametrize("fill_value", ["XXX", "", "bár"])
+    def test_roundtrip_string_with_fill_value_vlen(self, fill_value: str) -> None:
         values = np.array(["ab", "cdef", np.nan], dtype=object)
         expected = Dataset({"x": ("t", values)})
 
-        # netCDF4-based backends don't support an explicit fillvalue
-        # for variable length strings yet.
-        # https://github.com/Unidata/netcdf4-python/issues/730
-        # https://github.com/h5netcdf/h5netcdf/issues/37
-        original = Dataset({"x": ("t", values, {}, {"_FillValue": "XXX"})})
-        with pytest.raises(NotImplementedError):
-            with self.roundtrip(original) as actual:
-                assert_identical(expected, actual)
+        original = Dataset({"x": ("t", values, {}, {"_FillValue": fill_value})})
+        with self.roundtrip(original) as actual:
+            assert_identical(expected, actual)
 
         original = Dataset({"x": ("t", values, {}, {"_FillValue": ""})})
-        with pytest.raises(NotImplementedError):
-            with self.roundtrip(original) as actual:
-                assert_identical(expected, actual)
+        with self.roundtrip(original) as actual:
+            assert_identical(expected, actual)
 
     def test_roundtrip_character_array(self) -> None:
         with create_tmp_file() as tmp_file:
@@ -4245,14 +4253,14 @@
             with open_dataset(tmp, chunks=chunks) as dask_ds:
                 assert_identical(data, dask_ds)
                 with create_tmp_file() as tmp2:
                     dask_ds.to_netcdf(tmp2)
                     with open_dataset(tmp2) as on_disk:
                         assert_identical(data, on_disk)
 
     def test_deterministic_names(self) -> None:
         with create_tmp_file() as tmp:
             data = create_test_data()
             data.to_netcdf(tmp)
             with open_mfdataset(tmp, combine="by_coords") as ds:
                 original_names = {k: v.data.name for k, v in ds.data_vars.items()}
             with open_mfdataset(tmp, combine="by_coords") as ds:
@@ -4270,7 +4278,7 @@
         computed = actual.compute()
         assert not actual._in_memory
         assert computed._in_memory
         assert_allclose(actual, computed, decode_bytes=False)
 
     def test_save_mfdataset_compute_false_roundtrip(self) -> None:
         from dask.delayed import Delayed
@@ -4283,7 +4291,7 @@
                 datasets, [tmp1, tmp2], engine=self.engine, compute=False
             )
             assert isinstance(delayed_obj, Delayed)
             delayed_obj.compute()
             with open_mfdataset(
                 [tmp1, tmp2], combine="nested", concat_dim="x"
             ) as actual:
@@ -4292,7 +4300,7 @@
     def test_load_dataset(self) -> None:
         with create_tmp_file() as tmp:
             original = Dataset({"foo": ("x", np.random.randn(10))})
             original.to_netcdf(tmp)
             ds = load_dataset(tmp)
             # this would fail if we used open_dataset instead of load_dataset
             ds.to_netcdf(tmp)
@@ -4300,7 +4308,7 @@
     def test_load_dataarray(self) -> None:
         with create_tmp_file() as tmp:
             original = Dataset({"foo": ("x", np.random.randn(10))})
             original.to_netcdf(tmp)
             ds = load_dataarray(tmp)
             # this would fail if we used open_dataarray instead of
             # load_dataarray
@@ -4313,7 +4321,7 @@
     def test_inline_array(self) -> None:
         with create_tmp_file() as tmp:
             original = Dataset({"foo": ("x", np.random.randn(10))})
             original.to_netcdf(tmp)
             chunks = {"time": 10}
 
             def num_graph_nodes(obj):