From ec94cddbdc1142770e860a64abc503ee66e6a817 Mon Sep 17 00:00:00 2001 From: Kai Muehlbauer Date: Tue, 23 May 2023 20:48:17 +0200 Subject: [PATCH 1/5] preserve vlen string dtypes, allow vlen string fill_values --- xarray/backends/h5netcdf_.py | 9 ----- xarray/backends/netCDF4_.py | 10 ------ xarray/coding/variables.py | 12 +++++++ xarray/conventions.py | 4 +++ xarray/tests/test_backends.py | 62 ++++++++++++++++++++--------------- 5 files changed, 51 insertions(+), 46 deletions(-) diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index a68a44b5f6f..d9385fc68a9 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -271,15 +271,6 @@ def prepare_variable( dtype = _get_datatype(variable, raise_on_invalid_encoding=check_encoding) fillvalue = attrs.pop("_FillValue", None) - if dtype is str and fillvalue is not None: - raise NotImplementedError( - "h5netcdf does not yet support setting a fill value for " - "variable-length strings " - "(https://github.com/h5netcdf/h5netcdf/issues/37). " - f"Either remove '_FillValue' from encoding on variable {name!r} " - "or set {'dtype': 'S1'} in encoding to use the fixed width " - "NC_CHAR type." - ) if dtype is str: dtype = h5py.special_dtype(vlen=str) diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py index f21f15bf795..1aee4c1c726 100644 --- a/xarray/backends/netCDF4_.py +++ b/xarray/backends/netCDF4_.py @@ -490,16 +490,6 @@ def prepare_variable( fill_value = attrs.pop("_FillValue", None) - if datatype is str and fill_value is not None: - raise NotImplementedError( - "netCDF4 does not yet support setting a fill value for " - "variable-length strings " - "(https://github.com/Unidata/netcdf4-python/issues/730). " - f"Either remove '_FillValue' from encoding on variable {name!r} " - "or set {'dtype': 'S1'} in encoding to use the fixed width " - "NC_CHAR type." - ) - encoding = _extract_nc4_variable_encoding( variable, raise_on_invalid=check_encoding, unlimited_dims=unlimited_dims ) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index df660f90d9e..085725b4b7c 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -562,3 +562,15 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable: def decode(self): raise NotImplementedError() + + +class ObjectStringCoder(VariableCoder): + def encode(self): + return NotImplementedError + + def decode(self, variable: Variable, name: T_Name = None) -> Variable: + if variable.dtype == object and variable.encoding.get("dtype", False) == str: + variable = variable.astype(variable.encoding["dtype"]) + return variable + else: + return variable diff --git a/xarray/conventions.py b/xarray/conventions.py index cf207f0c37a..65c55207e2a 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -265,6 +265,10 @@ def decode_cf_variable( var = strings.CharacterArrayCoder().decode(var, name=name) var = strings.EncodedStringCoder().decode(var) + if original_dtype == object: + var = variables.ObjectStringCoder().decode(var) + original_dtype = var.dtype + if mask_and_scale: for coder in [ variables.UnsignedIntegerCoder(), diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 80b6951dbff..85248b5c40a 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -864,12 +864,13 @@ def test_roundtrip_empty_vlen_string_array(self) -> None: assert check_vlen_dtype(original["a"].dtype) == str with self.roundtrip(original) as actual: assert_identical(original, actual) - assert object == actual["a"].dtype - assert actual["a"].dtype == original["a"].dtype - # only check metadata for capable backends - # eg. NETCDF3 based backends do not roundtrip metadata - if actual["a"].dtype.metadata is not None: - assert check_vlen_dtype(actual["a"].dtype) == str + if np.issubdtype(actual["a"].dtype, object): + # only check metadata for capable backends + # eg. NETCDF3 based backends do not roundtrip metadata + if actual["a"].dtype.metadata is not None: + assert check_vlen_dtype(actual["a"].dtype) == str + else: + assert actual["a"].dtype == np.dtype(" None: with self.open(tmp_file, group="data/2") as actual2: assert_identical(data2, actual2) - def test_encoding_kwarg_vlen_string(self) -> None: - for input_strings in [[b"foo", b"bar", b"baz"], ["foo", "bar", "baz"]]: - original = Dataset({"x": input_strings}) - expected = Dataset({"x": ["foo", "bar", "baz"]}) - kwargs = dict(encoding={"x": {"dtype": str}}) - with self.roundtrip(original, save_kwargs=kwargs) as actual: - assert actual["x"].encoding["dtype"] is str - assert_identical(actual, expected) - - def test_roundtrip_string_with_fill_value_vlen(self) -> None: + @pytest.mark.parametrize( + "input_strings, is_bytes", + [ + ([b"foo", b"bar", b"baz"], True), + (["foo", "bar", "baz"], False), + (["foó", "bár", "baź"], False), + ], + ) + def test_encoding_kwarg_vlen_string( + self, input_strings: list[str], is_bytes: bool + ) -> None: + original = Dataset({"x": input_strings}) + + expected_string = ["foo", "bar", "baz"] if is_bytes else input_strings + expected = Dataset({"x": expected_string}) + kwargs = dict(encoding={"x": {"dtype": str}}) + with self.roundtrip(original, save_kwargs=kwargs) as actual: + assert actual["x"].encoding["dtype"] == " None: values = np.array(["ab", "cdef", np.nan], dtype=object) expected = Dataset({"x": ("t", values)}) - # netCDF4-based backends don't support an explicit fillvalue - # for variable length strings yet. - # https://github.com/Unidata/netcdf4-python/issues/730 - # https://github.com/h5netcdf/h5netcdf/issues/37 - original = Dataset({"x": ("t", values, {}, {"_FillValue": "XXX"})}) - with pytest.raises(NotImplementedError): - with self.roundtrip(original) as actual: - assert_identical(expected, actual) + original = Dataset({"x": ("t", values, {}, {"_FillValue": fill_value})}) + with self.roundtrip(original) as actual: + assert_identical(expected, actual) original = Dataset({"x": ("t", values, {}, {"_FillValue": ""})}) - with pytest.raises(NotImplementedError): - with self.roundtrip(original) as actual: - assert_identical(expected, actual) + with self.roundtrip(original) as actual: + assert_identical(expected, actual) def test_roundtrip_character_array(self) -> None: with create_tmp_file() as tmp_file: From 9783f6e9419cb6a1ee58b4b68c9219c8d5671c8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Thu, 16 Nov 2023 08:45:32 +0100 Subject: [PATCH 2/5] update min-deps h5py->3.7, h5netcdf -> 1.1 --- ci/requirements/min-all-deps.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/requirements/min-all-deps.yml b/ci/requirements/min-all-deps.yml index 7d0f29c0960..22076f88854 100644 --- a/ci/requirements/min-all-deps.yml +++ b/ci/requirements/min-all-deps.yml @@ -16,11 +16,11 @@ dependencies: - dask-core=2022.7 - distributed=2022.7 - flox=0.5 - - h5netcdf=1.0 + - h5netcdf=1.1 # h5py and hdf5 tend to cause conflicts # for e.g. hdf5 1.12 conflicts with h5py=3.1 # prioritize bumping other packages instead - - h5py=3.6 + - h5py=3.7 - hdf5=1.12 - hypothesis - iris=3.2 From b61792452e4eef276e5c31348e99a5f8a60aae78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Thu, 16 Nov 2023 08:54:38 +0100 Subject: [PATCH 3/5] add doc/whats-new.rst entry --- doc/whats-new.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 2fb76cfe8c2..af5b37ff07e 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -32,6 +32,8 @@ New Features By `Sam Levang `_. - Allow the usage of h5py drivers (eg: ros3) via h5netcdf (:pull:`8360`). By `Ezequiel Cimadevilla `_. +- Enable VLEN string fill_values, preserve VLEN string dtypes (:issue:`1647`, :issue:`7652`, :issue:`7868`, :pull:`7869`). + By `Kai Mühlbauer `_. Breaking changes ~~~~~~~~~~~~~~~~ From ee0bf6e251044774e0e788f5d41e43479fac17ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Fri, 17 Nov 2023 10:41:05 +0100 Subject: [PATCH 4/5] add suggestions from review --- doc/whats-new.rst | 3 +++ xarray/coding/variables.py | 2 +- xarray/conventions.py | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index af5b37ff07e..6fab90feba8 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -47,7 +47,10 @@ Breaking changes ``"S"``, ``"L"``, ``"U"``, or ``"N"``. This is to be consistent with the deprecation of the latter frequency strings (:issue:`8394`, :pull:`8415`). By `Spencer Clark `_. + By `Justus Magin =0.22``. By `Deepak Cherian `_. +- Minimum supported versions for the following packages have changed: ``h5py >=3.7``, ``h5netcdf>=1.1``. + By `Kai Mühlbauer `_. Deprecations ~~~~~~~~~~~~ diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 085725b4b7c..487197605e8 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -564,7 +564,7 @@ def decode(self): raise NotImplementedError() -class ObjectStringCoder(VariableCoder): +class ObjectVLenStringCoder(VariableCoder): def encode(self): return NotImplementedError diff --git a/xarray/conventions.py b/xarray/conventions.py index 65c55207e2a..75f816e6cb4 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -266,7 +266,7 @@ def decode_cf_variable( var = strings.EncodedStringCoder().decode(var) if original_dtype == object: - var = variables.ObjectStringCoder().decode(var) + var = variables.ObjectVLenStringCoder().decode(var) original_dtype = var.dtype if mask_and_scale: From 3d0355e4205337de8a34e3f2afddb7a1463a58e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Fri, 17 Nov 2023 12:03:09 +0100 Subject: [PATCH 5/5] remove stale entry --- doc/whats-new.rst | 2 -- 1 file changed, 2 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 6fab90feba8..928a2c95cac 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -40,14 +40,12 @@ Breaking changes - drop support for `cdms2 `_. Please use `xcdat `_ instead (:pull:`8441`). By `Justus Magin `_. - - Following pandas, :py:meth:`infer_freq` will return ``"Y"``, ``"YS"``, ``"QE"``, ``"ME"``, ``"h"``, ``"min"``, ``"s"``, ``"ms"``, ``"us"``, or ``"ns"`` instead of ``"A"``, ``"AS"``, ``"Q"``, ``"M"``, ``"H"``, ``"T"``, ``"S"``, ``"L"``, ``"U"``, or ``"N"``. This is to be consistent with the deprecation of the latter frequency strings (:issue:`8394`, :pull:`8415`). By `Spencer Clark `_. - By `Justus Magin =0.22``. By `Deepak Cherian `_. - Minimum supported versions for the following packages have changed: ``h5py >=3.7``, ``h5netcdf>=1.1``. By `Kai Mühlbauer `_.