From 6d93a95d05bdbfc33fff24064f67d29dd891ab58 Mon Sep 17 00:00:00 2001 From: James McCreight Date: Fri, 19 Apr 2019 11:52:58 -0600 Subject: [PATCH] Handle the character array dim name (#2896) * Handle the charachter array dim name in a variables encoding, set in decode and reapply in encode * Document char_dim_name * Minor change to set of char_dim_name * Test the roundtrip of the char_dim_name in encoding. * pep8 or die * Better test for char_dim_name * pep8 79char madness * nix test logic, use multiple parameterized vars * When encoding and encoding, remove it from encoding * Simpler is better * pep8 visual indent complaint * what is new! * what is newer than new! * what is newer than newer! * what is newer than newer-er! * what is newer than newer-est! --- doc/io.rst | 27 +++++++++++++++++---------- doc/whats-new.rst | 4 ++++ xarray/coding/strings.py | 8 ++++++-- xarray/tests/test_coding_strings.py | 18 ++++++++++++++++++ 4 files changed, 45 insertions(+), 12 deletions(-) diff --git a/doc/io.rst b/doc/io.rst index 51c747189da..b470284f071 100644 --- a/doc/io.rst +++ b/doc/io.rst @@ -302,16 +302,23 @@ to using encoded character arrays. Character arrays can be selected even for netCDF4 files by setting the ``dtype`` field in ``encoding`` to ``S1`` (corresponding to NumPy's single-character bytes dtype). -If character arrays are used, the string encoding that was used is stored on -disk in the ``_Encoding`` attribute, which matches an ad-hoc convention -`adopted by the netCDF4-Python library `_. -At the time of this writing (October 2017), a standard convention for indicating -string encoding for character arrays in netCDF files was -`still under discussion `_. -Technically, you can use -`any string encoding recognized by Python `_ if you feel the need to deviate from UTF-8, -by setting the ``_Encoding`` field in ``encoding``. But -`we don't recommend it `_. 
+If character arrays are used: + +- The string encoding that was used is stored on + disk in the ``_Encoding`` attribute, which matches an ad-hoc convention + `adopted by the netCDF4-Python library `_. + At the time of this writing (October 2017), a standard convention for indicating + string encoding for character arrays in netCDF files was + `still under discussion `_. + Technically, you can use + `any string encoding recognized by Python `_ if you feel the need to deviate from UTF-8, + by setting the ``_Encoding`` field in ``encoding``. But + `we don't recommend it `_. +- The character dimension name can be specified by the ``char_dim_name`` field of a variable's + ``encoding``. If this is not specified, the default name for the character dimension is + ``'string%s' % data.shape[-1]``. When decoding character arrays from existing files, the + ``char_dim_name`` is added to the variable's ``encoding`` to preserve it if encoding happens, but + the field can be edited by the user. .. warning:: diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 059b6e8b544..27709a09e7a 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -21,6 +21,10 @@ v0.12.2 (unreleased) Enhancements ~~~~~~~~~~~~ +- Character arrays' character dimension name decoding and encoding handled by + ``var.encoding['char_dim_name']`` (:issue:`2895`) + By `James McCreight `_. 
+ Bug fixes ~~~~~~~~~ diff --git a/xarray/coding/strings.py b/xarray/coding/strings.py index 205d285cd81..007bcb8a502 100644 --- a/xarray/coding/strings.py +++ b/xarray/coding/strings.py @@ -103,16 +103,20 @@ def encode(self, variable, name=None): dims, data, attrs, encoding = unpack_for_encoding(variable) if data.dtype.kind == 'S' and encoding.get('dtype') is not str: data = bytes_to_char(data) - dims = dims + ('string%s' % data.shape[-1],) + if 'char_dim_name' in encoding.keys(): + char_dim_name = encoding.pop('char_dim_name') + else: + char_dim_name = 'string%s' % data.shape[-1] + dims = dims + (char_dim_name,) return Variable(dims, data, attrs, encoding) def decode(self, variable, name=None): dims, data, attrs, encoding = unpack_for_decoding(variable) if data.dtype == 'S1' and dims: + encoding['char_dim_name'] = dims[-1] dims = dims[:-1] data = char_to_bytes(data) - return Variable(dims, data, attrs, encoding) diff --git a/xarray/tests/test_coding_strings.py b/xarray/tests/test_coding_strings.py index c50376a5841..98824c9136c 100644 --- a/xarray/tests/test_coding_strings.py +++ b/xarray/tests/test_coding_strings.py @@ -107,6 +107,24 @@ def test_CharacterArrayCoder_encode(data): assert_identical(actual, expected) +@pytest.mark.parametrize( + ['original', 'expected_char_dim_name'], + [ + (Variable(('x',), [b'ab', b'cdef']), + 'string4'), + (Variable(('x',), [b'ab', b'cdef'], encoding={'char_dim_name': 'foo'}), + 'foo') + ] +) +def test_CharacterArrayCoder_char_dim_name(original, expected_char_dim_name): + coder = strings.CharacterArrayCoder() + encoded = coder.encode(original) + roundtripped = coder.decode(encoded) + assert encoded.dims[-1] == expected_char_dim_name + assert roundtripped.encoding['char_dim_name'] == expected_char_dim_name + assert roundtripped.dims[-1] == original.dims[-1] + + def test_StackedBytesArray(): array = np.array([[b'a', b'b', b'c'], [b'd', b'e', b'f']], dtype='S') actual = strings.StackedBytesArray(array)