Handle the character array dim name (#2896)

* Handle the character array dim name in a variables encoding, set in decode and reapply in encode * Document char_dim_name * Minor change to set of char_dim_name * Test the roundtrip of the char_dim_name in encoding. * pep8 or die * Better test for char_dim_name * pep8 79char madness * nix test logic, use multiple parameterized vars * When encoding and encoding, remove it from encoding * Simpler is better * pep8 visual indent complaint * what is new! * what is newer than new! * what is newer than newer! * what is newer than newer-er! * what is newer than newer-est!
pydata · Apr 19, 2019 · 6d93a95 · 6d93a95
1 parent c8251e3
commit 6d93a95
Show file tree

Hide file tree

Showing 4 changed files with 45 additions and 12 deletions.
diff --git a/doc/io.rst b/doc/io.rst
@@ -302,16 +302,23 @@ to using encoded character arrays. Character arrays can be selected even for
 netCDF4 files by setting the ``dtype`` field in ``encoding`` to ``S1``
 (corresponding to NumPy's single-character bytes dtype).
 
-If character arrays are used, the string encoding that was used is stored on
-disk in the ``_Encoding`` attribute, which matches an ad-hoc convention
-`adopted by the netCDF4-Python library <https://github.com/Unidata/netcdf4-python/pull/665>`_.
-At the time of this writing (October 2017), a standard convention for indicating
-string encoding for character arrays in netCDF files was
-`still under discussion <https://github.com/Unidata/netcdf-c/issues/402>`_.
-Technically, you can use
-`any string encoding recognized by Python <https://docs.python.org/3/library/codecs.html#standard-encodings>`_ if you feel the need to deviate from UTF-8,
-by setting the ``_Encoding`` field in ``encoding``. But
-`we don't recommend it <http://utf8everywhere.org/>`_.
+If character arrays are used:
+
+- The string encoding that was used is stored on
+  disk in the ``_Encoding`` attribute, which matches an ad-hoc convention
+  `adopted by the netCDF4-Python library <https://github.com/Unidata/netcdf4-python/pull/665>`_.
+  At the time of this writing (October 2017), a standard convention for indicating
+  string encoding for character arrays in netCDF files was
+  `still under discussion <https://github.com/Unidata/netcdf-c/issues/402>`_.
+  Technically, you can use
+  `any string encoding recognized by Python <https://docs.python.org/3/library/codecs.html#standard-encodings>`_ if you feel the need to deviate from UTF-8,
+  by setting the ``_Encoding`` field in ``encoding``. But
+  `we don't recommend it <http://utf8everywhere.org/>`_.
+- The character dimension name can be specified by the ``char_dim_name`` field of a variable's
+  ``encoding``. If this is not specified the default name for the character dimension is
+  ``'string%s' % data.shape[-1]``. When decoding character arrays from existing files, the
+  ``char_dim_name`` is added to the variable's ``encoding`` so that it is preserved when the
+  variable is encoded again, but the field can be edited by the user.
 
 .. warning::
 

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -21,6 +21,10 @@ v0.12.2 (unreleased)
 Enhancements
 ~~~~~~~~~~~~
 
+- Character arrays' character dimension name decoding and encoding is now handled by
+  ``var.encoding['char_dim_name']`` (:issue:`2895`).
+  By `James McCreight <https://github.com/jmccreight>`_.
+
 Bug fixes
 ~~~~~~~~~
 

diff --git a/xarray/coding/strings.py b/xarray/coding/strings.py
@@ -103,16 +103,20 @@ def encode(self, variable, name=None):
         dims, data, attrs, encoding = unpack_for_encoding(variable)
         if data.dtype.kind == 'S' and encoding.get('dtype') is not str:
             data = bytes_to_char(data)
-            dims = dims + ('string%s' % data.shape[-1],)
+            if 'char_dim_name' in encoding.keys():
+                char_dim_name = encoding.pop('char_dim_name')
+            else:
+                char_dim_name = 'string%s' % data.shape[-1]
+            dims = dims + (char_dim_name,)
         return Variable(dims, data, attrs, encoding)
 
     def decode(self, variable, name=None):
         dims, data, attrs, encoding = unpack_for_decoding(variable)
 
         if data.dtype == 'S1' and dims:
+            encoding['char_dim_name'] = dims[-1]
             dims = dims[:-1]
             data = char_to_bytes(data)
-
         return Variable(dims, data, attrs, encoding)
 
 

diff --git a/xarray/tests/test_coding_strings.py b/xarray/tests/test_coding_strings.py
@@ -107,6 +107,24 @@ def test_CharacterArrayCoder_encode(data):
     assert_identical(actual, expected)
 
 
+@pytest.mark.parametrize(
+    ['original', 'expected_char_dim_name'],
+    [
+        (Variable(('x',), [b'ab', b'cdef']),
+         'string4'),
+        (Variable(('x',), [b'ab', b'cdef'], encoding={'char_dim_name': 'foo'}),
+         'foo')
+    ]
+)
+def test_CharacterArrayCoder_char_dim_name(original, expected_char_dim_name):
+    coder = strings.CharacterArrayCoder()
+    encoded = coder.encode(original)
+    roundtripped = coder.decode(encoded)
+    assert encoded.dims[-1] == expected_char_dim_name
+    assert roundtripped.encoding['char_dim_name'] == expected_char_dim_name
+    assert roundtripped.dims[-1] == original.dims[-1]
+
+
 def test_StackedBytesArray():
     array = np.array([[b'a', b'b', b'c'], [b'd', b'e', b'f']], dtype='S')
     actual = strings.StackedBytesArray(array)