From 6d93a95d05bdbfc33fff24064f67d29dd891ab58 Mon Sep 17 00:00:00 2001
From: James McCreight <jamesmcc@ucar.edu>
Date: Fri, 19 Apr 2019 11:52:58 -0600
Subject: [PATCH] Handle the character array dim name  (#2896)

* Handle the character array dim name in a variable's encoding: set in decode and reapply in encode

* Document char_dim_name

* Minor change to set of char_dim_name

* Test the roundtrip of the char_dim_name in encoding.

* pep8 or die

* Better test for char_dim_name

* pep8 79char madness

* nix test logic, use multiple parameterized vars

* When encoding, remove char_dim_name from encoding

* Simpler is better

* pep8 visual indent complaint

* what is new!

* what is newer than new!

* what is newer than newer!

* what is newer than newer-er!

* what is newer than newer-est!
---
 doc/io.rst                          | 27 +++++++++++++++++----------
 doc/whats-new.rst                   |  4 ++++
 xarray/coding/strings.py            |  8 ++++++--
 xarray/tests/test_coding_strings.py | 18 ++++++++++++++++++
 4 files changed, 45 insertions(+), 12 deletions(-)

diff --git a/doc/io.rst b/doc/io.rst
index 51c747189da..b470284f071 100644
--- a/doc/io.rst
+++ b/doc/io.rst
@@ -302,16 +302,23 @@ to using encoded character arrays. Character arrays can be selected even for
 netCDF4 files by setting the ``dtype`` field in ``encoding`` to ``S1``
 (corresponding to NumPy's single-character bytes dtype).
 
-If character arrays are used, the string encoding that was used is stored on
-disk in the ``_Encoding`` attribute, which matches an ad-hoc convention
-`adopted by the netCDF4-Python library <https://github.com/Unidata/netcdf4-python/pull/665>`_.
-At the time of this writing (October 2017), a standard convention for indicating
-string encoding for character arrays in netCDF files was
-`still under discussion <https://github.com/Unidata/netcdf-c/issues/402>`_.
-Technically, you can use
-`any string encoding recognized by Python <https://docs.python.org/3/library/codecs.html#standard-encodings>`_ if you feel the need to deviate from UTF-8,
-by setting the ``_Encoding`` field in ``encoding``. But
-`we don't recommend it <http://utf8everywhere.org/>`_.
+If character arrays are used:
+
+- The string encoding that was used is stored on
+  disk in the ``_Encoding`` attribute, which matches an ad-hoc convention
+  `adopted by the netCDF4-Python library <https://github.com/Unidata/netcdf4-python/pull/665>`_.
+  At the time of this writing (October 2017), a standard convention for indicating
+  string encoding for character arrays in netCDF files was
+  `still under discussion <https://github.com/Unidata/netcdf-c/issues/402>`_.
+  Technically, you can use
+  `any string encoding recognized by Python <https://docs.python.org/3/library/codecs.html#standard-encodings>`_ if you feel the need to deviate from UTF-8,
+  by setting the ``_Encoding`` field in ``encoding``. But
+  `we don't recommend it <http://utf8everywhere.org/>`_.
+- The character dimension name can be specified by the ``char_dim_name`` field of a variable's
+  ``encoding``. If this is not specified, the default name for the character dimension is
+  ``'string%s' % data.shape[-1]``. When decoding character arrays from existing files, the
+  ``char_dim_name`` is added to the variable's ``encoding`` so that it is preserved if the
+  variable is encoded again, but the field can be edited by the user.
 
 .. warning::
 
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 059b6e8b544..27709a09e7a 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -21,6 +21,10 @@ v0.12.2 (unreleased)
 Enhancements
 ~~~~~~~~~~~~
 
+- The character dimension name of character arrays is now stored on decoding and
+  reapplied on encoding via ``var.encoding['char_dim_name']`` (:issue:`2895`).
+  By `James McCreight <https://github.com/jmccreight>`_.
+
 Bug fixes
 ~~~~~~~~~
 
diff --git a/xarray/coding/strings.py b/xarray/coding/strings.py
index 205d285cd81..007bcb8a502 100644
--- a/xarray/coding/strings.py
+++ b/xarray/coding/strings.py
@@ -103,16 +103,20 @@ def encode(self, variable, name=None):
         dims, data, attrs, encoding = unpack_for_encoding(variable)
         if data.dtype.kind == 'S' and encoding.get('dtype') is not str:
             data = bytes_to_char(data)
-            dims = dims + ('string%s' % data.shape[-1],)
+            if 'char_dim_name' in encoding.keys():
+                char_dim_name = encoding.pop('char_dim_name')
+            else:
+                char_dim_name = 'string%s' % data.shape[-1]
+            dims = dims + (char_dim_name,)
         return Variable(dims, data, attrs, encoding)
 
     def decode(self, variable, name=None):
         dims, data, attrs, encoding = unpack_for_decoding(variable)
 
         if data.dtype == 'S1' and dims:
+            encoding['char_dim_name'] = dims[-1]
             dims = dims[:-1]
             data = char_to_bytes(data)
-
         return Variable(dims, data, attrs, encoding)
 
 
diff --git a/xarray/tests/test_coding_strings.py b/xarray/tests/test_coding_strings.py
index c50376a5841..98824c9136c 100644
--- a/xarray/tests/test_coding_strings.py
+++ b/xarray/tests/test_coding_strings.py
@@ -107,6 +107,24 @@ def test_CharacterArrayCoder_encode(data):
     assert_identical(actual, expected)
 
 
+@pytest.mark.parametrize(
+    ['original', 'expected_char_dim_name'],
+    [
+        (Variable(('x',), [b'ab', b'cdef']),
+         'string4'),
+        (Variable(('x',), [b'ab', b'cdef'], encoding={'char_dim_name': 'foo'}),
+         'foo')
+    ]
+)
+def test_CharacterArrayCoder_char_dim_name(original, expected_char_dim_name):
+    coder = strings.CharacterArrayCoder()
+    encoded = coder.encode(original)
+    roundtripped = coder.decode(encoded)
+    assert encoded.dims[-1] == expected_char_dim_name
+    assert roundtripped.encoding['char_dim_name'] == expected_char_dim_name
+    assert roundtripped.dims[-1] == original.dims[-1]
+
+
 def test_StackedBytesArray():
     array = np.array([[b'a', b'b', b'c'], [b'd', b'e', b'f']], dtype='S')
     actual = strings.StackedBytesArray(array)