Skip to content

Commit

Permalink
Automate interpretation of _Unsigned attribute (pydata#1453)
Browse files Browse the repository at this point in the history
* Add support for _Unsigned attribute

* Update docstrings with new unsigned behavior

* Cast to unsigned with copy instead of view, fixing infinite recursion. Move _Unsigned between attributes and encoding

* Fix default argument for is_unsigned

* Separate test for unsigned roundtrip

* Move unsigned support out of mask_and_scale, update whats-new

* Fix what's new date and add issue

* Putting enhancement in correct section of whats-new

* Turn off _FillValue support provided by PyNIO. Let xarray handle it.

* Convert _FillValue when _Unsigned is present

* PEP8

* No need to convert unsigned fill value if there is not fill value

* PEP8

* yet more PEP8

* Be more careful with _Unsigned attribute check

* Better fencing for attribute checks. Style fixes. Test for dtypes.
  • Loading branch information
deeplycloudy authored and shoyer committed Jul 28, 2017
1 parent dbf9307 commit e3e6db5
Show file tree
Hide file tree
Showing 5 changed files with 132 additions and 7 deletions.
4 changes: 4 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ Enhancements
- More attributes available in :py:attr:`~xarray.Dataset.attrs` dictionary when
raster files are opened with :py:func:`~xarray.open_rasterio`.
By `Greg Brener <https://github.com/gbrener>`_
- Support for NetCDF files using an ``_Unsigned`` attribute to indicate that
a signed integer data type should be interpreted as unsigned bytes
(:issue:`1444`).
By `Eric Bruning <https://github.com/deeplycloudy>`_.

- Speed-up (x 100) of :py:func:`~xarray.conventions.decode_cf_datetime`.
By `Christian Chwala <https://github.com/cchwala>`_.
Expand Down
3 changes: 3 additions & 0 deletions xarray/backends/pynio_.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ def __init__(self, filename, mode='r', autoclose=False):
import Nio
opener = functools.partial(Nio.open_file, filename, mode=mode)
self.ds = opener()
# xarray provides its own support for FillValue,
# so turn off PyNIO's support for the same.
self.ds.set_option('MaskedArrayMode', 'MaskedNever')
self._autoclose = autoclose
self._isopen = True
self._opener = opener
Expand Down
61 changes: 57 additions & 4 deletions xarray/conventions.py
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,34 @@ def __getitem__(self, key):
return np.asarray(self.array[key], dtype=self.dtype)


class UnsignedIntTypeArray(utils.NDArrayMixin):
    """Decode a signed integer array to unsigned integers on the fly.

    The wrapper reports the unsigned integer dtype with the same item size
    as the wrapped array, and converts values to that dtype each time it is
    indexed. Typically used when ``_Unsigned`` is set as a netCDF attribute
    on a signed integer variable.

    >>> sb = np.asarray([0, 1, 127, -128, -1], dtype='i1')
    >>> sb.dtype
    dtype('int8')
    >>> UnsignedIntTypeArray(sb).dtype
    dtype('uint8')
    >>> UnsignedIntTypeArray(sb)[:]
    array([  0,   1, 127, 128, 255], dtype=uint8)
    """

    def __init__(self, array):
        self.array = array
        # Same width as the wrapped dtype, but unsigned ('u1', 'u2', ...).
        width = array.dtype.itemsize
        self.unsigned_dtype = np.dtype('u{}'.format(width))

    @property
    def dtype(self):
        """The unsigned dtype that indexed values are converted to."""
        return self.unsigned_dtype

    def __getitem__(self, key):
        selected = self.array[key]
        return np.asarray(selected, dtype=self.dtype)


def string_to_char(arr):
"""Like netCDF4.stringtochar, but faster and more flexible.
"""
Expand Down Expand Up @@ -655,6 +683,14 @@ def maybe_encode_dtype(var, name=None):
'any _FillValue to use for NaNs' % name,
RuntimeWarning, stacklevel=3)
data = duck_array_ops.around(data)[...]
if encoding.get('_Unsigned', False):
signed_dtype = 'i%s' % dtype.itemsize
if '_FillValue' in var.attrs:
old_fill = np.asarray(attrs['_FillValue'])
new_fill = old_fill.astype(signed_dtype)
attrs['_FillValue'] = new_fill
data = data.astype(signed_dtype)
pop_to(encoding, attrs, '_Unsigned')
if dtype == 'S1' and data.dtype != 'S1':
data = string_to_char(np.asarray(data, 'S'))
dims = dims + ('string%s' % data.shape[-1],)
Expand Down Expand Up @@ -779,7 +815,8 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True,
example: ['h', 'e', 'l', 'l', 'o'] -> 'hello'
mask_and_scale: bool
Lazily scale (using scale_factor and add_offset) and mask
(using _FillValue).
(using _FillValue). If the _Unsigned attribute is present
treat integer arrays as unsigned.
decode_times : bool
Decode cf times ('hours since 2000-01-01') to np.datetime64.
decode_endianness : bool
Expand All @@ -804,6 +841,16 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True,
dimensions = dimensions[:-1]
data = CharToStringArray(data)

pop_to(attributes, encoding, '_Unsigned')
is_unsigned = encoding.get('_Unsigned', False)
if is_unsigned and mask_and_scale:
if data.dtype.kind == 'i':
data = UnsignedIntTypeArray(data)
else:
warnings.warn("variable has _Unsigned attribute but is not "
"of integer type. Ignoring attribute.",
RuntimeWarning, stacklevel=3)

if mask_and_scale:
if 'missing_value' in attributes:
# missing_value is deprecated, but we still want to support it as
Expand All @@ -818,20 +865,26 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True,
"and decoding explicitly using "
"xarray.conventions.decode_cf(ds)")
attributes['_FillValue'] = attributes.pop('missing_value')

fill_value = np.array(pop_to(attributes, encoding, '_FillValue'))
if fill_value.size > 1:
warnings.warn("variable has multiple fill values {0}, decoding "
"all values to NaN.".format(str(fill_value)),
RuntimeWarning, stacklevel=3)
scale_factor = pop_to(attributes, encoding, 'scale_factor')
add_offset = pop_to(attributes, encoding, 'add_offset')
if ((fill_value is not None and not np.any(pd.isnull(fill_value))) or
scale_factor is not None or add_offset is not None):
has_fill = (fill_value is not None and
not np.any(pd.isnull(fill_value)))
if (has_fill or scale_factor is not None or add_offset is not None):
if fill_value.dtype.kind in ['U', 'S']:
dtype = object
else:
dtype = float
# According to the CF spec, the fill value is of the same
# type as its variable, i.e. its storage format on disk.
# This handles the case where the fill_value also needs to be
# converted to its unsigned value.
if has_fill:
fill_value = np.asarray(fill_value, dtype=data.dtype)
data = MaskedAndScaledArray(data, fill_value, scale_factor,
add_offset, dtype)

Expand Down
62 changes: 59 additions & 3 deletions xarray/tests/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,23 @@ def create_encoded_masked_and_scaled_data():
return Dataset({'x': ('t', [-1, -1, 0, 1, 2], attributes)})


def create_unsigned_masked_scaled_data():
    """Return the decoded fixture for the unsigned mask-and-scale test.

    Variable ``x`` holds float values (including one NaN) along ``t``,
    has no attributes, and carries an encoding that requests ``i1``
    storage with ``_Unsigned``, ``_FillValue``, ``add_offset`` and
    ``scale_factor`` set.
    """
    values = np.array([10.0, 10.1, 22.7, 22.8, np.nan])
    encoding = {
        '_FillValue': 255,
        '_Unsigned': 'true',
        'dtype': 'i1',
        'add_offset': 10,
        'scale_factor': np.float32(0.1),
    }
    variable = ('t', values, {}, encoding)
    return Dataset({'x': variable})


def create_encoded_unsigned_masked_scaled_data():
    """Return the encoded (as-written-to-file) counterpart fixture.

    Variable ``x`` holds signed ``i1`` data whose bit patterns correspond
    to the unsigned values [0, 1, 127, 128, 255]. The ``_FillValue``
    attribute is given in its signed form (-1).
    """
    # Signed storage for the unsigned values [0, 1, 127, 128, 255].
    signed_bytes = np.asarray([0, 1, 127, -128, -1], dtype='i1')
    # Attributes as they appear in the file: the fill value is signed.
    attributes = {
        '_FillValue': -1,
        '_Unsigned': 'true',
        'add_offset': 10,
        'scale_factor': np.float32(0.1),
    }
    variable = ('t', signed_bytes, attributes)
    return Dataset({'x': variable})


def create_boolean_data():
    """Return a dataset with one boolean variable ``x`` along ``t``."""
    flags = [True, False, False, True]
    variable = ('t', flags, {'units': '-'})
    return Dataset({'x': variable})
Expand Down Expand Up @@ -360,24 +377,63 @@ def test_roundtrip_strings_with_fill_value(self):
with self.roundtrip(original) as actual:
self.assertDatasetIdentical(expected, actual)

def test_unsigned_roundtrip_mask_and_scale(self):
    """Round-trip the unsigned mask-and-scale fixtures both ways.

    Each round trip checks that every variable's dtype matches the
    expected dataset and that the values are close.
    """
    decoded = create_unsigned_masked_scaled_data()
    encoded = create_encoded_unsigned_masked_scaled_data()

    def check_roundtrip(written, expected, **roundtrip_kwargs):
        # Write ``written``, read it back, and compare with ``expected``.
        with self.roundtrip(written, **roundtrip_kwargs) as actual:
            for name in expected.variables:
                self.assertEqual(expected.variables[name].dtype,
                                 actual.variables[name].dtype)
            self.assertDatasetAllClose(expected, actual)

    check_roundtrip(decoded, decoded)
    check_roundtrip(decoded, encoded,
                    open_kwargs=dict(decode_cf=False))
    check_roundtrip(encoded, encoded,
                    open_kwargs=dict(decode_cf=False))
    # make sure roundtrip encoding didn't change the
    # original dataset.
    self.assertDatasetIdentical(
        encoded, create_encoded_unsigned_masked_scaled_data())
    check_roundtrip(encoded, decoded)
    check_roundtrip(encoded, encoded,
                    open_kwargs=dict(decode_cf=False))

def test_roundtrip_mask_and_scale(self):
# Round-trips the masked-and-scaled fixtures in decoded and encoded form,
# reading back with and without ``decode_cf``, and compares each result
# with the matching fixture.
# NOTE(review): every ``with self.roundtrip(..., open_kwargs=...)`` header
# below appears twice -- once on a single line and once wrapped over two
# lines. This looks like the removed and added sides of a diff rendered
# together; presumably only the wrapped form belongs to the final code.
decoded = create_masked_and_scaled_data()
encoded = create_encoded_masked_and_scaled_data()
with self.roundtrip(decoded) as actual:
self.assertDatasetAllClose(decoded, actual)
with self.roundtrip(decoded, open_kwargs=dict(decode_cf=False)) as actual:
with self.roundtrip(decoded,
open_kwargs=dict(decode_cf=False)) as actual:
# TODO: this assumes that all roundtrips will first
# encode. Is that something we want to test for?
self.assertDatasetAllClose(encoded, actual)
with self.roundtrip(encoded, open_kwargs=dict(decode_cf=False)) as actual:
with self.roundtrip(encoded,
open_kwargs=dict(decode_cf=False)) as actual:
self.assertDatasetAllClose(encoded, actual)
# make sure roundtrip encoding didn't change the
# original dataset.
self.assertDatasetIdentical(encoded,
create_encoded_masked_and_scaled_data())
with self.roundtrip(encoded) as actual:
self.assertDatasetAllClose(decoded, actual)
with self.roundtrip(encoded, open_kwargs=dict(decode_cf=False)) as actual:
with self.roundtrip(encoded,
open_kwargs=dict(decode_cf=False)) as actual:
self.assertDatasetAllClose(encoded, actual)

def test_coordinates_encoding(self):
Expand Down
9 changes: 9 additions & 0 deletions xarray/tests/test_conventions.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,15 @@ def test_string_to_char(self):
self.assertArrayEqual(actual, expected)


class TestUnsignedIntTypeArray(TestCase):
    """Checks for ``conventions.UnsignedIntTypeArray``."""

    def test_unsignedinttype_array(self):
        """Signed int8 input is exposed as uint8 with wrapped values."""
        signed = np.asarray([0, 1, 127, -128, -1], dtype='i1')
        wrapped = conventions.UnsignedIntTypeArray(signed)
        self.assertEqual(wrapped.dtype, np.dtype('u1'))
        expected = np.array([0, 1, 127, 128, 255], dtype=np.dtype('u1'))
        self.assertArrayEqual(wrapped, expected)


class TestBoolTypeArray(TestCase):
def test_booltype_array(self):
x = np.array([1, 0, 1, 1, 0], dtype='i1')
Expand Down

0 comments on commit e3e6db5

Please sign in to comment.