Skip to content

Commit

Permalink
Add use_cftime option to open_dataset (#2759)
Browse files Browse the repository at this point in the history
* Add use_cftime option to open_dataset

* Remove f-strings

* Fix test-skipping logic and remove 'dummy' from warning

* Note that use_cftime is only relevant for standard calendar dates

* Move use_cftime option to CFDatetimeCoder constructor
  • Loading branch information
spencerkclark authored Feb 19, 2019
1 parent 57cd76d commit 612d390
Show file tree
Hide file tree
Showing 6 changed files with 420 additions and 75 deletions.
10 changes: 9 additions & 1 deletion doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,15 @@ Enhancements
- :py:meth:`pandas.Series.dropna` is now supported for a
:py:class:`pandas.Series` indexed by a :py:class:`~xarray.CFTimeIndex`
(:issue:`2688`). By `Spencer Clark <https://github.com/spencerkclark>`_.

- :py:meth:`~xarray.open_dataset` now accepts a ``use_cftime`` argument, which
can be used to require that ``cftime.datetime`` objects are always used, or
never used when decoding dates encoded with a standard calendar. This can be
used to ensure consistent date types are returned when using
:py:meth:`~xarray.open_mfdataset` (:issue:`1263`) and/or to silence
serialization warnings raised if dates from a standard calendar are found to
be outside the :py:class:`pandas.Timestamp`-valid range (:issue:`2754`). By
`Spencer Clark <https://github.com/spencerkclark>`_.

Bug fixes
~~~~~~~~~

Expand Down
32 changes: 27 additions & 5 deletions xarray/backends/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ def open_dataset(filename_or_obj, group=None, decode_cf=True,
mask_and_scale=None, decode_times=True, autoclose=None,
concat_characters=True, decode_coords=True, engine=None,
chunks=None, lock=None, cache=None, drop_variables=None,
backend_kwargs=None):
backend_kwargs=None, use_cftime=None):
"""Load and decode a dataset from a file or file-like object.
Parameters
Expand Down Expand Up @@ -231,6 +231,16 @@ def open_dataset(filename_or_obj, group=None, decode_cf=True,
A dictionary of keyword arguments to pass on to the backend. This
may be useful when backend options would improve performance or
allow user control of dataset processing.
use_cftime: bool, optional
Only relevant if encoded dates come from a standard calendar
(e.g. 'gregorian', 'proleptic_gregorian', 'standard', or not
specified). If None (default), attempt to decode times to
``np.datetime64[ns]`` objects; if this is not possible, decode times to
``cftime.datetime`` objects. If True, always decode times to
``cftime.datetime`` objects, regardless of whether or not they can be
represented using ``np.datetime64[ns]`` objects. If False, always
decode times to ``np.datetime64[ns]`` objects; if this is not possible
raise an error.
Returns
-------
Expand Down Expand Up @@ -269,7 +279,7 @@ def maybe_decode_store(store, lock=False):
ds = conventions.decode_cf(
store, mask_and_scale=mask_and_scale, decode_times=decode_times,
concat_characters=concat_characters, decode_coords=decode_coords,
drop_variables=drop_variables)
drop_variables=drop_variables, use_cftime=use_cftime)

_protect_dataset_variables_inplace(ds, cache)

Expand All @@ -284,7 +294,8 @@ def maybe_decode_store(store, lock=False):
mtime = None
token = tokenize(filename_or_obj, mtime, group, decode_cf,
mask_and_scale, decode_times, concat_characters,
decode_coords, engine, chunks, drop_variables)
decode_coords, engine, chunks, drop_variables,
use_cftime)
name_prefix = 'open_dataset-%s' % token
ds2 = ds.chunk(chunks, name_prefix=name_prefix, token=token)
ds2._file_obj = ds._file_obj
Expand Down Expand Up @@ -360,7 +371,7 @@ def open_dataarray(filename_or_obj, group=None, decode_cf=True,
mask_and_scale=None, decode_times=True, autoclose=None,
concat_characters=True, decode_coords=True, engine=None,
chunks=None, lock=None, cache=None, drop_variables=None,
backend_kwargs=None):
backend_kwargs=None, use_cftime=None):
"""Open an DataArray from a netCDF file containing a single data variable.
This is designed to read netCDF files with only one data variable. If
Expand Down Expand Up @@ -428,6 +439,16 @@ def open_dataarray(filename_or_obj, group=None, decode_cf=True,
A dictionary of keyword arguments to pass on to the backend. This
may be useful when backend options would improve performance or
allow user control of dataset processing.
use_cftime: bool, optional
Only relevant if encoded dates come from a standard calendar
(e.g. 'gregorian', 'proleptic_gregorian', 'standard', or not
specified). If None (default), attempt to decode times to
``np.datetime64[ns]`` objects; if this is not possible, decode times to
``cftime.datetime`` objects. If True, always decode times to
``cftime.datetime`` objects, regardless of whether or not they can be
represented using ``np.datetime64[ns]`` objects. If False, always
decode times to ``np.datetime64[ns]`` objects; if this is not possible
raise an error.
Notes
-----
Expand All @@ -450,7 +471,8 @@ def open_dataarray(filename_or_obj, group=None, decode_cf=True,
decode_coords=decode_coords, engine=engine,
chunks=chunks, lock=lock, cache=cache,
drop_variables=drop_variables,
backend_kwargs=backend_kwargs)
backend_kwargs=backend_kwargs,
use_cftime=use_cftime)

if len(dataset.data_vars) != 1:
raise ValueError('Given file dataset contains more than one data '
Expand Down
138 changes: 76 additions & 62 deletions xarray/coding/times.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,32 +80,7 @@ def _unpack_netcdf_time_units(units):
return delta_units, ref_date


def _decode_datetime_with_cftime(num_dates, units, calendar):
    """Decode numeric times with cftime, converting standard-calendar
    results that fit in the np.datetime64[ns] range back to numpy dates.

    Warns with a SerializationWarning when standard-calendar dates fall
    outside the Timestamp-valid range and must stay as cftime objects.
    """
    cftime = _import_cftime()

    if cftime.__name__ == 'cftime':
        decoded = cftime.num2date(num_dates, units, calendar,
                                  only_use_cftime_datetimes=True)
    else:
        # num2date comes from an old netCDF4 build that predates the
        # only_use_cftime_datetimes keyword.
        decoded = cftime.num2date(num_dates, units, calendar)
    dates = np.asarray(decoded)

    # Inspect the earliest and latest decoded dates to decide whether the
    # whole axis fits inside the np.datetime64[ns]-representable years.
    earliest = dates[np.nanargmin(num_dates)]
    latest = dates[np.nanargmax(num_dates)]
    if earliest.year < 1678 or latest.year >= 2262:
        if calendar in _STANDARD_CALENDARS:
            warnings.warn(
                'Unable to decode time axis into full '
                'numpy.datetime64 objects, continuing using dummy '
                'cftime.datetime objects instead, reason: dates out '
                'of range', SerializationWarning, stacklevel=3)
    else:
        if calendar in _STANDARD_CALENDARS:
            dates = cftime_to_nptime(dates)
    return dates


def _decode_cf_datetime_dtype(data, units, calendar):
def _decode_cf_datetime_dtype(data, units, calendar, use_cftime):
# Verify that at least the first and last date can be decoded
# successfully. Otherwise, tracebacks end up swallowed by
# Dataset.__repr__ when users try to view their lazily decoded array.
Expand All @@ -115,7 +90,8 @@ def _decode_cf_datetime_dtype(data, units, calendar):
last_item(values) or [0]])

try:
result = decode_cf_datetime(example_value, units, calendar)
result = decode_cf_datetime(example_value, units, calendar,
use_cftime)
except Exception:
calendar_msg = ('the default calendar' if calendar is None
else 'calendar %r' % calendar)
Expand All @@ -129,7 +105,52 @@ def _decode_cf_datetime_dtype(data, units, calendar):
return dtype


def decode_cf_datetime(num_dates, units, calendar=None):
def _decode_datetime_with_cftime(num_dates, units, calendar):
    """Decode numeric times into an ndarray of cftime.datetime objects."""
    cftime = _import_cftime()

    if cftime.__name__ != 'cftime':
        # Must be using num2date from an old version of netCDF4 which
        # does not have the only_use_cftime_datetimes option.
        return np.asarray(cftime.num2date(num_dates, units, calendar))
    return np.asarray(
        cftime.num2date(num_dates, units, calendar,
                        only_use_cftime_datetimes=True))


def _decode_datetime_with_pandas(flat_num_dates, units, calendar):
    """Decode numeric times to np.datetime64[ns] values using pandas.

    Raises OutOfBoundsDatetime when the calendar is non-standard, when the
    reference date in ``units`` is not an ISO timestamp pandas can parse, or
    when a decoded date falls outside the pandas.Timestamp-valid range —
    callers catch this and fall back to cftime-based decoding.
    """
    if calendar not in _STANDARD_CALENDARS:
        raise OutOfBoundsDatetime(
            'Cannot decode times from a non-standard calendar, {!r}, using '
            'pandas.'.format(calendar))

    delta, ref_date = _unpack_netcdf_time_units(units)
    delta = _netcdf_to_numpy_timeunit(delta)
    try:
        ref_date = pd.Timestamp(ref_date)
    except ValueError:
        # ValueError is raised by pd.Timestamp for non-ISO timestamp
        # strings, in which case we fall back to using cftime
        raise OutOfBoundsDatetime

    # fixes: https://github.com/pydata/pandas/issues/14068
    # these lines check if the lowest or the highest value in dates
    # cause an OutOfBoundsDatetime (Overflow) error; the expressions are
    # evaluated purely for that side effect and the results are discarded
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', 'invalid value encountered',
                                RuntimeWarning)
        pd.to_timedelta(flat_num_dates.min(), delta) + ref_date
        pd.to_timedelta(flat_num_dates.max(), delta) + ref_date

    # Cast input dates to integers of nanoseconds because `pd.to_datetime`
    # works much faster when dealing with integers
    # make _NS_PER_TIME_DELTA an array to ensure type upcasting
    flat_num_dates_ns_int = (flat_num_dates.astype(np.float64) *
                             _NS_PER_TIME_DELTA[delta]).astype(np.int64)

    return (pd.to_timedelta(flat_num_dates_ns_int, 'ns') + ref_date).values


def decode_cf_datetime(num_dates, units, calendar=None, use_cftime=None):
"""Given an array of numeric dates in netCDF format, convert it into a
numpy array of date time objects.
Expand All @@ -149,41 +170,30 @@ def decode_cf_datetime(num_dates, units, calendar=None):
if calendar is None:
calendar = 'standard'

delta, ref_date = _unpack_netcdf_time_units(units)

try:
if calendar not in _STANDARD_CALENDARS:
raise OutOfBoundsDatetime

delta = _netcdf_to_numpy_timeunit(delta)
if use_cftime is None:
try:
ref_date = pd.Timestamp(ref_date)
except ValueError:
# ValueError is raised by pd.Timestamp for non-ISO timestamp
# strings, in which case we fall back to using cftime
raise OutOfBoundsDatetime

# fixes: https://github.com/pydata/pandas/issues/14068
# these lines check if the lowest or the highest value in dates
# cause an OutOfBoundsDatetime (Overflow) error
with warnings.catch_warnings():
warnings.filterwarnings('ignore', 'invalid value encountered',
RuntimeWarning)
pd.to_timedelta(flat_num_dates.min(), delta) + ref_date
pd.to_timedelta(flat_num_dates.max(), delta) + ref_date

# Cast input dates to integers of nanoseconds because `pd.to_datetime`
# works much faster when dealing with integers
# make _NS_PER_TIME_DELTA an array to ensure type upcasting
flat_num_dates_ns_int = (flat_num_dates.astype(np.float64) *
_NS_PER_TIME_DELTA[delta]).astype(np.int64)

dates = (pd.to_timedelta(flat_num_dates_ns_int, 'ns') +
ref_date).values

except (OutOfBoundsDatetime, OverflowError):
dates = _decode_datetime_with_pandas(flat_num_dates, units,
calendar)
except (OutOfBoundsDatetime, OverflowError):
dates = _decode_datetime_with_cftime(
flat_num_dates.astype(np.float), units, calendar)

if (dates[np.nanargmin(num_dates)].year < 1678 or
dates[np.nanargmax(num_dates)].year >= 2262):
if calendar in _STANDARD_CALENDARS:
warnings.warn(
'Unable to decode time axis into full '
'numpy.datetime64 objects, continuing using '
'cftime.datetime objects instead, reason: dates out '
'of range', SerializationWarning, stacklevel=3)
else:
if calendar in _STANDARD_CALENDARS:
dates = cftime_to_nptime(dates)
elif use_cftime:
dates = _decode_datetime_with_cftime(
flat_num_dates.astype(np.float), units, calendar)
else:
dates = _decode_datetime_with_pandas(flat_num_dates, units, calendar)

return dates.reshape(num_dates.shape)

Expand Down Expand Up @@ -383,6 +393,8 @@ def encode_cf_timedelta(timedeltas, units=None):


class CFDatetimeCoder(VariableCoder):
def __init__(self, use_cftime=None):
self.use_cftime = use_cftime

def encode(self, variable, name=None):
dims, data, attrs, encoding = unpack_for_encoding(variable)
Expand All @@ -403,9 +415,11 @@ def decode(self, variable, name=None):
if 'units' in attrs and 'since' in attrs['units']:
units = pop_to(attrs, encoding, 'units')
calendar = pop_to(attrs, encoding, 'calendar')
dtype = _decode_cf_datetime_dtype(data, units, calendar)
dtype = _decode_cf_datetime_dtype(data, units, calendar,
self.use_cftime)
transform = partial(
decode_cf_datetime, units=units, calendar=calendar)
decode_cf_datetime, units=units, calendar=calendar,
use_cftime=self.use_cftime)
data = lazy_elemwise_func(data, transform, dtype)

return Variable(dims, data, attrs, encoding)
Expand Down
34 changes: 28 additions & 6 deletions xarray/conventions.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,7 @@ def encode_cf_variable(var, needs_copy=True, name=None):

def decode_cf_variable(name, var, concat_characters=True, mask_and_scale=True,
decode_times=True, decode_endianness=True,
stack_char_dim=True):
stack_char_dim=True, use_cftime=None):
"""
Decodes a variable which may hold CF encoded information.
Expand Down Expand Up @@ -270,6 +270,16 @@ def decode_cf_variable(name, var, concat_characters=True, mask_and_scale=True,
Whether to stack characters into bytes along the last dimension of this
array. Passed as an argument because we need to look at the full
dataset to figure out if this is appropriate.
use_cftime: bool, optional
Only relevant if encoded dates come from a standard calendar
(e.g. 'gregorian', 'proleptic_gregorian', 'standard', or not
specified). If None (default), attempt to decode times to
``np.datetime64[ns]`` objects; if this is not possible, decode times to
``cftime.datetime`` objects. If True, always decode times to
``cftime.datetime`` objects, regardless of whether or not they can be
represented using ``np.datetime64[ns]`` objects. If False, always
decode times to ``np.datetime64[ns]`` objects; if this is not possible
raise an error.
Returns
-------
Expand All @@ -292,7 +302,7 @@ def decode_cf_variable(name, var, concat_characters=True, mask_and_scale=True,

if decode_times:
for coder in [times.CFTimedeltaCoder(),
times.CFDatetimeCoder()]:
times.CFDatetimeCoder(use_cftime=use_cftime)]:
var = coder.decode(var, name=name)

dimensions, data, attributes, encoding = (
Expand Down Expand Up @@ -346,7 +356,8 @@ def _update_bounds_attributes(variables):

def decode_cf_variables(variables, attributes, concat_characters=True,
mask_and_scale=True, decode_times=True,
decode_coords=True, drop_variables=None):
decode_coords=True, drop_variables=None,
use_cftime=None):
"""
Decode several CF encoded variables.
Expand Down Expand Up @@ -387,7 +398,7 @@ def stackable(dim):
new_vars[k] = decode_cf_variable(
k, v, concat_characters=concat_characters,
mask_and_scale=mask_and_scale, decode_times=decode_times,
stack_char_dim=stack_char_dim)
stack_char_dim=stack_char_dim, use_cftime=use_cftime)
if decode_coords:
var_attrs = new_vars[k].attrs
if 'coordinates' in var_attrs:
Expand All @@ -406,7 +417,8 @@ def stackable(dim):


def decode_cf(obj, concat_characters=True, mask_and_scale=True,
decode_times=True, decode_coords=True, drop_variables=None):
decode_times=True, decode_coords=True, drop_variables=None,
use_cftime=None):
"""Decode the given Dataset or Datastore according to CF conventions into
a new Dataset.
Expand All @@ -430,6 +442,16 @@ def decode_cf(obj, concat_characters=True, mask_and_scale=True,
A variable or list of variables to exclude from being parsed from the
dataset. This may be useful to drop variables with problems or
inconsistent values.
use_cftime: bool, optional
Only relevant if encoded dates come from a standard calendar
(e.g. 'gregorian', 'proleptic_gregorian', 'standard', or not
specified). If None (default), attempt to decode times to
``np.datetime64[ns]`` objects; if this is not possible, decode times to
``cftime.datetime`` objects. If True, always decode times to
``cftime.datetime`` objects, regardless of whether or not they can be
represented using ``np.datetime64[ns]`` objects. If False, always
decode times to ``np.datetime64[ns]`` objects; if this is not possible
raise an error.
Returns
-------
Expand All @@ -454,7 +476,7 @@ def decode_cf(obj, concat_characters=True, mask_and_scale=True,

vars, attrs, coord_names = decode_cf_variables(
vars, attrs, concat_characters, mask_and_scale, decode_times,
decode_coords, drop_variables=drop_variables)
decode_coords, drop_variables=drop_variables, use_cftime=use_cftime)
ds = Dataset(vars, attrs=attrs)
ds = ds.set_coords(coord_names.union(extra_coords).intersection(vars))
ds._file_obj = file_obj
Expand Down
Loading

0 comments on commit 612d390

Please sign in to comment.