Preserve nanosecond resolution when encoding/decoding times (#7827)
* preserve nanosecond resolution when encoding/decoding times.

* Apply suggestions from code review

Co-authored-by: Spencer Clark <spencerkclark@gmail.com>

* use emit_user_level_warning

* move time alignment for nc3 to encode_nc3_variable

* fix test for encode_cf_timedelta

* fix CFMaskCoder for time-like (also allow timedelta64), add first tests

* rename to _unpack_time_units_and_ref_date as suggested in review

* refactor delta -> time_units as suggested in review

* refactor out function _time_units_to_timedelta64, reorder flow and remove unneeded checks, apply filterwarnings, adapt tests

* import _is_time_like from coding.variables

* adapt tests, add _numpy_to_netcdf_timeunit-conversion function

* adapt tests, add _numpy_to_netcdf_timeunit-conversion function

* adapt test as per review, remove arm_xfail for backend test

* add whats-new.rst entry

* Update doc/whats-new.rst

Co-authored-by: Spencer Clark <spencerkclark@gmail.com>

* Update doc/whats-new.rst

Co-authored-by: Spencer Clark <spencerkclark@gmail.com>

* fix whats-new.rst

---------

Co-authored-by: Spencer Clark <spencerkclark@gmail.com>
kmuehlbauer and spencerkclark authored Sep 17, 2023
1 parent ecdb8c1 commit b6d4bf7
Showing 6 changed files with 315 additions and 45 deletions.
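To illustrate the user-facing effect of this change, here is a minimal sketch (file name and values are illustrative): datetimes containing NaT are now encoded through int64 with a sentinel value instead of a lossy cast through float64, so nanosecond resolution survives a roundtrip.

```python
import pandas as pd
import xarray as xr

# Nanosecond-resolution times with a missing value (NaT).
times = pd.to_datetime(["2000-01-01T00:00:00.123456789", "NaT"])
ds = xr.Dataset({"t": ("t", times)})

ds.to_netcdf("times.nc")  # time units are inferred automatically
with xr.open_dataset("times.nc") as roundtripped:
    # Both the nanosecond component and the NaT are preserved.
    print(roundtripped["t"].values)
```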
6 changes: 6 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
@@ -76,6 +76,10 @@ Bug fixes
- Calling plot with kwargs ``col``, ``row`` or ``hue`` no longer squeezes dimensions passed via these arguments
(:issue:`7552`, :pull:`8174`).
By `Wiktor Kraśnicki <https://github.com/wkrasnicki>`_.
- Fixed a bug where casting from ``float`` to ``int64`` (undefined for ``NaN``) led to varying
issues (:issue:`7817`, :issue:`7942`, :issue:`7790`, :issue:`6191`, :issue:`7096`,
:issue:`1064`, :pull:`7827`).
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.

Documentation
~~~~~~~~~~~~~
@@ -86,6 +90,8 @@ Internal Changes

- Many error messages related to invalid dimensions or coordinates now always show the list of valid dims/coords (:pull:`8079`).
By `András Gunyhó <https://github.com/mgunyho>`_.
- Refactor of encoding and decoding times/timedeltas to preserve nanosecond resolution in arrays that contain missing values (:pull:`7827`).
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.

.. _whats-new.2023.08.0:

19 changes: 18 additions & 1 deletion xarray/backends/netcdf3.py
@@ -88,13 +88,30 @@ def encode_nc3_attrs(attrs):
return {k: encode_nc3_attr_value(v) for k, v in attrs.items()}


def _maybe_prepare_times(var):
    # checks for integer-based time-like and
    # replaces np.iinfo(np.int64).min with _FillValue or np.nan
    # this keeps backwards compatibility

    data = var.data
    if data.dtype.kind in "iu":
        units = var.attrs.get("units", None)
        if units is not None:
            if coding.variables._is_time_like(units):
                mask = data == np.iinfo(np.int64).min
                if mask.any():
                    data = np.where(mask, var.attrs.get("_FillValue", np.nan), data)
    return data
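A quick usage sketch of the helper above (values illustrative): integer time-like data uses np.iinfo(np.int64).min as its missing-value sentinel, which netCDF3 cannot store as-is, so the sentinel is mapped back to the variable's _FillValue (or NaN when none is set) before dtype coercion.

```python
import numpy as np
from xarray.core.variable import Variable

var = Variable(
    ("time",),
    np.array([0, np.iinfo(np.int64).min]),
    attrs={"units": "seconds since 2000-01-01", "_FillValue": -1},
)
print(_maybe_prepare_times(var))  # -> [ 0 -1]
```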


def encode_nc3_variable(var):
    for coder in [
        coding.strings.EncodedStringCoder(allows_unicode=False),
        coding.strings.CharacterArrayCoder(),
    ]:
        var = coder.encode(var)
-    data = coerce_nc3_dtype(var.data)
+    data = _maybe_prepare_times(var)
+    data = coerce_nc3_dtype(data)
    attrs = encode_nc3_attrs(var.attrs)
    return Variable(var.dims, data, attrs, var.encoding)

139 changes: 104 additions & 35 deletions xarray/coding/times.py
@@ -25,6 +25,7 @@
from xarray.core.formatting import first_n_items, format_timestamp, last_item
from xarray.core.pdcompat import nanosecond_precision_timestamp
from xarray.core.pycompat import is_duck_dask_array
from xarray.core.utils import emit_user_level_warning
from xarray.core.variable import Variable

try:
@@ -122,6 +123,18 @@ def _netcdf_to_numpy_timeunit(units: str) -> str:
}[units]


def _numpy_to_netcdf_timeunit(units: str) -> str:
    return {
        "ns": "nanoseconds",
        "us": "microseconds",
        "ms": "milliseconds",
        "s": "seconds",
        "m": "minutes",
        "h": "hours",
        "D": "days",
    }[units]
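This is the inverse of the existing _netcdf_to_numpy_timeunit mapping; a tiny sanity check:

```python
assert _numpy_to_netcdf_timeunit("ns") == "nanoseconds"
assert _netcdf_to_numpy_timeunit("nanoseconds") == "ns"
```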


def _ensure_padded_year(ref_date: str) -> str:
# Reference dates without a padded year (e.g. since 1-1-1 or since 2-3-4)
# are ambiguous (is it YMD or DMY?). This can lead to some very odd
@@ -171,6 +184,20 @@ def _unpack_netcdf_time_units(units: str) -> tuple[str, str]:
return delta_units, ref_date


def _unpack_time_units_and_ref_date(units: str) -> tuple[str, pd.Timestamp]:
    # same as _unpack_netcdf_time_units but finalizes ref_date for
    # processing in encode_cf_datetime
    time_units, _ref_date = _unpack_netcdf_time_units(units)
    # TODO: the strict enforcement of nanosecond precision Timestamps can be
    # relaxed when addressing GitHub issue #7493.
    ref_date = nanosecond_precision_timestamp(_ref_date)
    # If the ref_date Timestamp is timezone-aware, convert to UTC and
    # make it timezone-naive (GH 2649).
    if ref_date.tz is not None:
        ref_date = ref_date.tz_convert(None)
    return time_units, ref_date
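A short sketch of what the helper returns (values illustrative), including the timezone normalization:

```python
units, ref_date = _unpack_time_units_and_ref_date(
    "seconds since 2000-01-01 00:00:00+05:00"
)
print(units)     # "seconds"
print(ref_date)  # 1999-12-31 19:00:00 (converted to UTC, made tz-naive)
```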


def _decode_cf_datetime_dtype(
data, units: str, calendar: str, use_cftime: bool | None
) -> np.dtype:
@@ -222,8 +249,8 @@ def _decode_datetime_with_pandas(
"pandas."
)

-    delta, ref_date = _unpack_netcdf_time_units(units)
-    delta = _netcdf_to_numpy_timeunit(delta)
+    time_units, ref_date = _unpack_netcdf_time_units(units)
+    time_units = _netcdf_to_numpy_timeunit(time_units)
try:
# TODO: the strict enforcement of nanosecond precision Timestamps can be
# relaxed when addressing GitHub issue #7493.
@@ -237,8 +264,8 @@
warnings.filterwarnings("ignore", "invalid value encountered", RuntimeWarning)
if flat_num_dates.size > 0:
# avoid size 0 datetimes GH1329
-            pd.to_timedelta(flat_num_dates.min(), delta) + ref_date
-            pd.to_timedelta(flat_num_dates.max(), delta) + ref_date
+            pd.to_timedelta(flat_num_dates.min(), time_units) + ref_date
+            pd.to_timedelta(flat_num_dates.max(), time_units) + ref_date

# To avoid integer overflow when converting to nanosecond units for integer
# dtypes smaller than np.int64 cast all integer and unsigned integer dtype
@@ -251,9 +278,12 @@

# Cast input ordinals to integers of nanoseconds because pd.to_timedelta
# works much faster when dealing with integers (GH 1399).
-    flat_num_dates_ns_int = (flat_num_dates * _NS_PER_TIME_DELTA[delta]).astype(
-        np.int64
-    )
+    # properly handle NaN/NaT to prevent casting NaN to int
+    nan = np.isnan(flat_num_dates) | (flat_num_dates == np.iinfo(np.int64).min)
+    flat_num_dates = flat_num_dates * _NS_PER_TIME_DELTA[time_units]
+    flat_num_dates_ns_int = np.zeros_like(flat_num_dates, dtype=np.int64)
+    flat_num_dates_ns_int[nan] = np.iinfo(np.int64).min
+    flat_num_dates_ns_int[~nan] = flat_num_dates[~nan].astype(np.int64)

# Use pd.to_timedelta to safely cast integer values to timedeltas,
# and add those to a Timestamp to safely produce a DatetimeIndex. This
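Casting NaN to int64 directly is undefined (NumPy warns with "invalid value encountered"), so missing entries are routed to the int64 sentinel first. A standalone sketch of the pattern used above:

```python
import numpy as np

num_dates = np.array([0.0, 1.5, np.nan]) * 1_000_000_000  # seconds -> ns
nan = np.isnan(num_dates)
as_int = np.zeros_like(num_dates, dtype=np.int64)
as_int[nan] = np.iinfo(np.int64).min  # NaT sentinel
as_int[~nan] = num_dates[~nan].astype(np.int64)
print(as_int)  # [0 1500000000 -9223372036854775808]
```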
@@ -364,6 +394,10 @@ def _infer_time_units_from_diff(unique_timedeltas) -> str:
return "seconds"


def _time_units_to_timedelta64(units: str) -> np.timedelta64:
    return np.timedelta64(1, _netcdf_to_numpy_timeunit(units)).astype("timedelta64[ns]")


def infer_calendar_name(dates) -> CFCalendar:
"""Given an array of datetimes, infer the CF calendar name"""
if is_np_datetime_like(dates.dtype):
@@ -572,9 +606,12 @@ def _should_cftime_be_used(


def _cleanup_netcdf_time_units(units: str) -> str:
-    delta, ref_date = _unpack_netcdf_time_units(units)
+    time_units, ref_date = _unpack_netcdf_time_units(units)
+    time_units = time_units.lower()
+    if not time_units.endswith("s"):
+        time_units = f"{time_units}s"
    try:
-        units = f"{delta} since {format_timestamp(ref_date)}"
+        units = f"{time_units} since {format_timestamp(ref_date)}"
except (OutOfBoundsDatetime, ValueError):
# don't worry about reifying the units if they're out of bounds or
# formatted badly
@@ -633,62 +670,93 @@ def encode_cf_datetime(
"""
dates = np.asarray(dates)

+    data_units = infer_datetime_units(dates)
+
    if units is None:
-        units = infer_datetime_units(dates)
+        units = data_units
else:
units = _cleanup_netcdf_time_units(units)

if calendar is None:
calendar = infer_calendar_name(dates)

-    delta, _ref_date = _unpack_netcdf_time_units(units)
try:
if not _is_standard_calendar(calendar) or dates.dtype.kind == "O":
# parse with cftime instead
raise OutOfBoundsDatetime
assert dates.dtype == "datetime64[ns]"

-        delta_units = _netcdf_to_numpy_timeunit(delta)
-        time_delta = np.timedelta64(1, delta_units).astype("timedelta64[ns]")
-
-        # TODO: the strict enforcement of nanosecond precision Timestamps can be
-        # relaxed when addressing GitHub issue #7493.
-        ref_date = nanosecond_precision_timestamp(_ref_date)
+        time_units, ref_date = _unpack_time_units_and_ref_date(units)
+        time_delta = _time_units_to_timedelta64(time_units)

-        # If the ref_date Timestamp is timezone-aware, convert to UTC and
-        # make it timezone-naive (GH 2649).
-        if ref_date.tz is not None:
-            ref_date = ref_date.tz_convert(None)
+        # retrieve needed units to faithfully encode to int64
+        needed_units, data_ref_date = _unpack_time_units_and_ref_date(data_units)
+        if data_units != units:
+            # this accounts for differences in the reference times
+            ref_delta = abs(data_ref_date - ref_date).to_timedelta64()
+            if ref_delta > np.timedelta64(0, "ns"):
+                needed_units = _infer_time_units_from_diff(ref_delta)

# Wrap the dates in a DatetimeIndex to do the subtraction to ensure
# an OverflowError is raised if the ref_date is too far away from
# dates to be encoded (GH 2272).
dates_as_index = pd.DatetimeIndex(dates.ravel())
time_deltas = dates_as_index - ref_date

-        # Use floor division if time_delta evenly divides all differences
-        # to preserve integer dtype if possible (GH 4045).
-        if np.all(time_deltas % time_delta == np.timedelta64(0, "ns")):
-            num = time_deltas // time_delta
+        # needed time delta to encode faithfully to int64
+        needed_time_delta = _time_units_to_timedelta64(needed_units)
+        if time_delta <= needed_time_delta:
+            # calculate int64 floor division
+            # to preserve integer dtype if possible (GH 4045, GH7817).
+            num = time_deltas // time_delta.astype(np.int64)
+            num = num.astype(np.int64, copy=False)
        else:
+            emit_user_level_warning(
+                f"Times can't be serialized faithfully with requested units {units!r}. "
+                f"Resolution of {needed_units!r} needed. "
+                f"Serializing timeseries to floating point."
+            )
            num = time_deltas / time_delta
        num = num.values.reshape(dates.shape)

except (OutOfBoundsDatetime, OverflowError, ValueError):
num = _encode_datetime_with_cftime(dates, units, calendar)
+        # do it now only for cftime-based flow
+        # we already covered for this in pandas-based flow
+        num = cast_to_int_if_safe(num)

-    num = cast_to_int_if_safe(num)
return (num, units, calendar)
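A sketch of the resulting behavior, calling the internal encoder directly (values illustrative): with sufficiently fine units the result stays int64; when the requested units are too coarse for the data, a warning is emitted and encoding falls back to floating point instead of silently truncating.

```python
import numpy as np
from xarray.coding.times import encode_cf_datetime

dates = np.array(["2000-01-01T00:00:00.500"], dtype="datetime64[ns]")

num, units, calendar = encode_cf_datetime(dates, "milliseconds since 2000-01-01")
print(num.dtype, num)  # int64 [500]

num, units, calendar = encode_cf_datetime(dates, "seconds since 2000-01-01")
# UserWarning: Times can't be serialized faithfully ... -> float fallback
print(num.dtype, num)  # float64 [0.5]
```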


def encode_cf_timedelta(timedeltas, units: str | None = None) -> tuple[np.ndarray, str]:
-    if units is None:
-        units = infer_timedelta_units(timedeltas)
+    data_units = infer_timedelta_units(timedeltas)

-    np_unit = _netcdf_to_numpy_timeunit(units)
-    num = 1.0 * timedeltas / np.timedelta64(1, np_unit)
-    num = np.where(pd.isnull(timedeltas), np.nan, num)
-    num = cast_to_int_if_safe(num)
+    if units is None:
+        units = data_units

    time_delta = _time_units_to_timedelta64(units)
    time_deltas = pd.TimedeltaIndex(timedeltas.ravel())

    # retrieve needed units to faithfully encode to int64
    needed_units = data_units
    if data_units != units:
        needed_units = _infer_time_units_from_diff(np.unique(time_deltas.dropna()))

    # needed time delta to encode faithfully to int64
    needed_time_delta = _time_units_to_timedelta64(needed_units)
    if time_delta <= needed_time_delta:
        # calculate int64 floor division
        # to preserve integer dtype if possible
        num = time_deltas // time_delta.astype(np.int64)
        num = num.astype(np.int64, copy=False)
    else:
        emit_user_level_warning(
            f"Timedeltas can't be serialized faithfully with requested units {units!r}. "
            f"Resolution of {needed_units!r} needed. "
            f"Serializing timedeltas to floating point."
        )
        num = time_deltas / time_delta
    num = num.values.reshape(timedeltas.shape)
return (num, units)
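The timedelta path mirrors the datetime path; a short illustrative sketch:

```python
import numpy as np
from xarray.coding.times import encode_cf_timedelta

tds = np.array([1_500_000_000], dtype="timedelta64[ns]")  # 1.5 s

num, units = encode_cf_timedelta(tds, "milliseconds")
print(num, units)  # [1500] milliseconds

num, units = encode_cf_timedelta(tds, "seconds")  # warns, falls back to float
print(num, units)  # [1.5] seconds
```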


@@ -702,9 +770,10 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable:
) or contains_cftime_datetimes(variable):
dims, data, attrs, encoding = unpack_for_encoding(variable)

-            (data, units, calendar) = encode_cf_datetime(
-                data, encoding.pop("units", None), encoding.pop("calendar", None)
-            )
+            units = encoding.pop("units", None)
+            calendar = encoding.pop("calendar", None)
+            (data, units, calendar) = encode_cf_datetime(data, units, calendar)

safe_setitem(attrs, "units", units, name=name)
safe_setitem(attrs, "calendar", calendar, name=name)

38 changes: 35 additions & 3 deletions xarray/coding/variables.py
@@ -215,6 +215,21 @@ def _apply_mask(
return np.where(condition, decoded_fill_value, data)


def _is_time_like(units):
    # test for time-like
    time_strings = [
        "since",
        "days",
        "hours",
        "minutes",
        "seconds",
        "milliseconds",
        "microseconds",
        "nanoseconds",
    ]
    return any(tstr in str(units) for tstr in time_strings)
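The check is a plain substring test on the units attribute; for example:

```python
assert _is_time_like("seconds since 2000-01-01")
assert _is_time_like("days")
assert not _is_time_like("m/s")
assert not _is_time_like(None)
```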


class CFMaskCoder(VariableCoder):
"""Mask or unmask fill values according to CF conventions."""

@@ -236,19 +251,32 @@ def encode(self, variable: Variable, name: T_Name = None):
f"Variable {name!r} has conflicting _FillValue ({fv}) and missing_value ({mv}). Cannot encode data."
)

# special case DateTime to properly handle NaT
is_time_like = _is_time_like(attrs.get("units"))

if fv_exists:
# Ensure _FillValue is cast to same dtype as data's
encoding["_FillValue"] = dtype.type(fv)
fill_value = pop_to(encoding, attrs, "_FillValue", name=name)
if not pd.isnull(fill_value):
-                data = duck_array_ops.fillna(data, fill_value)
+                if is_time_like and data.dtype.kind in "iu":
+                    data = duck_array_ops.where(
+                        data != np.iinfo(np.int64).min, data, fill_value
+                    )
+                else:
+                    data = duck_array_ops.fillna(data, fill_value)

if mv_exists:
# Ensure missing_value is cast to same dtype as data's
encoding["missing_value"] = dtype.type(mv)
fill_value = pop_to(encoding, attrs, "missing_value", name=name)
if not pd.isnull(fill_value) and not fv_exists:
-                data = duck_array_ops.fillna(data, fill_value)
+                if is_time_like and data.dtype.kind in "iu":
+                    data = duck_array_ops.where(
+                        data != np.iinfo(np.int64).min, data, fill_value
+                    )
+                else:
+                    data = duck_array_ops.fillna(data, fill_value)

return Variable(dims, data, attrs, encoding, fastpath=True)
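A sketch of the new encode branch, using xarray internals (values illustrative): for time-like integer data the int64 sentinel marks missing values, so the coder substitutes _FillValue by comparing against the sentinel rather than calling fillna, which only catches NaN:

```python
import numpy as np
from xarray.coding.variables import CFMaskCoder
from xarray.core.variable import Variable

var = Variable(
    ("time",),
    np.array([0, np.iinfo(np.int64).min]),
    attrs={"units": "seconds since 2000-01-01"},
    encoding={"_FillValue": -9999},
)
encoded = CFMaskCoder().encode(var)
print(encoded.data)                 # [    0 -9999]
print(encoded.attrs["_FillValue"])  # -9999
```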

@@ -275,7 +303,11 @@ def decode(self, variable: Variable, name: T_Name = None):
stacklevel=3,
)

-            dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype)
+            # special case DateTime to properly handle NaT
+            if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu":
+                dtype, decoded_fill_value = np.int64, np.iinfo(np.int64).min
+            else:
+                dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype)

if encoded_fill_values:
transform = partial(
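And the decode side, sketched the same way: the fill value is decoded to the int64 NaT sentinel while keeping integer dtype, instead of promoting the whole array to float64 (the subsequent time decoder turns the sentinel into NaT):

```python
import numpy as np
from xarray.coding.variables import CFMaskCoder
from xarray.core.variable import Variable

var = Variable(
    ("time",),
    np.array([0, -9999]),
    attrs={"units": "seconds since 2000-01-01", "_FillValue": -9999},
)
decoded = CFMaskCoder().decode(var)
print(decoded.dtype)   # int64 (previously promoted to float64)
print(decoded.values)  # [0 -9223372036854775808]
```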
2 changes: 0 additions & 2 deletions xarray/tests/test_backends.py
@@ -54,7 +54,6 @@
from xarray.core.options import set_options
from xarray.core.pycompat import array_type
from xarray.tests import (
-    arm_xfail,
assert_allclose,
assert_array_equal,
assert_equal,
@@ -526,7 +525,6 @@ def test_roundtrip_string_encoded_characters(self) -> None:
assert_identical(expected, actual)
assert actual["x"].encoding["_Encoding"] == "ascii"

-    @arm_xfail
def test_roundtrip_numpy_datetime_data(self) -> None:
times = pd.to_datetime(["2000-01-01", "2000-01-02", "NaT"])
expected = Dataset({"t": ("t", times), "t0": times[0]})
