Commit: Use original_shape of final ds and fix test

tomvothecoder committed Aug 15, 2022
1 parent 51815e7 commit 86760e7
Showing 2 changed files with 24 additions and 20 deletions.
38 changes: 19 additions & 19 deletions tests/test_dataset.py
@@ -306,34 +306,34 @@ def setUp(self, tmp_path):
self.file_path2 = f"{dir}/file2.nc"

def test_mfdataset_keeps_time_encoding_dict(self):
# FIXME: This test always passes because `xr.open_mfdataset()` always
# keeps the time encoding attrs, which isn't the expected behavior.
# Based on this test, if datasets are generated in xarray, written out
# with `to_netcdf()`, and then opened and merged using
# `xr.open_mfdataset()`, the time encoding attributes are not dropped.
# On the other hand, if multiple real-world datasets that did not
# originate from xarray (written out with `.to_netcdf()`) are opened
# using `xr.open_mfdataset()`, the time encoding attrs are dropped.
# (Refer to https://github.com/pydata/xarray/issues/2436). My theory is
# that xarray maintains the time encoding attrs if datasets are written
# out with `.to_netcdf()`, and drops them in other cases, such as
# opening multiple datasets from other sources.
ds1 = generate_dataset(cf_compliant=True, has_bounds=True)
ds1.to_netcdf(self.file_path1)

# Create another dataset that extends the time coordinates by 1 value,
# to mimic a multifile dataset.
ds2 = generate_dataset(cf_compliant=True, has_bounds=True)
ds2 = ds2.rename_vars({"ts": "tas"})
ds2 = ds2.isel(dict(time=slice(0, 1)))
ds2["time"].values[:] = np.array(
["2002-01-16T12:00:00.000000000"],
dtype="datetime64[ns]",
)
ds2.to_netcdf(self.file_path2)

result = open_mfdataset([self.file_path1, self.file_path2], decode_times=True)
expected = ds1.copy().merge(ds2.copy())
expected = ds1.merge(ds2)

assert result.identical(expected)

# We mainly care for calendar and units attributes. We don't perform
# equality assertion on the entire time `.encoding` dict because there
# might be different encoding attributes added or removed between xarray
# versions (e.g., "bzip2", "zstd", "blosc", and "szip" are added in
# v2022.06.0), which makes that assertion fragile.
# We mainly care for the "source" and "original_shape" attrs (updated
# internally by xCDAT), and the "calendar" and "units" attrs. We don't
# perform equality assertion on the entire time `.encoding` dict because
# there might be different encoding attributes added or removed between
# xarray versions (e.g., "bzip2", "zstd", "blosc", and "szip" are added
# in v2022.06.0), which makes that assertion fragile.
paths = result.time.encoding["source"]
assert self.file_path1 in paths[0]
assert self.file_path2 in paths[1]
assert result.time.encoding["original_shape"] == (16,)
assert result.time.encoding["calendar"] == "standard"
assert result.time.encoding["units"] == "days since 2000-01-01"
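
A minimal standalone sketch (not part of the commit; the file names and values are illustrative) of why the (16,) assertion needs the accompanying dataset.py change: the time encoding kept from the first file reports only that file's time length, while the merged result has one extra time step.

import numpy as np
import pandas as pd
import xarray as xr

# Two small files mimicking the fixture above: 15 monthly time steps in the
# first file and 1 extra, later time step in the second.
ds1 = xr.Dataset(
    {"ts": ("time", np.zeros(15))},
    coords={"time": pd.date_range("2000-01-16", periods=15, freq="MS")},
)
ds2 = xr.Dataset(
    {"ts": ("time", np.zeros(1))},
    coords={"time": [np.datetime64("2002-01-16T12:00:00")]},
)
ds1.to_netcdf("file1.nc")
ds2.to_netcdf("file2.nc")

first = xr.open_dataset("file1.nc", decode_times=True)
merged = xr.open_mfdataset(["file1.nc", "file2.nc"], decode_times=True)

# The encoding captured from the first file describes only that file ...
print(first.time.encoding["original_shape"])  # expected: (15,)
# ... while the merged time axis is longer, which is what xCDAT now records.
print(merged.time.shape)  # (16,)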

6 changes: 5 additions & 1 deletion xcdat/dataset.py
@@ -232,6 +232,8 @@ def open_mfdataset(
if time_encoding is not None:
time_dim = get_axis_dim(ds, "T")
ds[time_dim].encoding = time_encoding
# Update "original_shape" to reflect the final time coordinates shape.
ds[time_dim].encoding["original_shape"] = ds[time_dim].shape

return ds

@@ -448,9 +450,11 @@ def _keep_time_encoding(paths: Paths) -> Dict[Hashable, Any]:
# FIXME: Remove `type: ignore` comment after properly handling the type
# annotations in `_get_first_path()`.
ds = open_dataset(first_path, decode_times=True, add_bounds=False) # type: ignore

time_coord = get_axis_coord(ds, "T")

time_encoding = time_coord.encoding
time_encoding["source"] = paths

return time_coord.encoding
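
Taken together with the open_mfdataset hunk above, the change follows a keep/re-apply/refresh pattern. Below is a hedged sketch using plain xarray; the helper name, the hard-coded "time" name, and the file paths are assumptions standing in for xCDAT's `_keep_time_encoding`, `get_axis_coord`/`get_axis_dim`, and `Paths`.

from typing import Any, Dict, List

import xarray as xr


def keep_time_encoding(paths: List[str], time_name: str = "time") -> Dict[str, Any]:
    # Open only the first file; its time encoding (units, calendar, dtype,
    # original_shape, ...) stands in for the whole multi-file dataset.
    ds = xr.open_dataset(paths[0], decode_times=True)
    time_encoding = dict(ds[time_name].encoding)
    # Record every contributing file, mirroring the "source" entry asserted
    # in the test above.
    time_encoding["source"] = paths
    return time_encoding


paths = ["file1.nc", "file2.nc"]  # hypothetical files, e.g. from the earlier sketch
time_encoding = keep_time_encoding(paths)

ds = xr.open_mfdataset(paths, decode_times=True)
ds["time"].encoding = time_encoding
# The kept "original_shape" still describes the first file only, so refresh it
# with the shape of the final, merged time coordinate (the fix in this commit).
ds["time"].encoding["original_shape"] = ds["time"].shape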

