Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add reset_encoding to dataset/dataarray/variable #7689

Merged
merged 5 commits into from
Mar 30, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/api-hidden.rst
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,7 @@
Variable.dims
Variable.dtype
Variable.encoding
Variable.reset_encoding
Variable.imag
Variable.nbytes
Variable.ndim
Expand Down
2 changes: 2 additions & 0 deletions doc/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ Dataset contents
Dataset.drop_dims
Dataset.set_coords
Dataset.reset_coords
Dataset.reset_encoding
Dataset.convert_calendar
Dataset.interp_calendar
Dataset.get_index
Expand Down Expand Up @@ -303,6 +304,7 @@ DataArray contents
DataArray.drop_indexes
DataArray.drop_duplicates
DataArray.reset_coords
DataArray.reset_encoding
DataArray.copy
DataArray.convert_calendar
DataArray.interp_calendar
Expand Down
31 changes: 11 additions & 20 deletions doc/user-guide/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -254,31 +254,22 @@ You can view this encoding information (among others) in the
:py:attr:`DataArray.encoding` and
:py:attr:`Dataset.encoding` attributes:

.. ipython::
:verbatim:
.. ipython:: python

In [1]: ds_disk["y"].encoding
Out[1]:
{'zlib': False,
'shuffle': False,
'complevel': 0,
'fletcher32': False,
'contiguous': True,
'chunksizes': None,
'source': 'saved_on_disk.nc',
'original_shape': (5,),
'dtype': dtype('int64'),
'units': 'days since 2000-01-01 00:00:00',
'calendar': 'proleptic_gregorian'}

In [9]: ds_disk.encoding
Out[9]:
{'unlimited_dims': set(),
'source': 'saved_on_disk.nc'}
ds_disk["y"].encoding
ds_disk.encoding

Note that all operations that manipulate variables other than indexing
will remove encoding information.

In some cases it is useful to intentionally reset a dataset's original encoding values.
This can be done with either the :py:meth:`Dataset.reset_encoding` or
:py:meth:`DataArray.reset_encoding` methods.

.. ipython:: python

ds_no_encoding = ds_disk.reset_encoding()
ds_no_encoding.encoding

.. _combining multiple files:

Expand Down
3 changes: 3 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ v2023.04.0 (unreleased)

New Features
~~~~~~~~~~~~
- New methods to reset an object's encoding (:py:meth:`Dataset.reset_encoding`, :py:meth:`DataArray.reset_encoding`).
(:issue:`7686`, :pull:`7689`).
By `Joe Hamman <https://github.com/jhamman>`_.


Breaking changes
Expand Down
6 changes: 6 additions & 0 deletions xarray/core/dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -877,6 +877,12 @@ def encoding(self) -> dict[Any, Any]:
def encoding(self, value: Mapping[Any, Any]) -> None:
self.variable.encoding = dict(value)

def reset_encoding(self: T_DataArray) -> T_DataArray:
    """Return a new DataArray with encoding removed.

    The array is converted to a temporary Dataset, the Dataset's
    ``reset_encoding`` is applied, and the result is converted back, so
    neither the array nor any attached coords keep their encoding.
    """
    temp_ds = self._to_temp_dataset()
    return self._from_temp_dataset(temp_ds.reset_encoding())

@property
def indexes(self) -> Indexes:
"""Mapping of pandas.Index objects used for label based indexing.
Expand Down
6 changes: 6 additions & 0 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -666,6 +666,12 @@ def encoding(self) -> dict[Any, Any]:
def encoding(self, value: Mapping[Any, Any]) -> None:
self._encoding = dict(value)

def reset_encoding(self: T_Dataset) -> T_Dataset:
    """Return a new Dataset with encoding removed.

    Both the dataset-level encoding and the encoding of each of its
    variables/coords are reset to empty.
    """
    stripped = {}
    for name, var in self.variables.items():
        stripped[name] = var.reset_encoding()
    return self._replace(variables=stripped, encoding={})

@property
def dims(self) -> Frozen[Hashable, int]:
"""Mapping from dimension names to lengths.
Expand Down
4 changes: 4 additions & 0 deletions xarray/core/variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -977,6 +977,10 @@ def encoding(self, value):
except ValueError:
raise ValueError("encoding must be castable to a dictionary")

def reset_encoding(self: T_Variable) -> T_Variable:
    """Return a new Variable whose encoding is an empty dict."""
    cleared: dict = {}
    return self._replace(encoding=cleared)

def copy(
self: T_Variable, deep: bool = True, data: ArrayLike | None = None
) -> T_Variable:
Expand Down
19 changes: 19 additions & 0 deletions xarray/tests/test_dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,25 @@ def test_encoding(self) -> None:
self.dv.encoding = expected2
assert expected2 is not self.dv.encoding

def test_reset_encoding(self) -> None:
    """reset_encoding returns a new array and leaves the original intact."""
    original = self.mda
    expected_encoding = {"scale_factor": 10}
    original.encoding = expected_encoding
    original["x"].encoding = expected_encoding

    # precondition: encoding is present on the array and on its coord
    assert original.encoding == expected_encoding
    assert original["x"].encoding == expected_encoding

    stripped = original.reset_encoding()

    # the source object was not modified in place
    assert original.encoding == expected_encoding
    assert original["x"].encoding == expected_encoding

    # the result has empty encoding on both the variable and the coord
    assert stripped.encoding == {}
    assert stripped["x"].encoding == {}

def test_constructor(self) -> None:
data = np.random.random((2, 3))

Expand Down
15 changes: 15 additions & 0 deletions xarray/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2827,6 +2827,21 @@ def test_copy_with_data_errors(self) -> None:
with pytest.raises(ValueError, match=r"contain all variables in original"):
orig.copy(data={"var1": new_var1})

def test_reset_encoding(self) -> None:
    """reset_encoding clears all encoding and leaves the original intact."""
    orig = create_test_data()
    vencoding = {"scale_factor": 10}
    dsencoding = {"foo": "bar"}
    orig.encoding = dsencoding

    for name in orig.variables:
        orig[name].encoding = vencoding

    # precondition: encoding is really present before resetting, otherwise
    # the emptiness checks below could pass vacuously
    assert orig.encoding == dsencoding
    for var in orig.variables.values():
        assert var.encoding == vencoding

    actual = orig.reset_encoding()

    # did not modify in place
    assert orig.encoding == dsencoding
    for var in orig.variables.values():
        assert var.encoding == vencoding

    # dataset and variable/coord encoding is empty
    assert actual.encoding == {}
    for var in actual.variables.values():
        assert var.encoding == {}

    # resetting encoding must not change the data itself
    assert_equal(actual, orig)

def test_rename(self) -> None:
data = create_test_data()
newnames = {
Expand Down
23 changes: 22 additions & 1 deletion xarray/tests/test_variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -455,6 +455,23 @@ def test_encoding_preserved(self):
assert_identical(expected.to_base_variable(), actual.to_base_variable())
assert expected.encoding == actual.encoding

def test_reset_encoding(self) -> None:
    """reset_encoding returns a new variable and leaves the original intact.

    Covers encoding supplied through the constructor and encoding assigned
    afterwards through the ``encoding`` setter.
    """
    encoding1 = {"scale_factor": 1}
    # encoding set via cls constructor
    v1 = self.cls(["a"], [0, 1, 2], encoding=encoding1)
    assert v1.encoding == encoding1
    v2 = v1.reset_encoding()
    assert v1.encoding == encoding1
    assert v2.encoding == {}

    # encoding set via setter: build the variable without encoding first so
    # that the property setter (not the constructor) is what gets exercised
    encoding3 = {"scale_factor": 10}
    v3 = self.cls(["a"], [0, 1, 2])
    v3.encoding = encoding3
    assert v3.encoding == encoding3
    v4 = v3.reset_encoding()
    assert v3.encoding == encoding3
    assert v4.encoding == {}

def test_concat(self):
x = np.arange(5)
y = np.arange(5, 10)
Expand Down Expand Up @@ -2201,9 +2218,13 @@ def test_coarsen_keep_attrs(self, operation="mean"):
assert new.attrs == _attrs


def _init_dask_variable(*args, **kwargs):
    """Build a ``Variable`` from the given arguments and return ``.chunk()`` of it.

    All positional and keyword arguments are forwarded to ``Variable``.
    """
    variable = Variable(*args, **kwargs)
    return variable.chunk()


@requires_dask
class TestVariableWithDask(VariableSubclassobjects):
cls = staticmethod(lambda *args: Variable(*args).chunk())
cls = staticmethod(_init_dask_variable)

def test_chunk(self):
unblocked = Variable(["dim_0", "dim_1"], np.ones((3, 4)))
Expand Down