Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reimplement .polyfit() with apply_ufunc #5933

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 14 additions & 6 deletions xarray/core/dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -3847,7 +3847,7 @@ def polyfit(
self,
dim: Hashable,
deg: int,
skipna: bool = None,
skipna: bool = True,
rcond: float = None,
w: Union[Hashable, Any] = None,
full: bool = False,
Expand All @@ -3867,11 +3867,10 @@ def polyfit(
Degree of the fitting polynomial.
skipna : bool, optional
If True, removes all invalid values before fitting each 1D slice of the array.
Default is True if data is stored in a dask.array or if there is any
invalid values, False otherwise.
Default is True.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You could do

-        skipna : bool, optional
+        skipna : bool, default: True

instead.

rcond : float, optional
Relative condition number to the fit.
w : hashable or array-like, optional
w : hashable or Any, optional
Weights to apply to the y-coordinate of the sample points.
Can be an array-like object or the name of a coordinate in the dataset.
full : bool, optional
Expand All @@ -3893,11 +3892,20 @@ def polyfit(
When the matrix rank is deficient, np.nan is returned.
[dim]_matrix_rank
The effective rank of the scaled Vandermonde coefficient matrix (only included if `full=True`)
[dim]_singular_value
The rank is computed ignoring the NaN values that might be skipped.
[dim]_singular_values
The singular values of the scaled Vandermonde coefficient matrix (only included if `full=True`)
polyfit_covariance
[dim]_rcond
The specified value of rcond (only included if `full=True`)
[var]_polyfit_covariance
The covariance matrix of the polynomial coefficient estimates (only included if `full=False` and `cov=True`)

Warns
-----
RankWarning
The rank of the coefficient matrix in the least-squares fit is deficient.
The warning is not raised with `full=True`.

See Also
--------
numpy.polyfit
Expand Down
189 changes: 83 additions & 106 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -6808,7 +6808,7 @@ def polyfit(
self,
dim: Hashable,
deg: int,
skipna: bool = None,
skipna: bool = True,
rcond: float = None,
w: Union[Hashable, Any] = None,
full: bool = False,
Expand All @@ -6828,8 +6828,7 @@ def polyfit(
Degree of the fitting polynomial.
skipna : bool, optional
If True, removes all invalid values before fitting each 1D slice of the array.
Default is True if data is stored in a dask.array or if there is any
invalid values, False otherwise.
Default is True.
rcond : float, optional
Relative condition number to the fit.
w : hashable or Any, optional
Expand Down Expand Up @@ -6857,146 +6856,124 @@ def polyfit(
The rank is computed ignoring the NaN values that might be skipped.
[dim]_singular_values
The singular values of the scaled Vandermonde coefficient matrix (only included if `full=True`)
[dim]_rcond
The specified value of rcond (only included if `full=True`)
[var]_polyfit_covariance
The covariance matrix of the polynomial coefficient estimates (only included if `full=False` and `cov=True`)

Warns
-----
RankWarning
The rank of the coefficient matrix in the least-squares fit is deficient.
The warning is not raised with in-memory (not dask) data and `full=True`.
The warning is not raised with `full=True`.

See Also
--------
numpy.polyfit
numpy.polyval
xarray.polyval
"""
variables = {}
skipna_da = skipna

x = get_clean_interp_index(self, dim, strict=False)
xname = "{}_".format(self[dim].name)
order = int(deg) + 1
lhs = np.vander(x, order)

if rcond is None:
rcond = (
x.shape[0] * np.core.finfo(x.dtype).eps # type: ignore[attr-defined]
)

# Weights:
if w is not None:
if isinstance(w, Hashable):
w = self.coords[w]
w = np.asarray(w)
if w.ndim != 1:
raise TypeError("Expected a 1-d array for weights.")
if w.shape[0] != lhs.shape[0]:
raise TypeError("Expected w and {} to have the same length".format(dim))
lhs *= w[:, np.newaxis]
if w.shape[0] != self[dim].shape[0]:
raise TypeError(f"Expected w and {dim} to have the same length")

# Scaling
scale = np.sqrt((lhs * lhs).sum(axis=0))
lhs /= scale

degree_dim = utils.get_temp_dimname(self.dims, "degree")
order = int(deg) + 1
degrees = np.arange(deg, -1, -1)

rank = np.linalg.matrix_rank(lhs)
x = get_clean_interp_index(self, dim, strict=False)

# sizes and names for varying polyfit outputs
output_sizes = {"degree": order}
if full:
rank = xr.DataArray(rank, name=xname + "matrix_rank")
variables[rank.name] = rank
sing = np.linalg.svd(lhs, compute_uv=False)
sing = xr.DataArray(
sing,
dims=(degree_dim,),
coords={degree_dim: np.arange(rank - 1, -1, -1)},
name=xname + "singular_values",
)
variables[sing.name] = sing
output_core_dims = [("degree",), (), (), ("degree",), ()]
output_vars = [
"{name}polyfit_coefficients",
"{name}polyfit_residuals",
f"{dim}_matrix_rank",
f"{dim}_singular_values",
f"{dim}_rcond",
]
elif cov and not full:
output_core_dims = [("degree",), ("cov_i", "cov_j")]
output_vars = [
"{name}polyfit_coefficients",
"{name}polyfit_covariance",
]
output_sizes["cov_i"] = order
output_sizes["cov_j"] = order
else:
output_core_dims = [("degree",)]
output_vars = ["{name}polyfit_coefficients"]

def _wrapper(x, y):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It might not be worth it, but you could avoid a if conditional in the inner loop as follows:

def _wrapper_skipna(x, y):
    ...

def _wrapper_noskipna(x, y):
    ...

if skipna:
    _wrapper = _wrapper_skipna
else:
    _wrapper = _wrapper_noskipna

# Wrap np.polyfit with pointwise NaN handling
if skipna:
mask = np.all([~np.isnan(x), ~np.isnan(y)], axis=0)
x = x[mask]
y = y[mask]
if not len(y):
return tuple(
np.full(len(var) * [order], np.nan) for var in output_core_dims
)
output = np.polyfit(x, y, deg, rcond=rcond, full=full, w=w, cov=cov)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

numpy recommends to use Polynomial.fit <numpy.polynomial.polynomial.Polynomial.fit> class - did you consider switching to this (no requirement just a question).

if full and output[2] < order:
# fit is poorly conditioned, need to pad higher polynomial values with zeros
output = (
output[0],
np.nan,
output[2],
np.append((order - output[2]) * [np.nan], output[3]),
output[4],
)
return output

result = xr.Dataset()
for name, da in self.data_vars.items():
if dim not in da.dims:
continue

if is_duck_dask_array(da.data) and (
rank != order or full or skipna is None
):
# Current algorithm with dask and skipna=False neither supports
# deficient ranks nor does it output the "full" info (issue dask/dask#6516)
skipna_da = True
elif skipna is None:
skipna_da = bool(np.any(da.isnull()))

dims_to_stack = [dimname for dimname in da.dims if dimname != dim]
stacked_coords: Dict[Hashable, DataArray] = {}
if dims_to_stack:
stacked_dim = utils.get_temp_dimname(dims_to_stack, "stacked")
rhs = da.transpose(dim, *dims_to_stack).stack(
{stacked_dim: dims_to_stack}
)
stacked_coords = {stacked_dim: rhs[stacked_dim]}
scale_da = scale[:, np.newaxis]
if name is xr.core.dataarray._THIS_ARRAY:
name = ""
else:
rhs = da
scale_da = scale

if w is not None:
rhs *= w[:, np.newaxis]

name = f"{str(name)}_"
with warnings.catch_warnings():
if full: # Copy np.polyfit behavior
warnings.simplefilter("ignore", np.RankWarning)
else: # Raise only once per variable
warnings.simplefilter("once", np.RankWarning)

coeffs, residuals = duck_array_ops.least_squares(
lhs, rhs.data, rcond=rcond, skipna=skipna_da
output = xr.apply_ufunc(
_wrapper,
x,
da,
vectorize=True,
dask="parallelized",
input_core_dims=[[dim], [dim]],
output_core_dims=output_core_dims,
dask_gufunc_kwargs={"output_sizes": output_sizes},
output_dtypes=len(output_core_dims) * (np.float64,),
)

if isinstance(name, str):
name = "{}_".format(name)
# handle naming of varying polyfit outputs
# including case where output is only coefficients and not a tuple
if len(output_vars) == 1:
result[output_vars[0].format(name=name)] = output
else:
# Thus a ReprObject => polyfit was called on a DataArray
name = ""

coeffs = xr.DataArray(
coeffs / scale_da,
dims=[degree_dim] + list(stacked_coords.keys()),
coords={degree_dim: np.arange(order)[::-1], **stacked_coords},
name=name + "polyfit_coefficients",
)
if dims_to_stack:
coeffs = coeffs.unstack(stacked_dim)
variables[coeffs.name] = coeffs

if full or (cov is True):
residuals = xr.DataArray(
residuals if dims_to_stack else residuals.squeeze(),
dims=list(stacked_coords.keys()),
coords=stacked_coords,
name=name + "polyfit_residuals",
)
if dims_to_stack:
residuals = residuals.unstack(stacked_dim)
variables[residuals.name] = residuals

if cov:
Vbase = np.linalg.inv(np.dot(lhs.T, lhs))
Vbase /= np.outer(scale, scale)
if cov == "unscaled":
fac = 1
else:
if x.shape[0] <= order:
raise ValueError(
"The number of data points must exceed order to scale the covariance matrix."
)
fac = residuals / (x.shape[0] - order)
covariance = xr.DataArray(Vbase, dims=("cov_i", "cov_j")) * fac
variables[name + "polyfit_covariance"] = covariance
for i, var in enumerate(output_vars):
result[var.format(name=name)] = output[i]

result = result.assign_coords({"degree": degrees})
if cov:
result = result.assign_coords({"cov_i": degrees, "cov_j": degrees})
# We need this to maintain ordering of previous polyfit implementation...?
result = result.transpose("degree", "cov_i", "cov_j", ...)
else:
result = result.transpose("degree", ...)
result.attrs = self.attrs.copy()

return Dataset(data_vars=variables, attrs=self.attrs.copy())
return result

def pad(
self,
Expand Down
2 changes: 1 addition & 1 deletion xarray/tests/test_dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -3742,7 +3742,7 @@ def test_polyfit(self, use_dask, use_datetime):
# Skipna + Full output
out = da.polyfit("x", 2, skipna=True, full=True)
assert_allclose(out.polyfit_coefficients, expected, rtol=1e-3)
assert out.x_matrix_rank == 3
np.testing.assert_almost_equal(out.x_matrix_rank, [3, 3])
np.testing.assert_almost_equal(out.polyfit_residuals, [0, 0])

with warnings.catch_warnings():
Expand Down