From d4815c0b31dd7b3c9726f71074ad409229625563 Mon Sep 17 00:00:00 2001 From: Romain Hugonnet Date: Mon, 31 Jul 2023 16:11:16 -0800 Subject: [PATCH 1/4] Fix nearest neighbour and linear interpolant of interp_nd_binning --- tests/test_spatialstats.py | 60 ++++++++++++++++++++------- xdem/spatialstats.py | 83 +++++++++++++++++++++++++++----------- 2 files changed, 106 insertions(+), 37 deletions(-) diff --git a/tests/test_spatialstats.py b/tests/test_spatialstats.py index 547d3410..e3bbe3e2 100644 --- a/tests/test_spatialstats.py +++ b/tests/test_spatialstats.py @@ -158,8 +158,7 @@ def test_interp_nd_binning_artificial_data(self) -> None: ) arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9]).reshape((3, 3)) fun = xdem.spatialstats.interp_nd_binning( - df, list_var_names=["var1", "var2"], statistic="statistic", min_count=None - ) + df, list_var_names=["var1", "var2"], statistic="statistic", min_count=None) # Check that the dimensions are rightly ordered assert fun((1, 3)) == df[np.logical_and(df["var1"] == 1, df["var2"] == 3)]["statistic"].values[0] @@ -198,22 +197,55 @@ def test_interp_nd_binning_artificial_data(self) -> None: x = point[0] - 1 y = point[1] - 1 val_extra = fun((y + 1, x + 1)) - # The difference between the points extrapolated outside should be linear with the grid edges, - # i.e. the same as the difference as the first points inside the grid along the same axis + # (OUTDATED: The difference between the points extrapolated outside should be linear with the grid edges, + # # i.e. the same as the difference as the first points inside the grid along the same axis) + # if point[0] == 0: + # diff_in = arr[x + 2, y] - arr[x + 1, y] + # diff_out = arr[x + 1, y] - val_extra + # elif point[0] == 4: + # diff_in = arr[x - 2, y] - arr[x - 1, y] + # diff_out = arr[x - 1, y] - val_extra + # elif point[1] == 0: + # diff_in = arr[x, y + 2] - arr[x, y + 1] + # diff_out = arr[x, y + 1] - val_extra + # # has to be y == 4 + # else: + # diff_in = arr[x, y - 2] - arr[x, y - 1] + # diff_out = arr[x, y - 1] - val_extra + # assert diff_in == diff_out + + # Update with nearest default: the value should be that of the nearest! if point[0] == 0: - diff_in = arr[x + 2, y] - arr[x + 1, y] - diff_out = arr[x + 1, y] - val_extra + near = arr[x + 1, y] elif point[0] == 4: - diff_in = arr[x - 2, y] - arr[x - 1, y] - diff_out = arr[x - 1, y] - val_extra + near = arr[x - 1, y] elif point[1] == 0: - diff_in = arr[x, y + 2] - arr[x, y + 1] - diff_out = arr[x, y + 1] - val_extra - # has to be y == 4 + near = arr[x, y+1] + else: + near = arr[x, y-1] + assert near == val_extra + + # Check that the output extrapolates as "nearest neighbour" far outside the grid + points_far_out = ( + [(-10, i) for i in np.arange(1, 4)] + + [(i, -10) for i in np.arange(1, 4)] + + [(14, i) for i in np.arange(1, 4)] + + [(i, 14) for i in np.arange(4, 1)] + ) + for point in points_far_out: + x = point[0] - 1 + y = point[1] - 1 + val_extra = fun((y + 1, x + 1)) + # Update with nearest default: the value should be that of the nearest! + if point[0] == -10: + near = arr[0, y] + elif point[0] == 14: + near = arr[-1, y] + elif point[1] == -10: + near = arr[x, 0] else: - diff_in = arr[x, y - 2] - arr[x, y - 1] - diff_out = arr[x, y - 1] - val_extra - assert diff_in == diff_out + near = arr[x, -1] + assert near == val_extra # Check that the output is rightly ordered in 3 dimensions, and works with varying dimension lengths vec1 = np.arange(1, 3) diff --git a/xdem/spatialstats.py b/xdem/spatialstats.py index d9db0a98..3e7337c3 100644 --- a/xdem/spatialstats.py +++ b/xdem/spatialstats.py @@ -211,26 +211,30 @@ def interp_nd_binning( df: pd.DataFrame, list_var_names: str | list[str], statistic: str | Callable[[NDArrayf], np.floating[Any]] = nmad, + interpolate_method: Literal["nearest"] | Literal["linear"] = "linear", min_count: int | None = 100, ) -> Callable[[tuple[ArrayLike, ...]], NDArrayf]: """ Estimate an interpolant function for an N-dimensional binning. Preferably based on the output of nd_binning. For more details on the input dataframe, and associated list of variable name and statistic, see nd_binning. - If the variable pd.DataSeries corresponds to an interval (as the output of nd_binning), uses the middle of the - interval. - Otherwise, uses the variable as such. + First, interpolates nodata values of the irregular N-D binning grid with scipy.griddata. + Then, extrapolates nodata values on the N-D binning grid with scipy.griddata with "nearest neighbour" + (necessary to get a regular grid without NaNs). + Finally, creates an interpolant function (linear by default) to interpolate between points of the grid using + scipy.RegularGridInterpolator. Extrapolation is fixed to nearest neighbour by duplicating edge bins along each + dimension (linear extrapolation of two equal bins = nearest neighbour). - Workflow of the function: - Fills the no-data present on the regular N-D binning grid with nearest neighbour from scipy.griddata, then provides - an interpolant function that linearly interpolates/extrapolates using scipy.RegularGridInterpolator. + If the variable pd.DataSeries corresponds to an interval (as the output of nd_binning), uses the middle of the + interval. Otherwise, uses the variable as such. - :param df: Dataframe with statistic of binned values according to explanatory variables - :param list_var_names: Explanatory variable data series to select from the dataframe - :param statistic: Statistic to interpolate, stored as a data series in the dataframe - :param min_count: Minimum number of samples to be used as a valid statistic (replaced by nodata) + :param df: Dataframe with statistic of binned values according to explanatory variables. + :param list_var_names: Explanatory variable data series to select from the dataframe. + :param statistic: Statistic to interpolate, stored as a data series in the dataframe. + :param interpolate_method: Method to interpolate inside of edge bins, "nearest", "linear" (default). + :param min_count: Minimum number of samples to be used as a valid statistic (replaced by nodata). - :return: N-dimensional interpolant function + :return: N-dimensional interpolant function. :examples # Using a dataframe created from scratch @@ -329,22 +333,53 @@ def interp_nd_binning( list_bmid.append(bmid) shape.append(len(bmid)) - # Use griddata first to perform nearest interpolation with NaNs (irregular grid) + # The workflow below is a bit complicated because of the irregular grid and handling of nodata values! + # Steps 1/ and 2/ fill the nodata values in the irregular grid, and step 3/ creates the interpolant object to + # get a value at any point inside or outside the bin edges + + # 1/ Use griddata first to perform interpolation for nodata values within the N-D convex hull of the irregular grid + # Valid values values = values[ind_valid] - # coordinates of valid values + # Coordinates of valid values points_valid = tuple(df_sub[var].values[ind_valid] for var in list_var_names) - # Grid coordinates + # Coordinates of all grid points (convex hull points will be detected automatically and interpolated) bmid_grid = np.meshgrid(*list_bmid, indexing="ij") points_grid = tuple(bmid_grid[i].flatten() for i in range(len(list_var_names))) - # Fill grid no data with nearest neighbour - values_grid = griddata(points_valid, values, points_grid, method="nearest") - values_grid = values_grid.reshape(shape) - - # RegularGridInterpolator to perform linear interpolation/extrapolation on the grid - # (will extrapolate only outside of boundaries not filled with the nearest of griddata as fill_value = None) + # Interpolate on grid within convex hull with interpolation method + values_grid = griddata(points_valid, values, points_grid, method=interpolate_method) + + # 2/ Use griddata to extrapolate nodata values with nearest neighbour on the N-D grid and remove all NaNs + + # Valid values after above interpolation in convex hull + ind_valid_interp = np.isfinite(values_grid) + values_interp = values_grid[ind_valid_interp] + # Coordinate of valid values + points_valid_interp = tuple(points_grid[i][ind_valid_interp] for i in range(len(points_grid))) + # Extend grid by a value of "1" the point coordinates in all directions to ensure + # that 3/ will extrapolate linearly as for "nearest" + list_bmid_extended = [] + for i in range(len(list_bmid)): + bmid_bin = list_bmid[i] + # Add bin before first edge and decrease coord value + bmid_bin_extlow = np.insert(bmid_bin, 0, bmid_bin[0] - 1) + # Add bin after last edge and increase coord value + bmid_bin_extboth = np.append(bmid_bin_extlow, bmid_bin[-1] + 1) + list_bmid_extended.append(bmid_bin_extboth) + bmid_grid_extended = np.meshgrid(*list_bmid_extended, indexing="ij") + points_grid_extended = tuple(bmid_grid_extended[i].flatten() for i in range(len(list_var_names))) + # Update shape + shape_extended = tuple(x + 2 for x in shape) + + # Extrapolate on extended grid with nearest neighbour + values_grid_nearest = griddata(points_valid_interp, values_interp, points_grid_extended, method="nearest") + values_grid_nearest = values_grid_nearest.reshape(shape_extended) + + # 3/ Use RegularGridInterpolator to perform interpolation **between points** of the grid, with extrapolation forced + # to nearest neighbour by duplicating edge points + # (does not support NaNs, hence the need for 2/ above) interp_fun = RegularGridInterpolator( - tuple(list_bmid), values_grid, method="linear", bounds_error=False, fill_value=None + tuple(list_bmid_extended), values_grid_nearest, method="linear", bounds_error=False, fill_value=None ) return interp_fun # type: ignore @@ -358,9 +393,11 @@ def get_perbin_nd_binning( min_count: int | None = 0, ) -> NDArrayf: """ - Get per-bin statistic for a list of array variables based on the results of an N-D binning . + Get per-bin array statistic for a list of array input variables, based on the results of an independent N-D binning. - For example, get the median statistic for every bin of 2D arrays of input variables (systematic correction). + For example, get a 2D array of elevation uncertainty based on 2D arrays of slope and curvature and a related binning + (for uncertainty analysis) or get a 2D array of elevation bias based on 2D arrays of rotated X coordinates (for + an along-track bias correction). :param list_var: List of size (L) of explanatory variables array of size (N,). :param list_var_names: List of size (L) of names of the explanatory variables. From 3ed17e450cc57cdd47022c5566cda2d57a7d683c Mon Sep 17 00:00:00 2001 From: Romain Hugonnet Date: Mon, 31 Jul 2023 16:15:23 -0800 Subject: [PATCH 2/4] Linting --- tests/test_spatialstats.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tests/test_spatialstats.py b/tests/test_spatialstats.py index e3bbe3e2..90b80f3e 100644 --- a/tests/test_spatialstats.py +++ b/tests/test_spatialstats.py @@ -158,7 +158,8 @@ def test_interp_nd_binning_artificial_data(self) -> None: ) arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9]).reshape((3, 3)) fun = xdem.spatialstats.interp_nd_binning( - df, list_var_names=["var1", "var2"], statistic="statistic", min_count=None) + df, list_var_names=["var1", "var2"], statistic="statistic", min_count=None + ) # Check that the dimensions are rightly ordered assert fun((1, 3)) == df[np.logical_and(df["var1"] == 1, df["var2"] == 3)]["statistic"].values[0] @@ -220,17 +221,17 @@ def test_interp_nd_binning_artificial_data(self) -> None: elif point[0] == 4: near = arr[x - 1, y] elif point[1] == 0: - near = arr[x, y+1] + near = arr[x, y + 1] else: - near = arr[x, y-1] + near = arr[x, y - 1] assert near == val_extra # Check that the output extrapolates as "nearest neighbour" far outside the grid points_far_out = ( - [(-10, i) for i in np.arange(1, 4)] - + [(i, -10) for i in np.arange(1, 4)] - + [(14, i) for i in np.arange(1, 4)] - + [(i, 14) for i in np.arange(4, 1)] + [(-10, i) for i in np.arange(1, 4)] + + [(i, -10) for i in np.arange(1, 4)] + + [(14, i) for i in np.arange(1, 4)] + + [(i, 14) for i in np.arange(4, 1)] ) for point in points_far_out: x = point[0] - 1 From 3a897c5f29546beca5cb2c7798ff9c258c9efdff Mon Sep 17 00:00:00 2001 From: Romain Hugonnet Date: Mon, 31 Jul 2023 18:28:26 -0800 Subject: [PATCH 3/4] Increase samples for 2d bin in biascorr tests --- tests/test_biascorr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_biascorr.py b/tests/test_biascorr.py index cc4e449c..e324a2ae 100644 --- a/tests/test_biascorr.py +++ b/tests/test_biascorr.py @@ -216,7 +216,7 @@ def test_biascorr__bin_2d(self, bin_sizes, bin_statistic) -> None: elev_fit_params.update({"bias_vars": bias_vars_dict}) # Run with input parameter, and using only 100 subsamples for speed - bcorr.fit(**elev_fit_params, subsample=1000, random_state=42) + bcorr.fit(**elev_fit_params, subsample=10000, random_state=42) # Apply the correction bcorr.apply(dem=self.tba, bias_vars=bias_vars_dict) From 103a33fe61ca61325da30f79f9c2829c29b35aac Mon Sep 17 00:00:00 2001 From: Romain Hugonnet Date: Tue, 1 Aug 2023 12:02:18 -0800 Subject: [PATCH 4/4] Update doctest --- xdem/spatialstats.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xdem/spatialstats.py b/xdem/spatialstats.py index 3e7337c3..22d53c88 100644 --- a/xdem/spatialstats.py +++ b/xdem/spatialstats.py @@ -257,9 +257,9 @@ def interp_nd_binning( >>> fun((1.5, 1.5)) array(3.) - # Extrapolated linearly outside the 2D frame. + # Extrapolated linearly outside the 2D frame: nearest neighbour. >>> fun((-1, 1)) - array(-1.) + array(1.) """ # If list of variable input is simply a string if isinstance(list_var_names, str):