From 4e939ad57b87325f09d5bef44b25d54912adc343 Mon Sep 17 00:00:00 2001 From: Romain Hugonnet Date: Fri, 24 May 2024 14:06:29 -0800 Subject: [PATCH 1/2] Add skip_nodata option to Raster.to_pointcloud --- geoutils/raster/raster.py | 44 ++++++++++++++++++++++---------- tests/test_raster/test_raster.py | 7 +++++ 2 files changed, 37 insertions(+), 14 deletions(-) diff --git a/geoutils/raster/raster.py b/geoutils/raster/raster.py index 5198a936..ab303efa 100644 --- a/geoutils/raster/raster.py +++ b/geoutils/raster/raster.py @@ -3779,6 +3779,7 @@ def to_pointcloud( auxiliary_data_bands: list[int] | None = None, auxiliary_column_names: list[str] | None = None, subsample: float | int = 1, + skip_nodata: bool = True, *, as_array: Literal[False] = False, random_state: int | np.random.Generator | None = None, @@ -3794,6 +3795,7 @@ def to_pointcloud( auxiliary_data_bands: list[int] | None = None, auxiliary_column_names: list[str] | None = None, subsample: float | int = 1, + skip_nodata: bool = True, *, as_array: Literal[True], random_state: int | np.random.Generator | None = None, @@ -3809,6 +3811,7 @@ def to_pointcloud( auxiliary_data_bands: list[int] | None = None, auxiliary_column_names: list[str] | None = None, subsample: float | int = 1, + skip_nodata: bool = True, *, as_array: bool = False, random_state: int | np.random.Generator | None = None, @@ -3823,6 +3826,7 @@ def to_pointcloud( auxiliary_data_bands: list[int] | None = None, auxiliary_column_names: list[str] | None = None, subsample: float | int = 1, + skip_nodata: bool = True, as_array: bool = False, random_state: int | np.random.Generator | None = None, force_pixel_offset: Literal["center", "ul", "ur", "ll", "lr"] = "ul", @@ -3840,11 +3844,11 @@ def to_pointcloud( Optionally, all other bands can also be stored in columns "b1", "b2", etc. For more specific band selection, use Raster.split_bands previous to converting to point cloud. - Optionally, randomly subsample valid pixels for the data band (nodata values are skipped, but only for the band - that will be used as data column of the point cloud). - If 'subsample' is either 1, or is equal to the pixel count, all valid points are returned. + Optionally, randomly subsample valid pixels for the data band (nodata values can be skipped, but only for the + band that will be used as data column of the point cloud). + If 'subsample' is either 1, or is equal to the pixel count, all (valid) points are returned. If 'subsample' is smaller than 1 (for fractions), or smaller than the pixel count, a random subsample - of valid points is returned. + of (valid) points is returned. If the raster is not loaded, sampling will be done from disk using rasterio.sample after loading only the masks of the dataset. @@ -3862,6 +3866,7 @@ def to_pointcloud( :param auxiliary_column_names: (Only for multi-band rasters) Names to use for auxiliary data bands, only if auxiliary data bands is not none, defaults to "b1", "b2", etc. :param subsample: Subsample size. If > 1, parsed as a count, otherwise a fraction. + :param skip_nodata: Whether to skip nodata values. :param as_array: Return an array instead of a vector. :param random_state: Random state or seed number. :param force_pixel_offset: Force offset to derive point coordinate with. Raster coordinates normally only @@ -3931,18 +3936,25 @@ def to_pointcloud( all_indexes = [b - 1 for b in all_bands] # We do 2D subsampling on the data band only, regardless of valid masks on other bands - if self.is_loaded: - if self.count == 1: - self_mask = get_mask_from_array(self.data) # This is to avoid the case where the mask is just "False" - else: - self_mask = get_mask_from_array( - self.data[data_band - 1, :, :] - ) # This is to avoid the case where the mask is just "False" - valid_mask = ~self_mask + if skip_nodata: + if self.is_loaded: + if self.count == 1: + self_mask = get_mask_from_array(self.data) # This is to avoid the case where the mask is just "False" + else: + self_mask = get_mask_from_array( + self.data[data_band - 1, :, :] + ) # This is to avoid the case where the mask is just "False" + valid_mask = ~self_mask - # Load only mask of valid data from disk if array not loaded + # Load only mask of valid data from disk if array not loaded + else: + valid_mask = ~self._load_only_mask(bands=data_band) + # If we are not skipping nodata values, valid mask is everwhere else: - valid_mask = ~self._load_only_mask(bands=data_band) + if self.count == 1: + valid_mask = np.ones(self.data.shape, dtype=bool) + else: + valid_mask = np.ones(self.data[0, :].shape, dtype=bool) # Get subsample on valid mask # Build a low memory boolean masked array with invalid values masked to pass to subsampling @@ -3973,6 +3985,10 @@ def to_pointcloud( if np.ma.isMaskedArray(pixel_data): pixel_data = pixel_data.data + # If nodata values were not skipped, convert them to NaNs + if skip_nodata is False: + pixel_data[pixel_data == self.nodata] = np.nan + # Now we force the coordinates we define for the point cloud, according to pixel interpretation x_coords_2, y_coords_2 = ( np.array(a) for a in self.ij2xy(indices[0], indices[1], force_offset=force_pixel_offset) diff --git a/tests/test_raster/test_raster.py b/tests/test_raster/test_raster.py index c84c40b1..7cf9e92b 100644 --- a/tests/test_raster/test_raster.py +++ b/tests/test_raster/test_raster.py @@ -3324,6 +3324,13 @@ def test_to_pointcloud(self) -> None: points = img1.to_pointcloud(data_column_name="lol", subsample=10) assert np.array_equal(points.ds.columns, ["lol", "geometry"]) + # Keeping the nodata values + points_invalid = img1.to_pointcloud(subsample=10000, random_state=42, skip_nodata=False) + + # The subsampled values should not all be valid and the right shape + assert points_invalid.ds.shape == (10000, 2) # One less column here due to geometry storing X and Y + assert any(~np.isfinite(points_invalid["b1"].values)) + # 4/ Multi-band real raster img2 = gu.Raster(self.landsat_rgb_path) From 18e3ba2f7ae3ecc86e8383c35edd84dab2a04ff9 Mon Sep 17 00:00:00 2001 From: Romain Hugonnet Date: Fri, 24 May 2024 14:07:37 -0800 Subject: [PATCH 2/2] Linting --- geoutils/raster/raster.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/geoutils/raster/raster.py b/geoutils/raster/raster.py index ab303efa..049f2b7b 100644 --- a/geoutils/raster/raster.py +++ b/geoutils/raster/raster.py @@ -3939,7 +3939,9 @@ def to_pointcloud( if skip_nodata: if self.is_loaded: if self.count == 1: - self_mask = get_mask_from_array(self.data) # This is to avoid the case where the mask is just "False" + self_mask = get_mask_from_array( + self.data + ) # This is to avoid the case where the mask is just "False" else: self_mask = get_mask_from_array( self.data[data_band - 1, :, :] @@ -3949,7 +3951,7 @@ def to_pointcloud( # Load only mask of valid data from disk if array not loaded else: valid_mask = ~self._load_only_mask(bands=data_band) - # If we are not skipping nodata values, valid mask is everwhere + # If we are not skipping nodata values, valid mask is everywhere else: if self.count == 1: valid_mask = np.ones(self.data.shape, dtype=bool)