Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add skip_nodata argument to Raster.to_pointcloud #546

Merged
merged 2 commits into from
May 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 32 additions & 14 deletions geoutils/raster/raster.py
Original file line number Diff line number Diff line change
Expand Up @@ -3779,6 +3779,7 @@ def to_pointcloud(
auxiliary_data_bands: list[int] | None = None,
auxiliary_column_names: list[str] | None = None,
subsample: float | int = 1,
skip_nodata: bool = True,
*,
as_array: Literal[False] = False,
random_state: int | np.random.Generator | None = None,
Expand All @@ -3794,6 +3795,7 @@ def to_pointcloud(
auxiliary_data_bands: list[int] | None = None,
auxiliary_column_names: list[str] | None = None,
subsample: float | int = 1,
skip_nodata: bool = True,
*,
as_array: Literal[True],
random_state: int | np.random.Generator | None = None,
Expand All @@ -3809,6 +3811,7 @@ def to_pointcloud(
auxiliary_data_bands: list[int] | None = None,
auxiliary_column_names: list[str] | None = None,
subsample: float | int = 1,
skip_nodata: bool = True,
*,
as_array: bool = False,
random_state: int | np.random.Generator | None = None,
Expand All @@ -3823,6 +3826,7 @@ def to_pointcloud(
auxiliary_data_bands: list[int] | None = None,
auxiliary_column_names: list[str] | None = None,
subsample: float | int = 1,
skip_nodata: bool = True,
as_array: bool = False,
random_state: int | np.random.Generator | None = None,
force_pixel_offset: Literal["center", "ul", "ur", "ll", "lr"] = "ul",
Expand All @@ -3840,11 +3844,11 @@ def to_pointcloud(
Optionally, all other bands can also be stored in columns "b1", "b2", etc. For more specific band selection,
use Raster.split_bands previous to converting to point cloud.

Optionally, randomly subsample valid pixels for the data band (nodata values are skipped, but only for the band
that will be used as data column of the point cloud).
If 'subsample' is either 1, or is equal to the pixel count, all valid points are returned.
Optionally, randomly subsample valid pixels for the data band (nodata values can be skipped, but only for the
band that will be used as data column of the point cloud).
If 'subsample' is either 1, or is equal to the pixel count, all (valid) points are returned.
If 'subsample' is smaller than 1 (for fractions), or smaller than the pixel count, a random subsample
of valid points is returned.
of (valid) points is returned.

If the raster is not loaded, sampling will be done from disk using rasterio.sample after loading only the masks
of the dataset.
Expand All @@ -3862,6 +3866,7 @@ def to_pointcloud(
:param auxiliary_column_names: (Only for multi-band rasters) Names to use for auxiliary data bands, only if
auxiliary data bands is not none, defaults to "b1", "b2", etc.
:param subsample: Subsample size. If > 1, parsed as a count, otherwise a fraction.
:param skip_nodata: Whether to skip nodata values.
:param as_array: Return an array instead of a vector.
:param random_state: Random state or seed number.
:param force_pixel_offset: Force offset to derive point coordinate with. Raster coordinates normally only
Expand Down Expand Up @@ -3931,18 +3936,27 @@ def to_pointcloud(
all_indexes = [b - 1 for b in all_bands]

# We do 2D subsampling on the data band only, regardless of valid masks on other bands
if self.is_loaded:
if self.count == 1:
self_mask = get_mask_from_array(self.data) # This is to avoid the case where the mask is just "False"
else:
self_mask = get_mask_from_array(
self.data[data_band - 1, :, :]
) # This is to avoid the case where the mask is just "False"
valid_mask = ~self_mask
if skip_nodata:
if self.is_loaded:
if self.count == 1:
self_mask = get_mask_from_array(
self.data
) # This is to avoid the case where the mask is just "False"
else:
self_mask = get_mask_from_array(
self.data[data_band - 1, :, :]
) # This is to avoid the case where the mask is just "False"
valid_mask = ~self_mask

# Load only mask of valid data from disk if array not loaded
# Load only mask of valid data from disk if array not loaded
else:
valid_mask = ~self._load_only_mask(bands=data_band)
# If we are not skipping nodata values, valid mask is everywhere
else:
valid_mask = ~self._load_only_mask(bands=data_band)
if self.count == 1:
valid_mask = np.ones(self.data.shape, dtype=bool)
else:
valid_mask = np.ones(self.data[0, :].shape, dtype=bool)

# Get subsample on valid mask
# Build a low memory boolean masked array with invalid values masked to pass to subsampling
Expand Down Expand Up @@ -3973,6 +3987,10 @@ def to_pointcloud(
if np.ma.isMaskedArray(pixel_data):
pixel_data = pixel_data.data

# If nodata values were not skipped, convert them to NaNs
if skip_nodata is False:
pixel_data[pixel_data == self.nodata] = np.nan

# Now we force the coordinates we define for the point cloud, according to pixel interpretation
x_coords_2, y_coords_2 = (
np.array(a) for a in self.ij2xy(indices[0], indices[1], force_offset=force_pixel_offset)
Expand Down
7 changes: 7 additions & 0 deletions tests/test_raster/test_raster.py
Original file line number Diff line number Diff line change
Expand Up @@ -3324,6 +3324,13 @@ def test_to_pointcloud(self) -> None:
points = img1.to_pointcloud(data_column_name="lol", subsample=10)
assert np.array_equal(points.ds.columns, ["lol", "geometry"])

# Keeping the nodata values
points_invalid = img1.to_pointcloud(subsample=10000, random_state=42, skip_nodata=False)

# The subsampled values should not all be valid and the right shape
assert points_invalid.ds.shape == (10000, 2) # One less column here due to geometry storing X and Y
assert any(~np.isfinite(points_invalid["b1"].values))

# 4/ Multi-band real raster
img2 = gu.Raster(self.landsat_rgb_path)

Expand Down
Loading