forked from GeoscienceAustralia/dea-coastlines
-
Notifications
You must be signed in to change notification settings - Fork 6
/
vector.py
1751 lines (1495 loc) · 63.4 KB
/
vector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python
# coding: utf-8
# This code conducts vector subpixel shoreline extraction for DE Africa
# Coastlines:
#
# * Apply morphological extraction algorithms to mask annual median
# composite rasters to a valid coastal region
# * Extract waterline vectors using subpixel waterline extraction
# (Bishop-Taylor et al. 2019b; https://doi.org/10.3390/rs11242984)
# * Compute rates of coastal change at every 30 m of coastline
# using linear regression
import glob
import os
import sys
import warnings
import click
import pyproj
import datacube
import odc.algo
import numpy as np
import pandas as pd
import xarray as xr
import geohash as gh
import geopandas as gpd
from affine import Affine
from rasterio.features import sieve
from rasterio.transform import array_bounds
from scipy.stats import circstd, circmean, linregress
from shapely.geometry import box
from shapely.ops import nearest_points
from skimage.measure import label, regionprops
from skimage.morphology import binary_closing, binary_dilation, dilation, disk
from coastlines.utils import configure_logging, load_config
from dea_tools.spatial import subpixel_contours, xr_vectorize, xr_rasterize
# Hide specific warnings
warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=DeprecationWarning)
warnings.simplefilter(action="ignore", category=RuntimeWarning)
pd.options.mode.chained_assignment = None
def load_rasters(
path,
raster_version,
study_area,
water_index="mndwi",
start_year=2000,
end_year=2020,
):
"""
Loads DEA Coastlines water index (e.g. 'MNDWI'), 'tide_m', 'count',
and 'stdev' rasters for both annual and three-year gapfill data
into a consistent `xarray.Dataset` format for further analysis.
Parameters:
-----------
path : string
A string giving the directory containing raster outputs.
raster_version : string
A string giving the unique analysis name (e.g. 'v0.3.0') used
to load raster files.
study_area : string or int
A string giving the study area grid cell used to name raster files
(e.g. tile `6931`).
water_index : string, optional
A string giving the name of the water index to load. Defaults
to 'mndwi', which will load raster files produced using the
Modified Normalised Difference Water Index.
start_year : integer, optional
The first annual layer to include in the analysis. Defaults to
2000.
end_year : integer, optional
The final annual layer to include in the analysis. Defaults to
2020.
Returns:
--------
yearly_ds : xarray.Dataset
An `xarray.Dataset` containing annual input rasters.
The dataset contains water index (e.g. 'MNDWI'), 'tide_m',
'count', and 'stdev' arrays for each year from 1988 onward.
gapfill_ds : xarray.Dataset
An `xarray.Dataset` containing three-year gapfill rasters.
The dataset contains water index (e.g. 'MNDWI'),
'tide_m', 'count', and 'stdev' arrays for each year from
`start_year` to `end_year`.
"""
# List to hold output Datasets
ds_list = []
for layer_type in [".tif", "_gapfill.tif"]:
# List to hold output DataArrays
da_list = []
for layer_name in [f"{water_index}", "ndwi", "tide_m", "count", "stdev"]:
# Get paths of files that match pattern
paths = glob.glob(
f"{path}/{raster_version}/"
f"{study_area}_{raster_version}/"
f"*_{layer_name}{layer_type}"
)
# Test if data was returned
if len(paths) == 0:
raise Exception(
f"No rasters found for grid cell {study_area} "
f"(raster version '{raster_version}'). Verify that "
f"`raster.py` has been run "
"for this grid cell."
)
# Create variable used for time axis
time_var = xr.Variable("year", [int(i.split("/")[-1][0:4]) for i in paths])
# Import data
layer_da = xr.concat([xr.open_rasterio(i) for i in paths], dim=time_var)
layer_da.name = f"{layer_name}"
# Append to file
da_list.append(layer_da)
# Combine into a single dataset and set CRS
layer_ds = xr.merge(da_list).squeeze("band", drop=True)
layer_ds = layer_ds.assign_attrs(layer_da.attrs)
layer_ds.attrs["transform"] = Affine(*layer_ds.transform)
layer_ds = layer_ds.sel(year=slice(str(start_year), str(end_year)))
# Append to list
ds_list.append(layer_ds)
return ds_list
def ocean_masking(ds, tide_points_gdf, connectivity=1, dilation=None):
"""
Identifies ocean by selecting the largest connected area of water
pixels that contain tidal modelling points. This region can be
optionally dilated to ensure that the sub-pixel algorithm has pixels
on either side of the water index threshold.
Parameters:
-----------
ds : xarray.DataArray
An array containing True for land pixels, and False for water.
This can be obtained by thresholding a water index
array (e.g. MNDWI < 0).
tide_points_gdf : geopandas.GeoDataFrame
Spatial points located within the ocean. These points are used
to ensure that all coastlines are directly connected to the
ocean.
connectivity : integer, optional
An integer passed to the 'connectivity' parameter of the
`skimage.measure.label` function.
dilation : integer, optional
The number of pixels to dilate ocean pixels to ensure than
adequate land pixels are included for subpixel waterline
extraction. Defaults to None.
Returns:
--------
ocean_mask : xarray.DataArray
An array containing the a mask consisting of identified ocean
pixels as True.
"""
# First, break boolean array into unique, discrete regions/blobs
blobs = xr.apply_ufunc(label, ds, 1, False, 1)
# Get blob ID for each tidal modelling point
x = xr.DataArray(tide_points_gdf.geometry.x, dims="z")
y = xr.DataArray(tide_points_gdf.geometry.y, dims="z")
ocean_blobs = np.unique(blobs.interp(x=x, y=y, method="nearest"))
# Return only blobs that contained tide modelling point
ocean_mask = blobs.isin(ocean_blobs[ocean_blobs != 0])
# Dilate mask so that we include land pixels on the inland side
# of each shoreline to ensure contour extraction accurately
# seperates land and water spectra
if dilation:
ocean_mask = xr.apply_ufunc(binary_dilation, ocean_mask, disk(dilation))
return ocean_mask
def coastal_masking(ds, tide_points_gdf, buffer=50, closing=None):
"""
Creates a symmetrical buffer around the land-water boundary
in a input boolean array. This is used to create a study area
mask that is focused on the coastal zone, excluding inland or
deeper ocean pixels.
Parameters:
-----------
ds : xarray.DataArray
A single time-step boolean array containing True for land
pixels, and False for water.
tide_points_gdf : geopandas.GeoDataFrame
Spatial points located within the ocean. These points are used
to ensure that all coastlines are directly connected to the
ocean.
buffer : integer, optional
The number of pixels to buffer the land-water boundary in
each direction.
Returns:
--------
coastal_mask : xarray.DataArray
An array containing True within `buffer_pixels` of the
land-water boundary, and False everywhere else.
"""
def _coastal_buffer(ds, buffer):
"""Generate coastal buffer from ocean-land boundary"""
buffer_ocean = binary_dilation(ds, buffer)
buffer_land = binary_dilation(~ds, buffer)
return buffer_ocean & buffer_land
# If closing is specified, apply morphological closing to fill
# narrow rivers, excluding them from the output study area
if closing:
ds = xr.apply_ufunc(binary_closing, ds, disk(closing))
# Identify ocean pixels that are directly connected to tide points
all_time_ocean = ocean_masking(ds, tide_points_gdf)
# Generate coastal buffer from ocean-land boundary
coastal_mask = xr.apply_ufunc(
_coastal_buffer, all_time_ocean, disk(buffer), dask="parallelized"
)
# Return coastal mask as 1, and land pixels as 2
return coastal_mask.where(coastal_mask, ~all_time_ocean * 2)
def temporal_masking(ds):
"""
Create a temporal mask by identifying land pixels with a direct
spatial connection (e.g. contiguous) to land pixels in either the
previous or subsequent timestep.
This is used to clean up noisy land pixels (e.g. caused by clouds,
white water, sensor issues), as these pixels typically occur
randomly with no relationship to the distribution of land in
neighbouring timesteps. True land, however, is likely to appear
in proximity to land before or after the specific timestep.
Parameters:
-----------
ds : xarray.DataArray
A multi-temporal array containing True for land pixels, and
False for water.
Returns:
--------
temporal_mask : xarray.DataArray
A multi-temporal array array containing True for pixels
located within the `dilation` distance of land in at least
one neighbouring timestep.
"""
def _noncontiguous(labels, intensity):
# For each blob of land, obtain whether it intersected with land in
# any neighbouring timestep
region_props = regionprops(labels.values, intensity_image=intensity.values)
contiguous = [i.label for i in region_props if i.max_intensity == 0]
# Filter array to only contiguous land
noncontiguous_array = np.isin(labels, contiguous)
# Return as xr.DataArray
return xr.DataArray(
~noncontiguous_array, coords=labels.coords, dims=labels.dims
)
# Label independent groups of pixels in each timestep in the array
labelled_ds = xr.apply_ufunc(label, ds, None, 0, dask="parallelized").rename(
"labels"
)
# Check if a pixel was neighboured by land in either the
# previous or subsequent timestep by shifting array in both directions
masked_neighbours = (
(ds.shift(year=-1, fill_value=False) | ds.shift(year=1, fill_value=False))
.astype(int)
.rename("neighbours")
)
# Merge both into an xr.Dataset
label_neighbour_ds = xr.merge([labelled_ds, masked_neighbours])
# Apply continguity test to each year to obtain pixels that are
# contiguous (spatially connected to) to land in the previous or subsequent timestep
temporal_mask = label_neighbour_ds.groupby("year").apply(
lambda x: _noncontiguous(labels=x.labels, intensity=x.neighbours)
)
return temporal_mask
def certainty_masking(yearly_ds, obs_threshold=5, stdev_threshold=0.25, sieve_size=128):
"""
Generate annual vector polygon masks containing information
about the certainty of each extracted shoreline feature.
These masks are used to assign each shoreline feature with
important certainty information to flag potential issues with
the data.
Parameters:
-----------
yearly_ds : xarray.Dataset
An `xarray.Dataset` containing annual DE Africa Coastlines
rasters.
obs_threshold : int, optional
The minimum number of post-gapfilling Landsat observations
required for an extracted shoreline to be considered good
quality. Annual shorelines based on low numbers of
observations can be noisy due to the influence of
environmental noise like unmasked cloud, sea spray, white
water etc. Defaults to 5.
stdev_threshold : float, optional
The maximum MNDWI standard deviation required for a
post-gapfilled Landsat observation to be considered good
quality. Annual shorelines based on MNDWI with a high
standard deviation represent unstable data, which can
indicate that the tidal modelling process did not adequately
remove the influence of tide. For more information,
refer to BIshop-Taylor et al. 2021
(https://doi.org/10.1016/j.rse.2021.112734).
Defaults to 0.25.
sieve_size : int, optional
To reduce the complexity of the output masks, they are
first cleaned using `rasterio.features.sieve` to replace
small areas of pixels with the values of their larger
neighbours. This parameter sets the minimum polygon size
to retain in this process. Defaults to 128.
Returns:
--------
vector_masks : dictionary of geopandas.GeoDataFrames
A dictionary with year (as an str) as the key, and vector
data as a `geopandas.GeoDataFrame` for each year in the
analysis.
"""
# Identify problematic pixels
high_stdev = yearly_ds["stdev"] > stdev_threshold
low_obs = yearly_ds["count"] < obs_threshold
# Create raster mask with values of 0 for good data, values of
# 1 for unstable data, and values of 2 for insufficient data.
# Clean this by sieving to merge small areas of pixels into
# their neighbours
raster_mask = (
high_stdev.where(~low_obs, 2)
.groupby("year")
.apply(lambda x: sieve(x.values.astype(np.int16), size=sieve_size))
)
# Apply greyscale dilation to expand masked pixels to err on
# the side of overclassifying certainty issues
raster_mask = raster_mask.groupby("year").apply(
lambda x: dilation(x.values, disk(3))
)
# Loop through each mask and vectorise
vector_masks = {}
for i, arr in raster_mask.groupby("year"):
vector_mask = xr_vectorize(
arr,
crs=yearly_ds.geobox.crs,
transform=yearly_ds.geobox.affine,
attribute_col="certainty",
)
# Dissolve column and fix geometry
vector_mask = vector_mask.dissolve("certainty")
vector_mask["geometry"] = vector_mask.geometry.buffer(0)
# Rename classes and add to dict
vector_mask = vector_mask.rename(
{0: "good", 1: "unstable data", 2: "insufficient data"}
)
vector_masks[str(i)] = vector_mask
return vector_masks
def contours_preprocess(
yearly_ds,
gapfill_ds,
water_index,
index_threshold,
tide_points_gdf,
buffer_pixels=33,
mask_landcover=True,
mask_ndwi=True,
mask_temporal=True,
mask_modifications=None,
):
"""
Prepares and preprocesses DE Africa Coastlines raster data to
restrict the analysis to coastal shorelines, and extract data
that is used to assess the certainty of extracted shorelines.
This function:
1) Identifies areas affected by either unstable composites or low data
2) Fills low data areas in annual layers with three-year gapfill
3) Computes an all-time coastal mask based on the observed timeseries
of water and land pixels, after first optionally cleaning the data
using land cover data, NDWI values and a temporal contiguity mask
4) Identifies pixels directly connected to the ocean in each annual
timestep, and uses this to remove inland waterbodies from the analyis
Parameters:
-----------
yearly_ds : xarray.Dataset
An `xarray.Dataset` containing annual DE Africa Coastlines rasters.
gapfill_ds : xarray.Dataset
An `xarray.Dataset` containing three-year gapfill DE Africa Coastlines
rasters.
water_index : string
A string giving the name of the water index included in the
annual and gapfill datasets (e.g. 'mndwi').
index_threshold : float
A float giving the water index threshold used to separate land
and water (e.g. 0.00).
tide_points_gdf : geopandas.GeoDataFrame
Spatial points located within the ocean. These points are used
by the `mask_ocean` to ensure that all coastlines are directly
connected to the ocean. These may be obtained from the tidal
modelling points used in the raster generation part of the DE
Africa CoastLines analysis, as these are guaranteed to be
located in coastal or marine waters.
buffer_pixels : int, optional
The number of pixels by which to buffer the all time shoreline
detected by this function to produce an overall coastal buffer.
The default is 33 pixels, which at 30 m Landsat resolution
produces a coastal buffer with a radius of approximately 1000 m.
mask_landcover : bool, optional
Whether to apply a mask derived from the ESA World Cover dataset
to flag deep water pixels as water before computing the all-time
coastal mask. This can help remove aerosol-based noise over open
water in USGS Collection 2 Level 2 data. Defaults to True.
mask_ndwi : bool, optional
Whether to apply an additional mask based on the Normalised
Difference Water Index (NDWI) to flag pixels with high water
values as water. This can help reduce aerosol-based noise over
open water in USGS Collection 2 Level 2 data, as NDWI is less
impacted by this than MNDWI. Defaults to True.
mask_temporal : bool, optional
Whether to apply a temporal contiguity mask by identifying land
pixels with a direct spatial connection (e.g. contiguous) to
land pixels in either the previous or subsequent timestep. This is
used to clean up noisy land pixels (e.g. caused by clouds,
white water, sensor issues), as these pixels typically occur
randomly with no relationship to the distribution of land in
neighbouring timesteps. True land, however, is likely to appear
in proximity to land before or after the specific timestep.
Defaults to True.
mask_modifications : geopandas.GeoDataFrame, optional
An optional polygon dataset including features to remove or add
to the all-time coastal mask. This should include a column/field
named 'type' that contains two possible values:
- 'add': features to add to the coastal mask (e.g. for
including areas of missing shorelines that were
previously removed by the coastal mask)
- 'remove': features to remove from the coastal mask (e.g.
areas of non-coastal rivers or estuaries,
irrigated fields or aquaculture that you wish
to exclude from the analysis)
Returns:
--------
masked_ds : xarray.Dataset
A dataset containing water index data for each annual timestep
that has been masked to the coastal zone. This can then be used
as an input to subpixel waterline extraction.
certainty_masks : dict
A dictionary containg one `geopandas.GeoDataFrame` for each year
in the time period, with polygons identifying any potentially
problematic region. This is used to assign each output shoreline
with a certainty column.
"""
# Remove low obs pixels and replace with 3-year gapfill
yearly_ds = yearly_ds.where(yearly_ds["count"] > 5, gapfill_ds)
# Set any pixels with only one observation to NaN, as these
# are extremely vulnerable to noise
yearly_ds = yearly_ds.where(yearly_ds["count"] > 1)
# Apply water index threshold and re-apply nodata values
thresholded_ds = yearly_ds[water_index] < index_threshold
nodata = yearly_ds[water_index].isnull()
thresholded_ds = thresholded_ds.where(~nodata)
# Set defaults which are overwritten if masks are requested
landcover_mask = True
ndwi_mask = True
temporal_mask = True
if mask_landcover:
# To remove aerosol-based noise over open water, apply a mask based
# on the ESA World Cover dataset, loading persistent water
# then shrinking this to ensure only deep water pixels are included
landcover = datacube.Datacube().load(
product="esa_worldcover_2020", like=yearly_ds.geobox
)
landcover_water = landcover.classification.isin([0, 80]).squeeze(dim="time")
landcover_mask = ~odc.algo.mask_cleanup(
landcover_water, mask_filters=[("erosion", buffer_pixels)]
)
# Set any pixels outside mask to 0 to represent water
thresholded_ds = thresholded_ds.where(landcover_mask, 0)
if mask_ndwi:
# To remove remaining aerosol-based noise over open water, apply an additional
# mask based on NDWI. This works because NIR is less affected by the aerosol
# issues than SWIR, and NDWI tends to be less aggressive at mapping
# water than MNDWI, which ensures that masking by NDWI will not remove useful
# along the actual coastline.
ndwi_land = yearly_ds["ndwi"] < 0
ndwi_mask = odc.algo.mask_cleanup(
ndwi_land, mask_filters=[("dilation", 2)]
) # This ensures NDWI mask does not affect pixels along the coastline
# Ensure the mask doesn't modify nodata
ndwi_mask = ndwi_mask.where(~nodata, 1)
# Set any pixels outside mask to 0 to represent water
thresholded_ds = thresholded_ds.where(ndwi_mask, 0)
if mask_temporal:
# Create a temporal mask by identifying land pixels with a direct
# spatial connection (e.g. contiguous) to land pixels in either the
# previous or subsequent timestep.
# This is used to clean up noisy land pixels (e.g. caused by clouds,
# white water, sensor issues), as these pixels typically occur
# randomly with no relationship to the distribution of land in
# neighbouring timesteps. True land, however, is likely to appear
# in proximity to land before or after the specific timestep.
# Compute temporal mask
temporal_mask = temporal_masking(thresholded_ds == 1)
# Set any pixels outside mask to 0 to represent water
thresholded_ds = thresholded_ds.where(temporal_mask, 0)
# Identify pixels that are land in at least 15% of valid observations,
# and use this to generate a coastal buffer study area. Morphological
# closing helps to "close" the entrances of estuaries and rivers, removing
# them from the analysis
all_time = thresholded_ds.mean(dim="year") >= 0.15
coastal_mask = coastal_masking(
ds=all_time, tide_points_gdf=tide_points_gdf, buffer=buffer_pixels, closing=15
)
# Optionally modify the coastal mask using manually supplied polygons to
# add missing areas of shoreline, or remove unwanted areas from the mask.
if mask_modifications is not None:
# Only proceed if there are polygons available
if len(mask_modifications.index) > 0:
# Convert type column to integer, with 1 representing pixels to add
# to the coastal mask, and 2 representing pixels to remove from the mask
mask_modifications = mask_modifications.replace({"add": 1, "remove": 2})
# Rasterise polygons into extent of satellite data
modifications_da = xr_rasterize(
mask_modifications, da=yearly_ds, attribute_col="type"
)
# Apply modifications to mask
coastal_mask = coastal_mask.where(modifications_da == 0, modifications_da)
# Because the output of `coastal_masking` contains values of 2 that
# represent pixels inland of the coastal buffer and values of 1 in
# the coastal buffer itself, seperate them for further use
inland_mask = coastal_mask == 2
coastal_mask = coastal_mask == 1
# Generate annual masks by selecting only water pixels that are
# directly connected to the ocean in each yearly timestep, and
# not within the inland mask (this prevents isolated sections of
# inland rivers/waterbodies being included in the data)
annual_mask = (
(thresholded_ds != 0) # Set both 1s and NaN to True
.where(~inland_mask, 1)
.groupby("year")
.apply(lambda x: ocean_masking(x, tide_points_gdf, 1, 3))
)
# Keep pixels within annual mask layers, all time coastal buffer,
# NDWI mask and temporal mask
masked_ds = yearly_ds[water_index].where(
annual_mask & coastal_mask & ndwi_mask & temporal_mask
)
# Generate annual vector polygon masks containing information
# about the certainty of each shoreline feature
certainty_masks = certainty_masking(yearly_ds)
return masked_ds, certainty_masks
def points_on_line(gdf, index, distance=30):
"""
Generates evenly-spaced point features along a specific line feature
in a `geopandas.GeoDataFrame`.
Parameters:
-----------
gdf : geopandas.GeoDataFrame
A `geopandas.GeoDataFrame` containing line features with an
index and CRS.
index : string or int
An value giving the index of the line to generate points along
distance : integer or float, optional
A number giving the interval at which to generate points along
the line feature. Defaults to 30, which will generate a point
at every 30 metres along the line.
Returns:
--------
points_gdf : geopandas.GeoDataFrame
A `geopandas.GeoDataFrame` containing point features at every
`distance` along the selected line.
"""
# Select individual line to generate points along
line_feature = gdf.loc[[index]].geometry
# If multiple features are returned, take unary union
if line_feature.shape[0] > 0:
line_feature = line_feature.unary_union
else:
line_feature = line_feature.iloc[0]
# Generate points along line and convert to geopandas.GeoDataFrame
points_line = [
line_feature.interpolate(i)
for i in range(0, int(line_feature.length), distance)
]
points_gdf = gpd.GeoDataFrame(geometry=points_line, crs=gdf.crs)
return points_gdf
def annual_movements(
points_gdf, contours_gdf, yearly_ds, baseline_year, water_index, max_valid_dist=1000
):
"""
For each rate of change point along the baseline annual coastline,
compute the distance to the nearest point on all neighbouring annual
coastlines and add this data as new fields in the dataset.
Distances are assigned a directionality (negative = located inland,
positive = located sea-ward) by sampling water index values from the
underlying DEA Coastlines rasters to determine if a coastline was
located in wetter or drier terrain than the baseline coastline.
Parameters:
-----------
points_gdf : geopandas.GeoDataFrame
A `geopandas.GeoDataFrame` containing rates of change points.
contours_gdf : geopandas.GeoDataFrame
A `geopandas.GeoDataFrame` containing annual coastlines.
yearly_ds : xarray.Dataset
An `xarray.Dataset` containing annual DEA CoastLines rasters.
baseline_year : string
A string giving the year used as the baseline when generating
the rates of change points dataset. This is used to load DEA
CoastLines water index rasters to calculate change
directionality.
water_index : string
A string giving the water index used in the analysis. This is
used to load DEA CoastLines water index rasters to calculate
change directionality.
max_valid_dist : int or float, optional
Any annual distance greater than this distance will be set
to `np.nan`.
Returns:
--------
points_gdf : geopandas.GeoDataFrame
A `geopandas.GeoDataFrame` containing rates of change points
with added 'dist_*' attribute columns giving the distance to
each annual coastline from the baseline. Negative values
indicate that an annual coastline was located inland of the
baseline; positive values indicate the coastline was located
towards the ocean.
"""
def _point_interp(points, array, **kwargs):
points_gs = gpd.GeoSeries(points)
x_vals = xr.DataArray(points_gs.x, dims="z")
y_vals = xr.DataArray(points_gs.y, dims="z")
return array.interp(x=x_vals, y=y_vals, **kwargs)
# Get array of water index values for baseline time period
baseline_array = yearly_ds[water_index].sel(year=int(baseline_year))
# Copy baseline point geometry to new column in points dataset
points_gdf["p_baseline"] = points_gdf.geometry
# Years to analyse
years = contours_gdf.index.unique().values
# Iterate through all comparison years in contour gdf
for comp_year in years:
# Set comparison contour
comp_contour = contours_gdf.loc[[comp_year]].geometry.iloc[0]
# Find nearest point on comparison contour, and add these to points dataset
points_gdf[f"p_{comp_year}"] = points_gdf.apply(
lambda x: nearest_points(x.p_baseline, comp_contour)[1], axis=1
)
# Compute distance between baseline and comparison year points and add
# this distance as a new field named by the current year being analysed
distances = points_gdf.apply(
lambda x: x.geometry.distance(x[f"p_{comp_year}"]), axis=1
)
# Set any value over X m to NaN, and drop any points with
# less than 50% valid observations
points_gdf[f"dist_{comp_year}"] = distances.where(distances < max_valid_dist)
# Extract comparison array containing water index values for the
# current year being analysed
comp_array = yearly_ds[water_index].sel(year=int(comp_year))
# Sample water index values for baseline and comparison points
points_gdf["index_comp_p1"] = _point_interp(
points_gdf["p_baseline"], comp_array
)
points_gdf["index_baseline_p2"] = _point_interp(
points_gdf[f"p_{comp_year}"], baseline_array
)
# Compute change directionality (positive = located towards the
# ocean; negative = located inland)
points_gdf["loss_gain"] = np.where(
points_gdf.index_baseline_p2 > points_gdf.index_comp_p1, 1, -1
)
# Ensure NaNs are correctly propagated (otherwise, X > NaN
# will return False, resulting in an incorrect land-ward direction)
is_nan = points_gdf[["index_comp_p1", "index_baseline_p2"]].isna().any(axis=1)
points_gdf["loss_gain"] = points_gdf["loss_gain"].where(~is_nan)
# Multiply distance to set change to negative, positive or NaN
points_gdf[f"dist_{comp_year}"] = (
points_gdf[f"dist_{comp_year}"] * points_gdf.loss_gain
)
# Calculate compass bearing from baseline to comparison point;
# first we need our points in lat-lon
lat_lon = points_gdf[["p_baseline", f"p_{comp_year}"]].apply(
lambda x: gpd.GeoSeries(x, crs=points_gdf.crs).to_crs("EPSG:4326")
)
geodesic = pyproj.Geod(ellps="WGS84")
bearings = geodesic.inv(
lons1=lat_lon.iloc[:, 0].values.x,
lats1=lat_lon.iloc[:, 0].values.y,
lons2=lat_lon.iloc[:, 1].values.x,
lats2=lat_lon.iloc[:, 1].values.y,
)[0]
# Add bearing as a new column after first restricting
# angles between 0 and 180 as we are only interested in
# the overall axis of our points e.g. north-south
points_gdf[f"bearings_{comp_year}"] = bearings % 180
# Calculate mean and standard deviation of angles
points_gdf["angle_mean"] = (
points_gdf.loc[:, points_gdf.columns.str.contains("bearings_")]
.apply(lambda x: circmean(x, high=180), axis=1)
.round(0)
.astype(int)
)
points_gdf["angle_std"] = (
points_gdf.loc[:, points_gdf.columns.str.contains("bearings_")]
.apply(lambda x: circstd(x, high=180), axis=1)
.round(0)
.astype(int)
)
# Keep only required columns
to_keep = points_gdf.columns.str.contains("dist|geometry|angle")
points_gdf = points_gdf.loc[:, to_keep]
points_gdf = points_gdf.assign(**{f"dist_{baseline_year}": 0.0})
points_gdf = points_gdf.round(2)
return points_gdf
def outlier_mad(points, thresh=3.5):
"""
Use robust Median Absolute Deviation (MAD) outlier detection
algorithm to detect outliers. Returns a boolean array with True if
points are outliers and False otherwise.
Parameters:
-----------
points :
An n-observations by n-dimensions array of observations
thresh :
The modified z-score to use as a threshold. Observations with a
modified z-score (based on the median absolute deviation) greater
than this value will be classified as outliers.
Returns:
--------
mask :
A n-observations-length boolean array.
References:
----------
Source: https://github.com/joferkington/oost_paper_code/blob/master/utilities.py
Boris Iglewicz and David Hoaglin (1993), "Volume 16: How to Detect and
Handle Outliers", The ASQC Basic References in Quality Control:
Statistical Techniques, Edward F. Mykytka, Ph.D., Editor.
"""
if len(points.shape) == 1:
points = points[:, None]
median = np.median(points, axis=0)
diff = np.sum((points - median) ** 2, axis=-1)
diff = np.sqrt(diff)
med_abs_deviation = np.median(diff)
modified_z_score = 0.6745 * diff / med_abs_deviation
return modified_z_score > thresh
def outlier_ransac(xy_df, **kwargs):
"""
Use the RANSAC (RANdom SAmple Consensus) algorithm to
robustly identify outliers. Returns a boolean array with True if
points are outliers and False otherwise.
Parameters:
-----------
points :
An n-observations by n-dimensions array of observations
**kwargs :
Any parameters to pass to
`sklearn.linear_model.RANSACRegressor`
Returns:
--------
mask :
A n-observations-length boolean array.
"""
from sklearn import linear_model
# X and y inputs
X = xy_df[:, 0].reshape(-1, 1)
y = xy_df[:, 1].reshape(-1, 1)
# Robustly fit linear model with RANSAC algorithm
ransac = linear_model.RANSACRegressor(**kwargs)
ransac.fit(X, y)
inlier_mask = ransac.inlier_mask_
outlier_mask = np.logical_not(inlier_mask)
return outlier_mask
def change_regress(
y_vals,
x_vals,
x_labels,
threshold=3.5,
detrend_params=None,
slope_var="slope",
interc_var="intercept",
pvalue_var="pvalue",
stderr_var="stderr",
outliers_var="outliers",
):
"""
For a given row in a `pandas.DataFrame`, apply linear regression to
data values (as y-values) and a corresponding sequence of x-values,
and return 'slope', 'intercept', 'pvalue', and 'stderr' regression
parameters.
Before computing the regression, outliers are identified using a
robust Median Absolute Deviation (MAD) outlier detection algorithm,
and excluded from the regression. A list of these outliers will be
recorded in the output 'outliers' variable.
Parameters:
-----------
x_vals, y_vals : list of numeric values, or nd.array
A sequence of values to use as the x and y variables
x_labels : list
A sequence of strings corresponding to each value in `x_vals`.
This is used to label any observations that are flagged as
outliers (often, this can simply be set to the same list
provided to `x_vals`).
threshold : float, optional
The modified z-score to use as a threshold for detecting
outliers using the MAD algorithm. Observations with a modified
z-score (based on the median absolute deviation) greater
than this value will be classified as outliers.
detrend_params : optional
Not currently used
slope, interc_var, pvalue_var, stderr_var : strings, optional
Strings giving the names to use for each of the output
regression variables.
outliers_var : string, optional
String giving the name to use for the output outlier variable.
Returns:
--------
mask :
A `pandas.Series` containing regression parameters and lists
of outliers.
"""
# Drop invalid NaN rows
xy_df = np.vstack([x_vals, y_vals]).T
valid_bool = ~np.isnan(xy_df).any(axis=1)
xy_df = xy_df[valid_bool]
valid_labels = x_labels[valid_bool]
# If detrending parameters are provided, apply these to the data to
# remove the trend prior to running the regression
if detrend_params:
xy_df[:, 1] = xy_df[:, 1] - (
detrend_params[0] * xy_df[:, 0] + detrend_params[1]
)
# Remove outliers using MAD
outlier_bool = outlier_mad(xy_df, thresh=threshold)
# outlier_bool = outlier_ransac(xy_df)
xy_df = xy_df[~outlier_bool]
valid_labels = valid_labels[~outlier_bool]
# Create string of all outliers and invalid NaN rows
outlier_set = set(x_labels) - set(valid_labels)
outlier_str = " ".join(map(str, sorted(outlier_set)))
# Compute linear regression
lin_reg = linregress(x=xy_df[:, 0], y=xy_df[:, 1])
# Return slope, p-values and list of outlier years excluded from regression
results_dict = {
slope_var: np.round(lin_reg.slope, 3),
interc_var: np.round(lin_reg.intercept, 3),
pvalue_var: np.round(lin_reg.pvalue, 3),
stderr_var: np.round(lin_reg.stderr, 3),
outliers_var: outlier_str,
}
return pd.Series(results_dict)
def calculate_regressions(points_gdf, contours_gdf):
"""
For each rate of change point along the baseline annual coastline,
compute linear regression rates of change against both time and
climate indices.
Regressions are computed after removing outliers to ensure robust
results.
Parameters:
-----------
points_gdf : geopandas.GeoDataFrame
A `geopandas.GeoDataFrame` containing rates of change points
with 'dist_*' annual movement/distance data.
contours_gdf : geopandas.GeoDataFrame
A `geopandas.GeoDataFrame` containing annual coastlines. This
is used to ensure that all years in the annual coastlines data
are included in the regression.
Returns:
--------
points_gdf : geopandas.GeoDataFrame
A `geopandas.GeoDataFrame` containing rates of change points
with additional attribute columns:
'rate_*': Slope of the regression
'sig_*': Significance of the regression
'se_*': Standard error of the regression