Skip to content

Commit

Permalink
add pycnophylactic interpolation
Browse files Browse the repository at this point in the history
  • Loading branch information
knaaptime committed Jun 25, 2021
1 parent 53eec88 commit 0aecfa9
Show file tree
Hide file tree
Showing 8 changed files with 362 additions and 0 deletions.
1 change: 1 addition & 0 deletions .ci/37.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,4 @@ dependencies:
- pygeos
- h3-py
- joblib
- astropy
1 change: 1 addition & 0 deletions .ci/38.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,4 @@ dependencies:
- pygeos
- h3-py
- joblib
- astropy
1 change: 1 addition & 0 deletions .ci/39.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,4 @@ dependencies:
- numpydoc
- nbsphinx
- joblib
- astropy
18 changes: 18 additions & 0 deletions docs/_static/references.bib
Original file line number Diff line number Diff line change
Expand Up @@ -482,3 +482,21 @@ @article{mennis06_intel_dasym_mappin_its_applic
volume = {33},
year = {2006}
}
@article{tobler_smooth_1979,
abstract = {Census enumerations are usually packaged in irregularly shaped geographical regions. Interior values can be interpolated for such regions, without specification of "control points," by using an analogy to elliptical partial differential equations. A solution procedure is suggested, using finite difference methods with classical boundary conditions. In order to estimate densities, an additional nonnegativity condition is required. Smooth contour maps, which satisfy the volume preserving and nonnegativity constraints, illustrate the method using actual geographical data. It is suggested that the procedure may be used to convert observations from one bureaucratic partitioning of a geographical area to another.},
author = {Tobler, Waldo R},
doi = {10.1080/01621459.1979.10481647},
file = {:Users/knaaptime/Library/Application Support/Mendeley Desktop/Downloaded/Tobler - 1979 - Smooth Pycnophylactic Interpolation for Geographical Regions.pdf:pdf},
isbn = {978-0-511-47935-9},
issn = {0162-1459},
journal = {Journal of the American Statistical Association},
keywords = {bivariate interpolation,density estimation,dirichlet integral,interpolation,methods,numerical,population density},
mendeley-groups = {boundary-harmonization},
month = {sep},
number = {367},
pages = {519--529},
title = {{Smooth pycnophylactic interpolation for geographical regions}},
url = {http://www.tandfonline.com/doi/abs/10.1080/01621459.1979.10481647},
volume = {74},
year = {1979}
}
1 change: 1 addition & 0 deletions tobler/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@
from . import dasymetric
from . import model
from . import util
from . import pycno
1 change: 1 addition & 0 deletions tobler/pycno/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .pycno import pycno_interpolate
308 changes: 308 additions & 0 deletions tobler/pycno/pycno.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,308 @@
"""Pycnophylactic Interpolation (contributed by @danlewis85)."""
# https://github.com/danlewis85/pycno/

import geopandas as gpd
import numpy as np
import pandas as pd
import rasterio
from numpy import (
absolute,
apply_along_axis,
asarray,
convolve,
copy,
nan,
nanmax,
nanmean,
nansum,
pad,
power,
round,
unique,
)
from numpy.ma import masked_invalid, masked_where
from pandas import DataFrame
from rasterio.features import rasterize


def pycno(
    gdf, value_field, cellsize, r=0.2, handle_null=True, converge=3, verbose=True
):
    """Return a smooth pycnophylactic interpolation raster for a given geodataframe.

    Args:
        gdf (geopandas.geodataframe.GeoDataFrame): Input GeoDataFrame.
        value_field (str): Field name of values to be used to produce pycnophylactic surface
        cellsize (int): Pixel size of raster in planar units (i.e. metres, feet)
        r (float, optional): Relaxation parameter, default of 0.2 is generally fine.
        handle_null (boolean, optional): Changes how nodata values are smoothed. Default True.
        converge (int, optional): Index for stopping value, default 3 is generally fine.
        verbose (boolean, optional): Print out progress at each iteration.

    Returns:
        Numpy Array: Smooth pycnophylactic interpolation.
        Rasterio geotransform
        GeoPandas crs
    """
    # NB the docstring above was originally placed *after* the try/except, where
    # it is a no-op expression rather than a docstring; it now comes first.
    try:
        from astropy.convolution import convolve as astro_convolve
    except ImportError as err:
        # BUG FIX: the original `raise ("...")` raised a bare string, which is a
        # TypeError at runtime; raise a proper ImportError chained to the cause.
        raise ImportError(
            "Pycnophylactic interpolation requires the astropy package"
        ) from err

    # set nodata value
    nodata = -9999

    # work out raster rows and columns based on gdf extent and cellsize
    xmin, ymin, xmax, ymax = gdf.total_bounds
    xres = int((xmax - xmin) / cellsize)
    yres = int((ymax - ymin) / cellsize)

    # Work out transform so that we rasterize the area where the data are!
    trans = rasterio.Affine.from_gdal(xmin, cellsize, 0, ymax, 0, -cellsize)

    # First make a zone array
    # NB using index values as ids can often be too large/alphanumeric. Limit is int32 polygon features.
    # create a generator of geom, index pairs to use in rasterizing
    shapes = ((geom, value) for geom, value in zip(gdf.geometry, gdf.index))
    # burn the features into a raster array
    feature_array = rasterize(
        shapes=shapes, fill=nodata, out_shape=(yres, xres), transform=trans
    )

    # Get cell counts per index value (feature).
    # NB renamed from `unique`/`count` so the locals no longer shadow the
    # module-level `from numpy import unique`.
    ids, counts = np.unique(feature_array, return_counts=True)
    cellcounts = asarray((ids, counts)).T
    # Lose the nodata counts
    cellcounts = cellcounts[cellcounts[:, 0] != nodata, :]
    # Adjust value totals by cells
    # Make cell counts dataframe
    celldf = DataFrame(cellcounts[:, 1], index=cellcounts[:, 0], columns=["cellcount"])
    # Merge cell counts
    gdf = gdf.merge(celldf, how="left", left_index=True, right_index=True)
    # Calculate cell values
    gdf["cellvalues"] = gdf[value_field] / gdf["cellcount"]

    # create a generator of geom, cellvalue pairs to use in rasterizing
    shapes = ((geom, value) for geom, value in zip(gdf.geometry, gdf.cellvalues))
    # Now burn the initial value raster
    value_array = rasterize(
        shapes=shapes, fill=nodata, out_shape=(yres, xres), transform=trans
    )

    # Ensure a float array so nodata cells can be set to NaN below (rasterize
    # may return an integer array depending on the burned values).
    value_array = value_array.astype("float64")
    # Set nodata in value array to np.nan (use the nodata variable rather than
    # repeating the -9999 literal)
    value_array[value_array == nodata] = nan

    # Set stopper value based on converge parameter
    stopper = nanmax(value_array) * power(10.0, -converge)

    # The basic numpy convolve function doesn't handle nulls.
    def smooth2D(data):
        """Smooth data with a separable [0.5, 0, 0.5] kernel, masking NaN cells."""
        # Create function that calls a 1 dimensional smoother.
        s1d = lambda s: convolve(s, [0.5, 0.0, 0.5], mode="same")
        # pad the data array with the mean value
        padarray = pad(data, 1, "constant", constant_values=nanmean(data))
        # make nodata mask
        mask = masked_invalid(padarray).mask
        # set nodata as zero to avoid eroding the raster
        padarray[mask] = 0.0
        # Apply the convolution along each axis of the data and average
        padarray = (
            apply_along_axis(s1d, 1, padarray) + apply_along_axis(s1d, 0, padarray)
        ) / 2
        # Reinstate nodata
        padarray[mask] = nan
        return padarray[1:-1, 1:-1]

    # The convolution function from astropy handles nulls.
    def astroSmooth2d(data):
        """Smooth data with astropy's NaN-aware convolution along each axis."""
        s1d = lambda s: astro_convolve(s, [0.5, 0, 0.5])
        # pad the data array with the mean value
        padarray = pad(data, 1, "constant", constant_values=nanmean(data))
        # Apply the convolution along each axis of the data and average
        padarray = (
            apply_along_axis(s1d, 1, padarray) + apply_along_axis(s1d, 0, padarray)
        ) / 2
        return padarray[1:-1, 1:-1]

    def correct2Da(data):
        """Additively restore each zone's total onto the smoothed surface."""
        # NB Series.items() replaces iteritems(), removed in pandas 2.0
        for idx, val in gdf[value_field].items():
            # Create zone mask from feature_array
            mask = masked_where(feature_array == idx, feature_array).mask
            # Work out the correction factor
            correct = (val - nansum(data[mask])) / mask.sum()
            # Apply correction
            data[mask] += correct

        return data

    def correct2Dm(data):
        """Multiplicatively restore each zone's total (keeps zeros at zero)."""
        for idx, val in gdf[value_field].items():
            # Create zone mask from feature_array
            mask = masked_where(feature_array == idx, feature_array).mask
            # Work out the correction factor
            correct = val / nansum(data[mask])
            if correct != 0.0:
                # Apply correction
                data[mask] *= correct

        return data

    while True:
        # Store the current iteration
        old = copy(value_array)

        # Smooth the value_array
        if handle_null:
            sm = astroSmooth2d(value_array)
        else:
            # BUG FIX: original called `smooth2d`, which is undefined (the
            # nested function is `smooth2D`), so handle_null=False always
            # raised NameError.
            sm = smooth2D(value_array)

        # Relaxation to prevent overcompensation in the smoothing step
        value_array = value_array * r + (1.0 - r) * sm

        # Perform correction
        value_array = correct2Da(value_array)

        # Reset any negative values to zero.
        value_array[value_array < 0] = 0.0

        # Perform correction
        value_array = correct2Dm(value_array)

        if verbose:
            print(
                "Maximum Change: "
                + str(round(nanmax(absolute(old - value_array)), 4))
                + " - will stop at "
                + str(round(stopper, 4))
            )

        if nanmax(absolute(old - value_array)) < stopper:
            break

    return (value_array, trans, gdf.crs)


def save_pycno(pycno_array, transform, crs, filestring, driver="GTiff"):
    """Save a numpy array as a raster, largely a helper function for pycno.

    Args:
        pycno_array (numpy array): 2D numpy array of pycnophylactic surface
        transform (rasterio geotransform): Relevant transform from pycno()
        crs (GeoPandas crs): Coordinate reference system of GeoDataFrame used in pycno()
        filestring (str): File path to save raster
        driver (str, optional): Format for output raster, default: geoTiff.

    Returns:
        None
    """
    import rasterio

    # Use a context manager so the dataset is always closed (and the file
    # handle released) even if the write raises; the original leaked the
    # open handle on error.
    with rasterio.open(
        filestring,
        "w",
        driver=driver,
        height=pycno_array.shape[0],
        width=pycno_array.shape[1],
        count=1,
        dtype="float64",
        crs=crs,
        transform=transform,
    ) as new_dataset:
        new_dataset.write(pycno_array.astype("float64"), 1)

    return None


def extract_values(pycno_array, gdf, transform, fieldname="Estimate"):
    """Extract raster value sums according to a provided polygon geodataframe.

    Args:
        pycno_array (numpy array): 2D numpy array of pycnophylactic surface.
        gdf (geopandas.geodataframe.GeoDataFrame): Target GeoDataFrame.
        transform (rasterio geotransform): Relevant transform from pycno()
        fieldname (str, optional): Unused; retained for backward compatibility.

    Returns:
        pandas.Series: Per-polygon sums of raster values, indexed like ``gdf``.
            (NB the original docstring claimed a GeoDataFrame was returned;
            the function has always returned a Series.)
    """
    from numpy import nansum
    from rasterio.features import geometry_mask

    estimates = []
    # Iterate through geodataframe and extract values.
    # NB Series.items() replaces iteritems(), removed in pandas 2.0
    for idx, geom in gdf["geometry"].items():
        mask = geometry_mask(
            [geom], pycno_array.shape, transform=transform, invert=True
        )
        estimates.append(nansum(pycno_array[mask]))
    # BUG FIX: carry gdf's index so assigning the result back onto a frame
    # with a non-default index aligns correctly (a bare RangeIndex previously
    # produced NaNs on misaligned targets).
    out = pd.Series(estimates, index=gdf.index)
    return out


def pycno_interpolate(
    source_df,
    target_df,
    variables,
    cellsize,
    r=0.2,
    handle_null=True,
    converge=3,
    verbose=False,
):
    """Pycnophylactic Interpolation.

    Parameters
    ----------
    source_df : geopandas.GeoDataFrame (required)
        geodataframe with polygon geometries and data to transfer
    target_df : geopandas.GeoDataFrame (required)
        geodataframe with polygon geometries to receive new data
    variables : list
        columns on the source_df containing data to transfer
    cellsize : int
        Pixel size of intermediate raster in planar units (i.e. metres, feet)
    r : float, optional
        Relaxation parameter, default of 0.2 is generally fine
    handle_null : bool, optional
        Changes how nodata values are smoothed. Default True.
    converge : int, optional
        Index for stopping value, default 3 is generally fine.
    verbose : bool, optional
        Print out progress at each iteration.

    Returns
    -------
    geopandas.GeoDataFrame
        new geodataframe with interpolated variables as columns and target_df geometry
        as output geometry

    Notes
    -----
    The formula is based on Tobler, W. R. (1979). Smooth pycnophylactic interpolation for geographical regions. Journal of the American Statistical Association, 74(367), 519–529. https://doi.org/10.1080/01621459.1979.10481647
    Original implementation written by @danlewis85 at <https://github.com/danlewis85/pycno/>
    and based in part on the R pycno package by Chris Brunsdon (<https://cran.r-project.org/web/packages/pycno/index.html>)
    References: :cite:`tobler_smooth_1979`
    """
    # Both frames must share a CRS: the intermediate raster is built in the
    # source CRS, then sampled with the target geometries.
    assert source_df.crs.equals(
        target_df.crs
    ), "source_df CRS and target_df CRS are not the same. Reproject into consistent systems before proceeding"
    # Start from the target geometry only; interpolated columns are appended.
    output_vars = target_df.copy()[[target_df.geometry.name]]
    for variable in variables:
        # Build the smooth pycnophylactic surface for this variable.
        pyc, trans, _ = pycno(
            source_df,
            variable,
            cellsize=cellsize,
            r=r,
            handle_null=handle_null,
            converge=converge,
            verbose=verbose,
        )
        # Sum the surface within each target polygon.
        vals = extract_values(pyc, target_df, transform=trans)
        output_vars[variable] = vals

    return output_vars
31 changes: 31 additions & 0 deletions tobler/tests/test_pycno.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
"""test interpolation functions."""
import geopandas

from libpysal.examples import load_example
from numpy.testing import assert_almost_equal
from tobler.pycno import pycno_interpolate


def datasets():
    """Load the two Sacramento example layers, project them to a shared UTM CRS,
    and add a poverty-rate column to the source frame."""
    source_ex = load_example("Sacramento1")
    target_ex = load_example("Sacramento2")
    source = geopandas.read_file(source_ex.get_path("sacramentot2.shp"))
    target = geopandas.read_file(target_ex.get_path("SacramentoMSA2.shp"))
    # reproject both frames into the source layer's estimated UTM zone
    source = source.to_crs(source.estimate_utm_crs())
    target = target.to_crs(source.crs)
    source["pct_poverty"] = source.POV_POP / source.POV_TOT
    return source, target


def test_pycno_interpolate():
    """Interpolated total population over the target zones matches the known sum."""
    source, target = datasets()
    estimates = pycno_interpolate(
        source_df=source,
        target_df=target,
        variables=["TOT_POP"],
        cellsize=500,
    )
    assert_almost_equal(estimates.TOT_POP.sum(), 1794618.503, decimal=3)


0 comments on commit 0aecfa9

Please sign in to comment.