Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] add spatial correlogram function #259

Open
wants to merge 19 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions esda/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from importlib.metadata import PackageNotFoundError, version

from . import adbscan, shape # noqa F401
from .correlogram import correlogram # noqa F401
from .gamma import Gamma # noqa F401
from .geary import Geary # noqa F401
from .geary_local import Geary_Local # noqa F401
Expand Down
144 changes: 144 additions & 0 deletions esda/correlogram.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
# Spatial Correlograms

import geopandas as gpd
import pandas as pd
from joblib import Parallel, delayed
from libpysal.cg.kdtree import KDTree
from libpysal.weights import KNN, DistanceBand
from libpysal.weights.util import get_points_array

from .moran import Moran


def _get_stat(inputs):
"""helper function for computing parallel statistics at multiple Graph specifications

Parameters
----------
inputs : tuple
tuple of (y, tree, W, statistic, STATISTIC, dist, weights_kwargs, stat_kwargs)

Returns
-------
pandas.Series
a pandas series with the computed autocorrelation statistic and its simulated p-value
"""
(

Check warning on line 26 in esda/correlogram.py

View check run for this annotation

Codecov / codecov/patch

esda/correlogram.py#L26

Added line #L26 was not covered by tests
y, # y variable
tree, # kdreee,
W, # weights class
STATISTIC, # class of statistic (Moran, Geary, etc)
dist, # threshold/k parameter for the weights
weights_kwargs, # additional args
stat_kwargs, # additional args
) = inputs

w = W(tree, dist, silence_warnings=True, **weights_kwargs)
autocorr = STATISTIC(y, w, **stat_kwargs)
attrs = []
all_attrs = list(dict(vars(autocorr)).keys())

Check warning on line 39 in esda/correlogram.py

View check run for this annotation

Codecov / codecov/patch

esda/correlogram.py#L36-L39

Added lines #L36 - L39 were not covered by tests
for attribute in all_attrs:
attrs.append(getattr(autocorr, str(attribute)))
return pd.Series(attrs, index=all_attrs, name=dist)

Check warning on line 42 in esda/correlogram.py

View check run for this annotation

Codecov / codecov/patch

esda/correlogram.py#L41-L42

Added lines #L41 - L42 were not covered by tests


def correlogram(
gdf: gpd.GeoDataFrame,
variable: str,
distances: list,
statistic: callable = Moran,
distance_type: str = "band",
weights_kwargs: dict = None,
stat_kwargs: dict = None,
select_numeric: bool = False,
n_jobs: int = -1,
):
"""Generate a spatial correlogram

A spatial profile is a set of spatial autocorrelation statistics calculated for
a set of increasing distances. It is a useful exploratory tool for examining
how the relationship between spatial units changes over different notions of scale.

Parameters
----------
gdf : gpd.GeoDataFrame
geodataframe holding spatial and attribute data
variable: str
column on the geodataframe used to compute autocorrelation statistic
distances : list
list of distances to compute the autocorrelation statistic
statistic : callable
statistic to be computed for a range of libpysal.Graph specifications.
This should be a class with a signature like `Statistic(y,w, **kwargs)`
where y is a numpy array and w is a libpysal.Graph.
Generally, this is a class from pysal's `esda` package
defaults to esda.Moran, which computes the Moran's I statistic
distance_type : str, optional
which concept of distance to increment. Options are {`band`, `knn`}.
by default 'band' (for `libpysal.weights.DistanceBand` weights)
weights_kwargs : dict
additional keyword arguments passed to the libpysal.weights.W class
stat_kwargs : dict
additional keyword arguments passed to the `esda` autocorrelation statistic class.
For example for faster results with no statistical inference, set the number
of permutations to zero with {permutations: 0}
select_numeric : bool
if True, only return numeric attributes from the original class. This is useful
e.g. to prevent lists inside a "cell" of a dataframe
n_jobs : int
number of jobs to pass to joblib. If -1 (default), all cores will be used


Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change

Returns
-------
outputs : pandas.DataFrame
table of autocorrelation statistics at increasing distance bandwidths
"""
if stat_kwargs is None:
stat_kwargs = dict()
if weights_kwargs is None:
weights_kwargs = dict()

if distance_type == "band":
W = DistanceBand
elif distance_type == "knn":
if max(distances) > gdf.shape[0] - 1:
with ValueError as e:
raise e("max number of neighbors must be less than or equal to n-1")

Check warning on line 107 in esda/correlogram.py

View check run for this annotation

Codecov / codecov/patch

esda/correlogram.py#L106-L107

Added lines #L106 - L107 were not covered by tests
W = KNN
else:
with NotImplementedError as e:
raise e("distance_type must be either `band` or `knn` ")

Check warning on line 111 in esda/correlogram.py

View check run for this annotation

Codecov / codecov/patch

esda/correlogram.py#L110-L111

Added lines #L110 - L111 were not covered by tests
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
raise e("distance_type must be either `band` or `knn` ")
raise e("distance_type must be either `band` or `knn`")


# there's a faster way to do this by building the largest tree first, then subsetting...
pts = get_points_array(gdf[gdf.geometry.name])
tree = KDTree(pts)
y = gdf[variable].values

inputs = [
(
y,
tree,
W,
statistic,
dist,
weights_kwargs,
stat_kwargs,
)
for dist in distances
]

outputs = Parallel(n_jobs=n_jobs, backend="loky")(
delayed(_get_stat)(i) for i in inputs
)

df = pd.DataFrame(outputs)
if select_numeric:
df = df.select_dtypes(["number"])

Check warning on line 137 in esda/correlogram.py

View check run for this annotation

Codecov / codecov/patch

esda/correlogram.py#L137

Added line #L137 was not covered by tests
return df


## Note: To be implemented:

## non-parametric version used in geoda https://geodacenter.github.io/workbook/5a_global_auto/lab5a.html#spatial-correlogram
## as given in https://link.springer.com/article/10.1023/A:1009601932481'
36 changes: 36 additions & 0 deletions esda/tests/test_correlogram.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from libpysal import examples
from esda import correlogram
import numpy as np
from numpy.testing import assert_array_almost_equal

import geopandas as gpd


sac = gpd.read_file(examples.load_example("Sacramento1").get_path("sacramentot2.shp"))
sac = sac.to_crs(sac.estimate_utm_crs()) # now in meters)

distances = [i + 500 for i in range(0, 2000, 500)]
kdists = list(range(1, 6))


def test_distance_correlogram():

corr = correlogram(sac, "HH_INC", distances)

test_data = np.array(
[
0.05822723177762817,
0.49206877942505206,
0.45494217612839183,
0.5625914469490942,
]
)

assert_array_almost_equal(corr.I, test_data)


def test_k_distance_correlogram():
corr = correlogram(sac, "HH_INC", kdists, distance_type="knn")

test_data = np.array([0.62411734, 0.59734846, 0.56958116, 0.54252517, 0.54093269])
assert_array_almost_equal(corr.I, test_data)
Loading
Loading