
Commit

Merge branch 'main' into explore
knaaptime authored Nov 1, 2023
2 parents da90661 + db27d9f commit 5007ee0
Showing 21 changed files with 713 additions and 690 deletions.
10 changes: 5 additions & 5 deletions libpysal/graph/_contiguity.py
@@ -1,12 +1,12 @@
from collections import defaultdict

import geopandas
import numpy
import shapely
import pandas
import geopandas
import shapely
from packaging.version import Version

from ._utils import _neighbor_dict_to_edges, _validate_geometry_input, _resolve_islands
from ._utils import _neighbor_dict_to_edges, _resolve_islands, _validate_geometry_input

GPD_013 = Version(geopandas.__version__) >= Version("0.13")

@@ -43,9 +43,9 @@ def _vertex_set_intersection(geoms, rook=True, ids=None, by_perimeter=False):
)

# initialise the target map
graph = dict()
graph = {}
for idx in ids:
graph[idx] = set([idx])
graph[idx] = {idx}

# get all of the vertices for the input
vertices, offsets = shapely.get_coordinates(geoms.geometry, return_index=True)
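For readers following the diff, the function touched above builds contiguity by intersecting vertex sets. A minimal sketch of that idea, assuming plain shapely polygons (or a GeoSeries) as input — illustrative names only, not the library's implementation:

# Hedged sketch of vertex-set-intersection contiguity; not libpysal's code.
import shapely

def queen_neighbors_sketch(geoms):
    # map each vertex to the set of geometries that touch it
    vertices, offsets = shapely.get_coordinates(geoms, return_index=True)
    vertex_to_geoms = {}
    for (x, y), geom_idx in zip(vertices, offsets):
        vertex_to_geoms.setdefault((x, y), set()).add(geom_idx)
    # seed each geometry with a self-link, mirroring graph[idx] = {idx} above
    graph = {i: {i} for i in range(len(geoms))}
    for sharing in vertex_to_geoms.values():
        for i in sharing:
            graph[i] |= sharing
    return graph

Rook contiguity would instead key on shared edges (pairs of consecutive vertices), which is what the rook flag in the full function controls.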
109 changes: 57 additions & 52 deletions libpysal/graph/_kernel.py
@@ -1,17 +1,16 @@
import numpy
from scipy import sparse, optimize, spatial, stats
from scipy import optimize, sparse, spatial, stats

from ._utils import (
_validate_geometry_input,
_build_coincidence_lookup,
_induce_cliques,
_jitter_geoms,
_sparse_to_arrays,
_resolve_islands,
_sparse_to_arrays,
_validate_geometry_input,
)

try:
from sklearn import neighbors, metrics
from sklearn import metrics, neighbors

HAS_SKLEARN = True
except ImportError:
@@ -50,7 +49,7 @@ def _boxcar(distances, bandwidth):
return r


def _identity(distances, bandwidth):
def _identity(distances, _):
return distances
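
The functions in this hunk are kernels: they map raw distances to weights given a bandwidth (_identity simply returns the distances, so its bandwidth argument is unused — hence the rename to _). For reference, a minimal sketch of a triangular kernel of the kind registered in _kernel_functions (assumed form, not copied from this diff):

# Hedged sketch of one kernel shape; weights fall linearly to 0 at the bandwidth.
import numpy

def triangular_sketch(distances, bandwidth):
    u = numpy.clip(distances / bandwidth, 0, 1)
    return 1 - u

triangular_sketch(numpy.array([0.0, 0.5, 1.0, 2.0]), bandwidth=1.0)
# -> array([1. , 0.5, 0. , 0. ])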


@@ -134,24 +133,25 @@ def _kernel(
if ids is None:
ids = numpy.arange(coordinates.shape[0])

if metric == "haversine":
if not (
if (
metric == "haversine"
and not (
(coordinates[:, 0] > -180)
& (coordinates[:, 0] < 180)
& (coordinates[:, 1] > -90)
& (coordinates[:, 1] < 90)
).all():
raise ValueError(
"'haversine' metric is limited to the range of "
"latitude coordinates (-90, 90) and the range of "
"longitude coordinates (-180, 180)."
)
).all()
):
raise ValueError(
"'haversine' metric is limited to the range of latitude coordinates "
"(-90, 90) and the range of longitude coordinates (-180, 180)."
)

if k is not None:
if metric != "precomputed":
D = _knn(coordinates, k=k, metric=metric, p=p, coincident=coincident)
d = _knn(coordinates, k=k, metric=metric, p=p, coincident=coincident)
else:
D = coordinates * (coordinates.argsort(axis=1, kind="stable") < (k + 1))
d = coordinates * (coordinates.argsort(axis=1, kind="stable") < (k + 1))
else:
if metric != "precomputed":
dist_kwds = {}
@@ -167,8 +167,8 @@
f"metric {metric} is not supported by scipy, and scikit-learn "
"could not be imported."
)
D = spatial.distance.pdist(coordinates, metric=metric, **dist_kwds)
sq = spatial.distance.squareform(D)
d = spatial.distance.pdist(coordinates, metric=metric, **dist_kwds)
sq = spatial.distance.squareform(d)

# ensure that self-distance is dropped but 0 distance between co-located points is kept
# get data and ids for sparse constructor
@@ -180,31 +180,31 @@
i = numpy.delete(i, numpy.arange(0, i.size, sq.shape[0] + 1))
j = numpy.delete(j, numpy.arange(0, j.size, sq.shape[0] + 1))
# construct sparse
D = sparse.csc_array((data, (i, j)))
d = sparse.csc_array((data, (i, j)))
else:
D = sparse.csc_array(coordinates)
d = sparse.csc_array(coordinates)
if bandwidth is None:
bandwidth = numpy.percentile(D.data, 25)
bandwidth = numpy.percentile(d.data, 25)
elif bandwidth == "auto":
if (kernel == "identity") or (kernel is None):
bandwidth = numpy.nan # ignored by identity
else:
bandwidth = _optimize_bandwidth(D, kernel)
bandwidth = _optimize_bandwidth(d, kernel)
if callable(kernel):
D.data = kernel(D.data, bandwidth)
d.data = kernel(d.data, bandwidth)
else:
D.data = _kernel_functions[kernel](D.data, bandwidth)
d.data = _kernel_functions[kernel](d.data, bandwidth)

if taper:
D.eliminate_zeros()
d.eliminate_zeros()

heads, tails, weights = _sparse_to_arrays(D, ids=ids)
heads, tails, weights = _sparse_to_arrays(d, ids=ids)

return _resolve_islands(heads, tails, ids, weights)


def _knn(coordinates, metric="euclidean", k=1, p=2, coincident="raise"):
"""internal function called only from within _kernel, never directly to build KNN"""
"""internal function called only within _kernel, never directly to build KNN"""
coordinates, ids, geoms = _validate_geometry_input(
coordinates, ids=None, valid_geometry_types=_VALID_GEOMETRY_TYPES
)
@@ -217,29 +217,29 @@ def _knn(coordinates, metric="euclidean", k=1, p=2, coincident="raise"):
# sklearn haversine works with (lat,lng) in radians...
coordinates = numpy.fliplr(numpy.deg2rad(coordinates))
query = _prepare_tree_query(coordinates, metric, p=p)
D_linear, ixs = query(coordinates, k=k + 1)
d_linear, ixs = query(coordinates, k=k + 1)
self_ix, neighbor_ix = ixs[:, 0], ixs[:, 1:]
D_linear = D_linear[:, 1:]
d_linear = d_linear[:, 1:]
self_ix_flat = numpy.repeat(self_ix, k)
neighbor_ix_flat = neighbor_ix.flatten()
D_linear_flat = D_linear.flatten()
d_linear_flat = d_linear.flatten()
if metric == "haversine":
D_linear_flat * 6371 # express haversine distances in kilometers
D = sparse.csr_array(
(D_linear_flat, (self_ix_flat, neighbor_ix_flat)),
d_linear_flat *= 6371  # express haversine distances in kilometers
d = sparse.csr_array(
(d_linear_flat, (self_ix_flat, neighbor_ix_flat)),
shape=(n_samples, n_samples),
)
return D
return d

else:
if coincident == "raise":
raise ValueError(
f"There are {len(coincident_lut)} "
f"unique locations in the dataset, but {len(coordinates)} observations. "
f"At least one of these sites has {max_at_one_site} points, more than the "
f"{k} nearest neighbors requested. This means there are more than {k} points "
"in the same location, which makes this graph type undefined. To address "
"this issue, consider setting `coincident='clique' or consult the "
f"There are {len(coincident_lut)} unique locations in the dataset, "
f"but {len(coordinates)} observations. At least one of these sites "
f"has {max_at_one_site} points, more than the {k} nearest neighbors "
f"requested. This means there are more than {k} points in the same "
"location, which makes this graph type undefined. To address "
"this issue, consider setting `coincident='clique'` or consult the "
"documentation about coincident points."
)
if coincident == "jitter":
@@ -258,7 +258,13 @@ def _knn(coordinates, metric="euclidean", k=1, p=2, coincident="raise"):
)
# # implicit coincident == "clique"
# heads, tails, weights = _sparse_to_arrays(
# _knn(coincident_lut.geometry, metric=metric, k=k, p=p, coincident="raise")
# _knn(
# coincident_lut.geometry,
# metric=metric,
# k=k,
# p=p,
# coincident="raise"
# )
# )
# adjtable = pandas.DataFrame.from_dict(
# dict(focal=heads, neighbor=tails, weight=weights)
@@ -300,9 +306,7 @@ def _prepare_tree_query(coordinates, metric, p=2):
return tree(coordinates, metric=metric, **dist_kwds).query
else:
if metric in ("euclidean", "manhattan", "cityblock", "minkowski"):
from scipy.spatial import KDTree as tree

tree_ = tree(coordinates)
tree_ = spatial.KDTree(coordinates)
p = {"euclidean": 2, "manhattan": 1, "cityblock": 1, "minkowski": p}[metric]

def query(target, k):
@@ -316,7 +320,7 @@ def query(target, k):
)


def _optimize_bandwidth(D, kernel):
def _optimize_bandwidth(d, kernel):
"""
Optimize the bandwidth as a function of entropy for a given kernel function.
@@ -326,16 +330,17 @@ def _optimize_bandwidth(D, kernel):
"moderate" level of smoothing.
"""
kernel_function = _kernel_functions.get(kernel, kernel)
assert callable(
kernel_function
), f"kernel {kernel} was not in supported kernel types {_kernel_functions.keys()} or callable"
assert callable(kernel_function), (
f"kernel {kernel} was not in supported kernel types "
f"{_kernel_functions.keys()} or callable"
)

def _loss(bandwidth, D=D, kernel_function=kernel_function):
Ku = kernel_function(D.data, bandwidth)
bins, _ = numpy.histogram(Ku, bins=int(D.shape[0] ** 0.5), range=(0, 1))
def _loss(bandwidth, d=d, kernel_function=kernel_function):
k_u = kernel_function(d.data, bandwidth)
bins, _ = numpy.histogram(k_u, bins=int(d.shape[0] ** 0.5), range=(0, 1))
return -stats.entropy(bins / bins.sum())

xopt = optimize.minimize_scalar(
_loss, bounds=(0, D.data.max() * 2), method="bounded"
_loss, bounds=(0, d.data.max() * 2), method="bounded"
)
return xopt.x
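
To make the bandwidth search above concrete: _loss histograms the kernel weights and returns negative entropy, so the bounded minimizer finds the bandwidth whose weights are most evenly spread. A self-contained sketch under assumed inputs (a random dense distance vector and a Gaussian-style kernel; both are stand-ins for d.data and _kernel_functions[kernel]):

# Hedged sketch of entropy-based bandwidth selection; inputs are stand-ins.
import numpy
from scipy import optimize, stats

def gaussian_sketch(distances, bandwidth):
    u = distances / bandwidth
    return numpy.exp(-(u**2) / 2) / (bandwidth * numpy.sqrt(2 * numpy.pi))

rng = numpy.random.default_rng(0)
distances = rng.uniform(0.1, 10, size=100)  # stand-in for d.data

def loss(bandwidth):
    k_u = gaussian_sketch(distances, bandwidth)
    bins, _ = numpy.histogram(k_u, bins=10, range=(0, 1))
    return -stats.entropy(bins / bins.sum())  # negative entropy, as in _loss

xopt = optimize.minimize_scalar(
    loss, bounds=(0, distances.max() * 2), method="bounded"
)
# xopt.x is the bandwidth giving the most even spread of kernel weights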
17 changes: 9 additions & 8 deletions libpysal/graph/_parquet.py
@@ -1,8 +1,9 @@
import libpysal
import json

import libpysal


def _to_parquet(G, destination, **kwargs):
def _to_parquet(graph_obj, destination, **kwargs):
"""Save adjacency as a Parquet table and add custom metadata
Metadata contain transformation and the libpysal version used to save the file.
@@ -11,7 +12,7 @@ def _to_parquet(G, destination, **kwargs):
Parameters
----------
G : Graph
graph_obj : Graph
Graph to be saved
destination : str | pyarrow.NativeFile
path or any stream supported by pyarrow
@@ -22,11 +23,11 @@
import pyarrow as pa
import pyarrow.parquet as pq
except (ImportError, ModuleNotFoundError):
raise ImportError("pyarrow is required for `to_parquet`.")
table = pa.Table.from_pandas(G._adjacency.to_frame())
raise ImportError("pyarrow is required for `to_parquet`.") from None
table = pa.Table.from_pandas(graph_obj._adjacency.to_frame())

meta = table.schema.metadata
d = {"transformation": G.transformation, "version": libpysal.__version__}
d = {"transformation": graph_obj.transformation, "version": libpysal.__version__}
meta[b"libpysal"] = json.dumps(d).encode("utf-8")
schema = table.schema.with_metadata(meta)

@@ -51,10 +52,10 @@ def _read_parquet(source, **kwargs):
try:
import pyarrow.parquet as pq
except (ImportError, ModuleNotFoundError):
raise ImportError("pyarrow is required for `read_parquet`.")
raise ImportError("pyarrow is required for `read_parquet`.") from None

table = pq.read_table(source, **kwargs)
if b"libpysal" in table.schema.metadata.keys():
if b"libpysal" in table.schema.metadata:
meta = json.loads(table.schema.metadata[b"libpysal"])
transformation = meta["transformation"]
else:
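For orientation, a hedged round-trip sketch of how these private helpers are reached through the Graph API (the public method names mirror the helpers above; treat the exact surface, and the input file, as assumptions):

# Assumed usage sketch; requires pyarrow and a hypothetical polygons.gpkg.
import geopandas
from libpysal import graph

gdf = geopandas.read_file("polygons.gpkg")
g = graph.Graph.build_contiguity(gdf, rook=True)
g.to_parquet("adjacency.parquet")  # writes transformation + libpysal version metadata
g2 = graph.read_parquet("adjacency.parquet")  # restores the stored transformation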
19 changes: 10 additions & 9 deletions libpysal/graph/_plotting.py
@@ -5,7 +5,7 @@


def _plot(
G,
graph_obj,
gdf,
focal=None,
nodes=True,
@@ -24,7 +24,7 @@
Parameters
----------
G : Graph
graph_obj : Graph
Graph to be plotted
gdf : geopandas.GeoDataFrame
Geometries indexed using the same index as Graph. Geometry types other than
@@ -81,27 +81,28 @@
if "color" not in node_kws:
node_kws["color"] = color
else:
node_kws = dict(color=color)
node_kws = {"color": color}

if edge_kws is not None:
if "color" not in edge_kws:
edge_kws["color"] = color
else:
edge_kws = dict(color=color)
edge_kws = {"color": color}

# get array of coordinates in the order reflecting G._adjacency.index.codes
# get array of coordinates in the order reflecting graph_obj._adjacency.index.codes
# we need to work on int position to allow fast filtering of duplicated edges and
# cannot rely on gdf remaining in the same order between Graph creation and plotting
coords = shapely.get_coordinates(gdf.reindex(G.unique_ids).centroid)
# cannot rely on gdf remaining in the same order between Graph creation and
# plotting
coords = shapely.get_coordinates(gdf.reindex(graph_obj.unique_ids).centroid)

if focal is not None:
if not pd.api.types.is_list_like(focal):
focal = [focal]
subset = G._adjacency[focal]
subset = graph_obj._adjacency[focal]
codes = subset.index.codes

else:
codes = G._adjacency.index.codes
codes = graph_obj._adjacency.index.codes

# avoid plotting both ij and ji
edges = np.unique(np.sort(np.column_stack([codes]).T, axis=1), axis=0)
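Finally, an assumed example of the plotting surface that wraps _plot, highlighting one focal observation (the method name, defaults, and input file are assumptions based on this diff):

# Assumed usage sketch; keyword names follow the signature shown above.
import geopandas
from libpysal import graph

gdf = geopandas.read_file("polygons.gpkg")  # hypothetical input
g = graph.Graph.build_contiguity(gdf)
ax = g.plot(
    gdf,
    focal=gdf.index[0],          # plot only this unit's neighborhood
    node_kws={"color": "red"},
    edge_kws={"color": "gray"},
)
ax.set_axis_off()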
