Skip to content

Commit

Permalink
fix #205
Browse files Browse the repository at this point in the history
  • Loading branch information
smnorris committed Dec 11, 2024
2 parents 8380f5d + 6aaed4f commit 8e374a9
Show file tree
Hide file tree
Showing 6 changed files with 113 additions and 30 deletions.
1 change: 1 addition & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ Changes
------------------
- support Data Catalogue API changes (#188)
- bump dependencies
- by default, lightly clean/standardize features returned from bcdata.get_data() and `bcdata dump` (skip with clean=False / --no-clean)

0.12.3 (2024-11-12)
------------------
Expand Down
14 changes: 6 additions & 8 deletions src/bcdata/bc2pg.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def bc2pg( # noqa: C901
geometry_type=None,
query=None,
bounds=None,
bounds_crs=None,
bounds_crs="EPSG:3005",
count=None,
sortby=None,
primary_key=None,
Expand Down Expand Up @@ -101,7 +101,7 @@ def bc2pg( # noqa: C901

# if geometry type is not provided, determine type by making the first request
if not geometry_type:
df = WFS.make_requests([urls[0]], as_gdf=True, crs="epsg:3005", lowercase=True)
df = WFS.make_requests(dataset=dataset, urls=[urls[0]], as_gdf=True, crs="epsg:3005", lowercase=True)
geometry_type = df.geom_type.unique()[0] # keep only the first type
if numpy.any(df.has_z.unique()[0]): # geopandas does not include Z in geom_type string
geometry_type = geometry_type + "Z"
Expand All @@ -111,7 +111,8 @@ def bc2pg( # noqa: C901
if not geometry_type:
if not urls[-1] == urls[0]:
df_temp = WFS.make_requests(
[urls[-1]],
dataset=dataset,
urls=[urls[-1]],
as_gdf=True,
crs="epsg:3005",
lowercase=True,
Expand Down Expand Up @@ -164,7 +165,7 @@ def bc2pg( # noqa: C901
for n, url in enumerate(urls):
# if first url not downloaded above when checking geom type, do now
if df is None:
df = WFS.make_requests([url], as_gdf=True, crs="epsg:3005", lowercase=True)
df = WFS.make_requests(dataset=dataset, urls=[url], as_gdf=True, crs="epsg:3005", lowercase=True)
# tidy the resulting dataframe
df = df.rename_geometry("geom")
# lowercasify
Expand All @@ -177,10 +178,7 @@ def bc2pg( # noqa: C901
df_nulls = df_nulls.drop(columns=["geom"])
# remove rows with null geometry from geodataframe
df = df[df["geom"].notna()]
# cast to everything multipart because responses can have mixed types
# geopandas does not have a built in function:
# https://gis.stackexchange.com/questions/311320/casting-geometry-to-multi-using-geopandas
# (but only cast if geometry_type is not specified to be singlepart)
# promote to multipart
if promote_to_multi:
df["geom"] = [
MultiPoint([feature]) if isinstance(feature, Point) else feature
Expand Down
2 changes: 1 addition & 1 deletion src/bcdata/bcdc.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

log = logging.getLogger(__name__)

BCDC_API_URL = "https://toyger.data.gov.bc.ca/api/3/action/"
BCDC_API_URL = "https://catalogue.data.gov.bc.ca/api/3/action/"


class ServiceException(Exception):
Expand Down
22 changes: 19 additions & 3 deletions src/bcdata/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,12 @@

import click
from cligj import compact_opt, indent_opt, quiet_opt, verbose_opt
from shapely.geometry.linestring import LineString
from shapely.geometry.multilinestring import MultiLineString
from shapely.geometry.multipoint import MultiPoint
from shapely.geometry.multipolygon import MultiPolygon
from shapely.geometry.point import Point
from shapely.geometry.polygon import Polygon

import bcdata
from bcdata.database import Database
Expand All @@ -21,7 +27,6 @@ def configure_logging(verbosity):
def complete_dataset_names(ctx, param, incomplete):
    """Return the table names from bcdata.list_tables() that begin with `incomplete`.

    `ctx` and `param` are accepted but not used here
    (presumably required by the click completion callback signature - confirm).
    """
    matches = []
    for table_name in bcdata.list_tables():
        if table_name.startswith(incomplete):
            matches.append(table_name)
    return matches


# bounds handling direct from rasterio
# https://github.com/mapbox/rasterio/blob/master/rasterio/rio/options.py
# https://github.com/mapbox/rasterio/blob/master/rasterio/rio/clip.py
Expand Down Expand Up @@ -204,10 +209,17 @@ def dem(
help="CRS of provided bounds",
default="EPSG:3005",
)
@click.option(
"--no-clean",
"-nc",
help="Do not do any data standardization",
is_flag=True,
default=True,
)
@lowercase_opt
@verbose_opt
@quiet_opt
def dump(dataset, query, out_file, bounds, bounds_crs, lowercase, verbose, quiet):
def dump(dataset, query, out_file, bounds, bounds_crs, no_clean, lowercase, verbose, quiet):
"""Write DataBC features to stdout as GeoJSON feature collection.
\b
Expand All @@ -223,8 +235,12 @@ def dump(dataset, query, out_file, bounds, bounds_crs, lowercase, verbose, quiet
verbosity = verbose - quiet
configure_logging(verbosity)
table = bcdata.validate_name(dataset)
if no_clean:
clean = False
else:
clean = True
data = bcdata.get_data(
table, query=query, bounds=bounds, bounds_crs=bounds_crs, lowercase=lowercase
table, query=query, bounds=bounds, bounds_crs=bounds_crs, lowercase=lowercase, clean=clean
)
if out_file:
with open(out_file, "w") as sink:
Expand Down
82 changes: 65 additions & 17 deletions src/bcdata/wfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,12 @@
from owslib.feature import schema as wfs_schema
from owslib.feature import wfs200
from owslib.wfs import WebFeatureService
from shapely.geometry.linestring import LineString
from shapely.geometry.multilinestring import MultiLineString
from shapely.geometry.multipoint import MultiPoint
from shapely.geometry.multipolygon import MultiPolygon
from shapely.geometry.point import Point
from shapely.geometry.polygon import Polygon

import bcdata

Expand All @@ -24,6 +30,26 @@
log = logging.getLogger(__name__)


def ensure_single_geometry_type(df):
    """Promote singlepart geometries to multipart when single and multi are mixed.

    If the geometry column of `df` holds both a singlepart type and its
    multipart counterpart (e.g. Polygon and MultiPolygon), every Point,
    LineString and Polygon is wrapped in the matching Multi* type. When no
    such mix is present the dataframe is returned untouched.

    The mix is detected on the set of geometry type names, so the result does
    not depend on how many distinct types are present or on the order in which
    they appear in the data.

    Parameters
    ----------
    df : GeoDataFrame
        Dataframe whose active geometry column is inspected; it is modified
        in place when promotion is required.

    Returns
    -------
    GeoDataFrame
        The same `df` object that was passed in.
    """
    # features with null geometry report a null geom_type - ignore those
    present = {t.upper() for t in df.geometry.geom_type.dropna().unique()}
    if not any("MULTI" + t in present for t in present):
        return df

    multipart = (
        (Point, MultiPoint),
        (LineString, MultiLineString),
        (Polygon, MultiPolygon),
    )

    def promote(feature):
        # null geometries and already-multipart geometries pass through as-is
        for single, multi in multipart:
            if isinstance(feature, single):
                return multi([feature])
        return feature

    # single pass over the column rather than one rebuild per geometry type
    df.geometry = [promote(feature) for feature in df.geometry]
    return df

class ServiceException(Exception):
pass

Expand Down Expand Up @@ -338,7 +364,9 @@ def define_requests(
urls.append(self.wfs_url + "?" + urlencode(request, doseq=True))
return urls

def make_requests(self, urls, as_gdf=False, crs="epsg4326", lowercase=False, silent=False):
def make_requests(
self, dataset, urls, as_gdf=False, crs="epsg4326", lowercase=False, silent=False, clean=True
):
"""turn urls into data"""
# loop through urls
results = []
Expand All @@ -347,27 +375,42 @@ def make_requests(self, urls, as_gdf=False, crs="epsg4326", lowercase=False, sil
outjson = dict(type="FeatureCollection", features=[])
for result in results:
outjson["features"] += result

# if specified, lowercasify all properties
if lowercase:
for feature in outjson["features"]:
feature["properties"] = {k.lower(): v for k, v in feature["properties"].items()}
if not as_gdf:
# If output crs is specified, include the crs object in the json
# But as default, we prefer to default to 4326 and RFC7946 (no crs)
if crs.lower() != "epsg:4326":
crs_int = crs.split(":")[1]
outjson["crs"] = (
f"""{{"type":"name","properties":{{"name":"urn:ogc:def:crs:EPSG::{crs_int}"}}}}"""
)
return outjson
feature["properties"] = {
k.lower(): v for k, v in feature["properties"].items()
}

# load to geodataframe, standardize data slightly
if len(outjson["features"]) > 0:
gdf = gpd.GeoDataFrame.from_features(outjson)
gdf.crs = crs
# minor data cleaning as default
if clean:
if gdf.geometry.name != "geometry":
gdf = gdf.rename_geometry("geometry")
gdf = ensure_single_geometry_type(gdf)
table_definition = bcdata.get_table_definition(dataset)
column_names = [
c["column_name"]
for c in table_definition["schema"]
if c["column_name"] not in ["FEATURE_AREA_SQM", "FEATURE_LENGTH_M"]
and c["data_type"] in ["NUMBER", "VARCHAR2", "DATE"]
]
if lowercase:
column_names = [c.lower() for c in column_names]
gdf = gdf[column_names + ["geometry"]]
else:
if len(outjson["features"]) > 0:
gdf = gpd.GeoDataFrame.from_features(outjson)
gdf.crs = crs
else:
gdf = gpd.GeoDataFrame()
gdf = gpd.GeoDataFrame()

if as_gdf:
return gdf

else:
return json.loads(gdf.to_json())

def get_data(
self,
dataset,
Expand All @@ -379,8 +422,10 @@ def get_data(
sortby=None,
as_gdf=False,
lowercase=False,
clean=True
):
"""Request features from DataBC WFS and return GeoJSON featurecollection or geodataframe"""
dataset = self.validate_name(dataset)
urls = self.define_requests(
dataset,
query=query,
Expand All @@ -390,7 +435,7 @@ def get_data(
count=count,
sortby=sortby,
)
return self.make_requests(urls, as_gdf, crs, lowercase)
return self.make_requests(dataset, urls, as_gdf=as_gdf, crs=crs, lowercase=lowercase, clean=clean)

def get_features(
self,
Expand Down Expand Up @@ -441,6 +486,7 @@ def define_requests(
query=query,
crs=crs,
bounds=bounds,
bounds_crs=bounds_crs,
count=count,
sortby=sortby,
check_count=check_count,
Expand Down Expand Up @@ -470,6 +516,7 @@ def get_data(
sortby=None,
as_gdf=False,
lowercase=False,
clean=True
):
WFS = BCWFS()
return WFS.get_data(
Expand All @@ -482,6 +529,7 @@ def get_data(
sortby=sortby,
as_gdf=as_gdf,
lowercase=lowercase,
clean=clean
)


Expand Down
22 changes: 21 additions & 1 deletion tests/test_wfs.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import pytest
import json
import requests
import requests_mock
import stamina
Expand Down Expand Up @@ -93,7 +94,9 @@ def test_get_data_lowercase():

def test_get_data_crs():
data = bcdata.get_data(AIRPORTS_TABLE, crs="EPSG:3005")
assert data["crs"] == """{"type":"name","properties":{"name":"urn:ogc:def:crs:EPSG::3005"}}"""
assert (
data["crs"]["properties"]["name"] == 'urn:ogc:def:crs:EPSG::3005'
)


def test_get_features():
Expand Down Expand Up @@ -144,3 +147,20 @@ def test_cql_bounds_filter():
)
assert len(data["features"]) == 1
assert data["features"][0]["properties"]["AIRPORT_NAME"] == "Victoria International Airport"


def test_clean():
    """With get_data's default cleaning, SE_ANNO_CAD_DATA is not among the feature properties."""
    collection = bcdata.get_data(
        AIRPORTS_TABLE,
        query="AIRPORT_NAME='Terrace (Northwest Regional) Airport'",
    )
    first_properties = collection["features"][0]["properties"]
    assert "SE_ANNO_CAD_DATA" not in first_properties


def test_no_clean():
    """With clean=False, SE_ANNO_CAD_DATA is still present in the feature properties."""
    collection = bcdata.get_data(
        AIRPORTS_TABLE,
        query="AIRPORT_NAME='Terrace (Northwest Regional) Airport'",
        clean=False,
    )
    first_properties = collection["features"][0]["properties"]
    assert "SE_ANNO_CAD_DATA" in first_properties

0 comments on commit 8e374a9

Please sign in to comment.