diff --git a/cf_xarray/geometry.py b/cf_xarray/geometry.py index febac8bc..fe48d58a 100644 --- a/cf_xarray/geometry.py +++ b/cf_xarray/geometry.py @@ -1,13 +1,17 @@ from __future__ import annotations import copy -from collections.abc import Sequence +from collections import ChainMap +from collections.abc import Hashable, Sequence +from dataclasses import dataclass import numpy as np import pandas as pd import xarray as xr +from numpy.typing import ArrayLike GEOMETRY_CONTAINER_NAME = "geometry_container" +FEATURES_DIM_NAME = "features" __all__ = [ "decode_geometries", @@ -30,9 +34,149 @@ # 1. node coordinates are exact; the 'normal' coordinates are a reasonable value to use, if you do not know how to interpret the nodes. +@dataclass +class GeometryNames: + """Helper class to ease handling of all the variable names needed for CF geometries.""" + + def __init__( + self, + suffix: str = "", + grid_mapping_name: str | None = None, + grid_mapping: str | None = None, + ): + self.container_name: str = GEOMETRY_CONTAINER_NAME + suffix + self.node_dim: str = "node" + suffix + self.node_count: str = "node_count" + suffix + self.node_coordinates_x: str = "x" + suffix + self.node_coordinates_y: str = "y" + suffix + self.coordinates_x: str = "crd_x" + suffix + self.coordinates_y: str = "crd_y" + suffix + self.part_node_count: str = "part_node_count" + suffix + self.part_dim: str = "part" + suffix + self.interior_ring: str = "interior_ring" + suffix + self.attrs_x: dict[str, str] = {} + self.attrs_y: dict[str, str] = {} + self.grid_mapping_attr = {"grid_mapping": grid_mapping} if grid_mapping else {} + + # Special treatment of selected grid mappings + if grid_mapping_name in ["latitude_longitude", "rotated_latitude_longitude"]: + # Special case for longitude_latitude type grid mappings + self.coordinates_x = "lon" + self.coordinates_y = "lat" + if grid_mapping_name == "latitude_longitude": + self.attrs_x = dict(units="degrees_east", standard_name="longitude") + self.attrs_y = dict(units="degrees_north", standard_name="latitude") + elif grid_mapping_name == "rotated_latitude_longitude": + self.attrs_x = dict( + units="degrees_east", standard_name="grid_longitude" + ) + self.attrs_y = dict( + units="degrees_north", standard_name="grid_latitude" + ) + elif grid_mapping_name is not None: + self.attrs_x = dict(standard_name="projection_x_coordinate") + self.attrs_y = dict(standard_name="projection_y_coordinate") + self.attrs_x.update(self.grid_mapping_attr) + self.attrs_y.update(self.grid_mapping_attr) + + @property + def geometry_container_attrs(self) -> dict[str, str]: + return { + "node_count": self.node_count, + "node_coordinates": f"{self.node_coordinates_x} {self.node_coordinates_y}", + "coordinates": f"{self.coordinates_x} {self.coordinates_y}", + **self.grid_mapping_attr, + } + + def coords( + self, + *, + dim: Hashable, + x: ArrayLike, + y: ArrayLike, + crdX: ArrayLike | None = None, + crdY: ArrayLike | None = None, + ) -> dict[str, xr.DataArray]: + """ + Construct coordinate DataArrays for the numpy data (x, y, crdX, crdY) + + Parameters + ---------- + x: array + Node coordinates for X coordinate + y: array + Node coordinates for Y coordinate + crdX: array, optional + Nominal X coordinate + crdY: array, optional + Nominal X coordinate + """ + mapping = { + self.node_coordinates_x: xr.DataArray( + x, dims=self.node_dim, attrs={"axis": "X", **self.attrs_x} + ), + self.node_coordinates_y: xr.DataArray( + y, dims=self.node_dim, attrs={"axis": "Y", **self.attrs_y} + ), + } + if crdX is not None: + mapping[self.coordinates_x] = xr.DataArray( + crdX, + dims=(dim,), + attrs={"nodes": self.node_coordinates_x, **self.attrs_x}, + ) + if crdY is not None: + mapping[self.coordinates_y] = xr.DataArray( + crdY, + dims=(dim,), + attrs={"nodes": self.node_coordinates_y, **self.attrs_y}, + ) + return mapping + + +def _assert_single_geometry_container(ds: xr.Dataset) -> Hashable: + container_names = _get_geometry_containers(ds) + if len(container_names) > 1: + raise ValueError( + "Only one geometry container is supported by cf_to_points. " + "To handle multiple geometries use `decode_geometries` instead." + ) + (container_name,) = container_names + return container_name + + +def _get_geometry_containers(obj: xr.DataArray | xr.Dataset) -> list[Hashable]: + """ + Translate from key (either CF key or variable name) to its bounds' variable names. + + This function interprets the ``geometry`` attribute on DataArrays. + + Parameters + ---------- + obj : DataArray, Dataset + DataArray belonging to the coordinate to be checked + + Returns + ------- + List[str] + Variable name(s) in parent xarray object that are bounds of `key` + """ + + if isinstance(obj, xr.DataArray): + obj = obj._to_temp_dataset() + variables = obj._variables + + results = set() + for name, var in variables.items(): + attrs_or_encoding = ChainMap(var.attrs, var.encoding) + if "geometry_type" in attrs_or_encoding: + results.update([name]) + return list(results) + + def decode_geometries(encoded: xr.Dataset) -> xr.Dataset: """ - Decode CF encoded geometries to a numpy object array containing shapely geometries. + Decode CF encoded geometries to numpy object arrays containing shapely geometries. Parameters ---------- @@ -50,46 +194,57 @@ def decode_geometries(encoded: xr.Dataset) -> xr.Dataset: cf_to_shapely encode_geometries """ - if GEOMETRY_CONTAINER_NAME not in encoded._variables: + + containers = _get_geometry_containers(encoded) + if not containers: raise NotImplementedError( - f"Currently only a single geometry variable named {GEOMETRY_CONTAINER_NAME!r} is supported." - "A variable by this name is not present in the provided dataset." + "No geometry container variables detected, none of the provided variables " + "have a `geometry_type` attribute." ) - enc_geom_var = encoded[GEOMETRY_CONTAINER_NAME] - geom_attrs = enc_geom_var.attrs - # Grab the coordinates attribute - geom_attrs.update(enc_geom_var.encoding) - - geom_var = cf_to_shapely(encoded).variable - - todrop = (GEOMETRY_CONTAINER_NAME,) + tuple( - s - for s in " ".join( - geom_attrs.get(attr, "") - for attr in [ - "interior_ring", - "node_coordinates", - "node_count", - "part_node_count", - "coordinates", - ] - ).split(" ") - if s - ) - decoded = encoded.drop_vars(todrop) - - name = geom_attrs.get("variable_name", None) - if name in decoded.dims: - decoded = decoded.assign_coords( - xr.Coordinates(coords={name: geom_var}, indexes={}) + todrop: list[Hashable] = [] + decoded = xr.Dataset() + for container_name in containers: + enc_geom_var = encoded[container_name] + geom_attrs = enc_geom_var.attrs + + # Grab the coordinates attribute + geom_attrs.update(enc_geom_var.encoding) + + geom_var = cf_to_shapely(encoded, container=container_name).variable + + todrop.extend( + (container_name,) + + tuple( + s + for s in " ".join( + geom_attrs.get(attr, "") + for attr in [ + "interior_ring", + "node_coordinates", + "node_count", + "part_node_count", + "coordinates", + ] + ).split(" ") + if s + ) ) - else: - decoded[name] = geom_var + + name = geom_attrs.get("variable_name", None) + if name in encoded.dims: + decoded = decoded.assign_coords( + xr.Coordinates(coords={name: geom_var}, indexes={}) + ) + else: + decoded[name] = geom_var + + decoded.update(encoded.drop_vars(todrop)) # Is this a good idea? We are deleting information. + # OTOH we have decoded it to a useful in-memory representation for var in decoded._variables.values(): - if var.attrs.get("geometry") == GEOMETRY_CONTAINER_NAME: + if var.attrs.get("geometry") in containers: var.attrs.pop("geometry") return decoded @@ -101,11 +256,6 @@ def encode_geometries(ds: xr.Dataset): Practically speaking, geometry variables are numpy object arrays where the first element is a shapely geometry. - .. warning:: - - Only a single geometry variable is supported at present. Contributions to fix this - are welcome. - Parameters ---------- ds : Dataset @@ -114,7 +264,9 @@ def encode_geometries(ds: xr.Dataset): Returns ------- Dataset - Where all geometry variables are encoded. + Where all geometry variables are encoded. The information in a single geometry + variable in the input is split across multiple variables in the returned Dataset + following the CF conventions. See Also -------- @@ -151,58 +303,57 @@ def encode_geometries(ds: xr.Dataset): # e.g. xvec GeometryIndex ds = ds.drop_indexes(to_drop) - if len(geom_var_names) > 1: - raise NotImplementedError( - "Multiple geometry variables are not supported at this time. " - "Contributions to fix this are welcome. " - f"Detected geometry variables are {geom_var_names!r}" + variables = {} + for name in geom_var_names: + # TODO: do we prefer this choice be invariant to number of geometry variables + suffix = "_" + str(name) if len(geom_var_names) > 1 else "" + container_name = GEOMETRY_CONTAINER_NAME + suffix + # If `name` is a dimension name, then we need to drop it. Otherwise we don't + # So set errors="ignore" + variables.update( + shapely_to_cf(ds[name], suffix=suffix) + .drop_vars(name, errors="ignore") + ._variables ) - (name,) = geom_var_names - variables = {} - # If `name` is a dimension name, then we need to drop it. Otherwise we don't - # So set errors="ignore" - variables.update( - shapely_to_cf(ds[name]).drop_vars(name, errors="ignore")._variables + geom_var = ds[name] + more_updates = {} + for varname, var in ds._variables.items(): + if varname == name: + continue + # TODO: this is incomplete. It works for vector data cubes where one of the geometry vars + # is a dimension coordinate. + if name in var.dims: + var = var.copy() + var._attrs = copy.deepcopy(var._attrs) + var.attrs["geometry"] = container_name + # The grid_mapping and coordinates attributes can be carried by the geometry container + # variable provided they are also carried by the data variables associated with the container. + if to_add := geom_var.attrs.get("coordinates", ""): + var.attrs["coordinates"] = var.attrs.get("coordinates", "") + to_add + more_updates[varname] = var + variables.update(more_updates) + + # WARNING: cf-xarray specific convention. + # For vector data cubes, `name` is a dimension name. + # By encoding to CF, we have + # encoded the information in that variable across many different + # variables (e.g. node_count) with `name` as a dimension. + # We have to record `name` somewhere so that we reconstruct + # a geometry variable of the right name at decode-time. + variables[container_name].attrs["variable_name"] = name + + encoded = xr.Dataset(variables).set_coords( + set(ds._coord_names) - set(geom_var_names) ) - geom_var = ds[name] - - more_updates = {} - for varname, var in ds._variables.items(): - if varname == name: - continue - # TODO: this is incomplete. It works for vector data cubes where one of the geometry vars - # is a dimension coordinate. - if name in var.dims: - var = var.copy() - var._attrs = copy.deepcopy(var._attrs) - var.attrs["geometry"] = GEOMETRY_CONTAINER_NAME - # The grid_mapping and coordinates attributes can be carried by the geometry container - # variable provided they are also carried by the data variables associated with the container. - if to_add := geom_var.attrs.get("coordinates", ""): - var.attrs["coordinates"] = var.attrs.get("coordinates", "") + to_add - more_updates[varname] = var - variables.update(more_updates) - - # WARNING: cf-xarray specific convention. - # For vector data cubes, `name` is a dimension name. - # By encoding to CF, we have - # encoded the information in that variable across many different - # variables (e.g. node_count) with `name` as a dimension. - # We have to record `name` somewhere so that we reconstruct - # a geometry variable of the right name at decode-time. - variables[GEOMETRY_CONTAINER_NAME].attrs["variable_name"] = name - - encoded = xr.Dataset(variables) - return encoded def reshape_unique_geometries( ds: xr.Dataset, geom_var: str = "geometry", - new_dim: str = "features", + new_dim: str = FEATURES_DIM_NAME, ) -> xr.Dataset: """Reshape a dataset containing a geometry variable so that all unique features are identified along a new dimension. @@ -263,7 +414,12 @@ def reshape_unique_geometries( return out -def shapely_to_cf(geometries: xr.DataArray | Sequence, grid_mapping: str | None = None): +def shapely_to_cf( + geometries: xr.DataArray | Sequence, + grid_mapping: str | None = None, + *, + suffix: str = "", +): """ Convert a DataArray with shapely geometry objects into a CF-compliant dataset. @@ -277,8 +433,8 @@ def shapely_to_cf(geometries: xr.DataArray | Sequence, grid_mapping: str | None A CF grid mapping name. When given, coordinates and attributes are named and set accordingly. Defaults to None, in which case the coordinates are simply names "crd_x" and "crd_y". - .. warning:: - Only the `longitude_latitude` grid mapping is currently implemented. + container_name: str, optional + Name for the "geometry container" scalar variable in the encoded Dataset Returns ------- @@ -289,7 +445,7 @@ def shapely_to_cf(geometries: xr.DataArray | Sequence, grid_mapping: str | None - 'node_count' : The number of nodes per feature. Always present for Lines and Polygons. For Points: only present if there are multipart geometries. - 'part_node_count' : The number of nodes per individual geometry. Only for Lines with multipart geometries and for Polygons with multipart geometries or holes. - 'interior_ring' : Integer boolean indicating whether rings are interior or exterior. Only for Polygons with holes. - - 'geometry_container' : Empty variable with attributes describing the geometry type. + - container_name : Empty variable with attributes describing the geometry type. References ---------- @@ -308,56 +464,39 @@ def shapely_to_cf(geometries: xr.DataArray | Sequence, grid_mapping: str | None geom.item().geom_type if isinstance(geom, xr.DataArray) else geom.geom_type for geom in geometries } - if types.issubset({"Point", "MultiPoint"}): - ds = points_to_cf(geometries) - elif types.issubset({"LineString", "MultiLineString"}): - ds = lines_to_cf(geometries) - elif types.issubset({"Polygon", "MultiPolygon"}): - ds = polygons_to_cf(geometries) - else: - raise ValueError( - f"Mixed geometry types are not supported in CF-compliant datasets. Got {types}" - ) - - ds[GEOMETRY_CONTAINER_NAME].attrs.update(coordinates="crd_x crd_y") + grid_mapping_varname = None if ( grid_mapping is None and isinstance(geometries, xr.DataArray) and (grid_mapping_varname := geometries.attrs.get("grid_mapping")) ): if grid_mapping_varname in geometries.coords: - grid_mapping = geometries.coords[grid_mapping_varname].attrs[ - "grid_mapping_name" - ] - for name_ in ["x", "y", "crd_x", "crd_y"]: - ds[name_].attrs["grid_mapping"] = grid_mapping_varname - - # Special treatment of selected grid mappings - if grid_mapping in ["latitude_longitude", "rotated_latitude_longitude"]: - # Special case for longitude_latitude type grid mappings - ds = ds.rename(crd_x="lon", crd_y="lat") - if grid_mapping == "latitude_longitude": - ds.lon.attrs.update(units="degrees_east", standard_name="longitude") - ds.x.attrs.update(units="degrees_east", standard_name="longitude") - ds.lat.attrs.update(units="degrees_north", standard_name="latitude") - ds.y.attrs.update(units="degrees_north", standard_name="latitude") - elif grid_mapping == "rotated_latitude_longitude": - ds.lon.attrs.update(units="degrees", standard_name="grid_longitude") - ds.x.attrs.update(units="degrees", standard_name="grid_longitude") - ds.lat.attrs.update(units="degrees", standard_name="grid_latitude") - ds.y.attrs.update(units="degrees", standard_name="grid_latitude") - ds[GEOMETRY_CONTAINER_NAME].attrs.update(coordinates="lon lat") - elif grid_mapping is not None: - ds.crd_x.attrs.update(standard_name="projection_x_coordinate") - ds.x.attrs.update(standard_name="projection_x_coordinate") - ds.crd_y.attrs.update(standard_name="projection_y_coordinate") - ds.y.attrs.update(standard_name="projection_y_coordinate") + # Not all CRS can be encoded in CF + grid_mapping = geometries.coords[grid_mapping_varname].attrs.get( + "grid_mapping_name", None + ) + + # TODO: consider accepting a GeometryNames instance from the user instead + names = GeometryNames( + suffix=suffix, grid_mapping_name=grid_mapping, grid_mapping=grid_mapping_varname + ) + + if types.issubset({"Point", "MultiPoint"}): + ds = points_to_cf(geometries, names=names) + elif types.issubset({"LineString", "MultiLineString"}): + ds = lines_to_cf(geometries, names=names) + elif types.issubset({"Polygon", "MultiPolygon"}): + ds = polygons_to_cf(geometries, names=names) + else: + raise ValueError( + f"Mixed geometry types are not supported in CF-compliant datasets. Got {types}" + ) return ds -def cf_to_shapely(ds: xr.Dataset): +def cf_to_shapely(ds: xr.Dataset, *, container: Hashable = GEOMETRY_CONTAINER_NAME): """ Convert geometries stored in a CF-compliant way to shapely objects stored in a single variable. @@ -378,22 +517,35 @@ def cf_to_shapely(ds: xr.Dataset): ---------- Please refer to the CF conventions document: http://cfconventions.org/Data/cf-conventions/cf-conventions-1.8/cf-conventions.html#geometries """ - geom_type = ds[GEOMETRY_CONTAINER_NAME].attrs["geometry_type"] + if container not in ds._variables: + raise ValueError( + f"{container!r} is not the name of a variable in the provided Dataset." + ) + if not (geom_type := ds[container].attrs.get("geometry_type", None)): + raise ValueError( + f"{container!r} is not the name of a valid geometry variable. " + "It does not have a `geometry_type` attribute." + ) + + # Extract all necessary geometry variables + subds = ds.cf[[container]] if geom_type == "point": - geometries = cf_to_points(ds) + geometries = cf_to_points(subds) elif geom_type == "line": - geometries = cf_to_lines(ds) + geometries = cf_to_lines(subds) elif geom_type == "polygon": - geometries = cf_to_polygons(ds) + geometries = cf_to_polygons(subds) else: raise ValueError( f"Valid CF geometry types are 'point', 'line' and 'polygon'. Got {geom_type}" ) + if gm := ds[container].attrs.get("grid_mapping"): + geometries.attrs["grid_mapping"] = gm return geometries.rename("geometry") -def points_to_cf(pts: xr.DataArray | Sequence): +def points_to_cf(pts: xr.DataArray | Sequence, *, names: GeometryNames | None = None): """Get a list of points (shapely.geometry.[Multi]Point) and return a CF-compliant geometry dataset. Parameters @@ -410,11 +562,14 @@ def points_to_cf(pts: xr.DataArray | Sequence): from shapely.geometry import MultiPoint if isinstance(pts, xr.DataArray): + # TODO: Fix this hardcoding + if pts.ndim != 1: + raise ValueError("Only 1D DataArrays are supported.") dim = pts.dims[0] coord = pts[dim] if dim in pts.coords else None pts_ = pts.values.tolist() else: - dim = "features" + dim = FEATURES_DIM_NAME coord = None pts_ = pts @@ -430,33 +585,27 @@ def points_to_cf(pts: xr.DataArray | Sequence): crdX.append(xy[0, 0]) crdY.append(xy[0, 1]) + if names is None: + names = GeometryNames() + ds = xr.Dataset( data_vars={ - "node_count": xr.DataArray(node_count, dims=(dim,)), - "geometry_container": xr.DataArray( - attrs={ - "geometry_type": "point", - "node_count": "node_count", - "node_coordinates": "x y", - "coordinates": "crd_x crd_y", - } + names.node_count: xr.DataArray(node_count, dims=(dim,)), + names.container_name: xr.DataArray( + data=np.nan, + attrs={"geometry_type": "point", **names.geometry_container_attrs}, ), }, - coords={ - "x": xr.DataArray(x, dims=("node",), attrs={"axis": "X"}), - "y": xr.DataArray(y, dims=("node",), attrs={"axis": "Y"}), - "crd_x": xr.DataArray(crdX, dims=(dim,), attrs={"nodes": "x"}), - "crd_y": xr.DataArray(crdY, dims=(dim,), attrs={"nodes": "y"}), - }, + coords=names.coords(x=x, y=y, crdX=crdX, crdY=crdY, dim=dim), ) if coord is not None: ds = ds.assign_coords({dim: coord}) # Special case when we have no MultiPoints - if (ds.node_count == 1).all(): - ds = ds.drop_vars("node_count") - del ds[GEOMETRY_CONTAINER_NAME].attrs["node_count"] + if (ds[names.node_count] == 1).data.all(): + ds = ds.drop_vars(names.node_count) + del ds[names.container_name].attrs["node_count"] return ds @@ -467,7 +616,7 @@ def cf_to_points(ds: xr.Dataset): ---------- ds : xr.Dataset A dataset with CF-compliant point geometries. - Must have a "geometry_container" variable with at least a 'node_coordinates' attribute. + Must have a *single* "geometry container" variable with at least a 'node_coordinates' attribute. Must also have the two 1D variables listed by this attribute. Returns @@ -479,8 +628,9 @@ def cf_to_points(ds: xr.Dataset): """ from shapely.geometry import MultiPoint, Point + container_name = _assert_single_geometry_container(ds) # Shorthand for convenience - geo = ds[GEOMETRY_CONTAINER_NAME].attrs + geo = ds[container_name].attrs # The features dimension name, defaults to the one of 'node_count' or the dimension of the coordinates, if present. feat_dim = None @@ -488,14 +638,14 @@ def cf_to_points(ds: xr.Dataset): xcoord_name, _ = geo["coordinates"].split(" ") (feat_dim,) = ds[xcoord_name].dims - x_name, y_name = ds[GEOMETRY_CONTAINER_NAME].attrs["node_coordinates"].split(" ") + x_name, y_name = ds[container_name].attrs["node_coordinates"].split(" ") xy = np.stack([ds[x_name].values, ds[y_name].values], axis=-1) - node_count_name = ds[GEOMETRY_CONTAINER_NAME].attrs.get("node_count") + node_count_name = ds[container_name].attrs.get("node_count") if node_count_name is None: # No node_count means all geometries are single points (node_count = 1) - # And if we had no coordinates, then the dimension defaults to "features" - feat_dim = feat_dim or "features" + # And if we had no coordinates, then the dimension defaults to FEATURES_DIM_NAME + feat_dim = feat_dim or FEATURES_DIM_NAME node_count = xr.DataArray([1] * xy.shape[0], dims=(feat_dim,)) if feat_dim in ds.coords: node_count = node_count.assign_coords({feat_dim: ds[feat_dim]}) @@ -512,10 +662,13 @@ def cf_to_points(ds: xr.Dataset): geoms[i] = MultiPoint(xy[j : j + n, :]) j += n - return xr.DataArray(geoms, dims=node_count.dims, coords=node_count.coords) + da = xr.DataArray(geoms, dims=node_count.dims, coords=node_count.coords) + if node_count_name: + del da[node_count_name] + return da -def lines_to_cf(lines: xr.DataArray | Sequence): +def lines_to_cf(lines: xr.DataArray | Sequence, *, names: GeometryNames | None = None): """Convert an iterable of lines (shapely.geometry.[Multi]Line) into a CF-compliant geometry dataset. Parameters @@ -540,6 +693,9 @@ def lines_to_cf(lines: xr.DataArray | Sequence): coord = None lines_ = np.array(lines) + if names is None: + names = GeometryNames() + _, arr, offsets = to_ragged_array(lines_) x = arr[:, 0] y = arr[:, 1] @@ -558,33 +714,23 @@ def lines_to_cf(lines: xr.DataArray | Sequence): ds = xr.Dataset( data_vars={ - "node_count": xr.DataArray(node_count, dims=(dim,)), - "part_node_count": xr.DataArray(part_node_count, dims=("part",)), - "geometry_container": xr.DataArray( - attrs={ - "geometry_type": "line", - "node_count": "node_count", - "part_node_count": "part_node_count", - "node_coordinates": "x y", - "coordinates": "crd_x crd_y", - } + names.node_count: xr.DataArray(node_count, dims=(dim,)), + names.container_name: xr.DataArray( + data=np.nan, + attrs={"geometry_type": "line", **names.geometry_container_attrs}, ), }, - coords={ - "x": xr.DataArray(x, dims=("node",), attrs={"axis": "X"}), - "y": xr.DataArray(y, dims=("node",), attrs={"axis": "Y"}), - "crd_x": xr.DataArray(crdX, dims=(dim,), attrs={"nodes": "x"}), - "crd_y": xr.DataArray(crdY, dims=(dim,), attrs={"nodes": "y"}), - }, + coords=names.coords(x=x, y=y, crdX=crdX, crdY=crdY, dim=dim), ) if coord is not None: ds = ds.assign_coords({dim: coord}) # Special case when we have no MultiLines - if len(ds.part_node_count) == len(ds.node_count): - ds = ds.drop_vars("part_node_count") - del ds[GEOMETRY_CONTAINER_NAME].attrs["part_node_count"] + if len(part_node_count) != len(node_count): + ds[names.part_node_count] = xr.DataArray(part_node_count, dims=names.part_dim) + ds[names.container_name].attrs["part_node_count"] = names.part_node_count + return ds @@ -607,8 +753,10 @@ def cf_to_lines(ds: xr.Dataset): """ from shapely import GeometryType, from_ragged_array + container_name = _assert_single_geometry_container(ds) + # Shorthand for convenience - geo = ds[GEOMETRY_CONTAINER_NAME].attrs + geo = ds[container_name].attrs # The features dimension name, defaults to the one of 'node_count' # or the dimension of the coordinates, if present. @@ -645,10 +793,14 @@ def cf_to_lines(ds: xr.Dataset): # get items from lines or multilines depending on number of parts geoms = np.where(np.diff(offset2) == 1, lines[offset2[:-1]], multilines) - return xr.DataArray(geoms, dims=node_count.dims, coords=node_count.coords) + return xr.DataArray( + geoms, dims=node_count.dims, coords=node_count.coords + ).drop_vars(node_count_name) -def polygons_to_cf(polygons: xr.DataArray | Sequence): +def polygons_to_cf( + polygons: xr.DataArray | Sequence, *, names: GeometryNames | None = None +): """Convert an iterable of polygons (shapely.geometry.[Multi]Polygon) into a CF-compliant geometry dataset. Parameters @@ -656,6 +808,9 @@ def polygons_to_cf(polygons: xr.DataArray | Sequence): polygons : sequence of shapely.geometry.Polygon or MultiPolygon The sequence of [multi]polygons to translate to a CF dataset. + names: GeometryNames, optional + Structure that helps manipulate geometry attrs. + Returns ------- xr.Dataset @@ -673,6 +828,9 @@ def polygons_to_cf(polygons: xr.DataArray | Sequence): coord = None polygons_ = np.array(polygons) + if names is None: + names = GeometryNames() + _, arr, offsets = to_ragged_array(polygons_) x = arr[:, 0] y = arr[:, 1] @@ -696,40 +854,27 @@ def polygons_to_cf(polygons: xr.DataArray | Sequence): ds = xr.Dataset( data_vars={ - "node_count": xr.DataArray(node_count, dims=(dim,)), - "interior_ring": xr.DataArray(interior_ring, dims=("part",)), - "part_node_count": xr.DataArray(part_node_count, dims=("part",)), - "geometry_container": xr.DataArray( - attrs={ - "geometry_type": "polygon", - "node_count": "node_count", - "part_node_count": "part_node_count", - "interior_ring": "interior_ring", - "node_coordinates": "x y", - "coordinates": "crd_x crd_y", - } + names.node_count: xr.DataArray(node_count, dims=(dim,)), + names.container_name: xr.DataArray( + data=np.nan, + attrs={"geometry_type": "polygon", **names.geometry_container_attrs}, ), }, - coords={ - "x": xr.DataArray(x, dims=("node",), attrs={"axis": "X"}), - "y": xr.DataArray(y, dims=("node",), attrs={"axis": "Y"}), - "crd_x": xr.DataArray(crdX, dims=(dim,), attrs={"nodes": "x"}), - "crd_y": xr.DataArray(crdY, dims=(dim,), attrs={"nodes": "y"}), - }, + coords=names.coords(x=x, y=y, crdX=crdX, crdY=crdY, dim=dim), ) if coord is not None: ds = ds.assign_coords({dim: coord}) # Special case when we have no MultiPolygons and no holes - if len(ds.part_node_count) == len(ds.node_count): - ds = ds.drop_vars("part_node_count") - del ds[GEOMETRY_CONTAINER_NAME].attrs["part_node_count"] + if len(part_node_count) != len(node_count): + ds[names.part_node_count] = xr.DataArray(part_node_count, dims=names.part_dim) + ds[names.container_name].attrs["part_node_count"] = names.part_node_count # Special case when we have no holes - if (ds.interior_ring == 0).all(): - ds = ds.drop_vars("interior_ring") - del ds[GEOMETRY_CONTAINER_NAME].attrs["interior_ring"] + if (interior_ring != 0).any(): + ds[names.interior_ring] = xr.DataArray(interior_ring, dims=names.part_dim) + ds[names.container_name].attrs["interior_ring"] = names.interior_ring return ds @@ -752,8 +897,10 @@ def cf_to_polygons(ds: xr.Dataset): """ from shapely import GeometryType, from_ragged_array + container_name = _assert_single_geometry_container(ds) + # Shorthand for convenience - geo = ds[GEOMETRY_CONTAINER_NAME].attrs + geo = ds[container_name].attrs # The features dimension name, defaults to the one of 'part_node_count' # or the dimension of the coordinates, if present. @@ -805,4 +952,6 @@ def cf_to_polygons(ds: xr.Dataset): # get items from polygons or multipolygons depending on number of parts geoms = np.where(np.diff(offset3) == 1, polygons[offset3[:-1]], multipolygons) - return xr.DataArray(geoms, dims=node_count.dims, coords=node_count.coords) + return xr.DataArray( + geoms, dims=node_count.dims, coords=node_count.coords + ).drop_vars(node_count_name) diff --git a/cf_xarray/tests/test_geometry.py b/cf_xarray/tests/test_geometry.py index c412f3c7..31cb2de8 100644 --- a/cf_xarray/tests/test_geometry.py +++ b/cf_xarray/tests/test_geometry.py @@ -360,6 +360,8 @@ def test_shapely_to_cf_errors(): ) assert encoded["x"].attrs["standard_name"] == "projection_x_coordinate" assert encoded["y"].attrs["standard_name"] == "projection_y_coordinate" + for name in ["x", "y", "crd_x", "crd_y"]: + assert "grid_mapping" not in encoded[name].attrs @requires_shapely @@ -500,6 +502,10 @@ def test_encode_decode(geometry_ds, polygon_geometry): ) ).assign({"foo": ("geoms", [1, 2])}) - for ds in (geometry_ds[1], polygon_geometry.to_dataset(), geom_dim_ds): + polyds = ( + polygon_geometry.rename("polygons").rename({"index": "index2"}).to_dataset() + ) + multi_ds = xr.merge([polyds, geometry_ds[1]]) + for ds in (geometry_ds[1], polygon_geometry.to_dataset(), geom_dim_ds, multi_ds): roundtripped = decode_geometries(encode_geometries(ds)) xr.testing.assert_identical(ds, roundtripped) diff --git a/doc/api.rst b/doc/api.rst index 4651f017..493a01a9 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -26,6 +26,7 @@ Geometries geometry.encode_geometries geometry.shapely_to_cf geometry.cf_to_shapely + geometry.GeometryNames .. currentmodule:: xarray diff --git a/doc/geometry.md b/doc/geometry.md index 3a2380f8..e105c88b 100644 --- a/doc/geometry.md +++ b/doc/geometry.md @@ -97,9 +97,8 @@ ds.identical(decoded) The following limitations can be relaxed in the future. PRs welcome! -1. cf-xarray uses `"geometry_container"` as the name for the geometry variable always -1. cf-xarray only supports decoding a single geometry in a Dataset. -1. CF xarray will not set the `"geometry"` attribute that links a variable to a geometry by default unless the geometry variable is a dimension coordiante for that variable. This heuristic works OK for vector data cubes (e.g. [xvec](https://xvec.readthedocs.io/en/stable/)). You should set the `"geometry"` attribute manually otherwise. Suggestions for better behaviour here are very welcome. +1. cf-xarray uses `"geometry_container"` as the name for the geometry variable always. If there are multiple geometry variables then `"geometry_N"`is used where N is an integer >= 0. cf-xarray behaves similarly for all associated geometry variables names: i.e. `"node"`, `"node_count"`, `"part_node_count"`, `"part"`, `"interior_ring"`. `"x"`, `"y"` (with suffixes if needed) are always the node coordinate variable names, and `"crd_x"`, `"crd_y"` are the nominal X, Y coordinate locations. None of this is configurable at the moment. +1. CF xarray will not set the `"geometry"` attribute that links a variable to a geometry by default unless the geometry variable is a dimension coordinate for that variable. This heuristic works OK for vector data cubes (e.g. [xvec](https://xvec.readthedocs.io/en/stable/)). You should set the `"geometry"` attribute manually otherwise. Suggestions for better behaviour here are very welcome. ## Lower-level conversions