From e6661b824dc74bf303f78e94a616b2b144c352c3 Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Wed, 12 Apr 2023 12:49:57 +0200 Subject: [PATCH] Modernize and simplify iris.analysis._Groupby (#5015) * Modernize and simplify _Groupby * Rename variable to improve readability Co-authored-by: Martin Yeo <40734014+trexfeathers@users.noreply.github.com> * Add a whatsnew entry * Add a type hint to _add_shared_coord * Add a test for iris.analysis._Groupby.__repr__ --------- Co-authored-by: Martin Yeo <40734014+trexfeathers@users.noreply.github.com> --- docs/src/whatsnew/latest.rst | 3 + lib/iris/analysis/__init__.py | 290 ++++++++++++-------------------- lib/iris/coords.py | 29 +--- lib/iris/tests/test_analysis.py | 11 ++ 4 files changed, 127 insertions(+), 206 deletions(-) diff --git a/docs/src/whatsnew/latest.rst b/docs/src/whatsnew/latest.rst index 402f436c97..baa362fea9 100644 --- a/docs/src/whatsnew/latest.rst +++ b/docs/src/whatsnew/latest.rst @@ -144,6 +144,9 @@ This document explains the changes made to Iris for this release 💼 Internal =========== +#. `@bouweandela`_ and `@trexfeathers`_ (reviewer) modernized and simplified + the code of ``iris.analysis._Groupby``. (:pull:`5015`) + #. `@fnattino`_ changed the order of ``ncgen`` arguments in the command to create NetCDF files for testing (caused errors on OS X). (:pull:`5105`) diff --git a/lib/iris/analysis/__init__.py b/lib/iris/analysis/__init__.py index 173487cfb0..47feb2a847 100644 --- a/lib/iris/analysis/__init__.py +++ b/lib/iris/analysis/__init__.py @@ -35,11 +35,15 @@ """ -from collections import OrderedDict +from __future__ import annotations + from collections.abc import Iterable import functools from functools import wraps from inspect import getfullargspec +import itertools +from numbers import Number +from typing import Optional, Union import warnings from cf_units import Unit @@ -2383,8 +2387,11 @@ class _Groupby: """ def __init__( - self, groupby_coords, shared_coords=None, climatological=False - ): + self, + groupby_coords: list[iris.coords.Coord], + shared_coords: Optional[list[tuple[iris.coords.Coord, int]]] = None, + climatological: bool = False, + ) -> None: """ Determine the group slices over the group-by coordinates. @@ -2408,15 +2415,15 @@ def __init__( """ #: Group-by and shared coordinates that have been grouped. - self.coords = [] - self._groupby_coords = [] - self._shared_coords = [] - self._slices_by_key = OrderedDict() + self.coords: list[iris.coords.Coord] = [] + self._groupby_coords: list[iris.coords.Coord] = [] + self._shared_coords: list[tuple[iris.coords.Coord, int]] = [] + self._groupby_indices: list[tuple[int, ...]] = [] self._stop = None # Ensure group-by coordinates are iterable. if not isinstance(groupby_coords, Iterable): raise TypeError( - "groupby_coords must be a " "`collections.Iterable` type." + "groupby_coords must be a `collections.Iterable` type." ) # Add valid group-by coordinates. @@ -2428,7 +2435,7 @@ def __init__( # Ensure shared coordinates are iterable. if not isinstance(shared_coords, Iterable): raise TypeError( - "shared_coords must be a " "`collections.Iterable` type." + "shared_coords must be a `collections.Iterable` type." ) # Add valid shared coordinates. 
for coord, dim in shared_coords: @@ -2439,9 +2446,11 @@ def __init__( # Stores mapping from original cube coords to new ones, as metadata may # not match - self.coord_replacement_mapping = [] + self.coord_replacement_mapping: list[ + tuple[iris.coords.Coord, iris.coords.Coord] + ] = [] - def _add_groupby_coord(self, coord): + def _add_groupby_coord(self, coord: iris.coords.Coord) -> None: if coord.ndim != 1: raise iris.exceptions.CoordinateMultiDimError(coord) if self._stop is None: @@ -2450,12 +2459,12 @@ def _add_groupby_coord(self, coord): raise ValueError("Group-by coordinates have different lengths.") self._groupby_coords.append(coord) - def _add_shared_coord(self, coord, dim): + def _add_shared_coord(self, coord: iris.coords.Coord, dim: int) -> None: if coord.shape[dim] != self._stop and self._stop is not None: raise ValueError("Shared coordinates have different lengths.") self._shared_coords.append((coord, dim)) - def group(self): + def group(self) -> list[tuple[int, ...]]: """ Calculate the groups and associated slices over one or more group-by coordinates. @@ -2464,147 +2473,84 @@ def group(self): group slices. Returns: - A generator of the coordinate group slices. - - """ - if self._groupby_coords: - if not self._slices_by_key: - items = [] - groups = [] - - for coord in self._groupby_coords: - groups.append(iris.coords._GroupIterator(coord.points)) - items.append(next(groups[-1])) - - # Construct the group slice for each group over the group-by - # coordinates. Keep constructing until all group-by coordinate - # groups are exhausted. - while any([item is not None for item in items]): - # Determine the extent (start, stop) of the group given - # each current group-by coordinate group. - start = max( - [ - item.groupby_slice.start - for item in items - if item is not None - ] - ) - stop = min( - [ - item.groupby_slice.stop - for item in items - if item is not None - ] - ) - # Construct composite group key for the group using the - # start value from each group-by coordinate. - key = tuple( - [coord.points[start] for coord in self._groupby_coords] - ) - # Associate group slice with group key within the ordered - # dictionary. - self._slices_by_key.setdefault(key, []).append( - slice(start, stop) - ) - # Prepare for the next group slice construction over the - # group-by coordinates. - for item_index, item in enumerate(items): - if item is None: - continue - # Get coordinate current group slice. - groupby_slice = item.groupby_slice - # Determine whether coordinate has spanned all its - # groups i.e. its full length - # or whether we need to get the coordinates next group. - if groupby_slice.stop == self._stop: - # This coordinate has exhausted all its groups, - # so remove it. - items[item_index] = None - elif groupby_slice.stop == stop: - # The current group of this coordinate is - # exhausted, so get the next one. - items[item_index] = next(groups[item_index]) - - # Merge multiple slices together into one tuple. - self._slice_merge() - # Calculate the new group-by coordinates. - self._compute_groupby_coords() - # Calculate the new shared coordinates. - self._compute_shared_coords() - # Generate the group-by slices/groups. - for groupby_slice in self._slices_by_key.values(): - yield groupby_slice - - return - - def _slice_merge(self): - """ - Merge multiple slices into one tuple and collapse items from - containing list. - - """ - # Iterate over the ordered dictionary in order to reduce - # multiple slices into a single tuple and collapse - # all items from containing list. 
- for key, groupby_slices in self._slices_by_key.items(): - if len(groupby_slices) > 1: - # Compress multiple slices into tuple representation. - groupby_indicies = [] - - for groupby_slice in groupby_slices: - groupby_indicies.extend( - range(groupby_slice.start, groupby_slice.stop) - ) - - self._slices_by_key[key] = tuple(groupby_indicies) - else: - # Remove single inner slice from list. - self._slices_by_key[key] = groupby_slices[0] - - def _compute_groupby_coords(self): + A list of the coordinate group slices. + + """ + if not self._groupby_indices: + # Construct the group indices for each group over the group-by + # coordinates. Keep constructing until all group-by coordinate + # groups are exhausted. + + def group_iterator(points): + start = 0 + for _, group in itertools.groupby(points): + stop = sum((1 for _ in group), start) + yield slice(start, stop) + start = stop + + groups = [group_iterator(c.points) for c in self._groupby_coords] + groupby_slices = [next(group) for group in groups] + indices_by_key: dict[ + tuple[Union[Number, str], ...], list[int] + ] = {} + while any(s is not None for s in groupby_slices): + # Determine the extent (start, stop) of the group given + # each current group-by coordinate group. + start = max(s.start for s in groupby_slices if s is not None) + stop = min(s.stop for s in groupby_slices if s is not None) + # Construct composite group key for the group using the + # start value from each group-by coordinate. + key = tuple( + coord.points[start] for coord in self._groupby_coords + ) + # Associate group slice with group key within the ordered + # dictionary. + indices_by_key.setdefault(key, []).extend(range(start, stop)) + # Prepare for the next group slice construction over the + # group-by coordinates. + for index, groupby_slice in enumerate(groupby_slices): + if groupby_slice is None: + continue + # Determine whether coordinate has spanned all its + # groups i.e. its full length + # or whether we need to get the coordinates next group. + if groupby_slice.stop == self._stop: + # This coordinate has exhausted all its groups, + # so remove it. + groupby_slices[index] = None + elif groupby_slice.stop == stop: + # The current group of this coordinate is + # exhausted, so get the next one. + groupby_slices[index] = next(groups[index]) + + # Cache the indices + self._groupby_indices = [tuple(i) for i in indices_by_key.values()] + # Calculate the new group-by coordinates. + self._compute_groupby_coords() + # Calculate the new shared coordinates. + self._compute_shared_coords() + + # Return the group-by indices/groups. + return self._groupby_indices + + def _compute_groupby_coords(self) -> None: """Create new group-by coordinates given the group slices.""" - - groupby_slice = [] - - # Iterate over the ordered dictionary in order to construct - # a group-by slice that samples the first element from each group. - for key_slice in self._slices_by_key.values(): - if isinstance(key_slice, tuple): - groupby_slice.append(key_slice[0]) - else: - groupby_slice.append(key_slice.start) - - groupby_slice = np.array(groupby_slice) + # Construct a group-by slice that samples the first element from each + # group. + groupby_slice = np.array([i[0] for i in self._groupby_indices]) # Create new group-by coordinates from the group-by slice. 
self.coords = [coord[groupby_slice] for coord in self._groupby_coords] - def _compute_shared_coords(self): + def _compute_shared_coords(self) -> None: """Create the new shared coordinates given the group slices.""" - - groupby_indices = [] - groupby_bounds = [] - - # Iterate over the ordered dictionary in order to construct a list of - # tuple group indices, and a list of the respective bounds of those - # indices. - for key_slice in self._slices_by_key.values(): - if isinstance(key_slice, tuple): - indices = key_slice - else: - indices = tuple(range(*key_slice.indices(self._stop))) - - groupby_indices.append(indices) - groupby_bounds.append((indices[0], indices[-1])) - - # Create new shared bounded coordinates. for coord, dim in self._shared_coords: climatological_coord = ( self.climatological and coord.units.is_time_reference() ) if coord.points.dtype.kind in "SU": if coord.bounds is None: - new_points = [] + new_points_list = [] new_bounds = None # np.apply_along_axis does not work with str.join, so we # need to loop through the array directly. First move axis @@ -2612,32 +2558,32 @@ def _compute_shared_coords(self): work_arr = np.moveaxis(coord.points, dim, -1) shape = work_arr.shape work_shape = (-1, shape[-1]) - new_shape = (len(self),) + new_shape: tuple[int, ...] = (len(self),) if coord.ndim > 1: new_shape += shape[:-1] work_arr = work_arr.reshape(work_shape) - for indices in groupby_indices: + for indices in self._groupby_indices: for arr in work_arr: - new_points.append("|".join(arr.take(indices))) + new_points_list.append("|".join(arr.take(indices))) # Reinstate flattened dimensions. Aggregated dim now leads. - new_points = np.array(new_points).reshape(new_shape) + new_points = np.array(new_points_list).reshape(new_shape) # Move aggregated dimension back to position it started in. new_points = np.moveaxis(new_points, 0, dim) else: msg = ( - "collapsing the bounded string coordinate {0!r}" - " is not supported".format(coord.name()) + "collapsing the bounded string coordinate" + f" {coord.name()!r} is not supported" ) raise ValueError(msg) else: - new_bounds = [] + new_bounds_list = [] if coord.has_bounds(): # Derive new coord's bounds from bounds. item = coord.bounds - maxmin_axis = (dim, -1) + maxmin_axis: Union[int, tuple[int, int]] = (dim, -1) first_choices = coord.bounds.take(0, -1) last_choices = coord.bounds.take(1, -1) @@ -2654,12 +2600,13 @@ def _compute_shared_coords(self): # Construct list of coordinate group boundary pairs. if monotonic: # Use first and last bound or point for new bounds. - for start, stop in groupby_bounds: + for indices in self._groupby_indices: + start, stop = indices[0], indices[-1] if ( getattr(coord, "circular", False) and (stop + 1) == self._stop ): - new_bounds.append( + new_bounds_list.append( [ first_choices.take(start, dim), first_choices.take(0, dim) @@ -2667,7 +2614,7 @@ def _compute_shared_coords(self): ] ) else: - new_bounds.append( + new_bounds_list.append( [ first_choices.take(start, dim), last_choices.take(stop, dim), @@ -2675,9 +2622,9 @@ def _compute_shared_coords(self): ) else: # Use min and max bound or point for new bounds. - for indices in groupby_indices: + for indices in self._groupby_indices: item_slice = item.take(indices, dim) - new_bounds.append( + new_bounds_list.append( [ item_slice.min(axis=maxmin_axis), item_slice.max(axis=maxmin_axis), @@ -2688,7 +2635,7 @@ def _compute_shared_coords(self): # dimension last, and the aggregated dimension back in its # original position. 
new_bounds = np.moveaxis( - np.array(new_bounds), (0, 1), (dim, -1) + np.array(new_bounds_list), (0, 1), (dim, -1) ) # Now create the new bounded group shared coordinate. @@ -2700,8 +2647,8 @@ def _compute_shared_coords(self): new_points = new_bounds.mean(-1) except TypeError: msg = ( - "The {0!r} coordinate on the collapsing dimension" - " cannot be collapsed.".format(coord.name()) + f"The {coord.name()!r} coordinate on the collapsing" + " dimension cannot be collapsed." ) raise ValueError(msg) @@ -2719,29 +2666,16 @@ def _compute_shared_coords(self): self.coords.append(new_coord) - def __len__(self): + def __len__(self) -> int: """Calculate the number of groups given the group-by coordinates.""" + return len(self.group()) - if self._slices_by_key: - value = len(self._slices_by_key) - else: - value = len([s for s in self.group()]) - - return value - - def __repr__(self): + def __repr__(self) -> str: groupby_coords = [coord.name() for coord in self._groupby_coords] - - if self._shared_coords_by_name: - shared_coords = [coord.name() for coord in self._shared_coords] - shared_string = ", shared_coords=%r)" % shared_coords - else: - shared_string = ")" - - return "%s(%r%s" % ( - self.__class__.__name__, - groupby_coords, - shared_string, + shared_coords = [coord.name() for coord, _ in self._shared_coords] + return ( + f"{self.__class__.__name__}({groupby_coords!r}" + f", shared_coords={shared_coords!r})" ) diff --git a/lib/iris/coords.py b/lib/iris/coords.py index 4245fe55ef..dcb108f712 100644 --- a/lib/iris/coords.py +++ b/lib/iris/coords.py @@ -10,7 +10,7 @@ from abc import ABCMeta, abstractmethod from collections import namedtuple -from collections.abc import Container, Iterator +from collections.abc import Container import copy from functools import lru_cache from itertools import zip_longest @@ -1218,10 +1218,6 @@ def __new__( BOUND_POSITION_END = 1 -# Private named tuple class for coordinate groups. -_GroupbyItem = namedtuple("GroupbyItem", "groupby_point, groupby_slice") - - def _get_2d_coord_bound_grid(bounds): """ Creates a grid using the bounds of a 2D coordinate with 4 sided cells. @@ -3131,26 +3127,3 @@ def xml_element(self, doc): cellMethod_xml_element.appendChild(coord_xml_element) return cellMethod_xml_element - - -# See ExplicitCoord._group() for the description/context. -class _GroupIterator(Iterator): - def __init__(self, points): - self._points = points - self._start = 0 - - def __next__(self): - num_points = len(self._points) - if self._start >= num_points: - raise StopIteration - - stop = self._start + 1 - m = self._points[self._start] - while stop < num_points and self._points[stop] == m: - stop += 1 - - group = _GroupbyItem(m, slice(self._start, stop)) - self._start = stop - return group - - next = __next__ diff --git a/lib/iris/tests/test_analysis.py b/lib/iris/tests/test_analysis.py index 0717368d98..4b36a915aa 100644 --- a/lib/iris/tests/test_analysis.py +++ b/lib/iris/tests/test_analysis.py @@ -1915,6 +1915,17 @@ def test_update_kwargs_weights(self): assert kwargs["weights"].units == "1" +def test__Groupby_repr(): + groupby_coord = iris.coords.AuxCoord([2000, 2000], var_name="year") + shared_coord = iris.coords.DimCoord( + [0, 1], + var_name="time", + units=cf_units.Unit("days since 2000-01-01"), + ) + grouper = iris.analysis._Groupby([groupby_coord], [(shared_coord, 0)]) + assert repr(grouper) == "_Groupby(['year'], shared_coords=['time'])" + + CUBE = iris.cube.Cube(0)
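
Note (not part of the patch): the core of this change is that `_Groupby.group()` no longer walks a `_GroupIterator`/`OrderedDict` of slices; it run-length groups each group-by coordinate's points with `itertools.groupby`, intersects the runs, and merges the indices that share a composite key into plain tuples. The following is a minimal standalone sketch of that grouping idea, using NumPy only; the helper names (`_group_slices`, `group_indices`) are illustrative and are not part of the Iris API, where the equivalent logic lives inside `_Groupby.group()`.

    # Sketch of the grouping strategy introduced by this patch: run-length
    # group each coordinate's points, intersect the runs, and merge indices
    # sharing a composite key built from the points at each run's start.
    import itertools

    import numpy as np


    def _group_slices(points):
        """Yield one slice per run of equal, adjacent values."""
        start = 0
        for _, run in itertools.groupby(points):
            stop = sum((1 for _ in run), start)
            yield slice(start, stop)
            start = stop


    def group_indices(coords_points):
        """Return merged index tuples, one per composite group key."""
        length = len(coords_points[0])
        groups = [_group_slices(p) for p in coords_points]
        current = [next(g) for g in groups]
        indices_by_key = {}
        while any(s is not None for s in current):
            # Extent of the current group is the overlap of the active runs.
            start = max(s.start for s in current if s is not None)
            stop = min(s.stop for s in current if s is not None)
            key = tuple(p[start] for p in coords_points)
            indices_by_key.setdefault(key, []).extend(range(start, stop))
            # Advance any coordinate whose current run is exhausted.
            for i, s in enumerate(current):
                if s is None:
                    continue
                if s.stop == length:
                    current[i] = None  # this coordinate has no runs left
                elif s.stop == stop:
                    current[i] = next(groups[i])
        return [tuple(i) for i in indices_by_key.values()]


    # Example: group by season while a second coordinate splits one season.
    season = np.array(["djf", "djf", "mam", "mam", "jja", "jja"])
    year = np.array([2000, 2000, 2000, 2001, 2001, 2001])
    print(group_indices([season, year]))
    # [(0, 1), (2,), (3,), (4, 5)]

As in the patched `group()`, repeated composite keys (non-contiguous occurrences of the same key) would be merged into a single index tuple via `setdefault(...).extend(...)`, which is what previously required the separate `_slice_merge()` pass.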