diff --git a/tobac/analysis.py b/tobac/analysis.py
index bfb373de..d8651522 100644
--- a/tobac/analysis.py
+++ b/tobac/analysis.py
@@ -1,3 +1,31 @@
+"""Provide tools to analyse and visualize the tracked objects.
+
+This module provides a set of routines that enables performing analyses
+and deriving statistics for individual tracks, such as the time series
+of integrated properties and vertical profiles. It also provides
+routines to calculate summary statistics of the entire population of
+tracked features in the field, like histograms of areas/volumes
+or mass and a detailed cell lifetime analysis. These analysis
+routines are all built in a modular manner. Thus, users can reuse the
+most basic methods for interacting with the data structure of the
+package in their own analysis procedures in Python. This includes
+functions performing simple tasks like looping over all identified
+objects or trajectories and masking arrays for the analysis of
+individual features. Plotting routines include visualizations of
+individual convective cells and their properties. [1]_
+
+References
+----------
+.. Heikenfeld, M., Marinescu, P. J., Christensen, M.,
+   Watson-Parris, D., Senf, F., van den Heever, S. C.
+   & Stier, P. (2019). tobac 1.2: towards a flexible
+   framework for tracking and analysis of clouds in
+   diverse datasets. Geoscientific Model Development,
+   12(11), 4551-4570.
+"""
+
 import pandas as pd
 import numpy as np
 import logging
@@ -19,6 +47,45 @@ def cell_statistics_all(
     dimensions=["x", "y"],
     **kwargs
 ):
+    """
+    Parameters
+    ----------
+    input_cubes : iris.cube.Cube
+
+    track : dask.dataframe.DataFrame
+
+    mask : iris.cube.Cube
+        Cube containing mask (int id for tracked volumes, 0 everywhere
+        else).
+
+    aggregators : list
+        List of iris.analysis.Aggregator instances.
+
+    output_path : str, optional
+        Default is './'.
+
+    cell_selection : optional
+        Default is None.
+
+    output_name : str, optional
+        Default is 'Profiles'.
+
+    width : int, optional
+        Default is 10000.
+
+    z_coord : str, optional
+        Name of the vertical coordinate in the cube. Default is
+        'model_level_number'.
+
+    dimensions : list of str, optional
+        Default is ['x', 'y'].
+
+    **kwargs
+
+    Returns
+    -------
+    None
+    """
     if cell_selection is None:
         cell_selection = np.unique(track["cell"])
     for cell in cell_selection:
@@ -50,6 +117,46 @@ def cell_statistics(
     dimensions=["x", "y"],
     **kwargs
 ):
+    """
+    Parameters
+    ----------
+    input_cubes : iris.cube.Cube
+
+    track : dask.dataframe.DataFrame
+
+    mask : iris.cube.Cube
+        Cube containing mask (int id for tracked volumes, 0 everywhere
+        else).
+
+    aggregators : list
+        List of iris.analysis.Aggregator instances.
+
+    cell : int
+        Integer id of cell to create masked cube for output.
+
+    output_path : str, optional
+        Default is './'.
+
+    output_name : str, optional
+        Default is 'Profiles'.
+
+    width : int, optional
+        Default is 10000.
+
+    z_coord : str, optional
+        Name of the vertical coordinate in the cube. Default is
+        'model_level_number'.
+
+    dimensions : list of str, optional
+        Default is ['x', 'y'].
+
+    **kwargs
+
+    Returns
+    -------
+    None
+    """
+
     from iris.cube import Cube, CubeList
     from iris.coords import AuxCoord
     from iris import Constraint, save
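A typical call looks like the following (a minimal usage sketch; `w_cube`, `track` and `mask` are hypothetical variables assumed to come from earlier tobac feature detection, tracking and segmentation steps):

    from iris.analysis import MEAN, MAX
    from tobac.analysis import cell_statistics_all

    # Write aggregated profiles for every tracked cell to disk,
    # one set of output files per cell under output_path
    cell_statistics_all(
        input_cubes=w_cube,            # iris cube of the quantity to aggregate
        track=track,                   # trajectories with a 'cell' column
        mask=mask,                     # segmentation mask cube
        aggregators=[MEAN, MAX],       # iris.analysis.Aggregator instances
        output_path="./profiles",
        z_coord="model_level_number",
    )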
@@ -180,6 +287,31 @@ def cog_cell(
     Mask=None,
     savedir=None,
 ):
+    """
+    Parameters
+    ----------
+    cell : int
+        Integer id of cell to create masked cube for output.
+
+    Tracks : optional
+        Default is None.
+
+    M_total : subset of cube, optional
+        Default is None.
+
+    M_liquid : subset of cube, optional
+        Default is None.
+
+    M_frozen : subset of cube, optional
+        Default is None.
+
+    savedir : str, optional
+        Default is None.
+
+    Returns
+    -------
+    None
+    """
+
     from iris import Constraint
@@ -227,6 +359,48 @@ def cog_cell(
 def lifetime_histogram(
     Track, bin_edges=np.arange(0, 200, 20), density=False, return_values=False
 ):
+    """Compute the lifetime histogram of linked features.
+
+    Parameters
+    ----------
+    Track : pandas.DataFrame
+        Dataframe of linked features, containing the columns 'cell'
+        and 'time_cell'.
+
+    bin_edges : int or ndarray, optional
+        If bin_edges is an int, it defines the number of equal-width
+        bins in the given range. If bins is a ndarray, it defines a
+        monotonically increasing array of bin edges, including the
+        rightmost edge. The unit is minutes.
+        Default is np.arange(0, 200, 20).
+
+    density : bool, optional
+        If False, the result will contain the number of samples in
+        each bin. If True, the result is the value of the probability
+        density function at the bin, normalized such that the integral
+        over the range is 1. Default is False.
+
+    return_values : bool, optional
+        Bool determining whether the lifetimes of the features are
+        returned from this function. Default is False.
+
+    Returns
+    -------
+    hist : ndarray
+        The values of the histogram.
+
+    bin_edges : ndarray
+        The edges of the histogram.
+
+    bin_centers : ndarray
+        The centers of the histogram intervals.
+
+    minutes : ndarray, optional
+        Numpy array of the lifetime of each feature in minutes.
+        Returned if return_values is True.
+
+    """
+
     Track_cell = Track.groupby("cell")
     minutes = (Track_cell["time_cell"].max() / pd.Timedelta(minutes=1)).values
     hist, bin_edges = np.histogram(minutes, bin_edges, density=density)
@@ -238,13 +412,27 @@ def lifetime_histogram(
 
 def haversine(lat1, lon1, lat2, lon2):
-    """Computes the Haversine distance in kilometres between two points (based on implementation CIS https://github.com/cedadev/cis)
-    :param lat1: first point or points as array, each as array of latitude in degrees
-    :param lon1: first point or points as array, each as array of longitude in degrees
-    :param lat2: second point or points as array, each as array of latitude in degrees
-    :param lon2: second point or points as array, each as array of longitude in degrees
-    :return: distance between the two points in kilometres
+    """Computes the Haversine distance in kilometers.
+
+    Calculates the Haversine distance between two points
+    (based on the CIS implementation,
+    https://github.com/cedadev/cis).
+
+    Parameters
+    ----------
+    lat1, lon1 : array of latitude, longitude
+        First point or points as array in degrees.
+
+    lat2, lon2 : array of latitude, longitude
+        Second point or points as array in degrees.
+
+    Returns
+    -------
+    distance : array
+        Distance(s) between the two points (or point arrays) in
+        kilometers.
+
     """
+
     RADIUS_EARTH = 6378.0
     lat1 = np.radians(lat1)
     lat2 = np.radians(lat2)
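A quick sanity check of the haversine distance (a minimal sketch; the coordinates are illustrative):

    import numpy as np
    from tobac.analysis import haversine

    # Great-circle distance between two points, in kilometers
    lat1, lon1 = np.array([52.5]), np.array([13.4])   # roughly Berlin
    lat2, lon2 = np.array([48.9]), np.array([2.35])   # roughly Paris
    print(haversine(lat1, lon1, lat2, lon2))          # approx. 880 km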
@@ -261,10 +449,31 @@ def haversine(lat1, lon1, lat2, lon2):
 
 def calculate_distance(feature_1, feature_2, method_distance=None):
-    """Computes distance between two features based on either lat/lon coordinates or x/y coordinates
-    :param feature_1: first feature or points as array, each as array of latitude, longitude in degrees
-    :param feature_2: second feature or points as array, each as array of latitude, longitude in degrees
-    :return: distance between the two features in metres
+    """Compute the distance between two features based on
+    either lat/lon or x/y coordinates.
+
+    Parameters
+    ----------
+    feature_1, feature_2 : pandas.DataFrame or pandas.Series
+        Dataframes containing multiple features or pandas.Series
+        of one feature. Need to contain either projection_x_coordinate
+        and projection_y_coordinate or latitude and longitude
+        coordinates.
+
+    method_distance : {None, 'xy', 'latlon'}, optional
+        Method of distance calculation. 'xy' uses the length of the
+        vector between the two features, 'latlon' uses the haversine
+        distance. None checks whether the required coordinates are
+        present and starts with 'xy'. Default is None.
+
+    Returns
+    -------
+    distance : float or pandas.Series
+        Float with the distance between the two features in meters if
+        the inputs are two pandas.Series containing one feature,
+        pandas.Series of the distances if one of the inputs contains
+        multiple features.
+
     """
     if method_distance is None:
         if (
@@ -312,6 +521,34 @@ def calculate_distance(feature_1, feature_2, method_distance=None):
 
 def calculate_velocity_individual(feature_old, feature_new, method_distance=None):
+    """Calculate the mean velocity of a feature between two timeframes.
+
+    Parameters
+    ----------
+    feature_old : pandas.Series
+        pandas.Series of a feature at a certain timeframe. Needs to
+        contain a 'time' column and either projection_x_coordinate
+        and projection_y_coordinate or latitude and longitude coordinates.
+
+    feature_new : pandas.Series
+        pandas.Series of the same feature at a later timeframe. Needs
+        to contain a 'time' column and either projection_x_coordinate
+        and projection_y_coordinate or latitude and longitude coordinates.
+
+    method_distance : {None, 'xy', 'latlon'}, optional
+        Method of distance calculation, used to calculate the velocity.
+        'xy' uses the length of the vector between the two features,
+        'latlon' uses the haversine distance. None checks whether the
+        required coordinates are present and starts with 'xy'.
+        Default is None.
+
+    Returns
+    -------
+    velocity : float
+        Value of the approximate velocity.
+
+    """
+
     distance = calculate_distance(
         feature_old, feature_new, method_distance=method_distance
     )
@@ -321,6 +558,30 @@ def calculate_velocity_individual(feature_old, feature_new, method_distance=None):
 
 def calculate_velocity(track, method_distance=None):
+    """Calculate the velocities of a set of linked features.
+
+    Parameters
+    ----------
+    track : pandas.DataFrame
+        Dataframe of linked features, containing the columns 'cell',
+        'time' and either 'projection_x_coordinate' and
+        'projection_y_coordinate' or 'latitude' and 'longitude'.
+
+    method_distance : {None, 'xy', 'latlon'}, optional
+        Method of distance calculation, used to calculate the
+        velocity. 'xy' uses the length of the vector between the
+        two features, 'latlon' uses the haversine distance. None
+        checks whether the required coordinates are present and
+        starts with 'xy'. Default is None.
+
+    Returns
+    -------
+    track : pandas.DataFrame
+        DataFrame from the input, with an additional column 'v',
+        containing the value of the velocity for every feature at
+        every possible timestep.
+    """
+
     for cell_i, track_i in track.groupby("cell"):
         index = track_i.index.values
         for i, index_i in enumerate(index[:-1]):
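For example (a minimal sketch with a hand-built two-row track; the velocity unit follows from meters and seconds, assuming the implementation divides the distance by the time difference in seconds):

    import pandas as pd
    from tobac.analysis import calculate_velocity

    # Two timesteps of one cell on an x/y grid (coordinates in meters)
    track = pd.DataFrame({
        "cell": [1, 1],
        "time": pd.to_datetime(["2000-01-01 12:00", "2000-01-01 12:05"]),
        "projection_x_coordinate": [0.0, 3000.0],
        "projection_y_coordinate": [0.0, 4000.0],
    })
    track = calculate_velocity(track, method_distance="xy")
    print(track["v"])  # 5000 m / 300 s, i.e. roughly 16.7 m/s for the first entry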
@@ -340,6 +601,52 @@ def velocity_histogram(
     method_distance=None,
     return_values=False,
 ):
+    """Create a velocity histogram of the features. If the DataFrame
+    does not contain a velocity column, the velocities are calculated.
+
+    Parameters
+    ----------
+    track : pandas.DataFrame
+        DataFrame of the linked features, containing the columns 'cell',
+        'time' and either 'projection_x_coordinate' and
+        'projection_y_coordinate' or 'latitude' and 'longitude'.
+
+    bin_edges : int or ndarray, optional
+        If bin_edges is an int, it defines the number of equal-width
+        bins in the given range. If bins is a ndarray, it defines a
+        monotonically increasing array of bin edges, including the
+        rightmost edge. Default is np.arange(0, 30000, 500).
+
+    density : bool, optional
+        If False, the result will contain the number of samples in
+        each bin. If True, the result is the value of the probability
+        density function at the bin, normalized such that the integral
+        over the range is 1. Default is False.
+
+    method_distance : {None, 'xy', 'latlon'}, optional
+        Method of distance calculation, used to calculate the velocity.
+        'xy' uses the length of the vector between the two features,
+        'latlon' uses the haversine distance. None checks whether the
+        required coordinates are present and starts with 'xy'.
+        Default is None.
+
+    return_values : bool, optional
+        Bool determining whether the velocities of the features are
+        returned from this function. Default is False.
+
+    Returns
+    -------
+    hist : ndarray
+        The values of the histogram.
+
+    bin_edges : ndarray
+        The edges of the histogram.
+
+    velocities : ndarray, optional
+        Numpy array with the velocities of each feature.
+
+    """
+
     if "v" not in track.columns:
         logging.info("calculate velocities")
         track = calculate_velocity(track)
@@ -354,6 +661,30 @@ def velocity_histogram(
 
 def calculate_nearestneighbordistance(features, method_distance=None):
+    """Calculate the distance between a feature and the nearest other
+    feature in the same timeframe.
+
+    Parameters
+    ----------
+    features : pandas.DataFrame
+        DataFrame of the features whose nearest neighbor distance is to
+        be calculated. Needs to contain either projection_x_coordinate
+        and projection_y_coordinate or latitude and longitude coordinates.
+
+    method_distance : {None, 'xy', 'latlon'}, optional
+        Method of distance calculation. 'xy' uses the length of the vector
+        between the two features, 'latlon' uses the haversine distance.
+        None checks whether the required coordinates are present and starts
+        with 'xy'. Default is None.
+
+    Returns
+    -------
+    features : pandas.DataFrame
+        DataFrame of the features with a new column 'min_distance',
+        containing the calculated minimal distance to other features.
+
+    """
+
     from itertools import combinations
 
     features["min_distance"] = np.nan
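A small self-contained example (a sketch; the three hand-placed features are illustrative, and the per-timestep grouping is assumed from the docstring above):

    import pandas as pd
    from tobac.analysis import calculate_nearestneighbordistance

    # Three features at one timestep, x/y coordinates in meters
    features = pd.DataFrame({
        "frame": [0, 0, 0],
        "time": pd.to_datetime(["2000-01-01 12:00"] * 3),
        "projection_x_coordinate": [0.0, 1000.0, 5000.0],
        "projection_y_coordinate": [0.0, 0.0, 0.0],
    })
    features = calculate_nearestneighbordistance(features, method_distance="xy")
    print(features["min_distance"])  # expected roughly 1000, 1000, 4000 m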
@@ -393,6 +724,49 @@ def nearestneighbordistance_histogram(
     method_distance=None,
     return_values=False,
 ):
+    """Create a nearest neighbor distance histogram of the features.
+    If the DataFrame does not contain a 'min_distance' column, the
+    distances are calculated.
+
+    Parameters
+    ----------
+    features : pandas.DataFrame
+        DataFrame of the features.
+
+    bin_edges : int or ndarray, optional
+        If bin_edges is an int, it defines the number of equal-width
+        bins in the given range. If bins is a ndarray, it defines a
+        monotonically increasing array of bin edges, including the
+        rightmost edge. Default is np.arange(0, 30000, 500).
+
+    density : bool, optional
+        If False, the result will contain the number of samples in
+        each bin. If True, the result is the value of the probability
+        density function at the bin, normalized such that the integral
+        over the range is 1. Default is False.
+
+    method_distance : {None, 'xy', 'latlon'}, optional
+        Method of distance calculation. 'xy' uses the length of the
+        vector between the two features, 'latlon' uses the haversine
+        distance. None checks whether the required coordinates are
+        present and starts with 'xy'. Default is None.
+
+    return_values : bool, optional
+        Bool determining whether the nearest neighbor distances of the
+        features are returned from this function. Default is False.
+
+    Returns
+    -------
+    hist : ndarray
+        The values of the histogram.
+
+    bin_edges : ndarray
+        The edges of the histogram.
+
+    distances : ndarray, optional
+        A numpy array with the nearest neighbor distances of each
+        feature.
+
+    """
+
     if "min_distance" not in features.columns:
         logging.debug("calculate nearest neighbor distances")
         features = calculate_nearestneighbordistance(
@@ -410,34 +784,34 @@ def nearestneighbordistance_histogram(
 
 # Treatment of 2D lat/lon coordinates to be added:
 def calculate_areas_2Dlatlon(_2Dlat_coord, _2Dlon_coord):
-    """
-    Calculate an array of cell areas when given two 2D arrays of latitude and
-    longitude values
+    """Calculate an array of cell areas when given two 2D arrays
+    of latitude and longitude values.
 
-    NOTE: This currently assuems that the lat/lon grid is orthogonal, which is
-    not strictly true! It's close enough for most cases, but should be updated
-    in future to use the cross product of the distances to the neighbouring
-    cells. This will require the use of a more advanced calculation. I would
-    advise using pyproj at some point in the future to solve this issue and
-    replace haversine distance.
+    NOTE: This currently assumes that the lat/lon grid is orthogonal,
+    which is not strictly true! It's close enough for most cases, but
+    should be updated in future to use the cross product of the
+    distances to the neighbouring cells. This will require the use
+    of a more advanced calculation. I would advise using pyproj
+    at some point in the future to solve this issue and replace
+    haversine distance.
 
     Parameters
     ----------
     _2Dlat_coord : AuxCoord
-        Iris auxilliary coordinate containing a 2d grid of latitudes for each
-        point
+        Iris auxiliary coordinate containing a 2d grid of latitudes
+        for each point.
 
     _2Dlon_coord : AuxCoord
-        Iris auxilliary coordinate containing a 2d grid of longitudes for each
-        point
-
+        Iris auxiliary coordinate containing a 2d grid of longitudes
+        for each point.
+
     Returns
     -------
     area : ndarray
-        A numpy array approximating the area of each cell
+        A numpy array approximating the area of each cell.
     """
+
     hdist1 = (
         haversine(
             _2Dlat_coord.points[:-1],
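A sketch of how this could be called on a small synthetic grid (assuming iris is available; the grid values are illustrative, the function treats the grid as orthogonal per the note above, and the output units are assumed to follow from the meter-scaled haversine distances used internally):

    import numpy as np
    from iris.coords import AuxCoord
    from tobac.analysis import calculate_areas_2Dlatlon

    # Small, nearly regular lat/lon grid as 2D auxiliary coordinates
    lat2d, lon2d = np.meshgrid(np.arange(40.0, 42.0, 0.5),
                               np.arange(10.0, 12.0, 0.5), indexing="ij")
    lat_coord = AuxCoord(lat2d, standard_name="latitude", units="degrees")
    lon_coord = AuxCoord(lon2d, standard_name="longitude", units="degrees")

    areas = calculate_areas_2Dlatlon(lat_coord, lon_coord)  # approximate cell areas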
@@ -474,6 +848,48 @@ def calculate_areas_2Dlatlon(_2Dlat_coord, _2Dlon_coord):
 
 def calculate_area(features, mask, method_area=None):
+    """Calculate the area of the segments for each feature.
+
+    Parameters
+    ----------
+    features : pandas.DataFrame
+        DataFrame of the features whose area is to be calculated.
+
+    mask : iris.cube.Cube
+        Cube containing mask (int for tracked volumes, 0 everywhere
+        else). Needs to contain either projection_x_coordinate and
+        projection_y_coordinate or latitude and longitude
+        coordinates.
+
+    method_area : {None, 'xy', 'latlon'}, optional
+        Flag determining how the area is calculated. 'xy' uses the
+        areas of the individual pixels, 'latlon' uses the
+        area_weights method of iris.analysis.cartography, None
+        checks whether the required coordinates are present and
+        starts with 'xy'. Default is None.
+
+    Returns
+    -------
+    features : pandas.DataFrame
+        DataFrame of the features with a new column 'area',
+        containing the calculated areas.
+
+    Raises
+    ------
+    ValueError
+        If neither latitude/longitude nor
+        projection_x_coordinate/projection_y_coordinate are
+        present in mask_coords.
+
+        If latitude/longitude coordinates are 2D.
+
+        If latitude/longitude shapes are not supported.
+
+        If method is undefined, i.e. method is neither None,
+        'xy' nor 'latlon'.
+
+    """
+
     from tobac.utils import mask_features_surface, mask_features
     from iris import Constraint
     from iris.analysis.cartography import area_weights
@@ -541,6 +957,61 @@ def area_histogram(
     return_values=False,
     representative_area=False,
 ):
+    """Create an area histogram of the features. If the DataFrame
+    does not contain an area column, the areas are calculated.
+
+    Parameters
+    ----------
+    features : pandas.DataFrame
+        DataFrame of the features.
+
+    mask : iris.cube.Cube
+        Cube containing mask (int for tracked volumes, 0
+        everywhere else). Needs to contain either
+        projection_x_coordinate and projection_y_coordinate or
+        latitude and longitude coordinates. The output of a
+        segmentation should be used here.
+
+    bin_edges : int or ndarray, optional
+        If bin_edges is an int, it defines the number of
+        equal-width bins in the given range. If bins is a ndarray,
+        it defines a monotonically increasing array of bin edges,
+        including the rightmost edge.
+        Default is np.arange(0, 30000, 500).
+
+    density : bool, optional
+        If False, the result will contain the number of samples
+        in each bin. If True, the result is the value of the
+        probability density function at the bin, normalized such
+        that the integral over the range is 1. Default is False.
+
+    return_values : bool, optional
+        Bool determining whether the areas of the features are
+        returned from this function. Default is False.
+
+    representative_area : bool, optional
+        If False, no weights will be associated with the values.
+        If True, the weights for each area will be the areas
+        themselves, i.e. each bin count will have the value of
+        the sum of all areas within the edges of the bin.
+        Default is False.
+
+    Returns
+    -------
+    hist : ndarray
+        The values of the histogram.
+
+    bin_edges : ndarray
+        The edges of the histogram.
+
+    bin_centers : ndarray
+        The centers of the histogram intervals.
+
+    areas : ndarray, optional
+        A numpy array approximating the area of each feature.
+
+    """
+
     if "area" not in features.columns:
         logging.info("calculate area")
         features = calculate_area(features, mask, method_area)
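Usage might look like this (a sketch, assuming `features` and `mask` come from tobac feature detection and segmentation):

    import numpy as np
    from tobac.analysis import area_histogram

    # Histogram of segment areas, counted in 500 m^2 wide bins
    hist, bin_edges, bin_centers = area_histogram(
        features, mask, bin_edges=np.arange(0, 30000, 500), density=False
    )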
@@ -563,6 +1034,57 @@ def histogram_cellwise(
     Track, variable=None, bin_edges=None, quantity="max", density=False
 ):
+    """Create a histogram of the maximum, minimum or mean of
+    a variable for the cells (series of features linked together
+    over multiple timesteps) of a track. Essentially a wrapper
+    of the numpy.histogram() method.
+
+    Parameters
+    ----------
+    Track : pandas.DataFrame
+        The track containing the variable to create the histogram
+        from.
+
+    variable : string, optional
+        Column of the DataFrame with the variable on which the
+        histogram is to be based. Default is None.
+
+    bin_edges : int or ndarray, optional
+        If bin_edges is an int, it defines the number of
+        equal-width bins in the given range. If bins is a ndarray,
+        it defines a monotonically increasing array of bin edges,
+        including the rightmost edge.
+
+    quantity : {'max', 'min', 'mean'}, optional
+        Flag determining whether to use the maximum, minimum or mean
+        of a variable from all timeframes the cell covers.
+        Default is 'max'.
+
+    density : bool, optional
+        If False, the result will contain the number of samples
+        in each bin. If True, the result is the value of the
+        probability density function at the bin, normalized such
+        that the integral over the range is 1.
+        Default is False.
+
+    Returns
+    -------
+    hist : ndarray
+        The values of the histogram.
+
+    bin_edges : ndarray
+        The edges of the histogram.
+
+    bin_centers : ndarray
+        The centers of the histogram intervals.
+
+    Raises
+    ------
+    ValueError
+        If quantity is not 'max', 'min' or 'mean'.
+
+    """
+
     Track_cell = Track.groupby("cell")
     if quantity == "max":
         variable_cell = Track_cell[variable].max().values
@@ -579,6 +1101,46 @@ def histogram_cellwise(
 
 def histogram_featurewise(Track, variable=None, bin_edges=None, density=False):
+    """Create a histogram of a variable from the features
+    (detected objects at a single time step) of a track.
+    Essentially a wrapper of the numpy.histogram() method.
+
+    Parameters
+    ----------
+    Track : pandas.DataFrame
+        The track containing the variable to create the
+        histogram from.
+
+    variable : string, optional
+        Column of the DataFrame with the variable on which the
+        histogram is to be based. Default is None.
+
+    bin_edges : int or ndarray, optional
+        If bin_edges is an int, it defines the number of
+        equal-width bins in the given range. If bins is
+        a sequence, it defines a monotonically increasing
+        array of bin edges, including the rightmost edge.
+
+    density : bool, optional
+        If False, the result will contain the number of
+        samples in each bin. If True, the result is the
+        value of the probability density function at the
+        bin, normalized such that the integral over the
+        range is 1. Default is False.
+
+    Returns
+    -------
+    hist : ndarray
+        The values of the histogram.
+
+    bin_edges : ndarray
+        The edges of the histogram.
+
+    bin_centers : ndarray
+        The centers of the histogram intervals.
+
+    """
+
     hist, bin_edges = np.histogram(Track[variable].values, bin_edges, density=density)
     bin_centers = bin_edges[:-1] + 0.5 * np.diff(bin_edges)
 
@@ -588,6 +1150,36 @@ def histogram_featurewise(Track, variable=None, bin_edges=None, density=False):
 
 def calculate_overlap(
     track_1, track_2, min_sum_inv_distance=None, min_mean_inv_distance=None
 ):
+    """Count the number of time frames in which the
+    individual cells of two tracks are present together
+    and calculate their mean and summed inverse distance.
+
+    Parameters
+    ----------
+    track_1, track_2 : pandas.DataFrame
+        The tracks containing the cells to analyze.
+
+    min_sum_inv_distance : float, optional
+        Minimum of the inverse net distance for two
+        cells to be counted as overlapping.
+        Default is None.
+
+    min_mean_inv_distance : float, optional
+        Minimum of the inverse mean distance for two cells
+        to be counted as overlapping. Default is None.
+
+    Returns
+    -------
+    overlap : pandas.DataFrame
+        DataFrame containing the columns cell_1 and cell_2
+        with the index of the cells from the tracks,
+        n_overlap with the number of frames both cells are
+        present in, mean_inv_distance with the mean inverse
+        distance and sum_inv_distance with the summed
+        inverse distance of the cells.
+
+    """
+
     cells_1 = track_1["cell"].unique()
     #  n_cells_1_tot=len(cells_1)
     cells_2 = track_2["cell"].unique()
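Usage might look like this (a sketch; `track_1` and `track_2` are assumed to be linked-track DataFrames, and the threshold value is hypothetical):

    from tobac.analysis import calculate_overlap

    # Pair up cells from two track sets that coexist in time and are
    # close enough (inverse mean distance above a hypothetical threshold)
    overlap = calculate_overlap(track_1, track_2,
                                min_mean_inv_distance=1e-4)
    print(overlap[["cell_1", "cell_2", "n_overlap"]].head())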
+""" + import logging def calculate_cog(tracks, mass, mask): - """caluclate centre of gravity and mass forech individual tracked cell in the simulation - Input: - tracks: pandas.DataFrame - DataFrame containing trajectories of cell centres - mass: iris.cube.Cube - cube of quantity (need coordinates 'time', 'geopotential_height','projection_x_coordinate' and 'projection_y_coordinate') - mask: iris.cube.Cube - cube containing mask (int > where belonging to cloud volume, 0 everywhere else ) - Output: - tracks_out pandas.DataFrame - Dataframe containing t,x,y,z positions of centre of gravity and total cloud mass each tracked cells at each timestep + """Calculate center of gravity and mass for each tracked cell. + + Parameters + ---------- + tracks : pandas.DataFrame + DataFrame containing trajectories of cell centers. + + mass : iris.cube.Cube + Cube of quantity (need coordinates 'time', + 'geopotential_height','projection_x_coordinate' and + 'projection_y_coordinate'). + + mask : iris.cube.Cube + Cube containing mask (int > where belonging to area/volume + of feature, 0 else). + + Returns + ------- + tracks_out : pandas.DataFrame + Dataframe containing t, x, y, z positions of center of gravity + and total mass of each tracked cell at each timestep. """ + from .utils import mask_cube_cell from iris import Constraint @@ -39,17 +53,26 @@ def calculate_cog(tracks, mass, mask): def calculate_cog_untracked(mass, mask): - """caluclate centre of gravity and mass for untracked parts of domain - Input: - mass: iris.cube.Cube - cube of quantity (need coordinates 'time', 'geopotential_height','projection_x_coordinate' and 'projection_y_coordinate') - - mask: iris.cube.Cube - cube containing mask (int > where belonging to cloud volume, 0 everywhere else ) - Output: - tracks_out pandas.DataFrame - Dataframe containing t,x,y,z positions of centre of gravity and total cloud mass for untracked part of dimain + """Calculate center of gravity and mass for untracked domain parts. + + Parameters + ---------- + mass : iris.cube.Cube + Cube of quantity (need coordinates 'time', + 'geopotential_height','projection_x_coordinate' and + 'projection_y_coordinate'). + + mask : iris.cube.Cube + Cube containing mask (int > where belonging to area/volume + of feature, 0 else). + + Returns + ------- + tracks_out : pandas.DataFrame + Dataframe containing t, x, y, z positions of center of gravity + and total mass for untracked part of the domain. """ + from pandas import DataFrame from .utils import mask_cube_untracked from iris import Constraint @@ -81,14 +104,22 @@ def calculate_cog_untracked(mass, mask): def calculate_cog_domain(mass): - """caluclate centre of gravity and mass for entire domain - Input: - mass: iris.cube.Cube - cube of quantity (need coordinates 'time', 'geopotential_height','projection_x_coordinate' and 'projection_y_coordinate') - Output: - tracks_out pandas.DataFrame - Dataframe containing t,x,y,z positions of centre of gravity and total cloud mass + """Calculate center of gravity and mass for entire domain. + + Parameters + ---------- + mass : iris.cube.Cube + Cube of quantity (need coordinates 'time', + 'geopotential_height','projection_x_coordinate' and + 'projection_y_coordinate'). + + Returns + ------- + tracks_out : pandas.DataFrame + Dataframe containing t, x, y, z positions of center of gravity + and total mass of the entire domain. 
""" + from pandas import DataFrame from iris import Constraint @@ -115,21 +146,30 @@ def calculate_cog_domain(mass): def center_of_gravity(cube_in): - """caluclate centre of gravity and sum of quantity - Input: - cube_in: iris.cube.Cube - cube (potentially masked) of quantity (need coordinates 'geopotential_height','projection_x_coordinate' and 'projection_y_coordinate') - Output: - x: float - x position of centre of gravity - y: float - y position of centre of gravity - z: float - z position of centre of gravity - variable_sum: float - sum of quantity of over unmasked part of the cube + """Calculate center of gravity and sum of quantity. + + Parameters + ---------- + cube_in : iris.cube.Cube + Cube (potentially masked) of quantity (need coordinates + 'geopotential_height','projection_x_coordinate' and + 'projection_y_coordinate'). + Returns + ------- + x : float + X position of center of gravity. + + y : float + Y position of center of gravity. + + z : float + Z position of center of gravity. + + variable_sum : float + Sum of quantity of over unmasked part of the cube. """ + from iris.analysis import SUM import numpy as np diff --git a/tobac/feature_detection.py b/tobac/feature_detection.py index 69d2bd90..c11d527b 100644 --- a/tobac/feature_detection.py +++ b/tobac/feature_detection.py @@ -1,3 +1,22 @@ +"""Provide feature detection. + +This module can work with any two-dimensional field. +To identify the features, contiguous regions above or +below a threshold are determined and labelled individually. +To describe the specific location of the feature at a +specific point in time, different spatial properties +are used to describe the identified region. [2]_ + +References +---------- +.. Heikenfeld, M., Marinescu, P. J., Christensen, M., + Watson-Parris, D., Senf, F., van den Heever, S. C. + & Stier, P. (2019). tobac 1.2: towards a flexible + framework for tracking and analysis of clouds in + diverse datasets. Geoscientific Model Development, + 12(11), 4551-4570. +""" + import logging import numpy as np import pandas as pd @@ -15,27 +34,33 @@ def feature_position( position_threshold="center", target=None, ): - """Function to determine feature position + """Determine feature position with regard to the horizontal + dimensions in pixels from the identified region above + threshold values Parameters ---------- hdim1_indices : list - list of indices along hdim1 (typically ```y```) + indices of pixels in region along first horizontal + dimension hdim2_indices : list - List of indices of feature along hdim2 (typically ```x```) + indices of pixels in region along second horizontal + dimension region_small : 2D array-like A true/false array containing True where the threshold - is met and false where the threshold isn't met. This array should - be the the size specified by region_bbox, and can be a subset of the - overall input array (i.e., ```track_data```). + is met and false where the threshold isn't met. This + array should be the the size specified by region_bbox, + and can be a subset of the overall input array + (i.e., ```track_data```). region_bbox : list or tuple with length of 4 - The coordinates that region_small occupies within the total track_data - array. This is in the order that the coordinates come from the - ```get_label_props_in_dict``` function. For 2D data, this should be: - (hdim1 start, hdim 2 start, hdim 1 end, hdim 2 end). + The coordinates that region_small occupies within the + total track_data array. 
+        This is in the order that the
+        coordinates come from the ```get_label_props_in_dict```
+        function. For 2D data, this should be: (hdim1 start,
+        hdim 2 start, hdim 1 end, hdim 2 end).
 
     track_data : 2D array-like
         2D array containing the data
 
     threshold_i : float
         The threshold value that we are testing against
 
-    position_threshold : {'center', 'extreme', 'weighted_diff', 'weighted abs'}
+    position_threshold : {'center', 'extreme', 'weighted_diff',
+                          'weighted_abs'}
         How to select the single point position from our data.
-        'center' picks the geometrical centre of the region, and is typically not recommended.
-        'extreme' picks the maximum or minimum value inside the region (max/min set by ```target```)
-        'weighted_diff' picks the centre of the region weighted by the distance from the threshold value
-        'weighted_abs' picks the centre of the region weighted by the absolute values of the field
+        'center' picks the geometrical centre of the region,
+        and is typically not recommended. 'extreme' picks the
+        maximum or minimum value inside the region (max/min set
+        by ```target```). 'weighted_diff' picks the centre of the
+        region weighted by the distance from the threshold value.
+        'weighted_abs' picks the centre of the region weighted by
+        the absolute values of the field.
 
     target : {'maximum', 'minimum'}
-        Used only when position_threshold is set to 'extreme', this sets whether
-        it is looking for maxima or minima.
+        Used only when position_threshold is set to 'extreme',
+        this sets whether it is looking for maxima or minima.
 
     Returns
     -------
@@ -104,36 +133,57 @@ def feature_position(
 
 def test_overlap(region_inner, region_outer):
+    """Test for overlap between two regions.
+
+    Parameters
+    ----------
+    region_inner : list
+        List of 2-element tuples defining the indices of
+        all cells in the region.
+
+    region_outer : list
+        List of 2-element tuples defining the indices of
+        all cells in the region.
+
+    Returns
+    -------
+    overlap : bool
+        True if there are any shared points between the two
+        regions.
     """
-    function to test for overlap between two regions (probably scope for further speedup here)
-    Input:
-    region_1: list
-              list of 2-element tuples defining the indices of all cell in the region
-    region_2: list
-              list of 2-element tuples defining the indices of all cell in the region
-
-    Output:
-    overlap: bool
-             True if there are any shared points between the two regions
-    """
+
     overlap = frozenset(region_outer).isdisjoint(region_inner)
     return not overlap
 
 
 def remove_parents(features_thresholds, regions_i, regions_old):
+    """Remove parents of newly detected feature regions.
+
+    Remove features where their regions surround newly
+    detected feature regions.
+
+    Parameters
+    ----------
+    features_thresholds : pandas.DataFrame
+        Dataframe containing detected features.
+
+    regions_i : dict
+        Dictionary containing the regions above/below
+        threshold for the newly detected feature
+        (feature ids as keys).
+
+    regions_old : dict
+        Dictionary containing the regions above/below
+        threshold from previous threshold
+        (feature ids as keys).
+
+    Returns
+    -------
+    features_thresholds : pandas.DataFrame
+        Dataframe containing detected features excluding those
+        that are superseded by newly detected ones.
""" - function to remove features whose regions surround newly detected feature regions - Input: - features_thresholds: pandas.DataFrame - Dataframe containing detected features - regions_i: dict - dictionary containing the regions above/below threshold for the newly detected feature (feature ids as keys) - regions_old: dict - dictionary containing the regions above/below threshold from previous threshold (feature ids as keys) - Output: - features_thresholds pandas.DataFrame - Dataframe containing detected features excluding those that are superseded by newly detected ones - """ + try: all_curr_pts = np.concatenate([vals for idx, vals in regions_i.items()]) all_old_pts = np.concatenate([vals for idx, vals in regions_old.items()]) @@ -173,35 +223,55 @@ def feature_detection_threshold( min_distance=0, idx_start=0, ): + """Find features based on individual threshold value. + + Parameters + ---------- + data_i : iris.cube.Cube + 2D field to perform the feature detection (single timestep) on. + + i_time : int + Number of the current timestep. + + threshold : float, optional + Threshold value used to select target regions to track. Default + is None. + + target : {'maximum', 'minimum'}, optional + Flag to determine if tracking is targetting minima or maxima + in the data. Default is 'maximum'. + + position_threshold : {'center', 'extreme', 'weighted_diff', + 'weighted_abs'}, optional + Flag choosing method used for the position of the tracked + feature. Default is 'center'. + + sigma_threshold: float, optional + Standard deviation for intial filtering step. Default is 0.5. + + n_erosion_threshold: int, optional + Number of pixel by which to erode the identified features. + Default is 0. + + n_min_threshold : int, optional + Minimum number of identified features. Default is 0. + + min_distance : float, optional + Minimum distance between detected features. Default is 0. + + idx_start : int, optional + Feature id to start with. Default is 0. + + Returns + ------- + features_threshold : pandas DataFrame + Detected features for individual threshold. + + regions : dict + Dictionary containing the regions above/below threshold used + for each feature (feature ids as keys). 
""" - function to find features based on individual threshold value: - Input: - data_i: iris.cube.Cube - 2D field to perform the feature detection (single timestep) - i_time: int - number of the current timestep - threshold: float - threshold value used to select target regions to track - target: str ('minimum' or 'maximum') - flag to determine if tracking is targetting minima or maxima in the data - position_threshold: str('extreme', 'weighted_diff', 'weighted_abs' or 'center') - flag choosing method used for the position of the tracked feature - sigma_threshold: float - standard deviation for intial filtering step - n_erosion_threshold: int - number of pixel by which to erode the identified features - n_min_threshold: int - minimum number of identified features - min_distance: float - minimum distance between detected features (m) - idx_start: int - feature id to start with - Output: - features_threshold: pandas DataFrame - detected features for individual threshold - regions: dict - dictionary containing the regions above/below threshold used for each feature (feature ids as keys) - """ + from skimage.measure import label from skimage.morphology import binary_erosion from copy import deepcopy @@ -337,35 +407,54 @@ def feature_detection_multithreshold_timestep( min_distance=0, feature_number_start=1, ): - """ - function to find features in each timestep based on iteratively finding regions above/below a set of thresholds - Input: - data_i: iris.cube.Cube - 2D field to perform the feature detection (single timestep) - i_time: int - number of the current timestep - - threshold: list of floats - threshold values used to select target regions to track - dxy: float - grid spacing of the input data (m) - target: str ('minimum' or 'maximum') - flag to determine if tracking is targetting minima or maxima in the data - position_threshold: str('extreme', 'weighted_diff', 'weighted_abs' or 'center') - flag choosing method used for the position of the tracked feature - sigma_threshold: float - standard deviation for intial filtering step - n_erosion_threshold: int - number of pixel by which to erode the identified features - n_min_threshold: int - minimum number of identified features - min_distance: float - minimum distance between detected features (m) - feature_number_start: int - feature number to start with - Output: - features_threshold: pandas DataFrame - detected features for individual timestep + """Find features in each timestep. + + Based on iteratively finding regions above/below a set of + thresholds. Smoothing the input data with the Gaussian filter makes + output less sensitive to noisiness of input data. + + Parameters + ---------- + + data_i : iris.cube.Cube + 2D field to perform the feature detection (single timestep) on. + + threshold : float, optional + Threshold value used to select target regions to track. Default + is None. + + min_num : int, optional + This parameter is not used in the function. Default is 0. + + target : {'maximum', 'minimum'}, optinal + Flag to determine if tracking is targetting minima or maxima + in the data. Default is 'maximum'. + + position_threshold : {'center', 'extreme', 'weighted_diff', + 'weighted_abs'}, optional + Flag choosing method used for the position of the tracked + feature. Default is 'center'. + + sigma_threshold: float, optional + Standard deviation for intial filtering step. Default is 0.5. + + n_erosion_threshold: int, optional + Number of pixel by which to erode the identified features. + Default is 0. 
@@ -337,35 +407,54 @@ def feature_detection_multithreshold_timestep(
     min_distance=0,
     feature_number_start=1,
 ):
-    """
-    function to find features in each timestep based on iteratively finding regions above/below a set of thresholds
-    Input:
-    data_i: iris.cube.Cube
-            2D field to perform the feature detection (single timestep)
-    i_time: int
-            number of the current timestep
-
-    threshold: list of floats
-               threshold values used to select target regions to track
-    dxy: float
-         grid spacing of the input data (m)
-    target: str ('minimum' or 'maximum')
-            flag to determine if tracking is targetting minima or maxima in the data
-    position_threshold: str('extreme', 'weighted_diff', 'weighted_abs' or 'center')
-                        flag choosing method used for the position of the tracked feature
-    sigma_threshold: float
-                     standard deviation for intial filtering step
-    n_erosion_threshold: int
-                         number of pixel by which to erode the identified features
-    n_min_threshold: int
-                     minimum number of identified features
-    min_distance: float
-                  minimum distance between detected features (m)
-    feature_number_start: int
-                          feature number to start with
-    Output:
-    features_threshold: pandas DataFrame
-                        detected features for individual timestep
+    """Find features in each timestep.
+
+    Based on iteratively finding regions above/below a set of
+    thresholds. Smoothing the input data with the Gaussian filter makes
+    output less sensitive to noisiness of input data.
+
+    Parameters
+    ----------
+    data_i : iris.cube.Cube
+        2D field to perform the feature detection (single timestep) on.
+
+    i_time : int
+        Number of the current timestep.
+
+    threshold : list of floats, optional
+        Threshold values used to select target regions to track. Default
+        is None.
+
+    min_num : int, optional
+        This parameter is not used in the function. Default is 0.
+
+    target : {'maximum', 'minimum'}, optional
+        Flag to determine if tracking is targeting minima or maxima
+        in the data. Default is 'maximum'.
+
+    position_threshold : {'center', 'extreme', 'weighted_diff',
+                          'weighted_abs'}, optional
+        Flag choosing method used for the position of the tracked
+        feature. Default is 'center'.
+
+    sigma_threshold : float, optional
+        Standard deviation for initial filtering step. Default is 0.5.
+
+    n_erosion_threshold : int, optional
+        Number of pixels by which to erode the identified features.
+        Default is 0.
+
+    n_min_threshold : int, optional
+        Minimum number of identified contiguous pixels for a feature
+        to be detected. Default is 0.
+
+    min_distance : float, optional
+        Minimum distance (in meters) between detected features.
+        Default is 0.
+
+    feature_number_start : int, optional
+        Feature id to start with. Default is 1.
+
+    Returns
+    -------
+    features_threshold : pandas DataFrame
+        Detected features for individual timestep.
     """
     # Handle scipy depreciation gracefully
     try:
@@ -437,31 +526,62 @@ def feature_detection_multithreshold(
     min_distance=0,
     feature_number_start=1,
 ):
-    """Function to perform feature detection based on contiguous regions above/below a threshold
-    Input:
-    field_in: iris.cube.Cube
-              2D field to perform the tracking on (needs to have coordinate 'time' along one of its dimensions)
-
-    thresholds: list of floats
-               threshold values used to select target regions to track
-    dxy: float
-         grid spacing of the input data (m)
-    target: str ('minimum' or 'maximum')
-            flag to determine if tracking is targetting minima or maxima in the data
-    position_threshold: str('extreme', 'weighted_diff', 'weighted_abs' or 'center')
-                        flag choosing method used for the position of the tracked feature
-    sigma_threshold: float
-                     standard deviation for intial filtering step
-    n_erosion_threshold: int
-                         number of pixel by which to erode the identified features
-    n_min_threshold: int
-                     minimum number of identified features
-    min_distance: float
-                  minimum distance between detected features (m)
-    Output:
-    features: pandas DataFrame
-              detected features
+    """Perform feature detection based on contiguous regions.
+
+    The regions are above/below a threshold.
+
+    Parameters
+    ----------
+    field_in : iris.cube.Cube
+        2D field to perform the tracking on (needs to have coordinate
+        'time' along one of its dimensions).
+
+    dxy : float
+        Grid spacing of the input data (in meters).
+
+    thresholds : list of floats, optional
+        Threshold values used to select target regions to track.
+        Default is None.
+
+    target : {'maximum', 'minimum'}, optional
+        Flag to determine if tracking is targeting minima or maxima in
+        the data. Default is 'maximum'.
+
+    position_threshold : {'center', 'extreme', 'weighted_diff',
+                          'weighted_abs'}, optional
+        Flag choosing method used for the position of the tracked
+        feature. Default is 'center'.
+
+    coord_interp_kind : str, optional
+        The kind of interpolation for coordinates. Default is 'linear'.
+        For 1d interp, {'linear', 'nearest', 'nearest-up', 'zero',
+        'slinear', 'quadratic', 'cubic', 'previous', 'next'}.
+        For 2d interp, {'linear', 'cubic', 'quintic'}.
+
+    sigma_threshold : float, optional
+        Standard deviation for initial filtering step. Default is 0.5.
+
+    n_erosion_threshold : int, optional
+        Number of pixels by which to erode the identified features.
+        Default is 0.
+
+    n_min_threshold : int, optional
+        Minimum number of identified contiguous pixels for a feature
+        to be detected. Default is 0.
+
+    min_distance : float, optional
+        Minimum distance (in meters) between detected features.
+        Default is 0.
+
+    feature_number_start : int, optional
+        Feature id to start with. Default is 1.
+
+    Returns
+    -------
+    features : pandas.DataFrame
+        Detected features.
+        The structure of this dataframe is explained
+        `here `__
     """
+
     from .utils import add_coordinates
 
     if min_num != 0:
@@ -528,18 +648,26 @@ def feature_detection_multithreshold(
 
 def filter_min_distance(features, dxy, min_distance):
-    """Function to perform feature detection based on contiguous regions above/below a threshold
-    Input:
-    features: pandas DataFrame
-              features
-    dxy: float
-         horzontal grid spacing (m)
-    min_distance: float
-                  minimum distance between detected features (m)
-    Output:
-    features: pandas DataFrame
-              features
+    """Remove detected features that are closer together than a
+    minimum distance.
+
+    Parameters
+    ----------
+    features : pandas.DataFrame
+
+    dxy : float
+        Grid spacing (in meters) of the input data.
+
+    min_distance : float, optional
+        Minimum distance (in meters) between detected features.
+
+    Returns
+    -------
+    features : pandas.DataFrame
+        Features after the minimum-distance filtering.
     """
+
     from itertools import combinations
 
     remove_list_distance = []
diff --git a/tobac/segmentation.py b/tobac/segmentation.py
index 2a76ebc6..ca06a798 100644
--- a/tobac/segmentation.py
+++ b/tobac/segmentation.py
@@ -1,3 +1,35 @@
+"""Provide segmentation techniques.
+
+Segmentation techniques are used to associate areas or volumes to each
+identified feature. The segmentation is implemented using watershedding
+techniques from the field of image processing with a fixed threshold
+value. This value has to be set specifically for every type of input
+data and application. The segmentation can be performed for both
+two-dimensional and three-dimensional data. At each timestep, a marker
+is set at the position (weighted mean center) of each feature identified
+in the detection step in an array otherwise filled with zeros. In case
+of the three-dimensional watershedding, all cells in the column above
+the weighted mean center position of the identified features fulfilling
+the threshold condition are set to the respective marker. The algorithm
+then fills the area (2D) or volume (3D) based on the input field
+starting from these markers until reaching the threshold. If two or more
+features are directly connected, the border runs along the
+watershed line between the two regions. This procedure creates a mask
+that has the same form as the input data, with the corresponding integer
+number at all grid points that belong to a feature, else with zero. This
+mask can be conveniently and efficiently used to select the volume of each
+feature at a specific time step for further analysis or visualization.
+
+References
+----------
+.. Heikenfeld, M., Marinescu, P. J., Christensen, M.,
+   Watson-Parris, D., Senf, F., van den Heever, S. C.
+   & Stier, P. (2019). tobac 1.2: towards a flexible
+   framework for tracking and analysis of clouds in
+   diverse datasets. Geoscientific Model Development,
+   12(11), 4551-4570.
+"""
+
 import logging
 
 
@@ -11,6 +43,8 @@ def segmentation_3D(
     method="watershed",
     max_distance=None,
 ):
+    """Wrapper for the segmentation()-function."""
+
     return segmentation(
         features,
         field,
@@ -33,6 +67,7 @@ def segmentation_2D(
     method="watershed",
     max_distance=None,
 ):
+    """Wrapper for the segmentation()-function."""
     return segmentation(
         features,
         field,
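The watershedding idea described in the module docstring can be illustrated with a toy sketch (markers grow over the field until they meet, and the mask plays the role of the threshold condition; tobac applies this with the detected feature positions as markers):

    import numpy as np
    from skimage.segmentation import watershed

    # Two markers flood the field; the center point is excluded by the mask
    field = np.array([[1.0, 2.0, 5.0, 2.0, 1.0]])
    markers = np.array([[1, 0, 0, 0, 2]])
    seg = watershed(field, markers, mask=field < 4.0)
    print(seg)  # [[1 1 0 2 2]]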
@@ -56,31 +91,73 @@ def segmentation_timestep(
     max_distance=None,
     vertical_coord="auto",
 ):
+    """Perform watershedding for an individual time step of the data.
+    Works for both 2D and 3D data.
+
+    Parameters
+    ----------
+    field_in : iris.cube.Cube
+        Input field to perform the watershedding on (2D or 3D for one
+        specific point in time).
+
+    features_in : pandas.DataFrame
+        Features for one specific point in time.
+
+    dxy : float
+        Grid spacing of the input data in metres.
+
+    threshold : float, optional
+        Threshold for the watershedding field to be used for the mask.
+        Default is 3e-3.
+
+    target : {'maximum', 'minimum'}, optional
+        Flag to determine if tracking is targeting minima or maxima in
+        the data to determine from which direction to approach the
+        threshold value. Default is 'maximum'.
+
+    level : slice of iris.cube.Cube, optional
+        Levels at which to seed the cells for the watershedding
+        algorithm. Default is None.
+
+    method : {'watershed'}, optional
+        Flag determining the algorithm to use (currently watershedding
+        implemented). 'random_walk' could be uncommented.
+
+    max_distance : float, optional
+        Maximum distance from a marker allowed to be classified as
+        belonging to that cell. Default is None.
+
+    vertical_coord : str, optional
+        Vertical coordinate in 3D input data. If 'auto', input is checked
+        for one of {'z', 'model_level_number', 'altitude',
+        'geopotential_height'} as a likely coordinate name.
+
+    Returns
+    -------
+    segmentation_out : iris.cube.Cube
+        Mask, 0 outside and integer numbers according to track
+        inside the objects.
+
+    features_out : pandas.DataFrame
+        Feature dataframe including the number of cells (2D or 3D) in
+        the segmented area/volume of the feature at the timestep.
+
+    Raises
+    ------
+    ValueError
+        If target is neither 'maximum' nor 'minimum'.
+
+        If vertical_coord is not in {'auto', 'z', 'model_level_number',
+        'altitude', 'geopotential_height'}.
+
+        If there is more than one coordinate name.
+
+        If the spatial dimension is neither 2 nor 3.
+
+        If method is not 'watershed'.
+
     """
-    Function performing watershedding for an individual timestep of the data
-
-    Parameters:
-    features:         pandas.DataFrame
-                      features for one specific point in time
-    field:            iris.cube.Cube
-                      input field to perform the watershedding on (2D or 3D for one specific point in time)
-    threshold:        float
-                      threshold for the watershedding field to be used for the mas
-    target:           string
-                      switch to determine if algorithm looks strating from maxima or minima in input field (maximum: starting from maxima (default), minimum: starting from minima)
-    level             slice
-                      vertical levels at which to seed the cells for the watershedding algorithm
-    method:           string
-                      flag determining the algorithm to use (currently watershedding implemented)
-    max_distance:     float
-                      maximum distance from a marker allowed to be classified as belonging to that cell
-
-    Output:
-    segmentation_out: iris.cube.Cube
-                      cloud mask, 0 outside and integer numbers according to track inside the clouds
-    features_out:     pandas.DataFrame
-                      feature dataframe including the number of cells (2D or 3D) in the segmented area/volume of the feature at the timestep
-    """
+
     # The location of watershed within skimage submodules changes with v0.19, but I've kept both for backward compatibility for now
     try:
         from skimage.segmentation import watershed
@@ -210,31 +287,65 @@ def segmentation(
     max_distance=None,
     vertical_coord="auto",
 ):
-    """
-    Function using watershedding or random walker to determine cloud volumes associated with tracked updrafts
-
-    Parameters:
-    features:         pandas.DataFrame
-                      output from trackpy/maketrack
-    field:            iris.cube.Cube
-                      containing the field to perform the watershedding on
-    threshold:        float
-                      threshold for the watershedding field to be used for the mask
-
-    target:           string
-                      Switch to determine if algorithm looks strating from maxima or minima in input field (maximum: starting from maxima (default), minimum: starting from minima)
-
-    level             slice
-                      levels at which to seed the cells for the watershedding algorithm
-    method:           str ('method')
-                      flag determining the algorithm to use (currently watershedding implemented)
-
-    max_distance:     float
-                      Maximum distance from a marker allowed to be classified as belonging to that cell
-
-    Output:
-    segmentation_out: iris.cube.Cube
-                      Cloud mask, 0 outside and integer numbers according to track inside the cloud
+    """Use watershedding to determine region above a threshold
+    value around initial seeding position for all time steps of
+    the input data. Works both in 2D (based on single seeding
+    point) and 3D and returns a mask with zeros everywhere around
+    the identified regions and the feature id inside the regions.
+
+    Calls segmentation_timestep at each individual timestep of the
+    input data.
+
+    Parameters
+    ----------
+    features : pandas.DataFrame
+        Output from trackpy/maketrack.
+
+    field : iris.cube.Cube
+        Containing the field to perform the watershedding on.
+
+    dxy : float
+        Grid spacing of the input data.
+
+    threshold : float, optional
+        Threshold for the watershedding field to be used for the mask.
+        Default is 3e-3.
+
+    target : {'maximum', 'minimum'}, optional
+        Flag to determine if tracking is targeting minima or maxima in
+        the data. Default is 'maximum'.
+
+    level : slice of iris.cube.Cube, optional
+        Levels at which to seed the cells for the watershedding
+        algorithm. Default is None.
+
+    method : {'watershed'}, optional
+        Flag determining the algorithm to use (currently watershedding
+        implemented). 'random_walk' could be uncommented.
+
+    max_distance : float, optional
+        Maximum distance from a marker allowed to be classified as
+        belonging to that cell. Default is None.
+
+    vertical_coord : {'auto', 'z', 'model_level_number', 'altitude',
+                      'geopotential_height'}, optional
+        Name of the vertical coordinate for use in the 3D segmentation
+        case.
+
+    Returns
+    -------
+    segmentation_out : iris.cube.Cube
+        Mask, 0 outside and integer numbers according to track
+        inside the area/volume of the feature.
+
+    features_out : pandas.DataFrame
+        Feature dataframe including the number of cells (2D or 3D) in
+        the segmented area/volume of the feature at the timestep.
+
+    Raises
+    ------
+    ValueError
+        If field_in.ndim is neither 3 nor 4 and 'time' is not included
+        in coords.
     """
     import pandas as pd
     from iris.cube import CubeList
@@ -286,10 +397,12 @@ def segmentation(
 
 def watershedding_3D(track, field_in, **kwargs):
+    """Wrapper for the segmentation()-function."""
     kwargs.pop("method", None)
     return segmentation_3D(track, field_in, method="watershed", **kwargs)
 
 
 def watershedding_2D(track, field_in, **kwargs):
+    """Wrapper for the segmentation()-function."""
    kwargs.pop("method", None)
    return segmentation_2D(track, field_in, method="watershed", **kwargs)
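A typical call (a minimal sketch; `features` is assumed to come from tobac feature detection and `field` is an iris cube of the quantity to segment):

    from tobac.segmentation import segmentation

    # Mask of feature ids plus features augmented with cell counts
    mask, features_out = segmentation(
        features, field, dxy=1000.0, threshold=3e-3, target="maximum"
    )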
diff --git a/tobac/testing.py b/tobac/testing.py
index 36338a44..cb2c750f 100644
--- a/tobac/testing.py
+++ b/tobac/testing.py
@@ -1,3 +1,7 @@
+"""Contains methods to make simple sample data for testing.
+
+"""
+
 import datetime
 import numpy as np
 from xarray import DataArray
@@ -5,17 +9,28 @@
 
 def make_simple_sample_data_2D(data_type="iris"):
-    """
-    function creating a simple dataset to use in tests for tobac.
-    The grid has a grid spacing of 1km in both horizontal directions and 100 grid cells in x direction and 500 in y direction.
-    Time resolution is 1 minute and the total length of the dataset is 100 minutes around a abritraty date (2000-01-01 12:00).
-    The longitude and latitude coordinates are added as 2D aux coordinates and arbitrary, but in realisitic range.
-    The data contains a single blob travelling on a linear trajectory through the dataset for part of the time.
+    """Create a simple dataset to use in tests.
 
-    :param data_type: 'iris' or 'xarray' to chose the type of dataset to produce
-    :return: sample dataset as an Iris.Cube.cube or xarray.DataArray
+    The grid has a grid spacing of 1km in both horizontal directions
+    and 100 grid cells in x direction and 500 in y direction.
+    Time resolution is 1 minute and the total length of the dataset is
+    100 minutes around an arbitrary date (2000-01-01 12:00).
+    The longitude and latitude coordinates are added as 2D aux
+    coordinates and are arbitrary, but in a realistic range.
+    The data contains a single blob travelling on a linear trajectory
+    through the dataset for part of the time.
+
+    Parameters
+    ----------
+    data_type : {'iris', 'xarray'}, optional
+        Choose type of the dataset that will be produced.
+        Default is 'iris'.
+
+    Returns
+    -------
+    sample_data : iris.cube.Cube or xarray.DataArray
     """
+
     from iris.cube import Cube
     from iris.coords import DimCoord, AuxCoord
 
@@ -79,21 +94,31 @@ def make_simple_sample_data_2D(data_type="iris"):
 
 def make_sample_data_2D_3blobs(data_type="iris"):
-    from iris.cube import Cube
-    from iris.coords import DimCoord, AuxCoord
+    """Create a simple dataset to use in tests.
 
-    """
-    function creating a simple dataset to use in tests for tobac.
-    The grid has a grid spacing of 1km in both horizontal directions and 100 grid cells in x direction and 200 in y direction.
-    Time resolution is 1 minute and the total length of the dataset is 100 minutes around a abritraty date (2000-01-01 12:00).
-    The longitude and latitude coordinates are added as 2D aux coordinates and arbitrary, but in realisitic range.
-    The data contains a three individual blobs travelling on a linear trajectory through the dataset for part of the time.
-
-    :param data_type: 'iris' or 'xarray' to chose the type of dataset to produce
-    :return: sample dataset as an Iris.Cube.cube or xarray.DataArray
+    The grid has a grid spacing of 1km in both horizontal directions
+    and 100 grid cells in x direction and 200 in y direction.
+    Time resolution is 1 minute and the total length of the dataset is
+    100 minutes around an arbitrary date (2000-01-01 12:00).
+    The longitude and latitude coordinates are added as 2D aux
+    coordinates and are arbitrary, but in a realistic range.
+    The data contains three individual blobs travelling on a linear
+    trajectory through the dataset for part of the time.
+
+    Parameters
+    ----------
+    data_type : {'iris', 'xarray'}, optional
+        Choose type of the dataset that will be produced.
+        Default is 'iris'.
+
+    Returns
+    -------
+    sample_data : iris.cube.Cube or xarray.DataArray
     """
+
+    from iris.cube import Cube
+    from iris.coords import DimCoord, AuxCoord
+
     t_0 = datetime.datetime(2000, 1, 1, 12, 0, 0)
 
     x = np.arange(0, 100e3, 1000)
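For example (a minimal sketch showing both output types):

    from tobac.testing import make_simple_sample_data_2D

    # Build the synthetic single-blob dataset described above
    sample = make_simple_sample_data_2D(data_type="iris")       # iris.cube.Cube
    sample_xr = make_simple_sample_data_2D(data_type="xarray")  # xarray.DataArray
    print(sample)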
@@ -79,21 +94,31 @@ def make_simple_sample_data_2D(data_type="iris"):


 def make_sample_data_2D_3blobs(data_type="iris"):
-    from iris.cube import Cube
-    from iris.coords import DimCoord, AuxCoord
+    """Create a simple dataset to use in tests.

-    """
-    function creating a simple dataset to use in tests for tobac.
-    The grid has a grid spacing of 1km in both horizontal directions and 100 grid cells in x direction and 200 in y direction.
-    Time resolution is 1 minute and the total length of the dataset is 100 minutes around a abritraty date (2000-01-01 12:00).
-    The longitude and latitude coordinates are added as 2D aux coordinates and arbitrary, but in realisitic range.
-    The data contains a three individual blobs travelling on a linear trajectory through the dataset for part of the time.
-
-    :param data_type: 'iris' or 'xarray' to chose the type of dataset to produce
-    :return: sample dataset as an Iris.Cube.cube or xarray.DataArray
+    The grid has a grid spacing of 1km in both horizontal directions
+    and 100 grid cells in x direction and 200 in y direction.
+    Time resolution is 1 minute and the total length of the dataset is
+    100 minutes around an arbitrary date (2000-01-01 12:00).
+    The longitude and latitude coordinates are added as 2D aux
+    coordinates and are arbitrary, but in a realistic range.
+    The data contains three individual blobs travelling on a linear
+    trajectory through the dataset for part of the time.
+
+    Parameters
+    ----------
+    data_type : {'iris', 'xarray'}, optional
+        Choose the type of the dataset that will be produced.
+        Default is 'iris'.
+
+    Returns
+    -------
+    sample_data : iris.cube.Cube or xarray.DataArray
     """
+
+    from iris.cube import Cube
+    from iris.coords import DimCoord, AuxCoord
+
     t_0 = datetime.datetime(2000, 1, 1, 12, 0, 0)

     x = np.arange(0, 100e3, 1000)
@@ -183,14 +208,24 @@ def make_sample_data_2D_3blobs(data_type="iris"):


 def make_sample_data_2D_3blobs_inv(data_type="iris"):
-    """
-    function creating a version of the dataset created in the function make_sample_cube_2D, but with switched coordinate order for the horizontal coordinates
-    for tests to ensure that this does not affect the results
+    """Create a version of the dataset with switched coordinates.

-    :param data_type: 'iris' or 'xarray' to chose the type of dataset to produce
-    :return: sample dataset as an Iris.Cube.cube or xarray.DataArray
+    Create a version of the dataset created in the function
+    make_sample_cube_2D, but with switched coordinate order for the
+    horizontal coordinates, for tests to ensure that this does not
+    affect the results.
+
+    Parameters
+    ----------
+    data_type : {'iris', 'xarray'}, optional
+        Choose the type of the dataset that will be produced.
+        Default is 'iris'.
+
+    Returns
+    -------
+    sample_data : iris.cube.Cube or xarray.DataArray
     """
+
     from iris.cube import Cube
     from iris.coords import DimCoord, AuxCoord
@@ -285,21 +320,35 @@ def make_sample_data_2D_3blobs_inv(data_type="iris"):


 def make_sample_data_3D_3blobs(data_type="iris", invert_xy=False):
-    from iris.cube import Cube
-    from iris.coords import DimCoord, AuxCoord
+    """Create a simple dataset to use in tests.

-    """
-    function creating a simple dataset to use in tests for tobac.
-    The grid has a grid spacing of 1km in both horizontal directions and 100 grid cells in x direction and 200 in y direction.
-    Time resolution is 1 minute and the total length of the dataset is 100 minutes around a abritraty date (2000-01-01 12:00).
-    The longitude and latitude coordinates are added as 2D aux coordinates and arbitrary, but in realisitic range.
-    The data contains a three individual blobs travelling on a linear trajectory through the dataset for part of the time.
-
-    :param data_type: 'iris' or 'xarray' to chose the type of dataset to produce
-    :return: sample dataset as an Iris.Cube.cube or xarray.DataArray
+    The grid has a grid spacing of 1km in both horizontal directions
+    and 100 grid cells in x direction and 200 in y direction.
+    Time resolution is 1 minute and the total length of the dataset is
+    100 minutes around an arbitrary date (2000-01-01 12:00).
+    The longitude and latitude coordinates are added as 2D aux
+    coordinates and are arbitrary, but in a realistic range.
+    The data contains three individual blobs travelling on a linear
+    trajectory through the dataset for part of the time.
+
+    Parameters
+    ----------
+    data_type : {'iris', 'xarray'}, optional
+        Choose the type of the dataset that will be produced.
+        Default is 'iris'.
+
+    invert_xy : bool, optional
+        Flag to determine whether to switch x and y coordinates.
+        Default is False.
+
+    Returns
+    -------
+    sample_data : iris.cube.Cube or xarray.DataArray
     """
+
+    from iris.cube import Cube
+    from iris.coords import DimCoord, AuxCoord
+
     t_0 = datetime.datetime(2000, 1, 1, 12, 0, 0)

     x = np.arange(0, 100e3, 1000)
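A short sketch of the three-blob helpers documented above; the comments only restate what the docstrings say.

from tobac.testing import (
    make_sample_data_2D_3blobs,
    make_sample_data_3D_3blobs,
)

blobs_2d = make_sample_data_2D_3blobs()        # three moving blobs, 2D
blobs_3d = make_sample_data_3D_3blobs()        # same idea with a vertical dimension
blobs_3d_inv = make_sample_data_3D_3blobs(invert_xy=True)  # x and y switched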
""" + import xarray as xr import iris @@ -490,34 +550,46 @@ def make_feature_blob( shape="rectangle", amplitude=1, ): - import xarray as xr - - """Function to make a defined "blob" in location (zloc, yloc, xloc) with + """Function to make a defined "blob" in location (zloc, yloc, xloc) with user-specified shape and amplitude. Note that this function will round the size and locations to the nearest point within the array. - + Parameters ---------- in_arr: array-like input array to add the "blob" to + h1_loc: float Center hdim_1 location of the blob, required + h2_loc: float Center hdim_2 location of the blob, required - v_loc: float + + v_loc: float, optional Center vdim location of the blob, optional. If this is None, we assume that the dataset is 2D. - h1_size: float + Default is None + + h1_size: float, optional Size of the bubble in array coordinates in hdim_1 - h2_size: float + Default is 1 + + h2_size: float, optional Size of the bubble in array coordinates in hdim_2 - v_size: float + Default is 1 + + v_size: float, optional Size of the bubble in array coordinates in vdim - shape: str('rectangle') + Default is 1 + + shape: str('rectangle'), optional The shape of the blob that is added. For now, this is just rectangle 'rectangle' adds a rectangular/rectangular prism bubble with constant amplitude `amplitude`. - amplitude: float + Default is "rectangle" + + amplitude: float, optional Maximum amplitude of the blob + Default is 1 Returns ------- @@ -525,6 +597,8 @@ def make_feature_blob( An array with the same type as `in_arr` that has the blob added. """ + import xarray as xr + # Check if z location is there and set our 3D-ness based on this. if v_loc is None: is_3D = False @@ -576,27 +650,37 @@ def set_arr_2D_3D( ---------- in_arr: array-like Array of values to set + value: int, float, or array-like of size (end_v-start_v, end_h1-start_h1, end_h2-start_h2) The value to assign to in_arr. This will work to assign an array, but the array must have the same dimensions as the size specified in the function. + start_h1: int Start index to set for hdim_1 + end_h1: int End index to set for hdim_1 (exclusive, so it acts like [start_h1:end_h1]) + start_h2: int Start index to set for hdim_2 + end_h2: int End index to set for hdim_2 - start_v: int - Start index to set for vdim (optional) - end_v: int - End index to set for vdim (optional) + + start_v: int, optional + Start index to set for vdim + Default is None + + end_v: int, optional + End index to set for vdim + Default is None Returns ------- array-like in_arr with the new values set. """ + if start_v is not None and end_v is not None: in_arr[start_v:end_v, start_h1:end_h1, start_h2:end_h2] = value else: @@ -628,37 +712,64 @@ def generate_single_feature( ---------- start_h1: float Starting point of the feature in hdim_1 space + start_h2: float Starting point of the feature in hdim_2 space - start_v: float + + start_v: float, optional Starting point of the feature in vdim space (if 3D). For 2D, set to None. - spd_h1: float + Default is None + + spd_h1: float, optional Speed (per frame) of the feature in hdim_1 - spd_h2: float + Default is 1 + + spd_h2: float, optional Speed (per frame) of the feature in hdim_2 - spd_v: float + Default is 1 + + spd_v: float, optional Speed (per frame) of the feature in vdim - min_h1: int + Default is 1 + + min_h1: int, optional Minimum value of hdim_1 allowed. If PBC_flag is not 'none', then this will be used to know when to wrap around periodic boundaries. 
@@ -628,37 +712,64 @@ def generate_single_feature(
     ----------
     start_h1: float
         Starting point of the feature in hdim_1 space
+
     start_h2: float
         Starting point of the feature in hdim_2 space
-    start_v: float
+
+    start_v: float, optional
         Starting point of the feature in vdim space (if 3D). For 2D, set to None.
-    spd_h1: float
+        Default is None
+
+    spd_h1: float, optional
         Speed (per frame) of the feature in hdim_1
-    spd_h2: float
+        Default is 1
+
+    spd_h2: float, optional
         Speed (per frame) of the feature in hdim_2
-    spd_v: float
+        Default is 1
+
+    spd_v: float, optional
         Speed (per frame) of the feature in vdim
-    min_h1: int
+        Default is 1
+
+    min_h1: int, optional
         Minimum value of hdim_1 allowed. If PBC_flag is not 'none', then
         this will be used to know when to wrap around periodic boundaries.
         If PBC_flag is 'none', features will disappear if they are above/below
         these bounds.
-    max_h1: int
+        Default is 0
+
+    max_h1: int, optional
         Similar to min_h1, but the max value of hdim_1 allowed.
-    min_h2: int
+        Default is 1000
+
+    min_h2: int, optional
         Similar to min_h1, but the minimum value of hdim_2 allowed.
-    max_h2: int
+        Default is 0
+
+    max_h2: int, optional
         Similar to min_h1, but the maximum value of hdim_2 allowed.
-    num_frames: int
+        Default is 1000
+
+    num_frames: int, optional
         Number of frames to generate
-    dt: datetime.timedelta
+        Default is 1
+
+    dt: datetime.timedelta, optional
         Difference in time between each frame
-    start_date: datetime.datetime
+        Default is datetime.timedelta(minutes=5)
+
+    start_date: datetime.datetime, optional
         Start datetime
-    frame_start: int
+        Default is datetime.datetime(2022, 1, 1, 0)
+
+    frame_start: int, optional
         Number to start the frame at
-    feature_num: int
+        Default is 1
+
+    feature_num: int, optional
         What number to start the feature at
+        Default is 1
     """

     out_list_of_dicts = list()
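A hedged sketch of generate_single_feature; the parameter values are illustrative, and the keyword names follow the docstring above.

import datetime
from tobac.testing import generate_single_feature

# Synthetic feature moving one grid point per frame in each horizontal
# dimension over five frames, five minutes apart:
feats = generate_single_feature(
    start_h1=10,
    start_h2=10,
    spd_h1=1,
    spd_h2=1,
    num_frames=5,
    dt=datetime.timedelta(minutes=5),
    start_date=datetime.datetime(2022, 1, 1, 0),
)
# feats is a pandas.DataFrame describing the feature positions per frame.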
diff --git a/tobac/tracking.py b/tobac/tracking.py
index 1bc9b925..d0eaa606 100644
--- a/tobac/tracking.py
+++ b/tobac/tracking.py
@@ -1,3 +1,24 @@
+"""Provide tracking methods.
+
+The individual features and associated area/volumes identified in
+each timestep have to be linked into trajectories to analyse
+the time evolution of their properties for a better understanding of
+the underlying physical processes.
+The implementations are structured in a way that allows for the future
+addition of more complex tracking methods that record a more complex
+network of relationships between features at different points in
+time.
+
+References
+----------
+.. Heikenfeld, M., Marinescu, P. J., Christensen, M.,
+   Watson-Parris, D., Senf, F., van den Heever, S. C.
+   & Stier, P. (2019). tobac 1.2: towards a flexible
+   framework for tracking and analysis of clouds in
+   diverse datasets. Geoscientific Model Development,
+   12(11), 4551-4570.
+"""
+
 import logging
 import numpy as np
 import pandas as pd
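As context for the linking function documented in the next hunk, a minimal end-to-end sketch of detection followed by linking. The thresholds, dt, dxy and v_max are illustrative values chosen for tobac's synthetic sample data, not values taken from this diff.

import tobac
from tobac.testing import make_sample_data_2D_3blobs

data = make_sample_data_2D_3blobs()  # 1 km grid, 1 min timestep
features = tobac.feature_detection_multithreshold(
    data, dxy=1000.0, threshold=[5.0, 10.0]
)
tracks = tobac.linking_trackpy(
    features, data, dt=60.0, dxy=1000.0, v_max=100.0
)
# tracks['cell'] holds the track id; tracks['time_cell'] the age of the cell.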
@@ -24,31 +45,125 @@ def linking_trackpy(
     cell_number_start=1,
     cell_number_unassigned=-1,
 ):
-    """
-    Function to perform the linking of features in trajectories
-
-    Parameters:
-    features:     pandas.DataFrame
-                  Detected features to be linked
-    v_max:        float
-                  speed at which features are allowed to move
-    dt:           float
-                  time resolution of tracked features
-    dxy:          float
-                  grid spacing of input data
-    memory        int
-                  number of output timesteps features allowed to vanish for to be still considered tracked
-    subnetwork_size int
-                  maximim size of subnetwork for linking
-    method_detection: str('trackpy' or 'threshold')
-                  flag choosing method used for feature detection
-    method_linking:   str('predict' or 'random')
-                  flag choosing method used for trajectory linking
+    """Perform linking of features in trajectories.
+
+    The linking determines which of the features detected in a specific
+    timestep is most likely identical to an existing feature in the
+    previous timestep. For each existing feature, the movement within
+    a time step is extrapolated based on the velocities in a number of
+    previous time steps. The algorithm then breaks the search process
+    down to a few candidate features by restricting the search to a
+    circular search region centered around the predicted position of
+    the feature in the next time step. For newly initialized trajectories,
+    where no velocity from previous time steps is available, the
+    algorithm resorts to the average velocity of the nearest tracked
+    objects. v_max and d_min are given as physical quantities and then
+    converted into pixel-based values used in trackpy. This allows for
+    tracking that is controlled by physically-based parameters that are
+    independent of the temporal and spatial resolution of the input
+    data. The algorithm creates a continuous track for the feature
+    that is the most probable based on the previous cell path.
+
+    Parameters
+    ----------
+    features : pandas.DataFrame
+        Detected features to be linked.
+
+    field_in : xarray.DataArray
+        Input field to perform the watershedding on (2D or 3D for one
+        specific point in time).
+
+    dt : float
+        Time resolution of tracked features.
+
+    dxy : float
+        Grid spacing of the input data.
+
+    d_max : float, optional
+        Maximum search range. Default is None.
+
+    d_min : float, optional
+        Variations in the shape of the regions used to determine the
+        positions of the features can lead to quasi-instantaneous shifts
+        of the position of the feature by one or two grid cells even for
+        a very high temporal resolution of the input data, potentially
+        jeopardising the tracking procedure. To prevent this, tobac uses
+        an additional minimum radius of the search range.
+        Default is None.
+
+    subnetwork_size : int, optional
+        Maximum size of subnetwork for linking. This parameter should be
+        adjusted when using adaptive search; usually a lower value is
+        desired in that case. For a more in-depth explanation, have a look
+        `here `_
+        If None, 30 is used for regular search and 15 for adaptive search.
+        Default is None.
+
+    v_max : float, optional
+        Speed at which features are allowed to move. Default is None.
+
+    memory : int, optional
+        Number of output timesteps features allowed to vanish for to
+        be still considered tracked. Default is 0.
+
+        .. warning:: This parameter should be used with caution, as it
+                     can lead to erroneous trajectory linking,
+                     especially for data with low time resolution.
+
+    stubs : int, optional
+        Minimum number of timesteps of a tracked cell to be reported.
+        Default is 1.
+
+    time_cell_min : float, optional
+        Minimum length in time (in minutes) of a tracked cell to be
+        reported. Default is None.
+
+    order : int, optional
+        Order of polynomial used to extrapolate trajectory into gaps and
+        beyond start and end point.
+        Default is 1.
+
+    extrapolate : int, optional
+        Number of timesteps to extrapolate trajectories.
+        Default is 0.
+
+    method_linking : {'random', 'predict'}, optional
+        Flag choosing method used for trajectory linking.
+        Default is 'random'.
+
+    adaptive_step : float, optional
+        Reduce search range by multiplying it by this factor.
+        Default is None.
+
+    adaptive_stop : float, optional
+        If not None, when encountering an oversize subnet, retry by
+        progressively reducing search_range until the subnet is solvable.
+        If search_range becomes <= adaptive_stop, give up and raise a
+        SubnetOversizeException. Default is None.
+
+    cell_number_start : int, optional
+        Cell number for the first tracked cell.
+        Default is 1.
+
     cell_number_unassigned: int
-        Number to set the unassigned/non-tracked cells to. By default, this is -1.
-        Note that if you set this to `np.nan`, the data type of 'cell' will
-        change to float.
+        Number to set the unassigned/non-tracked cells to. Note that if you set this
+        to `np.nan`, the data type of 'cell' will change to float.
+        Default is -1.
+
+    Returns
+    -------
+    trajectories_final : pandas.DataFrame
+        Dataframe of the linked features, containing the variable 'cell',
+        with integers indicating the affiliation of a feature to a specific
+        track, and the variable 'time_cell' with the time the cell has
+        already existed.
+
+    Raises
+    ------
+    ValueError
+        If method_linking is neither 'random' nor 'predict'.
     """
+
     #    from trackpy import link_df
     import trackpy as tp
     from copy import deepcopy
@@ -202,24 +317,34 @@ def linking_trackpy(


 def fill_gaps(
     t, order=1, extrapolate=0, frame_max=None, hdim_1_max=None, hdim_2_max=None
 ):
-    """add cell time as time since the initiation of each cell
-    Input:
-    t:             pandas dataframe
-                   trajectories from trackpy
-    order:         int
-                   Order of polynomial used to extrapolate trajectory into gaps and beyond start and end point
-    extrapolate    int
-                   number of timesteps to extrapolate trajectories by
-    frame_max:     int
-                   size of input data along time axis
-    hdim_1_max:    int
-                   size of input data along first horizontal axis
-    hdim_2_max:    int
-                   size of input data along second horizontal axis
-    Output:
-    t:             pandas dataframe
-                   trajectories from trackpy with with filled gaps and potentially extrapolated
+    """Fill gaps in the trajectories and optionally extrapolate them.
+
+    Parameters
+    ----------
+    t : pandas.DataFrame
+        Trajectories from trackpy.
+
+    order : int, optional
+        Order of polynomial used to extrapolate trajectory into
+        gaps and beyond start and end point. Default is 1.
+
+    extrapolate : int, optional
+        Number of timesteps to extrapolate trajectories. Default is 0.
+
+    frame_max : int, optional
+        Size of input data along time axis. Default is None.
+
+    hdim_1_max, hdim_2_max : int, optional
+        Size of input data along first and second horizontal axis.
+        Default is None.
+
+    Returns
+    -------
+    t : pandas.DataFrame
+        Trajectories from trackpy with filled gaps and potentially
+        extrapolated.
     """
+
     from scipy.interpolate import InterpolatedUnivariateSpline

     logging.debug("start filling gaps")
@@ -276,13 +401,13 @@ def add_cell_time(t):

     Parameters
     ----------
-    t: pandas DataFrame
-       trajectories with added coordinates
+    t : pandas.DataFrame
+        Trajectories with added coordinates.

     Returns
     -------
-    t: pandas dataframe
-       trajectories with added cell time
+    t : pandas.DataFrame
+        Trajectories with added cell time.
     """

     #    logging.debug('start adding time relative to cell initiation')
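To round this off, a hedged sketch of the gap-filling step. `tracks` is assumed to be output of linking_trackpy as in the earlier sketch, and the frame_max and hdim_*_max values are illustrative grid dimensions, not values from this diff.

from tobac.tracking import fill_gaps

# Fill single-frame gaps in the linked trajectories and extrapolate
# each track by two timesteps at both ends:
tracks_filled = fill_gaps(
    tracks,
    order=1,
    extrapolate=2,
    frame_max=100,    # number of timesteps in the input data
    hdim_1_max=200,   # size of the first horizontal dimension
    hdim_2_max=100,   # size of the second horizontal dimension
)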