diff --git a/doc/index.rst b/doc/index.rst index dd0deb6f..1f1f7585 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -52,6 +52,13 @@ The project is currently being extended by several contributors to include addit linking tracking_output + +.. toctree:: + :caption: Merge/Split + :maxdepth: 2 + + merge_split + merge_split_out_vars .. toctree:: :caption: API Reference diff --git a/doc/merge_split.rst b/doc/merge_split.rst new file mode 100644 index 00000000..b6b56cbd --- /dev/null +++ b/doc/merge_split.rst @@ -0,0 +1,43 @@ +Merge and Split +====================== + +This submodule is a post processing step to address tracked cells which merge/split. +The first iteration of this module is to combine the cells which are merging but have received a new cell id (and are considered a new cell) once merged. +This module uses a minimum euclidian spanning tree to combine merging cells, thus the postfix for the function is MEST. +This submodule will label merged/split cells with a TRACK number in addition to its CELL number. + +Features, cells, and tracks are combined using parent/child nomenclature. +(quick note on terms; “feature” is a detected object at a single time step (see :doc:`feature_detection_overview`). “cell” is a series of features linked together over multiple timesteps (see :doc:`linking`). "track" may be an individual cell or series of cells which have merged and/or split.) + +Overview of the output dataframe from merge_split + +d : `xarray.core.dataset.Dataset` + +xarray dataset of tobac merge/split cells with parent and child designations. + +Parent/child variables include: + +* cell_parent_track_id: The associated track id for each cell. All cells that have merged or split will have the same parent track id. If a cell never merges/splits, only one cell will have a particular track id. + +* feature_parent_cell_id: The associated parent cell id for each feature. All feature in a given cell will have the same cell id. + +* feature_parent_track_id: The associated parent track id for each feature. This is not the same as the cell id number. + +* track_child_cell_count: The total number of features belonging to all child cells of a given track id. + +* cell_child_feature_count: The total number of features for each cell. + + +Example usage: + +``d = merge_split_MEST(Track)`` + +merge_split outputs an `xarray` dataset with several variables. The variables, (with column names listed in the `Variable Name` column), are described below with units. Coordinates and dataset dimensions are Feature, Cell, and Track. + +Variables that are common to all feature detection files: + +.. csv-table:: tobac Merge_Split Track Output Variables + :file: ./merge_split_out_vars.csv + :widths: 3, 35, 3, 3 + :header-rows: 1 + diff --git a/doc/merge_split_out_vars.csv b/doc/merge_split_out_vars.csv new file mode 100644 index 00000000..b4b6069c --- /dev/null +++ b/doc/merge_split_out_vars.csv @@ -0,0 +1,9 @@ +Variable Name,Description,Units,Type +feature,Unique number of the feature; starts from 1 and increments by 1 to the number of features identified in all frames,n/a,int64 +cell,Tracked cell number; generally starts from 1. Untracked cell value is -1.,n/a,int64 +track,Unique number of the track; starts from 0 and increments by 1 to the number of tracks identified. Untracked cells and features have a track id of -1.,n/a,int64 +cell_parent_track_id,"The associated track id for each cell. All cells that have merged or split will have the same parent track id. If a cell never merges/splits, only one cell will have a particular track id.",n/a,int64 +feature_parent_cell_id,The associated parent cell id for each feature. All feature in a given cell will have the same cell id.,n/a,int64 +feature_parent_track_id,The associated parent track id for each feature. This is not the same as the cell id number.,n/a,int64 +track_child_cell_count,The number of features belonging to all child cells of a given track id.,n/a,int64 +cell_child_feature_count,The number of features for each cell.,n/a,int64 \ No newline at end of file diff --git a/doc/tobac.rst b/doc/tobac.rst index 6f7f4ccb..ac0d88fe 100644 --- a/doc/tobac.rst +++ b/doc/tobac.rst @@ -28,6 +28,14 @@ tobac.feature\_detection module :undoc-members: :show-inheritance: +tobac.merge_split module +--------------------- + +.. automodule:: tobac.merge_split + :members: + :undoc-members: + :show-inheritance: + tobac.plotting module --------------------- diff --git a/tobac/__init__.py b/tobac/__init__.py index f1aeb95e..6d7627bf 100644 --- a/tobac/__init__.py +++ b/tobac/__init__.py @@ -74,6 +74,7 @@ from .tracking import linking_trackpy from .wrapper import maketrack from .wrapper import tracking_wrapper +from . import merge_split # Set version number __version__ = "1.4.0" diff --git a/tobac/merge_split.py b/tobac/merge_split.py new file mode 100644 index 00000000..6e53eb97 --- /dev/null +++ b/tobac/merge_split.py @@ -0,0 +1,225 @@ +""" + Tobac merge and split + This submodule is a post processing step to address tracked cells which merge/split. + The first iteration of this module is to combine the cells which are merging but have received + a new cell id (and are considered a new cell) once merged. In general this submodule will label merged/split cells + with a TRACK number in addition to its CELL number. + +""" + + +def merge_split_MEST(TRACK, dxy, distance=None, frame_len=5): + """ + function to postprocess tobac track data for merge/split cells using a minimum euclidian spanning tree + + + Parameters + ---------- + TRACK : pandas.core.frame.DataFrame + Pandas dataframe of tobac Track information + + dxy : float, mandatory + The x/y grid spacing of the data. + Should be in meters. + + + distance : float, optional + Distance threshold determining how close two features must be in order to consider merge/splitting. + Default is 25x the x/y grid spacing of the data, given in dxy. + The distance should be in units of meters. + + frame_len : float, optional + Threshold for the maximum number of frames that can separate the end of cell and the start of a related cell. + Default is five (5) frames. + + Returns + ------- + + d : xarray.core.dataset.Dataset + xarray dataset of tobac merge/split cells with parent and child designations. + + Parent/child variables include: + - cell_parent_track_id: The associated track id for each cell. All cells that have merged or split will have the same parent track id. If a cell never merges/splits, only one cell will have a particular track id. + - feature_parent_cell_id: The associated parent cell id for each feature. All features in a given cell will have the same cell id. This is the original TRACK cell_id. + - feature_parent_track_id: The associated parent track id for each feature. This is not the same as the cell id number. + - track_child_cell_count: The total number of features belonging to all child cells of a given track id. + - cell_child_feature_count: The total number of features for each cell. + + + Example usage: + d = merge_split_MEST(Track) + ds = tobac.utils.standardize_track_dataset(Track, refl_mask) + both_ds = xr.merge([ds, d],compat ='override') + both_ds = tobac.utils.compress_all(both_ds) + both_ds.to_netcdf(os.path.join(savedir,'Track_features_merges.nc')) + + """ + try: + import networkx as nx + except ImportError: + networkx = None + + import logging + import numpy as np + from pandas.core.common import flatten + import xarray as xr + from scipy.spatial.distance import cdist + + # Immediately convert pandas dataframe of track information to xarray: + TRACK = TRACK.to_xarray() + track_groups = TRACK.groupby("cell") + first = track_groups.first() + last = track_groups.last() + + if distance is None: + distance = dxy * 25.0 + + a_names = list() + b_names = list() + dist = list() + + # write all sets of points (a and b) as Nx2 arrays + l = len(last["hdim_2"].values) + cells = first["cell"].values + a_xy = np.zeros((l, 2)) + a_xy[:, 0] = last["hdim_2"].values * dxy + a_xy[:, 1] = last["hdim_1"].values * dxy + b_xy = np.zeros((l, 2)) + b_xy[:, 0] = first["hdim_2"].values * dxy + b_xy[:, 1] = first["hdim_1"].values * dxy + # Use cdist to find distance matrix + out = cdist(a_xy, b_xy) + # Find all cells under the distance threshold + j = np.where(out <= distance) + + # Compile cells meeting the criteria to an array of both the distance and cell ids + a_names = cells[j[0]] + b_names = cells[j[1]] + dist = out[j] + + # This is inputing data to the object which will perform the spanning tree. + g = nx.Graph() + for i in np.arange(len(dist)): + g.add_edge(a_names[i], b_names[i], weight=dist[i]) + + tree = nx.minimum_spanning_edges(g) + tree_list = list(tree) + + new_tree = [] + + # Pruning the tree for time limits. + for i, j in enumerate(tree_list): + frame_a = np.nanmax(track_groups[j[0]].frame.values) + frame_b = np.nanmin(track_groups[j[1]].frame.values) + if np.abs(frame_a - frame_b) <= frame_len: + new_tree.append(tree_list[i][0:2]) + new_tree_arr = np.array(new_tree) + + TRACK["cell_parent_track_id"] = np.zeros(len(TRACK["cell"].values)) + cell_id = np.unique( + TRACK.cell.values.astype(int)[~np.isnan(TRACK.cell.values.astype(int))] + ) + track_id = dict() # same size as number of total merged tracks + + # Cleaning up tracks, combining tracks which contain the same cells. + arr = np.array([0]) + for p in cell_id: + j = np.where(arr == int(p)) + if len(j[0]) > 0: + continue + else: + k = np.where(new_tree_arr == p) + if len(k[0]) == 0: + track_id[p] = [p] + arr = np.append(arr, p) + else: + temp1 = list(np.unique(new_tree_arr[k[0]])) + temp = list(np.unique(new_tree_arr[k[0]])) + + for l in range(len(cell_id)): + for i in temp1: + k2 = np.where(new_tree_arr == i) + temp.append(list(np.unique(new_tree_arr[k2[0]]).squeeze())) + temp = list(flatten(temp)) + temp = list(np.unique(temp)) + + if len(temp1) == len(temp): + break + temp1 = np.array(temp) + + for i in temp1: + k2 = np.where(new_tree_arr == i) + temp.append(list(np.unique(new_tree_arr[k2[0]]).squeeze())) + + temp = list(flatten(temp)) + temp = list(np.unique(temp)) + arr = np.append(arr, np.unique(temp)) + + track_id[np.nanmax(np.unique(temp))] = list(np.unique(temp)) + + cell_id = list(np.unique(TRACK.cell.values.astype(int))) + logging.debug("found cell ids") + + cell_parent_track_id = np.zeros(len(cell_id)) + cell_parent_track_id[:] = -1 + + for i, id in enumerate(track_id, start=0): + for j in track_id[int(id)]: + cell_parent_track_id[cell_id.index(j)] = int(i) + + logging.debug("found cell parent track ids") + + track_ids = np.array(np.unique(cell_parent_track_id)) + logging.debug("found track ids") + + feature_parent_cell_id = list(TRACK.cell.values.astype(int)) + logging.debug("found feature parent cell ids") + + # # This version includes all the feature regardless of if they are used in cells or not. + feature_id = list(TRACK.feature.values.astype(int)) + logging.debug("found feature ids") + + feature_parent_track_id = [] + feature_parent_track_id = np.zeros(len(feature_id)) + for i, id in enumerate(feature_id): + cellid = feature_parent_cell_id[i] + if cellid < 0: + feature_parent_track_id[i] = -1 + else: + feature_parent_track_id[i] = cell_parent_track_id[cell_id.index(cellid)] + + track_child_cell_count = np.zeros(len(track_id)) + for i, id in enumerate(track_id): + track_child_cell_count[i] = len(np.where(cell_parent_track_id == i)[0]) + logging.debug("found track child cell count") + + cell_child_feature_count = np.zeros(len(cell_id)) + for i, id in enumerate(cell_id): + cell_child_feature_count[i] = len(track_groups[id].feature.values) + logging.debug("found cell child feature count") + + track_dim = "track" + cell_dim = "cell" + feature_dim = "feature" + + d = xr.Dataset( + { + "track": (track_dim, track_ids), + "cell": (cell_dim, cell_id), + "cell_parent_track_id": (cell_dim, cell_parent_track_id), + "feature": (feature_dim, feature_id), + "feature_parent_cell_id": (feature_dim, feature_parent_cell_id), + "feature_parent_track_id": (feature_dim, feature_parent_track_id), + "track_child_cell_count": (track_dim, track_child_cell_count), + "cell_child_feature_count": (cell_dim, cell_child_feature_count), + } + ) + + d = d.set_coords(["feature", "cell", "track"]) + + # assert len(cell_id) == len(cell_parent_track_id) + # assert len(feature_id) == len(feature_parent_cell_id) + # assert sum(track_child_cell_count) == len(cell_id) + # assert sum(cell_child_feature_count) == len(feature_id) + + return d diff --git a/tobac/tests/test_merge_split.py b/tobac/tests/test_merge_split.py new file mode 100644 index 00000000..389c3dc7 --- /dev/null +++ b/tobac/tests/test_merge_split.py @@ -0,0 +1,94 @@ +"""Module to test tobac.merge_split +""" + +import pandas as pd +import numpy as np + +import datetime + +import tobac.testing as tbtest +import tobac.tracking as tbtrack +import tobac.merge_split as mergesplit + + +def test_merge_split_MEST(): + """Tests tobac.merge_split.merge_split_cells by + generating two cells, colliding them into one another, + and merging them. + """ + + cell_1 = tbtest.generate_single_feature( + 1, + 1, + min_h1=0, + max_h1=100, + min_h2=0, + max_h2=100, + frame_start=0, + num_frames=2, + spd_h1=20, + spd_h2=20, + start_date=datetime.datetime(2022, 1, 1, 0), + ) + cell_1["feature"] = [1, 3] + + cell_2 = tbtest.generate_single_feature( + 1, + 100, + min_h1=0, + max_h1=100, + min_h2=0, + max_h2=100, + frame_start=0, + num_frames=2, + spd_h1=20, + spd_h2=-20, + start_date=datetime.datetime(2022, 1, 1, 0), + ) + cell_2["feature"] = [2, 4] + cell_3 = tbtest.generate_single_feature( + 30, + 50, + min_h1=0, + max_h1=100, + min_h2=0, + max_h2=100, + frame_start=2, + num_frames=2, + spd_h1=20, + spd_h2=0, + start_date=datetime.datetime(2022, 1, 1, 0, 10), + ) + cell_3["feature"] = [5, 6] + features = pd.concat([cell_1, cell_2, cell_3]) + output = tbtrack.linking_trackpy(features, None, 1, 1, v_max=40) + + dist_between = np.sqrt( + np.power( + output[output["frame"] == 1].iloc[0]["hdim_1"] + - output[output["frame"] == 1].iloc[1]["hdim_1"], + 2, + ) + + np.power( + output[output["frame"] == 1].iloc[0]["hdim_2"] + - output[output["frame"] == 1].iloc[1]["hdim_2"], + 2, + ) + ) + + # Test a successful merger + mergesplit_output_merged = mergesplit.merge_split_MEST( + output, dxy=1, distance=dist_between + 50 + ) + + # These cells should have merged together. + assert len(mergesplit_output_merged["track"]) == 1 + + # Test an unsuccessful merger. + mergesplit_output_unmerged = mergesplit.merge_split_MEST( + output, dxy=1, distance=dist_between - 50 + ) + + # These cells should NOT have merged together. + print(mergesplit_output_unmerged["track"]) + assert len(mergesplit_output_unmerged["track"]) == 2 diff --git a/tobac/utils.py b/tobac/utils.py index d9334239..bedcde1c 100644 --- a/tobac/utils.py +++ b/tobac/utils.py @@ -896,6 +896,179 @@ def spectral_filtering( return filtered_field +def compress_all(nc_grids, min_dims=2, comp_level=4): + """ + The purpose of this subroutine is to compress the netcdf variables as they are saved. + This does not change the data, but sets netcdf encoding parameters. + We allocate a minimum number of dimensions as variables with dimensions + under the minimum value do not benefit from tangibly from this encoding. + + Parameters + ---------- + nc_grids : xarray.core.dataset.Dataset + Xarray dataset that is intended to be exported as netcdf + + min_dims : integer + The minimum number of dimesnions, in integer value, a variable must have in order + set the netcdf compression encoding. + comp_level : integer + The level of compression. Default values is 4. + + Returns + ------- + nc_grids : xarray.core.dataset.Dataset + Xarray dataset with netcdf compression encoding for variables with two (2) or more dimensions + + """ + + for var in nc_grids: + if len(nc_grids[var].dims) >= min_dims: + # print("Compressing ", var) + nc_grids[var].encoding["zlib"] = True + nc_grids[var].encoding["complevel"] = comp_level + nc_grids[var].encoding["contiguous"] = False + return nc_grids + + +def standardize_track_dataset(TrackedFeatures, Mask, Projection=None): + """ + CAUTION: this function is experimental. No data structures output are guaranteed to be supported in future versions of tobac. + + Combine a feature mask with the feature data table into a common dataset. + + returned by tobac.segmentation + with the TrackedFeatures dataset returned by tobac.linking_trackpy. + + Also rename the variables to be more descriptive and comply with cf-tree. + + Convert the default cell parent ID to an integer table. + + Add a cell dimension to reflect + + Projection is an xarray DataArray + + TODO: Add metadata attributes + + Parameters + ---------- + TrackedFeatures : xarray.core.dataset.Dataset + xarray dataset of tobac Track information, the xarray dataset returned by tobac.tracking.linking_trackpy + + Mask: xarray.core.dataset.Dataset + xarray dataset of tobac segmentation mask information, the xarray dataset returned + by tobac.segmentation.segmentation + + + Projection : xarray.core.dataarray.DataArray, default = None + array.DataArray of the original input dataset (gridded nexrad data for example). + If using gridded nexrad data, this can be input as: data['ProjectionCoordinateSystem'] + An example of the type of information in the dataarray includes the following attributes: + latitude_of_projection_origin :29.471900939941406 + longitude_of_projection_origin :-95.0787353515625 + _CoordinateTransformType :Projection + _CoordinateAxes :x y z time + _CoordinateAxesTypes :GeoX GeoY Height Time + grid_mapping_name :azimuthal_equidistant + semi_major_axis :6370997.0 + inverse_flattening :298.25 + longitude_of_prime_meridian :0.0 + false_easting :0.0 + false_northing :0.0 + + Returns + ------- + + ds : xarray.core.dataset.Dataset + xarray dataset of merged Track and Segmentation Mask datasets with renamed variables. + + """ + import xarray as xr + + feature_standard_names = { + # new variable name, and long description for the NetCDF attribute + "frame": ( + "feature_time_index", + "positional index of the feature along the time dimension of the mask, from 0 to N-1", + ), + "hdim_1": ( + "feature_hdim1_coordinate", + "position of the feature along the first horizontal dimension in grid point space; a north-south coordinate for dim order (time, y, x)." + "The numbering is consistent with positional indexing of the coordinate, but can be" + "fractional, to account for a centroid not aligned to the grid.", + ), + "hdim_2": ( + "feature_hdim2_coordinate", + "position of the feature along the second horizontal dimension in grid point space; an east-west coordinate for dim order (time, y, x)" + "The numbering is consistent with positional indexing of the coordinate, but can be" + "fractional, to account for a centroid not aligned to the grid.", + ), + "idx": ("feature_id_this_frame",), + "num": ( + "feature_grid_cell_count", + "Number of grid points that are within the threshold of this feature", + ), + "threshold_value": ( + "feature_threshold_max", + "Feature number within that frame; starts at 1, increments by 1 to the number of features for each frame, and resets to 1 when the frame increments", + ), + "feature": ( + "feature", + "Unique number of the feature; starts from 1 and increments by 1 to the number of features", + ), + "time": ( + "feature_time", + "time of the feature, consistent with feature_time_index", + ), + "timestr": ( + "feature_time_str", + "String representation of the feature time, YYYY-MM-DD HH:MM:SS", + ), + "projection_y_coordinate": ( + "feature_projection_y_coordinate", + "y position of the feature in the projection given by ProjectionCoordinateSystem", + ), + "projection_x_coordinate": ( + "feature_projection_x_coordinate", + "x position of the feature in the projection given by ProjectionCoordinateSystem", + ), + "lat": ("feature_latitude", "latitude of the feature"), + "lon": ("feature_longitude", "longitude of the feature"), + "ncells": ( + "feature_ncells", + "number of grid cells for this feature (meaning uncertain)", + ), + "areas": ("feature_area",), + "isolated": ("feature_isolation_flag",), + "num_objects": ("number_of_feature_neighbors",), + "cell": ("feature_parent_cell_id",), + "time_cell": ("feature_parent_cell_elapsed_time",), + "segmentation_mask": ("2d segmentation mask",), + } + new_feature_var_names = { + k: feature_standard_names[k][0] + for k in feature_standard_names.keys() + if k in TrackedFeatures.variables.keys() + } + + # TrackedFeatures = TrackedFeatures.drop(["cell_parent_track_id"]) + # Combine Track and Mask variables. Use the 'feature' variable as the coordinate variable instead of + # the 'index' variable and call the dimension 'feature' + ds = xr.merge( + [ + TrackedFeatures.swap_dims({"index": "feature"}) + .drop("index") + .rename_vars(new_feature_var_names), + Mask, + ] + ) + + # Add the projection data back in + if Projection is not None: + ds["ProjectionCoordinateSystem"] = Projection + + return ds + + def combine_tobac_feats(list_of_feats, preserve_old_feat_nums=None): """Function to combine a list of tobac feature detection dataframes into one combined dataframe that can be used for tracking