"""
Dynamic-programming selection of test wells for plate training splits.

This module supersedes the earlier greedy selection (GreedySubsetSum): for every
non-reference category it solves a subset-sum problem over whole wells instead
of greedily accumulating sorted wells.
"""

import pandas as pd


class WellSubsetSum:
    """
    Select wells for the test dataset.

    For the reference category the wells with the fewest cells are used as test
    wells. For every other category the subset of whole wells with the greatest
    total cell count that still fits under a capacity is moved to the test set.
    The capacity is the group's total cell count minus the reference value
    looked up in the result of
    PlateTrainingSplits.sum_train_val_category_cell_counts.
    """

    def __init__(self):
        pass

    @staticmethod
    def _largest_subset_within_capacity(_cell_counts, _capacity):
        """
        Find the wells whose summed cell count is as large as possible without
        exceeding a capacity (subset-sum / 0-1 knapsack with value == weight).

        Parameters
        ----------
        _cell_counts: Iterable of Integers
            Positive cell count of each well, in positional order.

        _capacity: Integer
            Greatest total number of cells the selected wells may contain.

        Returns
        -------
        chosen: List of Integers
            0-indexed positions of the selected wells, from the last position
            to the first. Empty when the capacity is not positive or no well fits.
        """

        capacity = int(_capacity)
        if capacity <= 0:
            return []

        counts = [int(count) for count in _cell_counts]

        # Only sums in [0, capacity] are of interest
        capacity_mask = (1 << (capacity + 1)) - 1

        # reachable[i] is a bitset: bit s is set when some subset of the
        # first i wells contains exactly s cells. One snapshot per well is
        # kept so that the chosen wells can be reconstructed afterwards.
        reachable = [1]
        for count in counts:
            previous = reachable[-1]
            reachable.append(previous | ((previous << count) & capacity_mask))

        # The highest set bit is the largest achievable total within the capacity
        remaining = reachable[-1].bit_length() - 1

        # Walk backwards through the wells: a well belongs to the solution
        # exactly when the remaining total cannot be reached without it.
        chosen = []
        for position in range(len(counts) - 1, -1, -1):
            if remaining == 0:
                break
            if not (reachable[position] >> remaining) & 1:
                chosen.append(position)
                remaining -= counts[position]

        return chosen

    def update_test_wells(self, _welldf, _category_col, _well_col, _cell_count_col, _test_well_count):
        """
        Parameters
        ----------
        _welldf: Pandas Dataframe
            Well data with unique wells represented as rows.

        _category_col: List of Strings or String
            The categories to represent equally in the cell population.

        _well_col: String
            Well column name.

        _cell_count_col: String
            The cell count column name.

        _test_well_count: Integer
            Number of wells to sample for testing from the reference category.

        Returns
        -------
        test_wells: List of Strings
            Test well names.

        Raises
        ------
        TypeError
            If the cell count column does not have an integer dtype.

        ValueError
            If any cell count is not positive, or if no well of a
            non-reference group can be placed in the test set.
        """

        cell_counts = _welldf[_cell_count_col]

        # Check if all of the cell counts are integers.
        if not pd.api.types.is_integer_dtype(cell_counts):
            raise TypeError(f"{_cell_count_col} column does not only contain integers")

        # Every well must contain at least one cell: the subset-sum search
        # treats each count as a strictly positive weight.
        if (cell_counts <= 0).any():
            raise ValueError(f"{_cell_count_col} column must contain only positive integers")

        # Project-local helper, imported at call time so that the input
        # validation above does not depend on it being importable.
        from PlateTrainingSplits import PlateTrainingSplits

        plate_split = PlateTrainingSplits()

        catdf = _welldf.groupby(_category_col)

        # Determine the smallest category (reference category) for sampling test wells.
        # NOTE(review): min_cat is indexed below by each category column and by
        # the cell count column; presumably it holds the reference category's
        # labels and its train-validation cell count -- confirm against
        # PlateTrainingSplits.sum_train_val_category_cell_counts.
        min_cat = plate_split.sum_train_val_category_cell_counts(
            catdf,
            _category_col,
            _cell_count_col,
            _test_well_count
        )

        # Make the categorie(s) iterable, whether one name or a list was given
        category_cols = _category_col if isinstance(_category_col, list) else [_category_col]

        test_wells = []

        # Iterate through each group
        for cat, groupdf in catdf:

            # Number of wells in group
            cat_num_wells = groupdf.shape[0]

            # Determine the test wells for the reference category
            if all((groupdf[ref_cat] == min_cat[ref_cat]).all() for ref_cat in category_cols):
                base_cat_wells = (
                    groupdf.nsmallest(_test_well_count, _cell_count_col)
                    [_well_col].tolist()
                )

                test_wells.extend(base_cat_wells)

                print(f"{len(base_cat_wells)} wells of {cat_num_wells} wells are test wells for reference group {cat}")

                continue

            group_counts = groupdf[_cell_count_col]

            # The maximum number of cells (capacity) allowed in the test set
            # until this group becomes the minority group for the train-validation set
            max_test_size = group_counts.sum() - min_cat[_cell_count_col]

            # Positions of the wells that fill the capacity as much as possible
            test_wells_int_idx = self._largest_subset_within_capacity(
                group_counts.tolist(),
                max_test_size
            )

            # Check that the number of test wells for this group isn't zero
            if not test_wells_int_idx:
                raise ValueError(f"The test well count of group {cat} is zero")

            # Store the names of the wells
            group_test_wells = groupdf.iloc[test_wells_int_idx][_well_col].tolist()
            test_wells.extend(group_test_wells)

            print(f"{len(group_test_wells)} wells of {cat_num_wells} wells are test wells for group {cat}")

        return test_wells