From d7e18cbea7abb9479e834af1af95c394fbfb1792 Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Tue, 20 Aug 2019 13:54:44 -0400 Subject: [PATCH 1/2] allow a coercible string as input to subsample_n --- pycytominer/aggregate.py | 44 +++++++++++++++++----------------------- 1 file changed, 19 insertions(+), 25 deletions(-) diff --git a/pycytominer/aggregate.py b/pycytominer/aggregate.py index 3be4dbe2..ef145666 100644 --- a/pycytominer/aggregate.py +++ b/pycytominer/aggregate.py @@ -55,11 +55,6 @@ def __init__( 0 < subsample_frac and 1 >= subsample_frac ), "subsample_frac must be between 0 and 1" - # Check that the user didn't specify both subset frac and - assert ( - subsample_frac == 1 or subsample_n == "all" - ), "Do not set both subsample_frac and subsample_n" - self.sql_file = sql_file self.strata = strata self.features = features @@ -73,9 +68,16 @@ def __init__( self.subsampling_random_state = subsampling_random_state self.is_aggregated = False + if self.subsample_n != "all": + try: + self.subsample_n = int(self.subsample_n) + except ValueError: + print("subsample n must be an integer or coercable") + # Connect to sqlite engine self.engine = create_engine(self.sql_file) self.conn = self.engine.connect() + self._check_subsampling() if load_image_data: self.load_image() @@ -90,16 +92,22 @@ def _check_compartments(self, compartments): elif isinstance(compartments, str): assert compartments in valid_compartments, error_str + def _check_subsampling(self): + # Check that the user didn't specify both subset frac and + assert ( + self.subsample_frac == 1 or self.subsample_n == "all" + ), "Do not set both subsample_frac and subsample_n" + def set_output_file(self, output_file): self.output_file = output_file def set_subsample_frac(self, subsample_frac): - self.subsample_n = "all" self.subsample_frac = subsample_frac + self._check_subsampling() def set_subsample_n(self, subsample_n): - self.subsample_frac = 1 self.subsample_n = subsample_n + self._check_subsampling() def 
set_subsample_random_state(self, random_state): self.subsampling_random_state = random_state @@ -109,9 +117,7 @@ def load_image(self): Load image table from sqlite file """ # Extract image metadata - image_cols = ( - "TableNumber, ImageNumber, {}".format(", ".join(self.strata)) - ) + image_cols = "TableNumber, ImageNumber, {}".format(", ".join(self.strata)) image_query = "select {} from image".format(image_cols) self.image_df = pd.read_sql(sql=image_query, con=self.conn) @@ -166,7 +172,7 @@ def subsample_profiles(self, x, random_state="none"): x, frac=self.subsample_frac, random_state=self.subsampling_random_state ) - def get_subsample(self, compartment="cells", subsample_frac=1, subsample_n="all"): + def get_subsample(self, compartment="cells"): """ Extract subsample from sqlite file @@ -175,12 +181,6 @@ def get_subsample(self, compartment="cells", subsample_frac=1, subsample_n="all" """ self._check_compartments(compartment) - if subsample_frac < 1: - self.set_subsample_frac(subsample_frac) - - if isinstance(subsample_n, int): - self.set_subsample_n(subsample_n) - query_cols = "TableNumber, ImageNumber, ObjectNumber" query = "select {} from {}".format(query_cols, compartment) @@ -301,15 +301,9 @@ def aggregate( population_df = population_df.groupby(strata) if operation == "median": - population_df = ( - population_df.median() - .reset_index() - ) + population_df = population_df.median().reset_index() else: - population_df = ( - population_df.mean() - .reset_index() - ) + population_df = population_df.mean().reset_index() # Aggregated image number and object number do not make sense for col in ["ImageNumber", "ObjectNumber"]: From 8d07f70e4ba83a673cef198cecf824e0e961b7be Mon Sep 17 00:00:00 2001 From: gwaygenomics Date: Tue, 20 Aug 2019 13:58:54 -0400 Subject: [PATCH 2/2] closes #28 --- pycytominer/aggregate.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pycytominer/aggregate.py b/pycytominer/aggregate.py index ef145666..e9122383 100644 --- 
a/pycytominer/aggregate.py +++ b/pycytominer/aggregate.py @@ -67,6 +67,7 @@ def __init__( self.subset_data = "none" self.subsampling_random_state = subsampling_random_state self.is_aggregated = False + self.is_subset_computed = False if self.subsample_n != "all": try: @@ -133,6 +134,7 @@ def count_cells(self, compartment="cells", count_subset=False): if count_subset: assert self.is_aggregated, "Make sure to aggregate_profiles() first!" + assert self.is_subset_computed, "Make sure to get_subsample() first!" count_df = pd.crosstab( self.subset_data.loc[:, self.strata[1]], self.subset_data.loc[:, self.strata[0]], @@ -194,6 +196,7 @@ def get_subsample(self, compartment="cells"): .apply(lambda x: self.subsample_profiles(x)) .reset_index(drop=True) ) + self.is_subset_computed = True def aggregate_compartment(self, compartment, compute_subsample=False): """