New cyto tool: create cell locations file #257

Merged Apr 7, 2023 · 83 commits (diff below shows changes from 14 commits)

Commits
ec7a79d
Init
shntnu Feb 24, 2023
2ee9909
cmdline tool
shntnu Feb 24, 2023
ed132a9
Cleanup
shntnu Feb 24, 2023
8662db1
use fire
shntnu Feb 25, 2023
80698e6
output is optional
shntnu Feb 25, 2023
2da6618
fix sqlite fixture
shntnu Feb 25, 2023
ba54ab2
refactor
shntnu Feb 25, 2023
43b4f58
files are optional
shntnu Feb 26, 2023
63c1b07
Typo
shntnu Feb 26, 2023
2b49280
cleanup
shntnu Feb 26, 2023
69323b0
drop comments
shntnu Feb 26, 2023
7c36fb5
create test fixtures
shntnu Feb 26, 2023
612fc9c
checks
shntnu Feb 26, 2023
2162126
fix paths
shntnu Feb 26, 2023
d64eb11
Use fixtures
shntnu Feb 26, 2023
6e1afa8
yield
shntnu Feb 26, 2023
3cf8bdd
download file if needed
shntnu Feb 26, 2023
f1b581a
typo
shntnu Feb 26, 2023
36c9dca
Update req
shntnu Feb 26, 2023
2df741c
use boto3 session
shntnu Feb 27, 2023
adf635a
Test s3 locations
shntnu Feb 27, 2023
bc2f01d
skip s3 test
shntnu Feb 27, 2023
36de960
dtypes
shntnu Feb 27, 2023
6185578
Add mike's snipper
shntnu Feb 27, 2023
6070491
use mike's format
shntnu Mar 2, 2023
55a4f74
tests pass
shntnu Mar 2, 2023
4716143
use method
shntnu Mar 3, 2023
a77364f
add alternatives
shntnu Mar 3, 2023
7fdd393
refactor mike's code
shntnu Mar 4, 2023
5595abc
cleanup
shntnu Mar 4, 2023
8ad1ccf
better tests
shntnu Mar 4, 2023
4016815
cleanup
shntnu Mar 5, 2023
492f29b
overwrite is an option, other cleanup
shntnu Mar 5, 2023
ea21a45
Update pycytominer/cyto_utils/cell_locations.py
shntnu Mar 27, 2023
487ef2d
add docs
shntnu Mar 27, 2023
599a600
Add deps
shntnu Mar 27, 2023
3549060
Move to setup
shntnu Mar 27, 2023
a628fe5
add fire to deps
shntnu Mar 27, 2023
a6cd67d
use pathlib
shntnu Mar 27, 2023
9c232c9
Merge branch 'cytomining:master' into ss_cell_locations
shntnu Mar 28, 2023
8415627
Add pip install .[cell_locations] (+formatting)
shntnu Mar 28, 2023
b111399
Update docs + fix typo in actions
shntnu Mar 28, 2023
287cc17
Use as module
shntnu Mar 28, 2023
fbdf9aa
Formatting
shntnu Mar 28, 2023
5fa59cf
add cell_locations
shntnu Mar 28, 2023
945e5ce
Merge in SQL
shntnu Mar 28, 2023
e987ccf
Update README.md
shntnu Mar 30, 2023
4f067fa
Update .github/workflows/codecov.yml
shntnu Mar 30, 2023
3d711f7
Update .github/workflows/python-app.yml
shntnu Mar 30, 2023
cda5f76
Update pycytominer/tests/test_data/cell_locations_example_data/test_c…
shntnu Mar 30, 2023
f65b36d
Update pycytominer/tests/test_data/cell_locations_example_data/test_c…
shntnu Mar 30, 2023
03995b8
Update pycytominer/tests/test_cyto_utils/test_cell_locations.py
shntnu Mar 30, 2023
c817626
Update pycytominer/cyto_utils/cell_locations.py
shntnu Mar 30, 2023
72896b9
Update pycytominer/cyto_utils/cell_locations.py
shntnu Mar 30, 2023
59816ff
Update pycytominer/tests/test_cyto_utils/test_cell_locations.py
shntnu Mar 30, 2023
7df9c68
Update pycytominer/tests/test_cyto_utils/test_cell_locations.py
shntnu Mar 30, 2023
52d7589
Address various comment
shntnu Apr 1, 2023
8eaa9ee
To address this warning below:
shntnu Apr 1, 2023
683048d
use sqlalchemy
shntnu Apr 1, 2023
03bc671
More comments and switch to boto3.client
shntnu Apr 1, 2023
1a433e3
Be explicit about anon; fix indentation bug
shntnu Apr 2, 2023
892e627
explicit types
shntnu Apr 2, 2023
76e1f49
Address various comment
shntnu Apr 2, 2023
cf7c94a
Move gitignore entries to the top level
shntnu Apr 2, 2023
5e9ae36
rename files, add docs
shntnu Apr 2, 2023
a55d74a
Upgrade to python 3.10
shntnu Apr 2, 2023
3e5afdd
refactor _load_single_cell
shntnu Apr 2, 2023
8d70fd0
fix type
shntnu Apr 2, 2023
e77c6cf
test on highest build version
shntnu Apr 2, 2023
972e720
explain warning
shntnu Apr 5, 2023
9b4a0bf
s3 is an attribute
shntnu Apr 5, 2023
e05056d
compact check
shntnu Apr 5, 2023
6f2e3b3
Update README.md
shntnu Apr 5, 2023
1e8f1e6
fix typo + more docs
shntnu Apr 5, 2023
3e16209
black cells.py
shntnu Apr 5, 2023
d7b345c
trim code
shntnu Apr 5, 2023
3b5388e
black
shntnu Apr 5, 2023
53b214b
Skip test
shntnu Apr 5, 2023
56d71a8
docs
shntnu Apr 5, 2023
fdebb84
dtypes
shntnu Apr 5, 2023
c037ef9
skip test
shntnu Apr 5, 2023
24caa04
Fix test
shntnu Apr 5, 2023
e68ab4b
Add docs
shntnu Apr 5, 2023
3 changes: 3 additions & 0 deletions .github/workflows/python-app.yml
@@ -18,6 +18,9 @@ jobs:
os: [ubuntu-latest, macos-latest]
env:
OS: ${{ matrix.os }}
# This is needed to avoid a warning from SQLAlchemy
# https://sqlalche.me/e/b8d9
# We can remove this once we upgrade to SQLAlchemy >= 2.0
SQLALCHEMY_SILENCE_UBER_WARNING: "1"
steps:
- uses: actions/checkout@v2
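The variable above only affects the CI environment; a minimal sketch of the same silencing for a local test run, assuming it is set before SQLAlchemy is imported (the `conftest.py` placement is a suggestion, not part of this PR):

```python
# conftest.py (hypothetical) -- mirror the CI's SQLALCHEMY_SILENCE_UBER_WARNING=1
# so local pytest runs do not print SQLAlchemy 1.4's deprecation warning banner
# (https://sqlalche.me/e/b8d9). Unnecessary once the project is on SQLAlchemy >= 2.0.
import os

os.environ.setdefault("SQLALCHEMY_SILENCE_UBER_WARNING", "1")

import sqlalchemy  # noqa: E402,F401 -- imported after the variable is set on purpose
```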
14 changes: 12 additions & 2 deletions README.md
@@ -78,8 +78,8 @@ pip install "pycytominer[cell_locations] @ git+git://github.com/cytomining/pycyt
Example using this functionality:

```bash
metadata_input="s3://cellpainting-gallery/test-cpg0016-jump/source_4/workspace/load_data_csv/2021_08_23_Batch12/BR00126114/load_data_with_illum_subset.parquet"
single_single_cell_input="s3://cellpainting-gallery/test-cpg0016-jump/source_4/workspace/backend/2021_08_23_Batch12/BR00126114/BR00126114_subset.sqlite"
metadata_input="s3://cellpainting-gallery/test-cpg0016-jump/source_4/workspace/load_data_csv/2021_08_23_Batch12/BR00126114/test_BR00126114_load_data_with_illum.parquet"
single_single_cell_input="s3://cellpainting-gallery/test-cpg0016-jump/source_4/workspace/backend/2021_08_23_Batch12/BR00126114/test_BR00126114.sqlite"
augmented_metadata_output="~/Desktop/load_data_with_illum_and_cell_location_subset.parquet"

python \
@@ -88,6 +88,16 @@ python \
--single_cell_input ${single_single_cell_input} \
--augmented_metadata_output ${augmented_metadata_output} \
add_cell_location

# Check the output

python -c "import pandas as pd; print(pd.read_parquet('${augmented_metadata_output}').head())"

# It should look something like this (depends on the width of your terminal):

# Metadata_Plate Metadata_Well Metadata_Site ... PathName_OrigRNA ImageNumber CellCenters
# 0 BR00126114 A01 1 ... s3://cellpainting-gallery/cpg0016-jump/source_... 1 [{'Nuclei_Location_Center_X': 943.512129380054...
# 1 BR00126114 A01 2 ... s3://cellpainting-gallery/cpg0016-jump/source_... 2 [{'Nuclei_Location_Center_X': 29.9516027655562...
```
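The CLI above is a Fire wrapper around the new class, so the same workflow can presumably be driven from Python directly. The sketch below is illustrative only and not part of this diff: the class name `CellLocation` and its constructor keywords are assumptions inferred from the CLI flags; only the `add_cell_location` method is confirmed by the code in this PR.

```python
# Illustrative sketch -- CellLocation and its keyword arguments are inferred from the
# Fire-based CLI above and may not match the actual API.
from pycytominer.cyto_utils.cell_locations import CellLocation

cell_loc = CellLocation(
    metadata_input="s3://cellpainting-gallery/test-cpg0016-jump/source_4/workspace/load_data_csv/2021_08_23_Batch12/BR00126114/test_BR00126114_load_data_with_illum.parquet",
    single_cell_input="s3://cellpainting-gallery/test-cpg0016-jump/source_4/workspace/backend/2021_08_23_Batch12/BR00126114/test_BR00126114.sqlite",
    augmented_metadata_output="~/Desktop/load_data_with_illum_and_cell_location_subset.parquet",
)

# Joins the Image and Nuclei tables from the SQLite backend and writes the metadata
# parquet augmented with a CellCenters column (as in the CLI example above).
cell_loc.add_cell_location()
```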

## Usage
66 changes: 31 additions & 35 deletions pycytominer/cyto_utils/cell_locations.py
@@ -94,6 +94,11 @@ def __init__(
self.image_key = image_key
self.cell_x_loc = cell_x_loc
self.cell_y_loc = cell_y_loc
# Currently constrained to only anonymous access for S3 resources
# https://github.com/cytomining/pycytominer/issues/268
self.s3 = boto3.client(
"s3", config=botocore.config.Config(signature_version=botocore.UNSIGNED)
)

def _expanduser(self, obj: Union[str, None]):
"""Expand the user home directory in a path"""
@@ -139,14 +144,10 @@ def _s3_file_exists(self, s3_path: str):
True if the file exists on S3, False otherwise
"""

s3 = boto3.client(
"s3", config=botocore.config.Config(signature_version=botocore.UNSIGNED)
)

bucket, key = self._parse_s3_path(s3_path)

try:
s3.Object(bucket, key).load()
self.s3.Object(bucket, key).load()
except botocore.exceptions.ClientError as e:
if e.response["Error"]["Code"] == "404":
return False
@@ -160,17 +161,13 @@ def _download_s3(self, uri: str):
Download a file from S3 to a temporary file and return the temporary path
"""

s3 = boto3.client(
"s3", config=botocore.config.Config(signature_version=botocore.UNSIGNED)
)

bucket, key = self._parse_s3_path(uri)

tmp_file = tempfile.NamedTemporaryFile(
delete=False, suffix=pathlib.Path(key).name
)

s3.download_file(bucket, key, tmp_file.name)
self.s3.download_file(bucket, key, tmp_file.name)

return tmp_file.name

@@ -303,9 +300,10 @@ def _check_single_cell_correctness(self, engine: sqlalchemy.engine.Engine):

inspector = sqlalchemy.inspect(engine)

table_names = inspector.get_table_names()

if not ("Image" in table_names and "Nuclei" in table_names):
if not all(
table_name in inspector.get_table_names()
for table_name in ["Image", "Nuclei"]
):
raise ValueError(
"Image and Nuclei tables are not present in the single_cell file"
)
@@ -314,11 +312,14 @@ def _check_single_cell_correctness(self, engine: sqlalchemy.engine.Engine):

nuclei_columns = [column["name"] for column in inspector.get_columns("Nuclei")]

if not (
self.image_column in nuclei_columns
and self.object_column in nuclei_columns
and self.cell_x_loc in nuclei_columns
and self.cell_y_loc in nuclei_columns
if not all(
column_name in nuclei_columns
for column_name in [
self.image_column,
self.object_column,
self.cell_x_loc,
self.cell_y_loc,
]
):
raise ValueError(
"Required columns are not present in the Nuclei table in the SQLite file"
@@ -355,7 +356,17 @@ def _get_joined_image_nuclei_tables(self):
ON Nuclei.{self.image_column} = Image.{self.image_column};
"""

joined_df = pd.read_sql_query(join_query, engine)
column_types = {
self.image_column: "int64",
self.object_column: "int64",
self.cell_x_loc: "float",
self.cell_y_loc: "float",
}

for image_key in self.image_key:
column_types[image_key] = "str"

joined_df = pd.read_sql_query(join_query, engine, dtype=column_types)

# if the single_cell file was downloaded from S3, delete the temporary file
if temp_single_cell_input is not None:
@@ -372,22 +383,7 @@ def _load_single_cell(self):
The required columns from the `Image` and `Nuclei` tables loaded into a Pandas DataFrame
"""

joined_df = self._get_joined_image_nuclei_tables()

# Cast the cell location columns to float
joined_df[self.cell_x_loc] = joined_df[self.cell_x_loc].astype(float)
joined_df[self.cell_y_loc] = joined_df[self.cell_y_loc].astype(float)

# Cast the object column to int
joined_df[self.object_column] = joined_df[self.object_column].astype(int)

# Cast the image index columns to str
for col in self.image_key:
joined_df[col] = joined_df[col].astype(str)

joined_df = self._create_nested_df(joined_df)

return joined_df
return self._create_nested_df(self._get_joined_image_nuclei_tables())

def add_cell_location(self):
"""Add the X,Y locations of all cells to the metadata file in the corresponding row, packed into a single column.
30 changes: 9 additions & 21 deletions pycytominer/cyto_utils/cells.py
@@ -72,8 +72,8 @@ class SingleCells(object):
default_datatype_float: type
Numpy floating point datatype to use for load_compartment and resulting
dataframes. This parameter may be used to assist with performance-related
issues by reducing the memory required for floating-point data.
For example, using np.float32 instead of np.float64 for this parameter
issues by reducing the memory required for floating-point data.
For example, using np.float32 instead of np.float64 for this parameter
will reduce memory consumed by float columns by roughly 50%.
Please note: using any besides np.float64 are experimentally
unverified.
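For context, here is a minimal sketch of the trade-off that the `default_datatype_float` docstring above describes. It is not part of this diff; the SQLite path is hypothetical, and only the `sql_file` and `default_datatype_float` keywords are taken from the documented constructor.

```python
# Sketch: loading the same plate backend with full- vs half-precision floats.
import numpy as np
from pycytominer.cyto_utils.cells import SingleCells

sqlite_path = "sqlite:///example_plate.sqlite"  # hypothetical backend file

# Default behaviour: float features are loaded as np.float64
sc_float64 = SingleCells(sql_file=sqlite_path)

# Experimentally unverified: roughly halves the memory used by float columns
sc_float32 = SingleCells(sql_file=sqlite_path, default_datatype_float=np.float32)
```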
@@ -365,7 +365,6 @@ def subsample_profiles(self, df, rename_col=True):
self.set_subsample_random_state(random_state)

if self.subsample_frac == 1:

output_df = pd.DataFrame.sample(
df,
n=self.subsample_n,
@@ -537,7 +536,6 @@ def aggregate_compartment(
compartment=compartment,
n_aggregation_memory_strata=n_aggregation_memory_strata,
):

population_df = self.image_df.merge(
compartment_df,
how="inner",
@@ -636,7 +634,7 @@ def _compartment_df_generator(
con=self.conn,
)
all_columns = compartment_row1.columns
if self.features != "infer": # allow to get only some features
if self.features != "infer": # allow to get only some features
all_columns = [x for x in all_columns if x in self.features]

typeof_str = ", ".join([f"typeof({x})" for x in all_columns])
@@ -754,22 +752,12 @@ def merge_single_cells(
sc_df, how="left", on=subset_logic_df.columns.tolist()
).reindex(sc_df.columns, axis="columns")

sc_df = sc_df.merge(
self.load_compartment(compartment=right_compartment),
left_on=self.merge_cols + [left_link_col],
right_on=self.merge_cols + [right_link_col],
suffixes=merge_suffix,
)

else:
sc_df = sc_df.merge(
self.load_compartment(
compartment=right_compartment
),
left_on=self.merge_cols + [left_link_col],
right_on=self.merge_cols + [right_link_col],
suffixes=merge_suffix,
)
sc_df = sc_df.merge(
self.load_compartment(compartment=right_compartment),
left_on=self.merge_cols + [left_link_col],
right_on=self.merge_cols + [right_link_col],
suffixes=merge_suffix,
)

linking_check_cols.append(linking_check)

34 changes: 20 additions & 14 deletions pycytominer/tests/test_cyto_utils/test_cells.py
@@ -345,14 +345,16 @@ def test_get_sql_table_col_names():
# Iterate over initialized compartments
for compartment in AP.compartments:
expected_meta_cols = ["ObjectNumber", "ImageNumber", "TableNumber"]
expected_feat_cols = [f"{compartment.capitalize()}_{i}" for i in ["a", "b", "c", "d"]]
if compartment == 'cytoplasm':
expected_feat_cols += ["Cytoplasm_Parent_Cells","Cytoplasm_Parent_Nuclei"]
expected_feat_cols = [
f"{compartment.capitalize()}_{i}" for i in ["a", "b", "c", "d"]
]
if compartment == "cytoplasm":
expected_feat_cols += ["Cytoplasm_Parent_Cells", "Cytoplasm_Parent_Nuclei"]
col_name_result = AP.get_sql_table_col_names(table=compartment)
assert sorted(col_name_result) == sorted(expected_feat_cols+expected_meta_cols)
meta_cols, feat_cols = AP.split_column_categories(
col_name_result
assert sorted(col_name_result) == sorted(
expected_feat_cols + expected_meta_cols
)
meta_cols, feat_cols = AP.split_column_categories(col_name_result)
assert meta_cols == expected_meta_cols
assert feat_cols == expected_feat_cols

@@ -406,7 +408,6 @@ def test_merge_single_cells():
for method in ["standardize", "robustize"]:
for samples in ["all", "Metadata_ImageNumber == 'x'"]:
for features in ["infer", ["Cytoplasm_a", "Cells_a"]]:

norm_method_df = AP.merge_single_cells(
single_cell_normalize=True,
normalize_args={
Expand All @@ -426,6 +427,17 @@ def test_merge_single_cells():
check_dtype=False,
)


@pytest.mark.skip(
reason="This test will soon fail because of a logic error in merge_single_cells"
)
def test_merge_single_cells_non_canonical():
# The test raises this warning:
# FutureWarning: Passing 'suffixes' which cause duplicate columns
# {'ObjectNumber_cytoplasm'} in the result is deprecated and will raise a
# MergeError in a future version.
# See https://github.com/cytomining/pycytominer/issues/266

# Test non-canonical compartment merging
new_sc_merge_df = AP_NEW.merge_single_cells()
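The FutureWarning quoted in the skipped test above is generic pandas behaviour rather than anything pycytominer-specific; a standalone reproduction with made-up data looks like this:

```python
# Reproduction of the suffix-collision FutureWarning referenced above
# (hypothetical data; the real collision involves 'ObjectNumber_cytoplasm').
import pandas as pd

left = pd.DataFrame({"key": [1], "a": [10], "a_x": [99]})  # "a_x" already present
right = pd.DataFrame({"key": [1], "a": [20]})

# The default suffixes ("_x", "_y") rename left's "a" to "a_x", which duplicates the
# pre-existing "a_x" column: pandas 1.5 emits the FutureWarning and later releases
# escalate it to a MergeError.
merged = left.merge(right, on="key")
print(merged.columns.tolist())
```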

@@ -476,8 +488,8 @@ def test_merge_single_cells():
traditional_norm_df.loc[:, new_compartment_cols].abs().describe(),
)

def test_merge_single_cells_subsample():

def test_merge_single_cells_subsample():
for subsample_frac in [0.1, 0.5, 0.9]:
ap_subsample = SingleCells(
sql_file=TMP_SQLITE_FILE, subsample_frac=subsample_frac
@@ -704,7 +716,6 @@ def test_aggregate_subsampling_count_cells():


def test_aggregate_subsampling_profile():

assert isinstance(
AP_SUBSAMPLE.aggregate_profiles(compute_subsample=True), pd.DataFrame
)
@@ -724,7 +735,6 @@ def test_aggregate_subsampling_profile():


def test_aggregate_subsampling_profile_output():

expected_result = pd.DataFrame(
{
"Metadata_Plate": ["plate", "plate"],
@@ -768,7 +778,6 @@ def test_aggregate_subsampling_profile_output():


def test_aggregate_subsampling_profile_output_multiple_queries():

expected_result = pd.DataFrame(
{
"Metadata_Plate": ["plate", "plate"],
@@ -814,7 +823,6 @@ def test_aggregate_subsampling_profile_output_multiple_queries():


def test_n_aggregation_memory_strata():

df_n1 = AP.aggregate_profiles(n_aggregation_memory_strata=1)
df_n2 = AP.aggregate_profiles(n_aggregation_memory_strata=2)
df_n3 = AP.aggregate_profiles(n_aggregation_memory_strata=3)
@@ -832,7 +840,6 @@ def test_invalid_n_aggregation_memory_strata():


def test_sqlite_strata_conditions():

df = pd.DataFrame(
data={
"TableNumber": [[1], [2], [3], [4]],
@@ -1082,4 +1089,3 @@ def test_load_non_canonical_image_table():
result.sort_index(axis="columns").drop("Metadata_Site_Count", axis="columns"),
sc_aggregated_df,
)

7 changes: 6 additions & 1 deletion pycytominer/tests/test_cyto_utils/test_modz.py
@@ -1,5 +1,6 @@
import os
import random
import pytest
import numpy as np
import pandas as pd
from pycytominer.cyto_utils import modz
@@ -143,7 +144,11 @@ def test_modz_multiple_columns_one_metadata_column():
consensus_df = modz(
data_replicate_multi_df, replicate_columns, min_weight=1, precision=precision
)
expected_result = data_replicate_multi_df.groupby(replicate_columns).mean().round(4)
expected_result = (
data_replicate_multi_df.groupby(replicate_columns)
.mean(numeric_only=True)
.round(4)
)
expected_result.index.name = replicate_columns
pd.testing.assert_frame_equal(
expected_result.reset_index(), consensus_df, check_exact=False, atol=1e-3
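The `numeric_only=True` added above guards against a pandas behaviour change rather than anything modz-specific; a standalone illustration with made-up data:

```python
# Why .mean(numeric_only=True): hypothetical data, not the modz test fixtures.
import pandas as pd

df = pd.DataFrame(
    {
        "Metadata_group": ["a", "a", "b"],
        "Metadata_label": ["x", "y", "z"],  # non-numeric, non-grouping column
        "feature_1": [1.0, 2.0, 3.0],
    }
)

# Older pandas silently dropped "Metadata_label" here; pandas 1.5 warns about that
# behaviour and pandas 2.x raises because an object column cannot be averaged.
# df.groupby("Metadata_group").mean()

# Restricting the aggregation to numeric columns keeps the expected result stable
# across pandas versions.
print(df.groupby("Metadata_group").mean(numeric_only=True))
```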