diff --git a/pycytominer/cyto_utils/cell_locations.py b/pycytominer/cyto_utils/cell_locations.py index 1532e2e9..9180b1fe 100644 --- a/pycytominer/cyto_utils/cell_locations.py +++ b/pycytominer/cyto_utils/cell_locations.py @@ -53,7 +53,7 @@ class CellLocation: Path to the output file. If None, the metadata file is not saved to disk image_column : default = 'ImageNumber' - Name of the column in the metadata file that links to the single_cell file + Name of the column in the metadata file that links to the single_cell file, in combination with `table_column` image_key: default = ['Metadata_Plate', 'Metadata_Well', 'Metadata_Site'] Names of the columns in the metadata file that uniquely identify each image @@ -67,6 +67,9 @@ class CellLocation: cell_y_loc : default = 'Nuclei_Location_Center_Y' Name of the column in the single_cell file that contains the Y location of each cell + table_column : default = 'TableNumber' + Name of the column in the metadata file that links to the single_cell file, in combination with `image_column` + Methods ------- add_cell_location() @@ -82,6 +85,7 @@ def __init__( overwrite: bool = False, image_column: str = "ImageNumber", object_column: str = "ObjectNumber", + table_column: str = "TableNumber", image_key: list = ["Metadata_Plate", "Metadata_Well", "Metadata_Site"], cell_x_loc: str = "Nuclei_Location_Center_X", cell_y_loc: str = "Nuclei_Location_Center_Y", @@ -92,6 +96,7 @@ def __init__( self.overwrite = overwrite self.image_column = image_column self.object_column = object_column + self.table_column = table_column self.image_key = image_key self.cell_x_loc = cell_x_loc self.cell_y_loc = cell_y_loc @@ -235,7 +240,7 @@ def _create_nested_df(self, df: pd.DataFrame): output_df_list = collections.defaultdict(list) # iterate over each group of cells in the merged DataFrame - group_cols = [*self.image_key, self.image_column] + group_cols = [*self.image_key, self.image_column, self.table_column] for group_values, cell_df in 
df.groupby(group_cols): # add the image-level information to the output dictionary @@ -317,6 +322,7 @@ def _check_single_cell_correctness(self, engine: sqlalchemy.engine.Engine): column_name in nuclei_columns for column_name in [ self.image_column, + self.table_column, self.object_column, self.cell_x_loc, self.cell_y_loc, @@ -330,6 +336,7 @@ def _check_single_cell_correctness(self, engine: sqlalchemy.engine.Engine): if not ( self.image_column in image_columns + and self.table_column in image_columns and all(elem in image_columns for elem in self.image_key) ): raise ValueError( @@ -351,14 +358,15 @@ def _get_joined_image_nuclei_tables(self): # merge the Image and Nuclei tables in SQL join_query = f""" - SELECT Nuclei.{self.image_column},Nuclei.{self.object_column},Nuclei.{self.cell_x_loc},Nuclei.{self.cell_y_loc},Image.{image_index_str} + SELECT Nuclei.{self.table_column},Nuclei.{self.image_column},Nuclei.{self.object_column},Nuclei.{self.cell_x_loc},Nuclei.{self.cell_y_loc},Image.{image_index_str} FROM Nuclei INNER JOIN Image - ON Nuclei.{self.image_column} = Image.{self.image_column}; + ON Nuclei.{self.image_column} = Image.{self.image_column} and Nuclei.{self.table_column} = Image.{self.table_column}; """ column_types = { self.image_column: "int64", + self.table_column: "int64", self.object_column: "int64", self.cell_x_loc: "float", self.cell_y_loc: "float", diff --git a/tests/test_cyto_utils/test_cell_locations.py b/tests/test_cyto_utils/test_cell_locations.py index 54f8db1d..99f65681 100644 --- a/tests/test_cyto_utils/test_cell_locations.py +++ b/tests/test_cyto_utils/test_cell_locations.py @@ -48,9 +48,10 @@ def test_output_shape_and_required_columns( metadata_input_dataframe = get_metadata_input_dataframe(cell_loc=cls_cell_loc) # check the shape of the data + # cell_loc will have 3 extra columns: TableNumber, ImageNumber, CellCenters assert cell_loc.shape == ( metadata_input_dataframe.shape[0], - metadata_input_dataframe.shape[1] + 2, + 
metadata_input_dataframe.shape[1] + 3, ) assert isinstance(cell_loc["CellCenters"][0][0], dict) @@ -89,7 +90,7 @@ def test_output_value_correctness( # gather an engine from the cell_loc class _, engine = cls_cell_loc._get_single_cell_engine() - nuclei_query = "SELECT ImageNumber, ObjectNumber, Nuclei_Location_Center_X, Nuclei_Location_Center_Y FROM Nuclei;" + nuclei_query = "SELECT TableNumber, ImageNumber, ObjectNumber, Nuclei_Location_Center_X, Nuclei_Location_Center_Y FROM Nuclei;" nuclei_df = pd.read_sql_query(nuclei_query, engine) diff --git a/tests/test_data/cell_locations_example_data/shrink_BR00126114.sh b/tests/test_data/cell_locations_example_data/shrink_BR00126114.sh index 8a18202f..949fd790 100644 --- a/tests/test_data/cell_locations_example_data/shrink_BR00126114.sh +++ b/tests/test_data/cell_locations_example_data/shrink_BR00126114.sh @@ -16,16 +16,16 @@ aws s3 cp s3://cellpainting-gallery/cpg0016-jump/source_4/workspace/backend/2021 aws s3 cp s3://cellpainting-gallery/cpg0016-jump/source_4/workspace/load_data_csv/2021_08_23_Batch12/BR00126114/load_data_with_illum.parquet . # Write a SQL query to select rows of the `Image` table in the SQLite file where `ImageNumber` is 1 or 2. -# Only select the columns: `Metadata_Plate`, `Metadata_Well`, `Metadata_Site`, `ImageNumber` +# Only select the columns: `Metadata_Plate`, `Metadata_Well`, `Metadata_Site`, `TableNumber`, `ImageNumber` -sqlite3 -header -csv BR00126114.sqlite "SELECT Metadata_Plate, Metadata_Well, Metadata_Site, ImageNumber FROM Image WHERE ImageNumber = 1 OR ImageNumber = 2;" > image_query.csv +sqlite3 -header -csv BR00126114.sqlite "SELECT Metadata_Plate, Metadata_Well, Metadata_Site, TableNumber, ImageNumber FROM Image WHERE ImageNumber = 1 OR ImageNumber = 2;" > image_query.csv # Write a SQL query to select rows of the `Nuclei` table in the SQLite file where `ImageNumber` is 1 or 2. 
-# Only select the columns: `ImageNumber`, `ObjectNumber`, `Nuclei_Location_Center_X`, `Nuclei_Location_Center_Y` +# Only select the columns: `TableNumber`, `ImageNumber`, `ObjectNumber`, `Nuclei_Location_Center_X`, `Nuclei_Location_Center_Y` -sqlite3 -header -csv BR00126114.sqlite "SELECT ImageNumber, ObjectNumber, Nuclei_Location_Center_X, Nuclei_Location_Center_Y FROM Nuclei WHERE ImageNumber = 1 LIMIT 10;" > nuclei_query_1.csv -sqlite3 -header -csv BR00126114.sqlite "SELECT ImageNumber, ObjectNumber, Nuclei_Location_Center_X, Nuclei_Location_Center_Y FROM Nuclei WHERE ImageNumber = 2 LIMIT 10;" > nuclei_query_2.csv +sqlite3 -header -csv BR00126114.sqlite "SELECT TableNumber, ImageNumber, ObjectNumber, Nuclei_Location_Center_X, Nuclei_Location_Center_Y FROM Nuclei WHERE ImageNumber = 1 LIMIT 10;" > nuclei_query_1.csv +sqlite3 -header -csv BR00126114.sqlite "SELECT TableNumber, ImageNumber, ObjectNumber, Nuclei_Location_Center_X, Nuclei_Location_Center_Y FROM Nuclei WHERE ImageNumber = 2 LIMIT 10;" > nuclei_query_2.csv csvstack nuclei_query_1.csv nuclei_query_2.csv > nuclei_query.csv diff --git a/tests/test_data/cell_locations_example_data/test_BR00126114.sqlite b/tests/test_data/cell_locations_example_data/test_BR00126114.sqlite index efeabe02..370710f6 100644 Binary files a/tests/test_data/cell_locations_example_data/test_BR00126114.sqlite and b/tests/test_data/cell_locations_example_data/test_BR00126114.sqlite differ diff --git a/tests/test_data/cell_locations_example_data/test_BR00126114_load_data_with_illum.parquet b/tests/test_data/cell_locations_example_data/test_BR00126114_load_data_with_illum.parquet index 9324e8b4..5eeb0b24 100644 Binary files a/tests/test_data/cell_locations_example_data/test_BR00126114_load_data_with_illum.parquet and b/tests/test_data/cell_locations_example_data/test_BR00126114_load_data_with_illum.parquet differ