Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: SQLite joins should be on ImageNumber,TableNumber and not ImageNumber #378

Merged
merged 5 commits into from
Mar 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 12 additions & 4 deletions pycytominer/cyto_utils/cell_locations.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ class CellLocation:
Path to the output file. If None, the metadata file is not saved to disk

image_column : default = 'ImageNumber'
Name of the column in the metadata file that links to the single_cell file
Name of the column in the metadata file that links to the single_cell file, in combination with `table_column`

image_key: default = ['Metadata_Plate', 'Metadata_Well', 'Metadata_Site']
Names of the columns in the metadata file that uniquely identify each image
Expand All @@ -67,6 +67,9 @@ class CellLocation:
cell_y_loc : default = 'Nuclei_Location_Center_Y'
Name of the column in the single_cell file that contains the Y location of each cell

table_column : default = 'TableNumber'
Name of the column in the metadata file that links to the single_cell file, in combination with `image_column`

Methods
-------
add_cell_location()
Expand All @@ -82,6 +85,7 @@ def __init__(
overwrite: bool = False,
image_column: str = "ImageNumber",
object_column: str = "ObjectNumber",
table_column: str = "TableNumber",
image_key: list = ["Metadata_Plate", "Metadata_Well", "Metadata_Site"],
cell_x_loc: str = "Nuclei_Location_Center_X",
cell_y_loc: str = "Nuclei_Location_Center_Y",
Expand All @@ -92,6 +96,7 @@ def __init__(
self.overwrite = overwrite
self.image_column = image_column
self.object_column = object_column
self.table_column = table_column
self.image_key = image_key
self.cell_x_loc = cell_x_loc
self.cell_y_loc = cell_y_loc
Expand Down Expand Up @@ -235,7 +240,7 @@ def _create_nested_df(self, df: pd.DataFrame):
output_df_list = collections.defaultdict(list)

# iterate over each group of cells in the merged DataFrame
group_cols = [*self.image_key, self.image_column]
group_cols = [*self.image_key, self.image_column, self.table_column]

for group_values, cell_df in df.groupby(group_cols):
# add the image-level information to the output dictionary
Expand Down Expand Up @@ -317,6 +322,7 @@ def _check_single_cell_correctness(self, engine: sqlalchemy.engine.Engine):
column_name in nuclei_columns
for column_name in [
self.image_column,
self.table_column,
self.object_column,
self.cell_x_loc,
self.cell_y_loc,
Expand All @@ -330,6 +336,7 @@ def _check_single_cell_correctness(self, engine: sqlalchemy.engine.Engine):

if not (
self.image_column in image_columns
and self.table_column in image_columns
and all(elem in image_columns for elem in self.image_key)
):
raise ValueError(
Expand All @@ -351,14 +358,15 @@ def _get_joined_image_nuclei_tables(self):
# merge the Image and Nuclei tables in SQL

join_query = f"""
SELECT Nuclei.{self.image_column},Nuclei.{self.object_column},Nuclei.{self.cell_x_loc},Nuclei.{self.cell_y_loc},Image.{image_index_str}
SELECT Nuclei.{self.table_column},Nuclei.{self.image_column},Nuclei.{self.object_column},Nuclei.{self.cell_x_loc},Nuclei.{self.cell_y_loc},Image.{image_index_str}
FROM Nuclei
INNER JOIN Image
ON Nuclei.{self.image_column} = Image.{self.image_column};
ON Nuclei.{self.image_column} = Image.{self.image_column} and Nuclei.{self.table_column} = Image.{self.table_column};
"""

column_types = {
self.image_column: "int64",
self.table_column: "int64",
self.object_column: "int64",
self.cell_x_loc: "float",
self.cell_y_loc: "float",
Expand Down
5 changes: 3 additions & 2 deletions tests/test_cyto_utils/test_cell_locations.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,10 @@ def test_output_shape_and_required_columns(
metadata_input_dataframe = get_metadata_input_dataframe(cell_loc=cls_cell_loc)

# check the shape of the data
# cell_loc will have 3 extra columns: TableNumber, ImageNumber, CellCenters
assert cell_loc.shape == (
metadata_input_dataframe.shape[0],
metadata_input_dataframe.shape[1] + 2,
metadata_input_dataframe.shape[1] + 3,
)

assert isinstance(cell_loc["CellCenters"][0][0], dict)
Expand Down Expand Up @@ -89,7 +90,7 @@ def test_output_value_correctness(
# gather an engine from the cell_loc class
_, engine = cls_cell_loc._get_single_cell_engine()

nuclei_query = "SELECT ImageNumber, ObjectNumber, Nuclei_Location_Center_X, Nuclei_Location_Center_Y FROM Nuclei;"
nuclei_query = "SELECT TableNumber, ImageNumber, ObjectNumber, Nuclei_Location_Center_X, Nuclei_Location_Center_Y FROM Nuclei;"

nuclei_df = pd.read_sql_query(nuclei_query, engine)

Expand Down
10 changes: 5 additions & 5 deletions tests/test_data/cell_locations_example_data/shrink_BR00126114.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,16 @@ aws s3 cp s3://cellpainting-gallery/cpg0016-jump/source_4/workspace/backend/2021
aws s3 cp s3://cellpainting-gallery/cpg0016-jump/source_4/workspace/load_data_csv/2021_08_23_Batch12/BR00126114/load_data_with_illum.parquet .

# Write a SQL query to select rows of the `Image` table in the SQLite file where `ImageNumber` is 1 or 2.
# Only select the columns: `Metadata_Plate`, `Metadata_Well`, `Metadata_Site`, `ImageNumber`
# Only select the columns: `Metadata_Plate`, `Metadata_Well`, `Metadata_Site`, `TableNumber`, `ImageNumber`

sqlite3 -header -csv BR00126114.sqlite "SELECT Metadata_Plate, Metadata_Well, Metadata_Site, ImageNumber FROM Image WHERE ImageNumber = 1 OR ImageNumber = 2;" > image_query.csv
sqlite3 -header -csv BR00126114.sqlite "SELECT Metadata_Plate, Metadata_Well, Metadata_Site, TableNumber, ImageNumber FROM Image WHERE ImageNumber = 1 OR ImageNumber = 2;" > image_query.csv


# Write a SQL query to select rows of the `Nuclei` table in the SQLite file where `ImageNumber` is 1 or 2.
# Only select the columns: `ImageNumber`, `ObjectNumber`, `Nuclei_Location_Center_X`, `Nuclei_Location_Center_Y`
# Only select the columns: `TableNumber`, `ImageNumber`, `ObjectNumber`, `Nuclei_Location_Center_X`, `Nuclei_Location_Center_Y`

sqlite3 -header -csv BR00126114.sqlite "SELECT ImageNumber, ObjectNumber, Nuclei_Location_Center_X, Nuclei_Location_Center_Y FROM Nuclei WHERE ImageNumber = 1 LIMIT 10;" > nuclei_query_1.csv
sqlite3 -header -csv BR00126114.sqlite "SELECT ImageNumber, ObjectNumber, Nuclei_Location_Center_X, Nuclei_Location_Center_Y FROM Nuclei WHERE ImageNumber = 2 LIMIT 10;" > nuclei_query_2.csv
sqlite3 -header -csv BR00126114.sqlite "SELECT TableNumber, ImageNumber, ObjectNumber, Nuclei_Location_Center_X, Nuclei_Location_Center_Y FROM Nuclei WHERE ImageNumber = 1 LIMIT 10;" > nuclei_query_1.csv
sqlite3 -header -csv BR00126114.sqlite "SELECT TableNumber, ImageNumber, ObjectNumber, Nuclei_Location_Center_X, Nuclei_Location_Center_Y FROM Nuclei WHERE ImageNumber = 2 LIMIT 10;" > nuclei_query_2.csv

csvstack nuclei_query_1.csv nuclei_query_2.csv > nuclei_query.csv

Expand Down
Binary file not shown.
Binary file not shown.
Loading