diff --git a/pycytominer/cyto_utils/collate.py b/pycytominer/cyto_utils/collate.py index d4b0d3db..6fa3adab 100644 --- a/pycytominer/cyto_utils/collate.py +++ b/pycytominer/cyto_utils/collate.py @@ -1,4 +1,5 @@ import os +import pathlib import subprocess import sys @@ -23,10 +24,10 @@ def collate( base_directory="../..", column=None, munge=False, - pipeline="analysis", - remote=None, + csv_dir="analysis", + aws_remote=None, aggregate_only=False, - temp="/tmp", + tmp_dir="/tmp", overwrite=False, add_image_features=True, image_feature_categories=["Granularity", "Texture", "ImageQuality", "Threshold"], @@ -43,18 +44,18 @@ def collate( plate : str Plate name to process base_directory : str, default "../.." - Base directory where the CSV files will be located + Base directory for subdirectories containing CSVs, backends, etc; in our preferred structure, this is the "workspace" directory column : str, optional, default None - An existing column to be explicitly copied to a Metadata_Plate column if Metadata_Plate was not set + An existing column to be explicitly copied to a new column called Metadata_Plate if no Metadata_Plate column already explicitly exists munge : bool, default False - Whether munge should be passed to cytominer-database, if True will break a single object CSV down by objects - pipeline : str, default 'analysis' - A string used in path creation - remote : str, optional, default None - A remote AWS directory, if set CSV files will be synced down from at the beginning and to which SQLite files will be synced up at the end of the run + Whether munge should be passed to cytominer-database, if True cytominer-database will expect a single all-object CSV; it will split each object into its own table + csv_dir : str, default 'analysis' + The directory under the base directory where the analysis CSVs will be found. 
If running the analysis pipeline, this should nearly always be "analysis" + aws_remote : str, optional, default None + A remote AWS prefix, if set CSV files will be synced down from at the beginning and to which SQLite files will be synced up at the end of the run aggregate_only : bool, default False Whether to perform only the aggregation of existent SQLite files and bypass previous collation steps - tmp: str, default '/tmp' + tmp_dir: str, default '/tmp' The temporary directory to be used by cytominer-databases for output overwrite: bool, optional, default False Whether or not to overwrite an sqlite that exists in the temporary directory if it already exists @@ -69,15 +70,13 @@ def collate( from pycytominer.cyto_utils.cells import SingleCells # Set up directories (these need to be abspaths to keep from confusing makedirs later) - input_dir = os.path.abspath( - os.path.join(base_directory, "analysis", batch, plate, pipeline) - ) - backend_dir = os.path.abspath(os.path.join(base_directory, "backend", batch, plate)) - cache_backend_dir = os.path.abspath(os.path.join(temp, "backend", batch, plate)) + input_dir = pathlib.Path(f"{base_directory}/analysis/{batch}/{plate}/{csv_dir}") + backend_dir = pathlib.Path(f"{base_directory}/backend/{batch}/{plate}") + cache_backend_dir = pathlib.Path(f"{tmp_dir}/backend/{batch}/{plate}") - aggregated_file = os.path.join(backend_dir, plate + ".csv") - backend_file = os.path.join(backend_dir, plate + ".sqlite") - cache_backend_file = os.path.join(cache_backend_dir, plate + ".sqlite") + aggregated_file = pathlib.Path(f"{backend_dir}/{plate}.csv") + backend_file = pathlib.Path(f"{backend_dir}/{plate}.sqlite") + cache_backend_file = pathlib.Path(f"{cache_backend_dir}/{plate}.sqlite") if not aggregate_only: if os.path.exists(cache_backend_file): @@ -92,22 +91,15 @@ def collate( if not os.path.exists(eachdir): os.makedirs(eachdir, exist_ok=True) - if remote: + if aws_remote: - remote_input_dir = os.path.join(remote, "analysis", batch, 
plate, pipeline) - remote_backend_file = os.path.join( - remote, "backend", batch, plate, plate + ".sqlite" - ) - remote_aggregated_file = os.path.join( - remote, "backend", batch, plate, plate + ".csv" - ) + remote_input_dir = f"{aws_remote}/analysis/{batch}/{plate}/{csv_dir}" - sync_cmd = ( - 'aws s3 sync --exclude "*" --include "*/Cells.csv" --include "*/Nuclei.csv" --include "*/Cytoplasm.csv" --include "*/Image.csv" ' - + remote_input_dir - + " " - + input_dir - ) + remote_backend_file = f"{aws_remote}/backend/{batch}/{plate}/{plate}.sqlite" + + remote_aggregated_file = f"{aws_remote}/backend/{batch}/{plate}/{plate}.csv" + + sync_cmd = f'aws s3 sync --exclude "*" --include "*/Cells.csv" --include "*/Nuclei.csv" --include "*/Cytoplasm.csv" --include "*/Image.csv" {remote_input_dir} {input_dir}' if printtoscreen: print(f"Downloading CSVs from {remote_input_dir} to {input_dir}") run_check_errors(sync_cmd) @@ -116,7 +108,7 @@ def collate( "cytominer-database", "ingest", input_dir, - "sqlite:///" + cache_backend_file, + f"sqlite:///{cache_backend_file}", "-c", config, ] @@ -139,44 +131,33 @@ def collate( update_cmd = [ "sqlite3", cache_backend_file, - "UPDATE image SET Metadata_Plate =" + column + ";", + f"UPDATE image SET Metadata_Plate ={column};", ] run_check_errors(update_cmd) if printtoscreen: print(f"Indexing database {cache_backend_file}") - index_cmd_1 = [ + index_cmd_img = [ "sqlite3", cache_backend_file, "CREATE INDEX IF NOT EXISTS table_image_idx ON Image(TableNumber, ImageNumber);", ] - run_check_errors(index_cmd_1) - index_cmd_2 = [ - "sqlite3", - cache_backend_file, - "CREATE INDEX IF NOT EXISTS table_image_object_cells_idx ON Cells(TableNumber, ImageNumber, ObjectNumber);", - ] - run_check_errors(index_cmd_2) - index_cmd_3 = [ - "sqlite3", - cache_backend_file, - "CREATE INDEX IF NOT EXISTS table_image_object_cytoplasm_idx ON Cytoplasm(TableNumber, ImageNumber, ObjectNumber);", - ] - run_check_errors(index_cmd_3) - index_cmd_4 = [ - "sqlite3", - 
cache_backend_file, - "CREATE INDEX IF NOT EXISTS table_image_object_nuclei_idx ON Nuclei(TableNumber, ImageNumber, ObjectNumber);", - ] - run_check_errors(index_cmd_4) - index_cmd_5 = [ + run_check_errors(index_cmd_img) + for eachcompartment in ["Cells", "Cytoplasm", "Nuclei"]: + index_cmd_compartment = [ + "sqlite3", + cache_backend_file, + f"CREATE INDEX IF NOT EXISTS table_image_object_{eachcompartment.lower()}_idx ON {eachcompartment}(TableNumber, ImageNumber, ObjectNumber);", + ] + run_check_errors(index_cmd_compartment) + index_cmd_metadata = [ "sqlite3", cache_backend_file, "CREATE INDEX IF NOT EXISTS plate_well_image_idx ON Image(Metadata_Plate, Metadata_Well);", ] - run_check_errors(index_cmd_5) + run_check_errors(index_cmd_metadata) - if remote: + if aws_remote: if printtoscreen: print(f"Uploading {cache_backend_file} to {remote_backend_file}") @@ -198,13 +179,10 @@ def collate( if printtoscreen: print(f"Aggregating sqlite:///{backend_file}") - if aggregate_only and remote: - remote_backend_file = os.path.join( - remote, "backend", batch, plate, plate + ".sqlite" - ) - remote_aggregated_file = os.path.join( - remote, "backend", batch, plate, plate + ".csv" - ) + if aggregate_only and aws_remote: + remote_backend_file = f"{aws_remote}/backend/{batch}/{plate}/{plate}.sqlite" + + remote_aggregated_file = f"{aws_remote}/backend/{batch}/{plate}/{plate}.csv" cp_cmd = ["aws", "s3", "cp", remote_backend_file, backend_file] if printtoscreen: @@ -222,14 +200,14 @@ def collate( image_feature_categories = None # defensive but not sure what will happen if we give a list but set to False database = SingleCells( - "sqlite:///" + backend_file, + f"sqlite:///{backend_file}", aggregation_operation="mean", add_image_features=add_image_features, image_feature_categories=image_feature_categories, ) database.aggregate_profiles(output_file=aggregated_file) - if remote: + if aws_remote: if printtoscreen: print(f"Uploading {aggregated_file} to {remote_aggregated_file}") 
csv_cp_cmd = ["aws", "s3", "cp", aggregated_file, remote_aggregated_file] diff --git a/pycytominer/cyto_utils/collate_cmd.py b/pycytominer/cyto_utils/collate_cmd.py index fa064cdb..7812d98f 100644 --- a/pycytominer/cyto_utils/collate_cmd.py +++ b/pycytominer/cyto_utils/collate_cmd.py @@ -1,6 +1,7 @@ +import argparse +from pycytominer.cyto_utils.collate import collate + if __name__ == "__main__": - import argparse - from pycytominer.cyto_utils.collate import collate parser = argparse.ArgumentParser(description="Collate CSVs") parser.add_argument("batch", help="Batch name to process") @@ -11,7 +12,7 @@ "--base-directory", dest="base_directory", default="../..", - help="Base directory where the CSV files will be located", + help="Base directory for subdirectories containing CSVs, backends, etc; in our preferred structure, this is the 'workspace' directory", ) parser.add_argument( "--column", @@ -22,15 +23,19 @@ "--munge", action="store_true", default=False, - help="Whether munge should be passed to cytominer-database, if True will break a single object CSV down by objects", + help="Whether munge should be passed to cytominer-database, if True cytominer-database will expect a single all-object CSV; it will split each object into its own table", ) parser.add_argument( - "--pipeline", default="analysis", help="A string used in path creation" + "--csv-dir", + dest="csv_dir", + default="analysis", + help="The directory under the base directory where the analysis CSVs will be found. 
If running the analysis pipeline, this should nearly always be 'analysis'", ) parser.add_argument( - "--remote", + "--aws-remote", + dest="aws_remote", default=None, - help="A remote AWS directory, if set CSV files will be synced down from at the beginning and to which SQLite files will be synced up at the end of the run", + help="A remote AWS prefix, if set CSV files will be synced down from at the beginning and to which SQLite files will be synced up at the end of the run", ) parser.add_argument( "--aggregate-only", @@ -81,8 +86,8 @@ base_directory=args.base_directory, column=args.column, munge=args.munge, - pipeline=args.pipeline, - remote=args.remote, + csv_dir=args.csv_dir, + aws_remote=args.aws_remote, aggregate_only=args.aggregate_only, - temp=args.temp, + tmp_dir=args.temp, overwrite=args.overwrite, diff --git a/pycytominer/tests/test_cyto_utils/test_collate.py b/pycytominer/tests/test_cyto_utils/test_collate.py index 4a622357..4810d38f 100644 --- a/pycytominer/tests/test_cyto_utils/test_collate.py +++ b/pycytominer/tests/test_cyto_utils/test_collate.py @@ -1,35 +1,30 @@ import os +import pathlib import pytest import pandas as pd from pycytominer.cyto_utils.collate import collate -test_config_location = os.path.abspath( - os.path.join(os.path.dirname(__file__), "..", "..", "cyto_utils/database_config/ingest_config.ini") -) -test_data_location = os.path.abspath( - os.path.join(os.path.dirname(__file__), "..", "test_data/collate") +batch = "2021_04_20_Target2" + +plate = "BR00121431" + +test_config_location = pathlib.Path( + f"{os.path.dirname(__file__)}/../../cyto_utils/database_config/ingest_config.ini" ) -test_backend_location = os.path.abspath( - os.path.join( - os.path.dirname(__file__), - "..", - "test_data/collate/backend/2021_04_20_Target2/BR00121431/BR00121431.sqlite", - ) + +test_data_location = pathlib.Path(f"{os.path.dirname(__file__)}/../test_data/collate") + +test_backend_location = pathlib.Path( +
f"{os.path.dirname(__file__)}/../test_data/collate/backend/{batch}/{plate}/{plate}.sqlite" ) -test_csv_location = os.path.abspath( - os.path.join( - os.path.dirname(__file__), - "..", - "test_data/collate/backend/2021_04_20_Target2/BR00121431/BR00121431.csv", - ) + +test_csv_location = pathlib.Path( + f"{os.path.dirname(__file__)}/../test_data/collate/backend/{batch}/{plate}/{plate}.csv" ) -master_csv_location = os.path.abspath( - os.path.join( - os.path.dirname(__file__), - "..", - "test_data/collate/backend/2021_04_20_Target2/BR00121431/BR00121431_master.csv", - ) + +master_csv_location = pathlib.Path( + f"{os.path.dirname(__file__)}/../test_data/collate/backend/{batch}/{plate}/{plate}_master.csv" ) @@ -61,7 +56,7 @@ def test_base_case(): test_config_location, "BR00121431", base_directory=test_data_location, - temp=test_data_location, + tmp_dir=test_data_location, add_image_features=False, printtoscreen=False, ) @@ -81,7 +76,7 @@ def test_base_case_with_image_features(): test_config_location, "BR00121431", base_directory=test_data_location, - temp=test_data_location, + tmp_dir=test_data_location, add_image_features=True, image_feature_categories=["Granularity"], printtoscreen=False, @@ -108,7 +103,7 @@ def test_overwrite(): test_config_location, "BR00121431", base_directory=test_data_location, - temp=test_data_location, + tmp_dir=test_data_location, add_image_features=False, printtoscreen=False, ) @@ -121,7 +116,7 @@ def test_overwrite(): test_config_location, "BR00121431", base_directory=test_data_location, - temp=test_data_location, + tmp_dir=test_data_location, overwrite=True, add_image_features=False, printtoscreen=False, @@ -133,7 +128,7 @@ def test_overwrite(): test_config_location, "BR00121431", base_directory=test_data_location, - temp=test_data_location, + tmp_dir=test_data_location, add_image_features=False, printtoscreen=False, ) @@ -153,7 +148,7 @@ def test_aggregate_only(): test_config_location, "BR00121431", base_directory=test_data_location, - 
temp=test_data_location, + tmp_dir=test_data_location, aggregate_only=True, add_image_features=False, printtoscreen=False, @@ -165,7 +160,7 @@ def test_aggregate_only(): test_config_location, "BR00121431", base_directory=test_data_location, - temp=test_data_location, + tmp_dir=test_data_location, add_image_features=False, printtoscreen=False, ) @@ -179,7 +174,7 @@ def test_aggregate_only(): test_config_location, "BR00121431", base_directory=test_data_location, - temp=test_data_location, + tmp_dir=test_data_location, aggregate_only=True, add_image_features=False, printtoscreen=False,