Skip to content

Commit

Permalink
address fstrings, pathlib, suggested name changes
Browse files Browse the repository at this point in the history
  • Loading branch information
bethac07 committed Jun 8, 2022
1 parent 486fc16 commit ead2547
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 107 deletions.
112 changes: 45 additions & 67 deletions pycytominer/cyto_utils/collate.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import pathlib
import subprocess
import sys

Expand All @@ -23,10 +24,10 @@ def collate(
base_directory="../..",
column=None,
munge=False,
pipeline="analysis",
remote=None,
csv_dir="analysis",
aws_remote=None,
aggregate_only=False,
temp="/tmp",
tmp_dir="/tmp",
overwrite=False,
add_image_features=True,
image_feature_categories=["Granularity", "Texture", "ImageQuality", "Threshold"],
Expand All @@ -43,18 +44,18 @@ def collate(
plate : str
Plate name to process
base_directory : str, default "../.."
Base directory where the CSV files will be located
Base directory for subdirectories containing CSVs, backends, etc; in our preferred structure, this is the "workspace" directory
column : str, optional, default None
An existing column to be explicitly copied to a Metadata_Plate column if Metadata_Plate was not set
An existing column to be explicitly copied to a new column called Metadata_Plate if no Metadata_Plate column already explicitly exists
munge : bool, default False
Whether munge should be passed to cytominer-database, if True will break a single object CSV down by objects
pipeline : str, default 'analysis'
A string used in path creation
remote : str, optional, default None
A remote AWS directory, if set CSV files will be synced down from at the beginning and to which SQLite files will be synced up at the end of the run
Whether munge should be passed to cytominer-database; if True, cytominer-database will expect a single all-object CSV and will split each object into its own table
csv_dir : str, default 'analysis'
The subdirectory of <base_directory>/analysis/<batch>/<plate> where the analysis CSVs will be found. If running the analysis pipeline, this should nearly always be "analysis"
aws_remote : str, optional, default None
A remote AWS prefix; if set, CSV files will be synced down from it at the beginning of the run and SQLite files will be synced up to it at the end of the run
aggregate_only : bool, default False
Whether to perform only the aggregation of existent SQLite files and bypass previous collation steps
tmp: str, default '/tmp'
tmp_dir: str, default '/tmp'
The temporary directory to be used by cytominer-database for output
overwrite: bool, optional, default False
Whether or not to overwrite an SQLite file that already exists in the temporary directory
Expand All @@ -69,15 +70,13 @@ def collate(
from pycytominer.cyto_utils.cells import SingleCells

# Set up directories (these need to be abspaths to keep from confusing makedirs later)
input_dir = os.path.abspath(
os.path.join(base_directory, "analysis", batch, plate, pipeline)
)
backend_dir = os.path.abspath(os.path.join(base_directory, "backend", batch, plate))
cache_backend_dir = os.path.abspath(os.path.join(temp, "backend", batch, plate))
input_dir = pathlib.Path(f"{base_directory}/analysis/{batch}/{plate}/{csv_dir}")
backend_dir = pathlib.Path(f"{base_directory}/backend/{batch}/{plate}")
cache_backend_dir = pathlib.Path(f"{tmp_dir}/backend/{batch}/{plate}")

aggregated_file = os.path.join(backend_dir, plate + ".csv")
backend_file = os.path.join(backend_dir, plate + ".sqlite")
cache_backend_file = os.path.join(cache_backend_dir, plate + ".sqlite")
aggregated_file = pathlib.Path(f"{backend_dir}/{plate}.csv")
backend_file = pathlib.Path(f"{backend_dir}/{plate}.sqlite")
cache_backend_file = pathlib.Path(f"{cache_backend_dir}/{plate}.sqlite")

if not aggregate_only:
if os.path.exists(cache_backend_file):
Expand All @@ -92,22 +91,15 @@ def collate(
if not os.path.exists(eachdir):
os.makedirs(eachdir, exist_ok=True)

if remote:
if aws_remote:

remote_input_dir = os.path.join(remote, "analysis", batch, plate, pipeline)
remote_backend_file = os.path.join(
remote, "backend", batch, plate, plate + ".sqlite"
)
remote_aggregated_file = os.path.join(
remote, "backend", batch, plate, plate + ".csv"
)
remote_input_dir = f"{aws_remote}/analysis/{batch}/{plate}/{csv_dir}"

sync_cmd = (
'aws s3 sync --exclude "*" --include "*/Cells.csv" --include "*/Nuclei.csv" --include "*/Cytoplasm.csv" --include "*/Image.csv" '
+ remote_input_dir
+ " "
+ input_dir
)
remote_backend_file = f"{aws_remote}/backend/{batch}/{plate}/{plate}.sqlite"

remote_aggregated_file = f"{aws_remote}/backend/{batch}/{plate}/{plate}.csv"

sync_cmd = f'aws s3 sync --exclude "*" --include "*/Cells.csv" --include "*/Nuclei.csv" --include "*/Cytoplasm.csv" --include "*/Image.csv" {remote_input_dir} {input_dir}'
if printtoscreen:
print(f"Downloading CSVs from {remote_input_dir} to {input_dir}")
run_check_errors(sync_cmd)
Expand All @@ -116,7 +108,7 @@ def collate(
"cytominer-database",
"ingest",
input_dir,
"sqlite:///" + cache_backend_file,
f"sqlite:///{cache_backend_file}",
"-c",
config,
]
Expand All @@ -139,44 +131,33 @@ def collate(
update_cmd = [
"sqlite3",
cache_backend_file,
"UPDATE image SET Metadata_Plate =" + column + ";",
f"UPDATE image SET Metadata_Plate ={column};",
]
run_check_errors(update_cmd)

if printtoscreen:
print(f"Indexing database {cache_backend_file}")
index_cmd_1 = [
index_cmd_img = [
"sqlite3",
cache_backend_file,
"CREATE INDEX IF NOT EXISTS table_image_idx ON Image(TableNumber, ImageNumber);",
]
run_check_errors(index_cmd_1)
index_cmd_2 = [
"sqlite3",
cache_backend_file,
"CREATE INDEX IF NOT EXISTS table_image_object_cells_idx ON Cells(TableNumber, ImageNumber, ObjectNumber);",
]
run_check_errors(index_cmd_2)
index_cmd_3 = [
"sqlite3",
cache_backend_file,
"CREATE INDEX IF NOT EXISTS table_image_object_cytoplasm_idx ON Cytoplasm(TableNumber, ImageNumber, ObjectNumber);",
]
run_check_errors(index_cmd_3)
index_cmd_4 = [
"sqlite3",
cache_backend_file,
"CREATE INDEX IF NOT EXISTS table_image_object_nuclei_idx ON Nuclei(TableNumber, ImageNumber, ObjectNumber);",
]
run_check_errors(index_cmd_4)
index_cmd_5 = [
run_check_errors(index_cmd_img)
for eachcompartment in ["Cells", "Cytoplasm", "Nuclei"]:
index_cmd_compartment = [
"sqlite3",
cache_backend_file,
f"CREATE INDEX IF NOT EXISTS table_image_object_{eachcompartment.lower()}_idx ON {eachcompartment}(TableNumber, ImageNumber, ObjectNumber);",
]
run_check_errors(index_cmd_compartment)
index_cmd_metadata = [
"sqlite3",
cache_backend_file,
"CREATE INDEX IF NOT EXISTS plate_well_image_idx ON Image(Metadata_Plate, Metadata_Well);",
]
run_check_errors(index_cmd_5)
run_check_errors(index_cmd_metadata)

if remote:
if aws_remote:

if printtoscreen:
print(f"Uploading {cache_backend_file} to {remote_backend_file}")
Expand All @@ -198,13 +179,10 @@ def collate(
if printtoscreen:
print(f"Aggregating sqlite:///{backend_file}")

if aggregate_only and remote:
remote_backend_file = os.path.join(
remote, "backend", batch, plate, plate + ".sqlite"
)
remote_aggregated_file = os.path.join(
remote, "backend", batch, plate, plate + ".csv"
)
if aggregate_only and aws_remote:
remote_backend_file = f"{aws_remote}/backend/{batch}/{plate}/{plate}.sqlite"

remote_aggregated_file = f"{aws_remote}/backend/{batch}/{plate}/{plate}.csv"

cp_cmd = ["aws", "s3", "cp", remote_backend_file, backend_file]
if printtoscreen:
Expand All @@ -222,14 +200,14 @@ def collate(
image_feature_categories = None # defensive but not sure what will happen if we give a list but set to False

database = SingleCells(
"sqlite:///" + backend_file,
f"sqlite:///{backend_file}",
aggregation_operation="mean",
add_image_features=add_image_features,
image_feature_categories=image_feature_categories,
)
database.aggregate_profiles(output_file=aggregated_file)

if remote:
if aws_remote:
if printtoscreen:
print(f"Uploading {aggregated_file} to {remote_aggregated_file}")
csv_cp_cmd = ["aws", "s3", "cp", aggregated_file, remote_aggregated_file]
Expand Down
23 changes: 14 additions & 9 deletions pycytominer/cyto_utils/collate_cmd.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import argparse
from pycytominer.cyto_utils.collate import collate

if __name__ == "__main__":
import argparse
from pycytominer.cyto_utils.collate import collate

parser = argparse.ArgumentParser(description="Collate CSVs")
parser.add_argument("batch", help="Batch name to process")
Expand All @@ -11,7 +12,7 @@
"--base-directory",
dest="base_directory",
default="../..",
help="Base directory where the CSV files will be located",
help="Base directory for subdirectories containing CSVs, backends, etc; in our preferred structure, this is the 'workspace' directory",
)
parser.add_argument(
"--column",
Expand All @@ -22,15 +23,19 @@
"--munge",
action="store_true",
default=False,
help="Whether munge should be passed to cytominer-database, if True will break a single object CSV down by objects",
help="Whether munge should be passed to cytominer-database, if True cytominer-database will expect a single all-object CSV; it will split each object into its own table",
)
parser.add_argument(
"--pipeline", default="analysis", help="A string used in path creation"
"--csv-dir",
dest="csv_dir",
default="analysis",
help="The directory under the base directory where the analysis CSVs will be found. If running the analysis pipeline, this should nearly always be 'analysis'",
)
parser.add_argument(
"--remote",
"--aws-remote",
dest="aws_remote",
default=None,
help="A remote AWS directory, if set CSV files will be synced down from at the beginning and to which SQLite files will be synced up at the end of the run",
help="A remote AWS prefix, if set CSV files will be synced down from at the beginning and to which SQLite files will be synced up at the end of the run",
)
parser.add_argument(
"--aggregate-only",
Expand Down Expand Up @@ -81,8 +86,8 @@
base_directory=args.base_directory,
column=args.column,
munge=args.munge,
pipeline=args.pipeline,
remote=args.remote,
csv_dir=args.csv_dir,
aws_remote=args.aws_remote,
aggregate_only=args.aggregate_only,
temp=args.temp,
overwrite=args.overwrite,
Expand Down
57 changes: 26 additions & 31 deletions pycytominer/tests/test_cyto_utils/test_collate.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,30 @@
import os
import pathlib
import pytest

import pandas as pd
from pycytominer.cyto_utils.collate import collate

test_config_location = os.path.abspath(
os.path.join(os.path.dirname(__file__), "..", "..", "cyto_utils/database_config/ingest_config.ini")
)
test_data_location = os.path.abspath(
os.path.join(os.path.dirname(__file__), "..", "test_data/collate")
batch = "2021_04_20_Target2"

plate = "BR00121431"

test_config_location = pathlib.Path(
f"{os.path.dirname(__file__)}/../../cyto_utils/database_config/ingest_config.ini"
)
test_backend_location = os.path.abspath(
os.path.join(
os.path.dirname(__file__),
"..",
"test_data/collate/backend/2021_04_20_Target2/BR00121431/BR00121431.sqlite",
)

test_data_location = pathlib.Path(f"{os.path.dirname(__file__)}/../test_data/collate")

test_backend_location = pathlib.Path(
f"{os.path.dirname(__file__)}/../test_data/collate/backend/{batch}/{plate}/{plate}.sqlite"
)
test_csv_location = os.path.abspath(
os.path.join(
os.path.dirname(__file__),
"..",
"test_data/collate/backend/2021_04_20_Target2/BR00121431/BR00121431.csv",
)

test_csv_location = pathlib.Path(
f"{os.path.dirname(__file__)}/../test_data/collate/backend/{batch}/{plate}/{plate}.csv"
)
master_csv_location = os.path.abspath(
os.path.join(
os.path.dirname(__file__),
"..",
"test_data/collate/backend/2021_04_20_Target2/BR00121431/BR00121431_master.csv",
)

master_csv_location = pathlib.Path(
f"{os.path.dirname(__file__)}/../test_data/collate/backend/{batch}/{plate}/{plate}_master.csv"
)


Expand Down Expand Up @@ -61,7 +56,7 @@ def test_base_case():
test_config_location,
"BR00121431",
base_directory=test_data_location,
temp=test_data_location,
tmp_dir=test_data_location,
add_image_features=False,
printtoscreen=False,
)
Expand All @@ -81,7 +76,7 @@ def test_base_case_with_image_features():
test_config_location,
"BR00121431",
base_directory=test_data_location,
temp=test_data_location,
tmp_dir=test_data_location,
add_image_features=True,
image_feature_categories=["Granularity"],
printtoscreen=False,
Expand All @@ -108,7 +103,7 @@ def test_overwrite():
test_config_location,
"BR00121431",
base_directory=test_data_location,
temp=test_data_location,
tmp_dir=test_data_location,
add_image_features=False,
printtoscreen=False,
)
Expand All @@ -121,7 +116,7 @@ def test_overwrite():
test_config_location,
"BR00121431",
base_directory=test_data_location,
temp=test_data_location,
tmp_dir=test_data_location,
overwrite=True,
add_image_features=False,
printtoscreen=False,
Expand All @@ -133,7 +128,7 @@ def test_overwrite():
test_config_location,
"BR00121431",
base_directory=test_data_location,
temp=test_data_location,
tmp_dir=test_data_location,
add_image_features=False,
printtoscreen=False,
)
Expand All @@ -153,7 +148,7 @@ def test_aggregate_only():
test_config_location,
"BR00121431",
base_directory=test_data_location,
temp=test_data_location,
tmp_dir=test_data_location,
aggregate_only=True,
add_image_features=False,
printtoscreen=False,
Expand All @@ -165,7 +160,7 @@ def test_aggregate_only():
test_config_location,
"BR00121431",
base_directory=test_data_location,
temp=test_data_location,
tmp_dir=test_data_location,
add_image_features=False,
printtoscreen=False,
)
Expand All @@ -179,7 +174,7 @@ def test_aggregate_only():
test_config_location,
"BR00121431",
base_directory=test_data_location,
temp=test_data_location,
tmp_dir=test_data_location,
aggregate_only=True,
add_image_features=False,
printtoscreen=False,
Expand Down

0 comments on commit ead2547

Please sign in to comment.