From d22cb6b51ed0e4da1aa9bc3048858c7923b961a2 Mon Sep 17 00:00:00 2001 From: axiomcura Date: Tue, 26 Apr 2022 10:30:18 -0600 Subject: [PATCH 1/9] initial Implementation feature select DAG --- Snakefile | 4 +- .../analysis_configs/evaluate_configs.yaml | 14 +++++ .../feature_select_configs.yaml | 17 ++++++ configs/configuration.yaml | 2 + envs/cytominer_env.yaml | 2 + rules/feature_select.smk | 26 +++++++++ scripts/evaluate_features.py | 51 +++++++++++++++++ scripts/feature_select.py | 57 +++++++++++++++++++ 8 files changed, 172 insertions(+), 1 deletion(-) create mode 100644 configs/analysis_configs/evaluate_configs.yaml create mode 100644 configs/analysis_configs/feature_select_configs.yaml create mode 100644 rules/feature_select.smk create mode 100644 scripts/evaluate_features.py create mode 100644 scripts/feature_select.py diff --git a/Snakefile b/Snakefile index 1088adf8..4230caf9 100644 --- a/Snakefile +++ b/Snakefile @@ -14,4 +14,6 @@ rule all: expand("results/preprocessing/{plate_id}_aggregate.csv.gz", plate_id=PLATE_IDS), expand("results/preprocessing/{plate_id}_cell_counts.tsv", plate_id=PLATE_IDS), expand("results/preprocessing/{plate_id}_augmented.csv.gz", plate_id=PLATE_IDS), - expand("results/preprocessing/{plate_id}_normalized.csv.gz", plate_id=PLATE_IDS) + expand("results/preprocessing/{plate_id}_normalized.csv.gz", plate_id=PLATE_IDS), + expand("results/preprocessing/{plate_id}_feature_select.csv.gz", plate_id=PLATE_IDS), + expand("results/preprocessing/{plate_id}_evaluated.csv", plate_id=PLATE_IDS), diff --git a/configs/analysis_configs/evaluate_configs.yaml b/configs/analysis_configs/evaluate_configs.yaml new file mode 100644 index 00000000..7a81b57e --- /dev/null +++ b/configs/analysis_configs/evaluate_configs.yaml @@ -0,0 +1,14 @@ + +evaluate_configs: + params: + operation: "replicate_reproducibility" + groupby_columns: ["Metadata_broad_sample"] + similarity_metric: "pearson" + replicate_reproducibility_quantile: 0.95 + replicate_reproducibility_return_median_cor: False + precision_recall_k: 10 + grit_control_perts: ["None"] + grit_replicate_summary_method: "mean" + mp_value_params: {} + enrichment_percentile: 0.99 + hitk_percent_list: [2, 5, 10] \ No newline at end of file diff --git a/configs/analysis_configs/feature_select_configs.yaml b/configs/analysis_configs/feature_select_configs.yaml new file mode 100644 index 00000000..b0113132 --- /dev/null +++ b/configs/analysis_configs/feature_select_configs.yaml @@ -0,0 +1,17 @@ +feature_select_configs: + params: + features: "infer" + image_features: False + samples: "all" + operation: "variance_threshold" + na_cutoff: 0.05 + corr_threshold: 0.9 + corr_method: "pearson" + freq_cut: 0.05 + unique_cut: 0.1 + compression_options: None + float_format: None + blocklist_file: None + outlier_cutoff: 15 + noise_removal_perturb_groups: None + noise_removal_stdev_cutoff: None \ No newline at end of file diff --git a/configs/configuration.yaml b/configs/configuration.yaml index 944863c5..6ba9b5e3 100644 --- a/configs/configuration.yaml +++ b/configs/configuration.yaml @@ -3,3 +3,5 @@ config_paths: single_cell: "configs/analysis_configs/single_cell_configs.yaml" annotate: "configs/analysis_configs/annotate_configs.yaml" normalize: "configs/analysis_configs/normalize_configs.yaml" + feature_select: "configs/analysis_configs/feature_select_configs.yaml" + evaluate: "configs/analysis_configs/evaluate_configs.yaml" diff --git a/envs/cytominer_env.yaml b/envs/cytominer_env.yaml index 5467886e..03fb2200 100644 --- a/envs/cytominer_env.yaml +++ b/envs/cytominer_env.yaml @@ -6,3 +6,5 @@ channels: dependencies: - python>=3.7.0 - conda-forge::pycytominer=0.1 + - pip: + - git+git://github.com/cytomining/cytominer-eval diff --git a/rules/feature_select.smk b/rules/feature_select.smk new file mode 100644 index 00000000..5304d2ec --- /dev/null +++ b/rules/feature_select.smk @@ -0,0 +1,26 @@ +configfile: "configs/configuration.yaml" + +rule feature_select: + input: + expand("results/preprocessing/{plate_id}_normalized.csv.gz", plate_id=PLATE_IDS), + output: + expand("results/preprocessing/{plate_id}_feature_select.csv.gz", plate_id=PLATE_IDS), + params: + feature_select_config=config["config_paths"]["feature_select"] + conda: + "../envs/cytominer_env.yaml" + script: + "../scripts/feature_select.py" + + +rule evaluate_features: + input: + expand("results/preprocessing/{plate_id}_feature_select.csv.gz", plate_id=PLATE_IDS), + output: + expand("results/preprocessing/{plate_id}_evaluated.csv", plate_id=PLATE_IDS), + params: + eval_config=config["config_paths"]["evaluate"] + conda: + "../envs/cytominer_env.yaml" + script: + "../scripts/evaluate_features.py" diff --git a/scripts/evaluate_features.py b/scripts/evaluate_features.py new file mode 100644 index 00000000..4facd5a9 --- /dev/null +++ b/scripts/evaluate_features.py @@ -0,0 +1,51 @@ +import yaml +from cytominer_eval.evaluate import evaluate + + +def evaluate_features(profile, features, meta_features, replicate_groups, outname): + + # opening config file + evaluate_ep = Path(snakemake.params["eval_config"]) + evaluate_config_path = evaluate_ep.absolute() + with open(evaluate_config_path, "r") as yaml_contents: + evaluate_config = yaml.safe_load(yaml_contents)["evaluate_configs"]["params"] + + evaluate_df = evaluate( + profile, + features, + meta_features, + replicate_groups, + operation=evaluate_config["operation"], + groupby_columns=evaluate_config["groupby_columns"], + similarity_metric=evaluate_config["similarity_metric"], + replicate_reproducibility_quantile=evaluate_config[ + "replicate_reproducibility_quantile" + ], + replicate_reproducibility_return_median_cor=evaluate_config[ + "replicate_reproducibility_return_median" + ], + precision_recall_k=evaluate_config["precision_recall_k"], + grit_control_perts=evaluate_config["grit_control_perts"], + grit_replicate_summary_method=evaluate_config["grit_replicate_summary_method"], + mp_value_params=evaluate_config["mp_value_params"], + enrichment_percentile=evaluate_config["enrichment_percentile"], + hitk_percent_list=evaluate_config["hitk_percent_list"], + ) + + evaluate_df.to_csv(outname, index=False) + +if __name__ == "__main__": + + profiles = [str(f_in) for f_in in snakemake.input] + outnames = [(str(f_out) for f_out in snakemake.output)] + io_files = zip(profiles, outnames) + + # TODO: + # there are required inputs for the `evaluate()` function + # Not implemented yet + features = None + meta_features = None + replicate_groups = None + + for profile, outname in io_files: + evaluate_features(profile, features, meta_features, replicate_groups, outname) diff --git a/scripts/feature_select.py b/scripts/feature_select.py new file mode 100644 index 00000000..e86d5ea0 --- /dev/null +++ b/scripts/feature_select.py @@ -0,0 +1,57 @@ +from pathlib import Path +import yaml +from pycytominer import audit +from pycytominer.feature_select import feature_select + + +def feature_selection(normalized_profile, out_file): + """Perfroms feature selection based on the given parameters explained + in the configs/analysis_configs/feature_selection_configs.yaml file. + + Parameters + ---------- + normalized_profile : str + path that points to normalized profile + + Returns + ------- + Generates output + """ + + # loading paramters + feature_select_ep = Path(snakemake.params["feature_select"]) + feature_select_config_path = feature_select_ep.absolute() + with open(feature_select_config_path, "r") as yaml_contents: + feature_select_config = yaml.safe_load(yaml_contents)["feature_select_configs"]["params"] + + feature_select( + normalized_profile, + features=feature_select_config["features"], + image_features=feature_select_config["image_features"], + samples=feature_select_config["samples"], + operation=feature_select_config["operation"], + na_cutoff=feature_select_config["na_cutoff"], + corr_threshold=feature_select_config["corr_threshold"], + corr_method=feature_select_config["corr_method"], + freq_cut=feature_select_config["freq_cut"], + unique_cut=feature_select_config["unique_cut"], + compression_options=feature_select_config["compression_options"], + float_format=feature_select_config["float_format"], + blocklist_file=feature_select_config["blocklist_file"], + outlier_cutoff=feature_select_config["outlier_cutoff"], + noise_removal_perturb_groups=feature_select_config["noise_removal_perturb_groups"], + noise_removal_stdev_cutoff=feature_select_config["noise_removal_stdev_cutoff"], + output_file=out_file, + ) + + +if __name__ == '__main__': + norm_data = [str(f_in) for f_in in snakemake.input] + out_files = [str(f_out) for f_out in snakemake.output] + io_files = zip(norm_data, out_files) + + # iteratively passing normalized data + for norm_data, out_file in io_files: + feature_selection(norm_data, out_file) + + From c7a09c830e45893a4e11c6688f99db2fa54638c2 Mon Sep 17 00:00:00 2001 From: axiomcura Date: Tue, 26 Apr 2022 10:35:19 -0600 Subject: [PATCH 2/9] fixed formating --- configs/analysis_configs/evaluate_configs.yaml | 5 ++--- configs/analysis_configs/feature_select_configs.yaml | 2 +- envs/cytominer_env.yaml | 2 +- scripts/evaluate_features.py | 1 + scripts/feature_select.py | 12 +++++++----- 5 files changed, 12 insertions(+), 10 deletions(-) diff --git a/configs/analysis_configs/evaluate_configs.yaml b/configs/analysis_configs/evaluate_configs.yaml index 7a81b57e..66274b83 100644 --- a/configs/analysis_configs/evaluate_configs.yaml +++ b/configs/analysis_configs/evaluate_configs.yaml @@ -1,7 +1,6 @@ - evaluate_configs: params: - operation: "replicate_reproducibility" + operation: "replicate_reproducibility" groupby_columns: ["Metadata_broad_sample"] similarity_metric: "pearson" replicate_reproducibility_quantile: 0.95 @@ -11,4 +10,4 @@ evaluate_configs: grit_replicate_summary_method: "mean" mp_value_params: {} enrichment_percentile: 0.99 - hitk_percent_list: [2, 5, 10] \ No newline at end of file + hitk_percent_list: [2, 5, 10] diff --git a/configs/analysis_configs/feature_select_configs.yaml b/configs/analysis_configs/feature_select_configs.yaml index b0113132..ed9ecbcf 100644 --- a/configs/analysis_configs/feature_select_configs.yaml +++ b/configs/analysis_configs/feature_select_configs.yaml @@ -14,4 +14,4 @@ feature_select_configs: blocklist_file: None outlier_cutoff: 15 noise_removal_perturb_groups: None - noise_removal_stdev_cutoff: None \ No newline at end of file + noise_removal_stdev_cutoff: None diff --git a/envs/cytominer_env.yaml b/envs/cytominer_env.yaml index 03fb2200..fc162c4d 100644 --- a/envs/cytominer_env.yaml +++ b/envs/cytominer_env.yaml @@ -7,4 +7,4 @@ dependencies: - python>=3.7.0 - conda-forge::pycytominer=0.1 - pip: - - git+git://github.com/cytomining/cytominer-eval + - git+git://github.com/cytomining/cytominer-eval diff --git a/scripts/evaluate_features.py b/scripts/evaluate_features.py index 4facd5a9..92889382 100644 --- a/scripts/evaluate_features.py +++ b/scripts/evaluate_features.py @@ -34,6 +34,7 @@ def evaluate_features(profile, features, meta_features, replicate_groups, outnam evaluate_df.to_csv(outname, index=False) + if __name__ == "__main__": profiles = [str(f_in) for f_in in snakemake.input] diff --git a/scripts/feature_select.py b/scripts/feature_select.py index e86d5ea0..35fd5d71 100644 --- a/scripts/feature_select.py +++ b/scripts/feature_select.py @@ -22,7 +22,9 @@ def feature_selection(normalized_profile, out_file): feature_select_ep = Path(snakemake.params["feature_select"]) feature_select_config_path = feature_select_ep.absolute() with open(feature_select_config_path, "r") as yaml_contents: - feature_select_config = yaml.safe_load(yaml_contents)["feature_select_configs"]["params"] + feature_select_config = yaml.safe_load(yaml_contents)["feature_select_configs"][ + "params" + ] feature_select( normalized_profile, @@ -39,13 +41,15 @@ def feature_selection(normalized_profile, out_file): float_format=feature_select_config["float_format"], blocklist_file=feature_select_config["blocklist_file"], outlier_cutoff=feature_select_config["outlier_cutoff"], - noise_removal_perturb_groups=feature_select_config["noise_removal_perturb_groups"], + noise_removal_perturb_groups=feature_select_config[ + "noise_removal_perturb_groups" + ], noise_removal_stdev_cutoff=feature_select_config["noise_removal_stdev_cutoff"], output_file=out_file, ) -if __name__ == '__main__': +if __name__ == "__main__": norm_data = [str(f_in) for f_in in snakemake.input] out_files = [str(f_out) for f_out in snakemake.output] io_files = zip(norm_data, out_files) @@ -53,5 +57,3 @@ def feature_selection(normalized_profile, out_file): # iteratively passing normalized data for norm_data, out_file in io_files: feature_selection(norm_data, out_file) - - From 12e3ac367a4fa13be3970eba864ca942cd969997 Mon Sep 17 00:00:00 2001 From: axiomcura Date: Wed, 27 Apr 2022 09:26:49 -0600 Subject: [PATCH 3/9] added DAG import to SnakeFile --- Snakefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Snakefile b/Snakefile index 4230caf9..97561714 100644 --- a/Snakefile +++ b/Snakefile @@ -6,7 +6,9 @@ import glob sql_paths = glob.glob("./data/*.sqlite") PLATE_IDS = [Path(sql_file).stem for sql_file in sql_paths] +# importing DAGs include: "rules/preprocessing.smk" +include: "rules/feature_select.smk" rule all: input: From ece6c3a175a3e9c9f3a14997ddc77e7c4d6df424 Mon Sep 17 00:00:00 2001 From: axiomcura Date: Wed, 27 Apr 2022 10:16:05 -0600 Subject: [PATCH 4/9] Bug fixes and Formatting --- Snakefile | 7 ++++++- configs/analysis_configs/feature_select_configs.yaml | 10 +++++----- envs/cytominer_env.yaml | 2 +- scripts/evaluate_features.py | 1 + scripts/feature_select.py | 3 +-- 5 files changed, 14 insertions(+), 9 deletions(-) diff --git a/Snakefile b/Snakefile index 97561714..87f60c78 100644 --- a/Snakefile +++ b/Snakefile @@ -6,10 +6,12 @@ import glob sql_paths = glob.glob("./data/*.sqlite") PLATE_IDS = [Path(sql_file).stem for sql_file in sql_paths] + # importing DAGs include: "rules/preprocessing.smk" include: "rules/feature_select.smk" + rule all: input: # expected outputs from the first DAG "Preprocessing" @@ -17,5 +19,8 @@ rule all: expand("results/preprocessing/{plate_id}_cell_counts.tsv", plate_id=PLATE_IDS), expand("results/preprocessing/{plate_id}_augmented.csv.gz", plate_id=PLATE_IDS), expand("results/preprocessing/{plate_id}_normalized.csv.gz", plate_id=PLATE_IDS), - expand("results/preprocessing/{plate_id}_feature_select.csv.gz", plate_id=PLATE_IDS), + expand( + "results/preprocessing/{plate_id}_feature_select.csv.gz", + plate_id=PLATE_IDS, + ), expand("results/preprocessing/{plate_id}_evaluated.csv", plate_id=PLATE_IDS), diff --git a/configs/analysis_configs/feature_select_configs.yaml b/configs/analysis_configs/feature_select_configs.yaml index ed9ecbcf..08afaf03 100644 --- a/configs/analysis_configs/feature_select_configs.yaml +++ b/configs/analysis_configs/feature_select_configs.yaml @@ -1,15 +1,15 @@ feature_select_configs: params: - features: "infer" + features: infer image_features: False - samples: "all" - operation: "variance_threshold" + samples: all + operation: variance_threshold na_cutoff: 0.05 corr_threshold: 0.9 - corr_method: "pearson" + corr_method: pearson freq_cut: 0.05 unique_cut: 0.1 - compression_options: None + compression_options: gzip float_format: None blocklist_file: None outlier_cutoff: 15 diff --git a/envs/cytominer_env.yaml b/envs/cytominer_env.yaml index fc162c4d..9712b053 100644 --- a/envs/cytominer_env.yaml +++ b/envs/cytominer_env.yaml @@ -7,4 +7,4 @@ dependencies: - python>=3.7.0 - conda-forge::pycytominer=0.1 - pip: - - git+git://github.com/cytomining/cytominer-eval + - cytominer-eval diff --git a/scripts/evaluate_features.py b/scripts/evaluate_features.py index 92889382..1d8038ee 100644 --- a/scripts/evaluate_features.py +++ b/scripts/evaluate_features.py @@ -1,3 +1,4 @@ +from pathlib import Path import yaml from cytominer_eval.evaluate import evaluate diff --git a/scripts/feature_select.py b/scripts/feature_select.py index 35fd5d71..2122612b 100644 --- a/scripts/feature_select.py +++ b/scripts/feature_select.py @@ -1,6 +1,5 @@ from pathlib import Path import yaml -from pycytominer import audit from pycytominer.feature_select import feature_select @@ -19,7 +18,7 @@ def feature_selection(normalized_profile, out_file): """ # loading paramters - feature_select_ep = Path(snakemake.params["feature_select"]) + feature_select_ep = Path(snakemake.params["feature_select_config"]) feature_select_config_path = feature_select_ep.absolute() with open(feature_select_config_path, "r") as yaml_contents: feature_select_config = yaml.safe_load(yaml_contents)["feature_select_configs"][ From 223dc5dbb8429a16684d8766dd5f610be38c061d Mon Sep 17 00:00:00 2001 From: axiomcura Date: Wed, 27 Apr 2022 10:24:10 -0600 Subject: [PATCH 5/9] removed cyto_eval plugin --- .../analysis_configs/evaluate_configs.yaml | 13 ----- configs/configuration.yaml | 1 - envs/cytominer_env.yaml | 2 - rules/feature_select.smk | 21 +++----- scripts/evaluate_features.py | 53 ------------------- 5 files changed, 6 insertions(+), 84 deletions(-) delete mode 100644 configs/analysis_configs/evaluate_configs.yaml delete mode 100644 scripts/evaluate_features.py diff --git a/configs/analysis_configs/evaluate_configs.yaml b/configs/analysis_configs/evaluate_configs.yaml deleted file mode 100644 index 66274b83..00000000 --- a/configs/analysis_configs/evaluate_configs.yaml +++ /dev/null @@ -1,13 +0,0 @@ -evaluate_configs: - params: - operation: "replicate_reproducibility" - groupby_columns: ["Metadata_broad_sample"] - similarity_metric: "pearson" - replicate_reproducibility_quantile: 0.95 - replicate_reproducibility_return_median_cor: False - precision_recall_k: 10 - grit_control_perts: ["None"] - grit_replicate_summary_method: "mean" - mp_value_params: {} - enrichment_percentile: 0.99 - hitk_percent_list: [2, 5, 10] diff --git a/configs/configuration.yaml b/configs/configuration.yaml index 6ba9b5e3..ac2e4738 100644 --- a/configs/configuration.yaml +++ b/configs/configuration.yaml @@ -4,4 +4,3 @@ config_paths: annotate: "configs/analysis_configs/annotate_configs.yaml" normalize: "configs/analysis_configs/normalize_configs.yaml" feature_select: "configs/analysis_configs/feature_select_configs.yaml" - evaluate: "configs/analysis_configs/evaluate_configs.yaml" diff --git a/envs/cytominer_env.yaml b/envs/cytominer_env.yaml index 9712b053..5467886e 100644 --- a/envs/cytominer_env.yaml +++ b/envs/cytominer_env.yaml @@ -6,5 +6,3 @@ channels: dependencies: - python>=3.7.0 - conda-forge::pycytominer=0.1 - - pip: - - cytominer-eval diff --git a/rules/feature_select.smk b/rules/feature_select.smk index 5304d2ec..3136a4b4 100644 --- a/rules/feature_select.smk +++ b/rules/feature_select.smk @@ -1,26 +1,17 @@ configfile: "configs/configuration.yaml" + rule feature_select: input: expand("results/preprocessing/{plate_id}_normalized.csv.gz", plate_id=PLATE_IDS), output: - expand("results/preprocessing/{plate_id}_feature_select.csv.gz", plate_id=PLATE_IDS), + expand( + "results/preprocessing/{plate_id}_feature_select.csv.gz", + plate_id=PLATE_IDS, + ), params: - feature_select_config=config["config_paths"]["feature_select"] + feature_select_config=config["config_paths"]["feature_select"], conda: "../envs/cytominer_env.yaml" script: "../scripts/feature_select.py" - - -rule evaluate_features: - input: - expand("results/preprocessing/{plate_id}_feature_select.csv.gz", plate_id=PLATE_IDS), - output: - expand("results/preprocessing/{plate_id}_evaluated.csv", plate_id=PLATE_IDS), - params: - eval_config=config["config_paths"]["evaluate"] - conda: - "../envs/cytominer_env.yaml" - script: - "../scripts/evaluate_features.py" diff --git a/scripts/evaluate_features.py b/scripts/evaluate_features.py deleted file mode 100644 index 1d8038ee..00000000 --- a/scripts/evaluate_features.py +++ /dev/null @@ -1,53 +0,0 @@ -from pathlib import Path -import yaml -from cytominer_eval.evaluate import evaluate - - -def evaluate_features(profile, features, meta_features, replicate_groups, outname): - - # opening config file - evaluate_ep = Path(snakemake.params["eval_config"]) - evaluate_config_path = evaluate_ep.absolute() - with open(evaluate_config_path, "r") as yaml_contents: - evaluate_config = yaml.safe_load(yaml_contents)["evaluate_configs"]["params"] - - evaluate_df = evaluate( - profile, - features, - meta_features, - replicate_groups, - operation=evaluate_config["operation"], - groupby_columns=evaluate_config["groupby_columns"], - similarity_metric=evaluate_config["similarity_metric"], - replicate_reproducibility_quantile=evaluate_config[ - "replicate_reproducibility_quantile" - ], - replicate_reproducibility_return_median_cor=evaluate_config[ - "replicate_reproducibility_return_median" - ], - precision_recall_k=evaluate_config["precision_recall_k"], - grit_control_perts=evaluate_config["grit_control_perts"], - grit_replicate_summary_method=evaluate_config["grit_replicate_summary_method"], - mp_value_params=evaluate_config["mp_value_params"], - enrichment_percentile=evaluate_config["enrichment_percentile"], - hitk_percent_list=evaluate_config["hitk_percent_list"], - ) - - evaluate_df.to_csv(outname, index=False) - - -if __name__ == "__main__": - - profiles = [str(f_in) for f_in in snakemake.input] - outnames = [(str(f_out) for f_out in snakemake.output)] - io_files = zip(profiles, outnames) - - # TODO: - # there are required inputs for the `evaluate()` function - # Not implemented yet - features = None - meta_features = None - replicate_groups = None - - for profile, outname in io_files: - evaluate_features(profile, features, meta_features, replicate_groups, outname) From 133e84e4ca2c8c738ab842eb83eed78d4a9fa120 Mon Sep 17 00:00:00 2001 From: axiomcura Date: Wed, 27 Apr 2022 12:58:12 -0600 Subject: [PATCH 6/9] init of consensus aggregation --- Snakefile | 1 + .../analysis_configs/aggregate_configs.yaml | 11 ++++ configs/configuration.yaml | 1 + rules/feature_select.smk | 16 +++++ scripts/consensus.py | 61 +++++++++++++++++++ 5 files changed, 90 insertions(+) create mode 100644 configs/analysis_configs/aggregate_configs.yaml create mode 100644 scripts/consensus.py diff --git a/Snakefile b/Snakefile index 87f60c78..ddc6c5f7 100644 --- a/Snakefile +++ b/Snakefile @@ -24,3 +24,4 @@ rule all: plate_id=PLATE_IDS, ), expand("results/preprocessing/{plate_id}_evaluated.csv", plate_id=PLATE_IDS), + expand("results/preprocessing/{plate_id}_consensus.csv", plate_id=PLATE_IDS), diff --git a/configs/analysis_configs/aggregate_configs.yaml b/configs/analysis_configs/aggregate_configs.yaml new file mode 100644 index 00000000..d99d8097 --- /dev/null +++ b/configs/analysis_configs/aggregate_configs.yaml @@ -0,0 +1,11 @@ +aggregate_configs: + params: + strata: ["Metadata_Plate", "Metadata_Well"] + features: "infer" + operation: "median" + output_file: "none" + compute_object_count: False + object_feature: "Metadata_ObjectNumber" + subset_data_df: "none" + compression_options: None + float_format: None diff --git a/configs/configuration.yaml b/configs/configuration.yaml index ac2e4738..4e46da80 100644 --- a/configs/configuration.yaml +++ b/configs/configuration.yaml @@ -4,3 +4,4 @@ config_paths: annotate: "configs/analysis_configs/annotate_configs.yaml" normalize: "configs/analysis_configs/normalize_configs.yaml" feature_select: "configs/analysis_configs/feature_select_configs.yaml" + aggregate: "configs/analysis_configs/aggregate_configs.yaml" diff --git a/rules/feature_select.smk b/rules/feature_select.smk index 3136a4b4..a2857ae4 100644 --- a/rules/feature_select.smk +++ b/rules/feature_select.smk @@ -15,3 +15,19 @@ rule feature_select: "../envs/cytominer_env.yaml" script: "../scripts/feature_select.py" + + +rule create_consensus: + input: + expand( + "results/preprocessing/{plate_id}_feature_select.csv.gz", + plate_id=PLATE_IDS, + ), + output: + expand("results/preprocessing/{plate_id}_consensus.csv", plate_id=PLATE_IDS), + params: + aggregate_config=config["config_paths"]["aggregate"], + conda: + "../envs/cytominer_env.yaml" + script: + "../script/consensus.py" diff --git a/scripts/consensus.py b/scripts/consensus.py new file mode 100644 index 00000000..c0f40e49 --- /dev/null +++ b/scripts/consensus.py @@ -0,0 +1,61 @@ +from calendar import c +from pathlib import Path +from re import A +import numpy as np +import pandas as pd + +from pycytominer.consensus import modz +from pycytominer import get_na_columns, aggregate + + +def concatenate_data(profiles: list) -> pd.DataFrame: + """Concatenates all normalized aggregated features into one + pandas DataFrame + + Parameters + ---------- + profiles : list + list of paths pointing to normalized aggregated features + + Returns + ------- + pd.DataFrame + concatenated normalized aggregated features + """ + concat_df = ( + pd.concat(profiles, sort=True) + .rename( + { + "Image_Metadata_Plate": "Metadata_Plate", + "Image_Metadata_Well": "Metadata_Well", + }, + axis="column", + ) + .drop(["Metadata_broad_sample"], axis="columns") + ) + + # realignment of the meta data column names + concat_metadata_cols = concat_df.columns[ + concat_df.columns.str.startswith("Metadata") + ] + concat_metadata_df = concat_df.loc[:, concat_metadata_cols] + concat_df = concat_df.drop(concat_metadata_cols, axis="columns") + concat_df = pd.concat([concat_metadata_df, concat_df]) + + # dropping columns with na values + na_cols = get_na_columns(concat_df, cutoff=0) + concat_df = concat_df.drop(na_cols, axis="columns") + + # droping costes features + costes_cols= [x for x in concat_df.columns if "costes" in x.lower()] + concat_df = concat_df.drop(costes_cols, axis="columns") + + return cocnat_df + +if __name__ in "__main__": + + inputs = [str(f_in) for f_in in snakemake.input] + output = None + + # concatenated all Normalized aggregated profiles + concat_dataset = concatenate_data(inputs) From b63fddaaea3aa0be7d92f2b15f38901c1d4c00c1 Mon Sep 17 00:00:00 2001 From: axiomcura Date: Wed, 27 Apr 2022 12:58:31 -0600 Subject: [PATCH 7/9] formatting --- scripts/consensus.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/consensus.py b/scripts/consensus.py index c0f40e49..f3ec2b18 100644 --- a/scripts/consensus.py +++ b/scripts/consensus.py @@ -46,12 +46,13 @@ def concatenate_data(profiles: list) -> pd.DataFrame: na_cols = get_na_columns(concat_df, cutoff=0) concat_df = concat_df.drop(na_cols, axis="columns") - # droping costes features - costes_cols= [x for x in concat_df.columns if "costes" in x.lower()] + # droping costes features + costes_cols = [x for x in concat_df.columns if "costes" in x.lower()] concat_df = concat_df.drop(costes_cols, axis="columns") return cocnat_df + if __name__ in "__main__": inputs = [str(f_in) for f_in in snakemake.input] From 8354204f3720494a5c969a4e63eb1c40a9ea8e62 Mon Sep 17 00:00:00 2001 From: axiomcura Date: Mon, 9 May 2022 12:20:45 -0600 Subject: [PATCH 8/9] finalized consensus rule. Parameter changes --- Snakefile | 8 ++++---- rules/feature_select.smk | 4 ++-- rules/preprocessing.smk | 7 ++++--- scripts/consensus.py | 23 ++++++++++++++--------- scripts/feature_select.py | 2 +- 5 files changed, 25 insertions(+), 19 deletions(-) diff --git a/Snakefile b/Snakefile index ddc6c5f7..57fcf8a7 100644 --- a/Snakefile +++ b/Snakefile @@ -9,13 +9,14 @@ PLATE_IDS = [Path(sql_file).stem for sql_file in sql_paths] # importing DAGs include: "rules/preprocessing.smk" + + include: "rules/feature_select.smk" rule all: input: - # expected outputs from the first DAG "Preprocessing" - expand("results/preprocessing/{plate_id}_aggregate.csv.gz", plate_id=PLATE_IDS), + expand("results/preprocessing/{plate_id}_aggregate.csv.gz", plate_id=PLATE_IDS), # expected outputs from the first DAG "Preprocessing" expand("results/preprocessing/{plate_id}_cell_counts.tsv", plate_id=PLATE_IDS), expand("results/preprocessing/{plate_id}_augmented.csv.gz", plate_id=PLATE_IDS), expand("results/preprocessing/{plate_id}_normalized.csv.gz", plate_id=PLATE_IDS), @@ -23,5 +24,4 @@ rule all: "results/preprocessing/{plate_id}_feature_select.csv.gz", plate_id=PLATE_IDS, ), - expand("results/preprocessing/{plate_id}_evaluated.csv", plate_id=PLATE_IDS), - expand("results/preprocessing/{plate_id}_consensus.csv", plate_id=PLATE_IDS), + "results/preprocessing/consensus.csv", diff --git a/rules/feature_select.smk b/rules/feature_select.smk index a2857ae4..68923c47 100644 --- a/rules/feature_select.smk +++ b/rules/feature_select.smk @@ -24,10 +24,10 @@ rule create_consensus: plate_id=PLATE_IDS, ), output: - expand("results/preprocessing/{plate_id}_consensus.csv", plate_id=PLATE_IDS), + "results/preprocessing/consensus.csv", params: aggregate_config=config["config_paths"]["aggregate"], conda: "../envs/cytominer_env.yaml" script: - "../script/consensus.py" + "../scripts/consensus.py" diff --git a/rules/preprocessing.smk b/rules/preprocessing.smk index 1ce797e2..e21a6d4c 100644 --- a/rules/preprocessing.smk +++ b/rules/preprocessing.smk @@ -41,7 +41,8 @@ rule aggregate: conda: "../envs/cytominer_env.yaml" params: - aggregate_config=config["config_paths"]["single_cell"] + aggregate_config=config["config_paths"]["single_cell"], + threads: 6 script: "../scripts/aggregate_cells.py" @@ -57,7 +58,7 @@ rule annotate: conda: "../envs/cytominer_env.yaml" params: - annotate_config=config["config_paths"]["annotate"] + annotate_config=config["config_paths"]["annotate"], script: "../scripts/annotate.py" @@ -70,6 +71,6 @@ rule normalize: conda: "../envs/cytominer_env.yaml" params: - normalize_config=config["config_paths"]["normalize"] + normalize_config=config["config_paths"]["normalize"], script: "../scripts/normalize.py" diff --git a/scripts/consensus.py b/scripts/consensus.py index f3ec2b18..68febfba 100644 --- a/scripts/consensus.py +++ b/scripts/consensus.py @@ -5,10 +5,11 @@ import pandas as pd from pycytominer.consensus import modz -from pycytominer import get_na_columns, aggregate +from pycytominer import aggregate +from pycytominer.operations import get_na_columns -def concatenate_data(profiles: list) -> pd.DataFrame: +def concatenate_data(profile_list: list) -> pd.DataFrame: """Concatenates all normalized aggregated features into one pandas DataFrame @@ -22,16 +23,18 @@ def concatenate_data(profiles: list) -> pd.DataFrame: pd.DataFrame concatenated normalized aggregated features """ + concat_df = ( - pd.concat(profiles, sort=True) - .rename( + pd.concat( + [pd.read_csv(profile_path) for profile_path in profile_list], sort=True + ).rename( { "Image_Metadata_Plate": "Metadata_Plate", "Image_Metadata_Well": "Metadata_Well", }, - axis="column", + axis="columns", ) - .drop(["Metadata_broad_sample"], axis="columns") + # .drop(["Metadata_broad_sample"], axis="columns") ) # realignment of the meta data column names @@ -50,13 +53,15 @@ def concatenate_data(profiles: list) -> pd.DataFrame: costes_cols = [x for x in concat_df.columns if "costes" in x.lower()] concat_df = concat_df.drop(costes_cols, axis="columns") - return cocnat_df + return concat_df if __name__ in "__main__": inputs = [str(f_in) for f_in in snakemake.input] - output = None - + output = str(snakemake.output) + print(inputs) # concatenated all Normalized aggregated profiles concat_dataset = concatenate_data(inputs) + + concat_dataset.to_csv(output, compression="gzip") diff --git a/scripts/feature_select.py b/scripts/feature_select.py index 2122612b..e38d8685 100644 --- a/scripts/feature_select.py +++ b/scripts/feature_select.py @@ -4,7 +4,7 @@ def feature_selection(normalized_profile, out_file): - """Perfroms feature selection based on the given parameters explained + """Performs feature selection based on the given parameters explained in the configs/analysis_configs/feature_selection_configs.yaml file. Parameters From b094d3d575ec5bb793f5701ca3efc812e8951263 Mon Sep 17 00:00:00 2001 From: axiomcura Date: Fri, 13 May 2022 13:24:58 -0600 Subject: [PATCH 9/9] Update: configs files, feature_select and scripts --- .../feature_select_configs.yaml | 8 +++++- envs/cytominer_env.yaml | 1 + rules/feature_select.smk | 2 +- scripts/consensus.py | 27 +++++++------------ scripts/feature_select.py | 3 +++ 5 files changed, 21 insertions(+), 20 deletions(-) diff --git a/configs/analysis_configs/feature_select_configs.yaml b/configs/analysis_configs/feature_select_configs.yaml index 08afaf03..fa53e7c6 100644 --- a/configs/analysis_configs/feature_select_configs.yaml +++ b/configs/analysis_configs/feature_select_configs.yaml @@ -3,7 +3,13 @@ feature_select_configs: features: infer image_features: False samples: all - operation: variance_threshold + operation: + - variance_threshold + - drop_na_columns + - correlation_threshold + - blocklist + - drop_outliers + - noise_removal na_cutoff: 0.05 corr_threshold: 0.9 corr_method: pearson diff --git a/envs/cytominer_env.yaml b/envs/cytominer_env.yaml index 5467886e..f741a6c3 100644 --- a/envs/cytominer_env.yaml +++ b/envs/cytominer_env.yaml @@ -6,3 +6,4 @@ channels: dependencies: - python>=3.7.0 - conda-forge::pycytominer=0.1 + - pyyaml diff --git a/rules/feature_select.smk b/rules/feature_select.smk index 68923c47..f3e952ec 100644 --- a/rules/feature_select.smk +++ b/rules/feature_select.smk @@ -24,7 +24,7 @@ rule create_consensus: plate_id=PLATE_IDS, ), output: - "results/preprocessing/consensus.csv", + "results/preprocessing/consensus.csv.gz", params: aggregate_config=config["config_paths"]["aggregate"], conda: diff --git a/scripts/consensus.py b/scripts/consensus.py index 68febfba..d28e23eb 100644 --- a/scripts/consensus.py +++ b/scripts/consensus.py @@ -1,4 +1,3 @@ -from calendar import c from pathlib import Path from re import A import numpy as np @@ -24,17 +23,14 @@ def concatenate_data(profile_list: list) -> pd.DataFrame: concatenated normalized aggregated features """ - concat_df = ( - pd.concat( - [pd.read_csv(profile_path) for profile_path in profile_list], sort=True - ).rename( - { - "Image_Metadata_Plate": "Metadata_Plate", - "Image_Metadata_Well": "Metadata_Well", - }, - axis="columns", - ) - # .drop(["Metadata_broad_sample"], axis="columns") + concat_df = pd.concat( + [pd.read_csv(profile_path) for profile_path in profile_list], sort=True + ).rename( + { + "Image_Metadata_Plate": "Metadata_Plate", + "Image_Metadata_Well": "Metadata_Well", + }, + axis="columns", ) # realignment of the meta data column names @@ -49,10 +45,6 @@ def concatenate_data(profile_list: list) -> pd.DataFrame: na_cols = get_na_columns(concat_df, cutoff=0) concat_df = concat_df.drop(na_cols, axis="columns") - # droping costes features - costes_cols = [x for x in concat_df.columns if "costes" in x.lower()] - concat_df = concat_df.drop(costes_cols, axis="columns") - return concat_df @@ -60,8 +52,7 @@ def concatenate_data(profile_list: list) -> pd.DataFrame: inputs = [str(f_in) for f_in in snakemake.input] output = str(snakemake.output) - print(inputs) + # concatenated all Normalized aggregated profiles concat_dataset = concatenate_data(inputs) - concat_dataset.to_csv(output, compression="gzip") diff --git a/scripts/feature_select.py b/scripts/feature_select.py index e38d8685..6528e237 100644 --- a/scripts/feature_select.py +++ b/scripts/feature_select.py @@ -12,6 +12,9 @@ def feature_selection(normalized_profile, out_file): normalized_profile : str path that points to normalized profile + out_file : str + Name of generated outfile + Returns ------- Generates output