diff --git a/Snakefile b/Snakefile
index 1088adf8..57fcf8a7 100644
--- a/Snakefile
+++ b/Snakefile
@@ -6,12 +6,22 @@ import glob
 sql_paths = glob.glob("./data/*.sqlite")
 PLATE_IDS = [Path(sql_file).stem for sql_file in sql_paths]
 
+
+# importing DAGs
 include: "rules/preprocessing.smk"
+
+include: "rules/feature_select.smk"
+
+
 
 rule all:
     input:
-        # expected outputs from the first DAG "Preprocessing"
-        expand("results/preprocessing/{plate_id}_aggregate.csv.gz", plate_id=PLATE_IDS),
+        expand("results/preprocessing/{plate_id}_aggregate.csv.gz", plate_id=PLATE_IDS),  # expected outputs from the first DAG "Preprocessing"
         expand("results/preprocessing/{plate_id}_cell_counts.tsv", plate_id=PLATE_IDS),
         expand("results/preprocessing/{plate_id}_augmented.csv.gz", plate_id=PLATE_IDS),
-        expand("results/preprocessing/{plate_id}_normalized.csv.gz", plate_id=PLATE_IDS)
+        expand("results/preprocessing/{plate_id}_normalized.csv.gz", plate_id=PLATE_IDS),
+        expand(
+            "results/preprocessing/{plate_id}_feature_select.csv.gz",
+            plate_id=PLATE_IDS,
+        ),
+        "results/preprocessing/consensus.csv.gz",
diff --git a/configs/analysis_configs/aggregate_configs.yaml b/configs/analysis_configs/aggregate_configs.yaml
new file mode 100644
index 00000000..d99d8097
--- /dev/null
+++ b/configs/analysis_configs/aggregate_configs.yaml
@@ -0,0 +1,11 @@
+aggregate_configs:
+  params:
+    strata: ["Metadata_Plate", "Metadata_Well"]
+    features: "infer"
+    operation: "median"
+    output_file: "none"
+    compute_object_count: False
+    object_feature: "Metadata_ObjectNumber"
+    subset_data_df: "none"
+    compression_options: null
+    float_format: null
diff --git a/configs/analysis_configs/feature_select_configs.yaml b/configs/analysis_configs/feature_select_configs.yaml
new file mode 100644
index 00000000..fa53e7c6
--- /dev/null
+++ b/configs/analysis_configs/feature_select_configs.yaml
@@ -0,0 +1,23 @@
+feature_select_configs:
+  params:
+    features: infer
+    image_features: False
+    samples: all
+    operation:
+      - variance_threshold
+      - drop_na_columns
+      - correlation_threshold
+      - blocklist
+      - drop_outliers
+      - noise_removal
+    na_cutoff: 0.05
+    corr_threshold: 0.9
+    corr_method: pearson
+    freq_cut: 0.05
+    unique_cut: 0.1
+    compression_options: gzip
+    float_format: null
+    blocklist_file: null
+    outlier_cutoff: 15
+    noise_removal_perturb_groups: null
+    noise_removal_stdev_cutoff: null
diff --git a/configs/configuration.yaml b/configs/configuration.yaml
index 944863c5..4e46da80 100644
--- a/configs/configuration.yaml
+++ b/configs/configuration.yaml
@@ -3,3 +3,5 @@ config_paths:
   single_cell: "configs/analysis_configs/single_cell_configs.yaml"
   annotate: "configs/analysis_configs/annotate_configs.yaml"
   normalize: "configs/analysis_configs/normalize_configs.yaml"
+  feature_select: "configs/analysis_configs/feature_select_configs.yaml"
+  aggregate: "configs/analysis_configs/aggregate_configs.yaml"
diff --git a/envs/cytominer_env.yaml b/envs/cytominer_env.yaml
index 5467886e..f741a6c3 100644
--- a/envs/cytominer_env.yaml
+++ b/envs/cytominer_env.yaml
@@ -6,3 +6,4 @@ channels:
 dependencies:
   - python>=3.7.0
   - conda-forge::pycytominer=0.1
+  - pyyaml
diff --git a/rules/feature_select.smk b/rules/feature_select.smk
new file mode 100644
index 00000000..f3e952ec
--- /dev/null
+++ b/rules/feature_select.smk
@@ -0,0 +1,33 @@
+configfile: "configs/configuration.yaml"
+
+
+rule feature_select:
+    input:
+        expand("results/preprocessing/{plate_id}_normalized.csv.gz", plate_id=PLATE_IDS),
+    output:
+        expand(
+            "results/preprocessing/{plate_id}_feature_select.csv.gz",
+            plate_id=PLATE_IDS,
+        ),
+    params:
+        feature_select_config=config["config_paths"]["feature_select"],
+    conda:
+        "../envs/cytominer_env.yaml"
+    script:
+        "../scripts/feature_select.py"
+
+
+rule create_consensus:
+    input:
+        expand(
+            "results/preprocessing/{plate_id}_feature_select.csv.gz",
+            plate_id=PLATE_IDS,
+        ),
+    output:
+        "results/preprocessing/consensus.csv.gz",
+    params:
+        aggregate_config=config["config_paths"]["aggregate"],
+    conda:
+        "../envs/cytominer_env.yaml"
+    script:
+        "../scripts/consensus.py"
diff --git a/rules/preprocessing.smk b/rules/preprocessing.smk
index 1ce797e2..e21a6d4c 100644
--- a/rules/preprocessing.smk
+++ b/rules/preprocessing.smk
@@ -41,7 +41,8 @@ rule aggregate:
     conda:
         "../envs/cytominer_env.yaml"
     params:
-        aggregate_config=config["config_paths"]["single_cell"]
+        aggregate_config=config["config_paths"]["single_cell"],
+    threads: 6
     script:
         "../scripts/aggregate_cells.py"
 
@@ -57,7 +58,7 @@ rule annotate:
     conda:
         "../envs/cytominer_env.yaml"
     params:
-        annotate_config=config["config_paths"]["annotate"]
+        annotate_config=config["config_paths"]["annotate"],
     script:
         "../scripts/annotate.py"
 
@@ -70,6 +71,6 @@ rule normalize:
     conda:
         "../envs/cytominer_env.yaml"
    params:
-        normalize_config=config["config_paths"]["normalize"]
+        normalize_config=config["config_paths"]["normalize"],
     script:
         "../scripts/normalize.py"
diff --git a/scripts/consensus.py b/scripts/consensus.py
new file mode 100644
index 00000000..d28e23eb
--- /dev/null
+++ b/scripts/consensus.py
@@ -0,0 +1,57 @@
+from pathlib import Path
+import numpy as np
+import pandas as pd
+
+from pycytominer.consensus import modz
+from pycytominer import aggregate
+from pycytominer.operations import get_na_columns
+
+
+def concatenate_data(profile_list: list) -> pd.DataFrame:
+    """Concatenates all normalized aggregated features into one
+    pandas DataFrame
+
+    Parameters
+    ----------
+    profile_list : list
+        list of paths pointing to normalized aggregated features
+
+    Returns
+    -------
+    pd.DataFrame
+        concatenated normalized aggregated features
+    """
+
+    concat_df = pd.concat(
+        [pd.read_csv(profile_path) for profile_path in profile_list], sort=True
+    ).rename(
+        {
+            "Image_Metadata_Plate": "Metadata_Plate",
+            "Image_Metadata_Well": "Metadata_Well",
+        },
+        axis="columns",
+    )
+
+    # realignment of the meta data column names
+    concat_metadata_cols = concat_df.columns[
+        concat_df.columns.str.startswith("Metadata")
+    ]
+    concat_metadata_df = concat_df.loc[:, concat_metadata_cols]
+    concat_df = concat_df.drop(concat_metadata_cols, axis="columns")
+    concat_df = pd.concat([concat_metadata_df, concat_df], axis="columns")
+
+    # dropping columns with na values
+    na_cols = get_na_columns(concat_df, cutoff=0)
+    concat_df = concat_df.drop(na_cols, axis="columns")
+
+    return concat_df
+
+
+if __name__ == "__main__":
+
+    inputs = [str(f_in) for f_in in snakemake.input]
+    output = str(snakemake.output)
+
+    # concatenate all normalized aggregated profiles
+    concat_dataset = concatenate_data(inputs)
+    concat_dataset.to_csv(output, compression="gzip", index=False)
diff --git a/scripts/feature_select.py b/scripts/feature_select.py
new file mode 100644
index 00000000..6528e237
--- /dev/null
+++ b/scripts/feature_select.py
@@ -0,0 +1,61 @@
+from pathlib import Path
+import yaml
+from pycytominer.feature_select import feature_select
+
+
+def feature_selection(normalized_profile, out_file):
+    """Performs feature selection based on the given parameters explained
+    in the configs/analysis_configs/feature_selection_configs.yaml file.
+
+    Parameters
+    ----------
+    normalized_profile : str
+        path that points to normalized profile
+
+    out_file : str
+        Name of generated outfile
+
+    Returns
+    -------
+    Generates output
+    """
+
+    # loading parameters
+    feature_select_ep = Path(snakemake.params["feature_select_config"])
+    feature_select_config_path = feature_select_ep.absolute()
+    with open(feature_select_config_path, "r") as yaml_contents:
+        feature_select_config = yaml.safe_load(yaml_contents)["feature_select_configs"][
+            "params"
+        ]
+
+    feature_select(
+        normalized_profile,
+        features=feature_select_config["features"],
+        image_features=feature_select_config["image_features"],
+        samples=feature_select_config["samples"],
+        operation=feature_select_config["operation"],
+        na_cutoff=feature_select_config["na_cutoff"],
+        corr_threshold=feature_select_config["corr_threshold"],
+        corr_method=feature_select_config["corr_method"],
+        freq_cut=feature_select_config["freq_cut"],
+        unique_cut=feature_select_config["unique_cut"],
+        compression_options=feature_select_config["compression_options"],
+        float_format=feature_select_config["float_format"],
+        blocklist_file=feature_select_config["blocklist_file"],
+        outlier_cutoff=feature_select_config["outlier_cutoff"],
+        noise_removal_perturb_groups=feature_select_config[
+            "noise_removal_perturb_groups"
+        ],
+        noise_removal_stdev_cutoff=feature_select_config["noise_removal_stdev_cutoff"],
+        output_file=out_file,
+    )
+
+
+if __name__ == "__main__":
+    norm_data = [str(f_in) for f_in in snakemake.input]
+    out_files = [str(f_out) for f_out in snakemake.output]
+    io_files = zip(norm_data, out_files)
+
+    # iteratively passing normalized data
+    for norm_file, out_file in io_files:
+        feature_selection(norm_file, out_file)