Skip to content

Commit

Permalink
Merge pull request #6 from axiomcura/main
Browse files Browse the repository at this point in the history
Feature Select Implementation to CytoPipe
  • Loading branch information
axiomcura authored May 13, 2022
2 parents b32f297 + b094d3d commit 638009d
Show file tree
Hide file tree
Showing 9 changed files with 206 additions and 6 deletions.
16 changes: 13 additions & 3 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,22 @@ import glob
sql_paths = glob.glob("./data/*.sqlite")
PLATE_IDS = [Path(sql_file).stem for sql_file in sql_paths]


# importing DAGs
include: "rules/preprocessing.smk"


include: "rules/feature_select.smk"


rule all:
input:
# expected outputs from the first DAG "Preprocessing"
expand("results/preprocessing/{plate_id}_aggregate.csv.gz", plate_id=PLATE_IDS),
expand("results/preprocessing/{plate_id}_aggregate.csv.gz", plate_id=PLATE_IDS), # expected outputs from the first DAG "Preprocessing"
expand("results/preprocessing/{plate_id}_cell_counts.tsv", plate_id=PLATE_IDS),
expand("results/preprocessing/{plate_id}_augmented.csv.gz", plate_id=PLATE_IDS),
expand("results/preprocessing/{plate_id}_normalized.csv.gz", plate_id=PLATE_IDS)
expand("results/preprocessing/{plate_id}_normalized.csv.gz", plate_id=PLATE_IDS),
expand(
"results/preprocessing/{plate_id}_feature_select.csv.gz",
plate_id=PLATE_IDS,
),
"results/preprocessing/consensus.csv",
11 changes: 11 additions & 0 deletions configs/analysis_configs/aggregate_configs.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Parameters forwarded to pycytominer's aggregate step (used for consensus).
aggregate_configs:
  params:
    strata: ["Metadata_Plate", "Metadata_Well"]
    features: "infer"
    operation: "median"
    # "none" (lowercase string) is pycytominer 0.1's sentinel for "do not
    # write a file" / "no subset" — keep it as a string, not null.
    output_file: "none"
    compute_object_count: False
    object_feature: "Metadata_ObjectNumber"
    subset_data_df: "none"
    # BUG FIX: a bare `None` is parsed by YAML as the *string* "None",
    # not a null value; use YAML `null` so Python receives a real None.
    compression_options: null
    float_format: null
23 changes: 23 additions & 0 deletions configs/analysis_configs/feature_select_configs.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Parameters forwarded to pycytominer's feature_select step.
feature_select_configs:
  params:
    features: infer
    image_features: False
    samples: all
    # operations are applied in the listed order
    operation:
      - variance_threshold
      - drop_na_columns
      - correlation_threshold
      - blocklist
      - drop_outliers
      - noise_removal
    na_cutoff: 0.05
    corr_threshold: 0.9
    corr_method: pearson
    freq_cut: 0.05
    unique_cut: 0.1
    compression_options: gzip
    # BUG FIX: bare `None` is parsed by YAML as the *string* "None", not a
    # null value. Because "blocklist" is in `operation`, the string "None"
    # would be treated as a blocklist file path; `null` yields a real None,
    # which selects pycytominer's default blocklist.
    float_format: null
    blocklist_file: null
    outlier_cutoff: 15
    noise_removal_perturb_groups: null
    noise_removal_stdev_cutoff: null
2 changes: 2 additions & 0 deletions configs/configuration.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,5 @@ config_paths:
single_cell: "configs/analysis_configs/single_cell_configs.yaml"
annotate: "configs/analysis_configs/annotate_configs.yaml"
normalize: "configs/analysis_configs/normalize_configs.yaml"
feature_select: "configs/analysis_configs/feature_select_configs.yaml"
aggregate: "configs/analysis_configs/aggregate_configs.yaml"
1 change: 1 addition & 0 deletions envs/cytominer_env.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ channels:
dependencies:
- python>=3.7.0
- conda-forge::pycytominer=0.1
- pyyaml
33 changes: 33 additions & 0 deletions rules/feature_select.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Load the top-level configuration that maps step names to their YAML files.
configfile: "configs/configuration.yaml"


# Run pycytominer feature selection on every normalized plate profile.
# One normalized csv.gz in -> one feature-selected csv.gz out, per plate.
rule feature_select:
    input:
        expand("results/preprocessing/{plate_id}_normalized.csv.gz", plate_id=PLATE_IDS),
    output:
        expand(
            "results/preprocessing/{plate_id}_feature_select.csv.gz",
            plate_id=PLATE_IDS,
        ),
    params:
        # path to the feature-select YAML, resolved inside the script
        feature_select_config=config["config_paths"]["feature_select"],
    conda:
        "../envs/cytominer_env.yaml"
    script:
        "../scripts/feature_select.py"


# Concatenate all feature-selected plate profiles into a single consensus file.
rule create_consensus:
    input:
        expand(
            "results/preprocessing/{plate_id}_feature_select.csv.gz",
            plate_id=PLATE_IDS,
        ),
    output:
        # NOTE(review): `rule all` in the Snakefile requests
        # "results/preprocessing/consensus.csv" (no .gz) — one of the two
        # targets looks wrong; confirm which extension is intended.
        "results/preprocessing/consensus.csv.gz",
    params:
        aggregate_config=config["config_paths"]["aggregate"],
    conda:
        "../envs/cytominer_env.yaml"
    script:
        "../scripts/consensus.py"
7 changes: 4 additions & 3 deletions rules/preprocessing.smk
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@ rule aggregate:
conda:
"../envs/cytominer_env.yaml"
params:
aggregate_config=config["config_paths"]["single_cell"]
aggregate_config=config["config_paths"]["single_cell"],
threads: 6
script:
"../scripts/aggregate_cells.py"

Expand All @@ -57,7 +58,7 @@ rule annotate:
conda:
"../envs/cytominer_env.yaml"
params:
annotate_config=config["config_paths"]["annotate"]
annotate_config=config["config_paths"]["annotate"],
script:
"../scripts/annotate.py"

Expand All @@ -70,6 +71,6 @@ rule normalize:
conda:
"../envs/cytominer_env.yaml"
params:
normalize_config=config["config_paths"]["normalize"]
normalize_config=config["config_paths"]["normalize"],
script:
"../scripts/normalize.py"
58 changes: 58 additions & 0 deletions scripts/consensus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
from pathlib import Path
from re import A
import numpy as np
import pandas as pd

from pycytominer.consensus import modz
from pycytominer import aggregate
from pycytominer.operations import get_na_columns


def concatenate_data(profile_list: list) -> pd.DataFrame:
    """Concatenate normalized aggregated feature profiles into one DataFrame.

    Parameters
    ----------
    profile_list : list
        Paths pointing to normalized aggregated feature CSV files.

    Returns
    -------
    pd.DataFrame
        Concatenated profiles with metadata columns moved to the front and
        columns containing any NA values removed.
    """

    # load every profile and harmonize the image-metadata column names
    concat_df = pd.concat(
        [pd.read_csv(profile_path) for profile_path in profile_list], sort=True
    ).rename(
        {
            "Image_Metadata_Plate": "Metadata_Plate",
            "Image_Metadata_Well": "Metadata_Well",
        },
        axis="columns",
    )

    # realignment of the metadata column names: split the Metadata columns
    # out, then re-join them in front of the feature columns
    concat_metadata_cols = concat_df.columns[
        concat_df.columns.str.startswith("Metadata")
    ]
    concat_metadata_df = concat_df.loc[:, concat_metadata_cols]
    concat_df = concat_df.drop(concat_metadata_cols, axis="columns")
    # BUG FIX: the original omitted axis="columns", so pd.concat stacked the
    # metadata block *under* the feature block (row-wise, the default axis=0)
    # instead of re-joining the two column blocks side by side.
    concat_df = pd.concat([concat_metadata_df, concat_df], axis="columns")

    # dropping columns with na values (cutoff=0 -> any NA disqualifies)
    na_cols = get_na_columns(concat_df, cutoff=0)
    concat_df = concat_df.drop(na_cols, axis="columns")

    return concat_df


# `snakemake` is injected into the namespace by Snakemake's `script:`
# directive; it is not a regular import.
# BUG FIX: the original guard was `if __name__ in "__main__":` — a substring
# test (also true for "main", "_", "", ...) instead of an equality check.
if __name__ == "__main__":

    inputs = [str(f_in) for f_in in snakemake.input]
    output = str(snakemake.output)

    # concatenate all normalized aggregated profiles and write them out
    concat_dataset = concatenate_data(inputs)
    # NOTE(review): the pandas row index is written as an extra unnamed
    # column here; confirm whether index=False is wanted downstream.
    concat_dataset.to_csv(output, compression="gzip")
61 changes: 61 additions & 0 deletions scripts/feature_select.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
from pathlib import Path
import yaml
from pycytominer.feature_select import feature_select


def feature_selection(normalized_profile, out_file):
    """Run pycytominer feature selection on one normalized profile.

    Selection parameters are read from the YAML file referenced by the
    snakemake parameter ``feature_select_config`` (see
    configs/analysis_configs/feature_select_configs.yaml).

    Parameters
    ----------
    normalized_profile : str
        Path that points to a normalized profile.
    out_file : str
        Name of the generated output file.

    Returns
    -------
    None
        The selected features are written to ``out_file``.
    """

    # resolve the config path handed over by snakemake and pull out the
    # "params" section of the feature-select configuration
    config_path = Path(snakemake.params["feature_select_config"]).absolute()
    with open(config_path, "r") as stream:
        params = yaml.safe_load(stream)["feature_select_configs"]["params"]

    feature_select(
        normalized_profile,
        features=params["features"],
        image_features=params["image_features"],
        samples=params["samples"],
        operation=params["operation"],
        na_cutoff=params["na_cutoff"],
        corr_threshold=params["corr_threshold"],
        corr_method=params["corr_method"],
        freq_cut=params["freq_cut"],
        unique_cut=params["unique_cut"],
        compression_options=params["compression_options"],
        float_format=params["float_format"],
        blocklist_file=params["blocklist_file"],
        outlier_cutoff=params["outlier_cutoff"],
        noise_removal_perturb_groups=params["noise_removal_perturb_groups"],
        noise_removal_stdev_cutoff=params["noise_removal_stdev_cutoff"],
        output_file=out_file,
    )


if __name__ == "__main__":
    # `snakemake` is injected by Snakemake's `script:` directive.
    norm_paths = [str(f_in) for f_in in snakemake.input]
    out_paths = [str(f_out) for f_out in snakemake.output]

    # iteratively pass each normalized profile with its matching output
    # target (the original reused `norm_data` as both the list and the loop
    # variable, shadowing the list on the first iteration)
    for norm_path, out_path in zip(norm_paths, out_paths):
        feature_selection(norm_path, out_path)

0 comments on commit 638009d

Please sign in to comment.