Skip to content

Commit

Permalink
Merge pull request #6 from axiomcura/main
Browse files Browse the repository at this point in the history
Feature Select Implementation to CytoPipe
  • Loading branch information
axiomcura authored May 13, 2022
2 parents b32f297 + b094d3d commit 638009d
Show file tree
Hide file tree
Showing 9 changed files with 206 additions and 6 deletions.
16 changes: 13 additions & 3 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,22 @@ import glob
sql_paths = glob.glob("./data/*.sqlite")
PLATE_IDS = [Path(sql_file).stem for sql_file in sql_paths]


# importing DAGs
include: "rules/preprocessing.smk"


include: "rules/feature_select.smk"


rule all:
input:
# expected outputs from the first DAG "Preprocessing"
expand("results/preprocessing/{plate_id}_aggregate.csv.gz", plate_id=PLATE_IDS),
expand("results/preprocessing/{plate_id}_aggregate.csv.gz", plate_id=PLATE_IDS), # expected outputs from the first DAG "Preprocessing"
expand("results/preprocessing/{plate_id}_cell_counts.tsv", plate_id=PLATE_IDS),
expand("results/preprocessing/{plate_id}_augmented.csv.gz", plate_id=PLATE_IDS),
expand("results/preprocessing/{plate_id}_normalized.csv.gz", plate_id=PLATE_IDS)
expand("results/preprocessing/{plate_id}_normalized.csv.gz", plate_id=PLATE_IDS),
expand(
"results/preprocessing/{plate_id}_feature_select.csv.gz",
plate_id=PLATE_IDS,
),
"results/preprocessing/consensus.csv",
11 changes: 11 additions & 0 deletions configs/analysis_configs/aggregate_configs.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Parameters forwarded to pycytominer's aggregate step (used for consensus).
aggregate_configs:
  params:
    strata: ["Metadata_Plate", "Metadata_Well"]
    features: "infer"
    operation: "median"
    # "none" (lowercase string) is pycytominer 0.1's sentinel for "do not
    # write a file" / "no subset" — keep it as a string, not null.
    output_file: "none"
    compute_object_count: False
    object_feature: "Metadata_ObjectNumber"
    subset_data_df: "none"
    # BUG FIX: a bare `None` is parsed by YAML as the *string* "None",
    # not a null value; use YAML `null` so Python receives a real None.
    compression_options: null
    float_format: null
23 changes: 23 additions & 0 deletions configs/analysis_configs/feature_select_configs.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Parameters forwarded to pycytominer's feature_select step.
feature_select_configs:
  params:
    features: infer
    image_features: False
    samples: all
    # operations are applied in the listed order
    operation:
      - variance_threshold
      - drop_na_columns
      - correlation_threshold
      - blocklist
      - drop_outliers
      - noise_removal
    na_cutoff: 0.05
    corr_threshold: 0.9
    corr_method: pearson
    freq_cut: 0.05
    unique_cut: 0.1
    compression_options: gzip
    # BUG FIX: bare `None` is parsed by YAML as the *string* "None", not a
    # null value. Because "blocklist" is in `operation`, the string "None"
    # would be treated as a blocklist file path; `null` yields a real None,
    # which selects pycytominer's default blocklist.
    float_format: null
    blocklist_file: null
    outlier_cutoff: 15
    noise_removal_perturb_groups: null
    noise_removal_stdev_cutoff: null
2 changes: 2 additions & 0 deletions configs/configuration.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,5 @@ config_paths:
single_cell: "configs/analysis_configs/single_cell_configs.yaml"
annotate: "configs/analysis_configs/annotate_configs.yaml"
normalize: "configs/analysis_configs/normalize_configs.yaml"
feature_select: "configs/analysis_configs/feature_select_configs.yaml"
aggregate: "configs/analysis_configs/aggregate_configs.yaml"
1 change: 1 addition & 0 deletions envs/cytominer_env.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ channels:
dependencies:
- python>=3.7.0
- conda-forge::pycytominer=0.1
- pyyaml
33 changes: 33 additions & 0 deletions rules/feature_select.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Load the top-level configuration that maps step names to their YAML files.
configfile: "configs/configuration.yaml"


# Run pycytominer feature selection on every normalized plate profile.
# One normalized csv.gz in -> one feature-selected csv.gz out, per plate.
rule feature_select:
    input:
        expand("results/preprocessing/{plate_id}_normalized.csv.gz", plate_id=PLATE_IDS),
    output:
        expand(
            "results/preprocessing/{plate_id}_feature_select.csv.gz",
            plate_id=PLATE_IDS,
        ),
    params:
        # path to the feature-select YAML, resolved inside the script
        feature_select_config=config["config_paths"]["feature_select"],
    conda:
        "../envs/cytominer_env.yaml"
    script:
        "../scripts/feature_select.py"


# Concatenate all feature-selected plate profiles into a single consensus file.
rule create_consensus:
    input:
        expand(
            "results/preprocessing/{plate_id}_feature_select.csv.gz",
            plate_id=PLATE_IDS,
        ),
    output:
        # NOTE(review): `rule all` in the Snakefile requests
        # "results/preprocessing/consensus.csv" (no .gz) — one of the two
        # targets looks wrong; confirm which extension is intended.
        "results/preprocessing/consensus.csv.gz",
    params:
        aggregate_config=config["config_paths"]["aggregate"],
    conda:
        "../envs/cytominer_env.yaml"
    script:
        "../scripts/consensus.py"
7 changes: 4 additions & 3 deletions rules/preprocessing.smk
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@ rule aggregate:
conda:
"../envs/cytominer_env.yaml"
params:
aggregate_config=config["config_paths"]["single_cell"]
aggregate_config=config["config_paths"]["single_cell"],
threads: 6
script:
"../scripts/aggregate_cells.py"

Expand All @@ -57,7 +58,7 @@ rule annotate:
conda:
"../envs/cytominer_env.yaml"
params:
annotate_config=config["config_paths"]["annotate"]
annotate_config=config["config_paths"]["annotate"],
script:
"../scripts/annotate.py"

Expand All @@ -70,6 +71,6 @@ rule normalize:
conda:
"../envs/cytominer_env.yaml"
params:
normalize_config=config["config_paths"]["normalize"]
normalize_config=config["config_paths"]["normalize"],
script:
"../scripts/normalize.py"
58 changes: 58 additions & 0 deletions scripts/consensus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
from pathlib import Path
from re import A
import numpy as np
import pandas as pd

from pycytominer.consensus import modz
from pycytominer import aggregate
from pycytominer.operations import get_na_columns


def concatenate_data(profile_list: list) -> pd.DataFrame:
    """Concatenate normalized aggregated feature profiles into one DataFrame.

    Parameters
    ----------
    profile_list : list
        Paths pointing to normalized aggregated feature CSV files.

    Returns
    -------
    pd.DataFrame
        Concatenated profiles with metadata columns moved to the front and
        columns containing any NA values removed.
    """

    # load every profile and harmonize the image-metadata column names
    concat_df = pd.concat(
        [pd.read_csv(profile_path) for profile_path in profile_list], sort=True
    ).rename(
        {
            "Image_Metadata_Plate": "Metadata_Plate",
            "Image_Metadata_Well": "Metadata_Well",
        },
        axis="columns",
    )

    # realignment of the metadata column names: split the Metadata columns
    # out, then re-join them in front of the feature columns
    concat_metadata_cols = concat_df.columns[
        concat_df.columns.str.startswith("Metadata")
    ]
    concat_metadata_df = concat_df.loc[:, concat_metadata_cols]
    concat_df = concat_df.drop(concat_metadata_cols, axis="columns")
    # BUG FIX: the original omitted axis="columns", so pd.concat stacked the
    # metadata block *under* the feature block (row-wise, the default axis=0)
    # instead of re-joining the two column blocks side by side.
    concat_df = pd.concat([concat_metadata_df, concat_df], axis="columns")

    # dropping columns with na values (cutoff=0 -> any NA disqualifies)
    na_cols = get_na_columns(concat_df, cutoff=0)
    concat_df = concat_df.drop(na_cols, axis="columns")

    return concat_df


# `snakemake` is injected into the namespace by Snakemake's `script:`
# directive; it is not a regular import.
# BUG FIX: the original guard was `if __name__ in "__main__":` — a substring
# test (also true for "main", "_", "", ...) instead of an equality check.
if __name__ == "__main__":

    inputs = [str(f_in) for f_in in snakemake.input]
    output = str(snakemake.output)

    # concatenate all normalized aggregated profiles and write them out
    concat_dataset = concatenate_data(inputs)
    # NOTE(review): the pandas row index is written as an extra unnamed
    # column here; confirm whether index=False is wanted downstream.
    concat_dataset.to_csv(output, compression="gzip")
61 changes: 61 additions & 0 deletions scripts/feature_select.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
from pathlib import Path
import yaml
from pycytominer.feature_select import feature_select


def feature_selection(normalized_profile, out_file):
    """Run pycytominer feature selection on one normalized profile.

    Selection parameters are read from the YAML file referenced by the
    snakemake parameter ``feature_select_config`` (see
    configs/analysis_configs/feature_select_configs.yaml).

    Parameters
    ----------
    normalized_profile : str
        Path that points to a normalized profile.
    out_file : str
        Name of the generated output file.

    Returns
    -------
    None
        The selected features are written to ``out_file``.
    """

    # resolve the config path handed over by snakemake and pull out the
    # "params" section of the feature-select configuration
    config_path = Path(snakemake.params["feature_select_config"]).absolute()
    with open(config_path, "r") as stream:
        params = yaml.safe_load(stream)["feature_select_configs"]["params"]

    feature_select(
        normalized_profile,
        features=params["features"],
        image_features=params["image_features"],
        samples=params["samples"],
        operation=params["operation"],
        na_cutoff=params["na_cutoff"],
        corr_threshold=params["corr_threshold"],
        corr_method=params["corr_method"],
        freq_cut=params["freq_cut"],
        unique_cut=params["unique_cut"],
        compression_options=params["compression_options"],
        float_format=params["float_format"],
        blocklist_file=params["blocklist_file"],
        outlier_cutoff=params["outlier_cutoff"],
        noise_removal_perturb_groups=params["noise_removal_perturb_groups"],
        noise_removal_stdev_cutoff=params["noise_removal_stdev_cutoff"],
        output_file=out_file,
    )


if __name__ == "__main__":
    # `snakemake` is injected by Snakemake's `script:` directive.
    norm_paths = [str(f_in) for f_in in snakemake.input]
    out_paths = [str(f_out) for f_out in snakemake.output]

    # iteratively pass each normalized profile with its matching output
    # target (the original reused `norm_data` as both the list and the loop
    # variable, shadowing the list on the first iteration)
    for norm_path, out_path in zip(norm_paths, out_paths):
        feature_selection(norm_path, out_path)

0 comments on commit 638009d

Please sign in to comment.