Feature Select Implementation to CytoPipe #6

Merged: 10 commits, May 13, 2022
16 changes: 13 additions & 3 deletions Snakefile
@@ -6,12 +6,22 @@ import glob
sql_paths = glob.glob("./data/*.sqlite")
PLATE_IDS = [Path(sql_file).stem for sql_file in sql_paths]


# importing DAGs
include: "rules/preprocessing.smk"


include: "rules/feature_select.smk"


rule all:
input:
# expected outputs from the first DAG "Preprocessing"
expand("results/preprocessing/{plate_id}_aggregate.csv.gz", plate_id=PLATE_IDS),
expand("results/preprocessing/{plate_id}_aggregate.csv.gz", plate_id=PLATE_IDS), # expected outputs from the first DAG "Preprocessing"
expand("results/preprocessing/{plate_id}_cell_counts.tsv", plate_id=PLATE_IDS),
expand("results/preprocessing/{plate_id}_augmented.csv.gz", plate_id=PLATE_IDS),
expand("results/preprocessing/{plate_id}_normalized.csv.gz", plate_id=PLATE_IDS)
expand("results/preprocessing/{plate_id}_normalized.csv.gz", plate_id=PLATE_IDS),
expand(
"results/preprocessing/{plate_id}_feature_select.csv.gz",
plate_id=PLATE_IDS,
),
"results/preprocessing/consensus.csv",
11 changes: 11 additions & 0 deletions configs/analysis_configs/aggregate_configs.yaml
@@ -0,0 +1,11 @@
aggregate_configs:
params:
strata: ["Metadata_Plate", "Metadata_Well"]
features: "infer"
operation: "median"
output_file: "none"
compute_object_count: False
object_feature: "Metadata_ObjectNumber"
subset_data_df: "none"
compression_options: None
float_format: None
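
For reference, a rough sketch of how the aggregate_configs params block could be read back and handed to pycytominer's aggregate function; the input profile path here is a placeholder, not a file produced by this PR:

# rough sketch; the input path is a placeholder
import yaml
import pandas as pd
from pycytominer import aggregate

with open("configs/analysis_configs/aggregate_configs.yaml") as f:
    params = yaml.safe_load(f)["aggregate_configs"]["params"]

profiles = pd.read_csv("results/preprocessing/example_normalized.csv.gz")  # placeholder input
aggregated = aggregate(
    population_df=profiles,
    strata=params["strata"],
    features=params["features"],
    operation=params["operation"],
)
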
23 changes: 23 additions & 0 deletions configs/analysis_configs/feature_select_configs.yaml
@@ -0,0 +1,23 @@
feature_select_configs:
params:
features: infer
image_features: False
samples: all
operation:
- variance_threshold
- drop_na_columns
- correlation_threshold
- blocklist
- drop_outliers
- noise_removal
na_cutoff: 0.05
corr_threshold: 0.9
corr_method: pearson
freq_cut: 0.05
unique_cut: 0.1
compression_options: gzip
float_format: None
blocklist_file: None
outlier_cutoff: 15
noise_removal_perturb_groups: None
noise_removal_stdev_cutoff: None
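
One thing to keep in mind when this file is read back with yaml.safe_load (as the new feature_select.py script does): PyYAML maps null and ~ to Python None, but the bare token None is loaded as the string "None", so values such as float_format: None arrive as strings unless written as null. A small standalone illustration, not part of this PR:

# standalone illustration of PyYAML's parsing behavior
import yaml

print(yaml.safe_load("float_format: None"))     # {'float_format': 'None'}  (a string)
print(yaml.safe_load("float_format: null"))     # {'float_format': None}    (Python None)
print(yaml.safe_load("image_features: False"))  # {'image_features': False} (a bool)
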
2 changes: 2 additions & 0 deletions configs/configuration.yaml
@@ -3,3 +3,5 @@ config_paths:
single_cell: "configs/analysis_configs/single_cell_configs.yaml"
annotate: "configs/analysis_configs/annotate_configs.yaml"
normalize: "configs/analysis_configs/normalize_configs.yaml"
feature_select: "configs/analysis_configs/feature_select_configs.yaml"
aggregate: "configs/analysis_configs/aggregate_configs.yaml"
1 change: 1 addition & 0 deletions envs/cytominer_env.yaml
@@ -6,3 +6,4 @@ channels:
dependencies:
- python>=3.7.0
- conda-forge::pycytominer=0.1
- pyyaml
33 changes: 33 additions & 0 deletions rules/feature_select.smk
@@ -0,0 +1,33 @@
configfile: "configs/configuration.yaml"


rule feature_select:
input:
expand("results/preprocessing/{plate_id}_normalized.csv.gz", plate_id=PLATE_IDS),
output:
expand(
"results/preprocessing/{plate_id}_feature_select.csv.gz",
plate_id=PLATE_IDS,
),
params:
feature_select_config=config["config_paths"]["feature_select"],
conda:
"../envs/cytominer_env.yaml"
script:
"../scripts/feature_select.py"


rule create_consensus:
input:
expand(
Review comment (Member):
Warning: beyond the scope of this PR.

Sometimes we want to create consensus profiles from more than just feature-selected files.

We may also want to perform feature selection on all profiles (instead of just by plate).

How would this work? (We can discuss later too, but just something to keep in mind.)

Reply from @axiomcura (Member Author), May 13, 2022:
Wrote it down in my notes! I'll see what Snakemake has to offer.

Based on my current knowledge, it seems this can be done at the configuration level: we can provide a flag indicating the type of consensus we want, and the script can handle it.

In addition, I feel the consensus step is general enough to be a modular function that we call whenever we generate different consensus files (see the sketch after this file's diff).

"results/preprocessing/{plate_id}_feature_select.csv.gz",
plate_id=PLATE_IDS,
),
output:
"results/preprocessing/consensus.csv.gz",
params:
aggregate_config=config["config_paths"]["aggregate"],
conda:
"../envs/cytominer_env.yaml"
script:
"../scripts/consensus.py"
7 changes: 4 additions & 3 deletions rules/preprocessing.smk
@@ -41,7 +41,8 @@ rule aggregate:
conda:
"../envs/cytominer_env.yaml"
params:
aggregate_config=config["config_paths"]["single_cell"]
aggregate_config=config["config_paths"]["single_cell"],
threads: 6
script:
"../scripts/aggregate_cells.py"

@@ -57,7 +58,7 @@ rule annotate:
conda:
"../envs/cytominer_env.yaml"
params:
annotate_config=config["config_paths"]["annotate"]
annotate_config=config["config_paths"]["annotate"],
script:
"../scripts/annotate.py"

@@ -70,6 +71,6 @@ rule normalize:
conda:
"../envs/cytominer_env.yaml"
params:
normalize_config=config["config_paths"]["normalize"]
normalize_config=config["config_paths"]["normalize"],
script:
"../scripts/normalize.py"
58 changes: 58 additions & 0 deletions scripts/consensus.py
@@ -0,0 +1,58 @@
from pathlib import Path
import numpy as np
import pandas as pd

from pycytominer.consensus import modz
from pycytominer import aggregate
from pycytominer.operations import get_na_columns


def concatenate_data(profile_list: list) -> pd.DataFrame:
"""Concatenates all normalized aggregated features into one
pandas DataFrame

Parameters
----------
profile_list : list
list of paths pointing to normalized aggregated features

Returns
-------
pd.DataFrame
concatenated normalized aggregated features
"""

concat_df = pd.concat(
[pd.read_csv(profile_path) for profile_path in profile_list], sort=True
).rename(
{
"Image_Metadata_Plate": "Metadata_Plate",
"Image_Metadata_Well": "Metadata_Well",
},
axis="columns",
)

# move the metadata columns to the front of the dataframe
concat_metadata_cols = concat_df.columns[
concat_df.columns.str.startswith("Metadata")
]
concat_metadata_df = concat_df.loc[:, concat_metadata_cols]
concat_df = concat_df.drop(concat_metadata_cols, axis="columns")
concat_df = pd.concat([concat_metadata_df, concat_df], axis="columns")

# dropping columns with na values
na_cols = get_na_columns(concat_df, cutoff=0)
concat_df = concat_df.drop(na_cols, axis="columns")

return concat_df


if __name__ == "__main__":

inputs = [str(f_in) for f_in in snakemake.input]
output = str(snakemake.output)

# concatenate all normalized aggregated profiles
concat_dataset = concatenate_data(inputs)
concat_dataset.to_csv(output, compression="gzip")
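
Since modz is imported above but not yet used, here is a rough sketch of how a modified z-score consensus could be built from the concatenated profiles; treating Metadata_Plate and Metadata_Well as the replicate columns is an assumption, not something fixed by this PR:

# rough sketch; the replicate columns are an assumption
from pycytominer.consensus import modz

consensus_modz = modz(
    concat_dataset,
    replicate_columns=["Metadata_Plate", "Metadata_Well"],
    features="infer",
)
consensus_modz.to_csv("results/preprocessing/consensus_modz.csv.gz", compression="gzip")
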
61 changes: 61 additions & 0 deletions scripts/feature_select.py
@@ -0,0 +1,61 @@
from pathlib import Path
import yaml
from pycytominer.feature_select import feature_select


def feature_selection(normalized_profile, out_file):
"""Performs feature selection based on the given parameters explained
in the configs/analysis_configs/feature_select_configs.yaml file.

Parameters
----------
normalized_profile : str
path that points to normalized profile

out_file : str
path of the generated output file

Returns
-------
None; the feature-selected profile is written to out_file
"""

# loading parameters
feature_select_ep = Path(snakemake.params["feature_select_config"])
feature_select_config_path = feature_select_ep.absolute()
with open(feature_select_config_path, "r") as yaml_contents:
feature_select_config = yaml.safe_load(yaml_contents)["feature_select_configs"][
"params"
]

feature_select(
normalized_profile,
features=feature_select_config["features"],
image_features=feature_select_config["image_features"],
samples=feature_select_config["samples"],
operation=feature_select_config["operation"],
na_cutoff=feature_select_config["na_cutoff"],
corr_threshold=feature_select_config["corr_threshold"],
corr_method=feature_select_config["corr_method"],
freq_cut=feature_select_config["freq_cut"],
unique_cut=feature_select_config["unique_cut"],
compression_options=feature_select_config["compression_options"],
float_format=feature_select_config["float_format"],
blocklist_file=feature_select_config["blocklist_file"],
outlier_cutoff=feature_select_config["outlier_cutoff"],
noise_removal_perturb_groups=feature_select_config[
"noise_removal_perturb_groups"
],
noise_removal_stdev_cutoff=feature_select_config["noise_removal_stdev_cutoff"],
output_file=out_file,
)


if __name__ == "__main__":
norm_data = [str(f_in) for f_in in snakemake.input]
out_files = [str(f_out) for f_out in snakemake.output]
io_files = zip(norm_data, out_files)

# iteratively passing normalized data
for norm_data, out_file in io_files:
feature_selection(norm_data, out_file)