Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding functionality to aggregate and annotate DeepProfiler output #78

Merged
merged 36 commits into from
Jun 4, 2021
Merged
Show file tree
Hide file tree
Changes from 35 commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
ea261bf
add functions to aggregate and annotate DeepProfiler output
gwaybio May 8, 2020
2b3b17f
Merge remote-tracking branch 'upstream/master' into add-deepprofiler-…
gwaybio Aug 14, 2020
5c6f5c0
move load_npz() to cyto_utils.load
gwaybio Aug 14, 2020
f3f8e98
Merge remote-tracking branch 'upstream/master' into add-deepprofiler-…
gwaybio Sep 24, 2020
9df2936
add feature prefix loading from metadata npz
gwaybio Sep 25, 2020
60566fd
black on __init__
gwaybio Sep 25, 2020
d872010
add load_npz() tests
gwaybio Sep 25, 2020
f3ffc84
add assertions for real data
gwaybio Sep 25, 2020
a7c9c92
Merge remote-tracking branch 'upstream/master' into add-deepprofiler-…
gwaybio May 14, 2021
519a127
First commit, updated docstring
michaelbornholdt May 19, 2021
d237b41
add deepprofiler testing data
gwaybio May 19, 2021
7c62275
Add util files to init
michaelbornholdt May 19, 2021
1879349
adding docstrings
michaelbornholdt May 19, 2021
b899e58
start test file
michaelbornholdt May 19, 2021
8d26729
Merge remote-tracking branch 'gwaygenomics/add-deepprofiler-processin…
michaelbornholdt May 19, 2021
c01efba
Fixed the main function, I hope
michaelbornholdt May 20, 2021
17f92c6
Add first test run
michaelbornholdt May 20, 2021
f52b640
Add first test run
michaelbornholdt May 20, 2021
dcc99b4
Add example data 1
michaelbornholdt May 20, 2021
95ce6c0
Fixed some docstring
michaelbornholdt May 20, 2021
91d14cb
Further update and run black
michaelbornholdt May 20, 2021
92be40c
Black
michaelbornholdt May 20, 2021
c8b98b1
Add second version of data, for additional tests
michaelbornholdt May 20, 2021
f1d9a18
less data
michaelbornholdt May 20, 2021
200e0b1
move test data
michaelbornholdt May 20, 2021
112ab81
fix final querks
michaelbornholdt May 21, 2021
cb8db7e
final version of test data
michaelbornholdt May 21, 2021
703a0b5
add all tests
michaelbornholdt May 21, 2021
406695b
fix import
michaelbornholdt May 21, 2021
412dfa4
minor updates, mostly documentation
gwaybio May 21, 2021
4573ef3
run black
gwaybio May 21, 2021
c060c49
fix variable name
gwaybio May 21, 2021
b7ec71f
minor documentation updates
gwaybio May 21, 2021
353cb38
make sure metadata columns are strings
gwaybio May 24, 2021
d2db947
reduce redundancy
gwaybio May 24, 2021
4074b56
update docstring for infer_delim
gwaybio Jun 4, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
274 changes: 274 additions & 0 deletions pycytominer/cyto_utils/DeepProfiler_processing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,274 @@
"""
Utility function to load and process the output files of a DeepProfiler run.
"""
import os
import pathlib
import numpy as np
import pandas as pd
import warnings

from pycytominer import aggregate
from pycytominer.cyto_utils import load_npz, infer_cp_features


class AggregateDeepProfiler:
    """This class holds all functions needed to load and annotate the DeepProfiler (DP) run.

    Attributes
    ----------

    profile_dir : str
        file location of the output profiles from DeepProfiler
        (e.g. `/project1/outputs/results/features/`)
    aggregate_operation : ['median', 'mean']
        method of aggregation
    aggregate_on : ['site', 'well', 'plate']
        up to which level to aggregate
    filename_delimiter : default = '_'
        delimiter for the filenames of the profiles (e.g. B02_4.npz).
    file_extension : default = '.npz'
        extension of the profile file.
    index_df : pandas.DataFrame
        load in the index.csv file from DeepProfiler, provided by an input index file.
    filenames : list of paths
        list of Purepaths that point to the npz files.
    aggregated_profiles : pandas.DataFrame
        df to hold the metadata and profiles.
    file_aggregate : dict
        dict that holds the file names and metadata.
        Is used to load in the npz files in the correct order and grouping.

    Methods
    -------
    annotate_deep()
        Given an initialized AggregateDeepProfiler() class, run this function to output
        level 3 profiles (aggregated profiles with annotated metadata).

    """

    def __init__(
        self,
        index_file,
        profile_dir,
        aggregate_operation="median",
        aggregate_on="well",
        filename_delimiter="_",
        file_extension=".npz",
    ):
        """
        __init__ function for this class.

        Arguments
        ---------
        index_file : str
            file location of the index.csv from DP

        See above for all other parameters.
        """
        assert aggregate_operation in [
            "median",
            "mean",
        ], "Input of aggregate_operation is incorrect, it must be either median or mean"
        assert aggregate_on in [
            "site",
            "well",
            "plate",
        ], "Input of aggregate_on is incorrect, it must be either site or well or plate"

        self.aggregate_operation = aggregate_operation
        self.profile_dir = profile_dir
        self.aggregate_on = aggregate_on
        self.filename_delimiter = filename_delimiter
        # normalize the extension so downstream filename building always
        # produces "name.ext" regardless of whether the dot was supplied
        self.file_extension = file_extension
        if not self.file_extension.startswith("."):
            self.file_extension = f".{self.file_extension}"
        # read every column as string so plate/well/site identifiers keep
        # leading zeros and are safe to use in file paths
        self.index_df = pd.read_csv(index_file, dtype=str)

    def build_filenames(self):
        """
        Create file names indicated by plate, well, and site information
        """
        self.filenames = self.index_df.apply(
            self.build_filename_from_index, axis="columns"
        )
        self.filenames = [
            pathlib.PurePath(f"{self.profile_dir}/{x}") for x in self.filenames
        ]

    def build_filename_from_index(self, row):
        """
        Builds the name of the profile files

        Arguments
        ---------
        row : pandas.Series (or mapping)
            one row of the index, with Metadata_Plate, Metadata_Well and
            Metadata_Site entries

        Returns
        -------
        filename : str
            relative file name of the form "plate/well_site<extension>"
        """
        plate = row["Metadata_Plate"]
        well = row["Metadata_Well"]
        site = row["Metadata_Site"]

        filename = f"{plate}/{well}_{site}{self.file_extension}"
        return filename

    def extract_filename_metadata(self, npz_file, delimiter="_"):
        """
        Extract metadata (site, well and plate) from the filename.
        The input format of the file: path/plate/well_site.npz

        Arguments
        ---------
        npz_file : str
            file path

        delimiter : str
            the delimiter used in the naming convention of the files. default = '_'

        Returns
        -------
        loc : dict
            dict with metadata
        """
        # os.path.splitext() removes the trailing extension exactly once.
        # The previous str.strip(".npz") treated ".npz" as a character *set*
        # and mangled well/site names that begin or end with ".", "n", "p" or "z".
        base_file = os.path.splitext(os.path.basename(npz_file))[0].split(delimiter)
        site = base_file[-1]
        well = base_file[-2]
        # the plate is encoded as the parent directory of the npz file;
        # pathlib handles both posix and windows path separators
        plate = pathlib.PurePath(npz_file).parent.name

        loc = {"site": site, "well": well, "plate": plate}
        return loc

    def setup_aggregate(self):
        """
        Sets up the file_aggregate attribute. This is a helper function to aggregate_deep().

        the file_aggregate dictionary contains the file locations and metadata for each grouping.
        If for example we are grouping by well then the keys of self.file_aggregate would be:
        plate1/well1, plate1/well2, plate2/well1, etc.
        """
        if not hasattr(self, "filenames"):
            self.build_filenames()

        self.file_aggregate = {}
        for filename in self.filenames:
            file_info = self.extract_filename_metadata(
                filename, self.filename_delimiter
            )
            # default key (aggregate_on == "plate") is just the plate name
            file_key = file_info[self.aggregate_on]

            if self.aggregate_on == "site":
                file_key = (
                    f"{file_info['plate']}/{file_info['well']}_{file_info['site']}"
                )

            if self.aggregate_on == "well":
                file_key = f"{file_info['plate']}/{file_info['well']}"

            # collect all files that belong to the same grouping key
            if file_key in self.file_aggregate:
                self.file_aggregate[file_key]["files"].append(filename)
            else:
                self.file_aggregate[file_key] = {}
                self.file_aggregate[file_key]["files"] = [filename]

            self.file_aggregate[file_key]["metadata"] = file_info

    def aggregate_deep(self):
        """
        Aggregates the profiles into a pandas dataframe.

        For each key in file_aggregate, the profiles are loaded, concatenated and then aggregated.
        If files are missing, we throw a warning but continue the code.
        """
        if not hasattr(self, "file_aggregate"):
            self.setup_aggregate()

        self.aggregated_profiles = []
        self.aggregate_merge_col = f"Metadata_{self.aggregate_on.capitalize()}_Position"

        for metadata_level in self.file_aggregate:
            # uses custom load function to create df with metadata and profiles
            arr = [load_npz(x) for x in self.file_aggregate[metadata_level]["files"]]
            # empty dataframes from missing files are deleted
            arr = [x for x in arr if not x.empty]
            # if no files were found there is a miss-match between the index and the output files
            if not len(arr):
                warnings.warn(
                    f"No files for the key {metadata_level} could be found.\nThis program will continue, but be aware that this might induce errors!"
                )
                continue

            df = pd.concat(arr)

            # Prepare inputs for the aggregate function
            meta_df = pd.DataFrame(
                self.file_aggregate[metadata_level]["metadata"], index=[0]
            ).reset_index(drop=True)
            meta_df.columns = [
                f"Metadata_{x.capitalize()}" if not x.startswith("Metadata_") else x
                for x in meta_df.columns
            ]

            if self.aggregate_on == "well":
                # site resolution is collapsed away at the well level
                meta_df = (
                    meta_df.drop("Metadata_Site", axis="columns")
                    .drop_duplicates()
                    .reset_index(drop=True)
                )

            metadata_cols = [x for x in df if x.startswith("Metadata_")]
            profiles = [x for x in df.columns.tolist() if x not in metadata_cols]
            df = df.assign(Metadata_Aggregate_On=self.aggregate_on)
            df = aggregate.aggregate(
                population_df=df,
                strata="Metadata_Aggregate_On",
                features=profiles,
                operation=self.aggregate_operation,
            ).reset_index(drop=True)

            df.loc[:, self.aggregate_merge_col] = metadata_level
            # both frames are single-row after aggregation, so an index merge
            # simply glues the metadata columns in front of the profile columns
            df = meta_df.merge(df, left_index=True, right_index=True)
            self.aggregated_profiles.append(df)

        # Concatenate all of the above created profiles
        self.aggregated_profiles = pd.concat(self.aggregated_profiles).reset_index(
            drop=True
        )
        self.aggregated_profiles.columns = [
            str(x) for x in self.aggregated_profiles.columns
        ]
        # order columns: metadata first, then the feature space
        meta_features = infer_cp_features(self.aggregated_profiles, metadata=True)
        reindex_profiles = [str(x) for x in profiles]
        self.aggregated_profiles = self.aggregated_profiles.reindex(
            meta_features + reindex_profiles, axis="columns"
        )

    def annotate_deep(
        self,
    ):
        """
        Main function of this class. Merges the index df and the profiles back into one dataframe.

        Returns
        -------
        df_out : pandas.dataframe
            dataframe with all metadata and the feature space.
            This is the input to any further pycytominer or pycytominer-eval processing
        """
        if not hasattr(self, "aggregated_profiles"):
            self.aggregate_deep()

        # work on a copy so the caller's index_df is not mutated by the
        # column renaming below (the previous version aliased it in place)
        meta_df = self.index_df.copy()
        meta_df.columns = [
            "Metadata_{}".format(x) if not x.startswith("Metadata_") else x
            for x in meta_df.columns
        ]

        # prepare for merge with profiles
        if self.aggregate_on == "plate":
            meta_df = meta_df.drop(["Metadata_Site", "Metadata_Well"], axis="columns")
            merge_cols = ["Metadata_Plate"]

        elif self.aggregate_on == "well":
            meta_df = meta_df.drop("Metadata_Site", axis="columns")
            merge_cols = ["Metadata_Well", "Metadata_Plate"]

        elif self.aggregate_on == "site":
            merge_cols = ["Metadata_Well", "Metadata_Plate", "Metadata_Site"]

        meta_df = meta_df.drop_duplicates(subset=merge_cols)
        df_out = meta_df.merge(self.aggregated_profiles, on=merge_cols, how="inner")
        return df_out
6 changes: 2 additions & 4 deletions pycytominer/cyto_utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,7 @@
assert_linking_cols_complete,
provide_linking_cols_feature_name_update,
)
from .load import (
load_profiles,
load_platemap,
)
from .load import load_profiles, load_platemap, load_npz, infer_delim
from .features import (
get_blocklist_features,
label_compartment,
Expand All @@ -30,3 +27,4 @@
from .write_gct import write_gct
from .modz import modz
from .annotate_custom import annotate_cmap, cp_clean
from .DeepProfiler_processing import AggregateDeepProfiler
Loading