Adding functionality to aggregate and annotate DeepProfiler output #78

Merged Jun 4, 2021 (36 commits)
Changes from 10 commits

Commits
ea261bf
add functions to aggregate and annotate DeepProfiler output
gwaybio May 8, 2020
2b3b17f
Merge remote-tracking branch 'upstream/master' into add-deepprofiler-…
gwaybio Aug 14, 2020
5c6f5c0
move load_npz() to cyto_utils.load
gwaybio Aug 14, 2020
f3f8e98
Merge remote-tracking branch 'upstream/master' into add-deepprofiler-…
gwaybio Sep 24, 2020
9df2936
add feature prefix loading from metadata npz
gwaybio Sep 25, 2020
60566fd
black on __init__
gwaybio Sep 25, 2020
d872010
add load_npz() tests
gwaybio Sep 25, 2020
f3ffc84
add assertions for real data
gwaybio Sep 25, 2020
a7c9c92
Merge remote-tracking branch 'upstream/master' into add-deepprofiler-…
gwaybio May 14, 2021
519a127
First commit, updated docstring
michaelbornholdt May 19, 2021
d237b41
add deepprofiler testing data
gwaybio May 19, 2021
7c62275
Add util files to init
michaelbornholdt May 19, 2021
1879349
adding docstrings
michaelbornholdt May 19, 2021
b899e58
start test file
michaelbornholdt May 19, 2021
8d26729
Merge remote-tracking branch 'gwaygenomics/add-deepprofiler-processin…
michaelbornholdt May 19, 2021
c01efba
Fixed the main function, I hope
michaelbornholdt May 20, 2021
17f92c6
Add first test run
michaelbornholdt May 20, 2021
f52b640
Add first test run
michaelbornholdt May 20, 2021
dcc99b4
Add example data 1
michaelbornholdt May 20, 2021
95ce6c0
Fixed some docstring
michaelbornholdt May 20, 2021
91d14cb
Further update and run black
michaelbornholdt May 20, 2021
92be40c
Black
michaelbornholdt May 20, 2021
c8b98b1
Add second version of data, for additional tests
michaelbornholdt May 20, 2021
f1d9a18
less data
michaelbornholdt May 20, 2021
200e0b1
move test data
michaelbornholdt May 20, 2021
112ab81
fix final querks
michaelbornholdt May 21, 2021
cb8db7e
final version of test data
michaelbornholdt May 21, 2021
703a0b5
add all tests
michaelbornholdt May 21, 2021
406695b
fix import
michaelbornholdt May 21, 2021
412dfa4
minor updates, mostly documentation
gwaybio May 21, 2021
4573ef3
run black
gwaybio May 21, 2021
c060c49
fix variable name
gwaybio May 21, 2021
b7ec71f
minor documentation updates
gwaybio May 21, 2021
353cb38
make sure metadata columns are strings
gwaybio May 24, 2021
d2db947
reduce redundancy
gwaybio May 24, 2021
4074b56
update docstring for infer_delim
gwaybio Jun 4, 2021
160 changes: 160 additions & 0 deletions pycytominer/cyto_utils/DeepProfiler_processing.py
@@ -0,0 +1,160 @@
"""
Utility function to load and process the output files of a DeepProfiler run.
"""
import os
import pathlib
import numpy as np
import pandas as pd

from pycytominer import aggregate
from pycytominer.cyto_utils import infer_cp_features, load_npz


class AggregateDeepProfiler:
"""This class holds all functions needed to load and annotate the DeepProfiler run.

If the class has public attributes, they may be documented here
in an ``Attributes`` section and follow the same formatting as a
function's ``Args`` section. Alternatively, attributes may be documented
inline with the attribute's declaration (see __init__ method below).

Properties created with the ``@property`` decorator should be documented
in the property's getter method.

Attributes
----------
attr1 : str
Description of `attr1`.
attr2 : :obj:`int`, optional
Description of `attr2`.
michaelbornholdt marked this conversation as resolved.
Show resolved Hide resolved

"""
    def __init__(
        self,
        index_file,
        profile_dir,
        aggregate_operation="median",
        aggregate_on="well",
        file_delimiter="_",
        file_extension=".npz",
    ):
        self.index_file = index_file
        self.profile_dir = profile_dir
        self.aggregate_operation = aggregate_operation
        self.aggregate_on = aggregate_on
        self.file_delimiter = file_delimiter
        self.file_extension = file_extension
        if not self.file_extension.startswith("."):
            self.file_extension = f".{self.file_extension}"
        self.index_df = pd.read_csv(index_file)

    def build_filenames(self):
        """
        Build the single cell profile file names from the plate, well, and site
        information in the index file.
        """
        self.filenames = self.index_df.apply(
            self.build_filename_from_index, axis="columns"
        )
        self.filenames = [
            pathlib.PurePath(f"{self.profile_dir}/{x}") for x in self.filenames
        ]

    def build_filename_from_index(self, row):
        """
        Build a single profile file name from one row of the index file.
        """
        plate = row["Metadata_Plate"]
        well = row["Metadata_Well"]
        site = row["Metadata_Site"]

        filename = f"{plate}_{well}_{site}{self.file_extension}"
        return filename

    def extract_filename_metadata(self, npz_file, delimiter="_"):
        """
        Extract plate, well, and site information from a profile file name of
        the format: plate_well_site.npz
        """
        base_file = os.path.splitext(os.path.basename(npz_file))[0].split(delimiter)
        site = base_file[-1]
        well = base_file[-2]
        plate = delimiter.join(base_file[:-2])

        return {"site": site, "well": well, "plate": plate}

    def setup_aggregate(self):
        """
        Assign each profile file to its aggregation level (well or site) in the
        self.file_aggregate dictionary.
        """
        if not hasattr(self, "filenames"):
            self.build_filenames()

        self.file_aggregate = {}
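        # Resulting structure (keys illustrative), aggregating on "well":
        # {"plateA_A01": {"files": [<site-level .npz paths>],
        #                 "metadata": {"site": ..., "well": "A01", "plate": "plateA"}}}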
        for filename in self.filenames:
            file_info = self.extract_filename_metadata(filename, self.file_delimiter)
            file_key = file_info[self.aggregate_on]

            if self.aggregate_on == "site":
                file_key = (
                    f"{file_info['plate']}_{file_info['well']}_{file_info['site']}"
                )

            if self.aggregate_on == "well":
                file_key = f"{file_info['plate']}_{file_info['well']}"

            if file_key in self.file_aggregate:
                self.file_aggregate[file_key]["files"].append(filename)
            else:
                self.file_aggregate[file_key] = {}
                self.file_aggregate[file_key]["files"] = [filename]

            self.file_aggregate[file_key]["metadata"] = file_info

    def aggregate_deep(self):
        """
        Aggregate the single cell profiles to the level given by
        self.aggregate_on and store the result in self.aggregated_profiles.
        """
        if not hasattr(self, "file_aggregate"):
            self.setup_aggregate()

        self.aggregated_profiles = []
        self.aggregate_merge_col = f"Metadata_{self.aggregate_on.capitalize()}_Position"

        for metadata_level in self.file_aggregate:
            df = pd.concat(
                [load_npz(x) for x in self.file_aggregate[metadata_level]["files"]]
            )
            meta_df = pd.DataFrame(
                self.file_aggregate[metadata_level]["metadata"], index=[0]
            )
            meta_df.columns = [f"Metadata_{x.capitalize()}" for x in meta_df.columns]

            if self.aggregate_on == "well":
                meta_df = meta_df.drop("Metadata_Site", axis="columns")

            features = df.columns.tolist()
            df = df.assign(Metadata_Aggregate_On=self.aggregate_on)
            df = aggregate(
                population_df=df,
                strata="Metadata_Aggregate_On",
                features=features,
                operation=self.aggregate_operation,
            )
            df.loc[:, self.aggregate_merge_col] = metadata_level
            df = meta_df.merge(df, left_index=True, right_index=True)
            self.aggregated_profiles.append(df)

        self.aggregated_profiles = pd.concat([x for x in self.aggregated_profiles])
        self.aggregated_profiles.columns = [
            str(x) for x in self.aggregated_profiles.columns
        ]
        meta_features = infer_cp_features(self.aggregated_profiles, metadata=True)
        reindex_features = [str(x) for x in features]
        self.aggregated_profiles = self.aggregated_profiles.reindex(
            meta_features + reindex_features, axis="columns"
        )

    def annotate_deep(
        self, annotate_cols, merge_cols=["Metadata_Plate", "Metadata_Well"]
    ):
        """
        Merge additional metadata columns from the index file into the
        aggregated profiles. annotate_cols must include the merge_cols.
        """
        if not hasattr(self, "aggregated_profiles"):
            self.aggregate_deep()

        meta_df = self.index_df.loc[:, annotate_cols].drop_duplicates()

        meta_df.columns = [
            "Metadata_{}".format(x) if not x.startswith("Metadata_") else x
            for x in meta_df.columns
        ]

        return meta_df.merge(self.aggregated_profiles, on=merge_cols, how="inner")
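For orientation, a minimal usage sketch of the class above (the paths and the "Compound" annotation column are hypothetical; the calls follow the methods in this file):

from pycytominer.cyto_utils.DeepProfiler_processing import AggregateDeepProfiler

# Hypothetical DeepProfiler output locations
deep_profiler_ap = AggregateDeepProfiler(
    index_file="inputs/metadata/index.csv",
    profile_dir="outputs/results/features",
    aggregate_operation="median",
    aggregate_on="well",
)

# Builds the file names, loads each .npz, and aggregates to the well level
deep_profiler_ap.aggregate_deep()

# Merge extra index-file columns onto the aggregated profiles
annotated_df = deep_profiler_ap.annotate_deep(
    annotate_cols=["Metadata_Plate", "Metadata_Well", "Compound"]
)
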
5 changes: 1 addition & 4 deletions pycytominer/cyto_utils/__init__.py
@@ -15,10 +15,7 @@
     assert_linking_cols_complete,
     provide_linking_cols_feature_name_update,
 )
-from .load import (
-    load_profiles,
-    load_platemap,
-)
+from .load import load_profiles, load_platemap, load_npz, infer_delim
 from .features import (
     get_blocklist_features,
     label_compartment,
47 changes: 47 additions & 0 deletions pycytominer/cyto_utils/load.py
@@ -1,5 +1,6 @@
import csv
import gzip
import numpy as np
import pandas as pd


@@ -68,3 +69,49 @@ def load_platemap(platemap, add_metadata_id=True):
        for x in platemap.columns
    ]
    return platemap


def load_npz(npz_file, fallback_feature_prefix="DP"):
    """
    Load an npz file storing features and, sometimes, metadata.

    Arguments:
    npz_file - file path to the compressed output (typically DeepProfiler output)
    fallback_feature_prefix - a string to prefix all features [default: "DP"].
        The function will first search the .npz file for a metadata column called
        "Metadata_Model". If the field exists, the function uses this entry as the
        feature prefix. If it doesn't exist, use the fallback_feature_prefix.
    """
    # Metadata is stored as a pickled dictionary, so allow_pickle is required
    npz = np.load(npz_file, allow_pickle=True)
    files = npz.files

    # Load features
    df = pd.DataFrame(npz["features"])

    # Load metadata
    if "metadata" in files:
        metadata = npz["metadata"].item()
        metadata_df = pd.DataFrame(metadata, index=range(0, df.shape[0]))
        metadata_df.columns = [
            f"Metadata_{x}" if not x.startswith("Metadata_") else x for x in metadata_df
        ]

        # Determine the appropriate metadata prefix
        if "Metadata_Model" in metadata_df.columns:
            feature_prefix = metadata_df.Metadata_Model.unique()[0]
        else:
            feature_prefix = fallback_feature_prefix
    else:
        feature_prefix = fallback_feature_prefix

    # Append feature prefix
    df.columns = [
        f"{feature_prefix}_{x}" if not str(x).startswith(feature_prefix) else x
        for x in df
    ]

    # Append metadata with features
    if "metadata" in files:
        df = metadata_df.merge(df, how="outer", left_index=True, right_index=True)

    return df
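A quick sketch of the prefix behavior described in the docstring (the file name and values are illustrative; the tests added below exercise the same code paths):

import numpy as np
from pycytominer.cyto_utils import load_npz

# Two cells x two features plus DeepProfiler-style metadata
features = np.array([[0.1, 0.2], [0.3, 0.4]])
metadata = {"Plate": "plateA", "Well": "A01", "Site": 1, "Model": "cnn"}
np.savez_compressed("example.npz", features=features, metadata=metadata)

df = load_npz("example.npz")
# Metadata_Model is present, so its value ("cnn") prefixes the features:
# Metadata_Plate, Metadata_Well, Metadata_Site, Metadata_Model, cnn_0, cnn_1
print(df.columns.tolist())
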
71 changes: 69 additions & 2 deletions pycytominer/tests/test_cyto_utils/test_load.py
@@ -4,21 +4,34 @@
import tempfile
import numpy as np
import pandas as pd
-from pycytominer.cyto_utils import load_profiles, load_platemap
+from pycytominer.cyto_utils import load_profiles, load_platemap, load_npz
+from pycytominer.cyto_utils.load import infer_delim

random.seed(123)

# Get temporary directory
tmpdir = tempfile.gettempdir()

-# Lauch a sqlite connection
+# Set file paths for data-to-be-loaded
output_data_file = os.path.join(tmpdir, "test_data.csv")
output_data_comma_file = os.path.join(tmpdir, "test_data_comma.csv")
output_data_gzip_file = "{}.gz".format(output_data_file)
output_platemap_file = os.path.join(tmpdir, "test_platemap.csv")
output_platemap_comma_file = os.path.join(tmpdir, "test_platemap_comma.csv")
output_platemap_file_gzip = "{}.gz".format(output_platemap_file)
output_npz_file = os.path.join(tmpdir, "test_npz.npz")
output_npz_with_model_file = os.path.join(tmpdir, "test_npz_withmodel.npz")
output_npz_without_metadata_file = os.path.join(tmpdir, "test_npz_withoutmetadata.npz")

# Example .npz file with real data
example_npz_file = os.path.join(
    os.path.dirname(__file__),
    "..",
    "..",
    "data",
    "DeepProfiler_example_data",
    "Week1_22123_B02_s1.npz",
)

# Build data to use in tests
data_df = pd.concat(
@@ -39,6 +52,10 @@
    }
).reset_index(drop=True)

npz_metadata_dict = {"Plate": "PlateA", "Well": "A01", "Site": 2}
npz_model_key = {"Model": "cnn"}
npz_feats = data_df.drop("Metadata_Well", axis="columns").values

# Write to temp files
data_df.to_csv(output_data_file, sep="\t", index=False)
data_df.to_csv(output_data_comma_file, sep=",", index=False)
@@ -47,6 +64,17 @@
platemap_df.to_csv(output_platemap_comma_file, sep=",", index=False)
platemap_df.to_csv(output_platemap_file_gzip, sep="\t", index=False, compression="gzip")

# Write npz temp files
key_values = {k: npz_metadata_dict[k] for k in npz_metadata_dict.keys()}
npz_metadata_dict.update(npz_model_key)
key_with_model_values = {k: npz_metadata_dict[k] for k in npz_metadata_dict.keys()}

np.savez_compressed(output_npz_file, features=npz_feats, metadata=key_values)
np.savez_compressed(
    output_npz_with_model_file, features=npz_feats, metadata=key_with_model_values
)
np.savez_compressed(output_npz_without_metadata_file, features=npz_feats)


def test_infer_delim():
    delim = infer_delim(output_platemap_file)
@@ -88,3 +116,42 @@ def test_load_platemap():
    platemap_with_annotation = load_platemap(output_platemap_file, add_metadata_id=True)
    platemap_df.columns = [f"Metadata_{x}" for x in platemap_df.columns]
    pd.testing.assert_frame_equal(platemap_with_annotation, platemap_df)


def test_load_npz():
    npz_df = load_npz(output_npz_file)
    npz_custom_prefix_df = load_npz(output_npz_file, fallback_feature_prefix="test")
    npz_with_model_df = load_npz(output_npz_with_model_file)
    npz_no_meta_df = load_npz(output_npz_without_metadata_file)
    real_data_df = load_npz(example_npz_file)

    core_cols = ["Metadata_Plate", "Metadata_Well", "Metadata_Site"]

    assert npz_df.shape == (6, 5)
    assert npz_df.columns.tolist() == core_cols + ["DP_0", "DP_1"]

    assert npz_custom_prefix_df.shape == (6, 5)
    assert npz_custom_prefix_df.columns.tolist() == core_cols + ["test_0", "test_1"]

    assert npz_with_model_df.shape == (6, 6)
    assert npz_with_model_df.columns.tolist() == core_cols + [
        "Metadata_Model",
        "cnn_0",
        "cnn_1",
    ]

    assert npz_no_meta_df.shape == (6, 2)
    assert npz_no_meta_df.columns.tolist() == ["DP_0", "DP_1"]

    pd.testing.assert_frame_equal(
        npz_df.drop(core_cols, axis="columns"), npz_no_meta_df
    )

    # Check real data
    assert real_data_df.shape == (206, 54)
    assert all([x in real_data_df.columns for x in core_cols + ["Metadata_Model"]])
    assert len(real_data_df.Metadata_Model.unique()) == 1
    assert real_data_df.Metadata_Model.unique()[0] == "cnn"
    assert real_data_df.drop(
        core_cols + ["Metadata_Model"], axis="columns"
    ).columns.tolist() == [f"cnn_{x}" for x in range(0, 50)]