Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implementing DeepProfiler single cell output and normalization #210

Merged
merged 22 commits into from
Jun 28, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
300 changes: 257 additions & 43 deletions pycytominer/cyto_utils/DeepProfiler_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,22 +7,24 @@
import pandas as pd
import warnings

from pycytominer import aggregate
from pycytominer.cyto_utils import load_npz, infer_cp_features
from pycytominer import aggregate, normalize
from pycytominer.cyto_utils import (
load_npz_features,
load_npz_locations,
infer_cp_features,
output,
)


class AggregateDeepProfiler:
class DeepProfilerData:

"""This class holds all functions needed to load and annotate the DeepProfiler (DP) run.

Attributes
----------
profile_dir : str
file location of the output profiles from DeepProfiler
(e.g. `/project1/outputs/results/features/`)
aggregate_operation : ['median', 'mean']
method of aggregation
aggregate_on : ['site', 'well', 'plate']
up to which level to aggregate
filename_delimiter : default = '_'
delimiter for the filenames of the profiles (e.g. B02_4.npz).
file_extension : default = '.npz'
Expand All @@ -31,30 +33,21 @@ class AggregateDeepProfiler:
load in the index.csv file from DeepProfiler, provided by an input index file.
filenames : list of paths
list of Purepaths that point to the npz files.
aggregated_profiles : pandas.DataFrame
df to hold the metadata and profiles.
file_aggregate : dict
dict that holds the file names and metadata.
Is used to load in the npz files in the correct order and grouping.
output_file : str
If provided, will write annotated profiles to folder. Defaults to "none".

Methods
-------
aggregate_deep()
Given an initialized AggregateDeepProfiler() class, run this function to output
level 3 profiles (aggregated profiles with annotated metadata).
build_filenames()
build filenames from index_df
extract_filename_metadata(npz_file, delimiter="_")
get site, well, plate info for npz file
"""

def __init__(
self,
index_file,
profile_dir,
aggregate_operation="median",
aggregate_on="well",
filename_delimiter="_",
file_extension=".npz",
output_file="none",
):
"""
__init__ function for this class.
Expand All @@ -66,25 +59,13 @@ def __init__(

See above for all other parameters.
"""
assert aggregate_operation in [
"median",
"mean",
], "Input of aggregate_operation is incorrect, it must be either median or mean"
assert aggregate_on in [
"site",
"well",
"plate",
], "Input of aggregate_on is incorrect, it must be either site or well or plate"

self.index_df = pd.read_csv(index_file, dtype=str)
self.profile_dir = profile_dir
self.aggregate_operation = aggregate_operation
self.aggregate_on = aggregate_on
self.filename_delimiter = filename_delimiter
self.file_extension = file_extension
if not self.file_extension.startswith("."):
self.file_extension = f".{self.file_extension}"
self.output_file = output_file

def build_filenames(self):
"""
Expand All @@ -105,13 +86,13 @@ def build_filename_from_index(self, row):
well = row["Metadata_Well"]
site = row["Metadata_Site"]

filename = f"{plate}/{well}_{site}{self.file_extension}"
filename = f"{plate}/{well}{self.filename_delimiter}{site}{self.file_extension}"
return filename

def extract_filename_metadata(self, npz_file, delimiter="_"):
    """
    Extract metadata (site, well and plate) from the filename.
    The input format of the file: path/plate/well{delimiter}site.npz

    Arguments
    ---------
    npz_file : str or path-like
        Path of the npz file. The path is split on "/" to find the plate
        (and, when delimiter is "/", the well) folder.
    delimiter : str, default "_"
        Separator between well and site. Use "/" when every well has its
        own folder (path/plate/well/site.npz).

    Returns
    -------
    loc : dict
        dict with metadata, using the keys "site", "well" and "plate"

    Raises
    ------
    IndexError
        If the path has fewer components than the expected layout.
    """
    suffix = ".npz"
    path_parts = str(npz_file).split("/")

    def _drop_suffix(name):
        # str.strip(".npz") removes any of the characters ".", "n", "p", "z"
        # from BOTH ends (e.g. "p12_n3.npz" -> "12_n3"), so only cut a real
        # trailing extension instead.
        return name[: -len(suffix)] if name.endswith(suffix) else name

    if delimiter == "/":
        # layout: path/plate/well/site.npz
        site = _drop_suffix(path_parts[-1])
        well = path_parts[-2]
        # the plate folder sits one level above the well folder
        plate = path_parts[-3]
    else:
        # layout: path/plate/well{delimiter}site.npz
        base_file = _drop_suffix(os.path.basename(npz_file)).split(delimiter)
        site = base_file[-1]
        well = base_file[-2]
        plate = path_parts[-2]

    loc = {"site": site, "well": well, "plate": plate}
    return loc


class AggregateDeepProfiler:

"""This class holds all functions needed to aggregate the DeepProfiler (DP) run.

Attributes
----------
deep_data : DeepProfilerData
DeepProfilerData object to load data from DeepProfiler project
aggregated_profiles : pandas.DataFrame
df to hold the metadata and profiles.
file_aggregate : dict
dict that holds the file names and metadata.
Is used to load in the npz files in the correct order and grouping.
output_file : str
If provided, will write annotated profiles to folder. Defaults to "none".

Methods
-------
aggregate_deep()
Given an initialized AggregateDeepProfiler() class, run this function to output
level 3 profiles (aggregated profiles with annotated metadata).

Example
-------
import pathlib
from pycytominer.cyto_utils import DeepProfiler_processing

index_file = pathlib.Path("path/to/index.csv")
profile_dir = pathlib.Path("path/to/features/")

deep_data = DeepProfiler_processing.DeepProfilerData(index_file, profile_dir, filename_delimiter="/", file_extension=".npz")
deep_aggregate = DeepProfiler_processing.AggregateDeepProfiler(deep_data)
aggregated_profiles = deep_aggregate.aggregate_deep()
"""

def __init__(
    self,
    deep_data: DeepProfilerData,
    aggregate_operation="median",
    aggregate_on="well",
    output_file="none",
):
    """
    Store the aggregation settings after validating them.

    Arguments
    ---------
    deep_data : DeepProfilerData
        DeepProfilerData object to load data from DeepProfiler project
    aggregate_operation : ['median', 'mean']
        method of aggregation
    aggregate_on : ['site', 'well', 'plate']
        up to which level to aggregate
    output_file : str
        If provided, will write annotated profiles to folder. Defaults to "none".

    Raises
    ------
    AssertionError
        If aggregate_operation or aggregate_on is not one of the accepted values.
    """
    accepted_operations = ("median", "mean")
    accepted_levels = ("site", "well", "plate")

    # validate both settings before any attribute is assigned
    assert (
        aggregate_operation in accepted_operations
    ), "Input of aggregate_operation is incorrect, it must be either median or mean"
    assert (
        aggregate_on in accepted_levels
    ), "Input of aggregate_on is incorrect, it must be either site or well or plate"

    self.output_file = output_file
    self.aggregate_on = aggregate_on
    self.aggregate_operation = aggregate_operation
    self.deep_data = deep_data

def setup_aggregate(self):
"""
Sets up the file_aggregate attribute. This is a helper function to aggregate_deep().
Expand All @@ -142,13 +192,13 @@ def setup_aggregate(self):
If for example we are grouping by well then the keys of self.file_aggregate would be:
plate1/well1, plate1/well2, plate2/well1, etc.
"""
if not hasattr(self, "filenames"):
self.build_filenames()
if not hasattr(self.deep_data, "filenames"):
self.deep_data.build_filenames()

self.file_aggregate = {}
for filename in self.filenames:
file_info = self.extract_filename_metadata(
filename, self.filename_delimiter
for filename in self.deep_data.filenames:
file_info = self.deep_data.extract_filename_metadata(
filename, self.deep_data.filename_delimiter
)
file_key = file_info[self.aggregate_on]

Expand Down Expand Up @@ -191,7 +241,10 @@ def aggregate_deep(self):
# Iterates over all sites, wells or plates
for metadata_level in self.file_aggregate:
# uses custom load function to create df with metadata and profiles
arr = [load_npz(x) for x in self.file_aggregate[metadata_level]["files"]]
arr = [
load_npz_features(x)
for x in self.file_aggregate[metadata_level]["files"]
]
# empty dataframes from missing files are deleted
arr = [x for x in arr if not x.empty]
# if no files were found there is a mismatch between the index and the output files
Expand Down Expand Up @@ -256,3 +309,164 @@ def aggregate_deep(self):

df_out = self.aggregated_profiles
return df_out


class SingleCellDeepProfiler:

    """This class holds functions needed to analyze single cells from the DeepProfiler (DP) run.
    Only normalization with pycytominer.normalize() is implemented.

    Attributes
    ----------
    deep_data : DeepProfilerData
        DeepProfilerData object to load data from DeepProfiler project
    single_cells : pandas.DataFrame
        Single cell locations and features of all loaded files.
        Only exists after get_single_cells(output=False) has been run.

    Methods
    -------
    get_single_cells(output=False)
        load locations and features of every cell into one dataframe
    normalize_deep_single_cells(sc_df, image_features, meta_features, samples, method, output_file,
    compression_options, float_format, mad_robustize_epsilon, spherize_center, spherize_method,
    spherize_epsilon)
        normalize profiling features from DeepProfiler run with pycytominer.normalize()

    Example
    -------
    import pathlib
    from pycytominer.cyto_utils import DeepProfiler_processing

    index_file = pathlib.Path("path/to/index.csv")
    profile_dir = pathlib.Path("path/to/features/")

    deep_data = DeepProfiler_processing.DeepProfilerData(index_file, profile_dir, filename_delimiter="/", file_extension=".npz")
    deep_single_cell = DeepProfiler_processing.SingleCellDeepProfiler(deep_data)
    normalized = deep_single_cell.normalize_deep_single_cells()
    """

    def __init__(
        self,
        deep_data: "DeepProfilerData",
    ):
        """
        __init__ function for this class.

        Arguments
        ---------
        deep_data : DeepProfilerData
            DeepProfilerData object to load data from DeepProfiler project
        """
        self.deep_data = deep_data

    def get_single_cells(self, output=False):
        """
        Sets up the single_cells attribute or output as a variable. This is a helper function to normalize_deep_single_cells().
        single_cells is a pandas dataframe in the format expected by pycytominer.normalize().

        Arguments
        -----------
        output : bool
            If true, will output the single cell dataframe instead of setting to self attribute

        Returns
        -------
        pandas.DataFrame or None
            The single cell dataframe if output is true, otherwise None.

        Raises
        ------
        ValueError
            If none of the files in deep_data.filenames provides any features.
        """
        # NOTE: inside this method the `output` argument hides the module level
        # output() helper; the helper is not needed here.

        # build filenames if they do not already exist
        if not hasattr(self.deep_data, "filenames"):
            self.deep_data.build_filenames()

        # compile features dataframe with single cell locations
        total_df = []
        for features_path in self.deep_data.filenames:
            features = load_npz_features(features_path)
            # skip a file if there are no features
            if len(features.index) == 0:
                warnings.warn(
                    f"No features could be found at {features_path}.\nThis program will continue, but be aware that this might induce errors!"
                )
                continue
            locations = load_npz_locations(features_path)
            detailed_df = pd.concat([locations, features], axis=1)

            total_df.append(detailed_df)

        # pd.concat([]) fails with "No objects to concatenate", which does not
        # tell the user what went wrong; keep the exception type but explain it.
        if not total_df:
            raise ValueError(
                "No single cell features could be loaded from the DeepProfiler output files. "
                "Check that the index file matches the files in the profile directory."
            )

        sc_df = pd.concat(total_df).reset_index(drop=True)
        if output:
            return sc_df
        self.single_cells = sc_df

    def normalize_deep_single_cells(
        self,
        sc_df="none",
        image_features=False,  # not implemented with DeepProfiler
        meta_features="infer",
        samples="all",
        method="standardize",
        output_file="none",
        compression_options=None,
        float_format=None,
        mad_robustize_epsilon=1e-18,
        spherize_center=True,
        spherize_method="ZCA-cor",
        spherize_epsilon=1e-6,
    ):
        """
        Normalizes all cells into a pandas dataframe.

        For each file in the DP project features folder, the features from each cell are loaded.
        These features are put into a profiles dataframe for use in pycytominer.normalize.
        A features list is also compiled for use in pycytominer.normalize.

        Arguments
        ---------
        sc_df : pandas.DataFrame or "none"
            Single cell dataframe to normalize, in the format returned by
            get_single_cells(output=True). If not a dataframe, the single_cells
            attribute is used (and built first when it does not exist yet).
        output_file : str
            If not "none", the normalized dataframe is also written with
            pycytominer.cyto_utils.output().
        compression_options, float_format
            Passed on to pycytominer.normalize() and to output().
        image_features, meta_features, samples, method, mad_robustize_epsilon,
        spherize_center, spherize_method, spherize_epsilon
            Passed on unchanged to pycytominer.normalize().

        Returns
        -------
        df_out : pandas.dataframe
            dataframe with all metadata and the feature space.
            This is the input to any further pycytominer or pycytominer-eval processing
        """
        if isinstance(sc_df, pd.DataFrame):
            # honor an explicitly provided dataframe (it used to be ignored)
            single_cells = sc_df
        else:
            # setup single_cells attribute
            if not hasattr(self, "single_cells"):
                self.get_single_cells(output=False)
            single_cells = self.single_cells

        # extract metadata prior to normalization
        metadata_cols = infer_cp_features(single_cells, metadata=True)
        # locations are not automatically inferred with cp features
        metadata_cols.append("Location_Center_X")
        metadata_cols.append("Location_Center_Y")
        derived_features = [
            x for x in single_cells.columns.tolist() if x not in metadata_cols
        ]

        # wrapper for pycytominer.normalize() function
        # output_file stays "none" here so that normalize() returns the dataframe
        normalized = normalize.normalize(
            profiles=single_cells,
            features=derived_features,
            image_features=image_features,
            meta_features=meta_features,
            samples=samples,
            method=method,
            output_file="none",
            compression_options=compression_options,
            float_format=float_format,
            mad_robustize_epsilon=mad_robustize_epsilon,
            spherize_center=spherize_center,
            spherize_method=spherize_method,
            spherize_epsilon=spherize_epsilon,
        )

        # move x locations and y locations to metadata columns of normalized df
        x_locations = single_cells["Location_Center_X"]
        normalized.insert(0, "Location_Center_X", x_locations)
        y_locations = single_cells["Location_Center_Y"]
        normalized.insert(1, "Location_Center_Y", y_locations)

        # separate code because normalize() will not return if it has an output file specified
        if output_file != "none":
            output(
                df=normalized,
                output_filename=output_file,
                compression_options=compression_options,
                float_format=float_format,
            )

        return normalized
Loading