Adding functionality to aggregate and annotate DeepProfiler output #78

Merged Jun 4, 2021 (36 commits)
Changes from 10 commits

Commits
ea261bf
add functions to aggregate and annotate DeepProfiler output
gwaybio May 8, 2020
2b3b17f
Merge remote-tracking branch 'upstream/master' into add-deepprofiler-…
gwaybio Aug 14, 2020
5c6f5c0
move load_npz() to cyto_utils.load
gwaybio Aug 14, 2020
f3f8e98
Merge remote-tracking branch 'upstream/master' into add-deepprofiler-…
gwaybio Sep 24, 2020
9df2936
add feature prefix loading from metadata npz
gwaybio Sep 25, 2020
60566fd
black on __init__
gwaybio Sep 25, 2020
d872010
add load_npz() tests
gwaybio Sep 25, 2020
f3ffc84
add assertions for real data
gwaybio Sep 25, 2020
a7c9c92
Merge remote-tracking branch 'upstream/master' into add-deepprofiler-…
gwaybio May 14, 2021
519a127
First commit, updated docstring
michaelbornholdt May 19, 2021
d237b41
add deepprofiler testing data
gwaybio May 19, 2021
7c62275
Add util files to init
michaelbornholdt May 19, 2021
1879349
adding docstrings
michaelbornholdt May 19, 2021
b899e58
start test file
michaelbornholdt May 19, 2021
8d26729
Merge remote-tracking branch 'gwaygenomics/add-deepprofiler-processin…
michaelbornholdt May 19, 2021
c01efba
Fixed the main function, I hope
michaelbornholdt May 20, 2021
17f92c6
Add first test run
michaelbornholdt May 20, 2021
f52b640
Add first test run
michaelbornholdt May 20, 2021
dcc99b4
Add example data 1
michaelbornholdt May 20, 2021
95ce6c0
Fixed some docstring
michaelbornholdt May 20, 2021
91d14cb
Further update and run black
michaelbornholdt May 20, 2021
92be40c
Black
michaelbornholdt May 20, 2021
c8b98b1
Add second version of data, for additional tests
michaelbornholdt May 20, 2021
f1d9a18
less data
michaelbornholdt May 20, 2021
200e0b1
move test data
michaelbornholdt May 20, 2021
112ab81
fix final querks
michaelbornholdt May 21, 2021
cb8db7e
final version of test data
michaelbornholdt May 21, 2021
703a0b5
add all tests
michaelbornholdt May 21, 2021
406695b
fix import
michaelbornholdt May 21, 2021
412dfa4
minor updates, mostly documentation
gwaybio May 21, 2021
4573ef3
run black
gwaybio May 21, 2021
c060c49
fix variable name
gwaybio May 21, 2021
b7ec71f
minor documentation updates
gwaybio May 21, 2021
353cb38
make sure metadata columns are strings
gwaybio May 24, 2021
d2db947
reduce redundancy
gwaybio May 24, 2021
4074b56
update docstring for infer_delim
gwaybio Jun 4, 2021
160 changes: 160 additions & 0 deletions pycytominer/cyto_utils/DeepProfiler_processing.py
@@ -0,0 +1,160 @@
"""
Utility function to load and process the output files of a DeepProfiler run.
"""
import os
import pathlib
import numpy as np
import pandas as pd

from pycytominer import aggregate
from pycytominer.cyto_utils import infer_cp_features, load_npz


class AggregateDeepProfiler:
"""This class holds all functions needed to load and annotate the DeepProfiler run.

If the class has public attributes, they may be documented here
in an ``Attributes`` section and follow the same formatting as a
function's ``Args`` section. Alternatively, attributes may be documented
inline with the attribute's declaration (see __init__ method below).

Properties created with the ``@property`` decorator should be documented
in the property's getter method.

Attributes
----------
attr1 : str
Description of `attr1`.
attr2 : :obj:`int`, optional
Description of `attr2`.
michaelbornholdt marked this conversation as resolved.
Show resolved Hide resolved

"""
    def __init__(
        self,
        index_file,
        profile_dir,
        aggregate_operation="median",
        aggregate_on="well",
        file_delimiter="_",
        file_extension=".npz",
    ):
        self.index_file = index_file
        self.profile_dir = profile_dir
        self.aggregate_operation = aggregate_operation
        self.aggregate_on = aggregate_on
        self.file_delimiter = file_delimiter
        self.file_extension = file_extension
        if not self.file_extension.startswith("."):
            self.file_extension = f".{self.file_extension}"
        self.index_df = pd.read_csv(index_file)

    def build_filenames(self):
        """
        Build the single cell profile file names from the plate, well, and site
        information in the index file.
        """
        self.filenames = self.index_df.apply(
            self.build_filename_from_index, axis="columns"
        )
        self.filenames = [
            pathlib.PurePath(f"{self.profile_dir}/{x}") for x in self.filenames
        ]

    def build_filename_from_index(self, row):
        """
        Build a single profile file name from one row of the index file.
        """
        plate = row["Metadata_Plate"]
        well = row["Metadata_Well"]
        site = row["Metadata_Site"]

        filename = f"{plate}_{well}_{site}{self.file_extension}"
        return filename

    def extract_filename_metadata(self, npz_file, delimiter="_"):
        """
        Extract plate, well, and site information from a profile file name of
        the format: plate_well_site.npz
        """
        base_file = os.path.splitext(os.path.basename(npz_file))[0].split(delimiter)
        site = base_file[-1]
        well = base_file[-2]
        plate = delimiter.join(base_file[:-2])

        return {"site": site, "well": well, "plate": plate}

    def setup_aggregate(self):
        """
        Assign each profile file to its aggregation level (well or site) in the
        self.file_aggregate dictionary.
        """
        if not hasattr(self, "filenames"):
            self.build_filenames()

        self.file_aggregate = {}
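        # Resulting structure (keys illustrative), aggregating on "well":
        # {"plateA_A01": {"files": [<site-level .npz paths>],
        #                 "metadata": {"site": ..., "well": "A01", "plate": "plateA"}}}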
        for filename in self.filenames:
            file_info = self.extract_filename_metadata(filename, self.file_delimiter)
            file_key = file_info[self.aggregate_on]

            if self.aggregate_on == "site":
                file_key = (
                    f"{file_info['plate']}_{file_info['well']}_{file_info['site']}"
                )

            if self.aggregate_on == "well":
                file_key = f"{file_info['plate']}_{file_info['well']}"

            if file_key in self.file_aggregate:
                self.file_aggregate[file_key]["files"].append(filename)
            else:
                self.file_aggregate[file_key] = {}
                self.file_aggregate[file_key]["files"] = [filename]

            self.file_aggregate[file_key]["metadata"] = file_info

    def aggregate_deep(self):
        """
        Aggregate the single cell profiles to the level given by
        self.aggregate_on and store the result in self.aggregated_profiles.
        """
        if not hasattr(self, "file_aggregate"):
            self.setup_aggregate()

        self.aggregated_profiles = []
        self.aggregate_merge_col = f"Metadata_{self.aggregate_on.capitalize()}_Position"

        for metadata_level in self.file_aggregate:
            df = pd.concat(
                [load_npz(x) for x in self.file_aggregate[metadata_level]["files"]]
            )
            meta_df = pd.DataFrame(
                self.file_aggregate[metadata_level]["metadata"], index=[0]
            )
            meta_df.columns = [f"Metadata_{x.capitalize()}" for x in meta_df.columns]

            if self.aggregate_on == "well":
                meta_df = meta_df.drop("Metadata_Site", axis="columns")

            features = df.columns.tolist()
            df = df.assign(Metadata_Aggregate_On=self.aggregate_on)
            df = aggregate(
                population_df=df,
                strata="Metadata_Aggregate_On",
                features=features,
                operation=self.aggregate_operation,
            )
            df.loc[:, self.aggregate_merge_col] = metadata_level
            df = meta_df.merge(df, left_index=True, right_index=True)
            self.aggregated_profiles.append(df)

        self.aggregated_profiles = pd.concat([x for x in self.aggregated_profiles])
        self.aggregated_profiles.columns = [
            str(x) for x in self.aggregated_profiles.columns
        ]
        meta_features = infer_cp_features(self.aggregated_profiles, metadata=True)
        reindex_features = [str(x) for x in features]
        self.aggregated_profiles = self.aggregated_profiles.reindex(
            meta_features + reindex_features, axis="columns"
        )

    def annotate_deep(
        self, annotate_cols, merge_cols=["Metadata_Plate", "Metadata_Well"]
    ):
        """
        Merge additional metadata columns from the index file into the
        aggregated profiles. annotate_cols must include the merge_cols.
        """
        if not hasattr(self, "aggregated_profiles"):
            self.aggregate_deep()

        meta_df = self.index_df.loc[:, annotate_cols].drop_duplicates()

        meta_df.columns = [
            "Metadata_{}".format(x) if not x.startswith("Metadata_") else x
            for x in meta_df.columns
        ]

        return meta_df.merge(self.aggregated_profiles, on=merge_cols, how="inner")
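For orientation, a minimal usage sketch of the class above (the paths and the "Compound" annotation column are hypothetical; the calls follow the methods in this file):

from pycytominer.cyto_utils.DeepProfiler_processing import AggregateDeepProfiler

# Hypothetical DeepProfiler output locations
deep_profiler_ap = AggregateDeepProfiler(
    index_file="inputs/metadata/index.csv",
    profile_dir="outputs/results/features",
    aggregate_operation="median",
    aggregate_on="well",
)

# Builds the file names, loads each .npz, and aggregates to the well level
deep_profiler_ap.aggregate_deep()

# Merge extra index-file columns onto the aggregated profiles
annotated_df = deep_profiler_ap.annotate_deep(
    annotate_cols=["Metadata_Plate", "Metadata_Well", "Compound"]
)
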
5 changes: 1 addition & 4 deletions pycytominer/cyto_utils/__init__.py
@@ -15,10 +15,7 @@
     assert_linking_cols_complete,
     provide_linking_cols_feature_name_update,
 )
-from .load import (
-    load_profiles,
-    load_platemap,
-)
+from .load import load_profiles, load_platemap, load_npz, infer_delim
 from .features import (
     get_blocklist_features,
     label_compartment,
47 changes: 47 additions & 0 deletions pycytominer/cyto_utils/load.py
@@ -1,5 +1,6 @@
import csv
import gzip
import numpy as np
import pandas as pd


@@ -68,3 +69,49 @@ def load_platemap(platemap, add_metadata_id=True):
        for x in platemap.columns
    ]
    return platemap


def load_npz(npz_file, fallback_feature_prefix="DP"):
    """
    Load an npz file storing features and, sometimes, metadata.

    Arguments:
    npz_file - file path to the compressed output (typically DeepProfiler output)
    fallback_feature_prefix - a string to prefix all features [default: "DP"].
        The function will first search the .npz file for a metadata column called
        "Metadata_Model". If the field exists, the function uses this entry as the
        feature prefix. If it doesn't exist, use the fallback_feature_prefix.
    """
    # Metadata is stored as a pickled dictionary, so allow_pickle is required
    npz = np.load(npz_file, allow_pickle=True)
    files = npz.files

    # Load features
    df = pd.DataFrame(npz["features"])

    # Load metadata
    if "metadata" in files:
        metadata = npz["metadata"].item()
        metadata_df = pd.DataFrame(metadata, index=range(0, df.shape[0]))
        metadata_df.columns = [
            f"Metadata_{x}" if not x.startswith("Metadata_") else x for x in metadata_df
        ]

        # Determine the appropriate metadata prefix
        if "Metadata_Model" in metadata_df.columns:
            feature_prefix = metadata_df.Metadata_Model.unique()[0]
        else:
            feature_prefix = fallback_feature_prefix
    else:
        feature_prefix = fallback_feature_prefix

    # Append feature prefix
    df.columns = [
        f"{feature_prefix}_{x}" if not str(x).startswith(feature_prefix) else x
        for x in df
    ]

    # Append metadata with features
    if "metadata" in files:
        df = metadata_df.merge(df, how="outer", left_index=True, right_index=True)

    return df
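A quick sketch of the prefix behavior described in the docstring (the file name and values are illustrative; the tests added below exercise the same code paths):

import numpy as np
from pycytominer.cyto_utils import load_npz

# Two cells x two features plus DeepProfiler-style metadata
features = np.array([[0.1, 0.2], [0.3, 0.4]])
metadata = {"Plate": "plateA", "Well": "A01", "Site": 1, "Model": "cnn"}
np.savez_compressed("example.npz", features=features, metadata=metadata)

df = load_npz("example.npz")
# Metadata_Model is present, so its value ("cnn") prefixes the features:
# Metadata_Plate, Metadata_Well, Metadata_Site, Metadata_Model, cnn_0, cnn_1
print(df.columns.tolist())
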
71 changes: 69 additions & 2 deletions pycytominer/tests/test_cyto_utils/test_load.py
@@ -4,21 +4,34 @@
import tempfile
import numpy as np
import pandas as pd
-from pycytominer.cyto_utils import load_profiles, load_platemap
+from pycytominer.cyto_utils import load_profiles, load_platemap, load_npz
+from pycytominer.cyto_utils.load import infer_delim

random.seed(123)

# Get temporary directory
tmpdir = tempfile.gettempdir()

-# Lauch a sqlite connection
+# Set file paths for data-to-be-loaded
output_data_file = os.path.join(tmpdir, "test_data.csv")
output_data_comma_file = os.path.join(tmpdir, "test_data_comma.csv")
output_data_gzip_file = "{}.gz".format(output_data_file)
output_platemap_file = os.path.join(tmpdir, "test_platemap.csv")
output_platemap_comma_file = os.path.join(tmpdir, "test_platemap_comma.csv")
output_platemap_file_gzip = "{}.gz".format(output_platemap_file)
output_npz_file = os.path.join(tmpdir, "test_npz.npz")
output_npz_with_model_file = os.path.join(tmpdir, "test_npz_withmodel.npz")
output_npz_without_metadata_file = os.path.join(tmpdir, "test_npz_withoutmetadata.npz")

# Example .npz file with real data
example_npz_file = os.path.join(
    os.path.dirname(__file__),
    "..",
    "..",
    "data",
    "DeepProfiler_example_data",
    "Week1_22123_B02_s1.npz",
)

# Build data to use in tests
data_df = pd.concat(
@@ -39,6 +52,10 @@
    }
).reset_index(drop=True)

npz_metadata_dict = {"Plate": "PlateA", "Well": "A01", "Site": 2}
npz_model_key = {"Model": "cnn"}
npz_feats = data_df.drop("Metadata_Well", axis="columns").values

# Write to temp files
data_df.to_csv(output_data_file, sep="\t", index=False)
data_df.to_csv(output_data_comma_file, sep=",", index=False)
@@ -47,6 +64,17 @@
platemap_df.to_csv(output_platemap_comma_file, sep=",", index=False)
platemap_df.to_csv(output_platemap_file_gzip, sep="\t", index=False, compression="gzip")

# Write npz temp files
key_values = {k: npz_metadata_dict[k] for k in npz_metadata_dict.keys()}
npz_metadata_dict.update(npz_model_key)
key_with_model_values = {k: npz_metadata_dict[k] for k in npz_metadata_dict.keys()}

np.savez_compressed(output_npz_file, features=npz_feats, metadata=key_values)
np.savez_compressed(
    output_npz_with_model_file, features=npz_feats, metadata=key_with_model_values
)
np.savez_compressed(output_npz_without_metadata_file, features=npz_feats)


def test_infer_delim():
    delim = infer_delim(output_platemap_file)
@@ -88,3 +116,42 @@ def test_load_platemap():
    platemap_with_annotation = load_platemap(output_platemap_file, add_metadata_id=True)
    platemap_df.columns = [f"Metadata_{x}" for x in platemap_df.columns]
    pd.testing.assert_frame_equal(platemap_with_annotation, platemap_df)


def test_load_npz():
    npz_df = load_npz(output_npz_file)
    npz_custom_prefix_df = load_npz(output_npz_file, fallback_feature_prefix="test")
    npz_with_model_df = load_npz(output_npz_with_model_file)
    npz_no_meta_df = load_npz(output_npz_without_metadata_file)
    real_data_df = load_npz(example_npz_file)

    core_cols = ["Metadata_Plate", "Metadata_Well", "Metadata_Site"]

    assert npz_df.shape == (6, 5)
    assert npz_df.columns.tolist() == core_cols + ["DP_0", "DP_1"]

    assert npz_custom_prefix_df.shape == (6, 5)
    assert npz_custom_prefix_df.columns.tolist() == core_cols + ["test_0", "test_1"]

    assert npz_with_model_df.shape == (6, 6)
    assert npz_with_model_df.columns.tolist() == core_cols + [
        "Metadata_Model",
        "cnn_0",
        "cnn_1",
    ]

    assert npz_no_meta_df.shape == (6, 2)
    assert npz_no_meta_df.columns.tolist() == ["DP_0", "DP_1"]

    pd.testing.assert_frame_equal(
        npz_df.drop(core_cols, axis="columns"), npz_no_meta_df
    )

    # Check real data
    assert real_data_df.shape == (206, 54)
    assert all([x in real_data_df.columns for x in core_cols + ["Metadata_Model"]])
    assert len(real_data_df.Metadata_Model.unique()) == 1
    assert real_data_df.Metadata_Model.unique()[0] == "cnn"
    assert real_data_df.drop(
        core_cols + ["Metadata_Model"], axis="columns"
    ).columns.tolist() == [f"cnn_{x}" for x in range(0, 50)]