cytomining · gwaybio · Dec 9, 2019 · Dec 9, 2019 · Dec 9, 2019 · Dec 9, 2019
diff --git a/pycytominer/cyto_utils/features.py b/pycytominer/cyto_utils/features.py
@@ -90,3 +90,41 @@ def infer_cp_features(population_df, metadata=False):
     ), "No CP features found. Are you sure this dataframe is from CellProfiler?"
 
     return features
+
+
+def drop_outlier_features(
+    population_df, features="infer", samples="none", outlier_cutoff=15
+):
+    """
+    Flag features whose minimum or maximum exceeds the cutoff in absolute value
+
+    Arguments:
+    population_df - pandas DataFrame that includes metadata and observation features
+    features - a list of features present in the population dataframe [default: "infer"]
+               if "infer", then assume cell painting features are those that start with
+               "Cells_", "Nuclei_", or "Cytoplasm_"
+    samples - index labels of the rows to use, passed to population_df.loc
+              [default: "none"] - if "none", use all samples to calculate
+    outlier_cutoff - flag a feature if abs(min) or abs(max) is greater [default: 15]
+
+    Return:
+    list of features to exclude from the population_df
+    """
+    # Only treat the argument as a sentinel when it is a string: comparing a
+    # pandas Index to "none"/"infer" is elementwise and has no single truth value
+    use_all_samples = isinstance(samples, str) and samples == "none"
+    infer_features = isinstance(features, str) and features == "infer"
+
+    if not use_all_samples:
+        population_df = population_df.loc[samples, :]
+
+    if infer_features:
+        features = infer_cp_features(population_df)
+    population_df = population_df.loc[:, features]
+
+    # Extremes in either direction count, so check both abs(max) and abs(min)
+    max_abs = population_df.max().abs()
+    min_abs = population_df.min().abs()
+    is_outlier = (max_abs > outlier_cutoff) | (min_abs > outlier_cutoff)
+
+    return is_outlier[is_outlier].index.tolist()
diff --git a/pycytominer/feature_select.py b/pycytominer/feature_select.py
@@ -9,7 +9,11 @@
 from pycytominer.variance_threshold import variance_threshold
 from pycytominer.get_na_columns import get_na_columns
 from pycytominer.cyto_utils.output import output
-from pycytominer.cyto_utils.features import get_blacklist_features, infer_cp_features
+from pycytominer.cyto_utils.features import (
+    get_blacklist_features,
+    infer_cp_features,
+    drop_outlier_features,
+)
 
 
 def feature_select(
@@ -26,6 +30,7 @@ def feature_select(
     compression=None,
     float_format=None,
     blacklist_file=None,
+    outlier_cutoff=15,
 ):
     """
     Performs feature selection based on the given operation
@@ -52,12 +57,17 @@ def feature_select(
     blacklist_file - file location of dataframe with features to exclude [default: None]
                      Note that if "blacklist" in operation then will remove standard
                      blacklist
+    outlier_cutoff - exclude a feature if the absolute value of its maximum or minimum
+                     across the full experiment exceeds this threshold [default: 15].
+                     Note that this procedure is typically applied after normalization,
+                     for which the default is suitable.
     """
     all_ops = [
         "variance_threshold",
         "correlation_threshold",
         "drop_na_columns",
         "blacklist",
+        "drop_outliers",
     ]
 
     # Make sure the user provides a supported operation
@@ -114,6 +124,13 @@ def feature_select(
                 )
             else:
                 exclude = get_blacklist_features(population_df=profiles)
+        elif op == "drop_outliers":
+            exclude = drop_outlier_features(
+                population_df=profiles,
+                features=features,
+                samples=samples,
+                outlier_cutoff=outlier_cutoff,
+            )
 
         excluded_features += exclude
 

diff --git a/pycytominer/tests/test_cyto_utils/test_feature_drop_outlier.py b/pycytominer/tests/test_cyto_utils/test_feature_drop_outlier.py
@@ -0,0 +1,55 @@
+import os
+import random
+import pytest
+import tempfile
+import warnings
+import pandas as pd
+from pycytominer.cyto_utils.features import drop_outlier_features
+
+# Shared fixture for the tests below: two metadata columns followed by four
+# feature columns, eight rows in total.
+#
+# Peak absolute value of each feature over all eight rows:
+#   Cells_x     ->   8
+#   Cytoplasm_y ->   9
+#   Nuclei_z    ->  20  (row 5)
+#   Cells_zz    -> 100  (row 5)
+# so only Nuclei_z and Cells_zz exceed a cutoff of 15.
+# Rows are addressed by their default integer labels 0..7 in the tests.
+data_df = pd.DataFrame(
+    {
+        "Metadata_plate": ["a"] * 4 + ["b"] * 4,
+        "Metadata_treatment": ["drug", "drug", "control", "control"] * 2,
+        "Cells_x": [1, 2, -8, 2, 5, 5, 5, -1],
+        "Cytoplasm_y": [3, -1, 7, 4, 5, -9, 6, 1],
+        "Nuclei_z": [-1, 8, 2, 5, -6, 20, 2, -2],
+        "Cells_zz": [14, -46, 1, 60, -30, -100, 2, 2],
+    }
+).reset_index(drop=True)
+
+
+def test_outlier_default():
+    """Default cutoff of 15 flags both features whose peak |value| exceeds it"""
+    flagged = drop_outlier_features(data_df)
+    assert sorted(flagged) == ["Cells_zz", "Nuclei_z"]
+
+
+def test_outlier_high_cutoff():
+    """Raising the cutoff to 30 spares Nuclei_z, whose peak |value| is 20"""
+    flagged = drop_outlier_features(data_df, outlier_cutoff=30)
+    assert flagged == ["Cells_zz"]
+
+
+def test_outlier_samples():
+    """Only the selected rows contribute to the min/max used for flagging"""
+    # Row 5 holds Nuclei_z = 20, so including it flags Nuclei_z as well
+    with_row_5 = drop_outlier_features(data_df, samples=[0, 1, 2, 3, 5])
+    assert sorted(with_row_5) == ["Cells_zz", "Nuclei_z"]
+
+    # Rows 0-3 alone leave Cells_zz (60, -46) as the only feature above 15
+    assert drop_outlier_features(data_df, samples=[0, 1, 2, 3]) == ["Cells_zz"]
+
+
+def test_outlier_features():
+    wanted = ["Cells_x", "Cytoplasm_y"]  # peak |value| 8 and 9, both under 15
+    assert len(drop_outlier_features(data_df, features=wanted)) == 0
diff --git a/pycytominer/tests/test_feature_select.py b/pycytominer/tests/test_feature_select.py
@@ -52,6 +52,27 @@
 ).reset_index(drop=True)
 
 
+# Fixture for the drop_outliers operation: two metadata columns followed by
+# four feature columns, eight rows in total.
+#
+# Peak absolute value of each feature over all eight rows:
+#   Cells_x     ->   8
+#   Cytoplasm_y ->   9
+#   Nuclei_z    ->  20
+#   Cells_zz    -> 100
+# so only Nuclei_z and Cells_zz exceed a cutoff of 15.
+data_outlier_df = pd.DataFrame(
+    {
+        "Metadata_plate": ["a"] * 4 + ["b"] * 4,
+        "Metadata_treatment": ["drug", "drug", "control", "control"] * 2,
+        "Cells_x": [1, 2, -8, 2, 5, 5, 5, -1],
+        "Cytoplasm_y": [3, -1, 7, 4, 5, -9, 6, 1],
+        "Nuclei_z": [-1, 8, 2, 5, -6, 20, 2, -2],
+        "Cells_zz": [14, -46, 1, 60, -30, -100, 2, 2],
+    }
+).reset_index(drop=True)
+
+
 def test_feature_select_get_na_columns():
     """
     Testing feature_select and get_na_columns pycytominer function
@@ -273,3 +294,25 @@ def test_feature_select_blacklist():
     )
     expected_result = pd.DataFrame({"y": [1, 2, 8, 5, 2, 1], "zz": [0, -3, 8, 9, 6, 9]})
     pd.testing.assert_frame_equal(result, expected_result)
+
+
+def test_feature_select_drop_outlier():
+    """
+    Testing feature_select with the drop_outliers operation
+    """
+    default_cutoff_result = feature_select(
+        data_outlier_df, features="infer", operation="drop_outliers"
+    )
+    without_both = data_outlier_df.drop(columns=["Cells_zz", "Nuclei_z"])
+    pd.testing.assert_frame_equal(default_cutoff_result, without_both)
+
+    high_cutoff_result = feature_select(
+        data_outlier_df, features="infer", operation="drop_outliers", outlier_cutoff=30
+    )
+    without_zz = data_outlier_df.drop(columns=["Cells_zz"])
+    pd.testing.assert_frame_equal(high_cutoff_result, without_zz)
+
+    subset_result = feature_select(
+        data_outlier_df, features=["Cells_x", "Cytoplasm_y"], operation="drop_outliers"
+    )
+    pd.testing.assert_frame_equal(subset_result, data_outlier_df)