cytomining · niranjchandrasekaran · Aug 27, 2021 · Jul 6, 2021 · Jul 6, 2021 · Jul 7, 2021
diff --git a/pycytominer/feature_select.py b/pycytominer/feature_select.py
@@ -9,6 +9,7 @@
     correlation_threshold,
     variance_threshold,
     get_na_columns,
+    noise_removal,
 )
 from pycytominer.cyto_utils import (
     load_profiles,
@@ -34,6 +35,8 @@ def feature_select(
     float_format=None,
     blocklist_file=None,
     outlier_cutoff=15,
+    noise_removal_perturb_groups=None,
+    noise_removal_stdev_cutoff=None,
 ):
     """
     Performs feature selection based on the given operation
@@ -66,13 +69,20 @@ def feature_select(
                      across a full experiment is excluded. Note that this
                      procedure is typically applied (and therefore the default is
                      suitable) for after normalization.
+    noise_removal_perturb_groups - [default: None] list of perturbation groups corresponding to rows in profiles or
+                                   str specifying the name of the metadata column containing this information.
+                                 Note that noise removal should only be used on normalized data.
+    noise_removal_stdev_cutoff - [default: None] maximum mean feature standard deviation to be kept for noise removal,
+                                 grouped by the identity of the perturbation from noise_removal_perturb_groups.
+                                 The data must already be normalized so that this cutoff can apply to all columns.
     """
     all_ops = [
         "variance_threshold",
         "correlation_threshold",
         "drop_na_columns",
         "blocklist",
         "drop_outliers",
+        "noise_removal",
     ]
 
     # Make sure the user provides a supported operation
@@ -133,7 +143,13 @@ def feature_select(
                 samples=samples,
                 outlier_cutoff=outlier_cutoff,
             )
-
+        elif op == "noise_removal":
+            exclude = noise_removal(
+                population_df=profiles,
+                features=features,
+                noise_removal_perturb_groups=noise_removal_perturb_groups,
+                noise_removal_stdev_cutoff=noise_removal_stdev_cutoff,
+            )
         excluded_features += exclude
 
     excluded_features = list(set(excluded_features))

diff --git a/pycytominer/operations/__init__.py b/pycytominer/operations/__init__.py
@@ -3,3 +3,4 @@
 from .get_na_columns import get_na_columns
 from .transform import Spherize, RobustMAD
 from .sparse_random_projection import sparse_random_projection
+from .noise_removal import noise_removal
diff --git a/pycytominer/operations/noise_removal.py b/pycytominer/operations/noise_removal.py
@@ -0,0 +1,78 @@
+"""
+Remove noisy features, as defined by features with excessive standard deviation within the same perturbation group.
+"""
+
+import numpy as np
+import pandas as pd
+from pycytominer.cyto_utils import infer_cp_features
+
+
+def noise_removal(
+    population_df,
+    noise_removal_perturb_groups,
+    features,
+    samples="all",
+    noise_removal_stdev_cutoff=0.8,
+):
+    """Flag features whose mean within-group standard deviation exceeds a cutoff.
+
+    Parameters
+    ----------
+    population_df: pandas.core.frame.DataFrame
+        Dataframe holding the measurement data and, optionally, metadata such as each row's perturbation group.
+    noise_removal_perturb_groups: list of str, or str
+        Either one perturbation label per row of population_df (perturb1_well1 and perturb1_well2 would both be
+        "perturb1"), or the name of the column in population_df that holds those labels.
+    features: list of str, or "infer"
+        Features to evaluate. Pass "infer" to have them determined by infer_cp_features.
+    samples: list or "all", default "all"
+        Row labels to select from population_df with .loc. Use "all" to keep every row.
+    noise_removal_stdev_cutoff: float or None, default 0.8
+        Maximum mean within-group stdev for a feature to be kept. None is treated as 0.8.
+
+    Returns
+    -------
+    list
+        A list of features to be removed, due to having too high standard deviation within replicate groups.
+
+    Raises
+    ------
+    AssertionError
+        If the named column is missing, or the list length differs from the number of rows.
+    TypeError
+        If noise_removal_perturb_groups is neither a list nor a str.
+    """
+    # feature_select forwards None when the caller gives no cutoff; fall back to this function's default.
+    if noise_removal_stdev_cutoff is None:
+        noise_removal_stdev_cutoff = 0.8
+
+    # isinstance guards keep array-like arguments from triggering an ambiguous element-wise comparison.
+    if not (isinstance(samples, str) and samples == "all"):
+        population_df = population_df.loc[samples, :]
+    if isinstance(features, str) and features == "infer":
+        features = infer_cp_features(population_df)
+
+    # AssertionError is raised explicitly (not via `assert`) so the checks still run under `python -O`.
+    if isinstance(noise_removal_perturb_groups, str):
+        if noise_removal_perturb_groups not in population_df.columns:
+            raise AssertionError(
+                f"{noise_removal_perturb_groups} not found. Are you sure it is a metadata column?"
+            )
+        group_labels = population_df[noise_removal_perturb_groups]
+    elif isinstance(noise_removal_perturb_groups, list):
+        if len(noise_removal_perturb_groups) != len(population_df):
+            raise AssertionError(
+                f"The length of input list: {len(noise_removal_perturb_groups)} is not equivalent to your "
+                f"data: {population_df.shape[0]}"
+            )
+        group_labels = pd.Series(noise_removal_perturb_groups, index=population_df.index)
+    else:
+        raise TypeError(
+            "noise_removal_perturb_groups must be a list corresponding to row perturbations or a str "
+            "specifying the name of the metadata column."
+        )
+
+    # Population (ddof=0) stdev of each feature within each group, averaged across groups.
+    stdev_means = population_df.loc[:, features].groupby(group_labels).std(ddof=0).mean()
+
+    return stdev_means[stdev_means > noise_removal_stdev_cutoff].index.tolist()
diff --git a/pycytominer/tests/test_feature_select.py b/pycytominer/tests/test_feature_select.py
@@ -3,6 +3,7 @@
 import tempfile
 import numpy as np
 import pandas as pd
+import pytest
 from pycytominer.feature_select import feature_select
 
 random.seed(123)
@@ -41,7 +42,6 @@
     }
 ).reset_index(drop=True)
 
-
 a_feature = [1] * 99 + [2]
 b_feature = [1, 2] * 50
 c_feature = [1, 2] * 25 + random.sample(range(1, 1000), 50)
@@ -51,7 +51,6 @@
     {"a": a_feature, "b": b_feature, "c": c_feature, "d": d_feature}
 ).reset_index(drop=True)
 
-
 data_outlier_df = pd.DataFrame(
     {
         "Metadata_plate": ["a", "a", "a", "a", "b", "b", "b", "b"],
@@ -73,6 +72,129 @@
 ).reset_index(drop=True)
 
 
+def test_feature_select_noise_removal():
+    """
+    Check the noise_removal operation of feature_select
+
+    Perturbation groups are supplied both as a list with one label per row and as the
+    name of a metadata column, and invalid group inputs are expected to raise.
+    """
+    # One label per row of data_df: three replicates of "a" followed by three of "b"
+    six_row_groups = ["a"] * 3 + ["b"] * 3
+
+    # A cutoff of 2.5 is expected to keep only x and y
+    kept_at_2_5 = feature_select(
+        operation="noise_removal",
+        profiles=data_df,
+        features=data_df.columns.tolist(),
+        noise_removal_perturb_groups=six_row_groups,
+        noise_removal_stdev_cutoff=2.5,
+    )
+
+    # A cutoff of 2 is expected to drop every feature
+    kept_at_2 = feature_select(
+        operation="noise_removal",
+        profiles=data_df,
+        features=data_df.columns.tolist(),
+        noise_removal_perturb_groups=six_row_groups,
+        noise_removal_stdev_cutoff=2,
+    )
+
+    # A cutoff of 3.5 is expected to keep x, y, z and zz
+    kept_at_3_5 = feature_select(
+        operation="noise_removal",
+        profiles=data_df,
+        features=data_df.columns.tolist(),
+        noise_removal_perturb_groups=six_row_groups,
+        noise_removal_stdev_cutoff=3.5,
+    )
+    pd.testing.assert_frame_equal(kept_at_2_5, data_df[["x", "y"]])
+    pd.testing.assert_frame_equal(kept_at_2, data_df[[]])
+    pd.testing.assert_frame_equal(kept_at_3_5, data_df[["x", "y", "z", "zz"]])
+
+    # data_unique_test_df has 100 rows: label them as ten perturbations ("a" to "j") of ten replicates each
+    hundred_row_groups = [letter for letter in "abcdefghij" for _ in range(10)]
+
+    # Only a and b are expected to survive a cutoff of 3.5
+    kept_list_3_5 = feature_select(
+        operation="noise_removal",
+        profiles=data_unique_test_df,
+        features=data_unique_test_df.columns.tolist(),
+        noise_removal_perturb_groups=hundred_row_groups,
+        noise_removal_stdev_cutoff=3.5,
+    )
+
+    # A cutoff of 500 is expected to keep a, b, c and d
+    kept_list_500 = feature_select(
+        operation="noise_removal",
+        profiles=data_unique_test_df,
+        features=data_unique_test_df.columns.tolist(),
+        noise_removal_perturb_groups=hundred_row_groups,
+        noise_removal_stdev_cutoff=500,
+    )
+    expected_a_b = data_unique_test_df[["a", "b"]]
+    expected_all = data_unique_test_df[["a", "b", "c", "d"]]
+    pd.testing.assert_frame_equal(kept_list_3_5, expected_a_b)
+    pd.testing.assert_frame_equal(kept_list_500, expected_all)
+
+    # Repeat both checks with the groups supplied as a metadata column instead of a list
+    with_group_column = data_unique_test_df.copy()
+    with_group_column["perturb_group"] = hundred_row_groups
+    kept_column_3_5 = feature_select(
+        operation="noise_removal",
+        profiles=with_group_column,
+        features=data_unique_test_df.columns.tolist(),
+        noise_removal_perturb_groups="perturb_group",
+        noise_removal_stdev_cutoff=3.5,
+    )
+
+    kept_column_500 = feature_select(
+        operation="noise_removal",
+        profiles=with_group_column,
+        features=data_unique_test_df.columns.tolist(),
+        noise_removal_perturb_groups="perturb_group",
+        noise_removal_stdev_cutoff=500,
+    )
+    # perturb_group is not listed as a feature, so it is expected to remain in the output
+    pd.testing.assert_frame_equal(
+        kept_column_3_5, with_group_column[["a", "b", "perturb_group"]]
+    )
+    pd.testing.assert_frame_equal(
+        kept_column_500, with_group_column[["a", "b", "c", "d", "perturb_group"]]
+    )
+
+    # A group list whose length does not match the data must be rejected
+    too_long_groups = ["a", "a", "b", "b", "a", "a", "b"]
+    with pytest.raises(AssertionError):
+        feature_select(
+            data_df,
+            features=data_df.columns.tolist(),
+            operation="noise_removal",
+            noise_removal_perturb_groups=too_long_groups,
+            noise_removal_stdev_cutoff=3,
+        )
+
+    # A string that does not name a metadata column of the profiles must be rejected
+    with pytest.raises(AssertionError):
+        feature_select(
+            operation="noise_removal",
+            profiles=data_df,
+            features=data_df.columns.tolist(),
+            noise_removal_perturb_groups="bad_string",
+            noise_removal_stdev_cutoff=2.5,
+        )
+
+    # Group input that is neither a list nor a string must raise a TypeError
+    with pytest.raises(TypeError):
+        feature_select(
+            operation="noise_removal",
+            profiles=data_df,
+            features=data_df.columns.tolist(),
+            noise_removal_perturb_groups=12345,
+            noise_removal_stdev_cutoff=2.5,
+        )
+
+
 def test_feature_select_get_na_columns():
     """
     Testing feature_select and get_na_columns pycytominer function