Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add "noise removal" feature selection #153

Merged
merged 15 commits into from
Aug 27, 2021
Merged
18 changes: 17 additions & 1 deletion pycytominer/feature_select.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
correlation_threshold,
variance_threshold,
get_na_columns,
noise_removal,
)
from pycytominer.cyto_utils import (
load_profiles,
Expand All @@ -34,6 +35,8 @@ def feature_select(
float_format=None,
blocklist_file=None,
outlier_cutoff=15,
noise_removal_perturb_groups=None,
noise_removal_stdev_cutoff=None,
):
"""
Performs feature selection based on the given operation
Expand Down Expand Up @@ -66,13 +69,20 @@ def feature_select(
across a full experiment is excluded. Note that this
procedure is typically applied (and therefore the default is
suitable) for after normalization.
noise_removal_perturb_groups - [default: None] list of perturbation groups corresponding to rows in profiles or
str specifying the name of the metadata column containing this information.
Note that noise removal should only be used on normalized data.
noise_removal_stdev_cutoff - [default: None] maximum mean feature standard deviation to be kept for noise removal,
grouped by the identity of the perturbation from noise_removal_perturb_groups.
The data must already be normalized so that this cutoff can apply to all columns.
"""
all_ops = [
"variance_threshold",
"correlation_threshold",
"drop_na_columns",
"blocklist",
"drop_outliers",
"noise_removal",
]

# Make sure the user provides a supported operation
Expand Down Expand Up @@ -133,7 +143,13 @@ def feature_select(
samples=samples,
outlier_cutoff=outlier_cutoff,
)

elif op == "noise_removal":
exclude = noise_removal(
population_df=profiles,
features=features,
noise_removal_perturb_groups=noise_removal_perturb_groups,
noise_removal_stdev_cutoff=noise_removal_stdev_cutoff,
)
excluded_features += exclude

excluded_features = list(set(excluded_features))
Expand Down
1 change: 1 addition & 0 deletions pycytominer/operations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@
from .get_na_columns import get_na_columns
from .transform import Spherize, RobustMAD
from .sparse_random_projection import sparse_random_projection
from .noise_removal import noise_removal
78 changes: 78 additions & 0 deletions pycytominer/operations/noise_removal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
"""
Remove noisy features, as defined by features with excessive standard deviation within the same perturbation group.
"""

import numpy as np
import pandas as pd
from pycytominer.cyto_utils import infer_cp_features


def noise_removal(
    population_df,
    noise_removal_perturb_groups,
    features,
    samples="all",
    noise_removal_stdev_cutoff=0.8,
):
    """Identify features that vary too much within perturbation groups.

    For every feature, the population standard deviation (ddof=0) is computed
    within each perturbation group and then averaged across groups. Features
    whose mean within-group standard deviation exceeds the cutoff are reported.

    Parameters
    ----------
    population_df: pandas.core.frame.DataFrame
        Dataframe which contains all measurement data and optionally metadata
        such as the identity of the perturbation group for each row.
    noise_removal_perturb_groups: list of str, or str
        Either a list with one perturbation label per row of population_df
        (for example, perturb1_well1 and perturb1_well2 would both be
        "perturb1"), or the name of the column in population_df that holds
        those labels. When `samples` is not "all", a list must match the rows
        that remain after the subset is taken.
    features: list of str, or "infer"
        Feature columns to evaluate. Pass "infer" to have them inferred with
        infer_cp_features.
    samples: list of str, default "all"
        Row labels to use from population_df. Use "all" to keep every row.
    noise_removal_stdev_cutoff: float, default 0.8
        Maximum mean stdev value for a feature to be kept, with features
        grouped according to noise_removal_perturb_groups. None is treated as
        the default of 0.8.

    Returns
    ----------
    list
        A list of features to be removed, due to having too high standard
        deviation within replicate groups.

    Raises
    ------
    AssertionError
        If the named metadata column does not exist, or if the list of labels
        does not have one entry per row.
    TypeError
        If noise_removal_perturb_groups is neither a list nor a str.
    """
    # Callers may forward an unset (None) cutoff; fall back to the documented
    # default rather than failing on a float-vs-None comparison below.
    if noise_removal_stdev_cutoff is None:
        noise_removal_stdev_cutoff = 0.8

    # Subset dataframe. The isinstance guard keeps array-like `samples` from
    # triggering an ambiguous elementwise comparison against "all".
    if not (isinstance(samples, str) and samples == "all"):
        population_df = population_df.loc[samples, :]

    if isinstance(features, str) and features == "infer":
        features = infer_cp_features(population_df)

    # Validation raises AssertionError explicitly (instead of using `assert`)
    # so the checks still run under `python -O` while callers that catch
    # AssertionError keep working.
    if isinstance(noise_removal_perturb_groups, str):
        # A metadata column name was given: use that column as the group labels
        if noise_removal_perturb_groups not in population_df.columns:
            raise AssertionError(
                f"{noise_removal_perturb_groups} not found. Are you sure it is a "
                "metadata column?"
            )
        group_info = population_df[noise_removal_perturb_groups]
    elif isinstance(noise_removal_perturb_groups, list):
        # Explicit labels were given: they must line up one-to-one with the rows
        if len(noise_removal_perturb_groups) != len(population_df):
            raise AssertionError(
                f"The length of input list: {len(noise_removal_perturb_groups)} is not equivalent to your "
                f"data: {population_df.shape[0]}"
            )
        # Reuse the frame's own index so the labels are matched by position
        group_info = pd.Series(
            noise_removal_perturb_groups, index=population_df.index
        )
    else:
        raise TypeError(
            "noise_removal_perturb_groups must be a list corresponding to row "
            "perturbations or a str specifying the name of the metadata column."
        )

    # Group by an external key instead of adding a temporary column, so a
    # feature that happens to be called "group_id" is never overwritten and
    # the label column is never fed into the standard deviation.
    feature_df = population_df.loc[:, features]

    # Mean (across groups) of the per-group population standard deviation
    stdev_means = feature_df.groupby(group_info).std(ddof=0).mean()

    # Identify noisy features with a greater mean stdev within perturbation
    # group than the threshold
    return stdev_means[stdev_means > noise_removal_stdev_cutoff].index.tolist()
126 changes: 124 additions & 2 deletions pycytominer/tests/test_feature_select.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import tempfile
import numpy as np
import pandas as pd
import pytest
from pycytominer.feature_select import feature_select

random.seed(123)
Expand Down Expand Up @@ -41,7 +42,6 @@
}
).reset_index(drop=True)


a_feature = [1] * 99 + [2]
b_feature = [1, 2] * 50
c_feature = [1, 2] * 25 + random.sample(range(1, 1000), 50)
Expand All @@ -51,7 +51,6 @@
{"a": a_feature, "b": b_feature, "c": c_feature, "d": d_feature}
).reset_index(drop=True)


data_outlier_df = pd.DataFrame(
{
"Metadata_plate": ["a", "a", "a", "a", "b", "b", "b", "b"],
Expand All @@ -73,6 +72,129 @@
).reset_index(drop=True)


def test_feature_select_noise_removal():
    """
    Exercise the "noise_removal" operation of feature_select: list-based and
    column-based perturbation groups at several cutoffs, plus the error paths
    for malformed group specifications.
    """

    def select_without_noise(profiles, feature_names, groups, cutoff):
        # Thin wrapper so each scenario below only states what varies
        return feature_select(
            profiles=profiles,
            features=feature_names,
            operation="noise_removal",
            noise_removal_perturb_groups=groups,
            noise_removal_stdev_cutoff=cutoff,
        )

    # --- data_df: six rows split into two perturbation groups ---
    small_features = data_df.columns.tolist()
    small_groups = ["a", "a", "a", "b", "b", "b"]
    small_cases = [
        (2.5, ["x", "y"]),
        (2, []),
        (3.5, ["x", "y", "z", "zz"]),
    ]
    for cutoff, kept_columns in small_cases:
        observed = select_without_noise(
            data_df, small_features, small_groups, cutoff
        )
        pd.testing.assert_frame_equal(observed, data_df[kept_columns])

    # --- data_unique_test_df: 100 rows, 10 replicates of 10 perturbations ---
    large_features = data_unique_test_df.columns.tolist()
    large_groups = [label for label in "abcdefghij" for _ in range(10)]
    large_cases = [
        (3.5, ["a", "b"]),
        (500, ["a", "b", "c", "d"]),
    ]
    for cutoff, kept_columns in large_cases:
        observed = select_without_noise(
            data_unique_test_df, large_features, large_groups, cutoff
        )
        pd.testing.assert_frame_equal(observed, data_unique_test_df[kept_columns])

    # --- same scenarios, with the groups supplied as a metadata column ---
    with_group_column = data_unique_test_df.copy()
    with_group_column["perturb_group"] = large_groups
    for cutoff, kept_columns in large_cases:
        observed = select_without_noise(
            with_group_column, large_features, "perturb_group", cutoff
        )
        pd.testing.assert_frame_equal(
            observed, with_group_column[kept_columns + ["perturb_group"]]
        )

    # --- error handling for the perturbation group argument ---
    # A label list whose length does not match the number of rows
    too_long_groups = ["a", "a", "b", "b", "a", "a", "b"]
    with pytest.raises(AssertionError):
        feature_select(
            data_df,
            features=data_df.columns.tolist(),
            operation="noise_removal",
            noise_removal_perturb_groups=too_long_groups,
            noise_removal_stdev_cutoff=3,
        )

    # A string that does not name any column of the profiles
    with pytest.raises(AssertionError):
        select_without_noise(data_df, small_features, "bad_string", 2.5)

    # Neither a list nor a metadata column name
    with pytest.raises(TypeError):
        select_without_noise(data_df, small_features, 12345, 2.5)


def test_feature_select_get_na_columns():
"""
Testing feature_select and get_na_columns pycytominer function
Expand Down