Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Functionality to Drop Outlier Features #62

Merged
merged 3 commits into from
Dec 9, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions pycytominer/cyto_utils/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,3 +90,41 @@ def infer_cp_features(population_df, metadata=False):
), "No CP features found. Are you sure this dataframe is from CellProfiler?"

return features


def drop_outlier_features(
    population_df, features="infer", samples="none", outlier_cutoff=15
):
    """
    Identify features whose maximum or minimum value exceeds, in absolute value,
    the given cutoff

    Arguments:
    population_df - pandas DataFrame that includes metadata and observation features
    features - a list of features present in the population dataframe [default: "infer"]
               if "infer", then assume cell painting features are those that start with
               "Cells_", "Nuclei_", or "Cytoplasm_"
               may be any list-like accepted by DataFrame.loc (list, Index, array)
    samples - list samples to perform operation on
              [default: "none"] - if "none", use all samples to calculate
              may be any list-like of row labels accepted by DataFrame.loc
    outlier_cutoff - threshold to remove feature if absolute value is greater

    Return:
    list of features to exclude from the population_df
    """
    # Only compare against the sentinel strings when a string was actually given.
    # Comparing a pandas Index or numpy array to a string is elementwise and makes
    # the `if` raise "truth value is ambiguous".
    use_all_samples = isinstance(samples, str) and samples == "none"
    infer_features = isinstance(features, str) and features == "infer"

    # Subset rows first so that feature inference and the min/max only see the
    # requested samples
    if not use_all_samples:
        population_df = population_df.loc[samples, :]

    if infer_features:
        features = infer_cp_features(population_df)

    population_df = population_df.loc[:, features]

    # The largest absolute value of a column is either |max| or |min|, so testing
    # both covers large positive and large negative values
    max_feature_values = population_df.max().abs()
    min_feature_values = population_df.min().abs()

    is_outlier = (max_feature_values > outlier_cutoff) | (
        min_feature_values > outlier_cutoff
    )

    return max_feature_values[is_outlier].index.tolist()
19 changes: 18 additions & 1 deletion pycytominer/feature_select.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,11 @@
from pycytominer.variance_threshold import variance_threshold
from pycytominer.get_na_columns import get_na_columns
from pycytominer.cyto_utils.output import output
from pycytominer.cyto_utils.features import get_blacklist_features, infer_cp_features
from pycytominer.cyto_utils.features import (
get_blacklist_features,
infer_cp_features,
drop_outlier_features,
)


def feature_select(
Expand All @@ -26,6 +30,7 @@ def feature_select(
compression=None,
float_format=None,
blacklist_file=None,
outlier_cutoff=15,
):
"""
Performs feature selection based on the given operation
Expand All @@ -52,12 +57,17 @@ def feature_select(
blacklist_file - file location of dataframe with features to exclude [default: None]
Note that if "blacklist" in operation then will remove standard
blacklist
outlier_cutoff - the threshold above which a feature is excluded: a feature is
dropped if the absolute value of its maximum or minimum across a
full experiment exceeds this value [default: 15]. Note that this
procedure is typically applied after normalization, which is the
case the default is suited for.
"""
all_ops = [
"variance_threshold",
"correlation_threshold",
"drop_na_columns",
"blacklist",
"drop_outliers",
]

# Make sure the user provides a supported operation
Expand Down Expand Up @@ -114,6 +124,13 @@ def feature_select(
)
else:
exclude = get_blacklist_features(population_df=profiles)
elif op == "drop_outliers":
exclude = drop_outlier_features(
population_df=profiles,
features=features,
samples=samples,
outlier_cutoff=outlier_cutoff,
)

excluded_features += exclude

Expand Down
55 changes: 55 additions & 0 deletions pycytominer/tests/test_cyto_utils/test_feature_drop_outlier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import os
import random
import pytest
import tempfile
import warnings
import pandas as pd
from pycytominer.cyto_utils.features import drop_outlier_features

# Build data to use in tests
# Build data to use in tests: eight rows, four on plate "a" followed by four on
# plate "b"; within each plate two "drug" rows precede two "control" rows.
# Only Nuclei_z and Cells_zz contain values whose absolute value exceeds 15.
data_df = pd.DataFrame(
    {
        "Metadata_plate": ["a"] * 4 + ["b"] * 4,
        "Metadata_treatment": ["drug", "drug", "control", "control"] * 2,
        "Cells_x": [1, 2, -8, 2, 5, 5, 5, -1],
        "Cytoplasm_y": [3, -1, 7, 4, 5, -9, 6, 1],
        "Nuclei_z": [-1, 8, 2, 5, -6, 20, 2, -2],
        "Cells_zz": [14, -46, 1, 60, -30, -100, 2, 2],
    }
).reset_index(drop=True)


def test_outlier_default():
    """
    With the default cutoff, both Cells_zz and Nuclei_z are flagged
    """
    flagged = drop_outlier_features(data_df)
    assert sorted(flagged) == sorted(["Cells_zz", "Nuclei_z"])


def test_outlier_high_cutoff():
    """
    With the cutoff raised to 30, only Cells_zz is flagged
    """
    flagged = drop_outlier_features(data_df, outlier_cutoff=30)
    assert flagged == ["Cells_zz"]


def test_outlier_samples():
    """
    Restricting the rows considered changes which features are flagged
    """
    # Row 5 holds the Nuclei_z value of 20, so including it flags Nuclei_z
    with_row_five = drop_outlier_features(data_df, samples=[0, 1, 2, 3, 5])
    assert sorted(with_row_five) == sorted(["Cells_zz", "Nuclei_z"])

    # Without row 5, Cells_zz is the only feature flagged
    first_four_rows = drop_outlier_features(data_df, samples=[0, 1, 2, 3])
    assert first_four_rows == ["Cells_zz"]


def test_outlier_features():
    """
    Nothing is flagged when only Cells_x and Cytoplasm_y are considered
    """
    considered = ["Cells_x", "Cytoplasm_y"]
    flagged = drop_outlier_features(data_df, features=considered)
    assert len(flagged) == 0
43 changes: 43 additions & 0 deletions pycytominer/tests/test_feature_select.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,27 @@
).reset_index(drop=True)


# Fixture for the drop_outliers operation: eight rows, four on plate "a" followed
# by four on plate "b"; within each plate two "drug" rows precede two "control"
# rows. Only Nuclei_z and Cells_zz contain values whose absolute value exceeds 15.
data_outlier_df = pd.DataFrame(
    {
        "Metadata_plate": ["a"] * 4 + ["b"] * 4,
        "Metadata_treatment": ["drug", "drug", "control", "control"] * 2,
        "Cells_x": [1, 2, -8, 2, 5, 5, 5, -1],
        "Cytoplasm_y": [3, -1, 7, 4, 5, -9, 6, 1],
        "Nuclei_z": [-1, 8, 2, 5, -6, 20, 2, -2],
        "Cells_zz": [14, -46, 1, 60, -30, -100, 2, 2],
    }
).reset_index(drop=True)


def test_feature_select_get_na_columns():
"""
Testing feature_select and get_na_columns pycytominer function
Expand Down Expand Up @@ -273,3 +294,25 @@ def test_feature_select_blacklist():
)
expected_result = pd.DataFrame({"y": [1, 2, 8, 5, 2, 1], "zz": [0, -3, 8, 9, 6, 9]})
pd.testing.assert_frame_equal(result, expected_result)


def test_feature_select_drop_outlier():
    """
    Testing feature_select with the drop_outliers operation
    """
    # Default cutoff: both Cells_zz and Nuclei_z are removed
    default_cutoff_df = feature_select(
        data_outlier_df, features="infer", operation="drop_outliers"
    )
    pd.testing.assert_frame_equal(
        default_cutoff_df,
        data_outlier_df.drop(["Cells_zz", "Nuclei_z"], axis="columns"),
    )

    # Cutoff of 30: only Cells_zz is removed
    high_cutoff_df = feature_select(
        data_outlier_df, features="infer", operation="drop_outliers", outlier_cutoff=30
    )
    pd.testing.assert_frame_equal(
        high_cutoff_df, data_outlier_df.drop(["Cells_zz"], axis="columns")
    )

    # Only Cells_x and Cytoplasm_y considered: the frame comes back unchanged
    restricted_df = feature_select(
        data_outlier_df, features=["Cells_x", "Cytoplasm_y"], operation="drop_outliers"
    )
    pd.testing.assert_frame_equal(restricted_df, data_outlier_df)