Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Blacklist features #41

Merged
merged 7 commits into from
Aug 26, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
include MANIFEST.in
include LICENSE.md
include README.md
include setup.py
include pycytominer/data/*

36 changes: 36 additions & 0 deletions pycytominer/cyto_utils/features.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""
Utility functions to manipulate CellProfiler features
"""

import os
import pandas as pd

# Default blacklist file (data/blacklist_features.txt), resolved relative to this module
blacklist_file = os.path.join(
    os.path.dirname(__file__), "..", "data", "blacklist_features.txt"
)


def get_blacklist_features(blacklist_file=blacklist_file, population_df=None):
    """
    Get a list of blacklist features

    Arguments:
    blacklist_file - file location of a comma-separated file with one column named
                     "blacklist" listing the features to exclude
                     [default: module-level blacklist_file]
    population_df - profile dataframe used to subset blacklist features; when given,
                    only blacklist features that are also columns of this dataframe
                    are returned [default: None]

    Return:
    list of features to exclude from downstream analysis (in blacklist file order)

    Raises:
    AssertionError - if the file read has no column named "blacklist"
    """

    blacklist = pd.read_csv(blacklist_file)

    # Raise explicitly instead of using an `assert` statement so the validation
    # still runs under `python -O`; the exception type is kept for existing callers.
    if "blacklist" not in blacklist.columns:
        raise AssertionError("one column must be named 'blacklist'")

    # `.tolist()` (rather than `.to_list()`) also works on pandas < 0.24
    blacklist_features = blacklist["blacklist"].tolist()

    if isinstance(population_df, pd.DataFrame):
        # Set membership keeps the filter linear in the number of features
        population_features = set(population_df.columns)
        blacklist_features = [
            x for x in blacklist_features if x in population_features
        ]

    return blacklist_features
56 changes: 56 additions & 0 deletions pycytominer/data/blacklist_features.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
blacklist
Nuclei_Correlation_Manders_AGP_DNA
Nuclei_Correlation_Manders_AGP_ER
Nuclei_Correlation_Manders_AGP_Mito
Nuclei_Correlation_Manders_AGP_RNA
Nuclei_Correlation_Manders_DNA_AGP
Nuclei_Correlation_Manders_DNA_ER
Nuclei_Correlation_Manders_DNA_Mito
Nuclei_Correlation_Manders_DNA_RNA
Nuclei_Correlation_Manders_ER_AGP
Nuclei_Correlation_Manders_ER_DNA
Nuclei_Correlation_Manders_ER_Mito
Nuclei_Correlation_Manders_ER_RNA
Nuclei_Correlation_Manders_Mito_AGP
Nuclei_Correlation_Manders_Mito_DNA
Nuclei_Correlation_Manders_Mito_ER
Nuclei_Correlation_Manders_Mito_RNA
Nuclei_Correlation_Manders_RNA_AGP
Nuclei_Correlation_Manders_RNA_DNA
Nuclei_Correlation_Manders_RNA_ER
Nuclei_Correlation_Manders_RNA_Mito
Nuclei_Correlation_RWC_AGP_DNA
Nuclei_Correlation_RWC_AGP_ER
Nuclei_Correlation_RWC_AGP_Mito
Nuclei_Correlation_RWC_AGP_RNA
Nuclei_Correlation_RWC_DNA_AGP
Nuclei_Correlation_RWC_DNA_ER
Nuclei_Correlation_RWC_DNA_Mito
Nuclei_Correlation_RWC_DNA_RNA
Nuclei_Correlation_RWC_ER_AGP
Nuclei_Correlation_RWC_ER_DNA
Nuclei_Correlation_RWC_ER_Mito
Nuclei_Correlation_RWC_ER_RNA
Nuclei_Correlation_RWC_Mito_AGP
Nuclei_Correlation_RWC_Mito_DNA
Nuclei_Correlation_RWC_Mito_ER
Nuclei_Correlation_RWC_Mito_RNA
Nuclei_Correlation_RWC_RNA_AGP
Nuclei_Correlation_RWC_RNA_DNA
Nuclei_Correlation_RWC_RNA_ER
Nuclei_Correlation_RWC_RNA_Mito
Nuclei_Granularity_14_AGP
Nuclei_Granularity_14_DNA
Nuclei_Granularity_14_ER
Nuclei_Granularity_14_Mito
Nuclei_Granularity_14_RNA
Nuclei_Granularity_15_AGP
Nuclei_Granularity_15_DNA
Nuclei_Granularity_15_ER
Nuclei_Granularity_15_Mito
Nuclei_Granularity_15_RNA
Nuclei_Granularity_16_AGP
Nuclei_Granularity_16_DNA
Nuclei_Granularity_16_ER
Nuclei_Granularity_16_Mito
Nuclei_Granularity_16_RNA
16 changes: 15 additions & 1 deletion pycytominer/feature_select.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@
Select features to use in downstream analysis based on specified selection method
"""

import os
import pandas as pd

from pycytominer.correlation_threshold import correlation_threshold
from pycytominer.variance_threshold import variance_threshold
from pycytominer.get_na_columns import get_na_columns
from pycytominer.cyto_utils.compress import compress
from pycytominer.cyto_utils.features import get_blacklist_features


def feature_select(
Expand Down Expand Up @@ -41,8 +43,14 @@ def feature_select(
unique_cut = kwargs.pop("unique_cut", 0.1)
how = kwargs.pop("how", None)
float_format = kwargs.pop("float_format", None)
blacklist_file = kwargs.pop("blacklist_file", None)

all_ops = ["variance_threshold", "correlation_threshold", "drop_na_columns"]
all_ops = [
"variance_threshold",
"correlation_threshold",
"drop_na_columns",
"blacklist",
]

# Make sure the user provides a supported operation
if isinstance(operation, list):
Expand Down Expand Up @@ -94,6 +102,12 @@ def feature_select(
threshold=corr_threshold,
method=corr_method,
)
elif op == "blacklist":
if blacklist_file:
exclude = get_blacklist_features(population_df=profiles, blacklist_file=blacklist_file)
else:
exclude = get_blacklist_features(population_df=profiles)

excluded_features += exclude

excluded_features = list(set(excluded_features))
Expand Down
30 changes: 30 additions & 0 deletions pycytominer/tests/test_feature_blacklist.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import os
import random
import pytest
import tempfile
import warnings
import pandas as pd
from pycytominer.cyto_utils.features import get_blacklist_features

# Path to data/blacklist_features.txt, resolved relative to this test module
_tests_dir = os.path.dirname(__file__)
blacklist_file = os.path.join(_tests_dir, "..", "data", "blacklist_features.txt")

# Expected feature list, read straight from the file's "blacklist" column
blacklist = pd.read_csv(blacklist_file)["blacklist"].tolist()

# Small profile dataframe with two feature columns used by the tests in this module
_toy_profile_columns = {
    "Nuclei_Correlation_Manders_AGP_DNA": [1, 3, 8, 5, 2, 2],
    "Nuclei_Correlation_RWC_ER_RNA": [9, 3, 8, 9, 2, 9],
}
data_blacklist_df = pd.DataFrame(_toy_profile_columns).reset_index(drop=True)


def test_blacklist():
    """Calling with no arguments returns the same list as reading the file directly."""
    observed = get_blacklist_features()
    assert observed == blacklist


def test_blacklist_df():
    """Subsetting by a dataframe returns exactly that dataframe's columns here."""
    expected = list(data_blacklist_df.columns)
    observed = get_blacklist_features(population_df=data_blacklist_df)
    assert expected == observed
19 changes: 19 additions & 0 deletions pycytominer/tests/test_feature_select.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,3 +165,22 @@ def test_feature_select_compress():
result = pd.read_csv(compress_file)

pd.testing.assert_frame_equal(result, expected_result)


def test_feature_select_blacklist():
    """
    Testing feature_select pycytominer function with the "blacklist" operation
    """

    # Two columns named like blacklist features plus two arbitrary columns
    data_blacklist_df = pd.DataFrame(
        {
            "Nuclei_Correlation_Manders_AGP_DNA": [1, 3, 8, 5, 2, 2],
            "y": [1, 2, 8, 5, 2, 1],
            "Nuclei_Correlation_RWC_ER_RNA": [9, 3, 8, 9, 2, 9],
            "zz": [0, -3, 8, 9, 6, 9],
        }
    ).reset_index(drop=True)

    result = feature_select(data_blacklist_df, operation="blacklist")

    # Only the non-blacklisted columns are expected to remain
    expected_result = pd.DataFrame(
        {"y": [1, 2, 8, 5, 2, 1], "zz": [0, -3, 8, 9, 6, 9]}
    )
    pd.testing.assert_frame_equal(result, expected_result)
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,5 @@
license="BSD 3-Clause License",
install_requires=["numpy", "pandas", "scikit-learn", "sqlalchemy"],
python_requires=">=3.4",
include_package_data=True,
)