cytomining · gwaybio · Aug 26, 2019 · Aug 26, 2019 · Aug 26, 2019 · Aug 26, 2019
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1,6 @@
+include MANIFEST.in
+include LICENSE.md
+include README.md
+include setup.py
+include pycytominer/data/*
+
diff --git a/pycytominer/cyto_utils/features.py b/pycytominer/cyto_utils/features.py
@@ -0,0 +1,36 @@
+"""
+Utility functions to manipulate CellProfiler features
+"""
+
+import os
+import pandas as pd
+
+blacklist_file = os.path.join(
+    os.path.dirname(__file__), "..", "data", "blacklist_features.txt"
+)  # default blacklist: the one-column csv stored under pycytominer/data
+
+
+def get_blacklist_features(blacklist_file=blacklist_file, population_df=None):
+    """
+    Get a list of blacklist features
+
+    Arguments:
+    blacklist_file - file location of dataframe with features to exclude
+    population_df - profile dataframe used to subset blacklist features [default: None]
+
+    Return:
+    list of features to exclude from downstream analysis
+    """
+
+    blacklist = pd.read_csv(blacklist_file)
+
+    assert any(
+        [x == "blacklist" for x in blacklist.columns]
+    ), "one column must be named 'blacklist'"
+
+    blacklist_features = blacklist.blacklist.to_list()
+    if isinstance(population_df, pd.DataFrame):
+        population_features = population_df.columns.tolist()
+        blacklist_features = [x for x in blacklist_features if x in population_features]
+
+    return blacklist_features
diff --git a/pycytominer/data/blacklist_features.txt b/pycytominer/data/blacklist_features.txt
@@ -0,0 +1,56 @@
+blacklist
+Nuclei_Correlation_Manders_AGP_DNA
+Nuclei_Correlation_Manders_AGP_ER
+Nuclei_Correlation_Manders_AGP_Mito
+Nuclei_Correlation_Manders_AGP_RNA
+Nuclei_Correlation_Manders_DNA_AGP
+Nuclei_Correlation_Manders_DNA_ER
+Nuclei_Correlation_Manders_DNA_Mito
+Nuclei_Correlation_Manders_DNA_RNA
+Nuclei_Correlation_Manders_ER_AGP
+Nuclei_Correlation_Manders_ER_DNA
+Nuclei_Correlation_Manders_ER_Mito
+Nuclei_Correlation_Manders_ER_RNA
+Nuclei_Correlation_Manders_Mito_AGP
+Nuclei_Correlation_Manders_Mito_DNA
+Nuclei_Correlation_Manders_Mito_ER
+Nuclei_Correlation_Manders_Mito_RNA
+Nuclei_Correlation_Manders_RNA_AGP
+Nuclei_Correlation_Manders_RNA_DNA
+Nuclei_Correlation_Manders_RNA_ER
+Nuclei_Correlation_Manders_RNA_Mito
+Nuclei_Correlation_RWC_AGP_DNA
+Nuclei_Correlation_RWC_AGP_ER
+Nuclei_Correlation_RWC_AGP_Mito
+Nuclei_Correlation_RWC_AGP_RNA
+Nuclei_Correlation_RWC_DNA_AGP
+Nuclei_Correlation_RWC_DNA_ER
+Nuclei_Correlation_RWC_DNA_Mito
+Nuclei_Correlation_RWC_DNA_RNA
+Nuclei_Correlation_RWC_ER_AGP
+Nuclei_Correlation_RWC_ER_DNA
+Nuclei_Correlation_RWC_ER_Mito
+Nuclei_Correlation_RWC_ER_RNA
+Nuclei_Correlation_RWC_Mito_AGP
+Nuclei_Correlation_RWC_Mito_DNA
+Nuclei_Correlation_RWC_Mito_ER
+Nuclei_Correlation_RWC_Mito_RNA
+Nuclei_Correlation_RWC_RNA_AGP
+Nuclei_Correlation_RWC_RNA_DNA
+Nuclei_Correlation_RWC_RNA_ER
+Nuclei_Correlation_RWC_RNA_Mito
+Nuclei_Granularity_14_AGP
+Nuclei_Granularity_14_DNA
+Nuclei_Granularity_14_ER
+Nuclei_Granularity_14_Mito
+Nuclei_Granularity_14_RNA
+Nuclei_Granularity_15_AGP
+Nuclei_Granularity_15_DNA
+Nuclei_Granularity_15_ER
+Nuclei_Granularity_15_Mito
+Nuclei_Granularity_15_RNA
+Nuclei_Granularity_16_AGP
+Nuclei_Granularity_16_DNA
+Nuclei_Granularity_16_ER
+Nuclei_Granularity_16_Mito
+Nuclei_Granularity_16_RNA
diff --git a/pycytominer/feature_select.py b/pycytominer/feature_select.py
@@ -2,12 +2,14 @@
 Select features to use in downstream analysis based on specified selection method
 """
 
+import os
 import pandas as pd
 
 from pycytominer.correlation_threshold import correlation_threshold
 from pycytominer.variance_threshold import variance_threshold
 from pycytominer.get_na_columns import get_na_columns
 from pycytominer.cyto_utils.compress import compress
+from pycytominer.cyto_utils.features import get_blacklist_features
 
 
 def feature_select(
@@ -41,8 +43,14 @@ def feature_select(
     unique_cut = kwargs.pop("unique_cut", 0.1)
     how = kwargs.pop("how", None)
     float_format = kwargs.pop("float_format", None)
+    blacklist_file = kwargs.pop("blacklist_file", None)
 
-    all_ops = ["variance_threshold", "correlation_threshold", "drop_na_columns"]
+    all_ops = [
+        "variance_threshold",
+        "correlation_threshold",
+        "drop_na_columns",
+        "blacklist",
+    ]
 
     # Make sure the user provides a supported operation
     if isinstance(operation, list):
@@ -94,6 +102,12 @@ def feature_select(
                 threshold=corr_threshold,
                 method=corr_method,
             )
+        elif op == "blacklist":
+            if blacklist_file:
+                exclude = get_blacklist_features(population_df=profiles, blacklist_file=blacklist_file)
+            else:
+                exclude = get_blacklist_features(population_df=profiles)
+
         excluded_features += exclude
 
     excluded_features = list(set(excluded_features))

diff --git a/pycytominer/tests/test_feature_blacklist.py b/pycytominer/tests/test_feature_blacklist.py
@@ -0,0 +1,30 @@
+import os
+import random
+import pytest
+import tempfile
+import warnings
+import pandas as pd
+from pycytominer.cyto_utils.features import get_blacklist_features
+
+blacklist_file = os.path.join(
+    os.path.dirname(__file__), "..", "data", "blacklist_features.txt"
+)
+
+blacklist = pd.read_csv(blacklist_file).blacklist.tolist()
+
+data_blacklist_df = pd.DataFrame(
+    {
+        "Nuclei_Correlation_Manders_AGP_DNA": [1, 3, 8, 5, 2, 2],
+        "Nuclei_Correlation_RWC_ER_RNA": [9, 3, 8, 9, 2, 9],
+    }
+).reset_index(drop=True)
+
+
+def test_blacklist():
+    blacklist_from_func = get_blacklist_features()
+    assert blacklist == blacklist_from_func
+
+
+def test_blacklist_df():
+    blacklist_from_func = get_blacklist_features(population_df=data_blacklist_df)
+    assert data_blacklist_df.columns.tolist() == blacklist_from_func
diff --git a/pycytominer/tests/test_feature_select.py b/pycytominer/tests/test_feature_select.py
@@ -165,3 +165,22 @@ def test_feature_select_compress():
     result = pd.read_csv(compress_file)
 
     pd.testing.assert_frame_equal(result, expected_result)
+
+
+def test_feature_select_blacklist():
+    """
+    Testing feature_select and get_na_columns pycytominer function
+    """
+
+    data_blacklist_df = pd.DataFrame(
+        {
+            "Nuclei_Correlation_Manders_AGP_DNA": [1, 3, 8, 5, 2, 2],
+            "y": [1, 2, 8, 5, 2, 1],
+            "Nuclei_Correlation_RWC_ER_RNA": [9, 3, 8, 9, 2, 9],
+            "zz": [0, -3, 8, 9, 6, 9],
+        }
+    ).reset_index(drop=True)
+
+    result = feature_select(data_blacklist_df, operation="blacklist")
+    expected_result = pd.DataFrame({"y": [1, 2, 8, 5, 2, 1], "zz": [0, -3, 8, 9, 6, 9]})
+    pd.testing.assert_frame_equal(result, expected_result)
diff --git a/setup.py b/setup.py
@@ -19,4 +19,5 @@
     license="BSD 3-Clause License",
     install_requires=["numpy", "pandas", "scikit-learn", "sqlalchemy"],
     python_requires=">=3.4",
+    include_package_data=True,
 )