From 768ddccb93a7591679ac159d2a05935ad423ef21 Mon Sep 17 00:00:00 2001
From: Greg Way
Date: Fri, 16 Aug 2019 16:17:31 -0400
Subject: [PATCH] Inferring Features in `feature_select.py` (#27)

* add infer features

* forgot to change default

* infer features in get na columns
---
 pycytominer/correlation_threshold.py     | 13 +++++++++----
 pycytominer/feature_select.py            | 26 ++++++++++++++++----------
 pycytominer/get_na_columns.py            |  6 +++++-
 pycytominer/normalize.py                 |  2 +-
 pycytominer/tests/test_get_na_columns.py |  2 +-
 pycytominer/variance_threshold.py        | 13 +++++++++----
 6 files changed, 41 insertions(+), 21 deletions(-)

diff --git a/pycytominer/correlation_threshold.py b/pycytominer/correlation_threshold.py
index fad6924d..88b41aa0 100644
--- a/pycytominer/correlation_threshold.py
+++ b/pycytominer/correlation_threshold.py
@@ -8,15 +8,16 @@
 
 
 def correlation_threshold(
-    population_df, features="none", samples="none", threshold=0.9, method="pearson"
+    population_df, features="infer", samples="none", threshold=0.9, method="pearson"
 ):
     """
     Exclude features that have correlations above a certain threshold
 
     Arguments:
     population_df - pandas DataFrame that includes metadata and observation features
-    features - a list of features present in the population dataframe
-               [default: "none"] - if "none", use all features
+    features - a list of features present in the population dataframe [default: "infer"]
+               if "infer", then assume cell painting features are those that do not
+               start with "Metadata_"
     samples - list samples to perform operation on
               [default: "none"] - if "none", use all samples to calculate
     threshold - float between (0, 1) to exclude features [default: 0.9]
@@ -39,7 +40,11 @@ def correlation_threshold(
     if samples != "none":
         population_df = population_df.loc[samples, :]
 
-    if features != "none":
+    if features == "infer":
+        features = [
+            x for x in population_df.columns.tolist() if not x.startswith("Metadata_")
+        ]
+    else:
         population_df = population_df.loc[:, features]
 
     data_cor_df = population_df.corr(method=method)
diff --git a/pycytominer/feature_select.py b/pycytominer/feature_select.py
index a49d1f95..668da45d 100644
--- a/pycytominer/feature_select.py
+++ b/pycytominer/feature_select.py
@@ -10,7 +10,7 @@
 
 def feature_select(
     profiles,
-    features="none",
+    features="infer",
     samples="none",
     operation="variance_threshold",
     output_file="none",
@@ -21,8 +21,9 @@
 
     Arguments:
     profiles - either pandas DataFrame or a file that stores profile data
-    features - list of cell painting features
-               [default: "none"] - if "none", use all features
+    features - list of cell painting features [default: "infer"]
+               if "infer", then assume cell painting features are those that do not
+               start with "Metadata_"
     samples - if provided, a list of samples to provide operation on
               [default: "none"] - if "none", use all samples to calculate
     operation - str or list of given operations to perform on input profiles
@@ -31,13 +32,6 @@
                   that this output file be suffixed with
                   "_normalized_variable_selected.csv".
     """
-    # Load Data
-    if not isinstance(profiles, pd.DataFrame):
-        try:
-            profiles = pd.read_csv(profiles)
-        except FileNotFoundError:
-            raise FileNotFoundError("{} profile file not found".format(profiles))
-
     na_cutoff = kwargs.pop("na_cutoff", 0.05)
     corr_threshold = kwargs.pop("corr_threshold", 0.9)
     corr_method = kwargs.pop("corr_method", "pearson")
@@ -59,6 +53,18 @@ def feature_select(
     else:
         return ValueError("Operation must be a list or string")
 
+    # Load Data
+    if not isinstance(profiles, pd.DataFrame):
+        try:
+            profiles = pd.read_csv(profiles)
+        except FileNotFoundError:
+            raise FileNotFoundError("{} profile file not found".format(profiles))
+
+    if features == "infer":
+        features = [
+            x for x in profiles.columns.tolist() if not x.startswith("Metadata_")
+        ]
+
     excluded_features = []
     for op in operation:
         if op == "variance_threshold":
diff --git a/pycytominer/get_na_columns.py b/pycytominer/get_na_columns.py
index e49914fc..d3081da9 100644
--- a/pycytominer/get_na_columns.py
+++ b/pycytominer/get_na_columns.py
@@ -25,7 +25,11 @@ def get_na_columns(population_df, features="none", samples="none", cutoff=0.05):
     if samples != "none":
         population_df = population_df.loc[samples, :]
 
-    if features != "none":
+    if features == "infer":
+        features = [
+            x for x in population_df.columns.tolist() if not x.startswith("Metadata_")
+        ]
+    else:
         population_df = population_df.loc[:, features]
 
     num_rows = population_df.shape[0]
diff --git a/pycytominer/normalize.py b/pycytominer/normalize.py
index cec780ef..33271cea 100644
--- a/pycytominer/normalize.py
+++ b/pycytominer/normalize.py
@@ -8,7 +8,7 @@
 
 def normalize(
     profiles,
-    features="none",
+    features="infer",
     meta_features="none",
     samples="all",
     method="standardize",
diff --git a/pycytominer/tests/test_get_na_columns.py b/pycytominer/tests/test_get_na_columns.py
index 226d7c9b..08842d57 100644
--- a/pycytominer/tests/test_get_na_columns.py
+++ b/pycytominer/tests/test_get_na_columns.py
@@ -23,7 +23,7 @@ def test_get_na_columns():
     assert get_na_columns_result == expected_result
 
     get_na_columns_result = get_na_columns(
-        population_df=data_df, features="none", cutoff=0.1
+        population_df=data_df, features="infer", cutoff=0.1
     )
     expected_result = ["x", "y", "z", "zz"]
     assert sorted(get_na_columns_result) == expected_result
diff --git a/pycytominer/variance_threshold.py b/pycytominer/variance_threshold.py
index 24e55435..b5b6ee71 100644
--- a/pycytominer/variance_threshold.py
+++ b/pycytominer/variance_threshold.py
@@ -8,15 +8,16 @@
 
 
 def variance_threshold(
-    population_df, features="none", samples="none", freq_cut=0.05, unique_cut=0.01
+    population_df, features="infer", samples="none", freq_cut=0.05, unique_cut=0.01
 ):
     """
     Exclude features that have correlations below a certain threshold
 
     Arguments:
     population_df - pandas DataFrame that includes metadata and observation features
-    features - a list of features present in the population dataframe
-               [default: "none"] - if "none", use all features
+    features - a list of features present in the population dataframe [default: "infer"]
+               if "infer", then assume cell painting features are those that do not
+               start with "Metadata_"
     samples - list samples to perform operation on
               [default: "none"] - if "none", use all samples to calculate
     freq_cut - float of ratio (second most common feature value / most common) [default: 0.1]
@@ -33,7 +34,11 @@ def variance_threshold(
     if samples != "none":
         population_df = population_df.loc[samples, :]
 
-    if features != "none":
+    if features == "infer":
+        features = [
+            x for x in population_df.columns.tolist() if not x.startswith("Metadata_")
+        ]
+    else:
         population_df = population_df.loc[:, features]
 
     # Test if excluded for low frequency
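
Usage sketch (not part of the patch): the import path and the `feature_select` arguments below are taken from the files changed above, but the toy DataFrame and its column names are hypothetical, and the snippet assumes pycytominer is installed. With the new default features="infer", every column that does not start with "Metadata_" is treated as a cell painting feature, exactly as the inserted list comprehension does.

    import pandas as pd

    from pycytominer.feature_select import feature_select

    # Hypothetical toy profiles: metadata columns carry the "Metadata_" prefix,
    # everything else counts as a cell painting feature when features="infer".
    profiles = pd.DataFrame(
        {
            "Metadata_plate": ["p1", "p1", "p2", "p2"],
            "Metadata_well": ["A01", "A02", "A01", "A02"],
            "Cells_AreaShape_Area": [100.0, 120.0, 90.0, 110.0],
            "Nuclei_Intensity_Mean": [0.50, 0.70, 0.55, 0.65],
        }
    )

    # The same inference rule this patch adds to each module:
    inferred = [x for x in profiles.columns.tolist() if not x.startswith("Metadata_")]
    print(inferred)  # ['Cells_AreaShape_Area', 'Nuclei_Intensity_Mean']

    # Features no longer need to be listed explicitly; "infer" is now the default.
    selected_df = feature_select(
        profiles=profiles, features="infer", operation="variance_threshold"
    )

Passing a file path instead of a DataFrame still works: the patch only moves the loading step so that it runs after the operation check and before features are inferred.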