From 768ddccb93a7591679ac159d2a05935ad423ef21 Mon Sep 17 00:00:00 2001
From: Greg Way
Date: Fri, 16 Aug 2019 16:17:31 -0400
Subject: [PATCH] Inferring Features in `feature_select.py` (#27)

* add infer features

* forgot to change default

* infer features in get na columns
---
 pycytominer/correlation_threshold.py     | 13 +++++++++----
 pycytominer/feature_select.py            | 26 ++++++++++++++++----------
 pycytominer/get_na_columns.py            |  6 +++++-
 pycytominer/normalize.py                 |  2 +-
 pycytominer/tests/test_get_na_columns.py |  2 +-
 pycytominer/variance_threshold.py        | 13 +++++++++----
 6 files changed, 41 insertions(+), 21 deletions(-)

diff --git a/pycytominer/correlation_threshold.py b/pycytominer/correlation_threshold.py
index fad6924d..88b41aa0 100644
--- a/pycytominer/correlation_threshold.py
+++ b/pycytominer/correlation_threshold.py
@@ -8,15 +8,16 @@
 
 
 def correlation_threshold(
-    population_df, features="none", samples="none", threshold=0.9, method="pearson"
+    population_df, features="infer", samples="none", threshold=0.9, method="pearson"
 ):
     """
     Exclude features that have correlations above a certain threshold
 
     Arguments:
     population_df - pandas DataFrame that includes metadata and observation features
-    features - a list of features present in the population dataframe
-               [default: "none"] - if "none", use all features
+    features - a list of features present in the population dataframe [default: "infer"]
+               if "infer", then assume cell painting features are those that do not
+               start with "Metadata_"
     samples - list samples to perform operation on
               [default: "none"] - if "none", use all samples to calculate
     threshold - float between (0, 1) to exclude features [default: 0.9]
@@ -39,7 +40,11 @@ def correlation_threshold(
     if samples != "none":
         population_df = population_df.loc[samples, :]
 
-    if features != "none":
+    if features == "infer":
+        features = [
+            x for x in population_df.columns.tolist() if not x.startswith("Metadata_")
+        ]
+    else:
         population_df = population_df.loc[:, features]
 
     data_cor_df = population_df.corr(method=method)
diff --git a/pycytominer/feature_select.py b/pycytominer/feature_select.py
index a49d1f95..668da45d 100644
--- a/pycytominer/feature_select.py
+++ b/pycytominer/feature_select.py
@@ -10,7 +10,7 @@
 
 def feature_select(
     profiles,
-    features="none",
+    features="infer",
     samples="none",
     operation="variance_threshold",
     output_file="none",
@@ -21,8 +21,9 @@
 
     Arguments:
     profiles - either pandas DataFrame or a file that stores profile data
-    features - list of cell painting features
-               [default: "none"] - if "none", use all features
+    features - list of cell painting features [default: "infer"]
+               if "infer", then assume cell painting features are those that do not
+               start with "Metadata_"
     samples - if provided, a list of samples to provide operation on
               [default: "none"] - if "none", use all samples to calculate
     operation - str or list of given operations to perform on input profiles
@@ -31,13 +32,6 @@
                   that this output file be suffixed with
                   "_normalized_variable_selected.csv".
     """
-    # Load Data
-    if not isinstance(profiles, pd.DataFrame):
-        try:
-            profiles = pd.read_csv(profiles)
-        except FileNotFoundError:
-            raise FileNotFoundError("{} profile file not found".format(profiles))
-
     na_cutoff = kwargs.pop("na_cutoff", 0.05)
     corr_threshold = kwargs.pop("corr_threshold", 0.9)
     corr_method = kwargs.pop("corr_method", "pearson")
@@ -59,6 +53,18 @@ def feature_select(
     else:
         return ValueError("Operation must be a list or string")
 
+    # Load Data
+    if not isinstance(profiles, pd.DataFrame):
+        try:
+            profiles = pd.read_csv(profiles)
+        except FileNotFoundError:
+            raise FileNotFoundError("{} profile file not found".format(profiles))
+
+    if features == "infer":
+        features = [
+            x for x in profiles.columns.tolist() if not x.startswith("Metadata_")
+        ]
+
     excluded_features = []
     for op in operation:
         if op == "variance_threshold":
diff --git a/pycytominer/get_na_columns.py b/pycytominer/get_na_columns.py
index e49914fc..d3081da9 100644
--- a/pycytominer/get_na_columns.py
+++ b/pycytominer/get_na_columns.py
@@ -25,7 +25,11 @@ def get_na_columns(population_df, features="none", samples="none", cutoff=0.05):
     if samples != "none":
         population_df = population_df.loc[samples, :]
 
-    if features != "none":
+    if features == "infer":
+        features = [
+            x for x in population_df.columns.tolist() if not x.startswith("Metadata_")
+        ]
+    else:
         population_df = population_df.loc[:, features]
 
     num_rows = population_df.shape[0]
diff --git a/pycytominer/normalize.py b/pycytominer/normalize.py
index cec780ef..33271cea 100644
--- a/pycytominer/normalize.py
+++ b/pycytominer/normalize.py
@@ -8,7 +8,7 @@
 
 def normalize(
     profiles,
-    features="none",
+    features="infer",
     meta_features="none",
     samples="all",
     method="standardize",
diff --git a/pycytominer/tests/test_get_na_columns.py b/pycytominer/tests/test_get_na_columns.py
index 226d7c9b..08842d57 100644
--- a/pycytominer/tests/test_get_na_columns.py
+++ b/pycytominer/tests/test_get_na_columns.py
@@ -23,7 +23,7 @@ def test_get_na_columns():
     assert get_na_columns_result == expected_result
 
     get_na_columns_result = get_na_columns(
-        population_df=data_df, features="none", cutoff=0.1
+        population_df=data_df, features="infer", cutoff=0.1
     )
     expected_result = ["x", "y", "z", "zz"]
     assert sorted(get_na_columns_result) == expected_result
diff --git a/pycytominer/variance_threshold.py b/pycytominer/variance_threshold.py
index 24e55435..b5b6ee71 100644
--- a/pycytominer/variance_threshold.py
+++ b/pycytominer/variance_threshold.py
@@ -8,15 +8,16 @@
 
 
 def variance_threshold(
-    population_df, features="none", samples="none", freq_cut=0.05, unique_cut=0.01
+    population_df, features="infer", samples="none", freq_cut=0.05, unique_cut=0.01
 ):
     """
     Exclude features that have correlations below a certain threshold
 
     Arguments:
     population_df - pandas DataFrame that includes metadata and observation features
-    features - a list of features present in the population dataframe
-               [default: "none"] - if "none", use all features
+    features - a list of features present in the population dataframe [default: "infer"]
+               if "infer", then assume cell painting features are those that do not
+               start with "Metadata_"
     samples - list samples to perform operation on
               [default: "none"] - if "none", use all samples to calculate
     freq_cut - float of ratio (second most common feature value / most common) [default: 0.1]
@@ -33,7 +34,11 @@ def variance_threshold(
     if samples != "none":
         population_df = population_df.loc[samples, :]
 
-    if features != "none":
+    if features == "infer":
+        features = [
+            x for x in population_df.columns.tolist() if not x.startswith("Metadata_")
+        ]
+    else:
         population_df = population_df.loc[:, features]
 
     # Test if excluded for low frequency
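
Usage sketch (not part of the patch): the import path and the `feature_select` arguments below are taken from the files changed above, but the toy DataFrame and its column names are hypothetical, and the snippet assumes pycytominer is installed. With the new default features="infer", every column that does not start with "Metadata_" is treated as a cell painting feature, exactly as the inserted list comprehension does.

    import pandas as pd

    from pycytominer.feature_select import feature_select

    # Hypothetical toy profiles: metadata columns carry the "Metadata_" prefix,
    # everything else counts as a cell painting feature when features="infer".
    profiles = pd.DataFrame(
        {
            "Metadata_plate": ["p1", "p1", "p2", "p2"],
            "Metadata_well": ["A01", "A02", "A01", "A02"],
            "Cells_AreaShape_Area": [100.0, 120.0, 90.0, 110.0],
            "Nuclei_Intensity_Mean": [0.50, 0.70, 0.55, 0.65],
        }
    )

    # The same inference rule this patch adds to each module:
    inferred = [x for x in profiles.columns.tolist() if not x.startswith("Metadata_")]
    print(inferred)  # ['Cells_AreaShape_Area', 'Nuclei_Intensity_Mean']

    # Features no longer need to be listed explicitly; "infer" is now the default.
    selected_df = feature_select(
        profiles=profiles, features="infer", operation="variance_threshold"
    )

Passing a file path instead of a DataFrame still works: the patch only moves the loading step so that it runs after the operation check and before features are inferred.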