Skip to content

Commit

Permalink
Inferring Features in feature_select.py (#27)
Browse files Browse the repository at this point in the history
* add infer features

* forgot to change default

* infer features in get na columns
  • Loading branch information
gwaybio authored Aug 16, 2019
1 parent 44955f6 commit 768ddcc
Show file tree
Hide file tree
Showing 6 changed files with 41 additions and 21 deletions.
13 changes: 9 additions & 4 deletions pycytominer/correlation_threshold.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,16 @@


def correlation_threshold(
population_df, features="none", samples="none", threshold=0.9, method="pearson"
population_df, features="infer", samples="none", threshold=0.9, method="pearson"
):
"""
Exclude features that have correlations above a certain threshold
Arguments:
population_df - pandas DataFrame that includes metadata and observation features
features - a list of features present in the population dataframe
[default: "none"] - if "none", use all features
features - a list of features present in the population dataframe [default: "infer"]
if "infer", then assume cell painting features are those that do not
start with "Metadata_"
samples - list of samples to perform operation on
[default: "none"] - if "none", use all samples to calculate
threshold - float between (0, 1) to exclude features [default: 0.9]
Expand All @@ -39,7 +40,11 @@ def correlation_threshold(
if samples != "none":
population_df = population_df.loc[samples, :]

if features != "none":
if features == "infer":
features = [
x for x in population_df.columns.tolist() if not x.startswith("Metadata_")
]
else:
population_df = population_df.loc[:, features]

data_cor_df = population_df.corr(method=method)
Expand Down
26 changes: 16 additions & 10 deletions pycytominer/feature_select.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

def feature_select(
profiles,
features="none",
features="infer",
samples="none",
operation="variance_threshold",
output_file="none",
Expand All @@ -21,8 +21,9 @@ def feature_select(
Arguments:
profiles - either pandas DataFrame or a file that stores profile data
features - list of cell painting features
[default: "none"] - if "none", use all features
features - list of cell painting features [default: "infer"]
if "infer", then assume cell painting features are those that do not
start with "Metadata_"
samples - if provided, a list of samples to perform operation on
[default: "none"] - if "none", use all samples to calculate
operation - str or list of given operations to perform on input profiles
Expand All @@ -31,13 +32,6 @@ def feature_select(
that this output file be suffixed with
"_normalized_variable_selected.csv".
"""
# Load Data
if not isinstance(profiles, pd.DataFrame):
try:
profiles = pd.read_csv(profiles)
except FileNotFoundError:
raise FileNotFoundError("{} profile file not found".format(profiles))

na_cutoff = kwargs.pop("na_cutoff", 0.05)
corr_threshold = kwargs.pop("corr_threshold", 0.9)
corr_method = kwargs.pop("corr_method", "pearson")
Expand All @@ -59,6 +53,18 @@ def feature_select(
else:
return ValueError("Operation must be a list or string")

# Load Data
if not isinstance(profiles, pd.DataFrame):
try:
profiles = pd.read_csv(profiles)
except FileNotFoundError:
raise FileNotFoundError("{} profile file not found".format(profiles))

if features == "infer":
features = [
x for x in profiles.columns.tolist() if not x.startswith("Metadata_")
]

excluded_features = []
for op in operation:
if op == "variance_threshold":
Expand Down
6 changes: 5 additions & 1 deletion pycytominer/get_na_columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,11 @@ def get_na_columns(population_df, features="none", samples="none", cutoff=0.05):
if samples != "none":
population_df = population_df.loc[samples, :]

if features != "none":
if features == "infer":
features = [
x for x in population_df.columns.tolist() if not x.startswith("Metadata_")
]
else:
population_df = population_df.loc[:, features]

num_rows = population_df.shape[0]
Expand Down
2 changes: 1 addition & 1 deletion pycytominer/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

def normalize(
profiles,
features="none",
features="infer",
meta_features="none",
samples="all",
method="standardize",
Expand Down
2 changes: 1 addition & 1 deletion pycytominer/tests/test_get_na_columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def test_get_na_columns():
assert get_na_columns_result == expected_result

get_na_columns_result = get_na_columns(
population_df=data_df, features="none", cutoff=0.1
population_df=data_df, features="infer", cutoff=0.1
)
expected_result = ["x", "y", "z", "zz"]
assert sorted(get_na_columns_result) == expected_result
Expand Down
13 changes: 9 additions & 4 deletions pycytominer/variance_threshold.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,16 @@


def variance_threshold(
population_df, features="none", samples="none", freq_cut=0.05, unique_cut=0.01
population_df, features="infer", samples="none", freq_cut=0.05, unique_cut=0.01
):
"""
Exclude features that have low variance, as determined by the freq_cut and unique_cut cutoffs
Arguments:
population_df - pandas DataFrame that includes metadata and observation features
features - a list of features present in the population dataframe
[default: "none"] - if "none", use all features
features - a list of features present in the population dataframe [default: "infer"]
if "infer", then assume cell painting features are those that do not
start with "Metadata_"
samples - list of samples to perform operation on
[default: "none"] - if "none", use all samples to calculate
freq_cut - float of ratio (second most common feature value / most common) [default: 0.05]
Expand All @@ -33,7 +34,11 @@ def variance_threshold(
if samples != "none":
population_df = population_df.loc[samples, :]

if features != "none":
if features == "infer":
features = [
x for x in population_df.columns.tolist() if not x.startswith("Metadata_")
]
else:
population_df = population_df.loc[:, features]

# Test if excluded for low frequency
Expand Down

0 comments on commit 768ddcc

Please sign in to comment.