Skip to content

Commit

Permalink
closes cytomining#58
Browse files Browse the repository at this point in the history
  • Loading branch information
gwaybio committed Dec 4, 2019
1 parent 53993d4 commit b1f7693
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 15 deletions.
37 changes: 23 additions & 14 deletions pycytominer/feature_select.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,33 +18,41 @@ def feature_select(
samples="none",
operation="variance_threshold",
output_file="none",
**kwargs
na_cutoff=0.05,
corr_threshold=0.9,
corr_method="pearson",
freq_cut=0.05,
unique_cut=0.1,
compression=None,
float_format=None,
blacklist_file=None,
):
"""
Performs feature selection based on the given operation
Arguments:
profiles - either pandas DataFrame or a file that stores profile data
features - list of cell painting features [default: "infer"]
if "infer", then assume cell painting features are those that do not
start with "Cells", "Nuclei", or "Cytoplasm"
if "infer", then assume cell painting features are those that start with
"Cells", "Nuclei", or "Cytoplasm"
samples - if provided, a list of samples to provide operation on
[default: "none"] - if "none", use all samples to calculate
operation - str or list of given operations to perform on input profiles
output_file - [default: "none"] if provided, will write annotated profiles to file
if not specified, will return the annotated profiles. We recommend
that this output file be suffixed with
"_normalized_variable_selected.csv".
na_cutoff - proportion of missing values in a column to tolerate before removing [default: 0.05]
corr_threshold - float between (0, 1) to exclude features above [default: 0.9]
freq_cut - float of ratio (2nd most common feature val / most common) [default: 0.05]
unique_cut - float of ratio (num unique features / num samples) [default: 0.1]
compression - the mechanism to compress [default: None]
float_format - decimal precision to use in writing output file [default: None]
For example, use "%.3g" for 3 decimal precision.
blacklist_file - file location of dataframe with features to exclude [default: None]
Note that if "blacklist" in operation then will remove standard
blacklist
"""
na_cutoff = kwargs.pop("na_cutoff", 0.05)
corr_threshold = kwargs.pop("corr_threshold", 0.9)
corr_method = kwargs.pop("corr_method", "pearson")
freq_cut = kwargs.pop("freq_cut", 0.05)
unique_cut = kwargs.pop("unique_cut", 0.1)
compression = kwargs.pop("compression", None)
float_format = kwargs.pop("float_format", None)
blacklist_file = kwargs.pop("blacklist_file", None)

all_ops = [
"variance_threshold",
"correlation_threshold",
Expand All @@ -64,7 +72,6 @@ def feature_select(
operation = operation.split()
else:
return ValueError("Operation must be a list or string")

# Load Data
if not isinstance(profiles, pd.DataFrame):
try:
Expand Down Expand Up @@ -102,7 +109,9 @@ def feature_select(
)
elif op == "blacklist":
if blacklist_file:
exclude = get_blacklist_features(population_df=profiles, blacklist_file=blacklist_file)
exclude = get_blacklist_features(
population_df=profiles, blacklist_file=blacklist_file
)
else:
exclude = get_blacklist_features(population_df=profiles)

Expand Down
10 changes: 9 additions & 1 deletion pycytominer/tests/test_feature_select.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,14 @@ def test_feature_select_get_na_columns():
expected_result = pd.DataFrame({"yy": [1, 2, 8, 10, 2, 100]})
pd.testing.assert_frame_equal(result, expected_result)

result = feature_select(
data_na_df,
features=data_na_df.columns.tolist(),
operation="drop_na_columns",
na_cutoff=1,
)
pd.testing.assert_frame_equal(result, data_na_df)

result = feature_select(
data_na_df,
features=data_na_df.columns.tolist(),
Expand Down Expand Up @@ -232,7 +240,7 @@ def test_feature_select_compress():
features=data_na_df.columns.tolist(),
operation="drop_na_columns",
output_file=compress_file,
how="gzip",
compression="gzip",
)
expected_result = pd.DataFrame({"yy": [1, 2, 8, 10, 2, 100]})
result = pd.read_csv(compress_file)
Expand Down

0 comments on commit b1f7693

Please sign in to comment.