-
Notifications
You must be signed in to change notification settings - Fork 35
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1 from gwaygenomics/add-initial-functions
Adding Initial Functions
- Loading branch information
Showing
11 changed files
with
319 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
__pycache__/ | ||
.ipynb_checkpoints/ |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
""" | ||
Aggregate single cell data based on given grouping variables | ||
""" | ||
|
||
import pandas as pd | ||
|
||
|
||
def aggregate(population_df, strata, variables="all", operation="median"):
    """
    Combine population dataframe variables by strata groups using given operation

    Arguments:
    population_df - pandas DataFrame to group and aggregate
    strata - list indicating the columns to groupby and aggregate
    variables - [default: "all"] or list indicating variables that should be aggregated
    operation - [default: "median"] a string indicating how the data is aggregated
                currently only supports one of ['mean', 'median']

    Return:
    Pandas DataFrame of aggregated features
    """

    # Normalize so callers may pass e.g. "Median"
    operation = operation.lower()

    assert operation in ["mean", "median"], "operation must be one of ['mean', 'median']"

    # Subset dataframe to only specified variables if provided,
    # keeping the strata columns so the groupby below still works
    if variables != "all":
        strata_df = population_df.loc[:, strata]
        population_df = population_df.loc[:, variables]
        population_df = pd.concat([strata_df, population_df], axis="columns")

    population_df = population_df.groupby(strata)

    if operation == "median":
        return population_df.median().reset_index()
    else:
        return population_df.mean().reset_index()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
""" | ||
Returns list of variables such that no two variables have a correlation greater than a | ||
specified threshold | ||
""" | ||
|
||
import numpy as np | ||
import pandas as pd | ||
|
||
|
||
def correlation_threshold(variables, data_df, threshold=0.9, method="pearson"):
    """
    Exclude variables that have correlations above a certain threshold

    Arguments:
    variables - list specifying observation variables
    data_df - Pandas DataFrame containing the data to calculate variable correlation
              typically, this DataFrame is a sampled subset of the full dataframe
    threshold - float between (0, 1) to exclude variables [default: 0.9]
    method - string indicating which correlation metric to use to test cutoff
             [default: "pearson"]

    Return:
    A list of variables to exclude
    """
    method = method.lower()

    assert 0 <= threshold <= 1, "threshold variable must be between (0 and 1)"
    assert method in [
        "pearson",
        "spearman",
        "kendall",
    ], "method not supported, select one of ['pearson', 'spearman', 'kendall']"

    # Correlation matrix across only the requested variables
    cor_matrix = data_df.loc[:, variables].corr(method=method)

    # Strict lower triangle of the matrix, so each variable pair appears once
    # and self-correlations are dropped
    lower_tri = cor_matrix.copy()
    lower_tri.loc[:, :] = np.tril(cor_matrix, k=-1)

    # Order variables by their total absolute correlation (ascending).
    # A variable appearing later in this index correlates more with the
    # rest of the data, so it is the preferred one to drop.
    rank_order = cor_matrix.abs().sum().sort_values().index

    # Reshape the lower triangle into long format: one row per variable pair
    pairs = lower_tri.stack().reset_index()
    pairs.columns = ["pair_a", "pair_b", "correlation"]

    # Restrict to the pairs whose correlation exceeds the threshold
    pairs = pairs.query("correlation > @threshold")

    # For each offending pair, flag the member with the higher overall correlation
    flagged = pairs.apply(
        lambda row: determine_high_cor_pair(row, rank_order), axis="columns"
    )

    # Deduplicate before returning
    return list(set(flagged.tolist()))


def determine_high_cor_pair(correlation_row, sorted_correlation_pairs):
    """
    Select highest correlated variable given a correlation row with columns:
    ["pair_a", "pair_b", "correlation"]
    For use in a pandas.apply()
    """
    pair_a = correlation_row["pair_a"]
    pair_b = correlation_row["pair_b"]

    # Whichever variable sits later in the sorted index has the larger
    # total correlation and is the one to exclude
    position_a = sorted_correlation_pairs.get_loc(pair_a)
    position_b = sorted_correlation_pairs.get_loc(pair_b)
    return pair_a if position_a > position_b else pair_b
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
""" | ||
Count the number of NAs per variable | ||
Note this was called `count_na_rows()` in cytominer | ||
""" | ||
|
||
import pandas as pd | ||
|
||
|
||
def count_na_features(population_df, variables):
    """
    Given a population dataframe and variables, count how many nas per feature

    Arguments:
    population_df - pandas DataFrame storing profiles
    variables - a list of features present in the population dataframe

    Return:
    Dataframe of NA counts per variable
    """

    # Sum boolean NA masks column-wise, then wrap the resulting Series
    # in a single-column DataFrame indexed by variable name
    na_counts = population_df.loc[:, variables].isna().sum()
    return pd.DataFrame(na_counts, columns=["num_na"])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
""" | ||
Remove variables with specified threshold of NA values | ||
Note: This was called `drop_na_columns` in cytominer for R | ||
""" | ||
|
||
import pandas as pd | ||
|
||
|
||
def get_na_columns(population_df, variables, cutoff=0.05):
    """
    Get features that have more NA values than cutoff defined

    Arguments:
    population_df - pandas DataFrame storing profiles
    variables - a list of features present in the population dataframe
    cutoff - float proportion of NA values above which a feature is
             excluded [default: 0.05]

    Output:
    A list of the features to exclude
    """

    # Proportion of missing values per requested variable
    num_rows = population_df.shape[0]
    na_prop_df = population_df.loc[:, variables].isna().sum() / num_rows

    # Keep only variables whose NA proportion exceeds the cutoff
    na_prop_df = na_prop_df[na_prop_df > cutoff]
    return list(set(na_prop_df.index.tolist()))
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
import pandas as pd
from pycytominer.aggregate import aggregate

# Shared fixture: two groups ("a" and "b") of three rows each
data_df = pd.DataFrame(
    {
        "g": ["a", "a", "a", "b", "b", "b"],
        "x": [1, 3, 8, 1, 3, 5],
        "y": [5, 3, 1, 8, 3, 1],
    }
)


def test_aggregate_median_allvar():
    """
    Testing aggregate pycytominer function
    """
    result = aggregate(
        population_df=data_df, strata=["g"], variables="all", operation="median"
    )
    expected = pd.DataFrame({"g": ["a", "b"], "x": [3, 3], "y": [3, 3]})
    assert result.equals(expected)


def test_aggregate_mean_allvar():
    """
    Testing aggregate pycytominer function
    """
    result = aggregate(
        population_df=data_df, strata=["g"], variables="all", operation="mean"
    )
    expected = pd.DataFrame({"g": ["a", "b"], "x": [4, 3], "y": [3, 4]})
    assert result.equals(expected)


def test_aggregate_median_subsetvar():
    """
    Testing aggregate pycytominer function
    """
    result = aggregate(
        population_df=data_df, strata=["g"], variables=["x"], operation="median"
    )
    expected = pd.DataFrame({"g": ["a", "b"], "x": [3, 3]})
    assert result.equals(expected)


def test_aggregate_mean_subsetvar():
    """
    Testing aggregate pycytominer function
    """
    result = aggregate(
        population_df=data_df, strata=["g"], variables=["x"], operation="mean"
    )
    expected = pd.DataFrame({"g": ["a", "b"], "x": [4, 3]})
    assert result.equals(expected)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
import pandas as pd
from pycytominer.correlation_threshold import correlation_threshold

# Fixture where only the (x, y) pair correlates above 0.9
data_df = pd.DataFrame(
    {
        "x": [1, 3, 8, 5, 2, 2],
        "y": [1, 2, 8, 5, 2, 1],
        "z": [9, 3, 8, 9, 2, 9],
        "zz": [0, -3, 8, 9, 6, 9],
    }
).reset_index(drop=True)


def test_correlation_threshold():
    """
    Testing correlation_threshold pycytominer function
    """
    result = correlation_threshold(
        variables=["x", "y", "z", "zz"],
        data_df=data_df,
        threshold=0.9,
        method="pearson",
    )

    # "y" is the more globally correlated member of the (x, y) pair
    assert result == ["y"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
import numpy as np
import pandas as pd
from pycytominer.count_na_features import count_na_features

# Fixture with a known NA count per column: x=1, y=2, z=0, zz=3
data_df = pd.DataFrame(
    {
        "x": [np.nan, 3, 8, 5, 2, 2],
        "y": [1, 2, 8, np.nan, 2, np.nan],
        "z": [9, 3, 8, 9, 2, 9],
        "zz": [np.nan, np.nan, 8, np.nan, 6, 9],
    }
).reset_index(drop=True)


def test_count_na_features():
    """
    Testing count_na_features pycytominer function
    """
    result = count_na_features(population_df=data_df, variables=["x", "zz"])

    expected = pd.DataFrame({"num_na": [1, 3]}, index=["x", "zz"])
    assert result.equals(expected)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
import numpy as np
import pandas as pd
from pycytominer.get_na_columns import get_na_columns

# Fixture with NA proportions: x=1/6, y=2/6, z=0, zz=3/6
data_df = pd.DataFrame(
    {
        "x": [np.nan, 3, 8, 5, 2, 2],
        "y": [1, 2, 8, np.nan, 2, np.nan],
        "z": [9, 3, 8, 9, 2, 9],
        "zz": [np.nan, np.nan, 8, np.nan, 6, 9],
    }
).reset_index(drop=True)


def test_get_na_columns():
    """
    Testing get_na_columns pycytominer function
    """
    result = get_na_columns(
        population_df=data_df, variables=["x", "y", "zz"], cutoff=0.4
    )

    # Only "zz" (3/6 NA) exceeds the 0.4 cutoff
    assert result == ["zz"]