Skip to content

Commit

Permalink
Merge pull request #1 from gwaygenomics/add-initial-functions
Browse files Browse the repository at this point in the history
Adding Initial Functions
  • Loading branch information
gwaybio authored Jul 3, 2019
2 parents 8cec875 + 0788c9d commit 17fe2ff
Show file tree
Hide file tree
Showing 11 changed files with 319 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
__pycache__/
.ipynb_checkpoints/
Empty file added pycytominer/__init__.py
Empty file.
38 changes: 38 additions & 0 deletions pycytominer/aggregate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
"""
Aggregate single cell data based on given grouping variables
"""

import pandas as pd


def aggregate(population_df, strata, variables="all", operation="median"):
    """
    Combine population dataframe variables by strata groups using given operation
    Arguments:
    population_df - pandas DataFrame to group and aggregate
    strata - list indicating the columns to groupby and aggregate
    variables - [default: "all"] or list indicating variables that should be aggregated
    operation - [default: "median"] a string indicating how the data is aggregated
                currently only supports one of ['mean', 'median']
    Return:
    Pandas DataFrame of aggregated features
    """

    # Normalize so "Median"/"MEAN" etc. are accepted
    operation = operation.lower()

    assert operation in ["mean", "median"], "operation must be one of ['mean', 'median']"

    # Subset dataframe to only specified variables if provided, keeping the
    # strata columns so the groupby below still has its keys
    if variables != "all":
        strata_df = population_df.loc[:, strata]
        population_df = population_df.loc[:, variables]
        population_df = pd.concat([strata_df, population_df], axis="columns")

    population_df = population_df.groupby(strata)

    # reset_index() turns the strata group keys back into ordinary columns
    if operation == "median":
        return population_df.median().reset_index()
    else:
        return population_df.mean().reset_index()
80 changes: 80 additions & 0 deletions pycytominer/correlation_threshold.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
"""
Returns list of variables such that no two variables have a correlation greater than a
specified threshold
"""

import numpy as np
import pandas as pd


def correlation_threshold(variables, data_df, threshold=0.9, method="pearson"):
    """
    Exclude variables that have correlations above a certain threshold
    Arguments:
    variables - list specifying observation variables
    data_df - Pandas DataFrame containing the data to calculate variable correlation
              typically, this DataFrame is a sampled subset of the full dataframe
    threshold - float between (0, 1) to exclude variables [default: 0.9]
    method - string indicating which correlation metric to use to test cutoff
             [default: "pearson"]
    Return:
    A list of variables to exclude
    """
    method = method.lower()

    assert 0 <= threshold <= 1, "threshold variable must be between (0 and 1)"
    assert method in [
        "pearson",
        "spearman",
        "kendall",
    ], "method not supported, select one of ['pearson', 'spearman', 'kendall']"

    # Subset dataframe and calculate correlation matrix across subset variables
    data_cor_df = data_df.loc[:, variables].corr(method=method)

    # Create a copy of the dataframe to generate upper triangle of zeros
    data_cor_zerotri_df = data_cor_df.copy()

    # Zero out upper triangle (and diagonal, k=-1) so each pair appears once
    data_cor_zerotri_df.loc[:, :] = np.tril(data_cor_df, k=-1)

    # Get absolute sum of correlation across variables
    # The lower the index, the less correlation to the full data frame
    # We want to drop variables with highest correlation, so drop higher index
    variable_cor_sum = data_cor_df.abs().sum().sort_values().index

    # Acquire pairwise correlations in a long format
    # Note that we are using the zero triangle DataFrame
    pairwise_df = data_cor_zerotri_df.stack().reset_index()
    pairwise_df.columns = ["pair_a", "pair_b", "correlation"]

    # And subset to only variable combinations that pass the threshold
    # NOTE(review): the cutoff is applied to the *signed* correlation, so
    # strongly negative pairs are never excluded — confirm this is intended
    pairwise_df = pairwise_df.query("correlation > @threshold")

    # Guard: if no pair exceeds the threshold there is nothing to exclude.
    # Calling .apply on an empty frame is fragile across pandas versions.
    if pairwise_df.empty:
        return []

    # Output the excluded variables
    excluded = pairwise_df.apply(
        lambda x: determine_high_cor_pair(x, variable_cor_sum), axis="columns"
    )

    return list(set(excluded.tolist()))


def determine_high_cor_pair(correlation_row, sorted_correlation_pairs):
    """
    Select highest correlated variable given a correlation row with columns:
    ["pair_a", "pair_b", "correlation"]
    For use in a pandas.apply()
    """
    # Position in the sorted index reflects total absolute correlation;
    # the later entry is the more-correlated variable and is returned
    position_a = sorted_correlation_pairs.get_loc(correlation_row["pair_a"])
    position_b = sorted_correlation_pairs.get_loc(correlation_row["pair_b"])

    if position_a > position_b:
        return correlation_row["pair_a"]
    return correlation_row["pair_b"]
23 changes: 23 additions & 0 deletions pycytominer/count_na_features.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
"""
Count the number of NAs per variable
Note this was called `count_na_rows()` in cytominer
"""

import pandas as pd


def count_na_features(population_df, variables):
    """
    Given a population dataframe and variables, count how many nas per feature
    Arguments:
    population_df - pandas DataFrame storing profiles
    variables - a list of features present in the population dataframe
    Return:
    Dataframe of NA counts per variable
    """

    # Count missing entries per requested feature and present as a
    # single-column DataFrame indexed by feature name
    na_counts = population_df.loc[:, variables].isna().sum()
    return na_counts.to_frame(name="num_na")
25 changes: 25 additions & 0 deletions pycytominer/get_na_columns.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
"""
Remove variables with specified threshold of NA values
Note: This was called `drop_na_columns` in cytominer for R
"""

import pandas as pd


def get_na_columns(population_df, variables, cutoff=0.05):
    """
    Get features that have more NA values than cutoff defined
    Arguments:
    population_df - pandas DataFrame storing profiles
    variables - a list of features present in the population dataframe
    cutoff - [default: 0.05] float of the maximum allowed NA proportion;
             features whose NA proportion strictly exceeds this are returned
    Output:
    A list of the features to exclude
    """

    num_rows = population_df.shape[0]

    # Proportion of missing values per requested feature
    na_prop_df = population_df.loc[:, variables].isna().sum() / num_rows

    # Keep only features exceeding the allowed proportion; sort for a
    # deterministic output order (set iteration order is arbitrary)
    na_prop_df = na_prop_df[na_prop_df > cutoff]
    return sorted(set(na_prop_df.index.tolist()))
Empty file added pycytominer/tests/__init__.py
Empty file.
72 changes: 72 additions & 0 deletions pycytominer/tests/test_aggregate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import pandas as pd
from pycytominer.aggregate import aggregate

# Build data to use in tests: two strata ("a" and "b"), three rows each
data_df = pd.DataFrame(
    {
        "g": ["a", "a", "a", "b", "b", "b"],
        "x": [1, 3, 8, 1, 3, 5],
        "y": [5, 3, 1, 8, 3, 1],
    }
)


def test_aggregate_median_allvar():
    """
    Testing aggregate pycytominer function
    """
    result = aggregate(
        population_df=data_df, strata=["g"], variables="all", operation="median"
    )

    # Per-stratum medians: x -> 3 for both groups, y -> 3 for both groups
    expected = pd.concat(
        [
            pd.DataFrame({"g": "a", "x": [3], "y": [3]}),
            pd.DataFrame({"g": "b", "x": [3], "y": [3]}),
        ]
    ).reset_index(drop=True)

    assert result.equals(expected)


def test_aggregate_mean_allvar():
    """
    Testing aggregate pycytominer function
    """
    result = aggregate(
        population_df=data_df, strata=["g"], variables="all", operation="mean"
    )

    # Per-stratum means: group "a" -> (x=4, y=3), group "b" -> (x=3, y=4)
    expected = pd.concat(
        [
            pd.DataFrame({"g": "a", "x": [4], "y": [3]}),
            pd.DataFrame({"g": "b", "x": [3], "y": [4]}),
        ]
    ).reset_index(drop=True)

    assert result.equals(expected)


def test_aggregate_median_subsetvar():
    """
    Testing aggregate pycytominer function
    """
    result = aggregate(
        population_df=data_df, strata=["g"], variables=["x"], operation="median"
    )

    # Only "x" is aggregated; "y" is dropped from the output
    assert result.equals(pd.DataFrame({"g": ["a", "b"], "x": [3, 3]}))


def test_aggregate_mean_subsetvar():
    """
    Testing aggregate pycytominer function
    """
    result = aggregate(
        population_df=data_df, strata=["g"], variables=["x"], operation="mean"
    )

    # Only "x" is aggregated; "y" is dropped from the output
    assert result.equals(pd.DataFrame({"g": ["a", "b"], "x": [4, 3]}))
28 changes: 28 additions & 0 deletions pycytominer/tests/test_correlation_threshold.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import pandas as pd
from pycytominer.correlation_threshold import correlation_threshold

# Build data to use in tests; x and y are highly correlated, z and zz are not
data_df = pd.DataFrame(
    {
        "x": [1, 3, 8, 5, 2, 2],
        "y": [1, 2, 8, 5, 2, 1],
        "z": [9, 3, 8, 9, 2, 9],
        "zz": [0, -3, 8, 9, 6, 9],
    }
)


def test_correlation_threshold():
    """
    Testing correlation_threshold pycytominer function
    """
    observed = correlation_threshold(
        variables=["x", "y", "z", "zz"],
        data_df=data_df,
        threshold=0.9,
        method="pearson",
    )

    # x and y correlate above 0.9; y has the higher total correlation and is excluded
    assert observed == ["y"]
26 changes: 26 additions & 0 deletions pycytominer/tests/test_count_na_features.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import numpy as np
import pandas as pd
from pycytominer.count_na_features import count_na_features

# Test fixture with a known number of NAs per column: x=1, y=2, z=0, zz=3
data_df = pd.DataFrame(
    {
        "x": [np.nan, 3, 8, 5, 2, 2],
        "y": [1, 2, 8, np.nan, 2, np.nan],
        "z": [9, 3, 8, 9, 2, 9],
        "zz": [np.nan, np.nan, 8, np.nan, 6, 9],
    }
)


def test_count_na_features():
    """
    Testing count_na_features pycytominer function
    """
    observed = count_na_features(population_df=data_df, variables=["x", "zz"])

    # x has one NA, zz has three
    expected = pd.DataFrame({"num_na": [1, 3]}, index=["x", "zz"])

    assert observed.equals(expected)
25 changes: 25 additions & 0 deletions pycytominer/tests/test_get_na_columns.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import numpy as np
import pandas as pd
from pycytominer.get_na_columns import get_na_columns

# Test fixture with known NA proportions per column: x=1/6, y=2/6, z=0, zz=3/6
data_df = pd.DataFrame(
    {
        "x": [np.nan, 3, 8, 5, 2, 2],
        "y": [1, 2, 8, np.nan, 2, np.nan],
        "z": [9, 3, 8, 9, 2, 9],
        "zz": [np.nan, np.nan, 8, np.nan, 6, 9],
    }
)


def test_get_na_columns():
    """
    Testing get_na_columns pycytominer function
    """
    observed = get_na_columns(
        population_df=data_df, variables=["x", "y", "zz"], cutoff=0.4
    )

    # Only zz (3/6 = 0.5 NA) exceeds the 0.4 cutoff
    assert observed == ["zz"]

0 comments on commit 17fe2ff

Please sign in to comment.