Skip to content

Commit

Permalink
Merge pull request #1 from gwaygenomics/add-initial-functions
Browse files Browse the repository at this point in the history
Adding Initial Functions
  • Loading branch information
gwaybio authored Jul 3, 2019
2 parents 8cec875 + 0788c9d commit 17fe2ff
Show file tree
Hide file tree
Showing 11 changed files with 319 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
__pycache__/
.ipynb_checkpoints/
Empty file added pycytominer/__init__.py
Empty file.
38 changes: 38 additions & 0 deletions pycytominer/aggregate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
"""
Aggregate single cell data based on given grouping variables
"""

import pandas as pd


def aggregate(population_df, strata, variables="all", operation="median"):
    """
    Combine population dataframe variables by strata groups using given operation
    Arguments:
    population_df - pandas DataFrame to group and aggregate
    strata - list indicating the columns to groupby and aggregate
    variables - [default: "all"] or list indicating variables that should be aggregated
    operation - [default: "median"] a string indicating how the data is aggregated
                currently only supports one of ['mean', 'median']
    Return:
    Pandas DataFrame of aggregated features
    """

    # Normalize so "Median"/"MEAN" etc. are accepted
    operation = operation.lower()

    assert operation in ["mean", "median"], "operation must be one of ['mean', 'median']"

    # Subset dataframe to only specified variables if provided, keeping the
    # strata columns so the groupby below still has its keys
    if variables != "all":
        strata_df = population_df.loc[:, strata]
        population_df = population_df.loc[:, variables]
        population_df = pd.concat([strata_df, population_df], axis="columns")

    population_df = population_df.groupby(strata)

    # reset_index() turns the strata group keys back into ordinary columns
    if operation == "median":
        return population_df.median().reset_index()
    else:
        return population_df.mean().reset_index()
80 changes: 80 additions & 0 deletions pycytominer/correlation_threshold.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
"""
Returns list of variables such that no two variables have a correlation greater than a
specified threshold
"""

import numpy as np
import pandas as pd


def correlation_threshold(variables, data_df, threshold=0.9, method="pearson"):
    """
    Exclude variables that have correlations above a certain threshold
    Arguments:
    variables - list specifying observation variables
    data_df - Pandas DataFrame containing the data to calculate variable correlation
              typically, this DataFrame is a sampled subset of the full dataframe
    threshold - float between (0, 1) to exclude variables [default: 0.9]
    method - string indicating which correlation metric to use to test cutoff
             [default: "pearson"]
    Return:
    A list of variables to exclude
    """
    method = method.lower()

    assert 0 <= threshold <= 1, "threshold variable must be between (0 and 1)"
    assert method in [
        "pearson",
        "spearman",
        "kendall",
    ], "method not supported, select one of ['pearson', 'spearman', 'kendall']"

    # Subset dataframe and calculate correlation matrix across subset variables
    data_cor_df = data_df.loc[:, variables].corr(method=method)

    # Create a copy of the dataframe to generate upper triangle of zeros
    data_cor_zerotri_df = data_cor_df.copy()

    # Zero out upper triangle (and diagonal, k=-1) so each pair appears once
    data_cor_zerotri_df.loc[:, :] = np.tril(data_cor_df, k=-1)

    # Get absolute sum of correlation across variables
    # The lower the index, the less correlation to the full data frame
    # We want to drop variables with highest correlation, so drop higher index
    variable_cor_sum = data_cor_df.abs().sum().sort_values().index

    # Acquire pairwise correlations in a long format
    # Note that we are using the zero triangle DataFrame
    pairwise_df = data_cor_zerotri_df.stack().reset_index()
    pairwise_df.columns = ["pair_a", "pair_b", "correlation"]

    # And subset to only variable combinations that pass the threshold
    # NOTE(review): the cutoff is applied to the *signed* correlation, so
    # strongly negative pairs are never excluded — confirm this is intended
    pairwise_df = pairwise_df.query("correlation > @threshold")

    # Guard: if no pair exceeds the threshold there is nothing to exclude.
    # Calling .apply on an empty frame is fragile across pandas versions.
    if pairwise_df.empty:
        return []

    # Output the excluded variables
    excluded = pairwise_df.apply(
        lambda x: determine_high_cor_pair(x, variable_cor_sum), axis="columns"
    )

    return list(set(excluded.tolist()))


def determine_high_cor_pair(correlation_row, sorted_correlation_pairs):
    """
    Select highest correlated variable given a correlation row with columns:
    ["pair_a", "pair_b", "correlation"]
    For use in a pandas.apply()
    """
    # Position in the sorted index reflects total absolute correlation;
    # the later entry is the more-correlated variable and is returned
    position_a = sorted_correlation_pairs.get_loc(correlation_row["pair_a"])
    position_b = sorted_correlation_pairs.get_loc(correlation_row["pair_b"])

    if position_a > position_b:
        return correlation_row["pair_a"]
    return correlation_row["pair_b"]
23 changes: 23 additions & 0 deletions pycytominer/count_na_features.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
"""
Count the number of NAs per variable
Note this was called `count_na_rows()` in cytominer
"""

import pandas as pd


def count_na_features(population_df, variables):
    """
    Given a population dataframe and variables, count how many nas per feature
    Arguments:
    population_df - pandas DataFrame storing profiles
    variables - a list of features present in the population dataframe
    Return:
    Dataframe of NA counts per variable
    """

    # Count missing entries per requested feature and present as a
    # single-column DataFrame indexed by feature name
    na_counts = population_df.loc[:, variables].isna().sum()
    return na_counts.to_frame(name="num_na")
25 changes: 25 additions & 0 deletions pycytominer/get_na_columns.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
"""
Remove variables with specified threshold of NA values
Note: This was called `drop_na_columns` in cytominer for R
"""

import pandas as pd


def get_na_columns(population_df, variables, cutoff=0.05):
    """
    Get features that have more NA values than cutoff defined
    Arguments:
    population_df - pandas DataFrame storing profiles
    variables - a list of features present in the population dataframe
    cutoff - [default: 0.05] float of the maximum allowed NA proportion;
             features whose NA proportion strictly exceeds this are returned
    Output:
    A list of the features to exclude
    """

    num_rows = population_df.shape[0]

    # Proportion of missing values per requested feature
    na_prop_df = population_df.loc[:, variables].isna().sum() / num_rows

    # Keep only features exceeding the allowed proportion; sort for a
    # deterministic output order (set iteration order is arbitrary)
    na_prop_df = na_prop_df[na_prop_df > cutoff]
    return sorted(set(na_prop_df.index.tolist()))
Empty file added pycytominer/tests/__init__.py
Empty file.
72 changes: 72 additions & 0 deletions pycytominer/tests/test_aggregate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import pandas as pd
from pycytominer.aggregate import aggregate

# Build data to use in tests: two strata ("a" and "b"), three rows each
data_df = pd.DataFrame(
    {
        "g": ["a", "a", "a", "b", "b", "b"],
        "x": [1, 3, 8, 1, 3, 5],
        "y": [5, 3, 1, 8, 3, 1],
    }
)


def test_aggregate_median_allvar():
    """
    Testing aggregate pycytominer function
    """
    result = aggregate(
        population_df=data_df, strata=["g"], variables="all", operation="median"
    )

    # Per-stratum medians: x -> 3 for both groups, y -> 3 for both groups
    expected = pd.concat(
        [
            pd.DataFrame({"g": "a", "x": [3], "y": [3]}),
            pd.DataFrame({"g": "b", "x": [3], "y": [3]}),
        ]
    ).reset_index(drop=True)

    assert result.equals(expected)


def test_aggregate_mean_allvar():
    """
    Testing aggregate pycytominer function
    """
    result = aggregate(
        population_df=data_df, strata=["g"], variables="all", operation="mean"
    )

    # Per-stratum means: group "a" -> (x=4, y=3), group "b" -> (x=3, y=4)
    expected = pd.concat(
        [
            pd.DataFrame({"g": "a", "x": [4], "y": [3]}),
            pd.DataFrame({"g": "b", "x": [3], "y": [4]}),
        ]
    ).reset_index(drop=True)

    assert result.equals(expected)


def test_aggregate_median_subsetvar():
    """
    Testing aggregate pycytominer function
    """
    result = aggregate(
        population_df=data_df, strata=["g"], variables=["x"], operation="median"
    )

    # Only "x" is aggregated; "y" is dropped from the output
    assert result.equals(pd.DataFrame({"g": ["a", "b"], "x": [3, 3]}))


def test_aggregate_mean_subsetvar():
    """
    Testing aggregate pycytominer function
    """
    result = aggregate(
        population_df=data_df, strata=["g"], variables=["x"], operation="mean"
    )

    # Only "x" is aggregated; "y" is dropped from the output
    assert result.equals(pd.DataFrame({"g": ["a", "b"], "x": [4, 3]}))
28 changes: 28 additions & 0 deletions pycytominer/tests/test_correlation_threshold.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import pandas as pd
from pycytominer.correlation_threshold import correlation_threshold

# Build data to use in tests; x and y are highly correlated, z and zz are not
data_df = pd.DataFrame(
    {
        "x": [1, 3, 8, 5, 2, 2],
        "y": [1, 2, 8, 5, 2, 1],
        "z": [9, 3, 8, 9, 2, 9],
        "zz": [0, -3, 8, 9, 6, 9],
    }
)


def test_correlation_threshold():
    """
    Testing correlation_threshold pycytominer function
    """
    observed = correlation_threshold(
        variables=["x", "y", "z", "zz"],
        data_df=data_df,
        threshold=0.9,
        method="pearson",
    )

    # x and y correlate above 0.9; y has the higher total correlation and is excluded
    assert observed == ["y"]
26 changes: 26 additions & 0 deletions pycytominer/tests/test_count_na_features.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import numpy as np
import pandas as pd
from pycytominer.count_na_features import count_na_features

# Test fixture with a known number of NAs per column: x=1, y=2, z=0, zz=3
data_df = pd.DataFrame(
    {
        "x": [np.nan, 3, 8, 5, 2, 2],
        "y": [1, 2, 8, np.nan, 2, np.nan],
        "z": [9, 3, 8, 9, 2, 9],
        "zz": [np.nan, np.nan, 8, np.nan, 6, 9],
    }
)


def test_count_na_features():
    """
    Testing count_na_features pycytominer function
    """
    observed = count_na_features(population_df=data_df, variables=["x", "zz"])

    # x has one NA, zz has three
    expected = pd.DataFrame({"num_na": [1, 3]}, index=["x", "zz"])

    assert observed.equals(expected)
25 changes: 25 additions & 0 deletions pycytominer/tests/test_get_na_columns.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import numpy as np
import pandas as pd
from pycytominer.get_na_columns import get_na_columns

# Test fixture with known NA proportions per column: x=1/6, y=2/6, z=0, zz=3/6
data_df = pd.DataFrame(
    {
        "x": [np.nan, 3, 8, 5, 2, 2],
        "y": [1, 2, 8, np.nan, 2, np.nan],
        "z": [9, 3, 8, 9, 2, 9],
        "zz": [np.nan, np.nan, 8, np.nan, 6, 9],
    }
)


def test_get_na_columns():
    """
    Testing get_na_columns pycytominer function
    """
    observed = get_na_columns(
        population_df=data_df, variables=["x", "y", "zz"], cutoff=0.4
    )

    # Only zz (3/6 = 0.5 NA) exceeds the 0.4 cutoff
    assert observed == ["zz"]

0 comments on commit 17fe2ff

Please sign in to comment.