Skip to content

Commit

Permalink
Intake lists (#52)
Browse files Browse the repository at this point in the history
* Add list input to enrichment.py

* delete prints

* Apply Black

* modify tests to new input

* Change the input of precision_recall.py

* Black changes

* fix tests

* rerun the demo

* also accept ints

Co-authored-by: Greg Way <gregory.way@gmail.com>

* ints and floats also allowed

* add further tests

* Black

* fix test

* Add Demo

* Fix test

* change input to floats

Co-authored-by: Greg Way <gregory.way@gmail.com>

* correct doc

* change input to floats

Co-authored-by: Greg Way <gregory.way@gmail.com>

* More tests

* named percentile in enrichment.py

* update docstring

Co-authored-by: Greg Way <gregory.way@gmail.com>

* Update cytominer_eval/tests/test_operations/test_enrichment.py

Co-authored-by: Greg Way <gregory.way@gmail.com>

* add comment for test

* finalize test enrichment

Co-authored-by: Greg Way <gregory.way@gmail.com>
  • Loading branch information
michaelbornholdt and gwaybio authored May 6, 2021
1 parent 220b296 commit 7f94b11
Show file tree
Hide file tree
Showing 8 changed files with 241 additions and 178 deletions.
9 changes: 5 additions & 4 deletions cytominer_eval/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
enrichment,
)


def evaluate(
profiles: pd.DataFrame,
features: List[str],
Expand All @@ -26,11 +27,11 @@ def evaluate(
similarity_metric: str = "pearson",
replicate_reproducibility_quantile: np.float = 0.95,
replicate_reproducibility_return_median_cor: bool = False,
precision_recall_k: int = 10,
precision_recall_k: Union[int, List[int]] = 10,
grit_control_perts: List[str] = ["None"],
grit_replicate_summary_method: str = "mean",
mp_value_params: dict = {},
enrichment_percentile: float = 0.5,
enrichment_percentile: Union[float, List[float]] = 0.99,
):
r"""Evaluate profile quality and strength.
Expand Down Expand Up @@ -85,7 +86,7 @@ def evaluate(
Only used when `operation='replicate_reproducibility'`. If True, then also
return pairwise correlations as defined by replicate_groups and
similarity metric
precision_recall_k : {10, ...}, optional
precision_recall_k : int or list of ints {10, ...}, optional
Only used when `operation='precision_recall'`. Used to calculate precision and
recall considering the top k profiles according to pairwise similarity.
grit_control_perts : {None, ...}, optional
Expand All @@ -100,7 +101,7 @@ def evaluate(
Only used when `operation='mp_value'`. A key, item pair of optional parameters
for calculating mp value. See also
:py:func:`cytominer_eval.operations.util.default_mp_value_parameters`
percentile : float, optional
enrichment_percentile : float or list of floats, optional
Only used when `operation='enrichment'`. Determines the percentage of top connections
used for the enrichment calculation.
"""
Expand Down
76 changes: 43 additions & 33 deletions cytominer_eval/operations/enrichment.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"""
import numpy as np
import pandas as pd
from typing import List
from typing import List, Union
import scipy

from .util import assign_replicates, calculate_grit, check_grit_replicate_summary_method
Expand All @@ -14,8 +14,10 @@


def enrichment(
    similarity_melted_df: pd.DataFrame,
    replicate_groups: List[str],
    percentile: Union[float, List[float]],
) -> pd.DataFrame:
    """Calculate the enrichment score for one or more percentiles.

    The score is based on the Fisher exact odds ratio. For every requested
    percentile, the pairwise similarities above the corresponding quantile are
    treated as the closest connections and cross-tabulated against replicate
    membership. This effectively measures how much better the distribution of
    correct connections is compared to random.

    Parameters
    ----------
    similarity_melted_df : pandas.DataFrame
        Melted pairwise similarities. Must contain a `similarity_metric` column; it
        is passed unchanged to `assign_replicates`.
    replicate_groups : List
        a list of metadata column names in the original profile dataframe to use as
        replicate columns.
    percentile : float or list of floats
        Determines what percentage of top connections is used for the enrichment
        calculation. A single number (Python or NumPy, int or float) is treated as
        a one-element list; lists and NumPy arrays are iterated in the given order.

    Returns
    -------
    pandas.DataFrame
        One row per percentile with the columns `enrichment_percentile`,
        `threshold`, `ods_ratio` and `p-value`.
    """
    replicate_truth_df = assign_replicates(
        similarity_melted_df=similarity_melted_df, replicate_groups=replicate_groups
    )

    # Normalise the input to a flat list of plain Python numbers. An exact
    # `type(percentile) == float` check would let ints and NumPy scalars fall
    # through to the loop below, where they fail as non-iterable.
    percentiles = np.atleast_1d(percentile).tolist()

    result = []
    for p in percentiles:
        # threshold based on percentile of top connections
        # (the local name `threshold` is referenced by the query strings below)
        threshold = similarity_melted_df.similarity_metric.quantile(p)

        # calculate the individual components of the contingency table:
        # rows = above / not above threshold, columns = replicate / non-replicate
        v11 = len(
            replicate_truth_df.query(
                "group_replicate==True and similarity_metric>@threshold"
            )
        )
        v12 = len(
            replicate_truth_df.query(
                "group_replicate==False and similarity_metric>@threshold"
            )
        )
        v21 = len(
            replicate_truth_df.query(
                "group_replicate==True and similarity_metric<=@threshold"
            )
        )
        v22 = len(
            replicate_truth_df.query(
                "group_replicate==False and similarity_metric<=@threshold"
            )
        )

        v = np.asarray([[v11, v12], [v21, v22]])
        # one-sided test: are replicates over-represented among top connections?
        r = scipy.stats.fisher_exact(v, alternative="greater")
        result.append(
            {
                "enrichment_percentile": p,
                "threshold": threshold,
                "ods_ratio": r[0],
                "p-value": r[1],
            }
        )

    result_df = pd.DataFrame(result)
    return result_df
21 changes: 13 additions & 8 deletions cytominer_eval/operations/precision_recall.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import numpy as np
import pandas as pd
from typing import List
from typing import List, Union

from .util import assign_replicates, calculate_precision_recall
from cytominer_eval.transform.util import set_pair_ids, assert_melt
Expand All @@ -13,7 +13,7 @@
def precision_recall(
similarity_melted_df: pd.DataFrame,
replicate_groups: List[str],
k: int,
k: Union[int, List[int]],
) -> pd.DataFrame:
"""Determine the precision and recall at k for all unique replicate groups
based on a predefined similarity metric (see cytominer_eval.transform.metric_melt)
Expand All @@ -27,7 +27,7 @@ def precision_recall(
replicate_groups : List
a list of metadata column names in the original profile dataframe to use as
replicate columns.
k : int
k : int or list of ints
an integer, or a list of integers, indicating how many pairwise comparisons to threshold.
Returns
Expand All @@ -49,11 +49,16 @@ def precision_recall(
"{x}{suf}".format(x=x, suf=pair_ids[list(pair_ids)[0]]["suffix"])
for x in replicate_groups
]

# Calculate precision and recall for all groups
precision_recall_df = similarity_melted_df.groupby(replicate_group_cols).apply(
lambda x: calculate_precision_recall(x, k=k)
)
# iterate over all k
precision_recall_df = pd.DataFrame()
if type(k) == int:
k = [k]
for k_ in k:
# Calculate precision and recall for all groups
precision_recall_df_at_k = similarity_melted_df.groupby(
replicate_group_cols
).apply(lambda x: calculate_precision_recall(x, k=k_))
precision_recall_df = precision_recall_df.append(precision_recall_df_at_k)

# Rename the columns back to the replicate groups provided
rename_cols = dict(zip(replicate_group_cols, replicate_groups))
Expand Down
19 changes: 6 additions & 13 deletions cytominer_eval/tests/test_evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,11 +111,7 @@ def test_evaluate_replicate_reprod_return_cor_true():

assert np.round(med_cor_df.similarity_metric.max(), 3) == 0.949
assert sorted(med_cor_df.columns.tolist()) == sorted(
[
"Metadata_gene_name",
"Metadata_pert_name",
"similarity_metric",
]
["Metadata_gene_name", "Metadata_pert_name", "similarity_metric",]
)


Expand All @@ -134,6 +130,7 @@ def test_evaluate_precision_recall():

for k in ks:

# first test the function with k = int, later we test with k = list of ints
result = evaluate(
profiles=gene_profiles,
features=gene_features,
Expand All @@ -152,15 +149,15 @@ def test_evaluate_precision_recall():
result.query("recall == 1").shape[0]
== expected_result["gene"]["recall"][str(k)]
)

# test function with argument k = list of ints, should give same result as above
result = evaluate(
profiles=compound_profiles,
features=compound_features,
meta_features=compound_meta_features,
replicate_groups=["Metadata_broad_sample"],
operation="precision_recall",
similarity_metric="pearson",
precision_recall_k=k,
precision_recall_k=[k],
)

assert (
Expand Down Expand Up @@ -205,9 +202,7 @@ def test_evaluate_grit():
top_result = (
grit_results_df.sort_values(by="grit", ascending=False)
.reset_index(drop=True)
.iloc[
0,
]
.iloc[0,]
)
assert np.round(top_result.grit, 4) == 2.3352
assert top_result.group == "PTK2"
Expand All @@ -233,9 +228,7 @@ def test_evaluate_grit():
top_result = (
grit_results_df.sort_values(by="grit", ascending=False)
.reset_index(drop=True)
.iloc[
0,
]
.iloc[0,]
)

assert np.round(top_result.grit, 4) == 0.9990
Expand Down
38 changes: 22 additions & 16 deletions cytominer_eval/tests/test_operations/test_enrichment.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,37 +42,43 @@


def test_enrichment():
    """Check `enrichment` with an array of percentiles and with a single float."""
    percent_list = np.arange(1, 0.97, -0.005)
    result = enrichment(
        similarity_melted_df=similarity_melted_df,
        replicate_groups=replicate_groups,
        percentile=percent_list,
    )

    # check for correct shape and starts with 1.0
    assert result.shape == (7, 4)
    assert result.enrichment_percentile[0] == 1.0
    # the remaining values come from float-step arange arithmetic, so compare
    # with a tolerance instead of exact equality
    assert np.isclose(result.enrichment_percentile[1], 0.995)
    # check if the higher percentiles are larger than the small one
    assert result.enrichment_percentile[1] > result.enrichment_percentile.iloc[-1]

    # a single float must be accepted and reported like the last array entry
    result_scalar = enrichment(
        similarity_melted_df=similarity_melted_df,
        replicate_groups=replicate_groups,
        percentile=0.97,
    )

    assert np.isclose(
        result_scalar.enrichment_percentile[0],
        result.enrichment_percentile.iloc[-1],
    )


def test_compare_functions():
    """Check that `evaluate` with operation="enrichment" equals a direct `enrichment` call."""
    percentiles = [0.95, 0.9]

    # result obtained through the generic evaluate() entry point
    from_evaluate = evaluate(
        profiles=df,
        features=features,
        meta_features=meta_features,
        replicate_groups=replicate_groups,
        operation="enrichment",
        similarity_metric="pearson",
        enrichment_percentile=percentiles,
    )

    # result obtained by calling the operation on the melted similarities
    from_operation = enrichment(
        similarity_melted_df=similarity_melted_df,
        replicate_groups=replicate_groups,
        percentile=percentiles,
    )

    assert from_operation.equals(from_evaluate)
20 changes: 14 additions & 6 deletions cytominer_eval/tests/test_operations/test_precision_recall.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,22 +39,30 @@


def test_precision_recall():
    """Check `precision_recall` with a list of k values and with a single int."""
    result_list = precision_recall(
        similarity_melted_df=similarity_melted_df,
        replicate_groups=replicate_groups,
        k=[5, 10],
    )

    result_int = precision_recall(
        similarity_melted_df=similarity_melted_df,
        replicate_groups=replicate_groups,
        k=5,
    )

    # every requested k must be present, in the order it was requested
    assert result_list.k.unique().tolist() == [5, 10]

    # ITGAV has a really strong profile
    assert (
        result_list.sort_values(by="recall", ascending=False)
        .reset_index(drop=True)
        .iloc[0, :]
        .Metadata_gene_name
        == "ITGAV"
    )

    assert all(x in result_list.columns for x in replicate_groups)

    # the single-int call must match the k == 5 slice of the list call
    assert result_int.equals(result_list.query("k == 5"))
8 changes: 7 additions & 1 deletion cytominer_eval/transform/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,13 @@

def get_available_eval_metrics() -> list:
    r"""Output the available eval metrics in the cytominer_eval library

    Returns
    -------
    list
        A new list of the metric names, as strings, on every call.
    """
    return [
        "replicate_reproducibility",
        "precision_recall",
        "grit",
        "mp_value",
        "enrichment",
    ]


def get_available_similarity_metrics():
Expand Down
Loading

0 comments on commit 7f94b11

Please sign in to comment.