Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Differential category #43

Merged
merged 6 commits into from
Aug 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 81 additions & 0 deletions src/malco/analysis/eval_diagnose_category.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
import pandas as pd
import numpy as np

from oaklib.datamodels.vocabulary import IS_A, PART_OF
from oaklib.interfaces import MappingProviderInterface
from oaklib.interfaces import OboGraphInterface
from oaklib.interfaces.obograph_interface import GraphTraversalMethod

from oaklib import get_adapter


def mondo_adapter() -> OboGraphInterface:
    """
    Build the OAK adapter for the MONDO ontology.

    The adapter is requested through ``get_adapter`` with the
    ``sqlite:obo:mondo`` selector.

    Returns:
        OboGraphInterface: The adapter returned by ``get_adapter``.
    """
    adapter = get_adapter("sqlite:obo:mondo")
    return adapter

def mondo_mapping(term: str, adapter) -> list[str]:
    """
    Collect the subject IDs of the exact-match mappings for ``term``.

    Args:
        term: CURIE to look up (the caller in this file passes OMIM IDs).
        adapter: Object exposing ``sssom_mappings(curies, source=...)``;
            each yielded mapping must have ``predicate_id`` and
            ``subject_id`` attributes.

    Returns:
        The ``subject_id`` of every mapping whose predicate is
        ``skos:exactMatch``, in the order the adapter yields them.
        Empty when there is no exact match.
    """
    # No debug print here: this runs once per phenopacket and would flood stdout.
    return [
        mapping.subject_id
        for mapping in adapter.sssom_mappings([term], source="OMIM")
        if mapping.predicate_id == "skos:exactMatch"
    ]

def find_category(omim_term, disease_categories, mondo):
    """
    Find a disease category that the given OMIM term falls under.

    The OMIM term is first mapped to its exact-match Mondo term(s); their
    ancestors over IS_A and PART_OF are then scanned for the first one that
    is listed in ``disease_categories``.

    Args:
        omim_term: CURIE of the disease to categorise.
        disease_categories: Iterable of category CURIEs to look for.
        mondo: Ontology adapter; must be a ``MappingProviderInterface`` and
            also provide ``ancestors``.

    Returns:
        The first matching category CURIE (e.g. something like
        MONDO:0045024, "cancer or benign tumor"), or ``None`` when the term
        has no exact Mondo mapping or none of its ancestors is a category.

    Raises:
        ValueError: If ``mondo`` is not a ``MappingProviderInterface``.
    """
    if not isinstance(mondo, MappingProviderInterface):
        raise ValueError("Adapter is not an MappingProviderInterface")

    mondo_terms = mondo_mapping(omim_term, mondo)
    if not mondo_terms:
        # Nothing to traverse from; avoid querying the graph with no start nodes.
        return None

    # Set membership keeps the scan linear in the number of ancestors.
    categories = set(disease_categories)
    # TODO: the graph is traversed once per call; consider caching if this
    # becomes a bottleneck.
    for mondo_ancestor in mondo.ancestors(mondo_terms, predicates=[IS_A, PART_OF]):
        if mondo_ancestor in categories:
            return mondo_ancestor
    return None


# Find the disease categories (42 according to the original note): the
# subjects of the IS_A relationships whose object is MONDO:0700096.
mondo = mondo_adapter()
disease_categories = mondo.relationships(objects=["MONDO:0700096"], predicates=[IS_A])
header = ["label", "correct", "incorrect"]
# The first element of each relationship is used as the category identifier.
dc_list = [relationship[0] for relationship in disease_categories]

# Contingency table indexed by category: a human-readable label plus counters
# for correctly / incorrectly diagnosed phenopackets, counters starting at 0.
# The label column is filled at construction time so that strings are never
# written into an integer column afterwards.
contingency_table = pd.DataFrame(
    {
        "label": [mondo.label(category) for category in dc_list],
        "correct": 0,
        "incorrect": 0,
    },
    index=dc_list,
    columns=header,
)


# example path of full results
filename = "testout_multmodel_b4run/raw_results/multimodel/gpt-4/full_df_results.tsv"

# Expected columns, followed by an example row:
# label term score rank correct_term is_correct reciprocal_rank
# PMID_35962790_Family_B_Individual_3__II_6__en-prompt.txt MONDO:0008675 1.0 1.0 OMIM:620545 False 0.0

df = pd.read_csv(filename, sep="\t")

# One group per phenopacket (rows sharing the same "label").
ppkts = df.groupby("label")[["term", "correct_term", "is_correct"]]

for ppkt_label, ppkt_rows in ppkts:
    # Find this phenopacket's category from the correct term of its first row.
    correct_term = ppkt_rows.iloc[0]["correct_term"]
    category_index = find_category(correct_term, dc_list, mondo)
    if category_index is None:
        # Without a category there is no table row to update; indexing the
        # table with None would raise KeyError and abort the whole run.
        print(f"No disease category found for {correct_term} ({ppkt_label}); skipping")
        continue

    # A phenopacket counts as correct if any of its rows is marked correct.
    if any(ppkt_rows["is_correct"]):
        contingency_table.loc[category_index, "correct"] += 1
    else:
        contingency_table.loc[category_index, "incorrect"] += 1

print(contingency_table)

4 changes: 2 additions & 2 deletions src/malco/post_process/post_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def post_process(raw_results_dir: Path, output_dir: Path, langs: tuple, models:
raw_results_dir (Path): Path to the raw results directory.
output_dir (Path): Path to the output directory.
"""

for lang in langs:
raw_results_lang = raw_results_dir / "multilingual" / lang
output_lang = output_dir / "multilingual" / lang
Expand All @@ -21,7 +21,7 @@ def post_process(raw_results_dir: Path, output_dir: Path, langs: tuple, models:

create_standardised_results(raw_results_dir=raw_results_lang,
output_dir=output_lang, output_file_name="results.tsv")

for model in models:
raw_results_model = raw_results_dir / "multimodel" / model
output_model = output_dir / "multimodel" / model
Expand Down
15 changes: 11 additions & 4 deletions src/malco/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,14 @@ class MalcoRunner(PhEvalRunner):
output_dir: Path
config_file: Path
version: str
# Declare a tuple (immutable!) of languages
# Declare a tuple of languages and models
#TODO move next 4 lines to input file
languages = ("en", "es", "nl", "it", "de")
models = ('gpt-4o', 'gpt-4') # Decide on list of models: Claude-Sonnet (Anthropic key),
models = ("gpt-3.5-turbo", "gpt-4", "gpt-4-turbo", "gpt-4o") # Decide on list of models: Claude-Sonnet (Anthropic key),
just_run = 1 # only run the run part of the code
just_postprocess = 0 # only run the postprocess part of the code



def prepare(self):
"""
Expand Down Expand Up @@ -55,7 +60,8 @@ def post_process(self,
output_dir=self.output_dir,
langs=self.languages,
models=self.models)



comparing = "language"
mrr_file, plot_dir, num_ppkt, topn_file = compute_mrr(comparing,
output_dir=self.output_dir / "multilingual" ,
Expand All @@ -65,7 +71,8 @@ def post_process(self,

if print_plot:
make_plots(mrr_file, plot_dir, self.languages, num_ppkt, self.models, topn_file, comparing)



comparing = "model"
mrr_file, plot_dir, num_ppkt, topn_file = compute_mrr( comparing,
output_dir=self.output_dir / "multimodel" ,
Expand Down
Loading