Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Post process add mondo utils #22

Merged
merged 23 commits into from
Jun 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
ac76273
Commit poetry.lock after poetry update
May 9, 2024
11661dd
Make results tsv
May 9, 2024
0162585
Make post_process step
May 9, 2024
77c8803
Merge branch 'main' into post_process
May 9, 2024
69cb7ac
Fix gitignore
May 9, 2024
ce65c4e
Add code to make MMR plot
May 10, 2024
a023064
Find result.tsv files in subdirs
May 10, 2024
ae0f1bf
minimal hacky working example, simple for loops over es and en in run…
leokim-l May 10, 2024
34afbcc
Comments about TODO for tomorrow
May 10, 2024
2bd93ba
Merge branch 'post_process' of https://github.com/monarch-initiative/…
May 10, 2024
d284263
added 78 spanish and english results.tsv, this is the result of the p…
leokim-l May 10, 2024
35a2873
Add back template code to allow tests to pass
May 10, 2024
13c2f2d
fixed labeling for spanish, should make it take any language with car…
leokim-l May 10, 2024
ff412ab
Merge in main
May 15, 2024
8f60837
Add OAK-based scoring
May 15, 2024
133d22f
Tidy up
May 15, 2024
245f3a6
refactored significantly runner, moving functions to different folder…
leokim-l May 23, 2024
7f91ae8
finished refactoring and cleaning, whole pipeline should almost work …
leokim-l May 27, 2024
e876922
moved ontogpt output to subdir, added rank in dataframe for bookkeepi…
leokim-l May 28, 2024
2ad4f45
added plot file naming
leokim-l May 28, 2024
2a30bb1
upload of jar with 5 languages, previously not done by mistake
leokim-l May 29, 2024
d18112b
increased cache size, seems heuristically to speed up the process, to…
leokim-l May 29, 2024
22a249f
Merge branch 'main' into post_process_add_mondo_utils
justaddcoffee Jun 6, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ venv/
__pycache__
data/*
prompts/*
inputdir
outputdir
.*db
inputdir/all_phenopackets.zip
inputdir/phenopacket-store/
.openai_cache.db
Binary file modified inputdir/phenopacket2prompt.jar
Binary file not shown.
26 changes: 13 additions & 13 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

68 changes: 68 additions & 0 deletions src/malco/post_process/compute_mrr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import os
import csv
from pathlib import Path
import pandas as pd
import pickle as pkl
from malco.post_process.mondo_score_utils import score_grounded_result


def compute_mrr(output_dir, prompt_dir, correct_answer_file) -> "tuple[Path, Path, int]":
    """
    Compute the Mean Reciprocal Rank (MRR) of every results TSV under ``output_dir``.

    Every file below ``output_dir`` whose name starts with ``result`` and ends
    with ``.tsv`` is scored against the correct answers, and the per-file MRR
    values are written to ``<output_dir>/plots/plotting_data.tsv``.

    Args:
        output_dir: Directory that is searched recursively for results TSVs
            (``str`` or ``Path``). Each TSV must provide the columns
            ``label``, ``term`` and ``score``.
        prompt_dir: Directory holding the correct-answer file. It is joined
            onto the current working directory.
        correct_answer_file: Name of a header-less TSV whose three columns
            are read as description, term and label.

    Returns:
        A tuple ``(plot_data_file, plot_dir, num_ppkt)``: the path of the
        written plotting data, the directory it lives in, and the number of
        distinct labels in the last results file that was read (0 if no
        results file was found).
    """
    # Accept plain strings as well as Path objects; the "/" operator below needs a Path.
    output_dir = Path(output_dir)

    # TODO Leo: make more robust, had other results*tsv files from previous testing.
    # Proposal: go for an exact file name match defined somewhere as global/static/immutable.
    results_data = []
    results_files = []
    num_ppkt = 0
    for subdir, dirs, files in os.walk(output_dir):
        # os.walk yields entries in arbitrary order; sorting in place makes the
        # traversal (and therefore the order of the plotted bars) reproducible.
        dirs.sort()
        for filename in sorted(files):
            if not (filename.startswith("result") and filename.endswith(".tsv")):
                continue
            file_path = os.path.join(subdir, filename)
            df = pd.read_csv(file_path, sep="\t")
            # NOTE: overwritten for every file, so the last file read wins.
            num_ppkt = df["label"].nunique()
            results_data.append(df)
            # Keep the path relative to output_dir so the sub-directory is part of the name.
            results_files.append(os.path.relpath(file_path, output_dir))

    # Read in the correct answers from prompt_dir.
    answers_path = os.path.join(os.getcwd(), prompt_dir, correct_answer_file)
    answers = pd.read_csv(
        answers_path, sep="\t", header=None, names=["description", "term", "label"]
    )

    # Map each label to its correct term.
    label_to_correct_term = answers.set_index("label")["term"].to_dict()

    # Calculate the MRR for each results file.
    mrr_scores = []
    for df in results_data:
        # Rank the candidate terms of each label by descending score.
        df["rank"] = df.groupby("label")["score"].rank(ascending=False, method="first")
        # The answers are keyed by the English label, so rewrite the two-letter
        # code in "_xx-prompt" to "_en-prompt" before the lookup.
        label_4_non_eng = df["label"].str.replace(
            r"_[a-z][a-z]-prompt", "_en-prompt", regex=True
        )
        df["correct_term"] = label_4_non_eng.map(label_to_correct_term)

        # df['term'] is Mondo or OMIM ID, or even disease label
        # df['correct_term'] is an OMIM
        # call OAK and get OMIM IDs for df['term'] and see if df['correct_term'] is one of them
        # in the case of phenotypic series, if Mondo corresponds to grouping term, accept it
        # A plain comprehension (instead of a row-wise DataFrame.apply) also
        # behaves correctly when the frame has no rows.
        df["is_correct"] = [
            score_grounded_result(term, correct_term) > 0
            for term, correct_term in zip(df["term"], df["correct_term"])
        ]

        # Reciprocal rank is 1/rank for correct rows and 0 otherwise.
        df["reciprocal_rank"] = (1 / df["rank"]).where(df["is_correct"], 0)

        # Best reciprocal rank per label, averaged over all labels of this file.
        mrr = df.groupby("label")["reciprocal_rank"].max().mean()
        # Store a built-in float so printing and CSV quoting see a plain number.
        mrr_scores.append(float(mrr))

    print("MRR scores are:\n")
    print(mrr_scores)

    plot_dir = output_dir / "plots"
    plot_dir.mkdir(parents=True, exist_ok=True)
    plot_data_file = plot_dir / "plotting_data.tsv"

    # Write out results for plotting: one row of file names, one row of scores.
    with plot_data_file.open("w", newline="") as dat:
        writer = csv.writer(
            dat, quoting=csv.QUOTE_NONNUMERIC, delimiter="\t", lineterminator="\n"
        )
        writer.writerow(results_files)
        writer.writerow(mrr_scores)
    return plot_data_file, plot_dir, num_ppkt
25 changes: 25 additions & 0 deletions src/malco/post_process/generate_plots.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import seaborn as sns
import matplotlib.pyplot as plt
import os
import csv

# Make a nice plot, use it as function or as script

def make_plots(plot_data_file, plot_dir, languages, num_ppkt):
    """
    Draw and save a bar plot of the MRR score of each results file.

    Args:
        plot_data_file: ``Path`` of a two-row TSV: the first row holds the
            results file names, the second row the matching MRR scores.
        plot_dir: ``Path`` of the directory the PNG is written to.
        languages: Sized collection of the languages that were evaluated;
            only its length is used, for the file name.
        num_ppkt: Number used in the file name (``<n>ppkt``).

    The image is saved as ``<len(languages)>_langs_<num_ppkt>ppkt.png`` in
    ``plot_dir`` and then shown with ``plt.show()``.
    """
    with plot_data_file.open("r", newline="") as f:
        # QUOTE_NONNUMERIC makes the reader convert the unquoted scores to float.
        lines = csv.reader(
            f, quoting=csv.QUOTE_NONNUMERIC, delimiter="\t", lineterminator="\n"
        )
        results_files = next(lines)
        mrr_scores = next(lines)

    print(results_files)
    print(mrr_scores)

    # Use a dedicated figure instead of pyplot's implicit global one, so that
    # repeated calls do not draw on top of a previous plot.
    fig, ax = plt.subplots()
    sns.barplot(x=results_files, y=mrr_scores, ax=ax)
    ax.set_xlabel("Results File")
    ax.set_ylabel("Mean Reciprocal Rank (MRR)")
    ax.set_title("MRR of Correct Answers Across Different Results Files")
    plot_path = plot_dir / f"{len(languages)}_langs_{num_ppkt}ppkt.png"
    fig.savefig(plot_path)
    plt.show()
    # Release the figure; otherwise it stays registered with pyplot.
    plt.close(fig)
8 changes: 5 additions & 3 deletions src/malco/post_process/mondo_score_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
PARTIAL_SCORE = 0.5


@lru_cache
@lru_cache(maxsize=4096)
def mondo_adapter() -> OboGraphInterface:
"""
Get the adapter for the MONDO ontology.
Expand All @@ -20,7 +20,7 @@ def mondo_adapter() -> OboGraphInterface:
return get_adapter("sqlite:obo:mondo")


@lru_cache()
@lru_cache(maxsize=1024)
def omim_mappings(term: str) -> List[str]:
"""
Get the OMIM mappings for a term.
Expand Down Expand Up @@ -81,7 +81,9 @@ def score_grounded_result(prediction: str, ground_truth: str) -> float:
# prediction is a MONDO that directly maps to a correct OMIM
return FULL_SCORE
mondo = mondo_adapter()
for mondo_descendant in mondo.descendants([prediction], predicates=[IS_A], reflexive=True):

descendants_list = mondo.descendants([prediction], predicates=[IS_A], reflexive=True)
for mondo_descendant in descendants_list:
if ground_truth in omim_mappings(mondo_descendant):
# prediction is a MONDO that maps to a correct OMIM via a descendant
return PARTIAL_SCORE
Expand Down
14 changes: 11 additions & 3 deletions src/malco/post_process/post_process.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,22 @@
from pathlib import Path

from malco.post_process.post_process_results_format import create_standardised_results
import os


def post_process(raw_results_dir: Path, output_dir: Path, langs: tuple) -> None:
    """
    Post-process the raw results output to standardised PhEval TSV format.

    For every entry of ``langs`` the raw results in ``raw_results_dir/<lang>``
    are converted and written to ``output_dir/<lang>/results.tsv``.

    Args:
        raw_results_dir (Path): Path to the raw results directory.
        output_dir (Path): Path to the output directory.
        langs (tuple): Language identifiers; each one is used as the name of
            a sub-directory of both ``raw_results_dir`` and ``output_dir``.
    """
    for lang in langs:
        raw_results_lang = raw_results_dir / lang
        output_lang = output_dir / lang
        # parents=True: do not fail when the top-level directories are missing.
        raw_results_lang.mkdir(parents=True, exist_ok=True)
        output_lang.mkdir(parents=True, exist_ok=True)

        create_standardised_results(
            raw_results_dir=raw_results_lang,
            output_dir=output_lang,
            output_file_name="results.tsv",
        )
51 changes: 50 additions & 1 deletion src/malco/post_process/post_process_results_format.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,60 @@
import json
import os
from pathlib import Path
from typing import List

import pandas as pd
import yaml
from pheval.post_processing.post_processing import PhEvalGeneResult, generate_pheval_result
from pheval.utils.file_utils import all_files
from pheval.utils.phenopacket_utils import GeneIdentifierUpdater, create_hgnc_dict


def read_raw_result_yaml(raw_result_path: Path) -> List[dict]:
    """
    Read all YAML documents of a raw result file.

    Args:
        raw_result_path(Path): Path to the raw result file.

    Returns:
        List[dict]: The parsed documents of the file, one list entry per
        YAML document, in file order.
    """
    # Decode explicitly instead of relying on the platform default encoding.
    # NOTE(review): assumes the raw result files are written as UTF-8 — confirm.
    with open(raw_result_path, "r", encoding="utf-8") as raw_result:
        # safe_load_all is lazy, so materialise it before the file is closed.
        return list(yaml.safe_load_all(raw_result))


def create_standardised_results(raw_results_dir: Path, output_dir: Path,
                                output_file_name: str = "results.tsv") -> pd.DataFrame:
    """
    Flatten the raw YAML results of one directory into a single TSV.

    Every regular file directly inside ``raw_results_dir`` is parsed with
    ``read_raw_result_yaml``. For each document that has an
    ``extracted_object`` with a non-empty ``terms`` list, one row per term is
    produced, where ``rank`` is the 1-based position of the term in the list
    and ``score`` is its reciprocal (``1 / rank``).

    Args:
        raw_results_dir (Path): Directory holding the raw result files.
        output_dir (Path): Directory the TSV is written to; created if missing.
        output_file_name (str): File name of the TSV inside ``output_dir``.

    Returns:
        pd.DataFrame: The written table with the columns ``label``, ``term``,
        ``score`` and ``rank``. The columns are present even if no row was
        produced, so the TSV always has a header line.
    """
    columns = ["label", "term", "score", "rank"]
    data = []
    # iterdir() order is arbitrary; sort so the row order is reproducible.
    for raw_result_path in sorted(raw_results_dir.iterdir()):
        if not raw_result_path.is_file():
            continue
        for this_result in read_raw_result_yaml(raw_result_path):
            # An empty YAML document is parsed as None; skip anything that is not a mapping.
            if not isinstance(this_result, dict):
                continue
            extracted_object = this_result.get("extracted_object")
            if not extracted_object:
                continue
            label = extracted_object.get("label")
            terms = extracted_object.get("terms")
            if not terms:
                continue
            for rank, term in enumerate(terms, start=1):
                # score is the reciprocal rank
                data.append({"label": label, "term": term, "score": 1 / rank, "rank": rank})

    # Fix the columns up front so an empty result still yields a valid header.
    df = pd.DataFrame(data, columns=columns)

    # Save DataFrame to TSV
    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / output_file_name
    df.to_csv(output_path, sep="\t", index=False)

    return df


# these are from the template and not currently used outside of tests

def read_raw_result(raw_result_path: Path) -> List[dict]:
"""
Read the raw result file.
Expand Down Expand Up @@ -94,7 +142,7 @@ def extract_pheval_gene_requirements(self) -> List[PhEvalGeneResult]:
)
return pheval_result


'''
def create_standardised_results(raw_results_dir: Path, output_dir: Path) -> None:
"""
Create PhEval gene tsv output from raw results.
Expand All @@ -117,3 +165,4 @@ def create_standardised_results(raw_results_dir: Path, output_dir: Path) -> None
output_dir=output_dir,
tool_result_path=raw_result_path,
)
'''
34 changes: 34 additions & 0 deletions src/malco/prepare/setup_phenopackets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import zipfile
import os
import requests

# Release archive of the phenopacket store that is fetched when no local copy exists.
phenopacket_zip_url = "https://github.com/monarch-initiative/phenopacket-store/releases/download/0.1.11/all_phenopackets.zip"
# TODO just point to a folder w/ ppkts
# Name of the sub-directory of the input directory that holds the unpacked phenopackets.
phenopacket_dir = "phenopacket-store"


def setup_phenopackets(self) -> str:
    """
    Make sure the phenopacket store is available below ``self.input_dir``.

    The store is downloaded with ``download_phenopackets`` unless the
    directory ``<self.input_dir>/phenopacket-store`` already exists and
    contains at least one entry.

    Args:
        self: Object providing an ``input_dir`` attribute.

    Returns:
        str: Path of the phenopacket store directory.
    """
    phenopacket_store_path = os.path.join(self.input_dir, phenopacket_dir)
    # An empty directory is what an interrupted earlier download leaves behind,
    # so it must not be mistaken for a usable store.
    if os.path.isdir(phenopacket_store_path) and os.listdir(phenopacket_store_path):
        print(f"{phenopacket_store_path} exists, skipping download.")
    else:
        print(f"{phenopacket_store_path} is missing or empty, downloading phenopackets...")
        download_phenopackets(self, phenopacket_zip_url, phenopacket_dir)
    return phenopacket_store_path


def download_phenopackets(self, phenopacket_zip_url, phenopacket_dir):
    """
    Download the phenopacket release archive and unpack it.

    The archive is saved as ``<self.input_dir>/all_phenopackets.zip`` and
    extracted into ``<self.input_dir>/<phenopacket_dir>``.

    Args:
        self: Object providing an ``input_dir`` attribute.
        phenopacket_zip_url: URL of the zip archive to download.
        phenopacket_dir: Name of the sub-directory of ``self.input_dir`` the
            archive is extracted into.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        zipfile.BadZipFile: If the downloaded file is not a valid zip archive.
    """
    phenopacket_store_path = os.path.join(self.input_dir, phenopacket_dir)
    zip_path = os.path.join(self.input_dir, "all_phenopackets.zip")
    # Make sure the directory that receives the archive exists.
    os.makedirs(os.path.dirname(zip_path) or ".", exist_ok=True)

    # Download the phenopacket release zip file. Stream it to disk so the whole
    # archive is never held in memory, and fail early on an HTTP error instead
    # of saving an error page as "zip".
    with requests.get(phenopacket_zip_url, stream=True, timeout=60) as response:
        response.raise_for_status()
        with open(zip_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
    print("Download completed.")

    # Create the target directory only once the download has succeeded, so a
    # failed download does not leave an empty store directory behind.
    os.makedirs(phenopacket_store_path, exist_ok=True)

    # Unzip the phenopacket release zip file
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(phenopacket_store_path)
    print("Unzip completed.")
29 changes: 9 additions & 20 deletions src/malco/run/run.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
from pathlib import Path

from malco.run.run_tool import run_tool
from ontogpt.cli import run_multilingual_analysis
import os
# from ontogpt.cli import run_multilingual_analysis


def run(testdata_dir: Path, raw_results_dir: Path) -> None:
def run(testdata_dir: Path, raw_results_dir: Path, output_dir: Path,
langs: tuple) -> None:
"""
Run the tool to obtain the raw results.

Expand All @@ -14,19 +13,9 @@ def run(testdata_dir: Path, raw_results_dir: Path) -> None:
raw_results_dir: Path to the raw results directory.
"""
mydir = os.getcwd()
# TODO figure out how to run one language at a time, not like the next line
# lang_list = os.listdir(mydir + "prompts")
"""
run_multilingual_analysis(
input_data_dir=mydir + "prompts/en/PMID_23993194_Family_2_Case_2-prompt",
output_directory=mydir + "outputdir/",
output=mydir + "outputdir/" + "grounded_en", # TODO generalize lang
output_format="yaml",
model="gpt-4-turbo",
ext=".txt",
)
"""
os.system(
f"ontogpt run-multilingual-analysis --output={mydir}/outputdir/grounded_en/results.yaml --output-format=yaml {mydir}/prompts/et/ {mydir}outputdir/"
)
# run_tool(phenopacket_dir=testdata_dir.joinpath("phenopackets"), output_dir=raw_results_dir)

for lang in langs:
os.system(
f"ontogpt -v run-multilingual-analysis --output={output_dir}/raw_results/{lang}/results.yaml {mydir}/prompts/{lang}/ {output_dir}/raw_results/{lang}/differentials_by_file/"
)
# run_tool(phenopacket_dir=testdata_dir.joinpath("phenopackets"), output_dir=raw_results_dir)
Loading
Loading