Skip to content

Commit

Permalink
multimodel plots etc polished and finished
Browse files Browse the repository at this point in the history
  • Loading branch information
leokim-l committed Jul 30, 2024
1 parent 791f21c commit ac4aba0
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 22 deletions.
6 changes: 3 additions & 3 deletions src/malco/post_process/compute_mrr.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def mondo_adapter() -> OboGraphInterface:
"""
return get_adapter("sqlite:obo:mondo")

def compute_mrr(output_dir, prompt_dir, correct_answer_file,
def compute_mrr(comparing, output_dir, prompt_dir, correct_answer_file,
raw_results_dir) -> Path:
# Read in results TSVs from self.output_dir that match glob results*tsv
results_data = []
Expand All @@ -49,7 +49,7 @@ def compute_mrr(output_dir, prompt_dir, correct_answer_file,
label_to_correct_term = answers.set_index("label")["term"].to_dict()
# Calculate the Mean Reciprocal Rank (MRR) for each file
mrr_scores = []
header = ["lang", "n1", "n2", "n3", "n4", "n5", "n6", "n7", "n8", "n9", "n10", "n10p", "nf"]
header = [comparing, "n1", "n2", "n3", "n4", "n5", "n6", "n7", "n8", "n9", "n10", "n10p", "nf"]
rank_df = pd.DataFrame(0, index=np.arange(len(results_files)), columns=header)

cache_file = output_dir / "cache_log.txt"
Expand Down Expand Up @@ -93,7 +93,7 @@ def compute_mrr(output_dir, prompt_dir, correct_answer_file,
mrr_scores.append(mrr)

# Calculate top<n> of each rank
rank_df.loc[i,"lang"] = results_files[i].split("/")[0]
rank_df.loc[i, comparing] = results_files[i].split("/")[0]

ppkts = df.groupby("label")[["rank","is_correct"]]
index_matches = df.index[df['is_correct']]
Expand Down
17 changes: 12 additions & 5 deletions src/malco/post_process/generate_plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,12 @@

# Make a nice plot; usable either as an imported function or as a script

def make_plots(mrr_file, plot_dir, languages, num_ppkt, topn_file):
def make_plots(mrr_file, plot_dir, languages, num_ppkt, models, topn_file, comparing):
if comparing=="model":
name_string = str(len(models))
else:
name_string = str(len(languages))

with mrr_file.open('r', newline = '') as f:
lines = csv.reader(f, quoting = csv.QUOTE_NONNUMERIC, delimiter = '\t', lineterminator='\n')
results_files = next(lines)
Expand All @@ -20,7 +25,7 @@ def make_plots(mrr_file, plot_dir, languages, num_ppkt, topn_file):
plt.xlabel("Results File")
plt.ylabel("Mean Reciprocal Rank (MRR)")
plt.title("MRR of Correct Answers Across Different Results Files")
plot_path = plot_dir / (str(len(languages)) + "_langs_" + str(num_ppkt) + "ppkt.png")
plot_path = plot_dir / (name_string + "_" + comparing + "_" + str(num_ppkt) + "ppkt.png")
plt.savefig(plot_path)
plt.close()

Expand All @@ -33,17 +38,19 @@ def make_plots(mrr_file, plot_dir, languages, num_ppkt, topn_file):
df["not_found"] = df["nf"]

df_aggr = pd.DataFrame()
df_aggr = pd.melt(df, id_vars="lang", value_vars=["top1", "top3", "top5", "top10", "not_found"], var_name="Rank_in", value_name="counts")
df_aggr = pd.melt(df, id_vars=comparing, value_vars=["top1", "top3", "top5", "top10", "not_found"], var_name="Rank_in", value_name="counts")
df_aggr["percentage"] = df_aggr["counts"]/num_ppkt
bar_data_file = plot_dir / "topn_aggr.tsv"
df_aggr.to_csv(bar_data_file, sep='\t', index=False)

sns.barplot(x="Rank_in", y="percentage", data = df_aggr, hue = "lang")
sns.barplot(x="Rank_in", y="percentage", data = df_aggr, hue = comparing)

plt.xlabel("Number of Ranks in")
plt.ylabel("Percentage of Cases")
plt.title("Rank Comparison for Differential Diagnosis")
plot_path = plot_dir / ("barplot_" + str(len(languages)) + "_langs_" + str(num_ppkt) + "ppkt.png")
breakpoint()
plt.legend(title=comparing)
plot_path = plot_dir / ("barplot_" + name_string + "_" + comparing + "_" + str(num_ppkt) + "ppkt.png")
plt.savefig(plot_path)
plt.close()

18 changes: 9 additions & 9 deletions src/malco/run/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,25 @@
import subprocess


def call_ontogpt(lang, raw_results_dir, input_dir, model, modality):
    """Run OntoGPT's ``run-multilingual-analysis`` for one language/model pair.

    Args:
        lang: Language code; selects the prompt directory
            ``{input_dir}/prompts/{lang}/``.
        raw_results_dir: Directory under which the raw OntoGPT output is
            written.
        input_dir: Directory that contains the ``prompts`` folder.
        model: Model name forwarded to OntoGPT via ``--model``.
        modality: ``"several_languages"`` to store results in a
            sub-directory named after ``lang``, or ``"several_models"`` to
            store them in a sub-directory named after ``model``.

    Raises:
        ValueError: If ``modality`` is neither of the two supported values.
    """
    # The two modalities differ only in which name the results are filed
    # under; the prompts are always read from the language directory.
    if modality == "several_languages":
        results_subdir = lang
    elif modality == "several_models":
        results_subdir = model
    else:
        # Fail loudly before launching anything: previously this branch tried
        # to *call* the still-unbound ``command`` and crashed with an
        # unrelated UnboundLocalError.
        raise ValueError(
            f"Unknown modality {modality!r}; expected 'several_languages' "
            f"or 'several_models'"
        )

    command = (
        f"ontogpt -v run-multilingual-analysis "
        f"--output={raw_results_dir}/{results_subdir}/results.yaml "  # save raw OntoGPT output
        f"{input_dir}/prompts/{lang}/ "
        f"{raw_results_dir}/{results_subdir}/differentials_by_file/ "
        f"--model={model}"
    )
    print(f"Running command: {command}")
    # The command is a single shell string, so it is run through the shell;
    # subprocess.run blocks until the child process has finished.
    subprocess.run(command, shell=True, check=False)
Expand All @@ -46,13 +48,11 @@ def run(testdata_dir: Path,
if max_workers is None:
max_workers = multiprocessing.cpu_count()

modality = "several_languages"
with multiprocessing.Pool(processes=max_workers) as pool:
pool.starmap(call_ontogpt, [(lang, raw_results_dir / "multilingual", input_dir, "gpt-4-turbo") for lang in langs])
pool.starmap(call_ontogpt, [(lang, raw_results_dir / "multilingual", input_dir, "gpt-4-turbo", modality) for lang in langs])

# English only many models
#TODO
# 1323 of ontogpt/cli.py and
# 15 of ontogpt/utils/multilingual.py
# have to be edited (get rid of hardcoded model!)
modality = "several_models"
with multiprocessing.Pool(processes=max_workers) as pool:
pool.starmap(call_ontogpt, [("en", raw_results_dir / "multimodel", input_dir, model) for model in models])
pool.starmap(call_ontogpt, [("en", raw_results_dir / "multimodel", input_dir, model, modality) for model in models])
12 changes: 7 additions & 5 deletions src/malco/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,20 +56,22 @@ def post_process(self,
langs=self.languages,
models=self.models)

mrr_file, plot_dir, num_ppkt, topn_file = compute_mrr(
comparing = "language"
mrr_file, plot_dir, num_ppkt, topn_file = compute_mrr(comparing,
output_dir=self.output_dir / "multilingual" ,
prompt_dir=os.path.join(self.input_dir, prompts_subdir_name),
correct_answer_file=correct_answer_file,
raw_results_dir=self.raw_results_dir / "multilingual" )
raw_results_dir=self.raw_results_dir / "multilingual")

if print_plot:
make_plots(mrr_file, plot_dir, self.languages, num_ppkt, topn_file)
make_plots(mrr_file, plot_dir, self.languages, num_ppkt, self.models, topn_file, comparing)

mrr_file, plot_dir, num_ppkt, topn_file = compute_mrr(
comparing = "model"
mrr_file, plot_dir, num_ppkt, topn_file = compute_mrr( comparing,
output_dir=self.output_dir / "multimodel" ,
prompt_dir=os.path.join(self.input_dir, prompts_subdir_name),
correct_answer_file=correct_answer_file,
raw_results_dir=self.raw_results_dir / "multimodel" )

if print_plot:
make_plots(mrr_file, plot_dir, self.languages, num_ppkt, topn_file)
make_plots(mrr_file, plot_dir, self.languages, num_ppkt, self.models, topn_file, comparing)

0 comments on commit ac4aba0

Please sign in to comment.