diff --git a/caches/cache_log.txt b/caches/cache_log.txt
new file mode 100644
index 00000000..c58bcb90
--- /dev/null
+++ b/caches/cache_log.txt
@@ -0,0 +1,166 @@
+Timestamp: 20240821-140112
+
+gpt-4o/results.tsv
+score_grounded_result cache info:
+CacheInfo: hits=15970, misses=8315, maxsize=4096, currsize=4096
+omim_mappings cache info:
+CacheInfo: hits=90813, misses=14568, maxsize=16384, currsize=14568
+
+gpt-4/results.tsv
+score_grounded_result cache info:
+CacheInfo: hits=29929, misses=17045, maxsize=4096, currsize=4096
+omim_mappings cache info:
+CacheInfo: hits=189862, misses=18113, maxsize=16384, currsize=16384
+
+gpt-4-turbo/results.tsv
+score_grounded_result cache info:
+CacheInfo: hits=36854, misses=19422, maxsize=4096, currsize=4096
+omim_mappings cache info:
+CacheInfo: hits=220689, misses=18252, maxsize=16384, currsize=16384
+
+gpt-3.5-turbo/results.tsv
+score_grounded_result cache info:
+CacheInfo: hits=56160, misses=25978, maxsize=4096, currsize=4096
+omim_mappings cache info:
+CacheInfo: hits=326321, misses=19439, maxsize=16384, currsize=16384
+
+Timestamp: 20240822-193603
+
+gpt-4o/results.tsv
+score_grounded_result cache info:
+CacheInfo: hits=23718, misses=10045, maxsize=4096, currsize=4096
+omim_mappings cache info:
+CacheInfo: hits=118177, misses=5610, maxsize=16384, currsize=16384
+
+gpt-4/results.tsv
+score_grounded_result cache info:
+CacheInfo: hits=53220, misses=27955, maxsize=4096, currsize=4096
+omim_mappings cache info:
+CacheInfo: hits=316509, misses=9986, maxsize=16384, currsize=16384
+
+gpt-4-turbo/results.tsv
+score_grounded_result cache info:
+CacheInfo: hits=81216, misses=36794, maxsize=4096, currsize=4096
+omim_mappings cache info:
+CacheInfo: hits=425550, misses=10637, maxsize=16384, currsize=16384
+
+gpt-3.5-turbo/results.tsv
+score_grounded_result cache info:
+CacheInfo: hits=100499, misses=43378, maxsize=4096, currsize=4096
+omim_mappings cache info:
+CacheInfo: hits=532067, misses=11525, maxsize=16384, currsize=16384
+
+Timestamp: 20240828-114052
+
+gpt-4o/results.tsv
+score_grounded_result cache info:
+CacheInfo: hits=23726, misses=10037, maxsize=4096, currsize=4096
+omim_mappings cache info:
+CacheInfo: hits=116389, misses=7706, maxsize=16384, currsize=16384
+
+Timestamp: 20240903-201528
+
+Timestamp: 20240904-111909
+
+gpt-4o/results.tsv
+score_grounded_result cache info:
+CacheInfo: hits=33763, misses=0, maxsize=524288, currsize=12774
+omim_mappings cache info:
+CacheInfo: hits=0, misses=0, maxsize=524288, currsize=0
+
+gpt-4/results.tsv
+score_grounded_result cache info:
+CacheInfo: hits=74660, misses=6556, maxsize=524288, currsize=19330
+omim_mappings cache info:
+CacheInfo: hits=64985, misses=8, maxsize=524288, currsize=20618
+
+gpt-4-turbo/results.tsv
+score_grounded_result cache info:
+CacheInfo: hits=109269, misses=8782, maxsize=524288, currsize=21556
+omim_mappings cache info:
+CacheInfo: hits=90157, misses=8, maxsize=524288, currsize=20618
+
+gpt-3.5-turbo/results.tsv
+score_grounded_result cache info:
+CacheInfo: hits=134134, misses=9936, maxsize=524288, currsize=22710
+omim_mappings cache info:
+CacheInfo: hits=115413, misses=10, maxsize=524288, currsize=20620
+
+Timestamp: 20240904-115833
+
+gpt-4o/results.tsv
+score_grounded_result cache info:
+CacheInfo: hits=33763, misses=0, maxsize=524288, currsize=22710
+omim_mappings cache info:
+CacheInfo: hits=0, misses=0, maxsize=524288, currsize=0
+
+gpt-4/results.tsv
+score_grounded_result cache info:
+CacheInfo: hits=81216, misses=0, maxsize=524288, currsize=22710
+omim_mappings cache info:
+CacheInfo: hits=0, misses=0, maxsize=524288, currsize=0
+
+gpt-4-turbo/results.tsv
+score_grounded_result cache info:
+CacheInfo: hits=118051, misses=0, maxsize=524288, currsize=22710
+omim_mappings cache info:
+CacheInfo: hits=0, misses=0, maxsize=524288, currsize=0
+
+gpt-3.5-turbo/results.tsv
+score_grounded_result cache info:
+CacheInfo: hits=144070, misses=0, maxsize=524288, currsize=22710
+omim_mappings cache info:
+CacheInfo: hits=0, misses=0, maxsize=524288, currsize=0
+
+Timestamp: 20240904-121924
+
+gpt-4o/results.tsv
+score_grounded_result cache info:
+CacheInfo: hits=33763, misses=0, maxsize=524288, currsize=22710
+omim_mappings cache info:
+CacheInfo: hits=0, misses=0, maxsize=524288, currsize=0
+
+gpt-4/results.tsv
+score_grounded_result cache info:
+CacheInfo: hits=81216, misses=0, maxsize=524288, currsize=22710
+omim_mappings cache info:
+CacheInfo: hits=0, misses=0, maxsize=524288, currsize=0
+
+gpt-4-turbo/results.tsv
+score_grounded_result cache info:
+CacheInfo: hits=118051, misses=0, maxsize=524288, currsize=22710
+omim_mappings cache info:
+CacheInfo: hits=0, misses=0, maxsize=524288, currsize=0
+
+gpt-3.5-turbo/results.tsv
+score_grounded_result cache info:
+CacheInfo: hits=144070, misses=0, maxsize=524288, currsize=22710
+omim_mappings cache info:
+CacheInfo: hits=0, misses=0, maxsize=524288, currsize=0
+
+Timestamp: 20240905-132835
+
+gpt-4o/results.tsv
+score_grounded_result cache info:
+CacheInfo: hits=39813, misses=2626, maxsize=524288, currsize=25336
+omim_mappings cache info:
+CacheInfo: hits=21842, misses=279, maxsize=524288, currsize=20899
+
+gpt-4/results.tsv
+score_grounded_result cache info:
+CacheInfo: hits=97434, misses=5307, maxsize=524288, currsize=28017
+omim_mappings cache info:
+CacheInfo: hits=53565, misses=936, maxsize=524288, currsize=21556
+
+gpt-4-turbo/results.tsv
+score_grounded_result cache info:
+CacheInfo: hits=143644, misses=6097, maxsize=524288, currsize=28807
+omim_mappings cache info:
+CacheInfo: hits=62521, misses=1074, maxsize=524288, currsize=21694
+
+gpt-3.5-turbo/results.tsv
+score_grounded_result cache info:
+CacheInfo: hits=176206, misses=6703, maxsize=524288, currsize=29413
+omim_mappings cache info:
+CacheInfo: hits=80073, misses=2993, maxsize=524288, currsize=23613
+
diff --git a/caches/omim_mappings_cache.db b/caches/omim_mappings_cache.db
new file mode 100644
index 00000000..74e30758
Binary files /dev/null and b/caches/omim_mappings_cache.db differ
diff --git a/caches/score_grounded_result_cache.db b/caches/score_grounded_result_cache.db
new file mode 100644
index 00000000..628ddb71
Binary files /dev/null and b/caches/score_grounded_result_cache.db differ
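A note on reading cache_log.txt: each CacheInfo line carries the same four counters that functools.lru_cache exposes via cache_info() (hits, misses, maxsize, currsize), and the jump from maxsize 4096/16384 to 524288 between the 20240828 and 20240904 entries shows the caches being enlarged, while the new *.db files suggest they are now also persisted to disk. A minimal sketch of how one such log block could be appended per model run; omim_mappings and append_cache_stats below are hypothetical stand-ins, not the actual malco implementation:

import functools

@functools.lru_cache(maxsize=524288)
def omim_mappings(term: str):
    # Hypothetical stand-in for the expensive OMIM lookup being cached.
    ...

def append_cache_stats(log_path: str, label: str) -> None:
    # Append one block shaped like the entries in cache_log.txt above.
    ci = omim_mappings.cache_info()
    with open(log_path, "a") as log:
        log.write(f"{label}\nomim_mappings cache info:\n")
        log.write(
            f"CacheInfo: hits={ci.hits}, misses={ci.misses}, "
            f"maxsize={ci.maxsize}, currsize={ci.currsize}\n\n"
        )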
diff --git a/src/malco/analysis/disease_avail_knowledge.py b/src/malco/analysis/disease_avail_knowledge.py
new file mode 100644
index 00000000..28a35c13
--- /dev/null
+++ b/src/malco/analysis/disease_avail_knowledge.py
@@ -0,0 +1,92 @@
+# Let us try to parametrize how much is known about each disease. Beyond
+# eval_diagnose_category, which looks at the MONDO categories, there are two further ideas:
+# Idea (0): is (number of HPOs present, number of HPOs excluded) correlated with a disease being found?
+# (1) HPOA and (2) Monarch KG.
+# (1) Parse out disease genes discovered after 2008/9 (the first entry in HPOA).
+# Look for a correlation between annotation date and whether the disease is correctly diagnosed.
+# Hypothesis: the older the annotation, the easier the disease is to diagnose.
+# (2) To start, for the two broad categories found/not-found, count the average number of all links.
+# After that, count the average number of links of some kind.
+# Then something more graphy, such as centrality? Maybe we first need to project something out to find signal in the noise...
+import sys
+import pandas as pd
+import numpy as np
+import datetime as dt
+
+hpoa_file_path = "/Users/leonardo/IdeaProjects/maxodiff/data/phenotype.hpoa"
+hpoa_df = pd.read_csv(
+    hpoa_file_path, sep="\t", header=4
+)
+
+hpoa_cleaned = pd.DataFrame()
+hpoa_cleaned["database_id"] = hpoa_df["database_id"]
+# Biocuration entries look like e.g. "HPO:probinson[2009-02-17]"; grab the bracketed date.
+hpoa_cleaned['date'] = hpoa_df["biocuration"].str.extract(r'\[(.*?)\]')
+#string_dates = str(hpoa_df["biocuration"].str.extract(r'\[(.*?)\]'))
+# I am getting a bit tangled up here; it would be simpler to just do a color coding.
+#hpoa_cleaned['date'] = [dt.datetime.strptime(day, '%Y-%m-%d').date() for day in string_dates]
+hpoa_cleaned = hpoa_cleaned[hpoa_cleaned['database_id'].str.startswith("OMIM")]
+
+model = str(sys.argv[1])
+ranking_results_filename = f"out_openAI_models/multimodel/{model}/full_df_results.tsv"
+rank_results_df = pd.read_csv(
+    ranking_results_filename, sep="\t"
+)
+
+found_diseases = []
+not_found_diseases = []
+ppkts = rank_results_df.groupby("label")[["term", "correct_term", "is_correct"]]
+for ppkt in ppkts:
+    # ppkt is a tuple ("filename", dataframe) --> ppkt[1] is a dataframe.
+    # Was the correct disease ranked at all for this phenopacket?
+    disease = ppkt[1].iloc[0]['correct_term']
+    if any(ppkt[1]["is_correct"]):
+        found_diseases.append(disease)
+    else:
+        not_found_diseases.append(disease)
+
+found_set = set(found_diseases)
+notfound_set = set(not_found_diseases)
+overlap = found_set & notfound_set
+
+print(f"Number of diseases found by {model}: {len(found_set)}.")
+print(f"Number of diseases not found by {model}: {len(notfound_set)}.")
+print(f"Diseases in both the found and not-found sets for {model}: {len(overlap)}.\n")
+# TODO: needs some more rigorous statistics.
+
+# header = ["disease_id", "found", "date"]
+
+# Problematic: deduplication drops the table from ~27k rows to 8.2k unique database_ids.
+hpoa_cleaned = hpoa_cleaned.drop_duplicates(subset='database_id')
+# Idea here could be to look at the 263-129 (gpt-4o) found diseases not present in the not-found set, and the opposite,
+# namely the never-found diseases, and look for a correlation with date.
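+# For example, if found_set = {"OMIM:101200", "OMIM:143100"} and
+# notfound_set = {"OMIM:143100"}, then always_found = {"OMIM:101200"} and
+# never_found = set(): the set differences below keep only diseases that
+# land on the same side for every phenopacket in which they occur.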
+always_found = found_set - notfound_set  # 134
+never_found = notfound_set - found_set  # 213
+
+results_dict = {}  # ends up with 281 entries
+found_dict = {}
+notfound_dict = {}
+
+# TODO: populate this dataframe directly instead of going through results_dict.
+results_df = pd.DataFrame(columns=["disease", "found", "date"])
+
+for af in always_found:
+    try:
+        # .item() raises ValueError when the disease is absent from the deduplicated HPOA.
+        results_dict[af] = [True, hpoa_cleaned.loc[hpoa_cleaned['database_id'] == af, 'date'].item()]
+        found_dict[af] = hpoa_cleaned.loc[hpoa_cleaned['database_id'] == af, 'date'].item()
+    except ValueError:
+        print(f"No HPOA for {af}.")
+for nf in never_found:
+    try:
+        results_dict[nf] = [False, hpoa_cleaned.loc[hpoa_cleaned['database_id'] == nf, 'date'].item()]
+        notfound_dict[nf] = hpoa_cleaned.loc[hpoa_cleaned['database_id'] == nf, 'date'].item()
+    except ValueError:
+        print(f"No HPOA for {nf}.")
+
+res_to_clean = pd.DataFrame.from_dict(results_dict).transpose()
+res_to_clean.columns = ["found", "date"]
+# Cast dates to int64 nanoseconds so a groupwise mean is well defined.
+res_to_clean.date = pd.to_datetime(res_to_clean.date).values.astype(np.int64)
+final_avg = pd.DataFrame(pd.to_datetime(res_to_clean.groupby('found').mean().date))
+print(final_avg)
\ No newline at end of file
diff --git a/src/malco/post_process/ranking_utils.py b/src/malco/post_process/ranking_utils.py
index b19ee02f..0049914c 100644
--- a/src/malco/post_process/ranking_utils.py
+++ b/src/malco/post_process/ranking_utils.py
@@ -43,7 +43,8 @@ def compute_mrr_and_ranks(
 ) -> Path:
 
     # Read in results TSVs from self.output_dir that match glob results*tsv
-    out_caches = output_dir / "caches"
+    out_caches = Path("caches")
+    #out_caches = output_dir / "caches"
     out_caches.mkdir(exist_ok=True)
     output_dir = output_dir / out_subdir
     results_data = []
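The closing aggregation in disease_avail_knowledge.py uses a common pandas idiom: dates are cast to integer nanoseconds since the epoch, averaged per group, and converted back to datetimes (older pandas versions could not take a mean of datetime columns directly). A toy demonstration of the same technique on made-up dates, not project data:

import numpy as np
import pandas as pd

toy = pd.DataFrame({
    "found": [True, True, False, False],
    "date": pd.to_datetime(["2009-02-17", "2011-06-01", "2019-03-08", "2020-10-23"]),
})
# Cast to int64 nanoseconds since the epoch so that mean() is well defined...
toy["date"] = toy["date"].values.astype(np.int64)
# ...then take the per-group mean and convert back to datetimes (ns is the default unit).
mean_dates = pd.to_datetime(toy.groupby("found")["date"].mean())
print(mean_dates)  # one mean annotation date per found/not-found group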