added statistical tests for ic, seems again very significant, possibl…

…e errors, and tried 2feature logit, little better in class 1 recall and f1, but not that much
monarch-initiative · Oct 10, 2024 · 1b8aebc · 1b8aebc
1 parent fd50b37
commit 1b8aebc
Showing 1 changed file with 71 additions and 26 deletions.
diff --git a/src/malco/analysis/disease_avail_knowledge.py b/src/malco/analysis/disease_avail_knowledge.py
@@ -47,9 +47,9 @@
 # import df of results
 model = str(sys.argv[1])
 try:
-   make_time_plot = str(sys.argv[2])=="plot"
+   make_plots = str(sys.argv[2])=="plot"
 except IndexError:
-   make_time_plot = False
+   make_plots = False
    print("\nYou can pass \"plot\" as a second CLI argument and this will generate nice plots!")
 
 ranking_results_filename = f"out_openAI_models/multimodel/{model}/full_df_results.tsv"
@@ -96,7 +96,7 @@
 # len(ppkts) --> 6687
 
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-# Do linear regression of box plot of ppkts' rank vs time
+# Look for correlation in box plot of ppkts' rank vs time
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 dates = []
 ranks = []
@@ -111,11 +111,12 @@
 for i in range(len(dates)): 
    years_only.append(dates[i].year)
 
-if make_time_plot:
+if make_plots:
    sns.boxplot(x=years_only, y=ranks)
    plt.xlabel("Year of HPOA annotation")
    plt.ylabel("Rank")
    plt.title("LLM performance uncorrelated with date of discovery")
+   #TODO change show to save
    plt.show()
 
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -148,18 +149,6 @@
 # Chi2ContingencyResult(statistic=np.float64(458.8912809317326), pvalue=np.float64(8.37853926348694e-102), dof=1, expected_freq=array([[ 889.83260377, 1609.16739623],
 #       [1469.16739623, 2656.83260377]]))
 
-contingency_table = pd.DataFrame(cont_table, index=['found', 'not_found'], 
-                                 columns=['before2010', 'after2010'])
-row_totals = [ cont_table[0][0] + cont_table[0][1],
-               cont_table[1][0] + cont_table[1][1] ]
-column_totals = [ cont_table[0][0] + cont_table[1][0],
-                  cont_table[0][1] + cont_table[1][1] ]
-N = row_totals[0] + row_totals[1]
-
-
-
-
-
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # IC: For each phenpacket, list observed HPOs and compute average IC. Is it correlated with 
 # success? I.e., start with f/nf, 1/0 on y-axis vs avg(IC) on x-axis
@@ -252,13 +241,46 @@
 # xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
 # ppkt_ic_df['Diagnosed'].value_counts()
 # Diagnosed
-# 0.0    4182
-# 1.0    2347
-# IMBALANCED! Maybe SMOTE or similar? Respectively 
-# 0.0    64 %
-# 1.0    36 %
+# 0.0    4182   64 %
+# 1.0    2347   36%
 
 
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# T-test: unlikely that the two samples are such due to sample bias.
+# Likely, there is a correlation between average IC and whether the case is being solved.
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+from scipy.stats import kstest
+kstest(ppkt_ic_df['avg(IC)'],'norm')
+# KS test statistic 0.95, p-value=0, not normal.
+from scipy.stats import shapiro
+stat, p = shapiro(ppkt_ic_df['avg(IC)'])
+# p = e-45 --> not normal
+# N = 6227, seems OK to perform a parametric test. Even if non-normal, t-test is OK.
+# Not necessary to use non-parametric U-test (Mann-Whitney), mostly thanks to Central Limit Theorem
+from scipy.stats import ttest_ind
+found_ic = list(ppkt_ic_df.loc[ppkt_ic_df['Diagnosed']>0, 'avg(IC)'])
+not_found_ic = list(ppkt_ic_df.loc[ppkt_ic_df['Diagnosed']<1, 'avg(IC)'])
+tresult = ttest_ind(found_ic, not_found_ic, equal_var=False)
+fsum = [np.mean(found_ic),
+        np.std(found_ic),
+        np.mean(not_found_ic),
+        np.std(not_found_ic),
+      ] 
+if make_plots:
+   plt.hist(found_ic, bins=25, color='c', edgecolor='k', alpha=0.5)
+   plt.hist(not_found_ic, bins=25, color='r', edgecolor='k', alpha=0.5)
+   plt.xlabel("Average Information Content")
+   plt.ylabel("Counts")
+   plt.legend(['Successful Diagnosis', 'Unsuccessful Diagnosis'])
+   #TODO change show to save
+   plt.show()
+
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# Machine Learning: can we build a logit to classify into two?
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# ML(1): let's try with IC
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# For unbalanced datasets: Maybe SMOTE or similar?
 #ppkt_ic_df.plot(x='avg(IC)', y='Diagnosed', style=['ob','rx'])
 # Dumbly copy paste from
 # https://towardsdatascience.com/building-a-logistic-regression-in-python-step-by-step-becd4d56c9c8 [1/10/24]
@@ -275,17 +297,42 @@
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import confusion_matrix, classification_report
 
-X_train, X_test, y_train, y_test = train_test_split(ppkt_ic_df[['avg(IC)']], ppkt_ic_df['Diagnosed'], test_size=0.3, random_state=0)
+X_train, X_test, y_train, y_test = train_test_split(ppkt_ic_df[['avg(IC)']], ppkt_ic_df['Diagnosed'], test_size=0.2, random_state=0)
 logreg = LogisticRegression()
 logreg.fit(X_train, y_train)
 y_pred = logreg.predict(X_test)
 print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))
-confusion_matrix = confusion_matrix(y_test, y_pred)
-print(confusion_matrix)
+cm1d = confusion_matrix(y_test, y_pred)
+print(cm1d)
 class_report = classification_report(y_test, y_pred)
 print(class_report)
 # Not much better than always saying 0, as of now.
 
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# ML(2): let's try with IC AND time (SVM with gausskernel a possible alternative?)
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# Features: year, IC. Label 0/1 f/nf --> Later, to improve, maybe use 1/rank, nf=0
+# rank_date_dict { 'pubmedid' + '_en-prompt.txt': [rank, '2012-05-11'],...}
+# ppkt_ic_df has row names with 'pubmedid'
+# for entry in ppkt_ic_df if rowname in rank_date_dict.keys() get that .values() index[1] and parse first 4 entries (year is 4 digits)
+date_df = pd.DataFrame.from_dict(rank_date_dict, orient='index', columns=['rank', 'date'])
+date_df.drop(columns='rank', inplace=True)
+date_df['date'] = date_df['date'].str[0:4]
+new_index = np.array([i[0:-14].rstrip("_") for i in date_df.index.to_list()])
+date_df.set_index(new_index, inplace=True)
+ic_date_df = date_df.join(ppkt_ic_df, how='inner')
+
+X_train, X_test, y_train, y_test = train_test_split(ic_date_df[['avg(IC)','date']], ic_date_df['Diagnosed'], test_size=0.2, random_state=0)
+logreg = LogisticRegression()
+logreg.fit(X_train, y_train)
+y_pred = logreg.predict(X_test)
+print('\nAccuracy of 2 PARAMETER logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))
+cm2d = confusion_matrix(y_test, y_pred)
+print(cm2d)
+class_report = classification_report(y_test, y_pred)
+print(class_report)
+
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # Analysis of found vs not-found
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -342,6 +389,4 @@
 res_to_clean['date'] = pd.to_datetime(res_to_clean.date).values.astype(np.int64)
 final_avg = pd.DataFrame(pd.to_datetime(res_to_clean.groupby('found').mean()['date']))
 print(final_avg)
-
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-# 2 dimensional logistic regression or SVM with gausskernel?