
Commit

Improve the swag procedure to select models on F1 score after a certain dimension rather than on the current balanced accuracy. Recomputed the procedure, saved the model, and tested.
lionelvoirol committed Jun 2, 2024
1 parent cd3fa74 commit 98dceec
Showing 3 changed files with 16 additions and 7 deletions.
Binary file modified model.rds
2 changes: 1 addition & 1 deletion submission.R
@@ -301,7 +301,7 @@ max_class <- function(vec) {



- # for testing
+ # # for testing
# # check returned df when running this on fake data
# df = read.csv("PreFer_fake_data.csv")
# background_df = read.csv("PreFer_fake_background_data.csv")
21 changes: 15 additions & 6 deletions training.R
@@ -97,7 +97,7 @@ model_combination <- function(
# cleaned_df = clean_df(df = data, background_df = data_back)
# outcome_df = read_csv(file = "not_commit/PreFer_train_outcome.csv")
#
#


train_save_model <- function(cleaned_df, outcome_df) {
# Trains a model using the cleaned dataframe and saves the model to a file.
@@ -120,7 +120,7 @@ train_save_model <- function(cleaned_df, outcome_df) {
# Meta-parameters swag algorithm
control <- list(pmax = 35, # maximum dimension explored
alpha = .15, # normally a small value; corresponds to the proportion of models kept at each iteration
- m = 80L, # max number of models explored per dimension
+ m = 40L, # max number of models explored per dimension
seed = 123L, # for replicability
verbose = T # keeps track of completed dimensions
)
@@ -167,7 +167,7 @@ train_save_model <- function(cleaned_df, outcome_df) {
sample_size[i] = nrow(df_sub)

if(control$verbose){
cat(paste0("Variable ",i,"/", ncol(X),". accuracy of ", round(cv_errors[i] , 4), "\n"))
cat(paste0("Variable ",i,"/", ncol(X),". balanced accuracy of ", round(cv_errors[i] , 4), "\n"))

}
}
@@ -181,7 +181,7 @@ train_save_model <- function(cleaned_df, outcome_df) {
CVs_precison[[d]] = cv_errors_precison
CVs_recall[[d]] = cv_errors_recall

- cv_alpha[d] <- quantile(cv_errors,control$alpha,na.rm=T)
+ cv_alpha[d] <- quantile(cv_errors, (1-control$alpha), na.rm=T)
IDs[[d]] <- which(cv_errors >= cv_alpha[d])
id_screening <- IDs[[d]]
VarMat[[d]] <- var_mat
@@ -237,12 +237,21 @@ train_save_model <- function(cleaned_df, outcome_df) {
CVs_precison[[d]] = cv_errors_precison
CVs_recall[[d]] = cv_errors_recall
VarMat[[d]] <- var_mat
- cv_alpha[d] <- quantile(cv_errors,control$alpha,na.rm=T)
+ cv_alpha[d] <- quantile(cv_errors,probs = (1-control$alpha),na.rm=T)
IDs[[d]] <- which(cv_errors >= cv_alpha[d])
+ # TODO: switch based on the proportion of tested models with a defined f1 score; for now, switch to f1 score after a fixed dimension
+ measure_considered = " balanced accuracy"
+ if(d > 10){
+ # switch to f1 score
+ measure_considered = " f1 score"
+ cv_alpha[d] <- quantile(cv_errors_f1_score, probs = (1-control$alpha), na.rm=T) # save in vector of quantiles per dimension
+ IDs[[d]] <- which(cv_errors_f1_score >= cv_alpha[d])
+ }

Sample_size[[d]] = sample_size


- if(control$verbose) print(paste0("Dimension explored: ", d ," - CV errors at alpha: ",round(cv_alpha[d],4)))
+ if(control$verbose) print(paste0("Dimension explored: ", d, ",", measure_considered, " at quantile ", (1-control$alpha), ":", round(cv_alpha[d], 4)))
if(ncol(var_mat)==1) break
}

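For readers following the diff, here is a minimal standalone sketch of the screening rule these hunks introduce: per dimension, models scoring at or above the (1 - alpha) quantile of the chosen measure are kept, and the measure switches from balanced accuracy to F1 score once the explored dimension exceeds 10. The function screen_models, its arguments, and the example data below are hypothetical illustrations, not part of the repository code.

# Hypothetical sketch of the selection rule (not the repository code).
# Keeps models whose cross-validated score reaches the (1 - alpha) quantile,
# screening on balanced accuracy up to dimension 10 and on f1 score beyond that.
screen_models <- function(cv_balanced_acc, cv_f1, d, alpha = 0.15) {
  if (d > 10) {
    scores  <- cv_f1
    measure <- "f1 score"
  } else {
    scores  <- cv_balanced_acc
    measure <- "balanced accuracy"
  }
  cutoff <- quantile(scores, probs = 1 - alpha, na.rm = TRUE)
  list(measure = measure,
       cutoff = unname(cutoff),
       kept = which(scores >= cutoff)) # indices of models kept for the next dimension
}

# Example: 20 candidate models at dimension 12 are screened on f1 score.
set.seed(123)
screen_models(cv_balanced_acc = runif(20, 0.5, 0.8),
              cv_f1 = runif(20, 0.3, 0.7),
              d = 12)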
