fix: bug of saving preprocess cache files (#310)

* save independent returns of preprocess_script() to 'others.pkl'
* fix CI
microsoft · Sep 24, 2024 · 5fb0608 · 5fb0608
1 parent dab2cff
commit 5fb0608
Show file tree

Hide file tree

Showing 9 changed files with 22 additions and 19 deletions.
diff --git a/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/fea_share_preprocess.py b/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/fea_share_preprocess.py
@@ -33,9 +33,9 @@ def preprocess_script():
         y_train = pd.read_pickle("y_train.pkl")
         y_valid = pd.read_pickle("y_valid.pkl")
         X_test = pd.read_pickle("X_test.pkl")
-        ids = pd.read_pickle("ids.pkl")
+        others = pd.read_pickle("others.pkl")
 
-        return X_train, X_valid, y_train, y_valid, X_test, ids
+        return X_train, X_valid, y_train, y_valid, X_test, *others
 
     X_train, X_valid, y_train, y_valid = prepreprocess()
 

diff --git a/rdagent/scenarios/kaggle/experiment/meta_tpl/fea_share_preprocess.py b/rdagent/scenarios/kaggle/experiment/meta_tpl/fea_share_preprocess.py
@@ -90,9 +90,9 @@ def preprocess_script():
         y_train = pd.read_pickle("/kaggle/preprocessed_data/y_train.pkl")
         y_valid = pd.read_pickle("/kaggle/preprocessed_data/y_valid.pkl")
         X_test = pd.read_pickle("/kaggle/preprocessed_data/X_test.pkl")
-        passenger_ids = pd.read_pickle("/kaggle/preprocessed_data/passenger_ids.pkl")
+        others = pd.read_pickle("/kaggle/preprocessed_data/others.pkl")
 
-        return X_train, X_valid, y_train, y_valid, X_test, passenger_ids
+        return X_train, X_valid, y_train, y_valid, X_test, *others
     X_train, X_valid, y_train, y_valid = prepreprocess()
 
     # Fit the preprocessor on the training data

diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/fea_share_preprocess.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/fea_share_preprocess.py
@@ -90,9 +90,9 @@ def preprocess_script():
         y_train = pd.read_pickle("y_train.pkl")
         y_valid = pd.read_pickle("y_valid.pkl")
         X_test = pd.read_pickle("X_test.pkl")
-        passenger_ids = pd.read_pickle("passenger_ids.pkl")
+        others = pd.read_pickle("others.pkl")
 
-        return X_train, X_valid, y_train, y_valid, X_test, passenger_ids
+        return X_train, X_valid, y_train, y_valid, X_test, *others
     X_train, X_valid, y_train, y_valid = prepreprocess()
 
     # Fit the preprocessor on the training data

diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/fea_share_preprocess.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/fea_share_preprocess.py
@@ -87,9 +87,9 @@ def preprocess_script():
         y_train = pd.read_pickle("y_train.pkl")
         y_valid = pd.read_pickle("y_valid.pkl")
         X_test = pd.read_pickle("X_test.pkl")
-        passenger_ids = pd.read_pickle("passenger_ids.pkl")
+        others = pd.read_pickle("others.pkl")
 
-        return X_train, X_valid, y_train, y_valid, X_test, passenger_ids
+        return X_train, X_valid, y_train, y_valid, X_test, *others
     X_train, X_valid, y_train, y_valid = prepreprocess()
 
     # Fit the preprocessor on the training data

diff --git a/rdagent/scenarios/kaggle/experiment/scenario.py b/rdagent/scenarios/kaggle/experiment/scenario.py
@@ -113,7 +113,7 @@ def source_data(self) -> str:
             y_train,
             y_valid,
             X_test,
-            passenger_ids,
+            *others,
         ) = preprocess_experiment.experiment_workspace.generate_preprocess_data()
 
         data_folder.mkdir(exist_ok=True, parents=True)
@@ -122,7 +122,7 @@ def source_data(self) -> str:
         pickle.dump(y_train, open(data_folder / "y_train.pkl", "wb"))
         pickle.dump(y_valid, open(data_folder / "y_valid.pkl", "wb"))
         pickle.dump(X_test, open(data_folder / "X_test.pkl", "wb"))
-        pickle.dump(passenger_ids, open(data_folder / "passenger_ids.pkl", "wb"))
+        pickle.dump(others, open(data_folder / "others.pkl", "wb"))
 
         buffer = io.StringIO()
         X_valid.info(verbose=True, buf=buffer, show_counts=True)

diff --git a/rdagent/scenarios/kaggle/experiment/sf-crime_template/fea_share_preprocess.py b/rdagent/scenarios/kaggle/experiment/sf-crime_template/fea_share_preprocess.py
@@ -108,7 +108,8 @@ def preprocess_script():
         y_train = pd.read_pickle("y_train.pkl")
         y_valid = pd.read_pickle("y_valid.pkl")
         X_test = pd.read_pickle("X_test.pkl")
-        return X_train, X_valid, y_train, y_valid, X_test
+        others = pd.read_pickle("others.pkl")
+        return X_train, X_valid, y_train, y_valid, X_test, *others
 
     X_train, X_valid, y_train, y_valid, test, category_encoder, test_ids = prepreprocess()
 

diff --git a/rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/fea_share_preprocess.py b/rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/fea_share_preprocess.py
@@ -84,9 +84,9 @@ def preprocess_script():
         y_train = pd.read_pickle("y_train.pkl")
         y_valid = pd.read_pickle("y_valid.pkl")
         X_test = pd.read_pickle("X_test.pkl")
-        passenger_ids = pd.read_pickle("passenger_ids.pkl")
+        others = pd.read_pickle("others.pkl")
 
-        return X_train, X_valid, y_train, y_valid, X_test, passenger_ids
+        return X_train, X_valid, y_train, y_valid, X_test, *others
     X_train, X_valid, y_train, y_valid = prepreprocess()
 
     # Fit the preprocessor on the training data

diff --git a/rdagent/scenarios/kaggle/experiment/workspace.py b/rdagent/scenarios/kaggle/experiment/workspace.py
@@ -1,6 +1,7 @@
 import subprocess
 import zipfile
 from pathlib import Path
+from typing import Any
 
 import pandas as pd
 
@@ -14,14 +15,14 @@
 
 from fea_share_preprocess import preprocess_script
 
-X_train, X_valid, y_train, y_valid, X_test, passenger_ids = preprocess_script()
+X_train, X_valid, y_train, y_valid, X_test, *others = preprocess_script()
 
 pickle.dump(X_train, open("X_train.pkl", "wb"))
 pickle.dump(X_valid, open("X_valid.pkl", "wb"))
 pickle.dump(y_train, open("y_train.pkl", "wb"))
 pickle.dump(y_valid, open("y_valid.pkl", "wb"))
 pickle.dump(X_test, open("X_test.pkl", "wb"))
-pickle.dump(passenger_ids, open("passenger_ids.pkl", "wb"))
+pickle.dump(others, open("others.pkl", "wb"))
 """
 
 
@@ -34,7 +35,7 @@ def __init__(self, template_folder_path: Path, *args, **kwargs) -> None:
 
     def generate_preprocess_data(
         self,
-    ) -> tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series, pd.DataFrame, pd.Series]:
+    ) -> tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series, pd.DataFrame, Any]:
         kgde = KGDockerEnv(KAGGLE_IMPLEMENT_SETTING.competition)
         kgde.prepare()
 
@@ -47,7 +48,7 @@ def generate_preprocess_data(
                 "y_train.pkl",
                 "y_valid.pkl",
                 "X_test.pkl",
-                "passenger_ids.pkl",
+                "others.pkl",
             ],
             running_extra_volume=(
                 {KAGGLE_IMPLEMENT_SETTING.local_data_path + "/" + KAGGLE_IMPLEMENT_SETTING.competition: "/kaggle/input"}
@@ -59,8 +60,8 @@ def generate_preprocess_data(
             logger.error("Feature preprocess failed.")
             raise Exception("Feature preprocess failed.")
         else:
-            X_train, X_valid, y_train, y_valid, X_test, passenger_ids = results
-            return X_train, X_valid, y_train, y_valid, X_test, passenger_ids
+            X_train, X_valid, y_train, y_valid, X_test, others = results
+            return X_train, X_valid, y_train, y_valid, X_test, *others
 
     def execute(self, run_env: dict = {}, *args, **kwargs) -> str:
         logger.info(f"Running the experiment in {self.workspace_path}")

diff --git a/rdagent/scenarios/kaggle/kaggle_crawler.py b/rdagent/scenarios/kaggle/kaggle_crawler.py
@@ -127,6 +127,7 @@ def download_data(competition: str, local_path: str = "/data/userdata/share/kagg
         "covid19-global-forecasting-week-1",
         "birdsong-recognition",
         "optiver-trading-at-the-close",
+        "facebook-v-predicting-check-ins",
     ]
 
     for i in dsagent_cs + other_cs: