feat: add s3e11 kaggle template (#324)

* s3e11 tpl v1 * some changes * fix some bugs in s3e11 tpl, change docker logs color * fix CI
microsoft · Sep 25, 2024 · 8c57524 · 8c57524
1 parent 91979c0
commit 8c57524
Show file tree

Hide file tree

Showing 5 changed files with 199 additions and 1 deletion.
diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e11_template/fea_share_preprocess.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e11_template/fea_share_preprocess.py
@@ -0,0 +1,51 @@
+import os
+
+import numpy as np  # linear algebra
+import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
+from sklearn.model_selection import train_test_split
+
+
+def preprocess_script(data_dir="/kaggle/input"):
+    """
+    Load the competition data and return the train/validation/test splits.
+
+    If ``X_train.pkl`` exists in ``data_dir`` the cached pickles are loaded and returned as-is.
+    Otherwise ``train.csv`` and ``test.csv`` are read, ``store_sqft`` is made categorical, the
+    ``salad`` feature is derived, and the training rows are split 80/20 (random_state=2023).
+    The target is ``log1p(cost)``.
+
+    :param data_dir: directory holding the input files; defaults to "/kaggle/input".
+    :return: ``X_train, X_valid, y_train, y_valid, X_test, ids`` when built from the CSV files;
+        on the cached path the tail of the tuple is whatever ``others.pkl`` unpacks to.
+    """
+
+    def _in(name):
+        # Build the path of one input file inside data_dir.
+        return os.path.join(data_dir, name)
+
+    if os.path.exists(_in("X_train.pkl")):
+        # NOTE: unpickling can execute arbitrary code -- only use this with trusted cache files.
+        X_train = pd.read_pickle(_in("X_train.pkl"))
+        X_valid = pd.read_pickle(_in("X_valid.pkl"))
+        y_train = pd.read_pickle(_in("y_train.pkl"))
+        y_valid = pd.read_pickle(_in("y_valid.pkl"))
+        X_test = pd.read_pickle(_in("X_test.pkl"))
+        others = pd.read_pickle(_in("others.pkl"))
+
+        return X_train, X_valid, y_train, y_valid, X_test, *others
+
+    # train
+    train = pd.read_csv(_in("train.csv"))
+    train["store_sqft"] = train["store_sqft"].astype("category")
+    train["salad"] = (train["salad_bar"] + train["prepared_food"]) / 2
+    train["log_cost"] = np.log1p(train["cost"])
+    most_important_features = [
+        "total_children",
+        "num_children_at_home",
+        "avg_cars_at home(approx).1",
+        "store_sqft",
+        "coffee_bar",
+        "video_store",
+        "salad",
+        "florist",
+    ]
+
+    X_train, X_valid, y_train, y_valid = train_test_split(
+        train[most_important_features], train["log_cost"], test_size=0.2, random_state=2023
+    )
+
+    # test
+    test = pd.read_csv(_in("test.csv"))
+    # Reuse the training categories so that a given store_sqft value gets the same category
+    # code in every split; casting the test column on its own would derive the codes from the
+    # test values only. Values never seen in training become NaN (missing).
+    test["store_sqft"] = pd.Categorical(test["store_sqft"], categories=train["store_sqft"].cat.categories)
+    test["salad"] = (test["salad_bar"] + test["prepared_food"]) / 2
+
+    ids = test["id"]
+    # Selecting the feature columns already leaves "id" out.
+    X_test = test[most_important_features]
+
+    return X_train, X_valid, y_train, y_valid, X_test, ids
diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e11_template/feature/feature.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e11_template/feature/feature.py
@@ -0,0 +1,23 @@
+import pandas as pd
+
+"""
+Feature engineering code for this task: each feature is a class that provides a `fit` method
+and a `transform` method.
+"""
+
+
+class IdentityFeature:
+    """Pass-through feature: learns nothing and hands its input back unchanged."""
+
+    def fit(self, train_df: pd.DataFrame):
+        """No-op; nothing is learned from the training data."""
+        return None
+
+    def transform(self, X: pd.DataFrame):
+        """Return ``X`` itself (the same object, not a copy)."""
+        return X
+
+
+# Module-level handle to the feature class defined in this file.
+feature_engineering_cls = IdentityFeature
diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e11_template/model/model_xgboost.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e11_template/model/model_xgboost.py
@@ -0,0 +1,42 @@
+"""
+motivation  of the model
+"""
+
+import pandas as pd
+import xgboost as xgb
+
+
+def select(X: pd.DataFrame) -> pd.DataFrame:
+    """Feature-selection hook; currently keeps every column and returns ``X`` unchanged."""
+    return X
+
+
+def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame):
+    """
+    Build an XGBoost regressor, train it on the selected training features and return it.
+
+    ``X_valid`` and ``y_valid`` are accepted but are not used by this implementation.
+    """
+    regressor = xgb.XGBRegressor(
+        n_estimators=280,
+        learning_rate=0.05,
+        max_depth=10,
+        subsample=1.0,
+        colsample_bytree=1.0,
+        tree_method="hist",
+        enable_categorical=True,
+        verbosity=1,
+        min_child_weight=3,
+        base_score=4.6,
+        random_state=2023,
+    )
+    # Apply the same column selection that predict() applies.
+    regressor.fit(select(X_train), y_train)
+    return regressor
+
+
+def predict(model, X_test):
+    """
+    Run ``model.predict`` on ``X_test`` after passing it through ``select``,
+    so the columns match the ones used in ``fit``.
+    """
+    return model.predict(select(X_test))
diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e11_template/train.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e11_template/train.py
@@ -0,0 +1,78 @@
+import importlib.util
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+from fea_share_preprocess import preprocess_script
+from sklearn.metrics import mean_squared_error
+
+# Absolute directory containing this script; the feature/ and model/ globs below are relative to it.
+DIRNAME = Path(__file__).absolute().resolve().parent
+
+
+def import_module_from_path(module_name, module_path):
+    """
+    Load the Python source file at ``module_path`` as a module named ``module_name``.
+
+    The module is executed and returned; it is not registered in ``sys.modules``.
+
+    :raises ImportError: if no import spec/loader can be created for ``module_path``
+        (for example, a file without a recognised Python suffix).
+    """
+    spec = importlib.util.spec_from_file_location(module_name, module_path)
+    # spec_from_file_location returns None when it cannot pick a loader for the path;
+    # fail with a clear message instead of an AttributeError further down.
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Cannot load module {module_name!r} from {module_path}")
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module
+
+
+# 1) Preprocess the data
+X_train, X_valid, y_train, y_valid, X_test, ids = preprocess_script()
+
+# 2) Auto feature engineering
+# Every feature/feat*.py module must expose `feature_engineering_cls`. Each class is fitted on
+# the training split only and then applied to all three splits.
+# The files are sorted because glob order depends on the filesystem, and a file's position
+# decides its "feature_{i}" column key.
+X_train_l, X_valid_l, X_test_l = [], [], []
+
+for f in sorted(DIRNAME.glob("feature/feat*.py")):
+    cls = import_module_from_path(f.stem, f).feature_engineering_cls()
+    cls.fit(X_train)
+    X_train_l.append(cls.transform(X_train))
+    X_valid_l.append(cls.transform(X_valid))
+    X_test_l.append(cls.transform(X_test))
+
+if not X_train_l:
+    # pd.concat would fail on an empty list with a far less helpful message.
+    raise FileNotFoundError(f"No feature modules matching feature/feat*.py found under {DIRNAME}")
+
+feature_keys = [f"feature_{i}" for i in range(len(X_train_l))]
+X_train = pd.concat(X_train_l, axis=1, keys=feature_keys)
+X_valid = pd.concat(X_valid_l, axis=1, keys=feature_keys)
+X_test = pd.concat(X_test_l, axis=1, keys=feature_keys)
+
+# 3) Train the model
+# Every model/model*.py module must expose `fit` and `predict`.
+model_l = []  # list[tuple[model, predict_func]]
+for f in sorted(DIRNAME.glob("model/model*.py")):
+    m = import_module_from_path(f.stem, f)
+    model_l.append((m.fit(X_train, y_train, X_valid, y_valid), m.predict))
+
+if not model_l:
+    raise FileNotFoundError(f"No model modules matching model/model*.py found under {DIRNAME}")
+
+# 4) Evaluate the model on the validation set
+y_valid_pred_l = [predict_func(model, X_valid) for model, predict_func in model_l]
+
+# 5) Ensemble
+# Regression task: average the predictions of all models.
+y_valid_pred_ensemble = np.mean(y_valid_pred_l, axis=0)
+
+
+# 6) Save the validation metrics
+# Root of the MSE, computed explicitly: the `squared=False` argument of mean_squared_error is
+# deprecated in recent scikit-learn releases.
+# NOTE(review): the "RMLSE" label (presumably RMSLE) is kept unchanged because other tooling may
+# read submission_score.csv by this key -- confirm before renaming.
+metrics = float(np.sqrt(mean_squared_error(y_valid, y_valid_pred_ensemble)))
+print(f"RMLSE on valid set: {metrics}")
+pd.Series(data=[metrics], index=["RMLSE"]).to_csv("submission_score.csv")
+
+# 7) Make predictions on the test set and save them
+y_test_pred_l = [predict_func(model, X_test) for model, predict_func in model_l]
+
+# Average the per-model predictions, same as for the validation set.
+y_test_pred = np.mean(y_test_pred_l, axis=0)
+
+
+# expm1 maps the averaged predictions back to the "cost" scale for the submission file.
+submission_result = pd.DataFrame(np.expm1(y_test_pred), columns=["cost"])
+# NOTE(review): insert() aligns a Series on its index; this assumes `ids` carries the default
+# 0..n-1 index -- confirm for the cached-pickle path of preprocess_script.
+submission_result.insert(0, "id", ids)
+
+submission_result.to_csv("submission.csv", index=False)
diff --git a/rdagent/utils/env.py b/rdagent/utils/env.py
@@ -23,7 +23,9 @@
 import docker.models.containers
 from pydantic import BaseModel
 from pydantic_settings import BaseSettings
+from rich import print
 from rich.progress import Progress, TextColumn
+from rich.rule import Rule
 
 from rdagent.log import rdagent_logger as logger
 
@@ -207,7 +209,7 @@ def prepare(self):
                     status_dict = json.loads(part)
                     if "error" in status_dict:
                         p.update(task, description=f"[red]error: {status_dict['error']}")
-                        raise docker.errors.BuildError(status_dict["error"])
+                        raise docker.errors.BuildError(status_dict["error"], "")
                     if "stream" in status_dict:
                         p.update(task, description=status_dict["stream"])
             logger.info(f"Finished building the image from dockerfile: {self.conf.dockerfile_folder_path}")
@@ -305,10 +307,12 @@ def run(
                 **self._gpu_kwargs(client),
             )
             logs = container.logs(stream=True)
+            print(Rule("[bold green]Docker Logs Begin[/bold green]", style="dark_orange"))
             for log in logs:
                 decoded_log = log.strip().decode()
                 print(decoded_log)
                 log_output += decoded_log + "\n"
+            print(Rule("[bold green]Docker Logs End[/bold green]", style="dark_orange"))
             container.wait()
             container.stop()
             container.remove()