From 2393a41e7237615ced2c3fdd5c49308236b9f276 Mon Sep 17 00:00:00 2001
From: Haoran Pan <167847254+TPLin22@users.noreply.github.com>
Date: Wed, 25 Sep 2024 17:52:11 +0800
Subject: [PATCH] fix: template for kaggle forest & s4e9 (#334)

* s4e9: replace one-hot encoding with ordinal encoding; reshape model
  outputs to 2-D and ravel the final ensemble prediction

* forest-cover-type-prediction: replace the single holdout split with
  5-fold cross validation; the previous script is kept as train_past.py
---
 .../train.py                                  | 175 +++++++++---------
 .../train_past.py                             | 118 ++++++++++++
 .../fea_share_preprocess.py                   |  52 ++----
 .../model/model_randomforest.py               |   2 +-
 .../model/model_xgboost.py                    |   2 +-
 .../playground-series-s4e9_template/train.py  |   5 +-
 6 files changed, 225 insertions(+), 129 deletions(-)
 create mode 100644 rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/train_past.py

diff --git a/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/train.py b/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/train.py
index c2161ebf..65086289 100644
--- a/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/train.py
+++ b/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/train.py
@@ -4,9 +4,10 @@

 import numpy as np
 import pandas as pd
-from fea_share_preprocess import clean_and_impute_data, preprocess_script
 from scipy import stats
-from sklearn.metrics import accuracy_score, matthews_corrcoef
+from sklearn.impute import SimpleImputer
+from sklearn.metrics import accuracy_score
+from sklearn.model_selection import KFold

 # Set random seed for reproducibility
 SEED = 42
@@ -15,19 +16,6 @@
 DIRNAME = Path(__file__).absolute().resolve().parent


-# support various method for metrics calculation
-def compute_metrics_for_classification(y_true, y_pred):
-    """Compute accuracy metric for classification."""
-    accuracy = accuracy_score(y_true, y_pred)
-    return accuracy
-
-
-def compute_metrics_for_classification(y_true, y_pred):
-    """Compute MCC for classification."""
-    mcc = matthews_corrcoef(y_true, y_pred)
-    return mcc
-
-
 def import_module_from_path(module_name, module_path):
     spec = importlib.util.spec_from_file_location(module_name, module_path)
     module = importlib.util.module_from_spec(spec)
@@ -36,81 +24,96 @@


 # 1) Preprocess the data
-X_train, X_valid, y_train, y_valid, X_test, ids = preprocess_script()
-
-# 2) Auto feature engineering
-X_train_l, X_valid_l = [], []
-X_test_l = []
-
-for f in DIRNAME.glob("feature/feat*.py"):
-    cls = import_module_from_path(f.stem, f).feature_engineering_cls()
-    cls.fit(X_train)
-    X_train_f = cls.transform(X_train)
-    X_valid_f = cls.transform(X_valid)
-    X_test_f = cls.transform(X_test)
-
-    X_train_l.append(X_train_f)
-    X_valid_l.append(X_valid_f)
-    X_test_l.append(X_test_f)
-
-X_train = pd.concat(X_train_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_train_l))])
-X_valid = pd.concat(X_valid_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_valid_l))])
-X_test = pd.concat(X_test_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_test_l))])
-
-print(X_train.shape, X_valid.shape, X_test.shape)
-
-# Handle inf and -inf values
-X_train, X_valid, X_test = clean_and_impute_data(X_train, X_valid, X_test)
-
-
-# 3) Train the model
-def flatten_columns(df: pd.DataFrame) -> pd.DataFrame:
-    """
-    Flatten the columns of a DataFrame with MultiIndex columns,
-    for (feature_0, a), (feature_0, b) -> feature_0_a, feature_0_b
-    """
-    if df.columns.nlevels == 1:
-        return df
-    df.columns = ["_".join(col).strip() for col in df.columns.values]
-    return df
-
-
-X_train = flatten_columns(X_train)
-X_valid = flatten_columns(X_valid)
-X_test = flatten_columns(X_test)
-
-model_l = []  # list[tuple[model, predict_func]]
-for f in DIRNAME.glob("model/model*.py"):
-    m = import_module_from_path(f.stem, f)
-    model_l.append((m.fit(X_train, y_train, X_valid, y_valid), m.predict))
-
-# 4) Evaluate the model on the validation set
-y_valid_pred_l = []
-for model, predict_func in model_l:
-    y_valid_pred = predict_func(model, X_valid)
-    y_valid_pred_l.append(y_valid_pred)
-    print(y_valid_pred)
-    print(y_valid_pred.shape)
-
-# 5) Ensemble
-# Majority vote ensemble
-y_valid_pred_ensemble = stats.mode(y_valid_pred_l, axis=0)[0].flatten()
-
-# Compute metrics
-accuracy = accuracy_score(y_valid, y_valid_pred_ensemble)
-print(f"final accuracy on valid set: {accuracy}")
-
-# 6) Save the validation metrics
-pd.Series(data=[accuracy], index=["multi-class accuracy"]).to_csv("submission_score.csv")
+data_df = pd.read_csv("/kaggle/input/train.csv")
+data_df = data_df.drop(["Id"], axis=1)
+
+X_train = data_df.drop(["Cover_Type"], axis=1)
+y_train = data_df["Cover_Type"] - 1

-# 7) Make predictions on the test set and save them
+submission_df = pd.read_csv("/kaggle/input/test.csv")
+ids = submission_df["Id"]
+X_test = submission_df.drop(["Id"], axis=1)
+
+# Set up KFold
+kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
+
+# Store results
+accuracies = []
 y_test_pred_l = []

-for model, predict_func in model_l:
-    y_test_pred_l.append(predict_func(model, X_test))
-
-# For multiclass classification, use the mode of the predictions
-y_test_pred = stats.mode(y_test_pred_l, axis=0)[0].flatten() + 1
+# 3) Train and evaluate using KFold
+fold_number = 1
+for train_index, valid_index in kf.split(X_train):
+    print(f"Starting fold {fold_number}...")
+
+    X_train_l, X_valid_l, X_test_l = [], [], []  # Reset feature lists for each fold
+    X_tr, X_val = X_train.iloc[train_index], X_train.iloc[valid_index]
+    y_tr, y_val = y_train.iloc[train_index], y_train.iloc[valid_index]
+    X_te = X_test
+
+    # Feature engineering
+    for f in DIRNAME.glob("feature/feat*.py"):
+        cls = import_module_from_path(f.stem, f).feature_engineering_cls()
+        cls.fit(X_tr)
+        X_train_f = cls.transform(X_tr)
+        X_valid_f = cls.transform(X_val)
+        X_test_f = cls.transform(X_te)
+
+        X_train_l.append(X_train_f)
+        X_valid_l.append(X_valid_f)
+        X_test_l.append(X_test_f)
+
+    X_tr = pd.concat(X_train_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_train_l))])
+    X_val = pd.concat(X_valid_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_valid_l))])
+    X_te = pd.concat(X_test_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_test_l))])
+
+    print("Shape of X_tr: ", X_tr.shape, " Shape of X_val: ", X_val.shape, " Shape of X_te: ", X_te.shape)
+
+    # Replace inf and -inf with NaN
+    X_tr.replace([np.inf, -np.inf], np.nan, inplace=True)
+    X_val.replace([np.inf, -np.inf], np.nan, inplace=True)
+    X_te.replace([np.inf, -np.inf], np.nan, inplace=True)
+
+    # Impute missing values
+    imputer = SimpleImputer(strategy="mean")
+    X_tr = pd.DataFrame(imputer.fit_transform(X_tr), columns=X_tr.columns)
+    X_val = pd.DataFrame(imputer.transform(X_val), columns=X_val.columns)
+    X_te = pd.DataFrame(imputer.transform(X_te), columns=X_te.columns)
+
+    # Remove duplicate columns
+    X_tr = X_tr.loc[:, ~X_tr.columns.duplicated()]
+    X_val = X_val.loc[:, ~X_val.columns.duplicated()]
+    X_te = X_te.loc[:, ~X_te.columns.duplicated()]
+
+    # Train the model
+    model_l = []  # list[tuple[model, predict_func]]
+    for f in DIRNAME.glob("model/model*.py"):
+        m = import_module_from_path(f.stem, f)
+        model_l.append((m.fit(X_tr, y_tr, X_val, y_val), m.predict))
+
+    # Evaluate the model on the validation set
+    y_valid_pred_l = []
+    for model, predict_func in model_l:
+        y_valid_pred = predict_func(model, X_val)
+        y_valid_pred_l.append(y_valid_pred)
+        y_test_pred_l.append(predict_func(model, X_te))
+
+    # Majority vote ensemble
+    y_valid_pred_ensemble = stats.mode(y_valid_pred_l, axis=0)[0].flatten()
+
+    # Compute metrics
+    accuracy = accuracy_score(y_val, y_valid_pred_ensemble)
+    accuracies.append(accuracy)
+    print(f"Fold {fold_number} accuracy: {accuracy}")
+
+    fold_number += 1
+
+# Print average accuracy
+accuracy = np.mean(accuracies)
+print(f"Average accuracy across folds: {accuracy}")
+pd.Series(data=[accuracy], index=["multi-class accuracy"]).to_csv("submission_score.csv")
+y_test_pred = stats.mode(y_test_pred_l, axis=0)[0].flatten() + 1

 submission_result = pd.DataFrame(y_test_pred, columns=["Cover_Type"])
 submission_result.insert(0, "Id", ids)

diff --git a/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/train_past.py b/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/train_past.py
new file mode 100644
index 00000000..c2161ebf
--- /dev/null
+++ b/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/train_past.py
@@ -0,0 +1,118 @@
+import importlib.util
+import random
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+from fea_share_preprocess import clean_and_impute_data, preprocess_script
+from scipy import stats
+from sklearn.metrics import accuracy_score, matthews_corrcoef
+
+# Set random seed for reproducibility
+SEED = 42
+random.seed(SEED)
+np.random.seed(SEED)
+DIRNAME = Path(__file__).absolute().resolve().parent
+
+
+# support various method for metrics calculation
+def compute_metrics_for_classification(y_true, y_pred):
+    """Compute accuracy metric for classification."""
+    accuracy = accuracy_score(y_true, y_pred)
+    return accuracy
+
+
+def compute_metrics_for_classification(y_true, y_pred):
+    """Compute MCC for classification."""
+    mcc = matthews_corrcoef(y_true, y_pred)
+    return mcc
+
+
+def import_module_from_path(module_name, module_path):
+    spec = importlib.util.spec_from_file_location(module_name, module_path)
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module
+
+
+# 1) Preprocess the data
+X_train, X_valid, y_train, y_valid, X_test, ids = preprocess_script()
+
+# 2) Auto feature engineering
+X_train_l, X_valid_l = [], []
+X_test_l = []
+
+for f in DIRNAME.glob("feature/feat*.py"):
+    cls = import_module_from_path(f.stem, f).feature_engineering_cls()
+    cls.fit(X_train)
+    X_train_f = cls.transform(X_train)
+    X_valid_f = cls.transform(X_valid)
+    X_test_f = cls.transform(X_test)
+
+    X_train_l.append(X_train_f)
+    X_valid_l.append(X_valid_f)
+    X_test_l.append(X_test_f)
+
+X_train = pd.concat(X_train_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_train_l))])
+X_valid = pd.concat(X_valid_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_valid_l))])
+X_test = pd.concat(X_test_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_test_l))])
+
+print(X_train.shape, X_valid.shape, X_test.shape)
+
+# Handle inf and -inf values
+X_train, X_valid, X_test = clean_and_impute_data(X_train, X_valid, X_test)
+
+
+# 3) Train the model
+def flatten_columns(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Flatten the columns of a DataFrame with MultiIndex columns,
+    for (feature_0, a), (feature_0, b) -> feature_0_a, feature_0_b
+    """
+    if df.columns.nlevels == 1:
+        return df
+    df.columns = ["_".join(col).strip() for col in df.columns.values]
+    return df
+
+
+X_train = flatten_columns(X_train)
+X_valid = flatten_columns(X_valid)
+X_test = flatten_columns(X_test)
+
+model_l = []  # list[tuple[model, predict_func]]
+for f in DIRNAME.glob("model/model*.py"):
+    m = import_module_from_path(f.stem, f)
+    model_l.append((m.fit(X_train, y_train, X_valid, y_valid), m.predict))
+
+# 4) Evaluate the model on the validation set
+y_valid_pred_l = []
+for model, predict_func in model_l:
+    y_valid_pred = predict_func(model, X_valid)
+    y_valid_pred_l.append(y_valid_pred)
+    print(y_valid_pred)
+    print(y_valid_pred.shape)
+
+# 5) Ensemble
+# Majority vote ensemble
+y_valid_pred_ensemble = stats.mode(y_valid_pred_l, axis=0)[0].flatten()
+
+# Compute metrics
+accuracy = accuracy_score(y_valid, y_valid_pred_ensemble)
+print(f"final accuracy on valid set: {accuracy}")
+
+# 6) Save the validation metrics
+pd.Series(data=[accuracy], index=["multi-class accuracy"]).to_csv("submission_score.csv")
+
+# 7) Make predictions on the test set and save them
+y_test_pred_l = []
+for model, predict_func in model_l:
+    y_test_pred_l.append(predict_func(model, X_test))
+
+# For multiclass classification, use the mode of the predictions
+y_test_pred = stats.mode(y_test_pred_l, axis=0)[0].flatten() + 1
+
+
+submission_result = pd.DataFrame(y_test_pred, columns=["Cover_Type"])
+submission_result.insert(0, "Id", ids)
+
+submission_result.to_csv("submission.csv", index=False)

diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/fea_share_preprocess.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/fea_share_preprocess.py
index 21fc1652..9684f040 100644
--- a/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/fea_share_preprocess.py
+++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/fea_share_preprocess.py
@@ -5,82 +5,56 @@
 from sklearn.impute import SimpleImputer
 from sklearn.model_selection import train_test_split
 from sklearn.pipeline import Pipeline
-from sklearn.preprocessing import OneHotEncoder
+from sklearn.preprocessing import OrdinalEncoder


 def prepreprocess():
-    """
-    This method loads the data, drops the unnecessary columns, and splits it into train and validation sets.
-    """
-    # Load and preprocess the data
     data_df = pd.read_csv("/kaggle/input/train.csv")
     data_df = data_df.drop(["id"], axis=1)

     X = data_df.drop(["price"], axis=1)
     y = data_df["price"]

-    # Split the data into training and validation sets
     X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.10, random_state=42)

     return X_train, X_valid, y_train, y_valid


 def preprocess_fit(X_train: pd.DataFrame):
-    """
-    Fits the preprocessor on the training data and returns the fitted preprocessor.
- """ - # Identify numerical and categorical features numerical_cols = [cname for cname in X_train.columns if X_train[cname].dtype in ["int64", "float64"]] categorical_cols = [cname for cname in X_train.columns if X_train[cname].dtype == "object"] - # Define preprocessors for numerical and categorical features categorical_transformer = Pipeline( steps=[ ("imputer", SimpleImputer(strategy="most_frequent")), - ("onehot", OneHotEncoder(handle_unknown="ignore")), + ("ordinal", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)), ] ) numerical_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="mean"))]) - # Combine preprocessing steps preprocessor = ColumnTransformer( transformers=[ - ("cat", categorical_transformer, categorical_cols), ("num", numerical_transformer, numerical_cols), + ("cat", categorical_transformer, categorical_cols), ] ) - # Fit the preprocessor on the training data preprocessor.fit(X_train) - return preprocessor + return preprocessor, numerical_cols, categorical_cols -def preprocess_transform(X: pd.DataFrame, preprocessor): - """ - Transforms the given DataFrame using the fitted preprocessor. - Ensures the processed data has consistent features across train, validation, and test sets. - """ - # Transform the data using the fitted preprocessor - X_array = preprocessor.transform(X).toarray() - - # Get feature names for the columns in the transformed data - categorical_cols = [cname for cname in X.columns if X[cname].dtype == "object"] - feature_names = preprocessor.named_transformers_["cat"]["onehot"].get_feature_names_out( - categorical_cols - ).tolist() + [cname for cname in X.columns if X[cname].dtype in ["int64", "float64"]] +def preprocess_transform(X: pd.DataFrame, preprocessor, numerical_cols, categorical_cols): + X_transformed = preprocessor.transform(X) # Convert arrays back to DataFrames - X_transformed = pd.DataFrame(X_array, columns=feature_names, index=X.index) + X_transformed = pd.DataFrame(X_transformed, columns=numerical_cols + categorical_cols, index=X.index) return X_transformed def preprocess_script(): - """ - This method applies the preprocessing steps to the training, validation, and test datasets. 
- """ if os.path.exists("/kaggle/input/X_train.pkl"): X_train = pd.read_pickle("/kaggle/input/X_train.pkl") X_valid = pd.read_pickle("/kaggle/input/X_valid.pkl") @@ -90,19 +64,17 @@ def preprocess_script(): others = pd.read_pickle("/kaggle/input/others.pkl") return X_train, X_valid, y_train, y_valid, X_test, *others + X_train, X_valid, y_train, y_valid = prepreprocess() - # Fit the preprocessor on the training data - preprocessor = preprocess_fit(X_train) + preprocessor, numerical_cols, categorical_cols = preprocess_fit(X_train) - # Preprocess the train, validation, and test data - X_train = preprocess_transform(X_train, preprocessor) - X_valid = preprocess_transform(X_valid, preprocessor) + X_train = preprocess_transform(X_train, preprocessor, numerical_cols, categorical_cols) + X_valid = preprocess_transform(X_valid, preprocessor, numerical_cols, categorical_cols) - # Load and preprocess the test data submission_df = pd.read_csv("/kaggle/input/test.csv") ids = submission_df["id"] submission_df = submission_df.drop(["id"], axis=1) - X_test = preprocess_transform(submission_df, preprocessor) + X_test = preprocess_transform(submission_df, preprocessor, numerical_cols, categorical_cols) return X_train, X_valid, y_train, y_valid, X_test, ids diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/model/model_randomforest.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/model/model_randomforest.py index 867e6b64..dba55127 100644 --- a/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/model/model_randomforest.py +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/model/model_randomforest.py @@ -45,4 +45,4 @@ def predict(model, X): # Predict using the trained model y_pred = model.predict(X_selected) - return y_pred + return y_pred.reshape(-1, 1) diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/model/model_xgboost.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/model/model_xgboost.py index 84a8f5e2..90f4d7a4 100644 --- a/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/model/model_xgboost.py +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/model/model_xgboost.py @@ -34,4 +34,4 @@ def predict(model, X): X = select(X) dtest = xgb.DMatrix(X) y_pred = model.predict(dtest) - return y_pred + return y_pred.reshape(-1, 1) diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/train.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/train.py index 1971b091..46c34a85 100644 --- a/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/train.py +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/train.py @@ -29,8 +29,10 @@ def import_module_from_path(module_name, module_path): return module +print("begin preprocess") # 1) Preprocess the data X_train, X_valid, y_train, y_valid, X_test, ids = preprocess_script() +print("preprocess done") # 2) Auto feature engineering X_train_l, X_valid_l = [], [] @@ -97,6 +99,7 @@ def flatten_columns(df: pd.DataFrame) -> pd.DataFrame: y_valid_pred_l = [] for model, predict_func in model_l: y_valid_pred_l.append(predict_func(model, X_valid)) + print(predict_func(model, X_valid).shape) # 5) Ensemble y_valid_pred = np.mean(y_valid_pred_l, axis=0) @@ -112,7 +115,7 @@ def flatten_columns(df: pd.DataFrame) -> pd.DataFrame: for m, m_pred in model_l: y_test_pred_l.append(m_pred(m, X_test)) -y_test_pred = 
-y_test_pred = np.mean(y_test_pred_l, axis=0)
+y_test_pred = np.mean(y_test_pred_l, axis=0).ravel()

 # 8) Submit predictions for the test set
 submission_result = pd.DataFrame({"id": ids, "price": y_test_pred})
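For reference, the reworked forest-cover-type train.py interleaves feature engineering, imputation, model training, and a majority-vote ensemble inside each fold. Below is a minimal, self-contained sketch of that per-fold pattern; the synthetic dataset and the two stand-in classifiers are illustrative assumptions, since the real template loads its models dynamically from model/model*.py.

import numpy as np
from scipy import stats
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

SEED = 42
# Synthetic stand-in for the Kaggle data: 3-class labels like Cover_Type - 1.
X, y = make_classification(
    n_samples=500, n_features=10, n_informative=6, n_classes=3, random_state=SEED
)

kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
accuracies = []
for fold, (train_idx, valid_idx) in enumerate(kf.split(X), start=1):
    X_tr, X_val = X[train_idx], X[valid_idx]
    y_tr, y_val = y[train_idx], y[valid_idx]

    # Stand-ins for the dynamically loaded model modules; each yields 1-D labels.
    preds = []
    for model in (RandomForestClassifier(random_state=SEED), LogisticRegression(max_iter=1000)):
        model.fit(X_tr, y_tr)
        preds.append(model.predict(X_val))

    # Majority vote across models (axis 0); keepdims=True needs scipy >= 1.9.
    vote = stats.mode(np.stack(preds), axis=0, keepdims=True)[0].flatten()
    acc = accuracy_score(y_val, vote)
    accuracies.append(acc)
    print(f"fold {fold} accuracy: {acc:.4f}")

print(f"average accuracy across folds: {np.mean(accuracies):.4f}")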
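The s4e9 preprocessing change swaps OneHotEncoder for OrdinalEncoder. Ordinal encoding emits exactly one output column per categorical input, so numerical_cols + categorical_cols is a valid header for the transformed frame, the result is dense (no .toarray() needed), and categories unseen at fit time map to -1 instead of raising. A small sketch of that behavior, assuming a toy frame whose column names merely stand in for the real s4e9 schema:

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder

# Hypothetical mini-dataset; "audi" never appears in the training frame.
train = pd.DataFrame({"milage": [10000.0, np.nan, 30000.0], "brand": ["ford", "bmw", "ford"]})
test = pd.DataFrame({"milage": [20000.0, np.nan], "brand": ["audi", np.nan]})

numerical_cols = ["milage"]
categorical_cols = ["brand"]

categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ordinal", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
    ]
)
numerical_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="mean"))])

# Transformer order (num, cat) matches the column header built below.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

preprocessor.fit(train)
X_test = pd.DataFrame(
    preprocessor.transform(test), columns=numerical_cols + categorical_cols, index=test.index
)
print(X_test)  # unseen "audi" becomes -1; one column per input column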
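Finally, the reshape(-1, 1) added to both s4e9 predict() functions standardizes every model's output to the same 2-D shape so the ensemble mean lines up element-wise, and the trailing .ravel() flattens the averaged prediction back to 1-D for the submission column. In miniature:

import numpy as np

# Each predict() now returns shape (n, 1), so stacking stays aligned.
p1 = np.array([10.0, 20.0, 30.0]).reshape(-1, 1)  # e.g. random forest output
p2 = np.array([12.0, 18.0, 33.0]).reshape(-1, 1)  # e.g. xgboost output

y_test_pred = np.mean([p1, p2], axis=0)  # shape (3, 1): element-wise mean
print(y_test_pred.ravel())               # [11.  19.  31.5] -> 1-D submission column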