From 2393a41e7237615ced2c3fdd5c49308236b9f276 Mon Sep 17 00:00:00 2001
From: Haoran Pan <167847254+TPLin22@users.noreply.github.com>
Date: Wed, 25 Sep 2024 17:52:11 +0800
Subject: [PATCH] fix: template for kaggle forest & s4e9 (#334)

* s4e9: replace one-hot encoding with ordinal encoding; reshape model
  outputs to 2-D and ravel the final ensemble prediction

* forest-cover-type-prediction: replace the single holdout split with
  5-fold cross validation; the previous script is kept as train_past.py
---
 .../train.py                                  | 175 +++++++++---------
 .../train_past.py                             | 118 ++++++++++++
 .../fea_share_preprocess.py                   |  52 ++----
 .../model/model_randomforest.py               |   2 +-
 .../model/model_xgboost.py                    |   2 +-
 .../playground-series-s4e9_template/train.py  |   5 +-
 6 files changed, 225 insertions(+), 129 deletions(-)
 create mode 100644 rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/train_past.py

diff --git a/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/train.py b/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/train.py
index c2161ebf..65086289 100644
--- a/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/train.py
+++ b/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/train.py
@@ -4,9 +4,10 @@

 import numpy as np
 import pandas as pd
-from fea_share_preprocess import clean_and_impute_data, preprocess_script
 from scipy import stats
-from sklearn.metrics import accuracy_score, matthews_corrcoef
+from sklearn.impute import SimpleImputer
+from sklearn.metrics import accuracy_score
+from sklearn.model_selection import KFold

 # Set random seed for reproducibility
 SEED = 42
@@ -15,19 +16,6 @@
 DIRNAME = Path(__file__).absolute().resolve().parent


-# support various method for metrics calculation
-def compute_metrics_for_classification(y_true, y_pred):
-    """Compute accuracy metric for classification."""
-    accuracy = accuracy_score(y_true, y_pred)
-    return accuracy
-
-
-def compute_metrics_for_classification(y_true, y_pred):
-    """Compute MCC for classification."""
-    mcc = matthews_corrcoef(y_true, y_pred)
-    return mcc
-
-
 def import_module_from_path(module_name, module_path):
     spec = importlib.util.spec_from_file_location(module_name, module_path)
     module = importlib.util.module_from_spec(spec)
@@ -36,81 +24,96 @@


 # 1) Preprocess the data
-X_train, X_valid, y_train, y_valid, X_test, ids = preprocess_script()
-
-# 2) Auto feature engineering
-X_train_l, X_valid_l = [], []
-X_test_l = []
-
-for f in DIRNAME.glob("feature/feat*.py"):
-    cls = import_module_from_path(f.stem, f).feature_engineering_cls()
-    cls.fit(X_train)
-    X_train_f = cls.transform(X_train)
-    X_valid_f = cls.transform(X_valid)
-    X_test_f = cls.transform(X_test)
-
-    X_train_l.append(X_train_f)
-    X_valid_l.append(X_valid_f)
-    X_test_l.append(X_test_f)
-
-X_train = pd.concat(X_train_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_train_l))])
-X_valid = pd.concat(X_valid_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_valid_l))])
-X_test = pd.concat(X_test_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_test_l))])
-
-print(X_train.shape, X_valid.shape, X_test.shape)
-
-# Handle inf and -inf values
-X_train, X_valid, X_test = clean_and_impute_data(X_train, X_valid, X_test)
-
-
-# 3) Train the model
-def flatten_columns(df: pd.DataFrame) -> pd.DataFrame:
-    """
-    Flatten the columns of a DataFrame with MultiIndex columns,
-    for (feature_0, a), (feature_0, b) -> feature_0_a, feature_0_b
-    """
-    if df.columns.nlevels == 1:
-        return df
-    df.columns = ["_".join(col).strip() for col in df.columns.values]
-    return df
-
-
-X_train = flatten_columns(X_train)
-X_valid = flatten_columns(X_valid)
-X_test = flatten_columns(X_test)
-
-model_l = []  # list[tuple[model, predict_func]]
-for f in DIRNAME.glob("model/model*.py"):
-    m = import_module_from_path(f.stem, f)
-    model_l.append((m.fit(X_train, y_train, X_valid, y_valid), m.predict))
-
-# 4) Evaluate the model on the validation set
-y_valid_pred_l = []
-for model, predict_func in model_l:
-    y_valid_pred = predict_func(model, X_valid)
-    y_valid_pred_l.append(y_valid_pred)
-    print(y_valid_pred)
-    print(y_valid_pred.shape)
-
-# 5) Ensemble
-# Majority vote ensemble
-y_valid_pred_ensemble = stats.mode(y_valid_pred_l, axis=0)[0].flatten()
-
-# Compute metrics
-accuracy = accuracy_score(y_valid, y_valid_pred_ensemble)
-print(f"final accuracy on valid set: {accuracy}")
-
-# 6) Save the validation metrics
-pd.Series(data=[accuracy], index=["multi-class accuracy"]).to_csv("submission_score.csv")
+data_df = pd.read_csv("/kaggle/input/train.csv")
+data_df = data_df.drop(["Id"], axis=1)
+
+X_train = data_df.drop(["Cover_Type"], axis=1)
+y_train = data_df["Cover_Type"] - 1

-# 7) Make predictions on the test set and save them
+submission_df = pd.read_csv("/kaggle/input/test.csv")
+ids = submission_df["Id"]
+X_test = submission_df.drop(["Id"], axis=1)
+
+# Set up KFold
+kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
+
+# Store results
+accuracies = []
 y_test_pred_l = []

-for model, predict_func in model_l:
-    y_test_pred_l.append(predict_func(model, X_test))
-
-# For multiclass classification, use the mode of the predictions
-y_test_pred = stats.mode(y_test_pred_l, axis=0)[0].flatten() + 1
+# 3) Train and evaluate using KFold
+fold_number = 1
+for train_index, valid_index in kf.split(X_train):
+    print(f"Starting fold {fold_number}...")
+
+    X_train_l, X_valid_l, X_test_l = [], [], []  # Reset feature lists for each fold
+    X_tr, X_val = X_train.iloc[train_index], X_train.iloc[valid_index]
+    y_tr, y_val = y_train.iloc[train_index], y_train.iloc[valid_index]
+    X_te = X_test
+
+    # Feature engineering
+    for f in DIRNAME.glob("feature/feat*.py"):
+        cls = import_module_from_path(f.stem, f).feature_engineering_cls()
+        cls.fit(X_tr)
+        X_train_f = cls.transform(X_tr)
+        X_valid_f = cls.transform(X_val)
+        X_test_f = cls.transform(X_te)
+
+        X_train_l.append(X_train_f)
+        X_valid_l.append(X_valid_f)
+        X_test_l.append(X_test_f)
+
+    X_tr = pd.concat(X_train_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_train_l))])
+    X_val = pd.concat(X_valid_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_valid_l))])
+    X_te = pd.concat(X_test_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_test_l))])
+
+    print("Shape of X_tr: ", X_tr.shape, " Shape of X_val: ", X_val.shape, " Shape of X_te: ", X_te.shape)
+
+    # Replace inf and -inf with NaN
+    X_tr.replace([np.inf, -np.inf], np.nan, inplace=True)
+    X_val.replace([np.inf, -np.inf], np.nan, inplace=True)
+    X_te.replace([np.inf, -np.inf], np.nan, inplace=True)
+
+    # Impute missing values
+    imputer = SimpleImputer(strategy="mean")
+    X_tr = pd.DataFrame(imputer.fit_transform(X_tr), columns=X_tr.columns)
+    X_val = pd.DataFrame(imputer.transform(X_val), columns=X_val.columns)
+    X_te = pd.DataFrame(imputer.transform(X_te), columns=X_te.columns)
+
+    # Remove duplicate columns
+    X_tr = X_tr.loc[:, ~X_tr.columns.duplicated()]
+    X_val = X_val.loc[:, ~X_val.columns.duplicated()]
+    X_te = X_te.loc[:, ~X_te.columns.duplicated()]
+
+    # Train the model
+    model_l = []  # list[tuple[model, predict_func]]
+    for f in DIRNAME.glob("model/model*.py"):
+        m = import_module_from_path(f.stem, f)
+        model_l.append((m.fit(X_tr, y_tr, X_val, y_val), m.predict))
+
+    # Evaluate the model on the validation set
+    y_valid_pred_l = []
+    for model, predict_func in model_l:
+        y_valid_pred = predict_func(model, X_val)
+        y_valid_pred_l.append(y_valid_pred)
+        y_test_pred_l.append(predict_func(model, X_te))
+
+    # Majority vote ensemble
+    y_valid_pred_ensemble = stats.mode(y_valid_pred_l, axis=0)[0].flatten()
+
+    # Compute metrics
+    accuracy = accuracy_score(y_val, y_valid_pred_ensemble)
+    accuracies.append(accuracy)
+    print(f"Fold {fold_number} accuracy: {accuracy}")
+
+    fold_number += 1
+
+# Print average accuracy
+accuracy = np.mean(accuracies)
+print(f"Average accuracy across folds: {accuracy}")
+pd.Series(data=[accuracy], index=["multi-class accuracy"]).to_csv("submission_score.csv")
+y_test_pred = stats.mode(y_test_pred_l, axis=0)[0].flatten() + 1

 submission_result = pd.DataFrame(y_test_pred, columns=["Cover_Type"])
 submission_result.insert(0, "Id", ids)

diff --git a/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/train_past.py b/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/train_past.py
new file mode 100644
index 00000000..c2161ebf
--- /dev/null
+++ b/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/train_past.py
@@ -0,0 +1,118 @@
+import importlib.util
+import random
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+from fea_share_preprocess import clean_and_impute_data, preprocess_script
+from scipy import stats
+from sklearn.metrics import accuracy_score, matthews_corrcoef
+
+# Set random seed for reproducibility
+SEED = 42
+random.seed(SEED)
+np.random.seed(SEED)
+DIRNAME = Path(__file__).absolute().resolve().parent
+
+
+# support various method for metrics calculation
+def compute_metrics_for_classification(y_true, y_pred):
+    """Compute accuracy metric for classification."""
+    accuracy = accuracy_score(y_true, y_pred)
+    return accuracy
+
+
+def compute_metrics_for_classification(y_true, y_pred):
+    """Compute MCC for classification."""
+    mcc = matthews_corrcoef(y_true, y_pred)
+    return mcc
+
+
+def import_module_from_path(module_name, module_path):
+    spec = importlib.util.spec_from_file_location(module_name, module_path)
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module
+
+
+# 1) Preprocess the data
+X_train, X_valid, y_train, y_valid, X_test, ids = preprocess_script()
+
+# 2) Auto feature engineering
+X_train_l, X_valid_l = [], []
+X_test_l = []
+
+for f in DIRNAME.glob("feature/feat*.py"):
+    cls = import_module_from_path(f.stem, f).feature_engineering_cls()
+    cls.fit(X_train)
+    X_train_f = cls.transform(X_train)
+    X_valid_f = cls.transform(X_valid)
+    X_test_f = cls.transform(X_test)
+
+    X_train_l.append(X_train_f)
+    X_valid_l.append(X_valid_f)
+    X_test_l.append(X_test_f)
+
+X_train = pd.concat(X_train_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_train_l))])
+X_valid = pd.concat(X_valid_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_valid_l))])
+X_test = pd.concat(X_test_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_test_l))])
+
+print(X_train.shape, X_valid.shape, X_test.shape)
+
+# Handle inf and -inf values
+X_train, X_valid, X_test = clean_and_impute_data(X_train, X_valid, X_test)
+
+
+# 3) Train the model
+def flatten_columns(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Flatten the columns of a DataFrame with MultiIndex columns,
+    for (feature_0, a), (feature_0, b) -> feature_0_a, feature_0_b
+    """
+    if df.columns.nlevels == 1:
+        return df
+    df.columns = ["_".join(col).strip() for col in df.columns.values]
+    return df
+
+
+X_train = flatten_columns(X_train)
+X_valid = flatten_columns(X_valid)
+X_test = flatten_columns(X_test)
+
+model_l = []  # list[tuple[model, predict_func]]
+for f in DIRNAME.glob("model/model*.py"):
+    m = import_module_from_path(f.stem, f)
+    model_l.append((m.fit(X_train, y_train, X_valid, y_valid), m.predict))
+
+# 4) Evaluate the model on the validation set
+y_valid_pred_l = []
+for model, predict_func in model_l:
+    y_valid_pred = predict_func(model, X_valid)
+    y_valid_pred_l.append(y_valid_pred)
+    print(y_valid_pred)
+    print(y_valid_pred.shape)
+
+# 5) Ensemble
+# Majority vote ensemble
+y_valid_pred_ensemble = stats.mode(y_valid_pred_l, axis=0)[0].flatten()
+
+# Compute metrics
+accuracy = accuracy_score(y_valid, y_valid_pred_ensemble)
+print(f"final accuracy on valid set: {accuracy}")
+
+# 6) Save the validation metrics
+pd.Series(data=[accuracy], index=["multi-class accuracy"]).to_csv("submission_score.csv")
+
+# 7) Make predictions on the test set and save them
+y_test_pred_l = []
+for model, predict_func in model_l:
+    y_test_pred_l.append(predict_func(model, X_test))
+
+# For multiclass classification, use the mode of the predictions
+y_test_pred = stats.mode(y_test_pred_l, axis=0)[0].flatten() + 1
+
+
+submission_result = pd.DataFrame(y_test_pred, columns=["Cover_Type"])
+submission_result.insert(0, "Id", ids)
+
+submission_result.to_csv("submission.csv", index=False)

diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/fea_share_preprocess.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/fea_share_preprocess.py
index 21fc1652..9684f040 100644
--- a/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/fea_share_preprocess.py
+++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/fea_share_preprocess.py
@@ -5,82 +5,56 @@
 from sklearn.impute import SimpleImputer
 from sklearn.model_selection import train_test_split
 from sklearn.pipeline import Pipeline
-from sklearn.preprocessing import OneHotEncoder
+from sklearn.preprocessing import OrdinalEncoder


 def prepreprocess():
-    """
-    This method loads the data, drops the unnecessary columns, and splits it into train and validation sets.
-    """
-    # Load and preprocess the data
     data_df = pd.read_csv("/kaggle/input/train.csv")
     data_df = data_df.drop(["id"], axis=1)

     X = data_df.drop(["price"], axis=1)
     y = data_df["price"]

-    # Split the data into training and validation sets
     X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.10, random_state=42)

     return X_train, X_valid, y_train, y_valid


 def preprocess_fit(X_train: pd.DataFrame):
-    """
-    Fits the preprocessor on the training data and returns the fitted preprocessor.
- """ - # Identify numerical and categorical features numerical_cols = [cname for cname in X_train.columns if X_train[cname].dtype in ["int64", "float64"]] categorical_cols = [cname for cname in X_train.columns if X_train[cname].dtype == "object"] - # Define preprocessors for numerical and categorical features categorical_transformer = Pipeline( steps=[ ("imputer", SimpleImputer(strategy="most_frequent")), - ("onehot", OneHotEncoder(handle_unknown="ignore")), + ("ordinal", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)), ] ) numerical_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="mean"))]) - # Combine preprocessing steps preprocessor = ColumnTransformer( transformers=[ - ("cat", categorical_transformer, categorical_cols), ("num", numerical_transformer, numerical_cols), + ("cat", categorical_transformer, categorical_cols), ] ) - # Fit the preprocessor on the training data preprocessor.fit(X_train) - return preprocessor + return preprocessor, numerical_cols, categorical_cols -def preprocess_transform(X: pd.DataFrame, preprocessor): - """ - Transforms the given DataFrame using the fitted preprocessor. - Ensures the processed data has consistent features across train, validation, and test sets. - """ - # Transform the data using the fitted preprocessor - X_array = preprocessor.transform(X).toarray() - - # Get feature names for the columns in the transformed data - categorical_cols = [cname for cname in X.columns if X[cname].dtype == "object"] - feature_names = preprocessor.named_transformers_["cat"]["onehot"].get_feature_names_out( - categorical_cols - ).tolist() + [cname for cname in X.columns if X[cname].dtype in ["int64", "float64"]] +def preprocess_transform(X: pd.DataFrame, preprocessor, numerical_cols, categorical_cols): + X_transformed = preprocessor.transform(X) # Convert arrays back to DataFrames - X_transformed = pd.DataFrame(X_array, columns=feature_names, index=X.index) + X_transformed = pd.DataFrame(X_transformed, columns=numerical_cols + categorical_cols, index=X.index) return X_transformed def preprocess_script(): - """ - This method applies the preprocessing steps to the training, validation, and test datasets. 
- """ if os.path.exists("/kaggle/input/X_train.pkl"): X_train = pd.read_pickle("/kaggle/input/X_train.pkl") X_valid = pd.read_pickle("/kaggle/input/X_valid.pkl") @@ -90,19 +64,17 @@ def preprocess_script(): others = pd.read_pickle("/kaggle/input/others.pkl") return X_train, X_valid, y_train, y_valid, X_test, *others + X_train, X_valid, y_train, y_valid = prepreprocess() - # Fit the preprocessor on the training data - preprocessor = preprocess_fit(X_train) + preprocessor, numerical_cols, categorical_cols = preprocess_fit(X_train) - # Preprocess the train, validation, and test data - X_train = preprocess_transform(X_train, preprocessor) - X_valid = preprocess_transform(X_valid, preprocessor) + X_train = preprocess_transform(X_train, preprocessor, numerical_cols, categorical_cols) + X_valid = preprocess_transform(X_valid, preprocessor, numerical_cols, categorical_cols) - # Load and preprocess the test data submission_df = pd.read_csv("/kaggle/input/test.csv") ids = submission_df["id"] submission_df = submission_df.drop(["id"], axis=1) - X_test = preprocess_transform(submission_df, preprocessor) + X_test = preprocess_transform(submission_df, preprocessor, numerical_cols, categorical_cols) return X_train, X_valid, y_train, y_valid, X_test, ids diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/model/model_randomforest.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/model/model_randomforest.py index 867e6b64..dba55127 100644 --- a/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/model/model_randomforest.py +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/model/model_randomforest.py @@ -45,4 +45,4 @@ def predict(model, X): # Predict using the trained model y_pred = model.predict(X_selected) - return y_pred + return y_pred.reshape(-1, 1) diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/model/model_xgboost.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/model/model_xgboost.py index 84a8f5e2..90f4d7a4 100644 --- a/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/model/model_xgboost.py +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/model/model_xgboost.py @@ -34,4 +34,4 @@ def predict(model, X): X = select(X) dtest = xgb.DMatrix(X) y_pred = model.predict(dtest) - return y_pred + return y_pred.reshape(-1, 1) diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/train.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/train.py index 1971b091..46c34a85 100644 --- a/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/train.py +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/train.py @@ -29,8 +29,10 @@ def import_module_from_path(module_name, module_path): return module +print("begin preprocess") # 1) Preprocess the data X_train, X_valid, y_train, y_valid, X_test, ids = preprocess_script() +print("preprocess done") # 2) Auto feature engineering X_train_l, X_valid_l = [], [] @@ -97,6 +99,7 @@ def flatten_columns(df: pd.DataFrame) -> pd.DataFrame: y_valid_pred_l = [] for model, predict_func in model_l: y_valid_pred_l.append(predict_func(model, X_valid)) + print(predict_func(model, X_valid).shape) # 5) Ensemble y_valid_pred = np.mean(y_valid_pred_l, axis=0) @@ -112,7 +115,7 @@ def flatten_columns(df: pd.DataFrame) -> pd.DataFrame: for m, m_pred in model_l: y_test_pred_l.append(m_pred(m, X_test)) -y_test_pred = 
-y_test_pred = np.mean(y_test_pred_l, axis=0)
+y_test_pred = np.mean(y_test_pred_l, axis=0).ravel()

 # 8) Submit predictions for the test set
 submission_result = pd.DataFrame({"id": ids, "price": y_test_pred})
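For reference, the reworked forest-cover-type train.py interleaves feature engineering, imputation, model training, and a majority-vote ensemble inside each fold. Below is a minimal, self-contained sketch of that per-fold pattern; the synthetic dataset and the two stand-in classifiers are illustrative assumptions, since the real template loads its models dynamically from model/model*.py.

import numpy as np
from scipy import stats
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

SEED = 42
# Synthetic stand-in for the Kaggle data: 3-class labels like Cover_Type - 1.
X, y = make_classification(
    n_samples=500, n_features=10, n_informative=6, n_classes=3, random_state=SEED
)

kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
accuracies = []
for fold, (train_idx, valid_idx) in enumerate(kf.split(X), start=1):
    X_tr, X_val = X[train_idx], X[valid_idx]
    y_tr, y_val = y[train_idx], y[valid_idx]

    # Stand-ins for the dynamically loaded model modules; each yields 1-D labels.
    preds = []
    for model in (RandomForestClassifier(random_state=SEED), LogisticRegression(max_iter=1000)):
        model.fit(X_tr, y_tr)
        preds.append(model.predict(X_val))

    # Majority vote across models (axis 0); keepdims=True needs scipy >= 1.9.
    vote = stats.mode(np.stack(preds), axis=0, keepdims=True)[0].flatten()
    acc = accuracy_score(y_val, vote)
    accuracies.append(acc)
    print(f"fold {fold} accuracy: {acc:.4f}")

print(f"average accuracy across folds: {np.mean(accuracies):.4f}")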
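The s4e9 preprocessing change swaps OneHotEncoder for OrdinalEncoder. Ordinal encoding emits exactly one output column per categorical input, so numerical_cols + categorical_cols is a valid header for the transformed frame, the result is dense (no .toarray() needed), and categories unseen at fit time map to -1 instead of raising. A small sketch of that behavior, assuming a toy frame whose column names merely stand in for the real s4e9 schema:

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder

# Hypothetical mini-dataset; "audi" never appears in the training frame.
train = pd.DataFrame({"milage": [10000.0, np.nan, 30000.0], "brand": ["ford", "bmw", "ford"]})
test = pd.DataFrame({"milage": [20000.0, np.nan], "brand": ["audi", np.nan]})

numerical_cols = ["milage"]
categorical_cols = ["brand"]

categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ordinal", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
    ]
)
numerical_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="mean"))])

# Transformer order (num, cat) matches the column header built below.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

preprocessor.fit(train)
X_test = pd.DataFrame(
    preprocessor.transform(test), columns=numerical_cols + categorical_cols, index=test.index
)
print(X_test)  # unseen "audi" becomes -1; one column per input column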
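Finally, the reshape(-1, 1) added to both s4e9 predict() functions standardizes every model's output to the same 2-D shape so the ensemble mean lines up element-wise, and the trailing .ravel() flattens the averaged prediction back to 1-D for the submission column. In miniature:

import numpy as np

# Each predict() now returns shape (n, 1), so stacking stays aligned.
p1 = np.array([10.0, 20.0, 30.0]).reshape(-1, 1)  # e.g. random forest output
p2 = np.array([12.0, 18.0, 33.0]).reshape(-1, 1)  # e.g. xgboost output

y_test_pred = np.mean([p1, p2], axis=0)  # shape (3, 1): element-wise mean
print(y_test_pred.ravel())               # [11.  19.  31.5] -> 1-D submission column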