Skip to content

Commit

Permalink
feat: add s3e11 kaggle template (#324)
Browse files Browse the repository at this point in the history
* s3e11 tpl v1

* some changes

* fix some bugs in s3e11 tpl, change docker logs color

* fix CI
  • Loading branch information
XianBW authored Sep 25, 2024
1 parent 91979c0 commit 8c57524
Show file tree
Hide file tree
Showing 5 changed files with 199 additions and 1 deletion.
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import os

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split


def preprocess_script():
    """
    Build the training, validation, and test datasets for the model.

    If cached pickles exist under ``/kaggle/input`` they are loaded and
    returned directly. Otherwise the raw ``train.csv`` / ``test.csv`` files
    are read, the engineered ``salad`` feature is added, ``store_sqft`` is
    converted to a categorical column, and the training data is split 80/20
    into train and validation parts.

    The regression target is ``log1p(cost)``.

    Returns:
        A tuple ``(X_train, X_valid, y_train, y_valid, X_test, ...)``. The
        trailing element(s) are the unpacked contents of ``others.pkl`` on
        the cached path, or the test ``id`` column on the CSV path.
    """
    if os.path.exists("/kaggle/input/X_train.pkl"):
        # NOTE(review): pickles execute arbitrary code on load; this presumes
        # the cache files were produced by a trusted earlier run.
        X_train = pd.read_pickle("/kaggle/input/X_train.pkl")
        X_valid = pd.read_pickle("/kaggle/input/X_valid.pkl")
        y_train = pd.read_pickle("/kaggle/input/y_train.pkl")
        y_valid = pd.read_pickle("/kaggle/input/y_valid.pkl")
        X_test = pd.read_pickle("/kaggle/input/X_test.pkl")
        others = pd.read_pickle("/kaggle/input/others.pkl")

        return X_train, X_valid, y_train, y_valid, X_test, *others

    # train
    train = pd.read_csv("/kaggle/input/train.csv")
    train["store_sqft"] = train["store_sqft"].astype("category")
    train["salad"] = (train["salad_bar"] + train["prepared_food"]) / 2
    train["log_cost"] = np.log1p(train["cost"])
    most_important_features = [
        "total_children",
        "num_children_at_home",
        "avg_cars_at home(approx).1",
        "store_sqft",
        "coffee_bar",
        "video_store",
        "salad",
        "florist",
    ]

    X_train, X_valid, y_train, y_valid = train_test_split(
        train[most_important_features], train["log_cost"], test_size=0.2, random_state=2023
    )

    # test
    test = pd.read_csv("/kaggle/input/test.csv")
    # Reuse the *training* categorical dtype rather than inferring a new one:
    # a model consuming category codes needs the same value -> code mapping in
    # train and test. Values unseen during training become NaN (missing).
    test["store_sqft"] = test["store_sqft"].astype(train["store_sqft"].dtype)
    test["salad"] = (test["salad_bar"] + test["prepared_food"]) / 2

    ids = test["id"]
    X_test = test.drop(["id"], axis=1)
    X_test = X_test[most_important_features]

    return X_train, X_valid, y_train, y_valid, X_test, ids
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import pandas as pd

"""
Here is the feature engineering code for each task, with a class that has a fit and transform method.
Remember
"""


class IdentityFeature:
    """No-op feature engineering step: learns nothing and returns data unchanged."""

    def fit(self, train_df: pd.DataFrame):
        """
        Accept the training data; there is no state to learn, so nothing happens.
        """
        return None

    def transform(self, X: pd.DataFrame):
        """
        Hand back the very same object that was passed in, without copying.
        """
        unchanged = X
        return unchanged


# Name looked up by code that loads this module as a feature plugin.
feature_engineering_cls = IdentityFeature
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
"""
motivation of the model
"""

import pandas as pd
import xgboost as xgb


def select(X: pd.DataFrame) -> pd.DataFrame:
    """Feature-selection hook; currently a pass-through that keeps every column."""
    # No selection logic is applied: the input object itself is returned.
    selected = X
    return selected


def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame):
    """
    Train an XGBoost regressor on the selected training features and return it.

    ``X_valid`` and ``y_valid`` are accepted as part of the signature but are
    not used during training.
    """
    regressor = xgb.XGBRegressor(
        n_estimators=280,
        learning_rate=0.05,
        max_depth=10,
        subsample=1.0,
        colsample_bytree=1.0,
        tree_method="hist",
        enable_categorical=True,
        verbosity=1,
        min_child_weight=3,
        base_score=4.6,
        random_state=2023,
    )
    # Apply the same feature selection that predict() applies at inference time.
    regressor.fit(select(X_train), y_train)
    return regressor


def predict(model, X_test):
    """
    Run the model on ``X_test`` after applying the same feature selection used in training.
    """
    return model.predict(select(X_test))
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import importlib.util
from pathlib import Path

import numpy as np
import pandas as pd
from fea_share_preprocess import preprocess_script
from sklearn.metrics import mean_squared_error

# Directory containing this script; the feature/ and model/ plugin files are globbed relative to it.
DIRNAME = Path(__file__).absolute().resolve().parent


def import_module_from_path(module_name, module_path):
    """
    Load and execute a Python source file as a module.

    Args:
        module_name: Name assigned to the created module object.
        module_path: Filesystem path of the source file to load.

    Returns:
        The executed module object. It is not registered in ``sys.modules``.

    Raises:
        ImportError: If no import spec/loader can be created for
            ``module_path`` (for example, the file has an unsupported suffix).
    """
    spec = importlib.util.spec_from_file_location(module_name, module_path)
    # spec_from_file_location returns None when it cannot pick a loader for the
    # path; fail with a clear message instead of an AttributeError further down.
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load module {module_name!r} from {module_path}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


# 1) Preprocess the data
X_train, X_valid, y_train, y_valid, X_test, ids = preprocess_script()

# 2) Auto feature engineering
# Every feature/feat*.py module exposes `feature_engineering_cls`. Each one is
# fitted on the training split only and then applied to all three splits.
X_train_l, X_valid_l = [], []
X_test_l = []

# sorted(): Path.glob yields files in arbitrary order; sorting keeps the
# feature_{i} column order reproducible from run to run.
for f in sorted(DIRNAME.glob("feature/feat*.py")):
    cls = import_module_from_path(f.stem, f).feature_engineering_cls()
    cls.fit(X_train)
    X_train_f = cls.transform(X_train)
    X_valid_f = cls.transform(X_valid)
    X_test_f = cls.transform(X_test)

    X_train_l.append(X_train_f)
    X_valid_l.append(X_valid_f)
    X_test_l.append(X_test_f)

# Concatenate the per-feature frames side by side, keyed as feature_0, feature_1, ...
X_train = pd.concat(X_train_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_train_l))])
X_valid = pd.concat(X_valid_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_valid_l))])
X_test = pd.concat(X_test_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_test_l))])

# 3) Train the model
model_l = []  # list[tuple[model, predict_func]]
for f in sorted(DIRNAME.glob("model/model*.py")):
    m = import_module_from_path(f.stem, f)
    model_l.append((m.fit(X_train, y_train, X_valid, y_valid), m.predict))

# 4) Evaluate the model on the validation set
y_valid_pred_l = []
for model, predict_func in model_l:
    y_valid_pred = predict_func(model, X_valid)
    y_valid_pred_l.append(y_valid_pred)

# 5) Ensemble
# Regression task: average the predictions of all models element-wise.
y_valid_pred_ensemble = np.mean(y_valid_pred_l, axis=0)


# 6) Save the validation metrics
# Root of the MSE computed explicitly: the `squared=False` argument of
# mean_squared_error is deprecated and removed in recent scikit-learn releases.
metrics = np.sqrt(mean_squared_error(y_valid, y_valid_pred_ensemble))
print(f"RMLSE on valid set: {metrics}")
pd.Series(data=[metrics], index=["RMLSE"]).to_csv("submission_score.csv")

# 7) Make predictions on the test set and save them
y_test_pred_l = []
for model, predict_func in model_l:
    y_test_pred_l.append(predict_func(model, X_test))


# Average the per-model test predictions, mirroring the validation ensemble.
y_test_pred = np.mean(y_test_pred_l, axis=0)


# expm1 is applied to the averaged predictions before they are written out as "cost".
submission_result = pd.DataFrame(np.expm1(y_test_pred), columns=["cost"])
submission_result.insert(0, "id", ids)

submission_result.to_csv("submission.csv", index=False)
6 changes: 5 additions & 1 deletion rdagent/utils/env.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@
import docker.models.containers
from pydantic import BaseModel
from pydantic_settings import BaseSettings
from rich import print
from rich.progress import Progress, TextColumn
from rich.rule import Rule

from rdagent.log import rdagent_logger as logger

Expand Down Expand Up @@ -207,7 +209,7 @@ def prepare(self):
status_dict = json.loads(part)
if "error" in status_dict:
p.update(task, description=f"[red]error: {status_dict['error']}")
raise docker.errors.BuildError(status_dict["error"])
raise docker.errors.BuildError(status_dict["error"], "")
if "stream" in status_dict:
p.update(task, description=status_dict["stream"])
logger.info(f"Finished building the image from dockerfile: {self.conf.dockerfile_folder_path}")
Expand Down Expand Up @@ -305,10 +307,12 @@ def run(
**self._gpu_kwargs(client),
)
logs = container.logs(stream=True)
print(Rule("[bold green]Docker Logs Begin[/bold green]", style="dark_orange"))
for log in logs:
decoded_log = log.strip().decode()
print(decoded_log)
log_output += decoded_log + "\n"
print(Rule("[bold green]Docker Logs End[/bold green]", style="dark_orange"))
container.wait()
container.stop()
container.remove()
Expand Down

0 comments on commit 8c57524

Please sign in to comment.