Skip to content

Commit

Permalink
feat: New competition - Optiver (#356)
Browse files Browse the repository at this point in the history
* Adding the competition: Optiver Volatility Prediction

* Fixing for CI

* Updating a new competition @ Optiver

* re-writing the optiver competition

* Revise for better commit

* Further fixes

* Further fixes

* Fixes
  • Loading branch information
xisen-w authored Sep 26, 2024
1 parent 13c116d commit 3705efe
Show file tree
Hide file tree
Showing 5 changed files with 321 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import os

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder


def prepreprocess():
    """
    Load the Optiver training tables, join them, and split into train/validation.

    Reads train.csv and the book/trade parquet data, left-joins both onto the
    training rows on (stock_id, time_id), separates the "target" column, and
    holds out 20% of the rows with a fixed random_state of 42.

    Returns X_train, X_valid, y_train, y_valid.
    """
    join_keys = ["stock_id", "time_id"]

    # Raw inputs: labelled rows first, then order-book and trade data
    labels = pd.read_csv("/kaggle/input/optiver-realized-volatility-prediction/train.csv")
    book = pd.read_parquet("/kaggle/input/optiver-realized-volatility-prediction/book_train.parquet")
    trades = pd.read_parquet("/kaggle/input/optiver-realized-volatility-prediction/trade_train.parquet")

    # Left joins keep every training row even when it has no book/trade match
    joined = labels.merge(book, on=join_keys, how="left")
    joined = joined.merge(trades, on=join_keys, how="left")

    # Separate features from the regression target, then split
    features = joined.drop(["target"], axis=1)
    target = joined["target"]
    X_train, X_valid, y_train, y_valid = train_test_split(features, target, test_size=0.2, random_state=42)
    return X_train, X_valid, y_train, y_valid


def preprocess_fit(X_train: pd.DataFrame):
    """
    Fit the imputation/encoding preprocessor on the training features.

    Numerical columns are mean-imputed. Object (string) columns are imputed with
    their most frequent value and ordinal-encoded; categories not seen during
    fitting are encoded as -1.

    Returns the fitted ColumnTransformer and the lists of numerical and
    categorical column names it was built with.
    """
    # Select every numeric width (int16, int32, float32, ...), not only
    # int64/float64. A column that lands in neither list is not passed to any
    # transformer and is dropped by ColumnTransformer's default remainder="drop".
    numerical_cols = X_train.select_dtypes(include="number").columns.tolist()
    categorical_cols = X_train.select_dtypes(include="object").columns.tolist()

    categorical_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("ordinal", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
        ]
    )

    numerical_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="mean"))])

    # Numerical columns come first in the output, followed by categorical ones
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numerical_transformer, numerical_cols),
            ("cat", categorical_transformer, categorical_cols),
        ]
    )

    preprocessor.fit(X_train)

    return preprocessor, numerical_cols, categorical_cols


def preprocess_transform(X: pd.DataFrame, preprocessor, numerical_cols, categorical_cols):
    """
    Apply a fitted preprocessor to X and wrap the result in a DataFrame.

    The output columns are labelled numerical_cols followed by categorical_cols,
    and the row index of X is carried over.
    """
    values = preprocessor.transform(X)
    column_names = numerical_cols + categorical_cols
    # The transformer hands back a bare array, so re-attach labels here
    return pd.DataFrame(values, index=X.index, columns=column_names)


def preprocess_script():
    """
    Produce the preprocessed train/validation/test data for the training script.

    When pickled results already exist under /kaggle/input they are loaded and
    returned as-is. Otherwise the raw data is loaded and split, the preprocessor
    is fitted on the training split, and train/validation/test are transformed.

    Returns X_train, X_valid, y_train, y_valid, X_test, followed by the test ids
    (or, on the cached path, the unpacked contents of others.pkl).
    """
    # Fast path: reuse cached pickles when they are present
    if os.path.exists("/kaggle/input/X_train.pkl"):
        X_train = pd.read_pickle("/kaggle/input/X_train.pkl")
        X_valid = pd.read_pickle("/kaggle/input/X_valid.pkl")
        y_train = pd.read_pickle("/kaggle/input/y_train.pkl")
        y_valid = pd.read_pickle("/kaggle/input/y_valid.pkl")
        X_test = pd.read_pickle("/kaggle/input/X_test.pkl")
        others = pd.read_pickle("/kaggle/input/others.pkl")
        return X_train, X_valid, y_train, y_valid, X_test, *others

    X_train, X_valid, y_train, y_valid = prepreprocess()

    # (preprocessor, numerical_cols, categorical_cols), fitted on the training split only
    fitted = preprocess_fit(X_train)

    X_train = preprocess_transform(X_train, *fitted)
    X_valid = preprocess_transform(X_valid, *fitted)

    # NOTE(review): this path and the "id" column differ from the competition
    # directory read in prepreprocess() - confirm they match the prepared data.
    test_df = pd.read_csv("/kaggle/input/test.csv")
    ids = test_df["id"]
    test_df = test_df.drop(["id"], axis=1)
    X_test = preprocess_transform(test_df, *fitted)

    return X_train, X_valid, y_train, y_valid, X_test, ids
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import pandas as pd

"""
Feature engineering code for this task: a class exposing `fit` and `transform` methods.
The module must expose that class as `feature_engineering_cls` so the training script can load it.
"""


class IdentityFeature:
    """Pass-through feature step: learns nothing and returns its input unchanged."""

    def fit(self, train_df: pd.DataFrame):
        """
        No-op; the identity feature has nothing to learn from the training data.
        """
        return None

    def transform(self, X: pd.DataFrame):
        """
        Return the given frame as-is (the same object, not a copy).
        """
        return X


feature_engineering_cls = IdentityFeature
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Feature-selection hook shared by fit and predict.

    Currently a pass-through that keeps every column; put real selection
    logic here when it is needed.
    """
    selected = X
    return selected


def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
    """
    Train a random forest regressor and print its RMSE on the validation split.

    Both feature frames pass through select() before use. Returns the fitted model.
    """
    forest = RandomForestRegressor(n_estimators=100, random_state=32, n_jobs=-1)

    # Same feature selection for both splits
    train_features = select(X_train)
    valid_features = select(X_valid)

    forest.fit(train_features, y_train)

    # Hold-out error, reported as root mean squared error
    valid_predictions = forest.predict(valid_features)
    rmse = np.sqrt(mean_squared_error(y_valid, valid_predictions))
    print(f"Validation RMSE: {rmse:.4f}")

    return forest


def predict(model, X):
    """
    Predict with a trained model, applying the same feature selection as fit.

    Returns the predictions reshaped into a single column, shape (n, 1).
    """
    features = select(X)
    return model.predict(features).reshape(-1, 1)
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import pandas as pd
import xgboost as xgb


def select(X: pd.DataFrame) -> pd.DataFrame:
    """Feature-selection hook; currently keeps every column and returns X itself."""
    selected = X
    return selected


def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame):
    """
    Train an XGBoost regressor for 200 boosting rounds.

    Both feature frames pass through select() first, and the train and
    validation sets are both passed to xgb.train as evaluation sets.
    Returns the trained booster.
    """
    train_features = select(X_train)
    valid_features = select(X_valid)
    train_matrix = xgb.DMatrix(train_features, label=y_train)
    valid_matrix = xgb.DMatrix(valid_features, label=y_valid)

    booster_params = {
        "objective": "reg:squarederror",  # squared-error regression loss
        "nthread": -1,
    }
    boosting_rounds = 200
    watchlist = [(train_matrix, "train"), (valid_matrix, "eval")]

    return xgb.train(booster_params, train_matrix, boosting_rounds, watchlist)


def predict(model, X):
    """
    Predict with a trained booster, applying the same feature selection as fit.

    Returns the predictions reshaped into a single column, shape (n, 1).
    """
    features = select(X)
    test_matrix = xgb.DMatrix(features)
    return model.predict(test_matrix).reshape(-1, 1)
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
import importlib.util
import random
from pathlib import Path

import numpy as np
import pandas as pd
from fea_share_preprocess import preprocess_script
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder

# Set random seed for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
DIRNAME = Path(__file__).absolute().resolve().parent


def compute_rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


def import_module_from_path(module_name, module_path):
    """
    Load and execute a Python source file as a module.

    module_name: name given to the created module object.
    module_path: filesystem path of the source file.

    Returns the executed module object.
    Raises ImportError when no import spec or loader can be built for the path
    (for example, a file suffix the import system does not recognise).
    """
    spec = importlib.util.spec_from_file_location(module_name, module_path)
    # spec_from_file_location returns None when it cannot pick a loader; fail
    # with a clear message instead of an AttributeError on the lines below.
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load module {module_name!r} from {module_path}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


print("begin preprocess")
# 1) Preprocess the data
X_train, X_valid, y_train, y_valid, X_test, ids = preprocess_script()
print("preprocess done")

# 2) Auto feature engineering
X_train_l, X_valid_l = [], []
X_test_l = []

# Sort the matches: glob order is not guaranteed, and each file's position
# decides its "feature_i" column prefix below.
for f in sorted(DIRNAME.glob("feature/feat*.py")):
    cls = import_module_from_path(f.stem, f).feature_engineering_cls()
    # Fit on the training split only, then apply to all three splits
    cls.fit(X_train)
    X_train_f = cls.transform(X_train)
    X_valid_f = cls.transform(X_valid)
    X_test_f = cls.transform(X_test)

    X_train_l.append(X_train_f)
    X_valid_l.append(X_valid_f)
    X_test_l.append(X_test_f)

# Side-by-side concat; the keys add a "feature_i" top level to the columns
X_train = pd.concat(X_train_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_train_l))])
X_valid = pd.concat(X_valid_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_valid_l))])
X_test = pd.concat(X_test_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_test_l))])

print(X_train.shape, X_valid.shape, X_test.shape)

# Handle inf and -inf values: turn them into NaN so the imputer below fills them
X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_valid.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)

from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="mean")

# Column means are learned on the training split only. The row index is passed
# through so the frames keep the same row labels as y_train / y_valid.
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_valid = pd.DataFrame(imputer.transform(X_valid), columns=X_valid.columns, index=X_valid.index)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns, index=X_test.index)

# Remove duplicate columns (keeps the first occurrence of each column label)
X_train = X_train.loc[:, ~X_train.columns.duplicated()]
X_valid = X_valid.loc[:, ~X_valid.columns.duplicated()]
X_test = X_test.loc[:, ~X_test.columns.duplicated()]


# 3) Train the model
def flatten_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Flatten the columns of a DataFrame with MultiIndex columns,
    for (feature_0, a), (feature_0, b) -> feature_0_a, feature_0_b

    Frames whose columns have a single level are returned untouched. Otherwise
    the columns are renamed in place and the same frame is returned.
    """
    if df.columns.nlevels == 1:
        return df
    # Convert each level value to str first: column labels may be ints or
    # other non-string objects, which "_".join would reject with a TypeError.
    df.columns = ["_".join(map(str, col)).strip() for col in df.columns.values]
    return df


# Flatten the (feature_i, column) MultiIndex into plain string column names
X_train = flatten_columns(X_train)
X_valid = flatten_columns(X_valid)
X_test = flatten_columns(X_test)

model_l = []  # list[tuple[model, predict_func,]]
# Sorted so models are trained and averaged in the same order on every run
for f in sorted(DIRNAME.glob("model/model*.py")):
    m = import_module_from_path(f.stem, f)
    model_l.append((m.fit(X_train, y_train, X_valid, y_valid), m.predict))

# 4) Evaluate the model on the validation set
y_valid_pred_l = []
for model, predict_func in model_l:
    # Predict once and reuse the result for both the ensemble and the shape log
    pred = predict_func(model, X_valid)
    y_valid_pred_l.append(pred)
    print(pred.shape)

# 5) Ensemble: element-wise mean of the per-model predictions
y_valid_pred = np.mean(y_valid_pred_l, axis=0)

rmse = compute_rmse(y_valid, y_valid_pred)
print("Final RMSE on validation set: ", rmse)

# 6) Save the validation RMSE
pd.Series(data=[rmse], index=["RMSE"]).to_csv("submission_score.csv")

# 7) Make predictions on the test set and save them
y_test_pred_l = []
for m, m_pred in model_l:
    y_test_pred_l.append(m_pred(m, X_test))

# Average across models, then flatten to 1-D for the submission column
y_test_pred = np.mean(y_test_pred_l, axis=0).ravel()

# 8) Submit predictions for the test set
# NOTE(review): the "id" / "price" column names look inherited from another
# competition's template - confirm against the expected submission format.
submission_result = pd.DataFrame({"id": ids, "price": y_test_pred})
submission_result.to_csv("submission.csv", index=False)

0 comments on commit 3705efe

Please sign in to comment.