
Commit 2e383b1

fix: fix some errors in scenario.py, proposal.py and runner.py and several complex competition scenarios (#365)

* fix several bugs in proposal and runner

* fix a bug in feedback-prize-english-language-learning

* fix some bugs and templates

* fix the bugs in the optiver and nlp problems
WinstonLiyt authored Sep 27, 2024
1 parent acac507 commit 2e383b1
Showing 14 changed files with 155 additions and 171 deletions.
7 changes: 4 additions & 3 deletions rdagent/log/ui/app.py
@@ -443,9 +443,10 @@ def tasks_window(tasks: list[FactorTask | ModelTask]):
st.latex(ft.factor_formulation)

mks = "| Variable | Description |\n| --- | --- |\n"
-for v, d in ft.variables.items():
-    mks += f"| ${v}$ | {d} |\n"
-st.markdown(mks)
+if isinstance(ft.variables, dict):
+    for v, d in ft.variables.items():
+        mks += f"| ${v}$ | {d} |\n"
+    st.markdown(mks)

elif isinstance(tasks[0], ModelTask):
st.markdown("**Model Tasks🚩**")
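Why the guard matters: tasks_window renders ft.variables as a markdown table, and an LLM-generated factor can carry a variables field that is not a dict. A minimal sketch of the failure mode the isinstance check prevents (the value below is made up):

factor_variables = "not generated"  # hypothetical non-dict LLM output
mks = "| Variable | Description |\n| --- | --- |\n"
if isinstance(factor_variables, dict):  # same guard as the fix above
    for v, d in factor_variables.items():
        mks += f"| ${v}$ | {d} |\n"
print(mks)  # header only; no AttributeError from calling .items() on a str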
2 changes: 1 addition & 1 deletion rdagent/scenarios/kaggle/developer/feedback.py
@@ -163,7 +163,7 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trac
self.scen.vector_base.add_experience_to_vector_base(experiment_feedback)
self.scen.vector_base.save()
elif self.scen.if_using_graph_rag:
-    self.scen.trace.knowledge_base.load_from_documents([experiment_feedback], self.scen)
+    trace.knowledge_base.load_from_documents([experiment_feedback], self.scen)

return HypothesisFeedback(
observations=observations,
124 changes: 66 additions & 58 deletions rdagent/scenarios/kaggle/developer/runner.py
@@ -32,6 +32,48 @@ def get_cache_key(self, exp: ASpecificExp) -> str:
codes = "\n".join(codes)
return md5_hash(codes)

+def extract_model_task_from_code(self, code: str) -> str:
+    sys_prompt = (
+        Environment(undefined=StrictUndefined)
+        .from_string(prompt_dict["extract_model_task_from_code"]["system"])
+        .render()
+    )
+
+    user_prompt = (
+        Environment(undefined=StrictUndefined)
+        .from_string(prompt_dict["extract_model_task_from_code"]["user"])
+        .render(file_content=code)
+    )
+
+    model_task_description = APIBackend().build_messages_and_create_chat_completion(
+        user_prompt=user_prompt,
+        system_prompt=sys_prompt,
+        json_mode=True,
+    )
+
+    try:
+        response_json_analysis = json.loads(model_task_description)
+        task_desc = f"""name: {response_json_analysis['name']}
+description: {response_json_analysis['description']}
+"""
+        task_desc += (
+            f"formulation: {response_json_analysis['formulation']}\n"
+            if response_json_analysis.get("formulation")
+            else ""
+        )
+        task_desc += f"architecture: {response_json_analysis['architecture']}\n"
+        task_desc += (
+            f"variables: {json.dumps(response_json_analysis['variables'], indent=4)}\n"
+            if response_json_analysis.get("variables")
+            else ""
+        )
+        task_desc += f"hyperparameters: {json.dumps(response_json_analysis['hyperparameters'], indent=4)}\n"
+        task_desc += f"model_type: {response_json_analysis['model_type']}\n"
+    except json.JSONDecodeError:
+        task_desc = "Failed to parse LLM's response as JSON"
+
+    return task_desc
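Note that this method moves out of KGFactorRunner (deleted further down) into the shared cached-runner base class, so both the factor and model runners can describe model code. A minimal sketch, not from the repository, of the JSON shape its parser expects; every field value here is hypothetical:

import json

stub_response = json.dumps(
    {
        "name": "XGBoostRegressor",  # all values are made up
        "description": "Gradient-boosted tree regressor",
        "formulation": None,  # optional key, guarded with .get()
        "architecture": "500 boosted trees",
        "variables": None,  # optional key, guarded with .get()
        "hyperparameters": {"n_estimators": 500},
        "model_type": "Tabular",
    }
)

parsed = json.loads(stub_response)
# A missing/None "formulation" or "variables" simply omits that line from
# the task description; a malformed payload falls into json.JSONDecodeError.
assert parsed.get("formulation") is None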

def init_develop(self, exp: KGFactorExperiment | KGModelExperiment) -> KGFactorExperiment | KGModelExperiment:
"""
For the initial development, the experiment serves as a benchmark for feature engineering.
@@ -59,21 +101,27 @@ def init_develop(self, exp: KGFactorExperiment | KGModelExperiment) -> KGFactorExperiment | KGModelExperiment:
feature_shape = org_data.shape[-1]
exp.experiment_workspace.data_description.append((sub_task.get_task_information(), feature_shape))

-sub_model_1_description = (
-    self.extract_model_task_from_code(
-        (exp.experiment_workspace.workspace_path / "model" / "model_randomforest.py").read_text()
-    )
-    + f"""code: { (exp.experiment_workspace.workspace_path / "model" / "model_randomforest.py").read_text()}"""
-)
-sub_model_2_description = (
-    self.extract_model_task_from_code(
-        (exp.experiment_workspace.workspace_path / "model" / "model_xgboost.py").read_text()
-    )
-    + f"""code: { (exp.experiment_workspace.workspace_path / "model" / "model_xgboost.py").read_text()}"""
-)
-exp.experiment_workspace.model_description["XGBoost"] = sub_model_1_description
-exp.experiment_workspace.model_description["RandomForest"] = sub_model_2_description
+model_map = {
+    "XGBoost": "model_xgboost.py",
+    "RandomForest": "model_randomforest.py",
+    "LightGBM": "model_lightgbm.py",
+    "NN": "model_nn.py",
+}
+
+workspace_path = exp.experiment_workspace.workspace_path / "model"
+
+for model_name, model_file in model_map.items():
+    model_file_path = workspace_path / model_file
+
+    if model_file_path.exists():
+        model_description = (
+            self.extract_model_task_from_code(model_file_path.read_text())
+            + f"""code: {model_file_path.read_text()}"""
+        )
+    else:
+        model_description = ""
+
+    exp.experiment_workspace.model_description[model_name] = model_description
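A small sketch (nothing below comes from the repo) of why the map-plus-exists() loop generalizes the old hard-coded pair: only model files actually present in the workspace get a non-empty description, and supporting a new model type is one extra map entry.

from pathlib import Path
import tempfile

model_map = {
    "XGBoost": "model_xgboost.py",
    "RandomForest": "model_randomforest.py",
    "LightGBM": "model_lightgbm.py",
    "NN": "model_nn.py",
}

with tempfile.TemporaryDirectory() as tmp:
    workspace = Path(tmp)  # stands in for the experiment workspace
    (workspace / "model_xgboost.py").write_text("# stub model file\n")
    descriptions = {
        name: (workspace / f).read_text() if (workspace / f).exists() else ""
        for name, f in model_map.items()
    }
    print({name: bool(desc) for name, desc in descriptions.items()})
    # {'XGBoost': True, 'RandomForest': False, 'LightGBM': False, 'NN': False}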

if RUNNER_SETTINGS.cache_result:
self.dump_cache_result(exp, result)
@@ -120,51 +168,7 @@ def develop(self, exp: KGModelExperiment) -> KGModelExperiment:


class KGFactorRunner(KGCachedRunner[KGFactorExperiment]):
-def extract_model_task_from_code(self, code: str) -> str:
-    sys_prompt = (
-        Environment(undefined=StrictUndefined)
-        .from_string(prompt_dict["extract_model_task_from_code"]["system"])
-        .render()
-    )
-
-    user_prompt = (
-        Environment(undefined=StrictUndefined)
-        .from_string(prompt_dict["extract_model_task_from_code"]["user"])
-        .render(file_content=code)
-    )
-
-    model_task_description = APIBackend().build_messages_and_create_chat_completion(
-        user_prompt=user_prompt,
-        system_prompt=sys_prompt,
-        json_mode=True,
-    )
-
-    try:
-        response_json_analysis = json.loads(model_task_description)
-        task_desc = f"""name: {response_json_analysis['name']}
-description: {response_json_analysis['description']}
-"""
-        task_desc += (
-            f"formulation: {response_json_analysis['formulation']}\n"
-            if response_json_analysis.get("formulation")
-            else ""
-        )
-        task_desc += f"architecture: {response_json_analysis['architecture']}\n"
-        task_desc += (
-            f"variables: {json.dumps(response_json_analysis['variables'], indent=4)}\n"
-            if response_json_analysis.get("variables")
-            else ""
-        )
-        task_desc += f"hyperparameters: {json.dumps(response_json_analysis['hyperparameters'], indent=4)}\n"
-        task_desc += f"model_type: {response_json_analysis['model_type']}\n"
-    except json.JSONDecodeError:
-        task_desc = "Failed to parse LLM's response as JSON"
-
-    return task_desc
def develop(self, exp: KGFactorExperiment) -> KGFactorExperiment:
-if exp.based_experiments and exp.based_experiments[-1].result is None:
-    exp.based_experiments[-1] = self.init_develop(exp.based_experiments[-1])
current_feature_file_count = len(list(exp.experiment_workspace.workspace_path.glob("feature/feature*.py")))
implemented_factor_count = 0
for sub_ws in exp.sub_workspace_list:
@@ -179,6 +183,10 @@ def develop(self, exp: KGFactorExperiment) -> KGFactorExperiment:
if implemented_factor_count == 0:
raise FactorEmptyError("No factor is implemented")

+# initial template result
+if exp.based_experiments and exp.based_experiments[-1].result is None:
+    exp.based_experiments[-1] = self.init_develop(exp.based_experiments[-1])

if RUNNER_SETTINGS.cache_result:
cache_hit, result = self.get_cache_result(exp)
if cache_hit:
@@ -36,12 +36,8 @@ def data_cleaner(text):

y_train = train[["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]]

-vectorizer = TfidfVectorizer()
-X_train = vectorizer.fit_transform(train["full_text"])
-X_test = vectorizer.transform(test["full_text"])
-
-X_train = pd.DataFrame.sparse.from_spmatrix(X_train)
-X_test = pd.DataFrame.sparse.from_spmatrix(X_test)
+X_train = train[["full_text"]]
+X_test = test[["full_text"]]

X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

@@ -1,4 +1,5 @@
import pandas as pd
+from sklearn.feature_extraction.text import TfidfVectorizer

"""
Here is the feature engineering code for each task, with a class that has a fit and transform method.
@@ -11,12 +12,15 @@ def fit(self, train_df: pd.DataFrame):
"""
Fit the feature engineering model to the training data.
"""
-pass
+self.vectorizer = TfidfVectorizer()
+self.vectorizer.fit(train_df["full_text"])

def transform(self, X: pd.DataFrame):
"""
Transform the input data.
"""
+X = self.vectorizer.transform(X["full_text"])
+X = pd.DataFrame.sparse.from_spmatrix(X)
return X
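Taken together with the preprocessing change above, TF-IDF now lives in a feature class with a proper fit/transform split. A hedged sketch of that contract on made-up data; fitting on the training text only means the vocabulary cannot leak from validation:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

train_df = pd.DataFrame({"full_text": ["good essay", "bad essay"]})
valid_df = pd.DataFrame({"full_text": ["good grammar"]})

vectorizer = TfidfVectorizer()
vectorizer.fit(train_df["full_text"])  # vocabulary from train only

X_train = pd.DataFrame.sparse.from_spmatrix(vectorizer.transform(train_df["full_text"]))
X_valid = pd.DataFrame.sparse.from_spmatrix(vectorizer.transform(valid_df["full_text"]))
assert X_train.shape[1] == X_valid.shape[1]  # same vocabulary, same width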


@@ -22,7 +22,7 @@ def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_v
X_train = select(X_train)

xgb_estimator = xgb.XGBRegressor(
-    n_estimators=500, random_state=0, objective="reg:squarederror", tree_method="gpu_hist", device="cuda"
+    n_estimators=500, random_state=0, objective="reg:squarederror", tree_method="hist", device="cuda"
)

model = MultiOutputRegressor(xgb_estimator, n_jobs=-1)
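The tree_method swap is an API update, not a behavior change: XGBoost 2.x deprecates tree_method="gpu_hist" in favor of tree_method="hist" combined with device="cuda". A minimal sketch on synthetic data (assumes a CUDA-capable runtime; drop device="cuda" to run on CPU):

import numpy as np
import xgboost as xgb

X, y = np.random.rand(200, 4), np.random.rand(200)

# "hist" + device="cuda" is the XGBoost 2.x replacement for "gpu_hist"
model = xgb.XGBRegressor(n_estimators=10, tree_method="hist", device="cuda")
model.fit(X, y)
print(model.predict(X[:3]))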
@@ -15,6 +15,10 @@ def import_module_from_path(module_name, module_path):
return module


+def MCRMSE(y_true, y_pred):
+    return np.mean(np.sqrt(np.mean((y_true - y_pred) ** 2, axis=0)))
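MCRMSE is this competition's metric, the mean columnwise RMSE: RMSE is computed per target column and then averaged across the six targets. A tiny worked example (not part of the commit):

import numpy as np

y_true = np.array([[3.0, 2.0], [1.0, 4.0]])
y_pred = np.array([[2.0, 2.0], [1.0, 2.0]])
# column RMSEs: sqrt(mean([1, 0])) ~= 0.707 and sqrt(mean([0, 4])) ~= 1.414
rmse_per_col = np.sqrt(np.mean((y_true - y_pred) ** 2, axis=0))
print(rmse_per_col, rmse_per_col.mean())  # ~= [0.707 1.414] 1.061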


# 1) Preprocess the data
X_train, X_valid, y_train, y_valid, X_test = preprocess_script()

@@ -24,6 +28,7 @@ def import_module_from_path(module_name, module_path):

for f in DIRNAME.glob("feature/feat*.py"):
cls = import_module_from_path(f.stem, f).feature_engineering_cls()
+print(X_train.head())
cls.fit(X_train)
X_train_f = cls.transform(X_train)
X_valid_f = cls.transform(X_valid)
@@ -62,33 +67,18 @@ def flatten_columns(df: pd.DataFrame) -> pd.DataFrame:

# 4) Evaluate the model on the validation set
y_valid_pred_l = []
+metrics_all = []
for model, predict_func in model_l:
y_valid_pred = predict_func(model, X_valid)
y_valid_pred_l.append(y_valid_pred)
-    # print(y_valid_pred)
-    # print(y_valid_pred.shape)
-
-# 5) Ensemble
-# Majority vote ensemble
-y_valid_pred_ensemble = np.mean(y_valid_pred_l, axis=0)
+    metrics = MCRMSE(y_valid, y_valid_pred)
+    print(f"MCRMSE on valid set: {metrics}")
+    metrics_all.append(metrics)

# 6) Save the validation metrics
-def MCRMSE(y_true, y_pred):
-    return np.mean(np.sqrt(np.mean((y_true - y_pred) ** 2, axis=0)))
-
-metrics = MCRMSE(y_valid, y_valid_pred_ensemble)
-print(f"MCRMSE on valid set: {metrics}")
-pd.Series(data=[metrics], index=["MCRMSE"]).to_csv("submission_score.csv")

# 7) Make predictions on the test set and save them
-y_test_pred_l = []
-for model, predict_func in model_l:
-    y_test_pred_l.append(predict_func(model, X_test))
+min_index = np.argmin(metrics_all)
+pd.Series(data=[metrics_all[min_index]], index=["MCRMSE"]).to_csv("submission_score.csv")

-# For multiclass classification, use the mode of the predictions
-y_test_pred = np.mean(y_test_pred_l, axis=0)
+y_test_pred = model_l[min_index][1](model_l[min_index][0], X_test)


submission_result = pd.read_csv("/kaggle/input/sample_submission.csv")
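The net effect of this train.py change: instead of averaging every model's test predictions, the script records each model's validation MCRMSE, saves the best score, and predicts the test set with that single best model. A sketch with stub models (nothing below is from the repo):

import numpy as np

model_l = [("model_a", lambda m, X: X * 1.0), ("model_b", lambda m, X: X * 2.0)]
metrics_all = [0.52, 0.47]  # hypothetical validation MCRMSE per model

min_index = np.argmin(metrics_all)  # -> 1; lower MCRMSE is better
model, predict_func = model_l[min_index]
y_test_pred = predict_func(model, np.ones(3))
print(model, y_test_pred)  # model_b [2. 2. 2.]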
@@ -93,6 +93,20 @@ def import_module_from_path(module_name, module_path):
X_te = X_te.loc[:, ~X_te.columns.duplicated()]

# Train the model
+def flatten_columns(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Flatten the columns of a DataFrame with MultiIndex columns,
+    for (feature_0, a), (feature_0, b) -> feature_0_a, feature_0_b
+    """
+    if df.columns.nlevels == 1:
+        return df
+    df.columns = ["_".join(col).strip() for col in df.columns.values]
+    return df
+
+X_tr = flatten_columns(X_tr)
+X_val = flatten_columns(X_val)
+X_te = flatten_columns(X_te)
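A sketch on a synthetic frame of what flatten_columns does: MultiIndex columns such as (feature_0, a) become plain strings like feature_0_a, which XGBoost and LightGBM accept as feature names.

import pandas as pd

df = pd.DataFrame(
    [[1, 2], [3, 4]],
    columns=pd.MultiIndex.from_tuples([("feature_0", "a"), ("feature_0", "b")]),
)
df.columns = ["_".join(col).strip() for col in df.columns.values]
print(list(df.columns))  # ['feature_0_a', 'feature_0_b']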

model_l = [] # list[tuple[model, predict_func]]
for f in DIRNAME.glob("model/model*.py"):
m = import_module_from_path(f.stem, f)
@@ -11,24 +11,26 @@

def prepreprocess():
# Load the training data
-train_df = pd.read_csv("/kaggle/input/train.csv").head(1000)
+train_df = pd.read_csv("/kaggle/input/train.csv")

# Load book and trade data
-book_train = pd.read_parquet("/kaggle/input/book_train.parquet").head(1000)
-trade_train = pd.read_parquet("/kaggle/input/trade_train.parquet").head(1000)
+book_train = pd.read_parquet("/kaggle/input/book_train.parquet")
+trade_train = pd.read_parquet("/kaggle/input/trade_train.parquet")

# Merge book and trade data with train_df
merged_df = pd.merge(train_df, book_train, on=["stock_id", "time_id"], how="left")
merged_df = pd.merge(merged_df, trade_train, on=["stock_id", "time_id"], how="left")

+print(merged_df.head())

# Split the data
X = merged_df.drop(["target"], axis=1)
y = merged_df["target"]

+print(X.columns.to_list())

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

+print(X_train.columns.to_list())

return X_train, X_valid, y_train, y_valid


@@ -60,7 +62,6 @@ def preprocess_fit(X_train: pd.DataFrame):
def preprocess_transform(X: pd.DataFrame, preprocessor, numerical_cols, categorical_cols):
X_transformed = preprocessor.transform(X)

-# Convert arrays back to DataFrames
X_transformed = pd.DataFrame(X_transformed, columns=numerical_cols + categorical_cols, index=X.index)

return X_transformed
@@ -79,11 +80,6 @@ def preprocess_script():

X_train, X_valid, y_train, y_valid = prepreprocess()

-preprocessor, numerical_cols, categorical_cols = preprocess_fit(X_train)
-
-X_train = preprocess_transform(X_train, preprocessor, numerical_cols, categorical_cols)
-X_valid = preprocess_transform(X_valid, preprocessor, numerical_cols, categorical_cols)

submission_df = pd.read_csv("/kaggle/input/test.csv")

ids = submission_df["row_id"]
@@ -94,10 +90,8 @@ def preprocess_script():
if col not in submission_df.columns:
submission_df[col] = 0 # Fill with 0 or another appropriate value

-X_test = preprocess_transform(submission_df, preprocessor, numerical_cols, categorical_cols)

# Handle missing values
-for df in [X_train, X_valid, X_test]:
+for df in [X_train, X_valid, submission_df]:
df.fillna(df.mean(), inplace=True)

-return X_train, X_valid, y_train, y_valid, X_test, ids
+return X_train, X_valid, y_train, y_valid, submission_df, ids
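With the ColumnTransformer pipeline removed, preprocess_script now returns the raw frames and fills numeric NaNs with column means. A toy sketch of that simplified imputation (the column name is illustrative, not taken from the dataset):

import pandas as pd

df = pd.DataFrame({"bid_price": [1.0, None, 3.0]})  # illustrative column
df.fillna(df.mean(), inplace=True)
print(df["bid_price"].tolist())  # [1.0, 2.0, 3.0]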
@@ -18,7 +18,7 @@ def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_v
params = {
"objective": "reg:squarederror", # Use squared error for regression
"nthread": -1,
"tree_method": "gpu_hist",
"tree_method": "hist",
"device": "cuda",
}
num_round = 200
Expand Down
