Commit 4cf22a6

feat: Kaggle loop update (Feature & Model) (#241)
* Init todo

* Evaluation & dataset

* Generate new data

* dataset generation

* add the result

* Analysis

* Factor update

* Updates

* Reformat analysis.py

* CI fix

* Revised Preprocessing & Supported Random Forest

* Revised to support three models with feature

* Further revised prompts

* Slight Revision

* docs: update contributors (#230)

* Revised to support three models with feature

* Further revised prompts

* Slight Revision

* feat: kaggle model and feature (#238)

* update first version code

* make hypothesis_gen and experiment_builder fit for both feature and model

* feat: continue kaggle feature and model coder (#239)

* use qlib docker to run qlib models

* feature coder ready

* model coder ready

* fix CI

* finish the first round of runner (#240)

* Optimized the factor scenario and added the front-end.

* fix a small bug

* fix a typo

* update the kaggle scenario

* delete model_template folder

* use experiment to run data preprocess script

* add source data to scenarios

* minor fix

* minor bug fix

* train.py debug

* fixed a bug in train.py and added some TODOs

* For Debugging

* fix two small bugs in based_exp

* fix some bugs

* update preprocess

* fix a bug in preprocess

* fix a bug in train.py

* reformat

* Follow-up

* fix a bug in train.py

* fix a bug in workspace

* fix a bug in feature duplication

* fix a bug in feedback

* fix a bug in preprocessed data

* fix a bug in feature engineering

* fix a ci error

* Debugged & Connected

* Fixed error on feedback & added other fixes

* fix CI errors

* fix a CI bug

* fix: fix_dotenv_error (#257)

* fix_dotenv_error

* format with isort

* Update rdagent/app/cli.py

---------

Co-authored-by: you-n-g <you-n-g@users.noreply.github.com>

* chore(main): release 0.2.1 (#249)

Release-As: 0.2.1

* init a scenario for kaggle feature engineering

* delete error codes

* Delete rdagent/app/kaggle_feature/conf.py

---------

Co-authored-by: Young <afe.young@gmail.com>
Co-authored-by: Taozhi Wang <taozhi.mark.wang@gmail.com>
Co-authored-by: you-n-g <you-n-g@users.noreply.github.com>
Co-authored-by: cyncyw <47289405+taozhiwang@users.noreply.github.com>
Co-authored-by: Xisen-Wang <xisen_application@163.com>
Co-authored-by: Haotian Chen <113661982+Hytn@users.noreply.github.com>
Co-authored-by: WinstonLiye <1957922024@qq.com>
Co-authored-by: WinstonLiyt <104308117+WinstonLiyt@users.noreply.github.com>
Co-authored-by: Linlang <30293408+SunsetWolf@users.noreply.github.com>
10 people authored Sep 11, 2024
1 parent 44031d5 commit 4cf22a6
Showing 42 changed files with 1,518 additions and 686 deletions.
24 changes: 14 additions & 10 deletions rdagent/app/kaggle/conf.py
@@ -13,29 +13,33 @@ class Config:
"""Add 'model_' to the protected namespaces"""

# 1) overriding the default
scen: str = "rdagent.scenarios.kaggle.experiment.model_experiment.KGModelScenario"
scen: str = "rdagent.scenarios.kaggle.experiment.scenario.KGScenario"
"""Scenario class for data mining model"""

hypothesis_gen: str = "rdagent.scenarios.kaggle.proposal.model_proposal.KGModelHypothesisGen"
hypothesis_gen: str = "rdagent.scenarios.kaggle.proposal.proposal.KGHypothesisGen"
"""Hypothesis generation class"""

hypothesis2experiment: str = "rdagent.scenarios.kaggle.proposal.model_proposal.KGModelHypothesis2Experiment"
hypothesis2experiment: str = "rdagent.scenarios.kaggle.proposal.proposal.KGHypothesis2Experiment"
"""Hypothesis to experiment class"""

coder: str = "rdagent.scenarios.kaggle.developer.model_coder.KGModelCoSTEER"
"""Coder class"""
feature_coder: str = "rdagent.scenarios.kaggle.developer.coder.KGFactorCoSTEER"
"""Feature Coder class"""

runner: str = "rdagent.scenarios.kaggle.developer.model_runner.KGModelRunner"
"""Runner class"""
model_coder: str = "rdagent.scenarios.kaggle.developer.coder.KGModelCoSTEER"
"""Model Coder class"""

summarizer: str = "rdagent.scenarios.kaggle.developer.feedback.KGModelHypothesisExperiment2Feedback"
feature_runner: str = "rdagent.scenarios.kaggle.developer.runner.KGFactorRunner"
"""Feature Runner class"""

model_runner: str = "rdagent.scenarios.kaggle.developer.runner.KGModelRunner"
"""Model Runner class"""

summarizer: str = "rdagent.scenarios.kaggle.developer.feedback.KGHypothesisExperiment2Feedback"
"""Summarizer class"""

evolving_n: int = 10
"""Number of evolutions"""

evolving_n: int = 10

competition: str = ""


Expand Down
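Each of these settings is a dotted import path that the loop resolves into a class at start-up. A minimal sketch of that resolution, assuming import_class behaves like a standard dynamic-import helper (the actual implementation lives in rdagent.core.utils and may differ):

import importlib


def import_class(class_path: str):
    # resolve a dotted path such as "pkg.module.ClassName" into the class object
    module_path, class_name = class_path.rsplit(".", 1)
    return getattr(importlib.import_module(module_path), class_name)


# e.g. turning the feature_coder setting into a usable class
FeatureCoder = import_class("rdagent.scenarios.kaggle.developer.coder.KGFactorCoSTEER")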
98 changes: 98 additions & 0 deletions rdagent/app/kaggle/loop.py
@@ -0,0 +1,98 @@
from collections import defaultdict
from typing import Any

import fire

from rdagent.app.kaggle.conf import PROP_SETTING
from rdagent.components.workflow.conf import BasePropSetting
from rdagent.components.workflow.rd_loop import RDLoop
from rdagent.core.developer import Developer
from rdagent.core.exception import ModelEmptyError
from rdagent.core.proposal import (
    Hypothesis2Experiment,
    HypothesisExperiment2Feedback,
    HypothesisGen,
    Trace,
)
from rdagent.core.scenario import Scenario
from rdagent.core.utils import import_class
from rdagent.log import rdagent_logger as logger
from rdagent.scenarios.kaggle.proposal.proposal import (
    KG_ACTION_FEATURE_ENGINEERING,
    KG_ACTION_FEATURE_PROCESSING,
)


class ModelRDLoop(RDLoop):
    def __init__(self, PROP_SETTING: BasePropSetting):
        with logger.tag("init"):
            scen: Scenario = import_class(PROP_SETTING.scen)(PROP_SETTING.competition)
            logger.log_object(scen, tag="scenario")

            self.hypothesis_gen: HypothesisGen = import_class(PROP_SETTING.hypothesis_gen)(scen)
            logger.log_object(self.hypothesis_gen, tag="hypothesis generator")

            self.hypothesis2experiment: Hypothesis2Experiment = import_class(PROP_SETTING.hypothesis2experiment)()
            logger.log_object(self.hypothesis2experiment, tag="hypothesis2experiment")

            self.feature_coder: Developer = import_class(PROP_SETTING.feature_coder)(scen)
            logger.log_object(self.feature_coder, tag="feature coder")
            self.model_coder: Developer = import_class(PROP_SETTING.model_coder)(scen)
            logger.log_object(self.model_coder, tag="model coder")

            self.feature_runner: Developer = import_class(PROP_SETTING.feature_runner)(scen)
            logger.log_object(self.feature_runner, tag="feature runner")
            self.model_runner: Developer = import_class(PROP_SETTING.model_runner)(scen)
            logger.log_object(self.model_runner, tag="model runner")

            self.summarizer: HypothesisExperiment2Feedback = import_class(PROP_SETTING.summarizer)(scen)
            logger.log_object(self.summarizer, tag="summarizer")
            self.trace = Trace(scen=scen)
            super(RDLoop, self).__init__()

    def coding(self, prev_out: dict[str, Any]):
        with logger.tag("d"):  # develop
            if prev_out["propose"].action in [KG_ACTION_FEATURE_ENGINEERING, KG_ACTION_FEATURE_PROCESSING]:
                exp = self.feature_coder.develop(prev_out["exp_gen"])
            else:
                exp = self.model_coder.develop(prev_out["exp_gen"])
            logger.log_object(exp.sub_workspace_list, tag="coder result")
        return exp

    def running(self, prev_out: dict[str, Any]):
        with logger.tag("ef"):  # evaluate and feedback
            if prev_out["propose"].action in [KG_ACTION_FEATURE_ENGINEERING, KG_ACTION_FEATURE_PROCESSING]:
                exp = self.feature_runner.develop(prev_out["coding"])
            else:
                exp = self.model_runner.develop(prev_out["coding"])
            logger.log_object(exp, tag="runner result")
        return exp

    skip_loop_error = (ModelEmptyError,)


def main(path=None, step_n=None, competition=None):
    """
    Auto R&D evolving loop for models in a Kaggle scenario.
    You can continue a running session with:

    .. code-block:: sh

        dotenv run -- python rdagent/app/kaggle/loop.py [--competition titanic] $LOG_PATH/__session__/1/0_propose --step_n 1  # `step_n` is an optional parameter
    """
    if competition:
        PROP_SETTING.competition = competition
    if path is None:
        model_loop = ModelRDLoop(PROP_SETTING)
    else:
        model_loop = ModelRDLoop.load(path)
    model_loop.run(step_n=step_n)


if __name__ == "__main__":
    from dotenv import load_dotenv

    load_dotenv(override=True)
    fire.Fire(main)
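For reference, the same entry point can be driven from Python instead of fire; a minimal usage sketch, where the competition name and session path are illustrative rather than taken from the commit:

from rdagent.app.kaggle.conf import PROP_SETTING
from rdagent.app.kaggle.loop import ModelRDLoop

PROP_SETTING.competition = "titanic"    # illustrative competition
fresh_loop = ModelRDLoop(PROP_SETTING)  # builds scenario, coders, and runners as in __init__ above
fresh_loop.run(step_n=1)                # execute a single R&D step

# resume a previous run from a session dump under the log path
resumed_loop = ModelRDLoop.load("log/__session__/1/0_propose")
resumed_loop.run(step_n=1)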
File renamed without changes.
28 changes: 17 additions & 11 deletions rdagent/components/coder/factor_coder/CoSTEER/evaluators.py
@@ -161,7 +161,7 @@ def evaluate(
         )
         buffer = io.StringIO()
         gen_df.info(buf=buffer)
-        gen_df_info_str = buffer.getvalue()
+        gen_df_info_str = f"The user is currently working on a feature-related task.\nThe output dataframe info is:\n{buffer.getvalue()}"
         system_prompt = (
             Environment(undefined=StrictUndefined)
             .from_string(
@@ -378,6 +378,7 @@ def evaluate(
         self,
         implementation: Workspace,
         gt_implementation: Workspace,
+        version: int = 1,  # 1 for qlib factors and 2 for kaggle factors
         **kwargs,
     ) -> Tuple:
         conclusions = []
@@ -389,18 +390,21 @@
         equal_value_ratio_result = 0
         high_correlation_result = False
 
-        # Check if both dataframes have only one column
-        feedback_str, _ = FactorSingleColumnEvaluator(self.scen).evaluate(implementation, gt_implementation)
-        conclusions.append(feedback_str)
+        # Check if both dataframes have only one column; muted for kaggle factors since a factor task may now generate more than one column
+        if version == 1:
+            feedback_str, _ = FactorSingleColumnEvaluator(self.scen).evaluate(implementation, gt_implementation)
+            conclusions.append(feedback_str)
 
         # Check if the index of the dataframe is ("datetime", "instrument")
         feedback_str, _ = FactorOutputFormatEvaluator(self.scen).evaluate(implementation, gt_implementation)
         conclusions.append(feedback_str)
 
-        feedback_str, daily_check_result = FactorDatetimeDailyEvaluator(self.scen).evaluate(
-            implementation, gt_implementation
-        )
-        conclusions.append(feedback_str)
+        if version == 1:
+            feedback_str, daily_check_result = FactorDatetimeDailyEvaluator(self.scen).evaluate(
+                implementation, gt_implementation
+            )
+            conclusions.append(feedback_str)
+        else:
+            daily_check_result = None
 
         # Check if both dataframes have the same row count
         if gt_implementation is not None:
@@ -627,7 +631,9 @@ def evaluate(
         (
             factor_feedback.factor_value_feedback,
             decision_from_value_check,
-        ) = self.value_evaluator.evaluate(implementation=implementation, gt_implementation=gt_implementation)
+        ) = self.value_evaluator.evaluate(
+            implementation=implementation, gt_implementation=gt_implementation, version=target_task.version
+        )
 
         factor_feedback.final_decision_based_on_gt = gt_implementation is not None

@@ -647,7 +653,7 @@
             target_task=target_task,
             implementation=implementation,
             execution_feedback=factor_feedback.execution_feedback,
-            value_feedback=factor_feedback.factor_value_feedback,
+            factor_value_feedback=factor_feedback.factor_value_feedback,
             gt_implementation=gt_implementation,
         )
         (
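The net effect of these hunks is that the factor value evaluator is now version-aware; a hedged usage sketch (the evaluator and workspaces are assumed to be set up as in the surrounding code):

# version=2 (kaggle factors) skips the single-column and datetime-daily checks,
# which only make sense for version=1 (qlib factors, the default)
feedback_str, decision = value_evaluator.evaluate(
    implementation=implementation,
    gt_implementation=gt_implementation,
    version=2,
)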
41 changes: 30 additions & 11 deletions rdagent/components/coder/factor_coder/factor.py
@@ -24,16 +24,19 @@ def __init__(
         factor_name,
         factor_description,
         factor_formulation,
+        *args,
         variables: dict = {},
         resource: str = None,
         factor_implementation: bool = False,
+        **kwargs,
     ) -> None:
         self.factor_name = factor_name
         self.factor_description = factor_description
         self.factor_formulation = factor_formulation
         self.variables = variables
         self.factor_resources = resource
         self.factor_implementation = factor_implementation
+        super().__init__(*args, **kwargs)
 
     def get_task_information(self):
         return f"""factor_name: {self.factor_name}
@@ -75,8 +78,8 @@ class FactorFBWorkspace(FBWorkspace):
     def __init__(
         self,
         *args,
-        executed_factor_value_dataframe=None,
-        raise_exception=False,
+        executed_factor_value_dataframe: pd.DataFrame = None,
+        raise_exception: bool = False,
         **kwargs,
     ) -> None:
         super().__init__(*args, **kwargs)
@@ -102,7 +105,10 @@ def execute(self, store_result: bool = False, data_type: str = "Debug") -> Tuple
         1. make the directory in workspace path
         2. write the code to the file in the workspace path
         3. link all the source data to the workspace path folder
-        4. execute the code
+        if call_factor_py is True:
+            4. execute the code
+        else:
+            4. generate a script from the template that imports factor.py and dumps the factor value to result.h5
         5. read the factor value from the output file in the workspace path folder
         returns the execution feedback as a string and the factor value as a pandas dataframe
@@ -130,15 +136,21 @@ def execute(self, store_result: bool = False, data_type: str = "Debug") -> Tuple
         if self.executed_factor_value_dataframe is not None:
             return self.FB_FROM_CACHE, self.executed_factor_value_dataframe
 
-        source_data_path = (
-            Path(
-                FACTOR_IMPLEMENT_SETTINGS.data_folder_debug,
-            )
-            if data_type == "Debug"
-            else Path(
-                FACTOR_IMPLEMENT_SETTINGS.data_folder,
-            )
-        )
+        if self.target_task.version == 1:
+            source_data_path = (
+                Path(
+                    FACTOR_IMPLEMENT_SETTINGS.data_folder_debug,
+                )
+                if data_type == "Debug"
+                else Path(
+                    FACTOR_IMPLEMENT_SETTINGS.data_folder,
+                )
+            )
+        elif self.target_task.version == 2:
+            # TODO: rename the data folder for better clarity
+            source_data_path = Path(
+                FACTOR_IMPLEMENT_SETTINGS.data_folder,
+            )
 
         source_data_path.mkdir(exist_ok=True, parents=True)
         code_path = self.workspace_path / f"factor.py"
@@ -147,9 +159,16 @@ def execute(self, store_result: bool = False, data_type: str = "Debug") -> Tuple

         execution_feedback = self.FB_EXECUTION_SUCCEEDED
         execution_success = False
 
+        if self.target_task.version == 1:
+            execution_code_path = code_path
+        elif self.target_task.version == 2:
+            execution_code_path = self.workspace_path / f"{uuid.uuid4()}.py"
+            execution_code_path.write_text((Path(__file__).parent / "factor_execution_template.txt").read_text())
+
         try:
             subprocess.check_output(
-                f"{FACTOR_IMPLEMENT_SETTINGS.python_bin} {code_path}",
+                f"{FACTOR_IMPLEMENT_SETTINGS.python_bin} {execution_code_path}",
                 shell=True,
                 cwd=self.workspace_path,
                 stderr=subprocess.STDOUT,
@@ -161,7 +180,7 @@ def execute(self, store_result: bool = False, data_type: str = "Debug") -> Tuple

             execution_feedback = (
                 e.output.decode()
-                .replace(str(code_path.parent.absolute()), r"/path/to")
+                .replace(str(execution_code_path.parent.absolute()), r"/path/to")
                 .replace(str(site.getsitepackages()[0]), r"/path/to/site-packages")
             )
             if len(execution_feedback) > 2000:
13 changes: 13 additions & 0 deletions rdagent/components/coder/factor_coder/factor_execution_template.txt
@@ -0,0 +1,13 @@
import os

import numpy as np
import pandas as pd
from factor import feat_eng

if os.path.exists("valid.pkl"):
valid_df = pd.read_pickle("valid.pkl")
else:
raise FileNotFoundError("No valid data found.")

new_feat = feat_eng(valid_df)
new_feat.to_hdf("result.h5", key="data", mode="w")
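The template assumes the generated factor.py exposes a feat_eng function; a hypothetical example of such a file (the DataFrame-in, DataFrame-out contract is inferred from the template, and the engineered feature is purely illustrative):

import pandas as pd


def feat_eng(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    # illustrative engineered feature: row-wise mean over the numeric columns
    out["num_mean"] = df.select_dtypes("number").mean(axis=1)
    return out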
10 changes: 3 additions & 7 deletions rdagent/components/coder/model_coder/CoSTEER/evaluators.py
@@ -24,7 +24,7 @@
 evaluate_prompts = Prompts(file_path=Path(__file__).parent.parent / "prompts.yaml")
 
 
-def shape_evaluator(prediction: torch.Tensor, target_shape: Tuple = None) -> Tuple[str, bool]:
+def shape_evaluator(prediction: torch.Tensor | np.ndarray, target_shape: Tuple = None) -> Tuple[str, bool]:
     if target_shape is None or prediction is None:
         return (
             "No output generated from the model. No shape evaluation conducted.",
@@ -279,12 +279,8 @@ def evaluate(
         else:
             gt_tensor = None
 
-        if target_task.model_type == "XGBoost":
-            shape_feedback = "Not applicable for XGBoost models"
-            shape_decision = True
-        else:
-            shape_feedback, shape_decision = shape_evaluator(gen_tensor, (batch_size, 1))
-        value_feedback, value_decision = value_evaluator(gt_tensor, gen_tensor)
+        shape_feedback, shape_decision = shape_evaluator(gen_tensor, (batch_size, 1))
+        value_feedback, value_decision = value_evaluator(gen_tensor, gt_tensor)
         code_feedback, _ = ModelCodeEvaluator(scen=self.scen).evaluate(
             target_task=target_task,
             implementation=implementation,
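The widened signature matters because tree models such as XGBoost return numpy arrays rather than torch tensors, and both now flow through the same shape check. A hedged sketch (batch_size and the zero array are illustrative):

import numpy as np

batch_size = 8
preds = np.zeros((batch_size, 1))  # stand-in for e.g. XGBoost predictions
shape_feedback, shape_decision = shape_evaluator(preds, (batch_size, 1))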