Commit 4cf22a6

feat: Kaggle loop update (Feature & Model) (#241)
* Init todo

* Evaluation & dataset

* Generate new data

* dataset generation

* add the result

* Analysis

* Factor update

* Updates

* Reformat analysis.py

* CI fix

* Revised Preprocessing & Supported Random Forest

* Revised to support three models with feature

* Further revised prompts

* Slight Revision

* docs: update contributors (#230)

* Revised to support three models with feature

* Further revised prompts

* Slight Revision

* feat: kaggle model and feature (#238)

* update first version code

* make hypothesis_gen and experiment_builder fit for both feature and model

* feat: continue kaggle feature and model coder (#239)

* use qlib docker to run qlib models

* feature coder ready

* model coder ready

* fix CI

* finish the first round of runner (#240)

* Optimized the factor scenario and added the front-end.

* fix a small bug

* fix a typo

* update the kaggle scenario

* delete model_template folder

* use experiment to run data preprocess script

* add source data to scenarios

* minor fix

* minor bug fix

* train.py debug

* fixed a bug in train.py and added some TODOs

* For Debugging

* fix two small bugs in based_exp

* fix some bugs

* update preprocess

* fix a bug in preprocess

* fix a bug in train.py

* reformat

* Follow-up

* fix a bug in train.py

* fix a bug in workspace

* fix a bug in feature duplication

* fix a bug in feedback

* fix a bug in preprocessed data

* fix a bug in feature engineering

* fix a ci error

* Debugged & Connected

* Fixed error on feedback & added other fixes

* fix CI errors

* fix a CI bug

* fix: fix_dotenv_error (#257)

* fix_dotenv_error

* format with isort

* Update rdagent/app/cli.py

---------

Co-authored-by: you-n-g <you-n-g@users.noreply.github.com>

* chore(main): release 0.2.1 (#249)

Release-As: 0.2.1

* init a scenario for kaggle feature engineering

* delete error codes

* Delete rdagent/app/kaggle_feature/conf.py

---------

Co-authored-by: Young <afe.young@gmail.com>
Co-authored-by: Taozhi Wang <taozhi.mark.wang@gmail.com>
Co-authored-by: you-n-g <you-n-g@users.noreply.github.com>
Co-authored-by: cyncyw <47289405+taozhiwang@users.noreply.github.com>
Co-authored-by: Xisen-Wang <xisen_application@163.com>
Co-authored-by: Haotian Chen <113661982+Hytn@users.noreply.github.com>
Co-authored-by: WinstonLiye <1957922024@qq.com>
Co-authored-by: WinstonLiyt <104308117+WinstonLiyt@users.noreply.github.com>
Co-authored-by: Linlang <30293408+SunsetWolf@users.noreply.github.com>
10 people authored Sep 11, 2024
1 parent 44031d5 commit 4cf22a6
Showing 42 changed files with 1,518 additions and 686 deletions.
24 changes: 14 additions & 10 deletions rdagent/app/kaggle/conf.py
@@ -13,29 +13,33 @@ class Config:
"""Add 'model_' to the protected namespaces"""

# 1) overriding the default
scen: str = "rdagent.scenarios.kaggle.experiment.model_experiment.KGModelScenario"
scen: str = "rdagent.scenarios.kaggle.experiment.scenario.KGScenario"
"""Scenario class for data mining model"""

hypothesis_gen: str = "rdagent.scenarios.kaggle.proposal.model_proposal.KGModelHypothesisGen"
hypothesis_gen: str = "rdagent.scenarios.kaggle.proposal.proposal.KGHypothesisGen"
"""Hypothesis generation class"""

hypothesis2experiment: str = "rdagent.scenarios.kaggle.proposal.model_proposal.KGModelHypothesis2Experiment"
hypothesis2experiment: str = "rdagent.scenarios.kaggle.proposal.proposal.KGHypothesis2Experiment"
"""Hypothesis to experiment class"""

coder: str = "rdagent.scenarios.kaggle.developer.model_coder.KGModelCoSTEER"
"""Coder class"""
feature_coder: str = "rdagent.scenarios.kaggle.developer.coder.KGFactorCoSTEER"
"""Feature Coder class"""

runner: str = "rdagent.scenarios.kaggle.developer.model_runner.KGModelRunner"
"""Runner class"""
model_coder: str = "rdagent.scenarios.kaggle.developer.coder.KGModelCoSTEER"
"""Model Coder class"""

summarizer: str = "rdagent.scenarios.kaggle.developer.feedback.KGModelHypothesisExperiment2Feedback"
feature_runner: str = "rdagent.scenarios.kaggle.developer.runner.KGFactorRunner"
"""Feature Runner class"""

model_runner: str = "rdagent.scenarios.kaggle.developer.runner.KGModelRunner"
"""Model Runner class"""

summarizer: str = "rdagent.scenarios.kaggle.developer.feedback.KGHypothesisExperiment2Feedback"
"""Summarizer class"""

evolving_n: int = 10
"""Number of evolutions"""

evolving_n: int = 10

competition: str = ""


Expand Down
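Each of these settings is a dotted import path that the loop resolves into a class at start-up. A minimal sketch of that resolution, assuming import_class behaves like a standard dynamic-import helper (the actual implementation lives in rdagent.core.utils and may differ):

import importlib


def import_class(class_path: str):
    # resolve a dotted path such as "pkg.module.ClassName" into the class object
    module_path, class_name = class_path.rsplit(".", 1)
    return getattr(importlib.import_module(module_path), class_name)


# e.g. turning the feature_coder setting into a usable class
FeatureCoder = import_class("rdagent.scenarios.kaggle.developer.coder.KGFactorCoSTEER")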
98 changes: 98 additions & 0 deletions rdagent/app/kaggle/loop.py
@@ -0,0 +1,98 @@
from collections import defaultdict
from typing import Any

import fire

from rdagent.app.kaggle.conf import PROP_SETTING
from rdagent.components.workflow.conf import BasePropSetting
from rdagent.components.workflow.rd_loop import RDLoop
from rdagent.core.developer import Developer
from rdagent.core.exception import ModelEmptyError
from rdagent.core.proposal import (
    Hypothesis2Experiment,
    HypothesisExperiment2Feedback,
    HypothesisGen,
    Trace,
)
from rdagent.core.scenario import Scenario
from rdagent.core.utils import import_class
from rdagent.log import rdagent_logger as logger
from rdagent.scenarios.kaggle.proposal.proposal import (
    KG_ACTION_FEATURE_ENGINEERING,
    KG_ACTION_FEATURE_PROCESSING,
)


class ModelRDLoop(RDLoop):
    def __init__(self, PROP_SETTING: BasePropSetting):
        with logger.tag("init"):
            scen: Scenario = import_class(PROP_SETTING.scen)(PROP_SETTING.competition)
            logger.log_object(scen, tag="scenario")

            self.hypothesis_gen: HypothesisGen = import_class(PROP_SETTING.hypothesis_gen)(scen)
            logger.log_object(self.hypothesis_gen, tag="hypothesis generator")

            self.hypothesis2experiment: Hypothesis2Experiment = import_class(PROP_SETTING.hypothesis2experiment)()
            logger.log_object(self.hypothesis2experiment, tag="hypothesis2experiment")

            self.feature_coder: Developer = import_class(PROP_SETTING.feature_coder)(scen)
            logger.log_object(self.feature_coder, tag="feature coder")
            self.model_coder: Developer = import_class(PROP_SETTING.model_coder)(scen)
            logger.log_object(self.model_coder, tag="model coder")

            self.feature_runner: Developer = import_class(PROP_SETTING.feature_runner)(scen)
            logger.log_object(self.feature_runner, tag="feature runner")
            self.model_runner: Developer = import_class(PROP_SETTING.model_runner)(scen)
            logger.log_object(self.model_runner, tag="model runner")

            self.summarizer: HypothesisExperiment2Feedback = import_class(PROP_SETTING.summarizer)(scen)
            logger.log_object(self.summarizer, tag="summarizer")
            self.trace = Trace(scen=scen)
            super(RDLoop, self).__init__()

    def coding(self, prev_out: dict[str, Any]):
        with logger.tag("d"):  # develop
            if prev_out["propose"].action in [KG_ACTION_FEATURE_ENGINEERING, KG_ACTION_FEATURE_PROCESSING]:
                exp = self.feature_coder.develop(prev_out["exp_gen"])
            else:
                exp = self.model_coder.develop(prev_out["exp_gen"])
            logger.log_object(exp.sub_workspace_list, tag="coder result")
        return exp

    def running(self, prev_out: dict[str, Any]):
        with logger.tag("ef"):  # evaluate and feedback
            if prev_out["propose"].action in [KG_ACTION_FEATURE_ENGINEERING, KG_ACTION_FEATURE_PROCESSING]:
                exp = self.feature_runner.develop(prev_out["coding"])
            else:
                exp = self.model_runner.develop(prev_out["coding"])
            logger.log_object(exp, tag="runner result")
        return exp

    skip_loop_error = (ModelEmptyError,)


def main(path=None, step_n=None, competition=None):
    """
    Auto R&D evolving loop for models in a Kaggle scenario.
    You can continue a running session with:

    .. code-block:: sh

        dotenv run -- python rdagent/app/kaggle/loop.py [--competition titanic] $LOG_PATH/__session__/1/0_propose --step_n 1  # `step_n` is an optional parameter
    """
    if competition:
        PROP_SETTING.competition = competition
    if path is None:
        model_loop = ModelRDLoop(PROP_SETTING)
    else:
        model_loop = ModelRDLoop.load(path)
    model_loop.run(step_n=step_n)


if __name__ == "__main__":
    from dotenv import load_dotenv

    load_dotenv(override=True)
    fire.Fire(main)
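For reference, the same entry point can be driven from Python instead of fire; a minimal usage sketch, where the competition name and session path are illustrative rather than taken from the commit:

from rdagent.app.kaggle.conf import PROP_SETTING
from rdagent.app.kaggle.loop import ModelRDLoop

PROP_SETTING.competition = "titanic"    # illustrative competition
fresh_loop = ModelRDLoop(PROP_SETTING)  # builds scenario, coders, and runners as in __init__ above
fresh_loop.run(step_n=1)                # execute a single R&D step

# resume a previous run from a session dump under the log path
resumed_loop = ModelRDLoop.load("log/__session__/1/0_propose")
resumed_loop.run(step_n=1)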
File renamed without changes.
28 changes: 17 additions & 11 deletions rdagent/components/coder/factor_coder/CoSTEER/evaluators.py
@@ -161,7 +161,7 @@ def evaluate(
         )
         buffer = io.StringIO()
         gen_df.info(buf=buffer)
-        gen_df_info_str = buffer.getvalue()
+        gen_df_info_str = f"The user is currently working on a feature-related task.\nThe output dataframe info is:\n{buffer.getvalue()}"
         system_prompt = (
             Environment(undefined=StrictUndefined)
             .from_string(
@@ -378,6 +378,7 @@ def evaluate(
         self,
         implementation: Workspace,
         gt_implementation: Workspace,
+        version: int = 1,  # 1 for qlib factors and 2 for kaggle factors
         **kwargs,
     ) -> Tuple:
         conclusions = []
@@ -389,18 +390,21 @@
         equal_value_ratio_result = 0
         high_correlation_result = False
 
-        # Check if both dataframes have only one column
-        feedback_str, _ = FactorSingleColumnEvaluator(self.scen).evaluate(implementation, gt_implementation)
-        conclusions.append(feedback_str)
+        # Check if both dataframes have only one column; muted for kaggle factors since a factor task may now generate more than one column
+        if version == 1:
+            feedback_str, _ = FactorSingleColumnEvaluator(self.scen).evaluate(implementation, gt_implementation)
+            conclusions.append(feedback_str)
 
         # Check if the index of the dataframe is ("datetime", "instrument")
         feedback_str, _ = FactorOutputFormatEvaluator(self.scen).evaluate(implementation, gt_implementation)
         conclusions.append(feedback_str)
 
-        feedback_str, daily_check_result = FactorDatetimeDailyEvaluator(self.scen).evaluate(
-            implementation, gt_implementation
-        )
-        conclusions.append(feedback_str)
+        if version == 1:
+            feedback_str, daily_check_result = FactorDatetimeDailyEvaluator(self.scen).evaluate(
+                implementation, gt_implementation
+            )
+            conclusions.append(feedback_str)
+        else:
+            daily_check_result = None
 
         # Check if both dataframes have the same row count
         if gt_implementation is not None:
@@ -627,7 +631,9 @@ def evaluate(
         (
             factor_feedback.factor_value_feedback,
             decision_from_value_check,
-        ) = self.value_evaluator.evaluate(implementation=implementation, gt_implementation=gt_implementation)
+        ) = self.value_evaluator.evaluate(
+            implementation=implementation, gt_implementation=gt_implementation, version=target_task.version
+        )
 
         factor_feedback.final_decision_based_on_gt = gt_implementation is not None

@@ -647,7 +653,7 @@
             target_task=target_task,
             implementation=implementation,
             execution_feedback=factor_feedback.execution_feedback,
-            value_feedback=factor_feedback.factor_value_feedback,
+            factor_value_feedback=factor_feedback.factor_value_feedback,
             gt_implementation=gt_implementation,
         )
         (
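The net effect of these hunks is that the factor value evaluator is now version-aware; a hedged usage sketch (the evaluator and workspaces are assumed to be set up as in the surrounding code):

# version=2 (kaggle factors) skips the single-column and datetime-daily checks,
# which only make sense for version=1 (qlib factors, the default)
feedback_str, decision = value_evaluator.evaluate(
    implementation=implementation,
    gt_implementation=gt_implementation,
    version=2,
)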
41 changes: 30 additions & 11 deletions rdagent/components/coder/factor_coder/factor.py
@@ -24,16 +24,19 @@ def __init__(
         factor_name,
         factor_description,
         factor_formulation,
+        *args,
         variables: dict = {},
         resource: str = None,
         factor_implementation: bool = False,
+        **kwargs,
     ) -> None:
         self.factor_name = factor_name
         self.factor_description = factor_description
         self.factor_formulation = factor_formulation
         self.variables = variables
         self.factor_resources = resource
         self.factor_implementation = factor_implementation
+        super().__init__(*args, **kwargs)
 
     def get_task_information(self):
         return f"""factor_name: {self.factor_name}
@@ -75,8 +78,8 @@ class FactorFBWorkspace(FBWorkspace):
     def __init__(
         self,
         *args,
-        executed_factor_value_dataframe=None,
-        raise_exception=False,
+        executed_factor_value_dataframe: pd.DataFrame = None,
+        raise_exception: bool = False,
         **kwargs,
     ) -> None:
         super().__init__(*args, **kwargs)
@@ -102,7 +105,10 @@ def execute(self, store_result: bool = False, data_type: str = "Debug") -> Tuple
         1. make the directory in workspace path
         2. write the code to the file in the workspace path
         3. link all the source data to the workspace path folder
-        4. execute the code
+        if call_factor_py is True:
+            4. execute the code
+        else:
+            4. generate a script from the template that imports factor.py and dumps the factor value to result.h5
         5. read the factor value from the output file in the workspace path folder
         returns the execution feedback as a string and the factor value as a pandas dataframe
@@ -130,15 +136,21 @@ def execute(self, store_result: bool = False, data_type: str = "Debug") -> Tuple
         if self.executed_factor_value_dataframe is not None:
             return self.FB_FROM_CACHE, self.executed_factor_value_dataframe
 
-        source_data_path = (
-            Path(
-                FACTOR_IMPLEMENT_SETTINGS.data_folder_debug,
-            )
-            if data_type == "Debug"
-            else Path(
-                FACTOR_IMPLEMENT_SETTINGS.data_folder,
-            )
-        )
+        if self.target_task.version == 1:
+            source_data_path = (
+                Path(
+                    FACTOR_IMPLEMENT_SETTINGS.data_folder_debug,
+                )
+                if data_type == "Debug"
+                else Path(
+                    FACTOR_IMPLEMENT_SETTINGS.data_folder,
+                )
+            )
+        elif self.target_task.version == 2:
+            # TODO: rename the data folder for better clarity
+            source_data_path = Path(
+                FACTOR_IMPLEMENT_SETTINGS.data_folder,
+            )
 
         source_data_path.mkdir(exist_ok=True, parents=True)
         code_path = self.workspace_path / f"factor.py"
@@ -147,9 +159,16 @@ def execute(self, store_result: bool = False, data_type: str = "Debug") -> Tuple

         execution_feedback = self.FB_EXECUTION_SUCCEEDED
         execution_success = False
 
+        if self.target_task.version == 1:
+            execution_code_path = code_path
+        elif self.target_task.version == 2:
+            execution_code_path = self.workspace_path / f"{uuid.uuid4()}.py"
+            execution_code_path.write_text((Path(__file__).parent / "factor_execution_template.txt").read_text())
+
         try:
             subprocess.check_output(
-                f"{FACTOR_IMPLEMENT_SETTINGS.python_bin} {code_path}",
+                f"{FACTOR_IMPLEMENT_SETTINGS.python_bin} {execution_code_path}",
                 shell=True,
                 cwd=self.workspace_path,
                 stderr=subprocess.STDOUT,
@@ -161,7 +180,7 @@ def execute(self, store_result: bool = False, data_type: str = "Debug") -> Tuple

             execution_feedback = (
                 e.output.decode()
-                .replace(str(code_path.parent.absolute()), r"/path/to")
+                .replace(str(execution_code_path.parent.absolute()), r"/path/to")
                 .replace(str(site.getsitepackages()[0]), r"/path/to/site-packages")
             )
             if len(execution_feedback) > 2000:
13 changes: 13 additions & 0 deletions rdagent/components/coder/factor_coder/factor_execution_template.txt
@@ -0,0 +1,13 @@
import os

import numpy as np
import pandas as pd
from factor import feat_eng

if os.path.exists("valid.pkl"):
valid_df = pd.read_pickle("valid.pkl")
else:
raise FileNotFoundError("No valid data found.")

new_feat = feat_eng(valid_df)
new_feat.to_hdf("result.h5", key="data", mode="w")
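The template assumes the generated factor.py exposes a feat_eng function; a hypothetical example of such a file (the DataFrame-in, DataFrame-out contract is inferred from the template, and the engineered feature is purely illustrative):

import pandas as pd


def feat_eng(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    # illustrative engineered feature: row-wise mean over the numeric columns
    out["num_mean"] = df.select_dtypes("number").mean(axis=1)
    return out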
10 changes: 3 additions & 7 deletions rdagent/components/coder/model_coder/CoSTEER/evaluators.py
@@ -24,7 +24,7 @@
 evaluate_prompts = Prompts(file_path=Path(__file__).parent.parent / "prompts.yaml")
 
 
-def shape_evaluator(prediction: torch.Tensor, target_shape: Tuple = None) -> Tuple[str, bool]:
+def shape_evaluator(prediction: torch.Tensor | np.ndarray, target_shape: Tuple = None) -> Tuple[str, bool]:
     if target_shape is None or prediction is None:
         return (
             "No output generated from the model. No shape evaluation conducted.",
@@ -279,12 +279,8 @@ def evaluate(
         else:
             gt_tensor = None
 
-        if target_task.model_type == "XGBoost":
-            shape_feedback = "Not applicable for XGBoost models"
-            shape_decision = True
-        else:
-            shape_feedback, shape_decision = shape_evaluator(gen_tensor, (batch_size, 1))
-        value_feedback, value_decision = value_evaluator(gt_tensor, gen_tensor)
+        shape_feedback, shape_decision = shape_evaluator(gen_tensor, (batch_size, 1))
+        value_feedback, value_decision = value_evaluator(gen_tensor, gt_tensor)
         code_feedback, _ = ModelCodeEvaluator(scen=self.scen).evaluate(
             target_task=target_task,
             implementation=implementation,
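The widened signature matters because tree models such as XGBoost return numpy arrays rather than torch tensors, and both now flow through the same shape check. A hedged sketch (batch_size and the zero array are illustrative):

import numpy as np

batch_size = 8
preds = np.zeros((batch_size, 1))  # stand-in for e.g. XGBoost predictions
shape_feedback, shape_decision = shape_evaluator(preds, (batch_size, 1))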