Skip to content

Commit

Permalink
feat: fix some bugs and add original features' description (#259)
Browse files Browse the repository at this point in the history
* init a scenario for kaggle feature engineering

* fix some bugs and add original features' description

* refine the process of data downloading

* fix an error

* revert the code

* fix a bug in feedback

* fix a CI bug

* fix a CI bug
  • Loading branch information
WinstonLiyt authored Sep 11, 2024
1 parent 4cf22a6 commit 1a5f45a
Show file tree
Hide file tree
Showing 17 changed files with 51 additions and 107 deletions.
2 changes: 1 addition & 1 deletion docs/scens/model_agent_med.rst
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,6 @@ You can try our demo by running the following command:
The following environment variables can be set in the `.env` file to customize the application's behavior:
.. autopydantic_settings:: rdagent.app.data_mining.conf.PropSetting
.. autopydantic_settings:: rdagent.app.data_mining.conf.MedBasePropSetting
:settings-show-field-summary: False
:exclude-members: Config
4 changes: 2 additions & 2 deletions rdagent/app/data_mining/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from rdagent.components.workflow.conf import BasePropSetting


class PropSetting(BasePropSetting):
class MedBasePropSetting(BasePropSetting):
class Config:
env_prefix = "DM_"
"""Use `DM_` as prefix for environment variables"""
Expand Down Expand Up @@ -46,4 +46,4 @@ class Config:
"""Physionet account password"""


PROP_SETTING = PropSetting()
MED_PROP_SETTING = MedBasePropSetting()
4 changes: 2 additions & 2 deletions rdagent/app/data_mining/model.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import fire

from rdagent.app.data_mining.conf import PROP_SETTING
from rdagent.app.data_mining.conf import MED_PROP_SETTING
from rdagent.components.workflow.rd_loop import RDLoop
from rdagent.core.exception import ModelEmptyError

Expand All @@ -21,7 +21,7 @@ def main(path=None, step_n=None):
"""
if path is None:
model_loop = ModelRDLoop(PROP_SETTING)
model_loop = ModelRDLoop(MED_PROP_SETTING)
else:
model_loop = ModelRDLoop.load(path)
model_loop.run(step_n=step_n)
Expand Down
4 changes: 2 additions & 2 deletions rdagent/app/kaggle/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from rdagent.components.workflow.conf import BasePropSetting


class PropSetting(BasePropSetting):
class KaggleBasePropSetting(BasePropSetting):
class Config:
env_prefix = "KG_"
"""Use `KG_` as prefix for environment variables"""
Expand Down Expand Up @@ -43,4 +43,4 @@ class Config:
competition: str = ""


PROP_SETTING = PropSetting()
KAGGLE_IMPLEMENT_SETTING = KaggleBasePropSetting()
6 changes: 3 additions & 3 deletions rdagent/app/kaggle/loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import fire

from rdagent.app.kaggle.conf import PROP_SETTING
from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING
from rdagent.components.workflow.conf import BasePropSetting
from rdagent.components.workflow.rd_loop import RDLoop
from rdagent.core.developer import Developer
Expand Down Expand Up @@ -83,9 +83,9 @@ def main(path=None, step_n=None, competition=None):
"""
if competition:
PROP_SETTING.competition = competition
KAGGLE_IMPLEMENT_SETTING.competition = competition
if path is None:
model_loop = ModelRDLoop(PROP_SETTING)
model_loop = ModelRDLoop(KAGGLE_IMPLEMENT_SETTING)
else:
model_loop = ModelRDLoop.load(path)
model_loop.run(step_n=step_n)
Expand Down
65 changes: 0 additions & 65 deletions rdagent/app/kaggle_feature/model.py

This file was deleted.

6 changes: 3 additions & 3 deletions rdagent/components/coder/factor_coder/factor.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import pandas as pd
from filelock import FileLock

from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING
from rdagent.components.coder.factor_coder.config import FACTOR_IMPLEMENT_SETTINGS
from rdagent.core.exception import CodeFormatError, CustomRuntimeError, NoOutputError
from rdagent.core.experiment import Experiment, FBWorkspace, Task
Expand Down Expand Up @@ -148,9 +149,7 @@ def execute(self, store_result: bool = False, data_type: str = "Debug") -> Tuple
)
elif self.target_task.version == 2:
# TODO you can change the name of the data folder for a better understanding
source_data_path = Path(
FACTOR_IMPLEMENT_SETTINGS.data_folder,
)
source_data_path = Path(FACTOR_IMPLEMENT_SETTINGS.data_folder) / KAGGLE_IMPLEMENT_SETTING.competition

source_data_path.mkdir(exist_ok=True, parents=True)
code_path = self.workspace_path / f"factor.py"
Expand Down Expand Up @@ -237,3 +236,4 @@ def from_folder(task: FactorTask, path: Union[str, Path], **kwargs):


FactorExperiment = Experiment
FeatureExperiment = Experiment
1 change: 0 additions & 1 deletion rdagent/components/coder/model_coder/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,5 +137,4 @@ def execute(
return execution_feedback_str, execution_model_output


FeatureExperiment = Experiment
ModelExperiment = Experiment
4 changes: 2 additions & 2 deletions rdagent/scenarios/data_mining/experiment/workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import pandas as pd

from rdagent.app.data_mining.conf import PROP_SETTING
from rdagent.app.data_mining.conf import MED_PROP_SETTING
from rdagent.core.experiment import FBWorkspace
from rdagent.log import rdagent_logger as logger
from rdagent.utils.env import DMDockerEnv
Expand All @@ -15,7 +15,7 @@ def __init__(self, template_folder_path: Path, *args, **kwargs) -> None:

def execute(self, run_env: dict = {}, *args, **kwargs) -> str:
qtde = DMDockerEnv()
qtde.prepare(PROP_SETTING.username, PROP_SETTING.password)
qtde.prepare(MED_PROP_SETTING.username, MED_PROP_SETTING.password)

execute_log = qtde.run(
local_path=str(self.workspace_path),
Expand Down
14 changes: 7 additions & 7 deletions rdagent/scenarios/kaggle/developer/feedback.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,20 +26,20 @@ def process_results(current_result, sota_result):
sota_df = pd.DataFrame(sota_result)

# Combine the dataframes on the Metric index
combined_df = pd.DataFrame({"Current Result": current_df, "SOTA Result": sota_df})
combined_df = pd.concat([current_df, sota_df], axis=1)
combined_df.columns = ["current_df", "sota_df"]

# Add a new column to show which result is bigger
combined_df["Bigger Result"] = combined_df.apply(
lambda row: "Equal"
if row["Current Result"] == row["SOTA Result"]
else ("Current Result" if row["Current Result"] > row["SOTA Result"] else "SOTA Result"),
combined_df["the largest"] = combined_df.apply(
lambda row: "sota_df"
if row["sota_df"] > row["current_df"]
else ("Equal" if row["sota_df"] == row["current_df"] else "current_df"),
axis=1,
)

# Add a note about metric direction
combined_df["Note"] = "Direction of improvement (higher/lower is better) should be judged per metric"

return combined_df.to_string()
return combined_df


class KGHypothesisExperiment2Feedback(HypothesisExperiment2Feedback):
Expand Down
17 changes: 14 additions & 3 deletions rdagent/scenarios/kaggle/developer/runner.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import pickle
import shutil
import uuid
from pathlib import Path

import pandas as pd

from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING
from rdagent.components.coder.factor_coder.config import FACTOR_IMPLEMENT_SETTINGS
from rdagent.components.coder.factor_coder.factor import FactorTask
from rdagent.components.runner import CachedRunner
from rdagent.components.runner.conf import RUNNER_SETTINGS
from rdagent.core.exception import ModelEmptyError
Expand Down Expand Up @@ -81,6 +82,16 @@ def init_develop(self, exp: KGFactorExperiment) -> KGFactorExperiment:
result = exp.experiment_workspace.execute(run_env=env_to_use)

exp.result = result
sub_task = FactorTask(
factor_name="original features", factor_description="here is the original features", factor_formulation=""
)

org_data_path = Path(FACTOR_IMPLEMENT_SETTINGS.data_folder) / KAGGLE_IMPLEMENT_SETTING.competition / "valid.pkl"
with open(org_data_path, "rb") as f:
org_data = pickle.load(f)
feature_shape = org_data.shape[-1]
exp.experiment_workspace.data_description.append((sub_task.get_task_information(), feature_shape))

if RUNNER_SETTINGS.cache_result:
self.dump_cache_result(exp, result)

Expand Down
8 changes: 6 additions & 2 deletions rdagent/scenarios/kaggle/experiment/kaggle_experiment.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
from pathlib import Path

from rdagent.components.coder.factor_coder.factor import FactorFBWorkspace
from rdagent.components.coder.factor_coder.factor import (
FactorFBWorkspace,
FactorTask,
FeatureExperiment,
)
from rdagent.components.coder.model_coder.model import (
ModelExperiment,
ModelFBWorkspace,
Expand All @@ -15,7 +19,7 @@ def __init__(self, *args, **kwargs) -> None:
self.experiment_workspace = KGFBWorkspace(template_folder_path=Path(__file__).parent / "meta_tpl")


class KGFactorExperiment(ModelExperiment[ModelTask, KGFBWorkspace, FactorFBWorkspace]):
class KGFactorExperiment(FeatureExperiment[FactorTask, KGFBWorkspace, FactorFBWorkspace]):
def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
self.experiment_workspace = KGFBWorkspace(template_folder_path=Path(__file__).parent / "meta_tpl")
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,4 @@ def feat_eng(X: pd.DataFrame):
"""
return the selected features
"""
return None
return X
4 changes: 2 additions & 2 deletions rdagent/scenarios/kaggle/experiment/meta_tpl/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,8 @@ def import_module_from_path(module_name, module_path):
X_train, X_valid, y_train, y_valid, X_test, passenger_ids = preprocess_script()

# 2) Auto feature engineering
X_train_l, X_valid_l = [X_train], [X_valid]
X_test_l = [X_test]
X_train_l, X_valid_l = [], []
X_test_l = []

for f in DIRNAME.glob("feature/feat*.py"):
m = import_module_from_path(f.stem, f)
Expand Down
3 changes: 1 addition & 2 deletions rdagent/scenarios/kaggle/experiment/scenario.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,7 @@ def background(self) -> str:

@property
def source_data(self) -> str:
# TODO later we should improve this part
data_folder = Path(FACTOR_IMPLEMENT_SETTINGS.data_folder)
data_folder = Path(FACTOR_IMPLEMENT_SETTINGS.data_folder) / self.competition

if (data_folder / "valid.pkl").exists():
X_valid = pd.read_pickle(data_folder / "valid.pkl")
Expand Down
12 changes: 4 additions & 8 deletions rdagent/scenarios/kaggle/experiment/workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import pandas as pd

from rdagent.app.kaggle.conf import PROP_SETTING
from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING
from rdagent.core.experiment import FBWorkspace
from rdagent.log import rdagent_logger as logger
from rdagent.utils.env import KGDockerEnv
Expand Down Expand Up @@ -34,7 +34,7 @@ def __init__(self, template_folder_path: Path, *args, **kwargs) -> None:
def generate_preprocess_data(
self,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series, pd.DataFrame, pd.Series]:
kgde = KGDockerEnv(PROP_SETTING.competition)
kgde = KGDockerEnv(KAGGLE_IMPLEMENT_SETTING.competition)
kgde.prepare()

execute_log, results = kgde.dump_python_code_run_and_get_results(
Expand All @@ -58,7 +58,7 @@ def generate_preprocess_data(

def execute(self, run_env: dict = {}, *args, **kwargs) -> str:
logger.info(f"Running the experiment in {self.workspace_path}")
kgde = KGDockerEnv(PROP_SETTING.competition)
kgde = KGDockerEnv(KAGGLE_IMPLEMENT_SETTING.competition)
kgde.prepare()

execute_log = kgde.run(
Expand All @@ -69,11 +69,7 @@ def execute(self, run_env: dict = {}, *args, **kwargs) -> str:

csv_path = self.workspace_path / "submission_score.csv"

print("WORKSPACE PATH IS HERE --------------------------------------------------------------------------------")
print(self.workspace_path)
print("CSV PATH IS HERE --------------------------------------------------------------------------------------")
print(csv_path)
print("CSV PATH IS HERE --------------------------------------------------------------------------------------")
logger.info(self.workspace_path)

if not csv_path.exists():
logger.error(f"File {csv_path} does not exist.")
Expand Down
2 changes: 1 addition & 1 deletion rdagent/scenarios/kaggle/prompts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ hypothesis_and_feedback: |-
hypothesis_output_format: |-
The output should follow JSON format. The schema is as follows:
{
"action": "The action that the user wants to take based on the information provided. should be one of ["Feature engineering", "Feature processing", "Model feature selection", "Model tuning"]", Only "Model tuning" For Now
"action": "The action that the user wants to take based on the information provided. should be one of ["Feature engineering", "Feature processing", "Model feature selection", "Model tuning"]"
"hypothesis": "The new hypothesis generated based on the information provided.",
"reason": "The reason why you generate this hypothesis. It should be comprehensive and logical. It should cover the other keys below and extend them.",
"concise_reason": "Two-line summary. First line focuses on a concise justification for the change. Second line generalizes a knowledge statement.",
Expand Down

0 comments on commit 1a5f45a

Please sign in to comment.