Skip to content

Commit

Permalink
feat: get kaggle notebooks & discussion text for RAG (#371)
Browse files Browse the repository at this point in the history
* crawl notebooks & change to DS-Agent format text

* give one function in kaggle_crawler to collect kaggle knowledge texts

* fix CI

* add tool for merge .py files to one py file

* fix CI

* delete files

* changes for select function

* add nbformat

* jump crawler import test

* del test code

* CI

* change

* change

* change
  • Loading branch information
XianBW authored Sep 27, 2024
1 parent a1b63db commit cead345
Show file tree
Hide file tree
Showing 5 changed files with 264 additions and 6 deletions.
9 changes: 9 additions & 0 deletions rdagent/app/kaggle/loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from rdagent.core.utils import import_class
from rdagent.log import rdagent_logger as logger
from rdagent.log.time import measure_time
from rdagent.scenarios.kaggle.experiment.utils import python_files_to_notebook
from rdagent.scenarios.kaggle.kaggle_crawler import download_data
from rdagent.scenarios.kaggle.proposal.proposal import (
KG_ACTION_FEATURE_ENGINEERING,
Expand Down Expand Up @@ -88,6 +89,14 @@ def running(self, prev_out: dict[str, Any]):
exp = self.model_runner.develop(prev_out["coding"])
logger.log_object(exp, tag="runner result")

if KAGGLE_IMPLEMENT_SETTING.competition in ["optiver-realized-volatility-prediction"]:
try:
python_files_to_notebook(
KAGGLE_IMPLEMENT_SETTING.competition, exp.experiment_workspace.workspace_path
)
except Exception as e:
logger.error(f"Merge python files to one file failed: {e}")

if KAGGLE_IMPLEMENT_SETTING.auto_submit:
csv_path = exp.experiment_workspace.workspace_path / "submission.csv"
try:
Expand Down
95 changes: 95 additions & 0 deletions rdagent/scenarios/kaggle/experiment/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
from pathlib import Path

import nbformat as nbf


def python_files_to_notebook(competition: str, py_dir: str):
    """Merge the generated workspace ``.py`` files into one Kaggle-ready notebook.

    Reads the preprocessing, feature, model, select and train scripts from
    ``py_dir``, rewrites them so they can all live in a single namespace
    (feature modules become uniquely-named instantiated classes, model modules
    are wrapped into classes, select functions get unique names, and
    ``train.py``'s dynamic imports are replaced by direct references), then
    writes both ``merged.ipynb`` and ``merged.py`` into ``py_dir``.

    Parameters
    ----------
    competition : str
        Kaggle competition slug; used to rewrite ``/kaggle/input`` data paths.
    py_dir : str
        Workspace directory containing ``fea_share_preprocess.py``,
        ``feature/``, ``model/`` and ``train.py``.

    NOTE(review): the rewrites below are plain string replacements, so they
    assume the workspace files follow the templates' exact wording — verify
    against the experiment templates before reusing elsewhere.
    """
    py_dir: Path = Path(py_dir)
    save_path: Path = py_dir / "merged.ipynb"

    # Preprocess script: point the generic /kaggle/input path at this competition's dataset.
    pre_file = py_dir / "fea_share_preprocess.py"
    pre_py = pre_file.read_text()

    pre_py = pre_py.replace("/kaggle/input", f"/kaggle/input/{competition}")

    # Feature scripts: give each feature_engineering_cls a unique, file-derived
    # name and append "()" so executing the cell leaves an *instance* behind.
    fea_files = list(py_dir.glob("feature/*.py"))
    fea_pys = {
        f"{fea_file.stem}_cls": fea_file.read_text().replace("feature_engineering_cls", f"{fea_file.stem}_cls").strip()
        + "()\n"
        for fea_file in fea_files
    }

    # Model scripts: wrap each module's free fit/predict functions into a class
    # named after the file, so train.py can instantiate them directly.
    model_files = list(py_dir.glob("model/model*.py"))
    model_pys = {f"{model_file.stem}": model_file.read_text().strip() for model_file in model_files}
    for k, v in model_pys.items():
        # Turn the free functions into methods by inserting a `self` parameter.
        model_pys[k] = v.replace("def fit(", "def fit(self, ").replace("def predict(", "def predict(self, ")

        lines = model_pys[k].split("\n")
        indent = False  # becomes True at the first `def` and stays True
        first_line = -1  # index of the first `def` line; the class header is inserted there
        for i, line in enumerate(lines):
            if "def " in line:
                indent = True
                if first_line == -1:
                    first_line = i
            if indent:
                # Everything from the first `def` onward moves into the class body;
                # lines before it (imports etc.) stay at module level.
                lines[i] = "    " + line
        lines.insert(first_line, f"class {k}:\n")
        model_pys[k] = "\n".join(lines)

    # Select scripts: rename each `select` function after its file so the
    # merged namespace has no name collisions.
    select_files = list(py_dir.glob("model/select*.py"))
    select_pys = {
        f"{select_file.stem}": select_file.read_text().replace("def select(", f"def {select_file.stem}(")
        for select_file in select_files
    }

    train_file = py_dir / "train.py"
    train_py = train_file.read_text()

    # Drop statements that only make sense in the multi-file layout.
    train_py = train_py.replace("from fea_share_preprocess import preprocess_script", "")
    train_py = train_py.replace("DIRNAME = Path(__file__).absolute().resolve().parent", "")

    # Replace the dynamic feature-module discovery loop with a literal list of
    # the feature instances created above.
    fea_cls_list_str = "[" + ", ".join(list(fea_pys.keys())) + "]"
    train_py = train_py.replace(
        'for f in DIRNAME.glob("feature/feat*.py"):', f"for cls in {fea_cls_list_str}:"
    ).replace("cls = import_module_from_path(f.stem, f).feature_engineering_cls()", "")

    # Likewise replace the dynamic model/select imports with direct references;
    # the matching select function is resolved by name with eval() at runtime.
    model_cls_list_str = "[" + ", ".join(list(model_pys.keys())) + "]"
    train_py = (
        train_py.replace('for f in DIRNAME.glob("model/model*.py"):', f"for mc in {model_cls_list_str}:")
        .replace("m = import_module_from_path(f.stem, f)", "m = mc()")
        .replace('select_python_path = f.with_name(f.stem.replace("model", "select") + f.suffix)', "")
        .replace(
            "select_m = import_module_from_path(select_python_path.stem, select_python_path)",
            'select_m = eval(mc.__name__.replace("model", "select"))',
        )
        .replace("select_m.select", "select_m")
        .replace("[2].select", "[2]")
    )

    # Assemble the notebook (one cell per chunk) and mirror every chunk into a
    # flat merged.py alongside it.
    nb = nbf.v4.new_notebook()
    all_py = ""

    nb.cells.append(nbf.v4.new_code_cell(pre_py))
    all_py += pre_py + "\n\n"

    for v in fea_pys.values():
        nb.cells.append(nbf.v4.new_code_cell(v))
        all_py += v + "\n\n"

    for v in model_pys.values():
        nb.cells.append(nbf.v4.new_code_cell(v))
        all_py += v + "\n\n"

    for v in select_pys.values():
        nb.cells.append(nbf.v4.new_code_cell(v))
        all_py += v + "\n\n"

    nb.cells.append(nbf.v4.new_code_cell(train_py))
    all_py += train_py + "\n"

    with save_path.open("w", encoding="utf-8") as f:
        nbf.write(nb, f)

    with save_path.with_suffix(".py").open("w", encoding="utf-8") as f:
        f.write(all_py)
133 changes: 129 additions & 4 deletions rdagent/scenarios/kaggle/kaggle_crawler.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,24 @@
# %%
import json
import subprocess
import time
import zipfile
from itertools import chain
from pathlib import Path

import nbformat
from jinja2 import Environment, StrictUndefined
from rich import print
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING
from rdagent.core.prompts import Prompts
from rdagent.log import rdagent_logger as logger
from rdagent.oai.llm_utils import APIBackend

# %%
options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
Expand Down Expand Up @@ -79,6 +87,121 @@ def download_data(competition: str, local_path: str = "/data/userdata/share/kagg
zip_ref.extractall(data_path)


def download_notebooks(
    competition: str, local_path: str = "/data/userdata/share/kaggle/notebooks", num: int = 15
) -> None:
    """Download the top-ranked public kernels for *competition* into ``local_path/<competition>/<author>/``."""
    from kaggle.api.kaggle_api_extended import KaggleApi

    data_path = Path(f"{local_path}/{competition}")

    api = KaggleApi()
    api.authenticate()

    # Infer the metric direction from the leaderboard: if the top entry's score
    # exceeds the bottom one's, higher is better, so fetch best-first.
    leaderboard = api.competition_leaderboard_view(competition)
    higher_is_better = (float(leaderboard[0].score) - float(leaderboard[-1].score)) > 0
    sort_by = "scoreDescending" if higher_is_better else "scoreAscending"

    # Pull each kernel into a per-author subdirectory.
    kernels = api.kernels_list(competition=competition, sort_by=sort_by, page=1, page_size=num)
    for kernel in kernels:
        author = kernel.ref.split("/")[0]
        api.kernels_pull(kernel.ref, path=data_path / author)
    print(f"Downloaded {len(kernels)} notebooks for {competition}. ([red]{sort_by}[/red])")


def notebook_to_knowledge(notebook_text: str) -> str:
    """Ask the LLM backend to distill a notebook's text into reproducible knowledge.

    Renders the ``gen_knowledge_from_code_DSAgent`` prompt pair from
    ``prompts.yaml`` and returns the chat-completion response as plain text.
    """
    prompt_dict = Prompts(file_path=Path(__file__).parent / "prompts.yaml")
    templates = prompt_dict["gen_knowledge_from_code_DSAgent"]

    # One shared Jinja environment renders both halves of the prompt.
    jinja_env = Environment(undefined=StrictUndefined)
    sys_prompt = jinja_env.from_string(templates["system"]).render()
    user_prompt = jinja_env.from_string(templates["user"]).render(notebook=notebook_text)

    return APIBackend().build_messages_and_create_chat_completion(
        user_prompt=user_prompt,
        system_prompt=sys_prompt,
        json_mode=False,
    )


def convert_notebooks_to_text(competition: str, local_path: str = "/data/userdata/share/kaggle/notebooks") -> None:
    """Turn downloaded kernels (.ipynb/.irnb/.py) into knowledge ``.txt`` files next to each source."""
    data_path = Path(f"{local_path}/{competition}")
    converted_num = 0

    # Jupyter / R notebooks: serialize markdown and code cells as fenced blocks.
    for nb_path in chain(data_path.glob("**/*.ipynb"), data_path.glob("**/*.irnb")):
        with nb_path.open("r", encoding="utf-8") as f:
            nb = nbformat.read(f, as_version=4)

        parts = []
        for cell in nb.cells:
            if cell.cell_type == "markdown":
                parts.append(f"```markdown\n{cell.source}```")
            elif cell.cell_type == "code":
                parts.append(f"```code\n{cell.source}```")

        knowledge = notebook_to_knowledge("\n\n".join(parts))
        nb_path.with_suffix(".txt").write_text(knowledge, encoding="utf-8")
        converted_num += 1

    # Plain scripts: treat the whole file as a single code block.
    for py_path in data_path.glob("**/*.py"):
        with py_path.open("r", encoding="utf-8") as f:
            source = f.read()

        knowledge = notebook_to_knowledge(f"```code\n{source}```")
        py_path.with_suffix(".txt").write_text(knowledge, encoding="utf-8")
        converted_num += 1

    print(f"Converted {converted_num} notebooks to text files.")


def collect_knowledge_texts(local_path: str = "/data/userdata/share/kaggle") -> dict[str, list[str]]:
    """Collect previously generated knowledge ``.txt`` files, grouped by competition.

    Scans ``<local_path>/notebooks/<competition>/**/*.txt`` and returns::

        {
            "competition1": ["knowledge_text1", "knowledge_text2", ...],
            "competition2": ["knowledge_text1", "knowledge_text2", ...],
            ...
        }

    Parameters
    ----------
    local_path : str
        Root of the Kaggle data share; knowledge texts live under its
        ``notebooks`` subdirectory.
    """
    notebooks_dir = Path(local_path) / "notebooks"

    competition_knowledge_texts_dict: dict[str, list[str]] = {}
    for competition_dir in notebooks_dir.iterdir():
        # Skip stray files (e.g. .DS_Store): only directories are competitions.
        if not competition_dir.is_dir():
            continue
        # Sort for a deterministic ordering regardless of filesystem order.
        knowledge_texts = [
            text_path.read_text(encoding="utf-8") for text_path in sorted(competition_dir.glob("**/*.txt"))
        ]
        competition_knowledge_texts_dict[competition_dir.name] = knowledge_texts

    return competition_knowledge_texts_dict


# %%
if __name__ == "__main__":
dsagent_cs = [
"feedback-prize-english-language-learning",
Expand Down Expand Up @@ -124,14 +247,16 @@ def download_data(competition: str, local_path: str = "/data/userdata/share/kagg
"store-sales-time-series-forecasting",
"titanic",
"tpu-getting-started",
# scenario competition
"covid19-global-forecasting-week-1",
"birdsong-recognition",
"optiver-trading-at-the-close",
"statoil-iceberg-classifier-challenge",
"optiver-realized-volatility-prediction",
"facebook-v-predicting-check-ins",
]

for i in dsagent_cs + other_cs:
crawl_descriptions(i)
all_cs = dsagent_cs + other_cs
for c in all_cs:
convert_notebooks_to_text(c)
exit()
from kaggle.api.kaggle_api_extended import KaggleApi

Expand Down
32 changes: 30 additions & 2 deletions rdagent/scenarios/kaggle/prompts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -308,10 +308,38 @@ model_feature_selection:
{
"Selected Group Index": [1, 3, 5], # List of selected group indices, notice: the index starts from 1
}
user: |-
Current feature groups:
{% for feature in feature_groups %}
Group {{ loop.index }}:
{{ feature }}
{% endfor %}
{% endfor %}
gen_knowledge_from_code_DSAgent:
system: |-
You were a proficient data scientist.
user: |-
The following notebook (contain markdown part and code part) is a high-performing solution for a kaggle competition.
Please answer the following questions one by one and **as detailedly as possible**.
Make sure that another data scientist can exactly reproduce this copy of code based on your answer.
Focus on the training process.
(1) Please give a summary of the overall design.
(2) What is the overall model architecture? Please use a long article to answer this question as accurately and in detail as possible.
(3) How are the important hyper-parameters setting in this code?
(4) What is the optimization objective?
(5) What advanced machine learning technique does this copy of code use?
(6) What other important tricks do you think play an important role for high performance?
Note that make sure the answers are directly included from the code or markdown text, rather than based on your assumption.
--------------------
{{ notebook }}
--------------------
gen_knowledge_from_code_RDAgent:
system: |-
You were a proficient data scientist.
user: |-
TODO...
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ st-theme
# kaggle crawler
selenium
kaggle
nbformat

# tool
seaborn
Expand Down

0 comments on commit cead345

Please sign in to comment.