Skip to content

Commit

Permalink
feat: Added loop code for Kaggle scene. (#211)
Browse files Browse the repository at this point in the history
* fuse all code into one commit

* remove container auto

* change remove method

* add kaggle env start

* change kaggle api

* change structure

* add crawler

* add requirements

* refactor the code

* delete mistaken codes

* merge docker settings and crawler

* add chrome install README for crawler usage

* Connect scen with Kaggle to download data

* Reformat some files to pass CI.

* fix some ci errors

* fix a ci error

* fix a ci error

---------

Co-authored-by: Bowen Xian <[email protected]>
  • Loading branch information
WinstonLiyt and XianBW authored Aug 19, 2024
1 parent e8f6af9 commit 975c327
Show file tree
Hide file tree
Showing 16 changed files with 848 additions and 0 deletions.
42 changes: 42 additions & 0 deletions rdagent/app/kaggle/conf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from pathlib import Path

from pydantic_settings import BaseSettings

from rdagent.components.workflow.conf import BasePropSetting


class PropSetting(BasePropSetting):
    """Settings for the Kaggle R&D loop, read from environment variables prefixed with ``KG_``."""

    class Config:
        env_prefix = "KG_"
        """Use `KG_` as prefix for environment variables"""
        protected_namespaces = ()
        """Add 'model_' to the protected namespaces"""

    # 1) overriding the default
    scen: str = "rdagent.scenarios.kaggle.experiment.model_experiment.KGModelScenario"
    """Scenario class for data mining model"""

    hypothesis_gen: str = "rdagent.scenarios.kaggle.proposal.model_proposal.KGModelHypothesisGen"
    """Hypothesis generation class"""

    hypothesis2experiment: str = "rdagent.scenarios.kaggle.proposal.model_proposal.KGModelHypothesis2Experiment"
    """Hypothesis to experiment class"""

    coder: str = "rdagent.scenarios.kaggle.developer.model_coder.KGModelCoSTEER"
    """Coder class"""

    runner: str = "rdagent.scenarios.kaggle.developer.model_runner.KGModelRunner"
    """Runner class"""

    summarizer: str = "rdagent.scenarios.kaggle.developer.feedback.KGModelHypothesisExperiment2Feedback"
    """Summarizer class"""

    # Fix: `evolving_n` was declared twice (the second declaration shadowed the
    # first); a single declaration is kept.
    evolving_n: int = 10
    """Number of evolutions"""

    competition: str = ""
    """Kaggle competition name; empty means unset (supplied via CLI `--competition` or the `KG_COMPETITION` env var)"""


PROP_SETTING = PropSetting()
65 changes: 65 additions & 0 deletions rdagent/app/kaggle/model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
from collections import defaultdict

import fire

from rdagent.app.kaggle.conf import PROP_SETTING
from rdagent.components.workflow.conf import BasePropSetting
from rdagent.components.workflow.rd_loop import RDLoop
from rdagent.core.exception import ModelEmptyError
from rdagent.core.proposal import (
Hypothesis2Experiment,
HypothesisExperiment2Feedback,
HypothesisGen,
Trace,
)
from rdagent.core.utils import import_class
from rdagent.log import rdagent_logger as logger


class ModelRDLoop(RDLoop):
    """R&D loop specialized for Kaggle model experiments.

    Dynamically loads the scenario, hypothesis generator/translator, coder,
    runner and summarizer classes named in the given settings object and seeds
    an empty ``Trace`` for the run.
    """

    def __init__(self, PROP_SETTING: BasePropSetting):
        # NOTE(review): the parameter name shadows the module-level PROP_SETTING
        # import, and the `Scenario` / `Developer` annotations below are not
        # imported in this module (harmless at runtime since local annotations
        # are never evaluated, but it confuses type checkers) — worth cleaning up.
        with logger.tag("init"):
            # The scenario is constructed with the competition name so it can
            # fetch the competition description (see KGModelScenario.__init__).
            scen: Scenario = import_class(PROP_SETTING.scen)(PROP_SETTING.competition)
            logger.log_object(scen, tag="scenario")

            self.hypothesis_gen: HypothesisGen = import_class(PROP_SETTING.hypothesis_gen)(scen)
            logger.log_object(self.hypothesis_gen, tag="hypothesis generator")

            # Hypothesis-to-experiment translator takes no scenario argument.
            self.hypothesis2experiment: Hypothesis2Experiment = import_class(PROP_SETTING.hypothesis2experiment)()
            logger.log_object(self.hypothesis2experiment, tag="hypothesis2experiment")

            self.coder: Developer = import_class(PROP_SETTING.coder)(scen)
            logger.log_object(self.coder, tag="coder")
            self.runner: Developer = import_class(PROP_SETTING.runner)(scen)
            logger.log_object(self.runner, tag="runner")

            self.summarizer: HypothesisExperiment2Feedback = import_class(PROP_SETTING.summarizer)(scen)
            logger.log_object(self.summarizer, tag="summarizer")
            self.trace = Trace(scen=scen)
            # Deliberately skips RDLoop.__init__ (which would redo the wiring
            # above) and calls RDLoop's own base-class initializer instead —
            # presumably the loop machinery setup; confirm against RDLoop.
            super(RDLoop, self).__init__()

    # Loop iterations that raise these exceptions are skipped rather than
    # aborting the whole run.
    skip_loop_error = (ModelEmptyError,)


def main(path=None, step_n=None, competition=None):
    """
    Auto R&D evolving loop for models in a Kaggle scenario.

    :param path: path of a saved session to resume; start a fresh loop when None.
    :param step_n: number of loop steps to run (optional; run to completion when None).
    :param competition: Kaggle competition name; overrides the configured one when given.

    You can continue running a session by

    .. code-block:: python

        dotenv run -- python rdagent/app/kaggle/model.py [--competition titanic] $LOG_PATH/__session__/1/0_propose --step_n 1  # `step_n` is an optional parameter
    """
    if competition:
        PROP_SETTING.competition = competition  # CLI value wins over env/config
    if path is None:
        model_loop = ModelRDLoop(PROP_SETTING)  # fresh run
    else:
        model_loop = ModelRDLoop.load(path)  # resume the saved session
    model_loop.run(step_n=step_n)


if __name__ == "__main__":
    fire.Fire(main)  # expose `main` as a CLI via python-fire
20 changes: 20 additions & 0 deletions rdagent/scenarios/kaggle/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Kaggle Crawler

## Install chrome & chromedriver for Linux

Run the following commands in a single working directory:
```shell
# install chrome
wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
sudo apt install ./google-chrome-stable_current_amd64.deb
google-chrome --version

# install chromedriver
wget https://storage.googleapis.com/chrome-for-testing-public/<chrome-version>/linux64/chromedriver-linux64.zip
unzip chromedriver-linux64.zip
cd chromedriver-linux64
sudo mv chromedriver /usr/local/bin
sudo chmod +x /usr/local/bin/chromedriver

chromedriver --version
```
68 changes: 68 additions & 0 deletions rdagent/scenarios/kaggle/developer/feedback.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import json
from pathlib import Path

from jinja2 import Environment, StrictUndefined

from rdagent.core.experiment import Experiment
from rdagent.core.prompts import Prompts
from rdagent.core.proposal import (
Hypothesis,
HypothesisExperiment2Feedback,
HypothesisFeedback,
Trace,
)
from rdagent.log import rdagent_logger as logger
from rdagent.oai.llm_utils import APIBackend
from rdagent.utils import convert2bool

feedback_prompts = Prompts(file_path=Path(__file__).parent.parent.parent / "qlib" / "prompts.yaml")
DIRNAME = Path(__file__).absolute().resolve().parent


class KGModelHypothesisExperiment2Feedback(HypothesisExperiment2Feedback):
    """Generated feedbacks on the hypothesis from **Executed** Implementations of different tasks & their comparisons with previous performances"""

    def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trace) -> HypothesisFeedback:
        """Compare this executed experiment against the current SOTA one via an
        LLM and parse its JSON verdict into a ``HypothesisFeedback``."""
        logger.info("Generating feedback...")

        # Current best hypothesis/experiment from the trace (None/None when no
        # SOTA exists yet).
        sota_hypothesis, sota_experiment = trace.get_sota_hypothesis_and_experiment()
        have_sota = bool(sota_hypothesis)

        # Render the user prompt from the shared (qlib) feedback template.
        jinja_env = Environment(undefined=StrictUndefined)
        user_prompt = jinja_env.from_string(
            feedback_prompts["model_feedback_generation"]["user"]
        ).render(
            context=trace.scen,
            last_hypothesis=sota_hypothesis,
            last_task=sota_experiment.sub_tasks[0].get_task_information() if have_sota else None,
            last_code=sota_experiment.sub_workspace_list[0].code_dict.get("model.py") if have_sota else None,
            last_result=sota_experiment.result if have_sota else None,
            hypothesis=hypothesis,
            exp=exp,
        )

        # Ask the LLM for structured feedback (JSON mode enforces a parseable reply).
        raw_reply = APIBackend().build_messages_and_create_chat_completion(
            user_prompt=user_prompt,
            system_prompt=feedback_prompts["model_feedback_generation"]["system"],
            json_mode=True,
        )
        parsed = json.loads(raw_reply)

        # Missing keys fall back to explicit placeholder strings.
        return HypothesisFeedback(
            observations=parsed.get("Observations", "No observations provided"),
            hypothesis_evaluation=parsed.get("Feedback for Hypothesis", "No feedback provided"),
            new_hypothesis=parsed.get("New Hypothesis", "No new hypothesis provided"),
            reason=parsed.get("Reasoning", "No reasoning provided"),
            decision=convert2bool(parsed.get("Decision", "false")),
        )
3 changes: 3 additions & 0 deletions rdagent/scenarios/kaggle/developer/model_coder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from rdagent.components.coder.model_coder.CoSTEER import ModelCoSTEER

# The Kaggle scenario reuses the generic model CoSTEER coder unchanged; this
# alias exists so the settings (conf.py `coder`) can point at a scenario-local
# class path.
KGModelCoSTEER = ModelCoSTEER
38 changes: 38 additions & 0 deletions rdagent/scenarios/kaggle/developer/model_runner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import shutil
import uuid
from pathlib import Path

import pandas as pd

from rdagent.components.coder.model_coder.model import ModelExperiment, ModelFBWorkspace
from rdagent.components.runner import CachedRunner
from rdagent.components.runner.conf import RUNNER_SETTINGS
from rdagent.core.developer import Developer
from rdagent.core.exception import ModelEmptyError
from rdagent.log import rdagent_logger as logger
from rdagent.scenarios.kaggle.experiment.model_experiment import KGModelExperiment
from rdagent.utils.env import KGDockerEnv


class KGModelRunner(CachedRunner[KGModelExperiment]):
    """Runner that injects the generated model code into the experiment
    workspace and executes it, with optional result caching."""

    def develop(self, exp: KGModelExperiment) -> KGModelExperiment:
        """Run the experiment (or reuse a cached result) and attach the result to ``exp``."""
        # Fast path: reuse a previously cached result when caching is enabled.
        if RUNNER_SETTINGS.cache_result:
            hit, cached = self.get_cache_result(exp)
            if hit:
                exp.result = cached
                return exp

        model_code = exp.sub_workspace_list[0].code_dict.get("model.py")
        if model_code is None:
            raise ModelEmptyError("model.py is empty")
        # to replace & inject code
        exp.experiment_workspace.inject_code(**{"model.py": model_code})

        # Execute inside the workspace with the repo root on PYTHONPATH.
        execution_result = exp.experiment_workspace.execute(run_env={"PYTHONPATH": "./"})
        exp.result = execution_result

        if RUNNER_SETTINGS.cache_result:
            self.dump_cache_result(exp, execution_result)

        return exp
25 changes: 25 additions & 0 deletions rdagent/scenarios/kaggle/docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
FROM pytorch/pytorch:latest
# For GPU support, please choose the proper tag from https://hub.docker.com/r/pytorch/pytorch/tags

# System tooling; clean the apt lists in the same layer to keep the image small.
RUN apt-get clean && apt-get update && apt-get install -y \
    curl \
    vim \
    git \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /workspace

# Build prerequisites first (numpy/cython are compile-time deps for some wheels).
# Fix: use `python -m pip` consistently (the bare `pip` may resolve to a
# different interpreter) and `--no-cache-dir` to avoid baking the pip cache
# into the image.
RUN python -m pip install --no-cache-dir numpy && \
    python -m pip install --no-cache-dir --upgrade cython
# RUN python -m pip install -e .

# RUN pip install pyg_lib torch_scatter torch_sparse torch_cluster -f https://data.pyg.org/whl/torch-2.3.0%2Bcu121.html
# Fix: the remaining packages were installed in eight separate RUN layers;
# one layer installs them together, shrinking the image and build time.
RUN python -m pip install --no-cache-dir \
    pandas \
    torch_geometric \
    ogb \
    networkx \
    scikit-learn \
    catboost \
    xgboost \
    sparse
113 changes: 113 additions & 0 deletions rdagent/scenarios/kaggle/experiment/model_experiment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
import json
from pathlib import Path

from jinja2 import Environment, StrictUndefined

from rdagent.components.coder.model_coder.model import (
ModelExperiment,
ModelFBWorkspace,
ModelTask,
)
from rdagent.core.prompts import Prompts
from rdagent.core.scenario import Scenario
from rdagent.oai.llm_utils import APIBackend
from rdagent.scenarios.kaggle.experiment.workspace import KGFBWorkspace
from rdagent.scenarios.kaggle.kaggle_crawler import crawl_descriptions

prompt_dict = Prompts(file_path=Path(__file__).parent / "prompts.yaml")


class KGModelExperiment(ModelExperiment[ModelTask, KGFBWorkspace, ModelFBWorkspace]):
    """Model experiment for the Kaggle scenario; its workspace is seeded from
    the bundled ``model_template`` folder (dummy model.py, conf.yaml, result
    reader — see model_template/README.md)."""

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        # Replace the default workspace with one built from the template folder
        # shipped next to this module.
        self.experiment_workspace = KGFBWorkspace(template_folder_path=Path(__file__).parent / "model_template")


class KGModelScenario(Scenario):
    """Scenario describing a Kaggle competition for model R&D.

    On construction it crawls the competition description and asks an LLM to
    distill it into type / description / target / features, which the prompt
    properties below interpolate into templates.
    """

    def __init__(self, competition: str) -> None:
        super().__init__()
        self.competition = competition
        # Raw description text scraped from the Kaggle competition page.
        self.competition_descriptions = crawl_descriptions(competition)
        # Filled in by _analysis_competition_description below.
        self.competition_type = None
        self.competition_description = None
        self.target_description = None
        self.competition_features = None
        self._analysis_competition_description()

    def _analysis_competition_description(self):
        # TODO: use gpt to analyze the competition description

        # System prompt template takes no variables; render() is a no-op expansion.
        sys_prompt = (
            Environment(undefined=StrictUndefined)
            .from_string(prompt_dict["kg_description_template"]["system"])
            .render()
        )

        user_prompt = (
            Environment(undefined=StrictUndefined)
            .from_string(prompt_dict["kg_description_template"]["user"])
            .render(
                competition_descriptions=self.competition_descriptions,
            )
        )

        # json_mode forces the LLM to return parseable JSON.
        response_analysis = APIBackend().build_messages_and_create_chat_completion(
            user_prompt=user_prompt,
            system_prompt=sys_prompt,
            json_mode=True,
        )

        # Missing keys degrade to placeholder strings rather than raising.
        response_json_analysis = json.loads(response_analysis)
        self.competition_type = response_json_analysis.get("Competition Type", "No type provided")
        self.competition_description = response_json_analysis.get("Competition Description", "No description provided")
        self.target_description = response_json_analysis.get("Target Description", "No target provided")
        self.competition_features = response_json_analysis.get("Competition Features", "No features provided")

    @property
    def background(self) -> str:
        """Background text for prompts, rendered from the analyzed competition fields."""
        background_template = prompt_dict["kg_model_background"]

        background_prompt = (
            Environment(undefined=StrictUndefined)
            .from_string(background_template)
            .render(
                competition_type=self.competition_type,
                competition_description=self.competition_description,
                target_description=self.target_description,
                competition_features=self.competition_features,
            )
        )

        return background_prompt

    @property
    def source_data(self) -> str:
        # Not yet supported for the Kaggle scenario.
        raise NotImplementedError("source_data is not implemented")

    @property
    def output_format(self) -> str:
        """Expected output format description (static prompt text)."""
        return prompt_dict["kg_model_output_format"]

    @property
    def interface(self) -> str:
        """Code interface the generated model must follow (static prompt text)."""
        return prompt_dict["kg_model_interface"]

    @property
    def simulator(self) -> str:
        """Description of the simulator available for testing (static prompt text)."""
        return prompt_dict["kg_model_simulator"]

    @property
    def rich_style_description(self) -> str:
        # NOTE(review): placeholder text (with leading whitespace) shown in the
        # rich UI; probably intended to be fleshed out later.
        return """
        kaggle scen """

    def get_scenario_all_desc(self) -> str:
        # Whitespace in this f-string is part of the emitted prompt — keep as is.
        return f"""Background of the scenario:
{self.background}
The interface you should follow to write the runnable code:
{self.interface}
The output of your code should be in the format:
{self.output_format}
The simulator user can use to test your model:
{self.simulator}
"""
3 changes: 3 additions & 0 deletions rdagent/scenarios/kaggle/experiment/model_template/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
## This folder is a template to be copied from for each model implementation & running process.

Components: a dummy `model.py`, a versatile `conf.yaml`, and a result reader.
Loading

0 comments on commit 975c327

Please sign in to comment.