-
-
Notifications
You must be signed in to change notification settings - Fork 121
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Added loop code for Kaggle scene. (#211)
* fuse all code into one commit * remove container auto * change remove method * add kaggle env start * change kaggle api * change structure * add crawler * add requirements * refeact the code * delete mistaken codes * merge docker settings and crawler * add chrome install README for crawler usage * Connect scen with Kaggle to download data * Reformat some files to pass CI. * fix some ci errors * fix a ci error * fix a ci error --------- Co-authored-by: Bowen Xian <[email protected]>
- Loading branch information
1 parent
e8f6af9
commit 975c327
Showing
16 changed files
with
848 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
from pathlib import Path | ||
|
||
from pydantic_settings import BaseSettings | ||
|
||
from rdagent.components.workflow.conf import BasePropSetting | ||
|
||
|
||
class PropSetting(BasePropSetting):
    """Settings for the Kaggle model R&D loop.

    Each field holds either a dotted path to the class implementing one loop
    component or a plain configuration value.
    """

    class Config:
        env_prefix = "KG_"
        """Use `KG_` as prefix for environment variables"""
        protected_namespaces = ()
        """Empty tuple: no field-name prefix is protected (pydantic protects `model_` by default)"""

    # 1) overriding the default
    scen: str = "rdagent.scenarios.kaggle.experiment.model_experiment.KGModelScenario"
    """Scenario class for the Kaggle model scenario"""

    hypothesis_gen: str = "rdagent.scenarios.kaggle.proposal.model_proposal.KGModelHypothesisGen"
    """Hypothesis generation class"""

    hypothesis2experiment: str = "rdagent.scenarios.kaggle.proposal.model_proposal.KGModelHypothesis2Experiment"
    """Hypothesis to experiment class"""

    coder: str = "rdagent.scenarios.kaggle.developer.model_coder.KGModelCoSTEER"
    """Coder class"""

    runner: str = "rdagent.scenarios.kaggle.developer.model_runner.KGModelRunner"
    """Runner class"""

    summarizer: str = "rdagent.scenarios.kaggle.developer.feedback.KGModelHypothesisExperiment2Feedback"
    """Summarizer class"""

    evolving_n: int = 10
    """Number of evolutions"""

    competition: str = ""
    """Kaggle competition name (empty by default)"""
|
||
|
||
# Module-level settings instance, created once when this module is imported.
PROP_SETTING = PropSetting()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
from collections import defaultdict | ||
|
||
import fire | ||
|
||
from rdagent.app.kaggle.conf import PROP_SETTING | ||
from rdagent.components.workflow.conf import BasePropSetting | ||
from rdagent.components.workflow.rd_loop import RDLoop | ||
from rdagent.core.exception import ModelEmptyError | ||
from rdagent.core.proposal import ( | ||
Hypothesis2Experiment, | ||
HypothesisExperiment2Feedback, | ||
HypothesisGen, | ||
Trace, | ||
) | ||
from rdagent.core.utils import import_class | ||
from rdagent.log import rdagent_logger as logger | ||
|
||
|
||
class ModelRDLoop(RDLoop):
    """R&D loop whose components are built for a Kaggle competition.

    Every component class is resolved with `import_class` from a dotted path
    stored on the settings object passed to the constructor.
    """

    def __init__(self, PROP_SETTING: BasePropSetting):
        """Instantiate the scenario and all loop components.

        Args:
            PROP_SETTING: settings object providing the dotted class paths
                (`scen`, `hypothesis_gen`, `hypothesis2experiment`, `coder`,
                `runner`, `summarizer`) and the `competition` name that is
                passed to the scenario constructor.
        """
        # Imported locally: these names are needed only by the annotations in this method.
        from rdagent.core.developer import Developer
        from rdagent.core.scenario import Scenario

        with logger.tag("init"):
            # The scenario is the only component that receives the competition name.
            scen: Scenario = import_class(PROP_SETTING.scen)(PROP_SETTING.competition)
            logger.log_object(scen, tag="scenario")

            self.hypothesis_gen: HypothesisGen = import_class(PROP_SETTING.hypothesis_gen)(scen)
            logger.log_object(self.hypothesis_gen, tag="hypothesis generator")

            self.hypothesis2experiment: Hypothesis2Experiment = import_class(PROP_SETTING.hypothesis2experiment)()
            logger.log_object(self.hypothesis2experiment, tag="hypothesis2experiment")

            self.coder: Developer = import_class(PROP_SETTING.coder)(scen)
            logger.log_object(self.coder, tag="coder")
            self.runner: Developer = import_class(PROP_SETTING.runner)(scen)
            logger.log_object(self.runner, tag="runner")

            self.summarizer: HypothesisExperiment2Feedback = import_class(PROP_SETTING.summarizer)(scen)
            logger.log_object(self.summarizer, tag="summarizer")
            self.trace = Trace(scen=scen)
            # `super(RDLoop, self)` starts the MRO lookup *after* RDLoop, so
            # RDLoop.__init__ itself is skipped on purpose here.
            # NOTE(review): presumably because RDLoop.__init__ would rebuild the
            # components without the competition argument -- confirm.
            super(RDLoop, self).__init__()

    # Exception types listed here are read by the loop machinery defined outside this file.
    skip_loop_error = (ModelEmptyError,)
|
||
|
||
def main(path=None, step_n=None, competition=None):
    """
    Auto R&D evolving loop for models in a Kaggle scenario.

    When `competition` is given it is stored in `PROP_SETTING.competition`.
    Without `path` a fresh `ModelRDLoop` is created from `PROP_SETTING`;
    with `path` the loop is restored through `ModelRDLoop.load(path)`.
    `step_n` is forwarded unchanged to the loop's `run` method.

    You can continue running a session by

    .. code-block:: python

        dotenv run -- python rdagent/app/kaggle/model.py [--competition titanic] $LOG_PATH/__session__/1/0_propose --step_n 1  # `step_n` is an optional parameter

    """
    if competition:
        PROP_SETTING.competition = competition

    # Start from scratch unless a saved session path was supplied.
    loop = ModelRDLoop(PROP_SETTING) if path is None else ModelRDLoop.load(path)
    loop.run(step_n=step_n)
|
||
|
||
# Script entry point: hand `main` to python-fire so its parameters become command-line arguments.
if __name__ == "__main__":
    fire.Fire(main)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
# Kaggle Crawler | ||
|
||
## Install chrome & chromedriver for Linux | ||
|
||
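Run the following commands from a single working directory, replacing `<chrome-version>` with the version printed by `google-chrome --version`: | ||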
```shell | ||
# install chrome | ||
wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb | ||
sudo apt install ./google-chrome-stable_current_amd64.deb | ||
google-chrome --version | ||
|
||
# install chromedriver | ||
wget https://storage.googleapis.com/chrome-for-testing-public/<chrome-version>/linux64/chromedriver-linux64.zip | ||
unzip chromedriver-linux64.zip | ||
cd chromedriver-linux64 | ||
sudo mv chromedriver /usr/local/bin | ||
sudo chmod +x /usr/local/bin/chromedriver | ||
|
||
chromedriver --version | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
import json | ||
from pathlib import Path | ||
|
||
from jinja2 import Environment, StrictUndefined | ||
|
||
from rdagent.core.experiment import Experiment | ||
from rdagent.core.prompts import Prompts | ||
from rdagent.core.proposal import ( | ||
Hypothesis, | ||
HypothesisExperiment2Feedback, | ||
HypothesisFeedback, | ||
Trace, | ||
) | ||
from rdagent.log import rdagent_logger as logger | ||
from rdagent.oai.llm_utils import APIBackend | ||
from rdagent.utils import convert2bool | ||
|
||
# Prompt templates are loaded from `qlib/prompts.yaml`, where `qlib` sits two levels above this file's directory.
# NOTE(review): presumably the Qlib scenario's prompts are shared on purpose -- confirm.
feedback_prompts = Prompts(file_path=Path(__file__).parent.parent.parent / "qlib" / "prompts.yaml")
# Absolute, resolved path of the directory containing this file.
DIRNAME = Path(__file__).absolute().resolve().parent
|
||
|
||
class KGModelHypothesisExperiment2Feedback(HypothesisExperiment2Feedback):
    """Produce LLM feedback on a hypothesis from an executed experiment, compared with the SOTA entry of the trace."""

    def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trace) -> HypothesisFeedback:
        """
        Render the "model_feedback_generation" prompts, query the LLM in JSON mode and
        wrap the parsed answer in a `HypothesisFeedback`.

        The user prompt receives the scenario stored on `trace`, the current `hypothesis`
        and `exp`, and -- when the trace yields a SOTA hypothesis -- the first sub-task
        description, the "model.py" code and the result of the SOTA experiment
        (otherwise `None` for each of the three).
        Keys missing from the LLM response fall back to fixed placeholder strings.
        """
        logger.info("Generating feedback...")

        prompt_section = feedback_prompts["model_feedback_generation"]
        system_prompt = prompt_section["system"]

        scenario = trace.scen
        sota_hypothesis, sota_experiment = trace.get_sota_hypothesis_and_experiment()

        user_template = Environment(undefined=StrictUndefined).from_string(prompt_section["user"])

        # Details of the SOTA experiment are only available when a SOTA hypothesis exists.
        if sota_hypothesis:
            previous_task = sota_experiment.sub_tasks[0].get_task_information()
            previous_code = sota_experiment.sub_workspace_list[0].code_dict.get("model.py")
            previous_result = sota_experiment.result
        else:
            previous_task = previous_code = previous_result = None

        user_prompt = user_template.render(
            context=scenario,
            last_hypothesis=sota_hypothesis,
            last_task=previous_task,
            last_code=previous_code,
            last_result=previous_result,
            hypothesis=hypothesis,
            exp=exp,
        )

        raw_answer = APIBackend().build_messages_and_create_chat_completion(
            user_prompt=user_prompt,
            system_prompt=system_prompt,
            json_mode=True,
        )
        answer = json.loads(raw_answer)

        observations = answer.get("Observations", "No observations provided")
        evaluation = answer.get("Feedback for Hypothesis", "No feedback provided")
        next_hypothesis = answer.get("New Hypothesis", "No new hypothesis provided")
        reasoning = answer.get("Reasoning", "No reasoning provided")
        decision = convert2bool(answer.get("Decision", "false"))

        return HypothesisFeedback(
            observations=observations,
            hypothesis_evaluation=evaluation,
            new_hypothesis=next_hypothesis,
            reason=reasoning,
            decision=decision,
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
from rdagent.components.coder.model_coder.CoSTEER import ModelCoSTEER | ||
|
||
# Alias: the Kaggle coder is the generic ModelCoSTEER class under a Kaggle-specific name.
KGModelCoSTEER = ModelCoSTEER
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
import shutil | ||
import uuid | ||
from pathlib import Path | ||
|
||
import pandas as pd | ||
|
||
from rdagent.components.coder.model_coder.model import ModelExperiment, ModelFBWorkspace | ||
from rdagent.components.runner import CachedRunner | ||
from rdagent.components.runner.conf import RUNNER_SETTINGS | ||
from rdagent.core.developer import Developer | ||
from rdagent.core.exception import ModelEmptyError | ||
from rdagent.log import rdagent_logger as logger | ||
from rdagent.scenarios.kaggle.experiment.model_experiment import KGModelExperiment | ||
from rdagent.utils.env import KGDockerEnv | ||
|
||
|
||
class KGModelRunner(CachedRunner[KGModelExperiment]):
    """Runner that executes the generated "model.py" in the experiment workspace, with optional result caching."""

    def develop(self, exp: KGModelExperiment) -> KGModelExperiment:
        """
        Run the experiment and store the outcome on `exp.result`.

        When `RUNNER_SETTINGS.cache_result` is enabled, a cached result is returned
        if one is found, and a freshly computed result is written to the cache.

        Raises:
            ModelEmptyError: the first sub-workspace holds no "model.py" code.
        """
        if RUNNER_SETTINGS.cache_result:
            found, cached = self.get_cache_result(exp)
            if found:
                exp.result = cached
                return exp

        model_code = exp.sub_workspace_list[0].code_dict.get("model.py")
        if model_code is None:
            raise ModelEmptyError("model.py is empty")

        # Replace the workspace's model.py with the generated implementation.
        exp.experiment_workspace.inject_code(**{"model.py": model_code})

        outcome = exp.experiment_workspace.execute(run_env={"PYTHONPATH": "./"})
        exp.result = outcome

        if RUNNER_SETTINGS.cache_result:
            self.dump_cache_result(exp, outcome)

        return exp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
# Base image with PyTorch preinstalled; `latest` is a floating tag.
FROM pytorch/pytorch:latest
# For GPU support, please choose the proper tag from https://hub.docker.com/r/pytorch/pytorch/tags

# System tools (curl, vim, git, compiler toolchain); the apt package lists are
# removed in the same layer so they do not end up in the image.
RUN apt-get clean && apt-get update && apt-get install -y \
    curl \
    vim \
    git \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

# Directory used as the working directory for the remaining instructions.
WORKDIR /workspace

# Python build prerequisites.
RUN python -m pip install numpy
RUN python -m pip install --upgrade cython
# RUN python -m pip install -e .

# Data-handling, graph and machine-learning libraries, one image layer per package.
RUN python -m pip install pandas
# RUN pip install pyg_lib torch_scatter torch_sparse torch_cluster -f https://data.pyg.org/whl/torch-2.3.0%2Bcu121.html
RUN pip install torch_geometric
RUN pip install ogb
RUN pip install networkx
RUN pip install scikit-learn
RUN pip install catboost
RUN pip install xgboost
RUN pip install sparse
113 changes: 113 additions & 0 deletions
113
rdagent/scenarios/kaggle/experiment/model_experiment.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
import json | ||
from pathlib import Path | ||
|
||
from jinja2 import Environment, StrictUndefined | ||
|
||
from rdagent.components.coder.model_coder.model import ( | ||
ModelExperiment, | ||
ModelFBWorkspace, | ||
ModelTask, | ||
) | ||
from rdagent.core.prompts import Prompts | ||
from rdagent.core.scenario import Scenario | ||
from rdagent.oai.llm_utils import APIBackend | ||
from rdagent.scenarios.kaggle.experiment.workspace import KGFBWorkspace | ||
from rdagent.scenarios.kaggle.kaggle_crawler import crawl_descriptions | ||
|
||
prompt_dict = Prompts(file_path=Path(__file__).parent / "prompts.yaml") | ||
|
||
|
||
class KGModelExperiment(ModelExperiment[ModelTask, KGFBWorkspace, ModelFBWorkspace]):
    """Model experiment whose workspace is created from the `model_template` folder next to this file."""

    def __init__(self, *args, **kwargs) -> None:
        """Forward all arguments to the base class, then attach a `KGFBWorkspace` built from the template folder."""
        super().__init__(*args, **kwargs)
        template_dir = Path(__file__).parent / "model_template"
        self.experiment_workspace = KGFBWorkspace(template_folder_path=template_dir)
|
||
|
||
class KGModelScenario(Scenario):
    """Scenario for one Kaggle competition, described by crawled competition text that the LLM condenses into four fields."""

    def __init__(self, competition: str) -> None:
        """Crawl the descriptions of `competition` and immediately analyse them with the LLM."""
        super().__init__()
        self.competition = competition
        self.competition_descriptions = crawl_descriptions(competition)

        # Placeholders; overwritten by the analysis call below.
        self.competition_type = None
        self.competition_description = None
        self.target_description = None
        self.competition_features = None

        self._analysis_competition_description()

    def _analysis_competition_description(self):
        """Ask the LLM (JSON mode) to summarise the crawled descriptions and store the answer on the instance.

        Keys missing from the response are replaced by fixed placeholder strings.
        """
        templates = prompt_dict["kg_description_template"]

        sys_prompt = Environment(undefined=StrictUndefined).from_string(templates["system"]).render()
        user_prompt = (
            Environment(undefined=StrictUndefined)
            .from_string(templates["user"])
            .render(competition_descriptions=self.competition_descriptions)
        )

        raw_answer = APIBackend().build_messages_and_create_chat_completion(
            user_prompt=user_prompt,
            system_prompt=sys_prompt,
            json_mode=True,
        )
        analysis = json.loads(raw_answer)

        self.competition_type = analysis.get("Competition Type", "No type provided")
        self.competition_description = analysis.get("Competition Description", "No description provided")
        self.target_description = analysis.get("Target Description", "No target provided")
        self.competition_features = analysis.get("Competition Features", "No features provided")

    @property
    def background(self) -> str:
        """Background text rendered from the "kg_model_background" template and the four analysed fields."""
        template = Environment(undefined=StrictUndefined).from_string(prompt_dict["kg_model_background"])
        return template.render(
            competition_type=self.competition_type,
            competition_description=self.competition_description,
            target_description=self.target_description,
            competition_features=self.competition_features,
        )

    @property
    def source_data(self) -> str:
        """Not available for this scenario; always raises `NotImplementedError`."""
        raise NotImplementedError("source_data is not implemented")

    @property
    def output_format(self) -> str:
        """The "kg_model_output_format" prompt entry."""
        return prompt_dict["kg_model_output_format"]

    @property
    def interface(self) -> str:
        """The "kg_model_interface" prompt entry."""
        return prompt_dict["kg_model_interface"]

    @property
    def simulator(self) -> str:
        """The "kg_model_simulator" prompt entry."""
        return prompt_dict["kg_model_simulator"]

    @property
    def rich_style_description(self) -> str:
        """Short fixed description string of the scenario."""
        return """
kaggle scen """

    def get_scenario_all_desc(self) -> str:
        """Concatenate background, interface, output format and simulator into one description."""
        return f"""Background of the scenario:
{self.background}
The interface you should follow to write the runnable code:
{self.interface}
The output of your code should be in the format:
{self.output_format}
The simulator user can use to test your model:
{self.simulator}
"""
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
## This folder is a template to be copied from for each model implementation & running process. | ||
|
||
Components: Dummy model.py, versatile conf.yaml, and a result reader. |
Oops, something went wrong.