feat: Added loop code for Kaggle scene. (#211)

* fuse all code into one commit * remove container auto * change remove method * add kaggle env start * change kaggle api * change structure * add crawler * add requirements * refeact the code * delete mistaken codes * merge docker settings and crawler * add chrome install README for crawler usage * Connect scen with Kaggle to download data * Reformat some files to pass CI. * fix some ci errors * fix a ci error * fix a ci error --------- Co-authored-by: Bowen Xian <[email protected]>
microsoft · Aug 19, 2024 · 975c327 · 975c327
1 parent e8f6af9
commit 975c327
Show file tree

Hide file tree

Showing 16 changed files with 848 additions and 0 deletions.
diff --git a/rdagent/app/kaggle/conf.py b/rdagent/app/kaggle/conf.py
@@ -0,0 +1,42 @@
+from pathlib import Path
+
+from pydantic_settings import BaseSettings
+
+from rdagent.components.workflow.conf import BasePropSetting
+
+
+class PropSetting(BasePropSetting):
+    class Config:
+        env_prefix = "KG_"
+        """Use `KG_` as prefix for environment variables"""
+        protected_namespaces = ()
+        """Add 'model_' to the protected namespaces"""
+
+    # 1) overriding the default
+    scen: str = "rdagent.scenarios.kaggle.experiment.model_experiment.KGModelScenario"
+    """Scenario class for data mining model"""
+
+    hypothesis_gen: str = "rdagent.scenarios.kaggle.proposal.model_proposal.KGModelHypothesisGen"
+    """Hypothesis generation class"""
+
+    hypothesis2experiment: str = "rdagent.scenarios.kaggle.proposal.model_proposal.KGModelHypothesis2Experiment"
+    """Hypothesis to experiment class"""
+
+    coder: str = "rdagent.scenarios.kaggle.developer.model_coder.KGModelCoSTEER"
+    """Coder class"""
+
+    runner: str = "rdagent.scenarios.kaggle.developer.model_runner.KGModelRunner"
+    """Runner class"""
+
+    summarizer: str = "rdagent.scenarios.kaggle.developer.feedback.KGModelHypothesisExperiment2Feedback"
+    """Summarizer class"""
+
+    evolving_n: int = 10
+    """Number of evolutions"""
+
+    evolving_n: int = 10
+
+    competition: str = ""
+
+
+# Module-level settings instance, created once at import time.
+PROP_SETTING = PropSetting()
diff --git a/rdagent/app/kaggle/model.py b/rdagent/app/kaggle/model.py
@@ -0,0 +1,65 @@
+from collections import defaultdict
+
+import fire
+
+from rdagent.app.kaggle.conf import PROP_SETTING
+from rdagent.components.workflow.conf import BasePropSetting
+from rdagent.components.workflow.rd_loop import RDLoop
+from rdagent.core.developer import Developer
+from rdagent.core.exception import ModelEmptyError
+from rdagent.core.proposal import (
+    Hypothesis2Experiment,
+    HypothesisExperiment2Feedback,
+    HypothesisGen,
+    Trace,
+)
+from rdagent.core.scenario import Scenario
+from rdagent.core.utils import import_class
+from rdagent.log import rdagent_logger as logger
+
+
+class ModelRDLoop(RDLoop):
+    def __init__(self, PROP_SETTING: BasePropSetting):
+        with logger.tag("init"):
+            scen: Scenario = import_class(PROP_SETTING.scen)(PROP_SETTING.competition)
+            logger.log_object(scen, tag="scenario")
+
+            self.hypothesis_gen: HypothesisGen = import_class(PROP_SETTING.hypothesis_gen)(scen)
+            logger.log_object(self.hypothesis_gen, tag="hypothesis generator")
+
+            self.hypothesis2experiment: Hypothesis2Experiment = import_class(PROP_SETTING.hypothesis2experiment)()
+            logger.log_object(self.hypothesis2experiment, tag="hypothesis2experiment")
+
+            self.coder: Developer = import_class(PROP_SETTING.coder)(scen)
+            logger.log_object(self.coder, tag="coder")
+            self.runner: Developer = import_class(PROP_SETTING.runner)(scen)
+            logger.log_object(self.runner, tag="runner")
+
+            self.summarizer: HypothesisExperiment2Feedback = import_class(PROP_SETTING.summarizer)(scen)
+            logger.log_object(self.summarizer, tag="summarizer")
+            self.trace = Trace(scen=scen)
+            super(RDLoop, self).__init__()
+
+    skip_loop_error = (ModelEmptyError,)
+
+
+def main(path=None, step_n=None, competition=None):
+    """
+    Auto R&D Evolving loop for models in a kaggle{} scenario.
+
+    You can continue running session by
+
+    .. code-block:: python
+
+        dotenv run -- python rdagent/app/kaggle/model.py [--competition titanic] $LOG_PATH/__session__/1/0_propose  --step_n 1   # `step_n` is a optional paramter
+
+    """
+    if competition:
+        PROP_SETTING.competition = competition
+    if path is None:
+        model_loop = ModelRDLoop(PROP_SETTING)
+    else:
+        model_loop = ModelRDLoop.load(path)
+    model_loop.run(step_n=step_n)
+
+
+if __name__ == "__main__":
+    fire.Fire(main)
diff --git a/rdagent/scenarios/kaggle/README.md b/rdagent/scenarios/kaggle/README.md
@@ -0,0 +1,20 @@
+# Kaggle Crawler
+
+## Install chrome & chromedriver for Linux
+
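+Run the following commands from a single working directory. Replace `<chrome-version>` in the chromedriver URL with the version printed by `google-chrome --version`: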
+```shell
+# install chrome
+wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
+sudo apt install ./google-chrome-stable_current_amd64.deb
+google-chrome --version
+
+# install chromedriver
+wget https://storage.googleapis.com/chrome-for-testing-public/<chrome-version>/linux64/chromedriver-linux64.zip
+unzip chromedriver-linux64.zip
+cd chromedriver-linux64
+sudo mv chromedriver /usr/local/bin
+sudo chmod +x /usr/local/bin/chromedriver
+
+chromedriver --version
+```
diff --git a/rdagent/scenarios/kaggle/developer/feedback.py b/rdagent/scenarios/kaggle/developer/feedback.py
@@ -0,0 +1,68 @@
+import json
+from pathlib import Path
+
+from jinja2 import Environment, StrictUndefined
+
+from rdagent.core.experiment import Experiment
+from rdagent.core.prompts import Prompts
+from rdagent.core.proposal import (
+    Hypothesis,
+    HypothesisExperiment2Feedback,
+    HypothesisFeedback,
+    Trace,
+)
+from rdagent.log import rdagent_logger as logger
+from rdagent.oai.llm_utils import APIBackend
+from rdagent.utils import convert2bool
+
+# Prompts are loaded from the qlib scenario's file (`<scenarios>/qlib/prompts.yaml`),
+# three directory levels above this module.
+feedback_prompts = Prompts(file_path=Path(__file__).parent.parent.parent / "qlib" / "prompts.yaml")
+# Absolute, resolved directory that contains this module.
+DIRNAME = Path(__file__).absolute().resolve().parent
+
+
+class KGModelHypothesisExperiment2Feedback(HypothesisExperiment2Feedback):
+    """Generated feedbacks on the hypothesis from **Executed** Implementations of different tasks & their comparisons with previous performances"""
+
+    def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trace) -> HypothesisFeedback:
+        """
+        The `ti` should be executed and the results should be included, as well as the comparison between previous results (done by LLM).
+        For example: `mlflow` of Qlib will be included.
+        """
+
+        logger.info("Generating feedback...")
+        # Define the system prompt for hypothesis feedback
+        system_prompt = feedback_prompts["model_feedback_generation"]["system"]
+
+        # Define the user prompt for hypothesis feedback
+        context = trace.scen
+        SOTA_hypothesis, SOTA_experiment = trace.get_sota_hypothesis_and_experiment()
+
+        user_prompt = (
+            Environment(undefined=StrictUndefined)
+            .from_string(feedback_prompts["model_feedback_generation"]["user"])
+            .render(
+                context=context,
+                last_hypothesis=SOTA_hypothesis,
+                last_task=SOTA_experiment.sub_tasks[0].get_task_information() if SOTA_hypothesis else None,
+                last_code=SOTA_experiment.sub_workspace_list[0].code_dict.get("model.py") if SOTA_hypothesis else None,
+                last_result=SOTA_experiment.result if SOTA_hypothesis else None,
+                hypothesis=hypothesis,
+                exp=exp,
+            )
+        )
+
+        # Call the APIBackend to generate the response for hypothesis feedback
+        response_hypothesis = APIBackend().build_messages_and_create_chat_completion(
+            user_prompt=user_prompt,
+            system_prompt=system_prompt,
+            json_mode=True,
+        )
+
+        # Parse the JSON response to extract the feedback
+        response_json_hypothesis = json.loads(response_hypothesis)
+        return HypothesisFeedback(
+            observations=response_json_hypothesis.get("Observations", "No observations provided"),
+            hypothesis_evaluation=response_json_hypothesis.get("Feedback for Hypothesis", "No feedback provided"),
+            new_hypothesis=response_json_hypothesis.get("New Hypothesis", "No new hypothesis provided"),
+            reason=response_json_hypothesis.get("Reasoning", "No reasoning provided"),
+            decision=convert2bool(response_json_hypothesis.get("Decision", "false")),
+        )
diff --git a/rdagent/scenarios/kaggle/developer/model_coder.py b/rdagent/scenarios/kaggle/developer/model_coder.py
@@ -0,0 +1,3 @@
+from rdagent.components.coder.model_coder.CoSTEER import ModelCoSTEER
+
+# The Kaggle scenario uses the generic ModelCoSTEER as-is, exposed under a Kaggle-specific name.
+KGModelCoSTEER = ModelCoSTEER
diff --git a/rdagent/scenarios/kaggle/developer/model_runner.py b/rdagent/scenarios/kaggle/developer/model_runner.py
@@ -0,0 +1,38 @@
+import shutil
+import uuid
+from pathlib import Path
+
+import pandas as pd
+
+from rdagent.components.coder.model_coder.model import ModelExperiment, ModelFBWorkspace
+from rdagent.components.runner import CachedRunner
+from rdagent.components.runner.conf import RUNNER_SETTINGS
+from rdagent.core.developer import Developer
+from rdagent.core.exception import ModelEmptyError
+from rdagent.log import rdagent_logger as logger
+from rdagent.scenarios.kaggle.experiment.model_experiment import KGModelExperiment
+from rdagent.utils.env import KGDockerEnv
+
+
+class KGModelRunner(CachedRunner[KGModelExperiment]):
+    def develop(self, exp: KGModelExperiment) -> KGModelExperiment:
+        if RUNNER_SETTINGS.cache_result:
+            cache_hit, result = self.get_cache_result(exp)
+            if cache_hit:
+                exp.result = result
+                return exp
+
+        if exp.sub_workspace_list[0].code_dict.get("model.py") is None:
+            raise ModelEmptyError("model.py is empty")
+        # to replace & inject code
+        exp.experiment_workspace.inject_code(**{"model.py": exp.sub_workspace_list[0].code_dict["model.py"]})
+
+        env_to_use = {"PYTHONPATH": "./"}
+
+        result = exp.experiment_workspace.execute(run_env=env_to_use)
+
+        exp.result = result
+        if RUNNER_SETTINGS.cache_result:
+            self.dump_cache_result(exp, result)
+
+        return exp
diff --git a/rdagent/scenarios/kaggle/docker/Dockerfile b/rdagent/scenarios/kaggle/docker/Dockerfile
@@ -0,0 +1,25 @@
+FROM pytorch/pytorch:latest
+# For GPU support, please choose the proper tag from https://hub.docker.com/r/pytorch/pytorch/tags
+
+# System tools; the apt package lists are removed in the same layer.
+RUN apt-get clean && apt-get update && apt-get install -y \
+    curl \
+    vim \
+    git \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /workspace
+
+RUN python -m pip install numpy
+RUN python -m pip install --upgrade cython
+# RUN python -m pip install -e .
+
+# Python packages installed on top of the base image.
+RUN python -m pip install pandas
+# RUN pip install pyg_lib torch_scatter torch_sparse torch_cluster -f https://data.pyg.org/whl/torch-2.3.0%2Bcu121.html
+RUN pip install torch_geometric
+RUN pip install ogb
+RUN pip install networkx
+RUN pip install scikit-learn
+RUN pip install catboost
+RUN pip install xgboost
+RUN pip install sparse
diff --git a/rdagent/scenarios/kaggle/experiment/model_experiment.py b/rdagent/scenarios/kaggle/experiment/model_experiment.py
@@ -0,0 +1,113 @@
+import json
+from pathlib import Path
+
+from jinja2 import Environment, StrictUndefined
+
+from rdagent.components.coder.model_coder.model import (
+    ModelExperiment,
+    ModelFBWorkspace,
+    ModelTask,
+)
+from rdagent.core.prompts import Prompts
+from rdagent.core.scenario import Scenario
+from rdagent.oai.llm_utils import APIBackend
+from rdagent.scenarios.kaggle.experiment.workspace import KGFBWorkspace
+from rdagent.scenarios.kaggle.kaggle_crawler import crawl_descriptions
+
+# Prompts for this scenario, loaded from the `prompts.yaml` next to this module.
+prompt_dict = Prompts(file_path=Path(__file__).parent / "prompts.yaml")
+
+
+class KGModelExperiment(ModelExperiment[ModelTask, KGFBWorkspace, ModelFBWorkspace]):
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+        self.experiment_workspace = KGFBWorkspace(template_folder_path=Path(__file__).parent / "model_template")
+
+
+class KGModelScenario(Scenario):
+    def __init__(self, competition: str) -> None:
+        super().__init__()
+        self.competition = competition
+        self.competition_descriptions = crawl_descriptions(competition)
+        self.competition_type = None
+        self.competition_description = None
+        self.target_description = None
+        self.competition_features = None
+        self._analysis_competition_description()
+
+    def _analysis_competition_description(self):
+        # TODO: use gpt to analyze the competition description
+
+        sys_prompt = (
+            Environment(undefined=StrictUndefined)
+            .from_string(prompt_dict["kg_description_template"]["system"])
+            .render()
+        )
+
+        user_prompt = (
+            Environment(undefined=StrictUndefined)
+            .from_string(prompt_dict["kg_description_template"]["user"])
+            .render(
+                competition_descriptions=self.competition_descriptions,
+            )
+        )
+
+        response_analysis = APIBackend().build_messages_and_create_chat_completion(
+            user_prompt=user_prompt,
+            system_prompt=sys_prompt,
+            json_mode=True,
+        )
+
+        response_json_analysis = json.loads(response_analysis)
+        self.competition_type = response_json_analysis.get("Competition Type", "No type provided")
+        self.competition_description = response_json_analysis.get("Competition Description", "No description provided")
+        self.target_description = response_json_analysis.get("Target Description", "No target provided")
+        self.competition_features = response_json_analysis.get("Competition Features", "No features provided")
+
+    @property
+    def background(self) -> str:
+        background_template = prompt_dict["kg_model_background"]
+
+        background_prompt = (
+            Environment(undefined=StrictUndefined)
+            .from_string(background_template)
+            .render(
+                competition_type=self.competition_type,
+                competition_description=self.competition_description,
+                target_description=self.target_description,
+                competition_features=self.competition_features,
+            )
+        )
+
+        return background_prompt
+
+    @property
+    def source_data(self) -> str:
+        raise NotImplementedError("source_data is not implemented")
+
+    @property
+    def output_format(self) -> str:
+        return prompt_dict["kg_model_output_format"]
+
+    @property
+    def interface(self) -> str:
+        return prompt_dict["kg_model_interface"]
+
+    @property
+    def simulator(self) -> str:
+        return prompt_dict["kg_model_simulator"]
+
+    @property
+    def rich_style_description(self) -> str:
+        return """
+kaggle scen """
+
+    def get_scenario_all_desc(self) -> str:
+        return f"""Background of the scenario:
+{self.background}
+The interface you should follow to write the runnable code:
+{self.interface}
+The output of your code should be in the format:
+{self.output_format}
+The simulator user can use to test your model:
+{self.simulator}
+"""
diff --git a/rdagent/scenarios/kaggle/experiment/model_template/README.md b/rdagent/scenarios/kaggle/experiment/model_template/README.md
@@ -0,0 +1,3 @@
+## This folder is a template to be copied from for each model implementation & running process. 
+
+Components: Dummy model.py, versatile conf.yaml, and a result reader.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		from rdagent.components.coder.model_coder.CoSTEER import ModelCoSTEER

		KGModelCoSTEER = ModelCoSTEER
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		## This folder is a template to be copied from for each model implementation & running process.

		Components: Dummy model.py, versatile conf.yaml, and a result reader.