Commit

fix: refine some codes (#353)
* refine some codes

* fix ci errors

* update

* update advanced rag
WinstonLiyt authored Sep 26, 2024
1 parent b8b2cd6 commit 866c2e6
Showing 8 changed files with 80 additions and 39 deletions.
6 changes: 6 additions & 0 deletions rdagent/app/kaggle/conf.py
@@ -51,12 +51,18 @@ class Config:

     local_data_path: str = "/data/userdata/share/kaggle"

+    domain_knowledge_path: str = "/data/userdata/share/kaggle/domain_knowledge"

+    rag_path: str = "git_ignore_folder/rag"

     if_action_choosing_based_on_UCB: bool = False

     if_using_feature_selection: bool = False

+    if_using_graph_rag: bool = False

+    if_using_vector_rag: bool = False

     auto_submit: bool = True


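
The new settings gate which retrieval-augmented generation (RAG) backend the Kaggle scenario uses. A minimal sketch of how such flags might drive backend selection; the KaggleConf dataclass below is an illustrative stand-in, not the repo's actual settings class:

    # Illustrative only: mirrors the flag names added above.
    from dataclasses import dataclass

    @dataclass
    class KaggleConf:
        rag_path: str = "git_ignore_folder/rag"
        if_using_graph_rag: bool = False
        if_using_vector_rag: bool = False

    def pick_rag_backend(conf: KaggleConf) -> str:
        # Vector RAG is checked first, matching the if/elif order in feedback.py below.
        if conf.if_using_vector_rag:
            return f"vector store at {conf.rag_path}"
        if conf.if_using_graph_rag:
            return "knowledge graph backend"
        return "no RAG"

    print(pick_rag_backend(KaggleConf(if_using_vector_rag=True)))
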
26 changes: 15 additions & 11 deletions rdagent/scenarios/kaggle/developer/feedback.py
@@ -14,9 +14,6 @@
 )
 from rdagent.log import rdagent_logger as logger
 from rdagent.oai.llm_utils import APIBackend
-from rdagent.scenarios.kaggle.knowledge_management.extract_knowledge import (
-    extract_knowledge_from_feedback,
-)
 from rdagent.utils import convert2bool

prompt_dict = Prompts(file_path=Path(__file__).parent.parent / "prompts.yaml")
@@ -42,11 +39,10 @@ def process_results(self, current_result, sota_result):

         # Add a note about metric direction
         evaluation_direction = "higher" if self.scen.evaluation_metric_direction else "lower"
-        combined_df[
-            "Note"
-        ] = f"Direction of improvement (higher/lower is better) should be judged per metric. Here '{evaluation_direction}' is better for the metrics."
+        evaluation_description = f"Direction of improvement (higher/lower is better) should be judged per metric. Here '{evaluation_direction}' is better for the metrics."
+        combined_df["Note"] = evaluation_description

-        return combined_df
+        return combined_df, evaluation_description

def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trace) -> HypothesisFeedback:
"""
@@ -75,14 +71,17 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trace) -> HypothesisFeedback:
         except AttributeError:
             print(f"Warning: Task {task} does not have get_task_information_and_implementation_result method")

+        evaluation_description = None
         # Check if there are any based experiments
         if exp.based_experiments:
             sota_result = exp.based_experiments[-1].result
             # Process the results to filter important metrics
-            combined_result = self.process_results(current_result, sota_result)
+            combined_result, evaluation_description = self.process_results(current_result, sota_result)
         else:
             # If there are no based experiments, we'll only use the current result
-            combined_result = self.process_results(current_result, current_result)  # Compare with itself
+            combined_result, evaluation_description = self.process_results(
+                current_result, current_result
+            )  # Compare with itself
             print("Warning: No previous experiments to compare against. Using current result as baseline.")

available_features = {
@@ -129,6 +128,7 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trace) -> HypothesisFeedback:
"combined_result": combined_result, # This turn and sota
"hypothesis_text": hypothesis_text, # This turn
"task_details": tasks_factors, # This turn
"evaluation_description": evaluation_description,
}

usr_prompt = (
@@ -152,13 +152,17 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trace) -> HypothesisFeedback:
         experiment_feedback = {
             "hypothesis_text": hypothesis_text,
             "current_result": current_result,
             "tasks_factors": tasks_factors,
             "model_code": model_code,
             "available_features": available_features,
             "observations": observations,
             "hypothesis_evaluation": hypothesis_evaluation,
             "reason": reason,
         }

-        # self.scen.vector_base.add_experience_to_vector_base(experiment_feedback)
+        if self.scen.if_using_vector_rag:
+            self.scen.vector_base.add_experience_to_vector_base(experiment_feedback)
+        elif self.scen.if_using_graph_rag:
+            self.scen.trace.knowledge_base.load_from_documents([experiment_feedback], self.scen)

         return HypothesisFeedback(
             observations=observations,
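
process_results now returns a (DataFrame, description) pair so that the metric-direction note can be passed into the prompt as plain text instead of being read back out of the table. A self-contained sketch of the same pattern, with simplified inputs and column names:

    import pandas as pd

    def process_results(current: dict, sota: dict, higher_is_better: bool = True):
        combined = pd.DataFrame({"current": current, "sota": sota})
        direction = "higher" if higher_is_better else "lower"
        description = f"Here '{direction}' is better for the metrics."
        combined["Note"] = description
        # Returning the description separately lets the caller feed it to a prompt template.
        return combined, description

    combined_df, evaluation_description = process_results({"auc": 0.81}, {"auc": 0.79})
    print(combined_df)
    print(evaluation_description)
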
12 changes: 11 additions & 1 deletion rdagent/scenarios/kaggle/experiment/scenario.py
@@ -35,9 +35,16 @@ def __init__(self, competition: str) -> None:
         self.model_output_channel = None
         self.evaluation_desc = None
         self.evaluation_metric_direction = None
+        self.vector_base = None
         self._analysis_competition_description()
         self.if_action_choosing_based_on_UCB = KAGGLE_IMPLEMENT_SETTING.if_action_choosing_based_on_UCB
         self.if_using_feature_selection = KAGGLE_IMPLEMENT_SETTING.if_using_feature_selection
+        self.if_using_graph_rag = KAGGLE_IMPLEMENT_SETTING.if_using_graph_rag
+        self.if_using_vector_rag = KAGGLE_IMPLEMENT_SETTING.if_using_vector_rag
+
+        if self.if_using_vector_rag and KAGGLE_IMPLEMENT_SETTING.rag_path:
+            self.vector_base = KaggleExperienceBase()
+            self.vector_base.load(KAGGLE_IMPLEMENT_SETTING.rag_path)

         self._output_format = self.output_format
         self._interface = self.interface
@@ -124,6 +131,9 @@ def source_data(self) -> str:

         if (data_folder / "X_valid.pkl").exists():
             X_valid = pd.read_pickle(data_folder / "X_valid.pkl")
+            # TODO: Hardcoded for now, need to be fixed
+            if self.competition == "feedback-prize-english-language-learning":
+                return "This is a sparse matrix of descriptive text."
             buffer = io.StringIO()
             X_valid.info(verbose=True, buf=buffer, show_counts=True)
             data_info = buffer.getvalue()
@@ -187,7 +197,7 @@ def simulator(self) -> str:
     @property
     def rich_style_description(self) -> str:
         return f"""
-This is the Kaggle scenario for the competition: {KAGGLE_IMPLEMENT_SETTING.competition}
+This is the Kaggle scenario for the competition: {self.competition}
 """

     def get_scenario_all_desc(self) -> str:
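
The scenario now defaults vector_base to None and only constructs it when vector RAG is enabled and a rag_path is configured, so every consumer must tolerate the None default. A sketch of this guarded-initialization pattern; VectorBase here is a stand-in, not the repo's KaggleExperienceBase:

    class VectorBase:
        def load(self, path: str) -> None:
            print(f"loading vectors from {path}")

    class Scenario:
        def __init__(self, use_vector_rag: bool, rag_path: str | None):
            self.vector_base = None  # stays None unless vector RAG is enabled
            if use_vector_rag and rag_path:
                self.vector_base = VectorBase()
                self.vector_base.load(rag_path)

    scen = Scenario(use_vector_rag=True, rag_path="git_ignore_folder/rag")
    if scen.vector_base is not None:  # callers must handle the None default
        print("vector base ready")
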
27 changes: 21 additions & 6 deletions rdagent/scenarios/kaggle/knowledge_management/graph.py
@@ -1,4 +1,5 @@
 import json
+from datetime import datetime, timezone
 from pathlib import Path
 from typing import List

@@ -20,21 +21,31 @@


 class KGKnowledgeGraph(UndirectedGraph):
-    def __init__(self, path: str | Path | None, scenario: KGScenario) -> None:
+    def __init__(self, path: str | Path | None, scenario: KGScenario | None) -> None:
         super().__init__(path)
-        if path is not None and not Path(path).exists():
+        if path is not None and Path(path).exists():
+            self.load()
+            self.path = Path(path).parent / (
+                datetime.now(timezone.utc).strftime("%Y-%m-%d-%H-%M-%S") + "_kaggle_kb.pkl"
+            )
+        else:
             documents = []
-            for file_path in (Path(KAGGLE_IMPLEMENT_SETTING.local_data_path) / "domain_knowledge").glob("*.case"):
+            print(Path(KAGGLE_IMPLEMENT_SETTING.domain_knowledge_path))
+            for file_path in (Path(KAGGLE_IMPLEMENT_SETTING.domain_knowledge_path)).rglob("*.case"):
                 with open(file_path, "r") as f:
                     documents.append(f.read())
             self.load_from_documents(documents=documents, scenario=scenario)
             self.dump()

-    def analyze_one_document(self, document_content: str, scenario: KGScenario) -> list:
+    def add_document(self, document_content: str, scenario: KGScenario | None) -> None:
+        self.load_from_documents([document_content], scenario)
+        self.dump()  # Each valid experiment will overwrite this file once again.
+
+    def analyze_one_document(self, document_content: str, scenario: KGScenario | None) -> list:
         session_system_prompt = (
             Environment(undefined=StrictUndefined)
             .from_string(PROMPT_DICT["extract_knowledge_graph_from_document"]["system"])
-            .render(scenario=scenario.get_scenario_all_desc())
+            .render(scenario=scenario.get_scenario_all_desc() if scenario is not None else "")
         )

         session = APIBackend().build_chat_session(
Expand All @@ -53,7 +64,7 @@ def analyze_one_document(self, document_content: str, scenario: KGScenario) -> l
             user_prompt = "Continue from the last step please. Don't extract the same knowledge again."
         return knowledge_list

-    def load_from_documents(self, documents: List[str], scenario: KGScenario):
+    def load_from_documents(self, documents: List[str], scenario: KGScenario | None) -> None:
         knowledge_list_list = multiprocessing_wrapper(
             [
                 (
@@ -105,3 +116,7 @@ def load_from_documents(self, documents: List[str], scenario: KGScenario):
         node_list = self.batch_embedding(node_list)
         for node_pair in node_pairs:
             self.add_node(node_pair[0], node_pair[1])
+
+
+if __name__ == "__main__":
+    graph = KGKnowledgeGraph(path="git_ignore_folder/kg_graph.pkl", scenario=None)
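
The constructor now reloads an existing knowledge base and redirects future dumps to a UTC-timestamped sibling file, so a loaded graph is never overwritten in place. The naming idiom in isolation:

    from datetime import datetime, timezone
    from pathlib import Path

    path = Path("git_ignore_folder/kg_graph.pkl")
    # Subsequent dumps go to a timestamped sibling, e.g. 2024-09-26-07-30-00_kaggle_kb.pkl,
    # preserving the file the graph was loaded from.
    dump_path = path.parent / (datetime.now(timezone.utc).strftime("%Y-%m-%d-%H-%M-%S") + "_kaggle_kb.pkl")
    print(dump_path)
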
15 changes: 9 additions & 6 deletions rdagent/scenarios/kaggle/knowledge_management/prompts.yaml
@@ -41,11 +41,14 @@ extract_kaggle_knowledge_from_feedback_prompts:
 extract_knowledge_graph_from_document:
   system: |-
-    You are helping user to extract knowledge from a document.
-    The user is working on data science competitions in Kaggle in the following scenario:
-    {{ scenario }}
+    You are helping the user extract knowledge from a document.
+    {% if scenario %}
+    The user is working on data science competitions in Kaggle, with the following scenario: {{ scenario }}
+    {% else %}
+    The user is working on general data science competitions on Kaggle.
+    {% endif %}
-    The user has found some possible high value documents from other experts, and they need your help to extract some knowledge from these documents.
+    The user has identified valuable documents from other experts and requires your help to extract meaningful insights from them.
     Considering each document might contain several valuable insights, you need to extract them one by one and organize them in a structured format.
@@ -58,13 +61,13 @@
     Please provide the analysis in the following JSON format:
     {
-        "competition": "(Plain text) extracted competition information, including the competition name, type, description, target, and features",
+        "competition": "(Plain text) extracted competition information, including the competition name, type, description, target, and features (If no specific competition name or other fields are found, leave them blank).",
         "hypothesis":
         {
             "type": "one of the hypothesis types from ['Feature engineering', 'Feature processing', 'Model feature selection', 'Model tuning']",
             "explanation": "(Plain text) extracted detailed explanation to the hypothesis"
         },
-        "experiments": "(Plain text) extracted experiments details. You can list them in bullet points.",
+        "experiments": "(Plain text) Detailed descriptions of the experiments conducted in the document, which can be listed in bullet points.",
         "code": "extracted code snippets if available",
"conclusion":
{
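
Because graph.py renders this template with StrictUndefined and passes an empty string when no scenario is available, the new {% if scenario %} guard is what lets the prompt degrade gracefully. A minimal reproduction of that behavior (template abridged):

    from jinja2 import Environment, StrictUndefined

    template = (
        "{% if scenario %}Scenario: {{ scenario }}"
        "{% else %}General Kaggle competition.{% endif %}"
    )
    env = Environment(undefined=StrictUndefined)
    print(env.from_string(template).render(scenario=""))                    # falls into the else branch
    print(env.from_string(template).render(scenario="Tabular regression"))  # uses the scenario
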
15 changes: 9 additions & 6 deletions rdagent/scenarios/kaggle/knowledge_management/vector_base.py
@@ -1,3 +1,4 @@
+from datetime import datetime, timezone
 from pathlib import Path
 from typing import List, Union

@@ -107,7 +108,7 @@ class KaggleExperienceBase(PDVectorBase):
Class for handling Kaggle competition experience posts and organizing them for reference
"""

-    def __init__(self, path: Union[str, Path] = None, kaggle_experience_path: Union[str, Path] = None):
+    def __init__(self, vector_df_path: Union[str, Path] = None, kaggle_experience_path: Union[str, Path] = None):
"""
Initialize the KaggleExperienceBase class
@@ -118,12 +119,14 @@ def __init__(self, path: Union[str, Path] = None, kaggle_experience_path: Union[str, Path] = None):
kaggle_experience_path: str or Path, optional
Path to the Kaggle experience post data.
"""
-        super().__init__(path)
+        super().__init__(vector_df_path)
         self.kaggle_experience_path = kaggle_experience_path
         self.kaggle_experience_data = []

         if kaggle_experience_path:
             self.load_kaggle_experience(kaggle_experience_path)
+        # if path is not None and Path(path).exists():
+        #     self.load_kaggle_experience(kaggle_experience_path)
+        #     self.path = Path(path).parent / (datetime.now(timezone.utc).strftime("%Y-%m-%d-%H-%M-%S") + "_kaggle_kb.pkl")
+        # else:
+        #     pass

def add(self, document: Union[KGDocument, List[KGDocument]]):
document.split_into_trunk()
@@ -258,7 +261,7 @@ def search_experience(self, query: str, topk_k: int = 5, similarity_threshold: f

kaggle_base.add_experience_to_vector_base()

-    kaggle_base.save()
+    kaggle_base.save("git_ignore_folder/experience/tabular_cases/kaggle_vector_base.pkl")

print(f"There are {kaggle_base.shape()[0]} records in the vector base.")

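
Renaming the first parameter from path to vector_df_path is source-compatible for positional callers but breaks any call site that passed path as a keyword; a small illustration of why such renames deserve a changelog note:

    # Stand-in functions; signatures mirror the rename above.
    def old_init(path=None, kaggle_experience_path=None):
        return path

    def new_init(vector_df_path=None, kaggle_experience_path=None):
        return vector_df_path

    old_init(path="base.pkl")      # worked before the rename
    new_init("base.pkl")           # positional callers are unaffected
    try:
        new_init(path="base.pkl")  # keyword callers now raise TypeError
    except TypeError as exc:
        print(exc)
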
12 changes: 6 additions & 6 deletions rdagent/scenarios/kaggle/prompts.yaml
@@ -173,8 +173,8 @@ model_tuning_feedback_generation:
{{ combined_result }}
Analyze the combined result in the context of its ability to:
-    1. Hypothesis Evaluation: Does the result support or refute the hypothesis?
-    2. Result Comparison: How does the result compare to the best? (Refer to "higher is better" or "lower is better" in the combined result).
+    1. Result Comparison: How does the result compare to the best? {{ evaluation_description }}
+    2. To a large extent, the experiment with better metrics is the better one.
Consider Changing Direction for Significant Gaps with the Best Result and the last round:
- If the new results significantly differ from SOTA, consider a new direction.
@@ -234,8 +234,8 @@ factor_feedback_generation:
{{ combined_result }}
Analyze the combined result in the context of its ability to:
-    1. Hypothesis Evaluation: Does the result support or refute the hypothesis?
-    2. Result Comparison: How does the result compare to the best? (Refer to "higher is better" or "lower is better" in the combined result).
+    1. Result Comparison: How does the result compare to the best? {{ evaluation_description }}
+    2. To a large extent, the experiment with better metrics is the better one.
Consider Changing Direction for Significant Gaps with the Best Result:
- If the new results significantly differ from the best, consider exploring a new direction.
@@ -282,8 +282,8 @@ feature_selection_feedback_generation:
{{ combined_result }}
Analyze the combined result in the context of its ability to:
-    1. Hypothesis Evaluation: Does the result support or refute the hypothesis?
-    2. Result Comparison: How does the result compare to the best? (Refer to "higher is better" or "lower is better" in the combined result).
+    1. Result Comparison: How does the result compare to the best? {{ evaluation_description }}
+    2. To a large extent, the experiment with better metrics is the better one.
In your feedback, consider:
1. How effective is the current feature selection strategy?
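
All three feedback prompts now interpolate the evaluation_description computed in feedback.py instead of a fixed parenthetical, so the metric-direction guidance always matches the current competition. A sketch of the render step, assuming the same Jinja2 setup the repo uses elsewhere (template abridged):

    from jinja2 import Environment, StrictUndefined

    template = "1. Result Comparison: How does the result compare to the best? {{ evaluation_description }}"
    rendered = (
        Environment(undefined=StrictUndefined)
        .from_string(template)
        .render(evaluation_description="Here 'higher' is better for the metrics.")
    )
    print(rendered)
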
6 changes: 3 additions & 3 deletions rdagent/scenarios/kaggle/proposal/proposal.py
@@ -29,13 +29,13 @@
prompt_dict = Prompts(file_path=Path(__file__).parent.parent / "prompts.yaml")


-KG_ACTION_FEATURE_ENGINEERING = "Feature engineering"
 KG_ACTION_FEATURE_PROCESSING = "Feature processing"
+KG_ACTION_FEATURE_ENGINEERING = "Feature engineering"
KG_ACTION_MODEL_FEATURE_SELECTION = "Model feature selection"
KG_ACTION_MODEL_TUNING = "Model tuning"
KG_ACTION_LIST = [
-    KG_ACTION_FEATURE_ENGINEERING,
     KG_ACTION_FEATURE_PROCESSING,
+    KG_ACTION_FEATURE_ENGINEERING,
     *([KG_ACTION_MODEL_FEATURE_SELECTION] if KAGGLE_IMPLEMENT_SETTING.if_using_feature_selection else []),
     KG_ACTION_MODEL_TUNING,
 ]
@@ -94,7 +94,7 @@ def __init__(self, scen: Scenario) -> Tuple[dict, bool]:
self.initial_performance = 0.0

     def generate_RAG_content(self, trace: Trace) -> str:
-        if trace.knowledge_base is None:
+        if self.scen.if_using_graph_rag is False or trace.knowledge_base is None:
             return None
         same_competition_node = trace.knowledge_base.get_node_by_content(trace.scen.get_competition_full_desc())
         if same_competition_node is not None:
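
KG_ACTION_LIST includes the feature-selection action only when the corresponding flag is set, using list unpacking of a conditional singleton. The idiom in isolation, with an illustrative flag value:

    IF_USING_FEATURE_SELECTION = True  # stand-in for KAGGLE_IMPLEMENT_SETTING.if_using_feature_selection

    KG_ACTION_LIST = [
        "Feature processing",
        "Feature engineering",
        *(["Model feature selection"] if IF_USING_FEATURE_SELECTION else []),
        "Model tuning",
    ]
    print(KG_ACTION_LIST)
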
