feat: better feedback & evaluation (#346)
* Updated new keys for evaluation

* fix the bug in feedback

---------

Co-authored-by: WinstonLiye <[email protected]>
xisen-w and WinstonLiyt authored Sep 26, 2024
1 parent c18cc6a commit cc9a8c1
Showing 4 changed files with 125 additions and 102 deletions.
69 changes: 35 additions & 34 deletions rdagent/scenarios/kaggle/developer/feedback.py
@@ -23,29 +23,31 @@
DIRNAME = Path(__file__).absolute().resolve().parent


def process_results(current_result, sota_result):
# Convert the results to dataframes
current_df = pd.DataFrame(current_result)
sota_df = pd.DataFrame(sota_result)

# Combine the dataframes on the Metric index
combined_df = pd.concat([current_df, sota_df], axis=1)
combined_df.columns = ["current_df", "sota_df"]

combined_df["the largest"] = combined_df.apply(
lambda row: "sota_df"
if row["sota_df"] > row["current_df"]
else ("Equal" if row["sota_df"] == row["current_df"] else "current_df"),
axis=1,
)

# Add a note about metric direction
combined_df["Note"] = "Direction of improvement (higher/lower is better) should be judged per metric"

return combined_df


class KGHypothesisExperiment2Feedback(HypothesisExperiment2Feedback):
def process_results(self, current_result, sota_result):
# Convert the results to dataframes
current_df = pd.DataFrame(current_result)
sota_df = pd.DataFrame(sota_result)

# Combine the dataframes on the Metric index
combined_df = pd.concat([current_df, sota_df], axis=1)
combined_df.columns = ["current_df", "sota_df"]

# combined_df["the largest"] = combined_df.apply(
# lambda row: "sota_df"
# if row["sota_df"] > row["current_df"]
# else ("Equal" if row["sota_df"] == row["current_df"] else "current_df"),
# axis=1,
# )

# Add a note about metric direction
evaluation_direction = "higher" if self.scen.evaluation_metric_direction else "lower"
combined_df[
"Note"
] = f"Direction of improvement (higher/lower is better) should be judged per metric. Here '{evaluation_direction}' is better for the metrics."

return combined_df

def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trace) -> HypothesisFeedback:
"""
The `ti` should be executed and the results should be included, as well as the comparison between previous results (done by LLM).
@@ -77,10 +79,10 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trace) -> HypothesisFeedback:
if exp.based_experiments:
sota_result = exp.based_experiments[-1].result
# Process the results to filter important metrics
combined_result = process_results(current_result, sota_result)
combined_result = self.process_results(current_result, sota_result)
else:
# If there are no based experiments, we'll only use the current result
combined_result = process_results(current_result, current_result) # Compare with itself
combined_result = self.process_results(current_result, current_result) # Compare with itself
print("Warning: No previous experiments to compare against. Using current result as baseline.")

available_features = {
@@ -113,35 +115,34 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trace) -> HypothesisFeedback:

# Prepare render dictionary
render_dict = {
"context": self.scen.get_scenario_all_desc(),
"last_hypothesis": trace.hist[-1][0] if trace.hist else None,
"last_task_and_code": last_task_and_code,
"last_result": trace.hist[-1][1].result if trace.hist else None,
"sota_task_and_code": exp.based_experiments[-1].experiment_workspace.data_description
if exp.based_experiments
else None,
"sota_result": exp.based_experiments[-1].result if exp.based_experiments else None,
"hypothesis": hypothesis,
"exp": exp,
"model_code": model_code,
"available_features": available_features,
"combined_result": combined_result,
"hypothesis_text": hypothesis_text,
"task_details": tasks_factors,
"model_code": model_code, # This turn
"available_features": available_features, # This turn
"combined_result": combined_result, # This turn and sota
"hypothesis_text": hypothesis_text, # This turn
"task_details": tasks_factors, # This turn
}

# Generate the user prompt
usr_prompt = (
Environment(undefined=StrictUndefined).from_string(prompt_dict[prompt_key]["user"]).render(**render_dict)
)

# Call the APIBackend to generate the response for hypothesis feedback
response = APIBackend().build_messages_and_create_chat_completion(
user_prompt=usr_prompt,
system_prompt=sys_prompt,
json_mode=True,
)

# Parse the JSON response to extract the feedback
response_json = json.loads(response)

# Extract fields from JSON response
observations = response_json.get("Observations", "No observations provided")
hypothesis_evaluation = response_json.get("Feedback for Hypothesis", "No feedback provided")
new_hypothesis = response_json.get("New Hypothesis", "No new hypothesis provided")
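For reference, below is a minimal standalone sketch of the combining logic that the new process_results method implements. The metric names and values are invented, and the direction flag stands in for the scenario's evaluation_metric_direction; this is illustrative, not the repository's code.

# Minimal sketch mirroring the new process_results method; metric names and
# values are illustrative, not taken from a real competition.
import pandas as pd


def combine_results(current_result, sota_result, higher_is_better: bool) -> pd.DataFrame:
    # Convert both results to dataframes indexed by metric name
    current_df = pd.DataFrame(current_result)
    sota_df = pd.DataFrame(sota_result)

    # Combine the dataframes on the metric index and name the columns as in the diff
    combined_df = pd.concat([current_df, sota_df], axis=1)
    combined_df.columns = ["current_df", "sota_df"]

    # Annotate the direction of improvement, as the method does via the scenario flag
    direction = "higher" if higher_is_better else "lower"
    combined_df["Note"] = (
        "Direction of improvement (higher/lower is better) should be judged per metric. "
        f"Here '{direction}' is better for the metrics."
    )
    return combined_df


# Example usage with made-up scores
current = {"score": {"accuracy": 0.81, "f1": 0.78}}
sota = {"score": {"accuracy": 0.79, "f1": 0.80}}
print(combine_results(current, sota, higher_is_better=True))
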
2 changes: 2 additions & 0 deletions rdagent/scenarios/kaggle/experiment/prompts.yaml
@@ -11,6 +11,8 @@ kg_description_template:
"Competition Features": "Two-line description of the overall features involved within the competition as background."
"Submission Specifications": "The submission specification & sample submission csv descriptions for the model to output."
"Submission channel number to each sample": "The number of channels in the output for each sample, e.g., 1 for regression, N for N class classification with probabilities, etc. A Integer. If not specified, it is 1."
"Evaluation Description": "A brief description for what metrics are used in evaluation. An explanation of whether a higher score is better or lower is better in terms of performance."
"Evaluation Boolean": "True" or "False" (True means the higher score the better (like accuracy); False means the lower value the better (like loss).)
}
Since these might be very similar column names in data like one_hot_encoded columns, you can use some regex to group them together.
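An illustrative shape of the analysis JSON the template asks the model to return for the two new keys is sketched below; the field values are invented, and the .get fallbacks mirror how scenario.py reads them in the next file.

# Invented example of the analysis JSON with the two new keys; parsing mirrors
# the .get() calls added to scenario.py below.
import json

raw_response = """
{
  "Evaluation Description": "Submissions are scored with RMSE; lower scores indicate better performance.",
  "Evaluation Boolean": "False"
}
"""
response_json_analysis = json.loads(raw_response)

evaluation_desc = response_json_analysis.get(
    "Evaluation Description", "No evaluation specification provided."
)
evaluation_metric_direction = response_json_analysis.get(
    "Evaluation Boolean", "No evaluation specification provided."
)
print(evaluation_desc)
print(evaluation_metric_direction)
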
20 changes: 18 additions & 2 deletions rdagent/scenarios/kaggle/experiment/scenario.py
@@ -33,6 +33,8 @@ def __init__(self, competition: str) -> None:
self.competition_features = None
self.submission_specifications = None
self.model_output_channel = None
self.evaluation_desc = None
self.evaluation_metric_direction = None
self._analysis_competition_description()
self.if_action_choosing_based_on_UCB = KAGGLE_IMPLEMENT_SETTING.if_action_choosing_based_on_UCB
self.if_using_feature_selection = KAGGLE_IMPLEMENT_SETTING.if_using_feature_selection
@@ -73,12 +75,23 @@ def _analysis_competition_description(self):
"Submission Specifications", "No submission requirements provided"
)
self.model_output_channel = response_json_analysis.get("Submission channel number to each sample", 1)
self.evaluation_desc = response_json_analysis.get(
"Evaluation Description", "No evaluation specification provided."
)
self.evaluation_metric_direction = response_json_analysis.get(
"Evaluation Boolean", "No evaluation specification provided."
)

def get_competition_full_desc(self) -> str:
evaluation_direction = "higher the better" if self.evaluation_metric_direction else "lower the better"
return f"""Competition Type: {self.competition_type}
Competition Description: {self.competition_description}
Target Description: {self.target_description}
Competition Features: {self.competition_features}
Submission Specifications: {self.submission_specifications}
Model Output Channel: {self.model_output_channel}
Evaluation Descriptions: {self.evaluation_desc}
Is the evaluation metric the higher the better: {evaluation_direction}
"""

@property
@@ -99,6 +112,8 @@ def background(self) -> str:
target_description=self.target_description,
competition_features=self.competition_features,
submission_specifications=self.submission_specifications,
evaluation_desc=self.evaluation_desc,
evaluate_bool=self.evaluation_metric_direction,
)
)
return background_prompt
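
A minimal illustration of feeding the two new fields into a Jinja2 prompt render follows; the template text is invented, and only the keyword names evaluation_desc and evaluate_bool come from the render call above.

# Illustrative only: the template string is invented; the keyword names match
# the render call in background() above.
from jinja2 import Environment, StrictUndefined

template = (
    "Evaluation: {{ evaluation_desc }}\n"
    "Higher is better: {{ evaluate_bool }}"
)
prompt = (
    Environment(undefined=StrictUndefined)
    .from_string(template)
    .render(
        evaluation_desc="Submissions are scored with AUC.",
        evaluate_bool=True,
    )
)
print(prompt)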
@@ -171,8 +186,9 @@ def simulator(self) -> str:

@property
def rich_style_description(self) -> str:
return """
kaggle scen """
return f"""
This is the Kaggle scenario for the competition: {KAGGLE_IMPLEMENT_SETTING.competition}
"""

def get_scenario_all_desc(self) -> str:
return f"""Background of the scenario:
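The description template specifies "Evaluation Boolean" as the strings "True"/"False", while the stored flag is later used in truthiness checks such as "higher" if self.evaluation_metric_direction else "lower". A hypothetical normalization helper, not part of this commit, could convert that value to a real bool first:

# Hypothetical helper, not part of this commit: normalize the template's
# "True"/"False" string into a real bool before truthiness checks.
def to_metric_direction(value) -> bool:
    if isinstance(value, bool):
        return value
    # Case-insensitive string handling; anything other than "true" means lower is better.
    return str(value).strip().lower() == "true"


assert to_metric_direction("True") is True
assert to_metric_direction("False") is False
assert to_metric_direction(True) is True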