feat: better feedback & evaluation (#346)
* Updated new keys for evaluation

* fix the bug in feedback

---------

Co-authored-by: WinstonLiye <[email protected]>
xisen-w and WinstonLiyt authored Sep 26, 2024
1 parent c18cc6a commit cc9a8c1
Showing 4 changed files with 125 additions and 102 deletions.
69 changes: 35 additions & 34 deletions rdagent/scenarios/kaggle/developer/feedback.py
@@ -23,29 +23,31 @@
DIRNAME = Path(__file__).absolute().resolve().parent


def process_results(current_result, sota_result):
# Convert the results to dataframes
current_df = pd.DataFrame(current_result)
sota_df = pd.DataFrame(sota_result)

# Combine the dataframes on the Metric index
combined_df = pd.concat([current_df, sota_df], axis=1)
combined_df.columns = ["current_df", "sota_df"]

combined_df["the largest"] = combined_df.apply(
lambda row: "sota_df"
if row["sota_df"] > row["current_df"]
else ("Equal" if row["sota_df"] == row["current_df"] else "current_df"),
axis=1,
)

# Add a note about metric direction
combined_df["Note"] = "Direction of improvement (higher/lower is better) should be judged per metric"

return combined_df


class KGHypothesisExperiment2Feedback(HypothesisExperiment2Feedback):
def process_results(self, current_result, sota_result):
# Convert the results to dataframes
current_df = pd.DataFrame(current_result)
sota_df = pd.DataFrame(sota_result)

# Combine the dataframes on the Metric index
combined_df = pd.concat([current_df, sota_df], axis=1)
combined_df.columns = ["current_df", "sota_df"]

# combined_df["the largest"] = combined_df.apply(
# lambda row: "sota_df"
# if row["sota_df"] > row["current_df"]
# else ("Equal" if row["sota_df"] == row["current_df"] else "current_df"),
# axis=1,
# )

# Add a note about metric direction
evaluation_direction = "higher" if self.scen.evaluation_metric_direction else "lower"
combined_df[
"Note"
] = f"Direction of improvement (higher/lower is better) should be judged per metric. Here '{evaluation_direction}' is better for the metrics."

return combined_df

def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trace) -> HypothesisFeedback:
"""
The `ti` should be executed and the results should be included, as well as the comparison between previous results (done by LLM).
@@ -77,10 +79,10 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trace) -> HypothesisFeedback:
if exp.based_experiments:
sota_result = exp.based_experiments[-1].result
# Process the results to filter important metrics
combined_result = process_results(current_result, sota_result)
combined_result = self.process_results(current_result, sota_result)
else:
# If there are no based experiments, we'll only use the current result
combined_result = process_results(current_result, current_result) # Compare with itself
combined_result = self.process_results(current_result, current_result) # Compare with itself
print("Warning: No previous experiments to compare against. Using current result as baseline.")

available_features = {
@@ -113,35 +115,34 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trace) -> HypothesisFeedback:

# Prepare render dictionary
render_dict = {
"context": self.scen.get_scenario_all_desc(),
"last_hypothesis": trace.hist[-1][0] if trace.hist else None,
"last_task_and_code": last_task_and_code,
"last_result": trace.hist[-1][1].result if trace.hist else None,
"sota_task_and_code": exp.based_experiments[-1].experiment_workspace.data_description
if exp.based_experiments
else None,
"sota_result": exp.based_experiments[-1].result if exp.based_experiments else None,
"hypothesis": hypothesis,
"exp": exp,
"model_code": model_code,
"available_features": available_features,
"combined_result": combined_result,
"hypothesis_text": hypothesis_text,
"task_details": tasks_factors,
"model_code": model_code, # This turn
"available_features": available_features, # This turn
"combined_result": combined_result, # This turn and sota
"hypothesis_text": hypothesis_text, # This turn
"task_details": tasks_factors, # This turn
}

# Generate the user prompt
usr_prompt = (
Environment(undefined=StrictUndefined).from_string(prompt_dict[prompt_key]["user"]).render(**render_dict)
)

# Call the APIBackend to generate the response for hypothesis feedback
response = APIBackend().build_messages_and_create_chat_completion(
user_prompt=usr_prompt,
system_prompt=sys_prompt,
json_mode=True,
)

# Parse the JSON response to extract the feedback
response_json = json.loads(response)

# Extract fields from JSON response
observations = response_json.get("Observations", "No observations provided")
hypothesis_evaluation = response_json.get("Feedback for Hypothesis", "No feedback provided")
new_hypothesis = response_json.get("New Hypothesis", "No new hypothesis provided")
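For reference, below is a minimal standalone sketch of the combining logic that the new process_results method implements. The metric names and values are invented, and the direction flag stands in for the scenario's evaluation_metric_direction; this is illustrative, not the repository's code.

# Minimal sketch mirroring the new process_results method; metric names and
# values are illustrative, not taken from a real competition.
import pandas as pd


def combine_results(current_result, sota_result, higher_is_better: bool) -> pd.DataFrame:
    # Convert both results to dataframes indexed by metric name
    current_df = pd.DataFrame(current_result)
    sota_df = pd.DataFrame(sota_result)

    # Combine the dataframes on the metric index and name the columns as in the diff
    combined_df = pd.concat([current_df, sota_df], axis=1)
    combined_df.columns = ["current_df", "sota_df"]

    # Annotate the direction of improvement, as the method does via the scenario flag
    direction = "higher" if higher_is_better else "lower"
    combined_df["Note"] = (
        "Direction of improvement (higher/lower is better) should be judged per metric. "
        f"Here '{direction}' is better for the metrics."
    )
    return combined_df


# Example usage with made-up scores
current = {"score": {"accuracy": 0.81, "f1": 0.78}}
sota = {"score": {"accuracy": 0.79, "f1": 0.80}}
print(combine_results(current, sota, higher_is_better=True))
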
2 changes: 2 additions & 0 deletions rdagent/scenarios/kaggle/experiment/prompts.yaml
@@ -11,6 +11,8 @@ kg_description_template:
"Competition Features": "Two-line description of the overall features involved within the competition as background."
"Submission Specifications": "The submission specification & sample submission csv descriptions for the model to output."
"Submission channel number to each sample": "The number of channels in the output for each sample, e.g., 1 for regression, N for N class classification with probabilities, etc. A Integer. If not specified, it is 1."
"Evaluation Description": "A brief description for what metrics are used in evaluation. An explanation of whether a higher score is better or lower is better in terms of performance."
"Evaluation Boolean": "True" or "False" (True means the higher score the better (like accuracy); False means the lower value the better (like loss).)
}
Since these might be very similar column names in data like one_hot_encoded columns, you can use some regex to group them together.
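An illustrative shape of the analysis JSON the template asks the model to return for the two new keys is sketched below; the field values are invented, and the .get fallbacks mirror how scenario.py reads them in the next file.

# Invented example of the analysis JSON with the two new keys; parsing mirrors
# the .get() calls added to scenario.py below.
import json

raw_response = """
{
  "Evaluation Description": "Submissions are scored with RMSE; lower scores indicate better performance.",
  "Evaluation Boolean": "False"
}
"""
response_json_analysis = json.loads(raw_response)

evaluation_desc = response_json_analysis.get(
    "Evaluation Description", "No evaluation specification provided."
)
evaluation_metric_direction = response_json_analysis.get(
    "Evaluation Boolean", "No evaluation specification provided."
)
print(evaluation_desc)
print(evaluation_metric_direction)
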
20 changes: 18 additions & 2 deletions rdagent/scenarios/kaggle/experiment/scenario.py
@@ -33,6 +33,8 @@ def __init__(self, competition: str) -> None:
self.competition_features = None
self.submission_specifications = None
self.model_output_channel = None
self.evaluation_desc = None
self.evaluation_metric_direction = None
self._analysis_competition_description()
self.if_action_choosing_based_on_UCB = KAGGLE_IMPLEMENT_SETTING.if_action_choosing_based_on_UCB
self.if_using_feature_selection = KAGGLE_IMPLEMENT_SETTING.if_using_feature_selection
@@ -73,12 +75,23 @@ def _analysis_competition_description(self):
"Submission Specifications", "No submission requirements provided"
)
self.model_output_channel = response_json_analysis.get("Submission channel number to each sample", 1)
self.evaluation_desc = response_json_analysis.get(
"Evaluation Description", "No evaluation specification provided."
)
self.evaluation_metric_direction = response_json_analysis.get(
"Evaluation Boolean", "No evaluation specification provided."
)

def get_competition_full_desc(self) -> str:
evaluation_direction = "higher the better" if self.evaluation_metric_direction else "lower the better"
return f"""Competition Type: {self.competition_type}
Competition Description: {self.competition_description}
Target Description: {self.target_description}
Competition Features: {self.competition_features}
Submission Specifications: {self.submission_specifications}
Model Output Channel: {self.model_output_channel}
Evaluation Descriptions: {self.evaluation_desc}
Is the evaluation metric the higher the better: {evaluation_direction}
"""

@property
@@ -99,6 +112,8 @@ def background(self) -> str:
target_description=self.target_description,
competition_features=self.competition_features,
submission_specifications=self.submission_specifications,
evaluation_desc=self.evaluation_desc,
evaluate_bool=self.evaluation_metric_direction,
)
)
return background_prompt
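
A minimal illustration of feeding the two new fields into a Jinja2 prompt render follows; the template text is invented, and only the keyword names evaluation_desc and evaluate_bool come from the render call above.

# Illustrative only: the template string is invented; the keyword names match
# the render call in background() above.
from jinja2 import Environment, StrictUndefined

template = (
    "Evaluation: {{ evaluation_desc }}\n"
    "Higher is better: {{ evaluate_bool }}"
)
prompt = (
    Environment(undefined=StrictUndefined)
    .from_string(template)
    .render(
        evaluation_desc="Submissions are scored with AUC.",
        evaluate_bool=True,
    )
)
print(prompt)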
@@ -171,8 +186,9 @@ def simulator(self) -> str:

@property
def rich_style_description(self) -> str:
return """
kaggle scen """
return f"""
This is the Kaggle scenario for the competition: {KAGGLE_IMPLEMENT_SETTING.competition}
"""

def get_scenario_all_desc(self) -> str:
return f"""Background of the scenario:
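The description template specifies "Evaluation Boolean" as the strings "True"/"False", while the stored flag is later used in truthiness checks such as "higher" if self.evaluation_metric_direction else "lower". A hypothetical normalization helper, not part of this commit, could convert that value to a real bool first:

# Hypothetical helper, not part of this commit: normalize the template's
# "True"/"False" string into a real bool before truthiness checks.
def to_metric_direction(value) -> bool:
    if isinstance(value, bool):
        return value
    # Case-insensitive string handling; anything other than "true" means lower is better.
    return str(value).strip().lower() == "true"


assert to_metric_direction("True") is True
assert to_metric_direction("False") is False
assert to_metric_direction(True) is True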