From 5b5dfeefbc7eb9dcbd9923544005c5d281262c03 Mon Sep 17 00:00:00 2001
From: Way2Learn <118058822+Xisen-Wang@users.noreply.github.com>
Date: Wed, 25 Sep 2024 23:25:14 +0800
Subject: [PATCH] fix: Update prompts.yaml to constrain only one model type
 (#341)

* Update prompts.yaml

* Update prompts.yaml

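* fix a bug: use model_type directly in runner.py (no [0] indexing) and return predictions reshaped to a column (reshape(-1, 1)) in the RandomForest and XGBoost templates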

---------

Co-authored-by: WinstonLiyt <104308117+WinstonLiyt@users.noreply.github.com>
Co-authored-by: WinstonLiye <1957922024@qq.com>
---
 rdagent/scenarios/kaggle/developer/runner.py                  | 4 ++--
 .../model/model_randomforest.py                               | 2 +-
 .../playground-series-s4e8_template/model/model_xgboost.py    | 2 +-
 rdagent/scenarios/kaggle/prompts.yaml                         | 4 ++--
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/rdagent/scenarios/kaggle/developer/runner.py b/rdagent/scenarios/kaggle/developer/runner.py
index 87c4e64d..0785c937 100644
--- a/rdagent/scenarios/kaggle/developer/runner.py
+++ b/rdagent/scenarios/kaggle/developer/runner.py
@@ -97,8 +97,8 @@ def develop(self, exp: KGModelExperiment) -> KGModelExperiment:
         self.build_from_SOTA(exp)
 
         sub_ws = exp.sub_workspace_list[0]
-        # TODO: There's a possibility of generating a hybrid model (lightgbm + xgboost), which results in having two items in the model_type list. Hardcoded now.
-        model_type = sub_ws.target_task.model_type[0]
+        # TODO: model_type is now used as-is (the prompt asks for exactly one type); a hybrid model (lightgbm + xgboost) could still yield more than one model type and is not handled here.
+        model_type = sub_ws.target_task.model_type
 
         if sub_ws.code_dict == {}:
             raise ModelEmptyError("No model is implemented.")
diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/model_randomforest.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/model_randomforest.py
index 3c64a094..377683b9 100644
--- a/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/model_randomforest.py
+++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/model_randomforest.py
@@ -51,4 +51,4 @@ def predict(model, X):
     y_pred_prob = model.predict_proba(X_selected)[:, 1]
 
     # Apply threshold to get boolean predictions
-    return y_pred_prob
+    return y_pred_prob.reshape(-1, 1)
diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/model_xgboost.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/model_xgboost.py
index a70fa680..b25e87d9 100644
--- a/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/model_xgboost.py
+++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/model_xgboost.py
@@ -37,4 +37,4 @@ def predict(model, X):
     X = select(X)
     dtest = xgb.DMatrix(X)
     y_pred_prob = model.predict(dtest)
-    return y_pred_prob
+    return y_pred_prob.reshape(-1, 1)
diff --git a/rdagent/scenarios/kaggle/prompts.yaml b/rdagent/scenarios/kaggle/prompts.yaml
index 5820294b..c4250602 100644
--- a/rdagent/scenarios/kaggle/prompts.yaml
+++ b/rdagent/scenarios/kaggle/prompts.yaml
@@ -95,7 +95,7 @@ feature_experiment_output_format: |-
 
 model_experiment_output_format: |-
   According to the hypothesis, please help user design one model task.
-  Since we only build one model from four model types: ["XGBoost", "RandomForest", "LightGBM", "NN"].  
+  We only build one model from four main model types: ["XGBoost", "RandomForest", "LightGBM", "NN"].
   The output should follow JSON format. The schema is as follows: 
   {
       "model_name": "model_name",
@@ -106,7 +106,7 @@ model_experiment_output_format: |-
           "hyperparameter_name_2": "value of hyperparameter 2",
           "hyperparameter_name_3": "value of hyperparameter 3"
       },
-      "model_type": "model type"
+      "model_type": "Select only one model type: XGBoost, RandomForest, LightGBM, or NN. The primary model must be unique, but you may use auxiliary models for support if you think it can have a good result like choosing A model as the main model, with B Model used for auxiliary support or optimization on specific details."
   }
   Usually, a larger model works better than a smaller one. Hence, the parameters should be larger.