From 29b36d15abef99286a1455b1f6026a9dc36168a5 Mon Sep 17 00:00:00 2001
From: Yue Yang <118830877+yangyue5114@users.noreply.github.com>
Date: Sun, 19 Jan 2025 20:23:20 +0800
Subject: [PATCH 1/2] Update image_vqa.py

---
 vlmeval/dataset/image_vqa.py | 77 ++----------------------------------
 1 file changed, 4 insertions(+), 73 deletions(-)

diff --git a/vlmeval/dataset/image_vqa.py b/vlmeval/dataset/image_vqa.py
index dca75f3a..6d285572 100644
--- a/vlmeval/dataset/image_vqa.py
+++ b/vlmeval/dataset/image_vqa.py
@@ -643,79 +643,9 @@ def evaluate(self, eval_file, **judge_kwargs):
         return accdz
 
 
-class WeMath(ImageBaseDataset):
-    TYPE = 'VQA'
-    DATASET_URL = {
-        'WeMath': 'https://opencompass.openxlab.space/utils/VLMEval/WeMath.tsv'
-    }
-    DATASET_MD5 = {'WeMath': '056142c89b09d864702450b5b5ea0913'}
-
-    def evaluate(self, eval_file, **judge_kwargs):
-        from .utils.wemath import wemath_evaluate_models, wemath_accuracy
-        from .utils.multiple_choice import mcq_vanilla_eval
-
-        # model = judge_kwargs['model']
-        model = judge_kwargs.get('model', 'exact_matching')
-        assert model in ['exact_matching', 'gpt-4-0125', 'gpt-4-turbo', 'gpt-4o-mini'], model
-        name_str_map = {'gpt-4-0125': 'gpt4', 'gpt-4-turbo': 'gpt4-turbo', 'gpt-4o-mini': 'gpt4o-mini'}
-        name_str = name_str_map[model] if model in name_str_map else model
-
-        if model == 'exact_matching':
-            model = None
-        elif gpt_key_set():
-            model = build_judge(**judge_kwargs)
-            if not model.working():
-                warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
-                warnings.warn(DEBUG_MESSAGE)
-                model = None
-        else:
-            warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
-            model = None
-
-        suffix = eval_file.split('.')[-1]
-        storage = eval_file.replace(f'.{suffix}', f'_{name_str}.xlsx')
-        nproc = judge_kwargs.pop('nproc', 4)
-
-        if not osp.exists(storage) and model is not None:
-            data = load(eval_file)
-            result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl')
-
-            data = load(eval_file)
-            data = data.sort_values(by='index')
-            data['prediction'] = [str(x) for x in data['prediction']]
-            # If not choice label, then use lower case
-            for k in data.keys():
-                data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k)
-
-            meta = self.data
-            meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])}
-            data_map = {x: y for x, y in zip(data['index'], data['question'])}
-            for k in data_map:
-                assert k in meta_q_map, (
-                    f'eval_file should be the same as or a subset of dataset {self.dataset_name}'
-                )
-            data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name)
-
-            if 'id' in data.columns:
-                # Rename the column
-                data.rename(columns={'id': 'ID'}, inplace=True)
-            dump(data, storage)
-        if osp.exists(storage):
-            accuracy_scores = wemath_evaluate_models(storage)
-            four_dim_scores = wemath_accuracy(storage)
-        else:
-            accuracy_scores = wemath_evaluate_models(eval_file)
-            four_dim_scores = wemath_accuracy(eval_file)
-        combine_score = {**accuracy_scores, **four_dim_scores}
-        combine_score = pd.DataFrame(combine_score)
-        score_pth = storage.replace('.xlsx', '_score.csv')
-        dump(combine_score, score_pth)
-        return combine_score
-
-
 class LLaVABench(ImageBaseDataset):
     TYPE = 'VQA'
-    DATASET_URL = {'LLaVABench': 'https://opencompass.openxlab.space/utils/VLMEval/LLaVABench.tsv'}
+    DATASET_URL = {'LLaVABench': 'https://opencompass.openxlab.space/utils/VLMEval/LLaVABench.tsv',}
     DATASET_MD5 = {'LLaVABench': 'd382a093f749a697820d3dadd61c8428'}
 
     # It returns a DataFrame
@@ -755,9 +685,10 @@ def evaluate(self, eval_file, **judge_kwargs):
 class MMVet(ImageBaseDataset):
     TYPE = 'VQA'
     DATASET_URL = {
-        'MMVet': 'https://opencompass.openxlab.space/utils/VLMEval/MMVet.tsv'
+        'MMVet': 'https://opencompass.openxlab.space/utils/VLMEval/MMVet.tsv',
+        'MMVet_Hard': 'http://opencompass.openxlab.space/utils/VLMEval/MMVet_Hard.tsv'
     }
-    DATASET_MD5 = {'MMVet': '748aa6d4aa9d4de798306a63718455e3'}
+    DATASET_MD5 = {'MMVet': '748aa6d4aa9d4de798306a63718455e3', 'MMVet_Hard': '63a598819a936a2e77c410a78a21ff16'}
 
     # It returns a DataFrame
     @classmethod

From b094e798d88f82bd0e51a0bb7244d16a0b83b721 Mon Sep 17 00:00:00 2001
From: Yue Yang <118830877+yangyue5114@users.noreply.github.com>
Date: Sun, 19 Jan 2025 20:27:22 +0800
Subject: [PATCH 2/2] Update image_vqa.py

---
 vlmeval/dataset/image_vqa.py | 72 +++++++++++++++++++++++++++++++++++-
 1 file changed, 71 insertions(+), 1 deletion(-)

diff --git a/vlmeval/dataset/image_vqa.py b/vlmeval/dataset/image_vqa.py
index 6d285572..5f86451f 100644
--- a/vlmeval/dataset/image_vqa.py
+++ b/vlmeval/dataset/image_vqa.py
@@ -643,9 +643,79 @@ def evaluate(self, eval_file, **judge_kwargs):
         return accdz
 
 
+class WeMath(ImageBaseDataset):
+    TYPE = 'VQA'
+    DATASET_URL = {
+        'WeMath': 'https://opencompass.openxlab.space/utils/VLMEval/WeMath.tsv'
+    }
+    DATASET_MD5 = {'WeMath': '056142c89b09d864702450b5b5ea0913'}
+
+    def evaluate(self, eval_file, **judge_kwargs):
+        from .utils.wemath import wemath_evaluate_models, wemath_accuracy
+        from .utils.multiple_choice import mcq_vanilla_eval
+
+        # model = judge_kwargs['model']
+        model = judge_kwargs.get('model', 'exact_matching')
+        assert model in ['exact_matching', 'gpt-4-0125', 'gpt-4-turbo', 'gpt-4o-mini'], model
+        name_str_map = {'gpt-4-0125': 'gpt4', 'gpt-4-turbo': 'gpt4-turbo', 'gpt-4o-mini': 'gpt4o-mini'}
+        name_str = name_str_map[model] if model in name_str_map else model
+
+        if model == 'exact_matching':
+            model = None
+        elif gpt_key_set():
+            model = build_judge(**judge_kwargs)
+            if not model.working():
+                warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
+                warnings.warn(DEBUG_MESSAGE)
+                model = None
+        else:
+            warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
+            model = None
+
+        suffix = eval_file.split('.')[-1]
+        storage = eval_file.replace(f'.{suffix}', f'_{name_str}.xlsx')
+        nproc = judge_kwargs.pop('nproc', 4)
+
+        if not osp.exists(storage) and model is not None:
+            data = load(eval_file)
+            result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl')
+
+            data = load(eval_file)
+            data = data.sort_values(by='index')
+            data['prediction'] = [str(x) for x in data['prediction']]
+            # If not choice label, then use lower case
+            for k in data.keys():
+                data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k)
+
+            meta = self.data
+            meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])}
+            data_map = {x: y for x, y in zip(data['index'], data['question'])}
+            for k in data_map:
+                assert k in meta_q_map, (
+                    f'eval_file should be the same as or a subset of dataset {self.dataset_name}'
+                )
+            data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name)
+
+            if 'id' in data.columns:
+                # Rename the column
+                data.rename(columns={'id': 'ID'}, inplace=True)
+            dump(data, storage)
+        if osp.exists(storage):
+            accuracy_scores = wemath_evaluate_models(storage)
+            four_dim_scores = wemath_accuracy(storage)
+        else:
+            accuracy_scores = wemath_evaluate_models(eval_file)
+            four_dim_scores = wemath_accuracy(eval_file)
+        combine_score = {**accuracy_scores, **four_dim_scores}
+        combine_score = pd.DataFrame(combine_score)
+        score_pth = storage.replace('.xlsx', '_score.csv')
+        dump(combine_score, score_pth)
+        return combine_score
+
+
 class LLaVABench(ImageBaseDataset):
     TYPE = 'VQA'
-    DATASET_URL = {'LLaVABench': 'https://opencompass.openxlab.space/utils/VLMEval/LLaVABench.tsv',}
+    DATASET_URL = {'LLaVABench': 'https://opencompass.openxlab.space/utils/VLMEval/LLaVABench.tsv'}
     DATASET_MD5 = {'LLaVABench': 'd382a093f749a697820d3dadd61c8428'}
 
     # It returns a DataFrame
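---

The snippet below is an illustrative usage sketch, not part of either patch.
It shows how the WeMath class restored by PATCH 2/2 might be driven directly,
assuming ImageBaseDataset's constructor takes the dataset name as its first
argument, as elsewhere in VLMEvalKit; the prediction-file path is a
hypothetical placeholder, and 'exact_matching' is the judge-free default that
evaluate() falls back to in the diff above.

    # Hypothetical sketch: score WeMath predictions without a GPT judge.
    from vlmeval.dataset.image_vqa import WeMath

    dataset = WeMath('WeMath')  # loads WeMath.tsv, verified against DATASET_MD5

    # With model='exact_matching', evaluate() sets the judge to None, so no
    # OPENAI_API_KEY is needed; wemath_evaluate_models and wemath_accuracy
    # run directly on the prediction file.
    scores = dataset.evaluate(
        'outputs/my_model/my_model_WeMath.xlsx',  # placeholder prediction file
        model='exact_matching',
        nproc=4,
    )
    print(scores)  # DataFrame combining accuracy and four-dimensional scores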