open-compass · weikaih04 · Aug 4, 2024 · Aug 4, 2024 · Aug 5, 2024 · Aug 5, 2024
diff --git a/run.py b/run.py
@@ -44,7 +44,6 @@ def main():
 
     args = parse_args()
     assert len(args.data), '--data should be a list of data files'
-
     if args.retry is not None:
         for k, v in supported_VLM.items():
             if hasattr(v, 'keywords') and 'retry' in v.keywords:
@@ -89,6 +88,9 @@ def main():
                 continue
 
             result_file = f'{pred_root}/{model_name}_{dataset_name}.xlsx'
+            if dataset_name in ['TaskMeAnything_v1_videoqa_random']:
+                packstr = 'pack' if args.pack else 'nopack'
+                result_file = f'{pred_root}/{model_name}_{dataset_name}_{args.nframe}frame_{packstr}.xlsx'
             if dataset_name in ['MMBench-Video']:
                 packstr = 'pack' if args.pack else 'nopack'
                 result_file = f'{pred_root}/{model_name}_{dataset_name}_{args.nframe}frame_{packstr}.xlsx'

diff --git a/vlmeval/dataset/__init__.py b/vlmeval/dataset/__init__.py
@@ -11,6 +11,7 @@
 from .mmbench_video import MMBenchVideo
 from .text_mcq import CustomTextMCQDataset, TextMCQDataset
 from .videomme import VideoMME
+from .video_mcq import VideoMCQDataset
 from .utils import *
 from ..smp import *
 
@@ -23,7 +24,7 @@
 ]
 
 VIDEO_DATASET = [
-    MMBenchVideo, VideoMME
+    MMBenchVideo, VideoMME, VideoMCQDataset
 ]
 
 TEXT_DATASET = [

diff --git a/vlmeval/dataset/image_mcq.py b/vlmeval/dataset/image_mcq.py
@@ -6,7 +6,6 @@
 
 
 class ImageMCQDataset(ImageBaseDataset):
-
     TYPE = 'MCQ'
 
     DATASET_URL = {
@@ -16,14 +15,14 @@ class ImageMCQDataset(ImageBaseDataset):
         'MMBench_DEV_CN': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_DEV_CN.tsv',
         'MMBench_TEST_CN': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_TEST_CN.tsv',
         'MMBench': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench.tsv',  # Internal Only
-        'MMBench_CN': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_CN.tsv',    # Internal Only
+        'MMBench_CN': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_CN.tsv',  # Internal Only
         # MMBench v1.1
         'MMBench_DEV_EN_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_DEV_EN_V11.tsv',
         'MMBench_TEST_EN_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_TEST_EN_V11.tsv',
         'MMBench_DEV_CN_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_DEV_CN_V11.tsv',
         'MMBench_TEST_CN_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_TEST_CN_V11.tsv',
         'MMBench_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_V11.tsv',  # Internal Only
-        'MMBench_CN_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_CN_V11.tsv',    # Internal Only
+        'MMBench_CN_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_CN_V11.tsv',  # Internal Only
         # SEEDBench Series
         'SEEDBench_IMG': 'https://opencompass.openxlab.space/utils/VLMEval/SEEDBench_IMG.tsv',
         'SEEDBench2': 'https://huggingface.co/datasets/VLMEval/SEEDBench2/resolve/main/SEEDBench2.tsv',
@@ -66,14 +65,14 @@ class ImageMCQDataset(ImageBaseDataset):
         'MMBench_DEV_CN': '08b8fc3324a5ed74155350f57be69fbd',
         'MMBench_TEST_CN': '7e1239baf0ee4c8b513e19705a0f317e',
         'MMBench': '4115aea3383f3dd0083be6a633e0f820',  # Internal Only
-        'MMBench_CN': '2e053ffc90ea598b1feae13c36dc13ee',    # Internal Only
+        'MMBench_CN': '2e053ffc90ea598b1feae13c36dc13ee',  # Internal Only
         # MMBench v1.1
         'MMBench_DEV_EN_V11': '30c05be8f2f347a50be25aa067248184',
         'MMBench_TEST_EN_V11': '26f0f15381a21720255091d3e0316ce6',
         'MMBench_DEV_CN_V11': '593f9b5f6bea453d870a798b34ae4f37',
         'MMBench_TEST_CN_V11': '74bbe4556dac745613c7cbe5ad787050',
         'MMBench_V11': 'b9276414f57af1308dcc4d0cd9b42e7c',  # Internal Only
-        'MMBench_CN_V11': '95f6980dd1b4de38e3cbffe0305a3f25',    # Internal Only
+        'MMBench_CN_V11': '95f6980dd1b4de38e3cbffe0305a3f25',  # Internal Only
         # SEEDBench
         'SEEDBench_IMG': '68017231464752261a2526d6ca3a10c0',
         'SEEDBench2': '4ec15cf864c4f16274112284f531813e',
@@ -103,11 +102,10 @@ class ImageMCQDataset(ImageBaseDataset):
         'RealWorldQA': '92321028d2bc29040284b6674721e48f',
         'MLLMGuard_DS': '975fc0dd7119386e198c37d71e274b3f',
         'BLINK': '3b6649b6a662184ea046908e5506260e',
-        'TaskMeAnything_v1_imageqa_random': '93b7290b447ef947f3b3abae5ad4bc1b'
+        'TaskMeAnything_v1_imageqa_random': '023fef69e2ca21827afb77c5ec3bc889',
     }
 
     def build_prompt(self, line):
-
         if isinstance(line, int):
             line = self.data.iloc[line]
 
@@ -117,11 +115,7 @@ def build_prompt(self, line):
             tgt_path = self.dump_image(line)
 
         question = line['question']
-        options = {
-            cand: line[cand]
-            for cand in string.ascii_uppercase
-            if cand in line and not pd.isna(line[cand])
-        }
+        options = {cand: line[cand] for cand in string.ascii_uppercase if cand in line and not pd.isna(line[cand])}
         options_prompt = 'Options:\n'
         for key, item in options.items():
             options_prompt += f'{key}. {item}\n'
@@ -145,10 +139,13 @@ def build_prompt(self, line):
 
     def evaluate(self, eval_file, **judge_kwargs):
         from .utils.multiple_choice import report_acc, report_acc_MMT, mcq_circular_eval, mcq_vanilla_eval
+
         # assert dataset is not None
         dataset_map = {
-            'MMBench_TEST_EN': 'MMBench', 'MMBench_TEST_EN_V11': 'MMBench_V11',
-            'MMBench_TEST_CN': 'MMBench_CN', 'MMBench_TEST_CN_V11': 'MMBench_CN_V11'
+            'MMBench_TEST_EN': 'MMBench',
+            'MMBench_TEST_EN_V11': 'MMBench_V11',
+            'MMBench_TEST_CN': 'MMBench_CN',
+            'MMBench_TEST_CN_V11': 'MMBench_CN_V11',
         }
         dataset = self.dataset_name
         if dataset in dataset_map:
@@ -193,9 +190,7 @@ def evaluate(self, eval_file, **judge_kwargs):
         meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])}
         data_map = {x: y for x, y in zip(data['index'], data['question'])}
         for k in data_map:
-            assert k in meta_q_map, (
-                f'eval_file should be the same as or a subset of dataset {self.dataset_name}'
-            )
+            assert k in meta_q_map, f'eval_file should be the same as or a subset of dataset {self.dataset_name}'
 
         if circular:
             data = mcq_circular_eval(model, data, meta, nproc, result_file, self.dataset_name)
@@ -216,14 +211,15 @@ def evaluate(self, eval_file, **judge_kwargs):
         dump(acc, score_file)
 
         if dataset == 'AesBench_VAL':
-            warnings.warn('Note that AesBench VAL is just a toy version of AesBench TEST. For full results, \
+            warnings.warn(
+                'Note that AesBench VAL is just a toy version of AesBench TEST. For full results, \
                            please evaluate on AesBench TEST. The AesBench TEST dataset is more than 20 times \
-                           larger than the VAL dataset and the leaderboard results are based on AesBench TEST.')
+                           larger than the VAL dataset and the leaderboard results are based on AesBench TEST.'
+            )
         return acc
 
 
 class MMMUDataset(ImageMCQDataset):
-
     DATASET_URL = {
         'MMMU_DEV_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/MMMU_DEV_VAL.tsv',
         'MMMU_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/MMMU_TEST.tsv',
@@ -264,14 +260,14 @@ def build_prompt(self, line):
 
 
 class CustomMCQDataset(ImageMCQDataset):
-
     def load_data(self, dataset):
         data_path = osp.join(LMUDataRoot(), f'{dataset}.tsv')
 
         if file_size(data_path, 'GB') > 1:
             local_path = data_path.replace('.tsv', '_local.tsv')
             if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None):
                 from ..tools import LOCALIZE
+
                 LOCALIZE(data_path, local_path)
             data_path = local_path
         return load(data_path)
diff --git a/vlmeval/dataset/utils/video_mcq_utils.py b/vlmeval/dataset/utils/video_mcq_utils.py
@@ -0,0 +1,15 @@
+import base64
+
+
+# video_mcq uses base64 to encode and decode mp4 videos.
+# Use this function to convert an mp4 file to base64.
+def mp4_to_base64(mp4_path):
+    """Return the bytes of the file at ``mp4_path`` as a base64-encoded ``str``.
+
+    Raises ``FileNotFoundError`` (or another ``OSError``) when the file cannot be
+    read; an error message is never returned, because callers could not tell it
+    apart from encoded video data.
+    """
+    with open(mp4_path, 'rb') as video_file:
+        video_data = video_file.read()
+    return base64.b64encode(video_data).decode('utf-8')
diff --git a/vlmeval/dataset/video_base.py b/vlmeval/dataset/video_base.py
@@ -2,35 +2,68 @@
 from ..smp import *
 
 
-class VideoBaseDataset:
+def video_root_map(dataset):
+    return dataset  # identity mapping: the dataset name is returned unchanged
+
 
+class VideoBaseDataset:
     MODALITY = 'VIDEO'
 
-    def __init__(self,
-                 dataset='MMBench-Video',
-                 pack=False):
+    def __init__(self, dataset='MMBench-Video', pack=False, skip_novideo=True):
+        """Load the annotation table of ``dataset`` and set the frame-cache location.
+
+        ``pack=True`` makes ``__len__`` count videos instead of rows. ``skip_novideo``
+        is honoured only on the ``load_data`` path, where it drops rows whose ``video``
+        cell is NaN.
+        """
         try:
             import decord
         except:
             warnings.warn('Please install decord via `pip install decord`.')
 
-        self.dataset_name = dataset
-        ret = self.prepare_dataset(dataset)
-        assert ret is not None
-        lmu_root = LMUDataRoot()
-        self.frame_root = osp.join(lmu_root, 'images', dataset)
-        os.makedirs(self.frame_root, exist_ok=True)
-        self.frame_tmpl = 'frame-{}-of-{}.jpg'
+        self.dataset_name = dataset
+        self.pack = pack
+        # You can override this variable to save image files to a different directory
+        self.frame_root = osp.join(LMUDataRoot(), 'images', dataset)
+        self.frame_tmpl = 'frame-{}-of-{}.jpg'
+
+        # Legacy datasets are set up through the subclass's prepare_dataset()
+        if dataset in ['MMBench-Video', 'Video-MME']:
+            ret = self.prepare_dataset(dataset)
+            assert ret is not None
+            os.makedirs(self.frame_root, exist_ok=True)
+
+            self.data_root = ret['root']
+            self.data_file = ret['data_file']
+            self.data = load(self.data_file)
+
+            assert 'question' in self.data and 'video' in self.data
+            self.videos = sorted(set(self.data['video']))
+
+        # Other datasets are TSVs fetched by load_data(), like image_base (self.videos is not built here)
+        else:
+            data, data_root = self.load_data(dataset)
+            self.data_root = data_root
+            self.meta_only = True
+            self.skip_novideo = skip_novideo
+            if skip_novideo and 'video' in data:
+                data = data[~pd.isna(data['video'])]
+
+            # Normalise indices to str; all-integer indices are converted back to int below
+            data['index'] = [str(x) for x in data['index']]
 
-        self.data_root = ret['root']
-        self.data_file = ret['data_file']
-        self.data = load(self.data_file)
+            if 'video' in data:
+                self.meta_only = False
 
-        assert 'question' in self.data and 'video' in self.data
-        videos = list(set(self.data['video']))
-        videos.sort()
-        self.videos = videos
-        self.pack = pack
+            if 'video_path' in data:
+                paths = [toliststr(x) for x in data['video_path']]
+                data['video_path'] = [x[0] if len(x) == 1 else x for x in paths]
+
+            if np.all([istype(x, int) for x in data['index']]):
+                data['index'] = [int(x) for x in data['index']]
+
+            self.data = data
+            self.post_build(dataset)
 
     def __len__(self):
         return len(self.videos) if self.pack else len(self.data)
@@ -44,6 +77,26 @@ def __getitem__(self, idx):
             assert idx < len(self.data)
             return dict(self.data.iloc[idx])
 
+    def load_data(self, dataset):
+        """Fetch the TSV registered for ``dataset``; returns what ``prepare_tsv`` returns."""
+        url, file_md5 = self.DATASET_URL[dataset], self.DATASET_MD5[dataset]
+        return self.prepare_tsv(url, file_md5)
+
+    def prepare_tsv(self, url, file_md5=None):
+        """Return ``(data, data_root)`` for the TSV at ``url``, which is downloaded into ``LMUDataRoot()``
+        unless a local copy exists and (when ``file_md5`` is given) its MD5 matches.
+        """
+        data_root = LMUDataRoot()
+        os.makedirs(data_root, exist_ok=True)
+        data_path = osp.join(data_root, url.split('/')[-1])
+        if not (osp.exists(data_path) and (file_md5 is None or md5(data_path) == file_md5)):
+            warnings.warn('The dataset tsv is not downloaded')
+            download_file(url, data_path)
+        return load(data_path), data_root
+
+    def post_build(self, dataset):
+        """Hook called at the end of the ``load_data`` init path; does nothing by default."""
+
     def frame_paths(self, video, num_frames=8):
         frame_root = osp.join(self.frame_root, video)
         os.makedirs(frame_root, exist_ok=True)
@@ -68,7 +121,7 @@ def save_video_frames(self, video, num_frames=8):
     # Return a list of dataset names that are supported by this class, can override
     @classmethod
     def supported_datasets(cls):
-        return ['MMBench-Video', 'Video-MME']
+        return ['MMBench-Video', 'Video-MME'] + list(getattr(cls, 'DATASET_URL', {}))  # DATASET_URL is optional
 
     # Given the prediction file, return the evaluation results in the format of a dictionary or pandas dataframe
     @abstractmethod