diff --git a/run.py b/run.py
index 943d4ec0d..4e8775d71 100644
--- a/run.py
+++ b/run.py
@@ -44,7 +44,6 @@ def main():
     args = parse_args()
     assert len(args.data), '--data should be a list of data files'
-
     if args.retry is not None:
         for k, v in supported_VLM.items():
             if hasattr(v, 'keywords') and 'retry' in v.keywords:
@@ -89,6 +88,9 @@ def main():
                 continue

             result_file = f'{pred_root}/{model_name}_{dataset_name}.xlsx'
+            if dataset_name in ['TaskMeAnything_v1_videoqa_random']:
+                packstr = 'pack' if args.pack else 'nopack'
+                result_file = f'{pred_root}/{model_name}_{dataset_name}_{args.nframe}frame_{packstr}.xlsx'
             if dataset_name in ['MMBench-Video']:
                 packstr = 'pack' if args.pack else 'nopack'
                 result_file = f'{pred_root}/{model_name}_{dataset_name}_{args.nframe}frame_{packstr}.xlsx'
diff --git a/vlmeval/dataset/__init__.py b/vlmeval/dataset/__init__.py
index 980f8b488..8b98774f5 100644
--- a/vlmeval/dataset/__init__.py
+++ b/vlmeval/dataset/__init__.py
@@ -11,6 +11,7 @@
 from .mmbench_video import MMBenchVideo
 from .text_mcq import CustomTextMCQDataset, TextMCQDataset
 from .videomme import VideoMME
+from .video_mcq import VideoMCQDataset
 from .utils import *
 from ..smp import *

@@ -23,7 +24,7 @@
 ]

 VIDEO_DATASET = [
-    MMBenchVideo, VideoMME
+    MMBenchVideo, VideoMME, VideoMCQDataset
 ]

 TEXT_DATASET = [
diff --git a/vlmeval/dataset/image_mcq.py b/vlmeval/dataset/image_mcq.py
index 12ca20a28..0d2d86cbd 100644
--- a/vlmeval/dataset/image_mcq.py
+++ b/vlmeval/dataset/image_mcq.py
@@ -6,7 +6,6 @@


 class ImageMCQDataset(ImageBaseDataset):
-
     TYPE = 'MCQ'

     DATASET_URL = {
@@ -16,14 +15,14 @@ class ImageMCQDataset(ImageBaseDataset):
         'MMBench_DEV_CN': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_DEV_CN.tsv',
         'MMBench_TEST_CN': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_TEST_CN.tsv',
         'MMBench': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench.tsv',  # Internal Only
-        'MMBench_CN': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_CN.tsv',   # Internal Only
+        'MMBench_CN': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_CN.tsv',  # Internal Only
         # MMBench v1.1
         'MMBench_DEV_EN_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_DEV_EN_V11.tsv',
         'MMBench_TEST_EN_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_TEST_EN_V11.tsv',
         'MMBench_DEV_CN_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_DEV_CN_V11.tsv',
         'MMBench_TEST_CN_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_TEST_CN_V11.tsv',
         'MMBench_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_V11.tsv',  # Internal Only
-        'MMBench_CN_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_CN_V11.tsv',   # Internal Only
+        'MMBench_CN_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_CN_V11.tsv',  # Internal Only
         # SEEDBench Series
         'SEEDBench_IMG': 'https://opencompass.openxlab.space/utils/VLMEval/SEEDBench_IMG.tsv',
         'SEEDBench2': 'https://huggingface.co/datasets/VLMEval/SEEDBench2/resolve/main/SEEDBench2.tsv',
@@ -66,14 +65,14 @@ class ImageMCQDataset(ImageBaseDataset):
         'MMBench_DEV_CN': '08b8fc3324a5ed74155350f57be69fbd',
         'MMBench_TEST_CN': '7e1239baf0ee4c8b513e19705a0f317e',
         'MMBench': '4115aea3383f3dd0083be6a633e0f820',  # Internal Only
-        'MMBench_CN': '2e053ffc90ea598b1feae13c36dc13ee',   # Internal Only
+        'MMBench_CN': '2e053ffc90ea598b1feae13c36dc13ee',  # Internal Only
         # MMBench v1.1
         'MMBench_DEV_EN_V11': '30c05be8f2f347a50be25aa067248184',
         'MMBench_TEST_EN_V11': '26f0f15381a21720255091d3e0316ce6',
         'MMBench_DEV_CN_V11': '593f9b5f6bea453d870a798b34ae4f37',
         'MMBench_TEST_CN_V11': '74bbe4556dac745613c7cbe5ad787050',
         'MMBench_V11': 'b9276414f57af1308dcc4d0cd9b42e7c',  # Internal Only
-        'MMBench_CN_V11': '95f6980dd1b4de38e3cbffe0305a3f25',   # Internal Only
+        'MMBench_CN_V11': '95f6980dd1b4de38e3cbffe0305a3f25',  # Internal Only
         # SEEDBench
         'SEEDBench_IMG': '68017231464752261a2526d6ca3a10c0',
         'SEEDBench2': '4ec15cf864c4f16274112284f531813e',
@@ -103,11 +102,10 @@ class ImageMCQDataset(ImageBaseDataset):
         'RealWorldQA': '92321028d2bc29040284b6674721e48f',
         'MLLMGuard_DS': '975fc0dd7119386e198c37d71e274b3f',
         'BLINK': '3b6649b6a662184ea046908e5506260e',
-        'TaskMeAnything_v1_imageqa_random': '93b7290b447ef947f3b3abae5ad4bc1b'
+        'TaskMeAnything_v1_imageqa_random': '023fef69e2ca21827afb77c5ec3bc889',
     }

     def build_prompt(self, line):
-
         if isinstance(line, int):
             line = self.data.iloc[line]

@@ -117,11 +115,7 @@ def build_prompt(self, line):
         tgt_path = self.dump_image(line)

         question = line['question']
-        options = {
-            cand: line[cand]
-            for cand in string.ascii_uppercase
-            if cand in line and not pd.isna(line[cand])
-        }
+        options = {cand: line[cand] for cand in string.ascii_uppercase if cand in line and not pd.isna(line[cand])}
         options_prompt = 'Options:\n'
         for key, item in options.items():
             options_prompt += f'{key}. {item}\n'
@@ -145,10 +139,13 @@ def build_prompt(self, line):

     def evaluate(self, eval_file, **judge_kwargs):
         from .utils.multiple_choice import report_acc, report_acc_MMT, mcq_circular_eval, mcq_vanilla_eval

+        # assert dataset is not None
         dataset_map = {
-            'MMBench_TEST_EN': 'MMBench', 'MMBench_TEST_EN_V11': 'MMBench_V11',
-            'MMBench_TEST_CN': 'MMBench_CN', 'MMBench_TEST_CN_V11': 'MMBench_CN_V11'
+            'MMBench_TEST_EN': 'MMBench',
+            'MMBench_TEST_EN_V11': 'MMBench_V11',
+            'MMBench_TEST_CN': 'MMBench_CN',
+            'MMBench_TEST_CN_V11': 'MMBench_CN_V11',
         }
         dataset = self.dataset_name
         if dataset in dataset_map:
@@ -193,9 +190,7 @@ def evaluate(self, eval_file, **judge_kwargs):
         meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])}
         data_map = {x: y for x, y in zip(data['index'], data['question'])}
         for k in data_map:
-            assert k in meta_q_map, (
-                f'eval_file should be the same as or a subset of dataset {self.dataset_name}'
-            )
+            assert k in meta_q_map, f'eval_file should be the same as or a subset of dataset {self.dataset_name}'

         if circular:
             data = mcq_circular_eval(model, data, meta, nproc, result_file, self.dataset_name)
@@ -216,14 +211,15 @@ def evaluate(self, eval_file, **judge_kwargs):
         dump(acc, score_file)

         if dataset == 'AesBench_VAL':
-            warnings.warn('Note that AesBench VAL is just a toy version of AesBench TEST. For full results, \
+            warnings.warn(
+                'Note that AesBench VAL is just a toy version of AesBench TEST. For full results, \
                 please evaluate on AesBench TEST. The AesBench TEST dataset is more than 20 times \
-                larger than the VAL dataset and the leaderboard results are based on AesBench TEST.')
+                larger than the VAL dataset and the leaderboard results are based on AesBench TEST.'
+            )

         return acc


 class MMMUDataset(ImageMCQDataset):
-
     DATASET_URL = {
         'MMMU_DEV_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/MMMU_DEV_VAL.tsv',
         'MMMU_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/MMMU_TEST.tsv',
@@ -264,7 +260,6 @@ def build_prompt(self, line):


 class CustomMCQDataset(ImageMCQDataset):
-
     def load_data(self, dataset):
         data_path = osp.join(LMUDataRoot(), f'{dataset}.tsv')

@@ -272,6 +267,7 @@ def load_data(self, dataset):
         local_path = data_path.replace('.tsv', '_local.tsv')
         if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None):
             from ..tools import LOCALIZE
+
             LOCALIZE(data_path, local_path)
             data_path = local_path
         return load(data_path)
diff --git a/vlmeval/dataset/utils/video_mcq_utils.py b/vlmeval/dataset/utils/video_mcq_utils.py
new file mode 100644
index 000000000..c3c65dd50
--- /dev/null
+++ b/vlmeval/dataset/utils/video_mcq_utils.py
@@ -0,0 +1,15 @@
+import base64
+
+
+# video_mcq stores mp4 videos as base64 strings in the dataset TSV.
+# Use this helper to convert an mp4 file into its base64 encoding.
+def mp4_to_base64(mp4_path):
+    try:
+        with open(mp4_path, 'rb') as video_file:
+            video_data = video_file.read()
+            base64_encoded_video = base64.b64encode(video_data).decode('utf-8')
+            return base64_encoded_video
+    except FileNotFoundError:
+        return 'The file was not found.'
+    except Exception as e:
+        return f'An error occurred: {e}'
diff --git a/vlmeval/dataset/video_base.py b/vlmeval/dataset/video_base.py
index d2e0b4a51..dfcdc1c81 100644
--- a/vlmeval/dataset/video_base.py
+++ b/vlmeval/dataset/video_base.py
@@ -2,35 +2,68 @@
 from ..smp import *


-class VideoBaseDataset:
+def video_root_map(dataset):
+    return dataset
+
+class VideoBaseDataset:

     MODALITY = 'VIDEO'

-    def __init__(self,
-                 dataset='MMBench-Video',
-                 pack=False):
+    def __init__(self, dataset='MMBench-Video', pack=False, skip_novideo=True):
         try:
             import decord
         except:
             warnings.warn('Please install decord via `pip install decord`.')

-        self.dataset_name = dataset
-        ret = self.prepare_dataset(dataset)
-        assert ret is not None
-        lmu_root = LMUDataRoot()
-        self.frame_root = osp.join(lmu_root, 'images', dataset)
-        os.makedirs(self.frame_root, exist_ok=True)
-        self.frame_tmpl = 'frame-{}-of-{}.jpg'
+        # init path for the two previously supported video datasets
+        if dataset in ['MMBench-Video', 'Video-MME']:
+            self.dataset_name = dataset
+            ret = self.prepare_dataset(dataset)
+            assert ret is not None
+            lmu_root = LMUDataRoot()
+            self.frame_root = osp.join(lmu_root, 'images', dataset)
+            os.makedirs(self.frame_root, exist_ok=True)
+            self.frame_tmpl = 'frame-{}-of-{}.jpg'
+
+            self.data_root = ret['root']
+            self.data_file = ret['data_file']
+            self.data = load(self.data_file)
+
+            assert 'question' in self.data and 'video' in self.data
+            videos = list(set(self.data['video']))
+            videos.sort()
+            self.videos = videos
+            self.pack = pack
+
+        # generic dataset init without prepare_dataset, similar to image_base
+        else:
+            lmu_root = LMUDataRoot()
+            # You can override this variable to save image files to a different directory
+            self.dataset_name = dataset
+            self.frame_root = osp.join(lmu_root, 'images', dataset)
+            self.frame_tmpl = 'frame-{}-of-{}.jpg'
+            data, data_root = self.load_data(dataset)
+            self.data_root = data_root
+            self.meta_only = True
+            self.skip_novideo = skip_novideo
+            if skip_novideo and 'video' in data:
+                data = data[~pd.isna(data['video'])]
+
+            data['index'] = [str(x) for x in data['index']]

-        self.data_root = ret['root']
-        self.data_file = ret['data_file']
-        self.data = load(self.data_file)
+            if 'video' in data:
+                self.meta_only = False

-        assert 'question' in self.data and 'video' in self.data
-        videos = list(set(self.data['video']))
-        videos.sort()
-        self.videos = videos
-        self.pack = pack
+            if 'video_path' in data:
+                paths = [toliststr(x) for x in data['video_path']]
+                data['video_path'] = [x[0] if len(x) == 1 else x for x in paths]
+
+            if np.all([istype(x, int) for x in data['index']]):
+                data['index'] = [int(x) for x in data['index']]
+
+            self.data = data
+            self.post_build(dataset)

     def __len__(self):
         return len(self.videos) if self.pack else len(self.data)

@@ -44,6 +77,26 @@ def __getitem__(self, idx):
         assert idx < len(self.data)
         return dict(self.data.iloc[idx])

+    def load_data(self, dataset):
+        url = self.DATASET_URL[dataset]
+        file_md5 = self.DATASET_MD5[dataset]
+        return self.prepare_tsv(url, file_md5)
+
+    def prepare_tsv(self, url, file_md5=None):
+        data_root = LMUDataRoot()
+        os.makedirs(data_root, exist_ok=True)
+        file_name = url.split('/')[-1]
+        data_path = osp.join(data_root, file_name)
+        if osp.exists(data_path) and (file_md5 is None or md5(data_path) == file_md5):
+            pass
+        else:
+            warnings.warn('The dataset tsv is not downloaded')
+            download_file(url, data_path)
+        return load(data_path), data_root
+
+    def post_build(self, dataset):
+        pass
+
     def frame_paths(self, video, num_frames=8):
         frame_root = osp.join(self.frame_root, video)
         os.makedirs(frame_root, exist_ok=True)
@@ -68,7 +121,7 @@ def save_video_frames(self, video, num_frames=8):
     # Return a list of dataset names that are supported by this class, can override
     @classmethod
     def supported_datasets(cls):
-        return ['MMBench-Video', 'Video-MME']
+        return ['MMBench-Video', 'Video-MME'] + list(cls.DATASET_URL)

     # Given the prediction file, return the evaluation results in the format of a dictionary or pandas dataframe
     @abstractmethod
diff --git a/vlmeval/dataset/video_mcq.py b/vlmeval/dataset/video_mcq.py
new file mode 100644
index 000000000..d46e27359
--- /dev/null
+++ b/vlmeval/dataset/video_mcq.py
@@ -0,0 +1,148 @@
+import base64
+import os
+import string
+
+# uuid is used to generate a random file name when saving a decoded video
+import uuid
+from .utils import build_judge, DEBUG_MESSAGE
+import pandas as pd
+
+from ..smp import *
+from .video_base import VideoBaseDataset
+
+
+def combine_images(frame_paths):
+    # Placeholder: merge the sampled frames into a single image and return its path.
+    # A possible implementation is sketched after this diff.
+    raise NotImplementedError('combine_images is not implemented yet')
+
+
+class VideoMCQDataset(VideoBaseDataset):
+    TYPE = 'MCQ'
+
+    DATASET_URL = {
+        # TaskMeAnything_v1_videoqa
+        'TaskMeAnything_v1_videoqa_random': (
+            'https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-videoqa-random/'
+            'resolve/main/TaskMeAnything-v1-videoqa-random.tsv'
+        )
+        # Other Benchmarks
+    }
+
+    DATASET_MD5 = {
+        # TaskMeAnything_v1_videoqa
+        'TaskMeAnything_v1_videoqa_random': 'd18394a66dd476f0ff7b92bb9c300aeb'
+        # Other Benchmarks
+    }
+
+    def base64_to_mp4(self, base64_string):
+        video_name = str(uuid.uuid4())
+        video_path = os.path.join(self.data_root, video_name + '.mp4')
+        with open(video_path, 'wb') as f:
+            f.write(base64.b64decode(base64_string))
+        return video_name, video_path
+
+    def build_prompt(self, line, num_frames: int, video_llm: bool, is_combine_images: bool = False):
+        # if line is an index, fetch the corresponding row from the data
+        if isinstance(line, int):
+            line = self.data.iloc[line]
+
+        # the video stored in the data is a base64-encoded binary stream
+        video_name, video_path = self.base64_to_mp4(line['video'])
+        message = []
+        # set up the visual input: the raw video for VideoQA models, sampled frames for ImageQA models
+        if video_llm:
+            message.append(dict(type='video', value=video_path))
+        elif is_combine_images:
+            # 'combine images' means merging all the sampled frames into a single image,
+            # instead of providing a sequence of images. This is useful for models that
+            # only accept one image as input. In practice, ImageQA models often perform
+            # better with a combined image than with a sequence of frames.
+            frame_paths = self.save_video_frames(video_name, num_frames)
+            combined_image = combine_images(frame_paths)
+            message.append(dict(type='image', value=combined_image))
+        else:
+            frame_paths = self.save_video_frames(video_name, num_frames)
+            for im in frame_paths:
+                message.append(dict(type='image', value=im))
+
+        # set up the default prompt for MCQ
+        question = line['question']
+        options = {cand: line[cand] for cand in string.ascii_uppercase if cand in line and not pd.isna(line[cand])}
+        options_prompt = 'Options:\n'
+        for key, item in options.items():
+            options_prompt += f'{key}. {item}\n'
+        hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
+        prompt = ''
+        if hint is not None:
+            prompt += f'Hint: {hint}\n'
+        prompt += f'Question: {question}\n'
+        if len(options):
+            prompt += options_prompt
+            prompt += 'Please select the correct answer from the options above. \n'
+        message.append(dict(type='text', value=prompt))
+        return message
+
+    def evaluate(self, eval_file, **judge_kwargs):
+        from .utils.multiple_choice import (
+            mcq_circular_eval,
+            mcq_vanilla_eval,
+            report_acc,
+            report_acc_MMT,
+        )
+
+        dataset = self.dataset_name
+        nproc = judge_kwargs.pop('nproc', 4)
+        circular = False
+
+        suffix = eval_file.split('.')[-1]
+        model = judge_kwargs.get('model', 'exact_matching')
+        assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
+        name_str_map = {'chatgpt-0125': 'openai', 'gpt-4-0125': 'gpt4'}
+        name_str = name_str_map[model] if model in name_str_map else model
+
+        if model == 'exact_matching':
+            model = None
+        elif gpt_key_set():
+            model = build_judge(**judge_kwargs)
+            if not model.working():
+                warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
+                warnings.warn(DEBUG_MESSAGE)
+                model = None
+        else:
+            warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
+            model = None
+
+        result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl')
+
+        data = load(eval_file)
+        data = data.sort_values(by='index')
+        data['prediction'] = [str(x) for x in data['prediction']]
+        # if a column name is not a choice label, lower-case it
+        for k in data.keys():
+            data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k)
+
+        meta = self.data
+        meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])}
+        data_map = {x: y for x, y in zip(data['index'], data['question'])}
+        for k in data_map:
+            assert k in meta_q_map, f'eval_file should be the same as or a subset of dataset {self.dataset_name}'
+
+        if circular:
+            data = mcq_circular_eval(model, data, meta, nproc, result_file, self.dataset_name)
+        else:
+            data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name)
+
+        # load split
+        dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
+        data = load(eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
+
+        # different datasets may use different accuracy report functions
+        if 'MMT' in dataset:
+            acc = report_acc_MMT(data)
+        else:
+            acc = report_acc(data)
+
+        score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
+        dump(acc, score_file)
+
+        return acc
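
The new video_mcq.py leaves combine_images as a stub. A minimal sketch of one way to implement it, assuming Pillow is installed and a simple frame grid is acceptable; the grid layout, the cols parameter, and the combined.jpg file name are illustrative choices, not part of this change:

# Illustrative sketch only: tile the sampled frames into one grid image and
# return the path of the combined image, which is what build_prompt expects.
# Assumes Pillow (`pip install pillow`); layout details are arbitrary.
import math
import os

from PIL import Image


def combine_images(frame_paths, cols=4):
    frames = [Image.open(p).convert('RGB') for p in frame_paths]
    w, h = frames[0].size
    cols = min(cols, len(frames))
    rows = math.ceil(len(frames) / cols)
    grid = Image.new('RGB', (cols * w, rows * h))
    for i, frame in enumerate(frames):
        # paste frames left-to-right, top-to-bottom, in sampling order
        grid.paste(frame.resize((w, h)), ((i % cols) * w, (i // cols) * h))
    combined_path = os.path.join(os.path.dirname(frame_paths[0]), 'combined.jpg')
    grid.save(combined_path)
    return combined_path

Returning a file path keeps the message format identical to the per-frame branch, where each entry's value is an image path.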
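
video_mcq_utils.mp4_to_base64 is intended for preparing the dataset TSV, whose video column holds base64-encoded mp4 data. A rough sketch of how such a TSV could be assembled; the clip path, question, options, answer, and output file name are made-up examples, with the column names chosen to match what build_prompt and the MCQ evaluator read:

# Illustrative sketch: write a minimal video-MCQ TSV with a base64-encoded video.
# The example row is fictional; only the column layout mirrors the loader's expectations.
import pandas as pd

from vlmeval.dataset.utils.video_mcq_utils import mp4_to_base64

records = [{
    'index': 0,
    'video': mp4_to_base64('clips/clip_000.mp4'),  # hypothetical local clip
    'question': 'Which object moves behind the chair?',
    'A': 'A red cube',
    'B': 'A blue sphere',
    'C': 'A green cone',
    'D': 'A yellow cylinder',
    'answer': 'B',
}]
pd.DataFrame(records).to_csv('my_video_mcq.tsv', sep='\t', index=False)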
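
For reference, a sketch of how the new dataset class can be exercised once the TSV is available; the frame count and the choice of a frame-based (non-video) model are arbitrary for the example:

# Illustrative sketch: load the new dataset and build a prompt for the first sample.
from vlmeval.dataset import VideoMCQDataset

dataset = VideoMCQDataset(dataset='TaskMeAnything_v1_videoqa_random')
# video_llm=False makes build_prompt sample frames instead of passing the raw mp4
msgs = dataset.build_prompt(0, num_frames=8, video_llm=False)
for msg in msgs:
    print(msg['type'], str(msg['value'])[:80])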