Add Benchmarks (MovieChat1k and VDC) and VLMs (MovieChat and AuroraCap) #719

Open · wants to merge 11 commits into main
run.py (2 changes: 1 addition & 1 deletion)
@@ -318,7 +318,7 @@ def main():
if args.judge is not None:
judge_kwargs['model'] = args.judge
else:
if dataset.TYPE in ['MCQ', 'Y/N']:
if dataset.TYPE in ['MCQ', 'Y/N'] or listinstr(['moviechat1k'], dataset_name.lower()):
judge_kwargs['model'] = 'chatgpt-0125'
elif listinstr(['MMVet', 'LLaVABench', 'MMBench-Video'], dataset_name):
judge_kwargs['model'] = 'gpt-4-turbo'
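For context, the fallback above keys off a substring match on the dataset name. A minimal sketch of that behaviour, assuming `listinstr` works like the helper in `vlmeval.smp` (re-implemented here purely for illustration; the dataset name and TYPE below are placeholders):

```python
def listinstr(keywords, s):
    # True if any keyword occurs as a substring of s.
    return any(k in s for k in keywords)

# With this change, open-ended MovieChat1k splits also fall back to the
# 'chatgpt-0125' judge even though their TYPE is not 'MCQ' or 'Y/N'.
dataset_name = 'MovieChat1k_global'   # illustrative dataset name
dataset_type = 'Video-VQA'            # illustrative TYPE
if dataset_type in ['MCQ', 'Y/N'] or listinstr(['moviechat1k'], dataset_name.lower()):
    judge_model = 'chatgpt-0125'
else:
    judge_model = 'gpt-4-turbo'
print(judge_model)  # chatgpt-0125
```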
vlmeval/api/hf_chat_model.py (51 changes: 34 additions & 17 deletions)
@@ -10,7 +10,7 @@ def get_gpu_num(model_name):
kws = {
8: ['65b', '70b'],
4: ['30b', '33b', '35b', '40b'],
2: ['13b', '14b', '20b'],
2: ['13b', '14b', '20b', '8b'],
1: ['6b', '7b', 'moss'],
}
for k in [8, 4, 2, 1]:
@@ -26,7 +26,8 @@ def get_gpu_num(model_name):
'THUDM/chatglm2-6b', 'THUDM/chatglm2-6b-32k', 'THUDM/chatglm3-6b', 'THUDM/chatglm3-6b-32k',
'baichuan-inc/Baichuan2-7B-Chat', 'baichuan-inc/Baichuan2-13B-Chat',
'lmsys/vicuna-7b-v1.5', 'lmsys/vicuna-13b-v1.5',
'meta-llama/Llama-2-7b-chat-hf'
'meta-llama/Llama-2-7b-chat-hf',
'meta-llama/Llama-3.1-8B-Instruct'
]
Auto_model = ['chatglm']

@@ -65,15 +66,14 @@ def __init__(self,
**kwargs):

self.logger = get_logger('HFChatModel')
if 'vicuna' in model_path.lower():
if 'vicuna' in model_path.lower() or 'llama' in model_path.lower():
try:
from fastchat.model import get_conversation_template
except Exception as err:
self.logger.critical('Please install fastchat first to use vicuna. ')
raise err

self.explicit_device = kwargs.pop('device', None)

if self.explicit_device is None:
# If CUDA_VISIBLE_DEVICES is not properly set
if 'CUDA_VISIBLE_DEVICES' not in os.environ or os.environ['CUDA_VISIBLE_DEVICES'] == '0,1,2,3,4,5,6,7':
@@ -93,7 +93,6 @@ def __init__(self,
LoadModel = AutoModel
else:
LoadModel = AutoModelForCausalLM

assert osp.exists(model_path) or len(model_path.split('/')) == 2

device = self.explicit_device if self.explicit_device else 'auto'
@@ -105,20 +104,36 @@ def __init__(self,
precision = {'torch_dtype': torch.bfloat16}

self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = LoadModel.from_pretrained(model_path, trust_remote_code=True, device_map='cpu', **precision)
model = model.eval()
cuda_devices = os.environ.get('CUDA_VISIBLE_DEVICES', '0')
if ',' in cuda_devices:
device_ids = [int(x) for x in cuda_devices.split(',')]
device_map = {i: i for i in range(len(device_ids))}
else:
device_map = {'': 0}

if 'llama' in self.model_path.lower():
from lmdeploy import pipeline, GenerationConfig, TurbomindEngineConfig
print(f"Loading model {model_path} with {num_gpu} GPUs")
backend_config = TurbomindEngineConfig(tp=num_gpu)
self.gen_config = GenerationConfig(max_new_tokens=256)
model = pipeline(model_path,
backend_config=backend_config)
else:
model = LoadModel.from_pretrained(model_path, trust_remote_code=True, device_map='cpu', **precision)
model = model.eval()

if device != 'cpu':
model = model.to(f'cuda:{device}' if isinstance(device, int) else 'cuda')
try:
model.generation_config = GenerationConfig.from_pretrained(
model_path, trust_remote_code=True, device_map=device)
except Exception as err:
self.logger.warning(f'{type(err)}: {err}')
if device != 'cpu':
model = model.to(f'cuda:{device}' if isinstance(device, int) else 'cuda')
try:
model.generation_config = GenerationConfig.from_pretrained(
model_path, trust_remote_code=True, device_map=device)
except Exception as err:
self.logger.warning(f'{type(err)}: {err}')

self.context_length = self._get_context_length_robust(model=model, model_path=model_path)

torch.cuda.empty_cache()
self.model = model
self.context_length = self._get_context_length_robust(model=model, model_path=model_path)
self.answer_buffer = 192
self.system_prompt = system_prompt
for k, v in kwargs.items():
@@ -149,7 +164,9 @@ def generate_str(self, input, **kwargs):
outputs[0][len(inputs['input_ids'][0]):],
skip_special_tokens=True,
spaces_between_special_tokens=False)

elif 'llama' in self.model_path.lower():
prompt = [{'role': 'system', 'content': self.system_prompt}, {'role': 'user', 'content': input}]
resp = self.model(prompt, gen_config=self.gen_config).text
else:
params = self.kwargs
params.update(kwargs)
@@ -165,7 +182,6 @@ def length_ok(self, inputs):

def generate_list(self, full_inputs, offset=0, **kwargs):
assert isinstance(full_inputs, list)

inputs = full_inputs[offset:]
if not self.length_ok(inputs):
return self.chat(full_inputs, offset + 1)
@@ -244,3 +260,4 @@ def generate(self, inputs, **kwargs):
return self.generate_str(inputs, **kwargs)
elif isinstance(inputs, list):
return self.generate_list(inputs, **kwargs)

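For reference, a standalone sketch of the new lmdeploy code path in this file, assuming `lmdeploy` is installed; the checkpoint path and tensor-parallel degree below are placeholders rather than values taken from this PR:

```python
from lmdeploy import pipeline, GenerationConfig, TurbomindEngineConfig

model_path = 'meta-llama/Llama-3.1-8B-Instruct'   # placeholder checkpoint
num_gpu = 2                                       # e.g. what get_gpu_num returns for an '8b' model

# Build a TurboMind-backed pipeline and a generation config, mirroring the llama branch above.
backend_config = TurbomindEngineConfig(tp=num_gpu)
gen_config = GenerationConfig(max_new_tokens=256)
pipe = pipeline(model_path, backend_config=backend_config)

# The pipeline accepts OpenAI-style chat messages; the response exposes the decoded text.
messages = [
    {'role': 'system', 'content': 'You are a helpful assistant.'},
    {'role': 'user', 'content': 'Summarise the clip in one sentence.'},
]
print(pipe(messages, gen_config=gen_config).text)
```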
vlmeval/config.py (7 changes: 6 additions & 1 deletion)
@@ -27,6 +27,7 @@
'PLLaVA-7B': partial(PLLaVA, model_path='ermu2001/pllava-7b', dir_root=PLLaVA_ROOT),
'PLLaVA-13B': partial(PLLaVA, model_path='ermu2001/pllava-13b', dir_root=PLLaVA_ROOT),
'PLLaVA-34B': partial(PLLaVA, model_path='ermu2001/pllava-34b', dir_root=PLLaVA_ROOT),
"MovieChat": partial(MovieChat, model_path='Enxin/MovieChat-vicuna', short_memory_length=18, long_memory_length=256)
}

ungrouped = {
@@ -403,6 +404,10 @@
"Aria": partial(Aria, model_path='rhymes-ai/Aria')
}

aurora_series = {
"AuroraCap": partial(AuroraCap, model_path='wchai/AuroraCap-7B-VID-xtuner', token_merge_ratio=0.4)
}

h2ovl_series = {
'h2ovl-mississippi-2b': partial(H2OVLChat, model_path='h2oai/h2ovl-mississippi-2b'),
'h2ovl-mississippi-1b': partial(H2OVLChat, model_path='h2oai/h2ovl-mississippi-800m'),
@@ -422,7 +427,7 @@
cambrian_series, chameleon_series, video_models, ovis_series, vila_series,
mantis_series, mmalaya_series, phi3_series, xgen_mm_series, qwen2vl_series,
slime_series, eagle_series, moondream_series, llama_series, molmo_series,
kosmos_series, points_series, nvlm_series, vintern_series, h2ovl_series, aria_series,
kosmos_series, points_series, nvlm_series, vintern_series, h2ovl_series, aria_series, aurora_series,
smolvlm_series, sail_series, valley_series, vita_series
]

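As a side note, entries in these series dictionaries are zero-argument constructors built with `functools.partial`. A minimal sketch of the registration pattern, using a hypothetical stand-in class rather than the real `AuroraCap` wrapper:

```python
from functools import partial

class AuroraCapStub:
    # Hypothetical stand-in; the real wrapper lives under vlmeval/vlm.
    def __init__(self, model_path, token_merge_ratio=0.4):
        self.model_path = model_path
        self.token_merge_ratio = token_merge_ratio

aurora_series = {
    'AuroraCap': partial(AuroraCapStub,
                         model_path='wchai/AuroraCap-7B-VID-xtuner',
                         token_merge_ratio=0.4),
}

# Instantiation is deferred until the model name is actually requested.
model = aurora_series['AuroraCap']()
print(model.model_path, model.token_merge_ratio)
```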
vlmeval/dataset/__init__.py (5 changes: 3 additions & 2 deletions)
@@ -31,7 +31,8 @@
from .video_concat_dataset import ConcatVideoDataset
from .mmgenbench import MMGenBench
from .cgbench import CGBench_MCQ_Grounding_Mini, CGBench_OpenEnded_Mini, CGBench_MCQ_Grounding, CGBench_OpenEnded

from .moviechat1k import MovieChat1k
from .vdc import VDC
from .miabench import MIABench
from .cmmmu import CMMMU
from .wildvision import WildVision
@@ -138,7 +139,7 @@ def evaluate(self, eval_file, **judge_kwargs):
]

VIDEO_DATASET = [
MMBenchVideo, VideoMME, MVBench, MVBench_MP4, LongVideoBench,
MMBenchVideo, VideoMME, MVBench, MVBench_MP4, LongVideoBench, VDC, MovieChat1k,
MLVU, MLVU_MCQ, MLVU_OpenEnded,
TempCompass, TempCompass_MCQ, TempCompass_Captioning, TempCompass_YorN,
CGBench_MCQ_Grounding_Mini, CGBench_OpenEnded_Mini, CGBench_MCQ_Grounding, CGBench_OpenEnded
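Roughly, classes listed in `VIDEO_DATASET` can be resolved by matching a requested dataset name against what each class reports it supports. A sketch of one way such a dispatch can work, with illustrative stub classes and names (the real `MovieChat1k` and `VDC` implementations live in the newly imported modules above):

```python
class MovieChat1kStub:
    @classmethod
    def supported_datasets(cls):
        # Illustrative names only.
        return ['MovieChat1k_global', 'MovieChat1k_breakpoint']

class VDCStub:
    @classmethod
    def supported_datasets(cls):
        return ['VDC']

VIDEO_DATASET = [MovieChat1kStub, VDCStub]

def resolve_video_dataset(name):
    # Return the first registered class that claims the requested dataset name.
    for cls in VIDEO_DATASET:
        if name in cls.supported_datasets():
            return cls
    raise ValueError(f'Unknown video dataset: {name}')

print(resolve_video_dataset('VDC').__name__)  # VDCStub
```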