ModelScope Community · December 9, 2024
InternVL 2.5: the first open-source model to exceed 70% on MMMU, with performance on par with GPT-4o

 

InternVL 2.5 has been released, with performance on par with leading commercial models such as GPT-4o and Claude-3.5-Sonnet, making it the first open-source model to exceed 70% accuracy on MMMU. Chain-of-thought (CoT) reasoning contributes a 3.7-point gain, showing strong test-time scaling potential. InternVL 2.5 builds on InternVL 2.0, further improving performance through enhanced training and testing strategies and higher-quality data, and it delivers highly competitive results across a wide range of benchmarks.

🚀 InternVL 2.5 is released as an open-source multimodal large language model that pushes the performance frontier through data optimization and test-time scaling, setting a new bar for the open-source community.

📊 Across a broad set of benchmarks, InternVL 2.5 is highly competitive, surpassing the commercial models GPT-4o and Claude-3.5-Sonnet in some cases and standing out in multi-discipline reasoning, document understanding, and multi-image/video understanding.

📚 The model adopts new training and testing strategies together with high-quality datasets, enabling it to handle multiple modalities, including text, images, and video.

🧠 With chain-of-thought reasoning, the model exceeds 70% accuracy on the MMMU benchmark, demonstrating strong test-time scaling potential.

👨‍💻 InternVL 2.5 provides the open-source community with a new baseline for developing and applying multimodal AI systems, advancing multimodal AI technology and its adoption.

2024-12-09 20:43 · Zhejiang

Chain-of-thought (CoT) reasoning brings a 3.7-point gain, demonstrating strong test-time scaling potential

01


Introduction



InternVL 2.5 has recently been released. Its performance rivals leading commercial models such as GPT-4o and Claude-3.5-Sonnet, and it is the first open-source model to exceed 70% on MMMU, where chain-of-thought (CoT) reasoning contributes a 3.7-point improvement and demonstrates strong test-time scaling potential. InternVL 2.5 evolves from InternVL 2.0, further improving performance through enhanced training and testing strategies and higher-quality data. The work systematically studies the relationship between model scale and performance, covering the vision encoder, the language model, dataset size, and test-time configurations. Extensive evaluation shows competitive results across many benchmarks, particularly in multi-discipline reasoning, document understanding, multi-image/video understanding, real-world comprehension, multimodal hallucination detection, visual grounding, multilingual capability, and pure language processing.
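The MMMU gain comes from test-time chain-of-thought prompting rather than any architecture change. As a rough, hypothetical illustration of what that means in practice (the exact evaluation prompt is not given in this post, so the wording below is an assumption), the only difference is how the question is posed to the chat interface shown in the inference section later:

import textwrap

# Hypothetical prompts contrasting direct answering with chain-of-thought (CoT) at test time.
# The actual MMMU evaluation prompt is not reproduced in this post.
question = 'Which of the options (A, B, C, D) is correct?'

direct_prompt = f'<image>\n{question}\nAnswer with the option letter only.'
cot_prompt = textwrap.dedent(f'''\
    <image>
    {question}
    Reason through the problem step by step, then give the final option letter.''')

# Either prompt would be passed to the same chat API used later in this post, e.g.:
# response = model.chat(tokenizer, pixel_values, cot_prompt, generation_config)
print(direct_prompt)
print(cot_prompt)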


Key takeaways:


1. InternVL 2.5 is released as an open-source multimodal large language model that pushes the performance frontier through data and test-time scaling.


2. Experiments show that InternVL 2.5 is competitive across a wide range of benchmarks, surpassing the commercial models GPT-4o and Claude-3.5-Sonnet in some cases.


3. The model adopts new training and testing strategies together with high-quality datasets, and can handle multiple modalities, including text, images, and video.


4. With chain-of-thought reasoning, the model exceeds 70% accuracy on the MMMU benchmark, demonstrating strong test-time scaling potential.


5. The work provides the open-source community with a new baseline for developing and applying multimodal AI systems.



InternVL 2.5 retains the same model architecture as its predecessors, InternVL 1.5 and InternVL 2.0, following the "ViT-MLP-LLM" paradigm widely adopted in MLLM research. It integrates a newly incrementally pre-trained InternViT-6B or InternViT-300M with pre-trained LLMs of various sizes and families, including InternLM 2.5 and Qwen 2.5, using a randomly initialized two-layer MLP projector. As in previous versions, to improve scalability for high-resolution processing, the team simply applies a pixel unshuffle operation that reduces the number of visual tokens to one quarter of the original. As a result, in InternVL 2.5 a 448×448 image tile is represented by 256 visual tokens.
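For intuition, a minimal sketch of the token-count arithmetic follows (it illustrates the idea, not the exact implementation in the released code): with a patch size of 14, a 448×448 tile becomes a 32×32 grid of ViT tokens, and folding each 2×2 neighborhood into the channel dimension leaves 16×16 = 256 tokens for the MLP projector.

import torch

def pixel_unshuffle(x: torch.Tensor, scale: float = 0.5) -> torch.Tensor:
    """Fold 2x2 neighborhoods of visual tokens into the channel dimension (illustrative sketch)."""
    n, h, w, c = x.shape
    x = x.view(n, h, int(w * scale), int(c / scale))                    # merge adjacent pairs along the width
    x = x.permute(0, 2, 1, 3).contiguous()
    x = x.view(n, int(w * scale), int(h * scale), int(c / scale ** 2))  # then along the height
    return x

# A 448x448 tile through a patch-14 ViT yields a 32x32 token grid (the hidden size here is illustrative).
vit_tokens = torch.randn(1, 32, 32, 1024)
reduced = pixel_unshuffle(vit_tokens)
print(reduced.shape)                        # torch.Size([1, 16, 16, 4096])
print(reduced.shape[1] * reduced.shape[2])  # 256 visual tokens per tile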



Model links:

https://www.modelscope.cn/collections/InternVL-25-fbde6e47302942


02


Model Download


Command-line download:

modelscope download --model OpenGVLab/InternVL2_5-4B

Python SDK download:

# Download the model
from modelscope import snapshot_download
model_dir = snapshot_download('OpenGVLab/InternVL2_5-4B')


03


Model Inference



Inference with transformers

import numpy as np
import torch
import torchvision.transforms as T
from decord import VideoReader, cpu
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from modelscope import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    best_ratio_diff = float('inf')
    best_ratio = (1, 1)
    area = width * height
    for ratio in target_ratios:
        target_aspect_ratio = ratio[0] / ratio[1]
        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
        if ratio_diff < best_ratio_diff:
            best_ratio_diff = ratio_diff
            best_ratio = ratio
        elif ratio_diff == best_ratio_diff:
            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                best_ratio = ratio
    return best_ratio

def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # calculate the existing image aspect ratio
    target_ratios = set(
        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1)
        if i * j <= max_num and i * j >= min_num)
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

    # find the closest aspect ratio to the target
    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size)

    # calculate the target width and height
    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

    # resize the image
    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size
        )
        # split the image
        split_img = resized_img.crop(box)
        processed_images.append(split_img)
    assert len(processed_images) == blocks
    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)
    return processed_images

def load_image(image_file, input_size=448, max_num=12):
    image = Image.open(image_file).convert('RGB')
    transform = build_transform(input_size=input_size)
    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    pixel_values = [transform(image) for image in images]
    pixel_values = torch.stack(pixel_values)
    return pixel_values

# If you want to load a model using multiple GPUs, please refer to the `Multiple GPUs` section.
path = 'OpenGVLab/InternVL2_5-4B'
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    use_flash_attn=True,
    trust_remote_code=True).eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)

# set the max number of tiles in `max_num`
pixel_values = load_image('./awesome.png', max_num=12).to(torch.bfloat16).cuda()
generation_config = dict(max_new_tokens=1024, do_sample=True)

# pure-text conversation
question = 'Hello, who are you?'
response, history = model.chat(tokenizer, None, question, generation_config, history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')

question = 'Can you tell me a story?'
response, history = model.chat(tokenizer, None, question, generation_config, history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')

# single-image single-round conversation
question = '<image>\nPlease describe the image shortly.'
response = model.chat(tokenizer, pixel_values, question, generation_config)
print(f'User: {question}\nAssistant: {response}')

# single-image multi-round conversation
question = '<image>\nPlease describe the image in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')

question = 'Please write a poem according to the image.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')

# multi-image multi-round conversation, combined images
pixel_values1 = load_image('./awesome.png', max_num=12).to(torch.bfloat16).cuda()
pixel_values2 = load_image('./noword.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)

question = '<image>\nDescribe the two images in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')

question = 'What are the similarities and differences between these two images.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')

# multi-image multi-round conversation, separate images
pixel_values1 = load_image('./awesome.png', max_num=12).to(torch.bfloat16).cuda()
pixel_values2 = load_image('./noword.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]

question = 'Image-1: <image>\nImage-2: <image>\nDescribe the two images in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               num_patches_list=num_patches_list, history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')

question = 'What are the similarities and differences between these two images.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               num_patches_list=num_patches_list, history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')

# batch inference, single image per sample
pixel_values1 = load_image('./awesome.png', max_num=12).to(torch.bfloat16).cuda()
pixel_values2 = load_image('./noword.jpg', max_num=12).to(torch.bfloat16).cuda()
num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)

questions = ['<image>\nDescribe the image in detail.'] * len(num_patches_list)
responses = model.batch_chat(tokenizer, pixel_values,
                             num_patches_list=num_patches_list,
                             questions=questions,
                             generation_config=generation_config)
for question, response in zip(questions, responses):
    print(f'User: {question}\nAssistant: {response}')

# video multi-round conversation
def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
    if bound:
        start, end = bound[0], bound[1]
    else:
        start, end = -100000, 100000
    start_idx = max(first_idx, round(start * fps))
    end_idx = min(round(end * fps), max_frame)
    seg_size = float(end_idx - start_idx) / num_segments
    frame_indices = np.array([
        int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
        for idx in range(num_segments)
    ])
    return frame_indices

def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32):
    vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
    max_frame = len(vr) - 1
    fps = float(vr.get_avg_fps())

    pixel_values_list, num_patches_list = [], []
    transform = build_transform(input_size=input_size)
    frame_indices = get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments)
    for frame_index in frame_indices:
        img = Image.fromarray(vr[frame_index].asnumpy()).convert('RGB')
        img = dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
        pixel_values = [transform(tile) for tile in img]
        pixel_values = torch.stack(pixel_values)
        num_patches_list.append(pixel_values.shape[0])
        pixel_values_list.append(pixel_values)
    pixel_values = torch.cat(pixel_values_list)
    return pixel_values, num_patches_list

video_path = './showcase.mp4'
pixel_values, num_patches_list = load_video(video_path, num_segments=8, max_num=1)
pixel_values = pixel_values.to(torch.bfloat16).cuda()
video_prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list))])
question = video_prefix + 'What is the red panda doing?'
# Frame1: <image>\nFrame2: <image>\n...\nFrame8: <image>\n{question}
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               num_patches_list=num_patches_list, history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')

question = 'Describe this video in detail. Don\'t repeat.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               num_patches_list=num_patches_list, history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')


Streaming output:

from transformers import TextIteratorStreamer
from threading import Thread

# Initialize the streamer
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10)
# Define the generation configuration
generation_config = dict(max_new_tokens=1024, do_sample=False, streamer=streamer)
# Start the model chat in a separate thread
thread = Thread(target=model.chat, kwargs=dict(
    tokenizer=tokenizer, pixel_values=pixel_values, question=question,
    history=None, return_history=False, generation_config=generation_config,
))
thread.start()

# Initialize an empty string to store the generated text
generated_text = ''
# Loop through the streamer to get the new text as it is generated
for new_text in streamer:
    if new_text == model.conv_template.sep:
        break
    generated_text += new_text
    print(new_text, end='', flush=True)  # Print each new chunk of generated text on the same line

GPU memory usage:
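To check the footprint on your own setup, a minimal sketch (assuming a single CUDA device, wrapped around the chat calls above):

import torch

torch.cuda.reset_peak_memory_stats()

# ... run the model loading and model.chat(...) calls from the section above here ...

peak_gib = torch.cuda.max_memory_allocated() / 1024 ** 3
print(f'Peak allocated GPU memory: {peak_gib:.2f} GiB')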



Inference with lmdeploy

Install dependencies:

pip install lmdeploy -U

Example code:

from lmdeploy import pipeline, TurbomindEngineConfig
from lmdeploy.vl import load_image
from modelscope import snapshot_download

model = snapshot_download('OpenGVLab/InternVL2_5-4B')
image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
pipe = pipeline(model, backend_config=TurbomindEngineConfig(session_len=8192))
response = pipe(('describe this image', image))
print(response.text)


Deploy a local service with lmdeploy:

lmdeploy serve api_server ./InternVL2_5-4B/ --backend turbomind --server-port 23333

Query the service:

from openai import OpenAI

client = OpenAI(api_key='YOUR_API_KEY', base_url='http://0.0.0.0:23333/v1')
model_name = client.models.list().data[0].id
response = client.chat.completions.create(
    model=model_name,
    messages=[{
        'role': 'user',
        'content': [{
            'type': 'text',
            'text': 'describe this image',
        }, {
            'type': 'image_url',
            'image_url': {
                'url': 'https://modelscope.oss-cn-beijing.aliyuncs.com/resource/tiger.jpeg',
            },
        }],
    }],
    temperature=0.8,
    top_p=0.8)
print(response)


04


Model Training



We use ms-swift 3.0 to fine-tune InternVL2.5-2B. ms-swift is the official LLM and multimodal LLM fine-tuning and deployment framework from the ModelScope community, supporting 400+ LLMs and 100+ multimodal LLMs.

Here we fine-tune InternVL2.5-2B for LaTeX-OCR using the Python API. This is a quick way to get familiar with the details of fine-tuning, which helps when customizing the training pipeline.

If you run into compatibility issues, please refer to:

https://github.com/modelscope/ms-swift/tree/main/examples/train/notebook


First, install ms-swift 3.0 from source:

git clone https://github.com/modelscope/ms-swift.git
cd ms-swift
pip install -e '.[llm]'


Next, import the required packages:

import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

from swift.llm import (
    get_model_tokenizer, load_dataset, get_template, EncodePreprocessor, get_model_arch,
    get_multimodal_target_regex, LazyLLMDataset
)
from swift.utils import get_logger, get_model_parameter_info, plot_images, seed_everything
from swift.tuners import Swift, LoraConfig
from swift.trainers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from functools import partial

logger = get_logger()
seed_everything(42)


Set the training hyperparameters:

# model
model_id_or_path = 'OpenGVLab/InternVL2_5-2B'
system = None  # use the default system prompt defined in the template
output_dir = 'output/InternVL2_5-2B'

# dataset
dataset = ['AI-ModelScope/LaTeX_OCR#20000']  # dataset_id or dataset_path; here we sample 20,000 examples
data_seed = 42
max_length = 8192
split_dataset_ratio = 0.01  # ratio of the validation split
num_proc = 4  # number of processes for data preprocessing
strict = False

# lora
lora_rank = 8
lora_alpha = 32
freeze_llm = False
freeze_vit = True
freeze_aligner = True

# training_args
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-4,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_checkpointing=True,
    weight_decay=0.1,
    lr_scheduler_type='cosine',
    warmup_ratio=0.05,
    report_to=['tensorboard'],
    logging_first_step=True,
    save_strategy='steps',
    save_steps=100,
    eval_strategy='steps',
    eval_steps=100,
    gradient_accumulation_steps=16,  # to see training results sooner this can be set to 1; normally a larger value is used
    num_train_epochs=1,
    metric_for_best_model='loss',
    save_total_limit=2,
    logging_steps=5,
    dataloader_num_workers=4,
    data_seed=data_seed,
    remove_unused_columns=False
)

output_dir = os.path.abspath(os.path.expanduser(output_dir))
logger.info(f'output_dir: {output_dir}')


Prepare the model and the chat template:

# Get the model and template
model, processor = get_model_tokenizer(model_id_or_path)
logger.info(f'model_info: {model.model_info}')
template = get_template(model.model_meta.template, processor, default_system=system, max_length=max_length)
template.set_mode('train')

# Get target_modules and add trainable LoRA modules to the model
model_arch = get_model_arch(model.model_meta.model_arch)
target_modules = get_multimodal_target_regex(model_arch, freeze_llm=freeze_llm, freeze_vit=freeze_vit,
                                             freeze_aligner=freeze_aligner)
lora_config = LoraConfig(task_type='CAUSAL_LM', r=lora_rank, lora_alpha=lora_alpha,
                         target_modules=target_modules)
model = Swift.prepare_model(model, lora_config)
logger.info(f'lora_config: {lora_config}')

# Print the model structure and the trainable parameters
logger.info(f'model: {model}')
model_parameter_info = get_model_parameter_info(model)
logger.info(f'model_parameter_info: {model_parameter_info}')


Prepare the training and validation datasets:

# Download and load the dataset, then split it into training and validation sets
train_dataset, val_dataset = load_dataset(dataset, split_dataset_ratio=split_dataset_ratio, num_proc=num_proc,
                                          strict=strict, seed=data_seed)

logger.info(f'train_dataset: {train_dataset}')
logger.info(f'val_dataset: {val_dataset}')
logger.info(f'train_dataset[0]: {train_dataset[0]}')

# Encode the text into tokens
train_dataset = LazyLLMDataset(train_dataset, template.encode, strict=strict, random_state=data_seed)
val_dataset = LazyLLMDataset(val_dataset, template.encode, strict=strict, random_state=data_seed)
data = train_dataset[0]
logger.info(f'encoded_train_dataset[0]: {data}')

template.print_inputs(data)


Start training with the trainer:

model.enable_input_require_grads()  # compatibility with gradient checkpointing
template.register_post_encode_hook([model])  # register post_encode as a forward_pre_hook
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=template.data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    template=template,
)
trainer.model_accepts_loss_kwargs = True  # compatibility with transformers>=4.46
trainer.train()

last_model_checkpoint = trainer.state.last_model_checkpoint
logger.info(f'last_model_checkpoint: {last_model_checkpoint}')


Visualize the training loss (we only trained for 400 steps here):

You can also visualize the loss during training with TensorBoard: `tensorboard --logdir '{output_dir}/runs'`

images_dir = os.path.join(output_dir, 'images')
logger.info(f'images_dir: {images_dir}')
plot_images(images_dir, training_args.logging_dir, ['train/loss'], 0.9)  # save the training loss plot

Inference after training

Import the required packages:

import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

from swift.llm import (
    InferEngine, InferRequest, PtEngine, RequestConfig, get_template, load_dataset, load_image
)
from swift.tuners import Swift
from swift.utils import get_model_parameter_info, get_logger, seed_everything

logger = get_logger()
seed_everything(42)


Set the inference hyperparameters:

last_model_checkpoint = 'output/InternVL2_5-2B/vx-xxx/checkpoint-xxx'

# model
model_id_or_path = 'OpenGVLab/InternVL2_5-2B'  # model_id or model_path

# dataset
dataset = ['AI-ModelScope/LaTeX_OCR#20000']
data_seed = 42
split_dataset_ratio = 0.01
num_proc = 4
strict = False

# generation_config
max_new_tokens = 512
temperature = 0


We use the 'pt' infer_backend for inference with the trained model. To accelerate inference with vllm/lmdeploy, see: https://github.com/modelscope/ms-swift/blob/main/examples/infer/demo_mllm.py

engine = PtEngine(model_id_or_path)
engine.model = Swift.from_pretrained(engine.model, last_model_checkpoint)
engine.model.requires_grad_()  # workaround related to peft setting requires_grad=True on the embedding layer
template = get_template(engine.model.model_meta.template, engine.tokenizer)

model_parameter_info = get_model_parameter_info(engine.model)
logger.info(f'model_parameter_info: {model_parameter_info}')

Get the validation set:

# Since data_seed is set, this is the same validation set used during training
_, val_dataset = load_dataset(dataset, split_dataset_ratio=split_dataset_ratio, num_proc=num_proc,
                              strict=strict, seed=data_seed)
val_dataset = val_dataset.select(range(10))  # take the first 10 examples

Run streaming inference and save the images from the validation set:

def infer_stream(engine: InferEngine, infer_request: InferRequest):
    request_config = RequestConfig(max_tokens=max_new_tokens, temperature=temperature, stream=True)
    gen = engine.infer([infer_request], request_config)
    query = infer_request.messages[0]['content']
    print(f'query: {query}\nresponse: ', end='')
    for resp_list in gen:
        print(resp_list[0].choices[0].delta.content, end='', flush=True)
    print()

os.makedirs('images', exist_ok=True)
for i, data in enumerate(val_dataset):
    image = load_image(data['images'][0]['bytes'])
    image.save(f'images/{i}.png')
    infer_stream(engine, InferRequest(**data))
    print('-' * 50)


Inference results:




