
support frames packing for minicpmv4_5 video processing #8046

Open
fanqiNO1 wants to merge 5 commits into modelscope:main from fanqiNO1:minicpmv4_5

Conversation


fanqiNO1 commented Feb 13, 2026

PR type

  • Bug Fix
  • New Feature
  • Document Updates
  • More Models or Datasets Support

PR information

Following the "Chat With Video" feature of MiniCPM-V-4.5 (https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5), I implemented frames packing to enable more efficient video processing for MiniCPM-V 4.5.
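
For intuition, here is how the packing decision plays out numerically on a hypothetical 120-second video sampled at choose_fps = 5. The constants and branch logic match the verification script below; the first branch is simplified to omit the clamp against the source fps:

import math

MAX_NUM_FRAMES = 180
MAX_NUM_PACKING = 3

# Hypothetical input: a 120-second video sampled at 5 frames per second.
video_duration, choose_fps = 120, 5

if choose_fps * int(video_duration) <= MAX_NUM_FRAMES:
    # Short video: every sampled frame gets its own group.
    packing_nums = 1
    choose_frames = round(choose_fps * min(MAX_NUM_FRAMES, video_duration))
else:
    # Long video: pack several frames per group, capped at MAX_NUM_PACKING.
    packing_nums = math.ceil(video_duration * choose_fps / MAX_NUM_FRAMES)  # ceil(600 / 180) = 4
    if packing_nums <= MAX_NUM_PACKING:
        choose_frames = round(video_duration * choose_fps)
    else:
        choose_frames = round(MAX_NUM_FRAMES * MAX_NUM_PACKING)  # 180 * 3 = 540
        packing_nums = MAX_NUM_PACKING                           # capped at 3

print(packing_nums, choose_frames)  # -> 3 540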

Experiment results

I verified that the behavior of my implementation is consistent with that of the officially provided script. The script is as follows:

Verification script:
import math
import json
import os
from copy import deepcopy
from decord import VideoReader, cpu

import numpy as np
import torch
from PIL import Image
from scipy.spatial import cKDTree
from transformers import AutoProcessor

from swift.model import get_processor
from swift.template import get_template


MAX_NUM_FRAMES = 180
MAX_NUM_PACKING = 3
TIME_SCALE = 0.1

video_path = "./test_video.mp4"
user_prompt = "Describe the video"
fps = 5
force_packing = None


def map_to_nearest_scale(values, scale):
    tree = cKDTree(np.asarray(scale)[:, None])
    _, indices = tree.query(np.asarray(values)[:, None])
    return np.asarray(scale)[indices]


def group_array(arr, size):
    return [arr[i:i+size] for i in range(0, len(arr), size)]


def encode_video(video_path, choose_fps=3, force_packing=None):
    def uniform_sample(l, n):
        gap = len(l) / n
        idxs = [int(i * gap + gap / 2) for i in range(n)]
        return [l[i] for i in idxs]
    vr = VideoReader(video_path, ctx=cpu(0))
    fps = vr.get_avg_fps()
    video_duration = len(vr) / fps
        
    if choose_fps * int(video_duration) <= MAX_NUM_FRAMES:
        packing_nums = 1
        choose_frames = round(min(choose_fps, round(fps)) * min(MAX_NUM_FRAMES, video_duration))
        
    else:
        packing_nums = math.ceil(video_duration * choose_fps / MAX_NUM_FRAMES)
        if packing_nums <= MAX_NUM_PACKING:
            choose_frames = round(video_duration * choose_fps)
        else:
            choose_frames = round(MAX_NUM_FRAMES * MAX_NUM_PACKING)
            packing_nums = MAX_NUM_PACKING

    frame_idx = list(range(len(vr)))
    frame_idx = np.array(uniform_sample(frame_idx, choose_frames))

    if force_packing:
        packing_nums = min(force_packing, MAX_NUM_PACKING)
    
    print(video_path, ' duration:', video_duration)
    print(f'get video frames={len(frame_idx)}, packing_nums={packing_nums}')
    
    frames = vr.get_batch(frame_idx).asnumpy()

    frame_idx_ts = frame_idx / fps
    scale = np.arange(0, video_duration, TIME_SCALE)

    frame_ts_id = map_to_nearest_scale(frame_idx_ts, scale) / TIME_SCALE
    frame_ts_id = frame_ts_id.astype(np.int32)

    assert len(frames) == len(frame_ts_id)

    frames = [Image.fromarray(v.astype('uint8')).convert('RGB') for v in frames]
    frame_ts_id_group = group_array(frame_ts_id, packing_nums)
    
    return frames, frame_ts_id_group


def minicpmv4_5_official():
    processor = AutoProcessor.from_pretrained("OpenBMB/MiniCPM-V-4_5", trust_remote_code=True)

    frames, frame_ts_id_group = encode_video(video_path, fps, force_packing=force_packing)

    messages_list = [[{'role': 'user', 'content': frames + [user_prompt]}]]
    images_list = [None]

    prompts_lists = []
    input_images_lists = []

    for image, msgs in zip(images_list, messages_list):
        if isinstance(msgs, str):
            msgs = json.loads(msgs)
        copy_msgs = deepcopy(msgs)

        if image is not None and isinstance(copy_msgs[0]["content"], str):
            copy_msgs[0]["content"] = [image, copy_msgs[0]["content"]]

        images = []
        for i, msg in enumerate(copy_msgs):
            role = msg["role"]
            content = msg["content"]
            assert role in ["system", "user", "assistant"]
            if isinstance(content, str):
                content = [content]
            cur_msgs = []
            for c in content:
                if isinstance(c, Image.Image):
                    images.append(c)
                    cur_msgs.append("(<image>./</image>)")
                elif isinstance(c, str):
                    cur_msgs.append(c)
            msg["content"] = "\n".join(cur_msgs)

        prompts_lists.append(processor.tokenizer.apply_chat_template(copy_msgs, tokenize=False, add_generation_prompt=True, enable_thinking=False))
        input_images_lists.append(images)

    inputs = processor(
        prompts_lists,
        input_images_lists,
        max_slice_nums=1,
        use_image_id=False,
        temporal_ids=frame_ts_id_group,
        return_tensors="pt"
    )

    input_string = processor.tokenizer.batch_decode(inputs["input_ids"], skip_special_tokens=False)
    print("Official Decoded input string:", input_string[0])
    return inputs


def swift_template_test():

    os.environ["VIDEO_MAX_SLICE_NUMS"] = "1"
    os.environ["MAX_NUM_FRAMES"] = str(MAX_NUM_FRAMES)
    os.environ["MAX_NUM_PACKING"] = str(MAX_NUM_PACKING)
    os.environ["TIME_SCALE"] = str(TIME_SCALE)
    os.environ["CHOOSE_FPS"] = str(fps)

    processor = get_processor("OpenBMB/MiniCPM-V-4_5")
    template = get_template(processor, enable_thinking=False)

    inputs = {
        "messages": [
            {"role": "user", "content": f"<video>{user_prompt}"}
        ],
        "videos": [video_path]
    }

    inputs = template.encode(inputs)
    input_string = template.safe_decode(inputs["input_ids"])
    print("Swift Decoded input string:", input_string)
    return inputs


def is_equal(value1, value2):
    if isinstance(value1, list) and isinstance(value2, list):
        if len(value1) != len(value2):
            return False
        for v1, v2 in zip(value1, value2):
            if not is_equal(v1, v2):
                return False
        return True
    elif isinstance(value1, torch.Tensor) and isinstance(value2, torch.Tensor):
        if value1.shape != value2.shape:
            print(f"Tensor shapes differ: {value1.shape} vs {value2.shape}")
            return False
        if not torch.equal(value1, value2):
            print(f"Tensor values differ at some positions.")
            return False
        return True
    else:
        return value1 == value2


def main():
    official_inputs = minicpmv4_5_official()
    swift_inputs = swift_template_test()

    print("Official inputs keys:", list(official_inputs.keys()))
    print("Swift inputs keys:", list(swift_inputs.keys()))

    for key in swift_inputs.keys():
        assert key in official_inputs, f"Key '{key}' not found in official inputs"
        print(f"Comparing key: {key}")

        if key == "input_ids":
            official_value = official_inputs[key][0].tolist()
        elif key == "pixel_values":
            official_value = official_inputs[key]
            for i in range(len(official_value[0])):
                official_value[0][i] = official_value[0][i].to(torch.bfloat16)
        else:
            official_value = official_inputs[key]
        swift_value = swift_inputs[key]

        assert isinstance(official_value, list)
        assert isinstance(swift_value, list)
        assert len(official_value) == len(swift_value), f"len(official[{key}])={len(official_value)} vs len(swift[{key}])={len(swift_value)})"

        for i, (o, s) in enumerate(zip(official_value, swift_value)):
            if not is_equal(o, s):
                print(f"❌ Difference found in key '{key}' at index {i}")
                break
                # raise AssertionError(f"Values for key '{key}' at index {i} do not match.")
        else:
            print(f"✔️ Values match for key '{key}'.")

if __name__ == "__main__":
    main()

@gemini-code-assist (Contributor) commented:

Summary of Changes

Hello @fanqiNO1, I'm Gemini Code Assist! I'm currently reviewing this pull request and will post my feedback shortly. In the meantime, here's a summary to help you and other reviewers quickly get up to speed!

This pull request significantly enhances the video processing capabilities of the MiniCPM-V-4.5 model by implementing a frames packing mechanism. This feature allows for more optimized handling of video inputs through intelligent frame sampling and grouping, which is expected to improve performance and resource utilization for video-based tasks. The changes include the introduction of new configuration parameters and core logic for managing video frames.

Highlights

  • Frames Packing for MiniCPM-V-4.5: Introduced functionality to pack video frames for the MiniCPM-V-4.5 model, enabling more efficient video processing.
  • New Command-Line Parameters: Added several new command-line parameters specifically for minicpmv4_5 to control video processing and frame packing behavior.
  • Dedicated Video Loading Function: Implemented a new utility function, load_video_minicpmv4_5, responsible for advanced video frame sampling, time scaling, and packing logic (a sketch follows this list).
  • Template Integration: Integrated the new video processing and frames packing logic into the MiniCPMV4_5Template to handle video inputs efficiently.
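
Based on the encode_video logic in the verification script above, here is a minimal sketch of what load_video_minicpmv4_5 plausibly looks like; the exact signature, environment-variable fallbacks, and defaults are assumptions, not the merged code:

import math
import os

import numpy as np
from decord import VideoReader, cpu
from PIL import Image
from scipy.spatial import cKDTree


def load_video_minicpmv4_5(video_path, choose_fps=None, force_packing=None):
    # Sketch only: env-var names follow the test script above; the PR's
    # actual implementation may differ in details.
    max_num_frames = int(os.environ.get('MAX_NUM_FRAMES', '180'))
    max_num_packing = int(os.environ.get('MAX_NUM_PACKING', '3'))
    time_scale = float(os.environ.get('TIME_SCALE', '0.1'))
    choose_fps = choose_fps or int(os.environ.get('CHOOSE_FPS', '3'))

    vr = VideoReader(video_path, ctx=cpu(0))
    fps = vr.get_avg_fps()
    duration = len(vr) / fps

    # Decide how many frames to sample and how many to pack per group.
    if choose_fps * int(duration) <= max_num_frames:
        packing_nums = 1
        choose_frames = round(min(choose_fps, round(fps)) * min(max_num_frames, duration))
    else:
        packing_nums = math.ceil(duration * choose_fps / max_num_frames)
        if packing_nums <= max_num_packing:
            choose_frames = round(duration * choose_fps)
        else:
            choose_frames = round(max_num_frames * max_num_packing)
            packing_nums = max_num_packing
    if force_packing:
        packing_nums = min(force_packing, max_num_packing)

    # Uniformly sample frame indices, snap their timestamps to a TIME_SCALE
    # grid, and split the resulting timestamp ids into packing groups.
    gap = len(vr) / choose_frames
    frame_idx = np.array([int(i * gap + gap / 2) for i in range(choose_frames)])
    frames = vr.get_batch(frame_idx).asnumpy()

    scale = np.arange(0, duration, time_scale)
    tree = cKDTree(scale[:, None])
    _, nearest = tree.query((frame_idx / fps)[:, None])
    frame_ts_id = (scale[nearest] / time_scale).astype(np.int32)

    frames = [Image.fromarray(f.astype('uint8')).convert('RGB') for f in frames]
    groups = [frame_ts_id[i:i + packing_nums] for i in range(0, len(frame_ts_id), packing_nums)]
    return frames, groups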


Changelog
  • docs/source/Instruction/Command-line-parameters.md
    • Added new command-line parameters for minicpmv4_5, including MAX_SLICE_NUMS, VIDEO_MAX_SLICE_NUMS, MAX_NUM_FRAMES, MAX_NUM_PACKING, TIME_SCALE, CHOOSE_FPS, and FORCE_PACKING (a configuration sketch follows this changelog).
  • docs/source_en/Instruction/Command-line-parameters.md
    • Added new English command-line parameters for minicpmv4_5, mirroring the Chinese documentation.
  • swift/template/templates/minicpm.py
    • Imported the new load_video_minicpmv4_5 utility function.
    • Implemented init_env_args in MiniCPMV4_5Template to load new environment arguments for video processing.
    • Overrode replace_tag to utilize the new video loading function and introduced replace_video2image for video-to-image conversion logic.
    • Modified the _encode method to pass temporal_ids to the image processor, supporting the new frame packing.
  • swift/template/vision_utils.py
    • Extracted _uniform_sample into a global helper function for reuse.
    • Added load_video_minicpmv4_5 function, which encapsulates the core logic for video frame sampling, time scaling, and packing based on various configurable parameters.
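
These parameters are read from environment variables, so a run configures them before building the template, just as the test script above does. A minimal configuration sketch, assuming the variable names from the changelog; the MAX_SLICE_NUMS and FORCE_PACKING values are purely illustrative:

import os

# Variable names follow the documentation changelog above; values mirror
# the verification script. MAX_SLICE_NUMS / FORCE_PACKING are illustrative.
os.environ['MAX_SLICE_NUMS'] = '9'        # image slicing cap (illustrative)
os.environ['VIDEO_MAX_SLICE_NUMS'] = '1'  # disable slicing for video frames
os.environ['MAX_NUM_FRAMES'] = '180'
os.environ['MAX_NUM_PACKING'] = '3'
os.environ['TIME_SCALE'] = '0.1'
os.environ['CHOOSE_FPS'] = '5'
os.environ['FORCE_PACKING'] = '2'         # optional: force packing group size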

@gemini-code-assist (Contributor) left a comment

Code Review

This pull request adds support for frames packing in video processing for the MiniCPM-V 4.5 model. The changes include adding a new video loading utility, updating the corresponding model template, and documenting the new command-line parameters.

My review focuses on improving code maintainability and documentation consistency. I've pointed out some code duplication in the template class and suggested refactoring opportunities in the vision utilities. I also found some minor formatting inconsistencies in the documentation files. Overall, the implementation of the new feature seems correct and follows the project's patterns.

Comment on lines +311 to +317
def map_to_nearest_scale(values, scale):
    tree = cKDTree(np.asarray(scale)[:, None])
    _, indices = tree.query(np.asarray(values)[:, None])
    return np.asarray(scale)[indices]

def group_array(arr, size):
    return [arr[i:i+size] for i in range(0, len(arr), size)]
Severity: medium

The helper functions map_to_nearest_scale and group_array are defined inside load_video_minicpmv4_5. To improve code organization and potential reusability, consider moving them to the module level as private functions (e.g., _map_to_nearest_scale and _group_array), similar to how _uniform_sample was refactored.
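
A sketch of the suggested refactor; the underscore-prefixed module-level names are the reviewer's proposal, not necessarily the merged code:

import numpy as np
from scipy.spatial import cKDTree


def _map_to_nearest_scale(values, scale):
    # Snap each value to the nearest point on the scale grid.
    tree = cKDTree(np.asarray(scale)[:, None])
    _, indices = tree.query(np.asarray(values)[:, None])
    return np.asarray(scale)[indices]


def _group_array(arr, size):
    # Split arr into consecutive chunks of at most `size` elements.
    return [arr[i:i + size] for i in range(0, len(arr), size)]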

fanqiNO1 and others added 4 commits February 13, 2026 17:53
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
