
support frames packing for minicpmv4_5 video processing #8046

Open
fanqiNO1 wants to merge 5 commits into modelscope:main from fanqiNO1:minicpmv4_5

Conversation


fanqiNO1 commented Feb 13, 2026

PR type

  • Bug Fix
  • New Feature
  • Document Updates
  • More Models or Datasets Support

PR information

Following the "Chat With Video" feature of MiniCPM-V-4.5 (https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5), I implemented frames packing to enable more efficient video processing for MiniCPM-V 4.5.
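
For intuition, here is how the packing decision plays out numerically on a hypothetical 120-second video sampled at choose_fps = 5. The constants and branch logic match the verification script below; the first branch is simplified to omit the clamp against the source fps:

import math

MAX_NUM_FRAMES = 180
MAX_NUM_PACKING = 3

# Hypothetical input: a 120-second video sampled at 5 frames per second.
video_duration, choose_fps = 120, 5

if choose_fps * int(video_duration) <= MAX_NUM_FRAMES:
    # Short video: every sampled frame gets its own group.
    packing_nums = 1
    choose_frames = round(choose_fps * min(MAX_NUM_FRAMES, video_duration))
else:
    # Long video: pack several frames per group, capped at MAX_NUM_PACKING.
    packing_nums = math.ceil(video_duration * choose_fps / MAX_NUM_FRAMES)  # ceil(600 / 180) = 4
    if packing_nums <= MAX_NUM_PACKING:
        choose_frames = round(video_duration * choose_fps)
    else:
        choose_frames = round(MAX_NUM_FRAMES * MAX_NUM_PACKING)  # 180 * 3 = 540
        packing_nums = MAX_NUM_PACKING                           # capped at 3

print(packing_nums, choose_frames)  # -> 3 540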

Experiment results

I verified that the behavior of my implementation is consistent with that of the officially provided script. The script is as follows:

Verification script:
import math
import json
import os
from copy import deepcopy
from decord import VideoReader, cpu

import numpy as np
import torch
from PIL import Image
from scipy.spatial import cKDTree
from transformers import AutoProcessor

from swift.model import get_processor
from swift.template import get_template


MAX_NUM_FRAMES = 180
MAX_NUM_PACKING = 3
TIME_SCALE = 0.1

video_path = "./test_video.mp4"
user_prompt = "Describe the video"
fps = 5
force_packing = None


def map_to_nearest_scale(values, scale):
    tree = cKDTree(np.asarray(scale)[:, None])
    _, indices = tree.query(np.asarray(values)[:, None])
    return np.asarray(scale)[indices]


def group_array(arr, size):
    return [arr[i:i+size] for i in range(0, len(arr), size)]


def encode_video(video_path, choose_fps=3, force_packing=None):
    def uniform_sample(l, n):
        gap = len(l) / n
        idxs = [int(i * gap + gap / 2) for i in range(n)]
        return [l[i] for i in idxs]
    vr = VideoReader(video_path, ctx=cpu(0))
    fps = vr.get_avg_fps()
    video_duration = len(vr) / fps
        
    if choose_fps * int(video_duration) <= MAX_NUM_FRAMES:
        packing_nums = 1
        choose_frames = round(min(choose_fps, round(fps)) * min(MAX_NUM_FRAMES, video_duration))
        
    else:
        packing_nums = math.ceil(video_duration * choose_fps / MAX_NUM_FRAMES)
        if packing_nums <= MAX_NUM_PACKING:
            choose_frames = round(video_duration * choose_fps)
        else:
            choose_frames = round(MAX_NUM_FRAMES * MAX_NUM_PACKING)
            packing_nums = MAX_NUM_PACKING

    frame_idx = list(range(len(vr)))
    frame_idx = np.array(uniform_sample(frame_idx, choose_frames))

    if force_packing:
        packing_nums = min(force_packing, MAX_NUM_PACKING)
    
    print(video_path, ' duration:', video_duration)
    print(f'get video frames={len(frame_idx)}, packing_nums={packing_nums}')
    
    frames = vr.get_batch(frame_idx).asnumpy()

    frame_idx_ts = frame_idx / fps
    scale = np.arange(0, video_duration, TIME_SCALE)

    frame_ts_id = map_to_nearest_scale(frame_idx_ts, scale) / TIME_SCALE
    frame_ts_id = frame_ts_id.astype(np.int32)

    assert len(frames) == len(frame_ts_id)

    frames = [Image.fromarray(v.astype('uint8')).convert('RGB') for v in frames]
    frame_ts_id_group = group_array(frame_ts_id, packing_nums)
    
    return frames, frame_ts_id_group


def minicpmv4_5_official():
    processor = AutoProcessor.from_pretrained("OpenBMB/MiniCPM-V-4_5", trust_remote_code=True)

    frames, frame_ts_id_group = encode_video(video_path, fps, force_packing=force_packing)

    messages_list = [[{'role': 'user', 'content': frames + [user_prompt]}]]
    images_list = [None]

    prompts_lists = []
    input_images_lists = []

    for image, msgs in zip(images_list, messages_list):
        if isinstance(msgs, str):
            msgs = json.loads(msgs)
        copy_msgs = deepcopy(msgs)

        if image is not None and isinstance(copy_msgs[0]["content"], str):
            copy_msgs[0]["content"] = [image, copy_msgs[0]["content"]]

        images = []
        for i, msg in enumerate(copy_msgs):
            role = msg["role"]
            content = msg["content"]
            assert role in ["system", "user", "assistant"]
            if isinstance(content, str):
                content = [content]
            cur_msgs = []
            for c in content:
                if isinstance(c, Image.Image):
                    images.append(c)
                    cur_msgs.append("(<image>./</image>)")
                elif isinstance(c, str):
                    cur_msgs.append(c)
            msg["content"] = "\n".join(cur_msgs)

        prompts_lists.append(processor.tokenizer.apply_chat_template(copy_msgs, tokenize=False, add_generation_prompt=True, enable_thinking=False))
        input_images_lists.append(images)

    inputs = processor(
        prompts_lists,
        input_images_lists,
        max_slice_nums=1,
        use_image_id=False,
        temporal_ids=frame_ts_id_group,
        return_tensors="pt"
    )

    input_string = processor.tokenizer.batch_decode(inputs["input_ids"], skip_special_tokens=False)
    print("Official Decoded input string:", input_string[0])
    return inputs


def swift_template_test():

    os.environ["VIDEO_MAX_SLICE_NUMS"] = "1"
    os.environ["MAX_NUM_FRAMES"] = str(MAX_NUM_FRAMES)
    os.environ["MAX_NUM_PACKING"] = str(MAX_NUM_PACKING)
    os.environ["TIME_SCALE"] = str(TIME_SCALE)
    os.environ["CHOOSE_FPS"] = str(fps)

    processor = get_processor("OpenBMB/MiniCPM-V-4_5")
    template = get_template(processor, enable_thinking=False)

    inputs = {
        "messages": [
            {"role": "user", "content": f"<video>{user_prompt}"}
        ],
        "videos": [video_path]
    }

    inputs = template.encode(inputs)
    input_string = template.safe_decode(inputs["input_ids"])
    print("Swift Decoded input string:", input_string)
    return inputs


def is_equal(value1, value2):
    if isinstance(value1, list) and isinstance(value2, list):
        if len(value1) != len(value2):
            return False
        for v1, v2 in zip(value1, value2):
            if not is_equal(v1, v2):
                return False
        return True
    elif isinstance(value1, torch.Tensor) and isinstance(value2, torch.Tensor):
        if value1.shape != value2.shape:
            print(f"Tensor shapes differ: {value1.shape} vs {value2.shape}")
            return False
        if not torch.equal(value1, value2):
            print(f"Tensor values differ at some positions.")
            return False
        return True
    else:
        return value1 == value2


def main():
    official_inputs = minicpmv4_5_official()
    swift_inputs = swift_template_test()

    print("Official inputs keys:", list(official_inputs.keys()))
    print("Swift inputs keys:", list(swift_inputs.keys()))

    for key in swift_inputs.keys():
        assert key in official_inputs, f"Key '{key}' not found in official inputs"
        print(f"Comparing key: {key}")

        if key == "input_ids":
            official_value = official_inputs[key][0].tolist()
        elif key == "pixel_values":
            official_value = official_inputs[key]
            for i in range(len(official_value[0])):
                official_value[0][i] = official_value[0][i].to(torch.bfloat16)
        else:
            official_value = official_inputs[key]
        swift_value = swift_inputs[key]

        assert isinstance(official_value, list)
        assert isinstance(swift_value, list)
        assert len(official_value) == len(swift_value), f"len(official[{key}])={len(official_value)} vs len(swift[{key}])={len(swift_value)})"

        for i, (o, s) in enumerate(zip(official_value, swift_value)):
            if not is_equal(o, s):
                print(f"❌ Difference found in key '{key}' at index {i}")
                break
                # raise AssertionError(f"Values for key '{key}' at index {i} do not match.")
        else:
            print(f"✔️ Values match for key '{key}'.")

if __name__ == "__main__":
    main()

@gemini-code-assist (Contributor) commented:

Summary of Changes

Hello @fanqiNO1, I'm Gemini Code Assist! I'm currently reviewing this pull request and will post my feedback shortly. In the meantime, here's a summary to help you and other reviewers quickly get up to speed!

This pull request significantly enhances the video processing capabilities of the MiniCPM-V-4.5 model by implementing a frames packing mechanism. This feature allows for more optimized handling of video inputs through intelligent frame sampling and grouping, which is expected to improve performance and resource utilization for video-based tasks. The changes include the introduction of new configuration parameters and core logic for managing video frames.

Highlights

  • Frames Packing for MiniCPM-V-4.5: Introduced functionality to pack video frames for the MiniCPM-V-4.5 model, enabling more efficient video processing.
  • New Command-Line Parameters: Added several new command-line parameters specifically for minicpmv4_5 to control video processing and frame packing behavior.
  • Dedicated Video Loading Function: Implemented a new utility function, load_video_minicpmv4_5, responsible for advanced video frame sampling, time scaling, and packing logic (a sketch follows this list).
  • Template Integration: Integrated the new video processing and frames packing logic into the MiniCPMV4_5Template to handle video inputs efficiently.
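
Based on the encode_video logic in the verification script above, here is a minimal sketch of what load_video_minicpmv4_5 plausibly looks like; the exact signature, environment-variable fallbacks, and defaults are assumptions, not the merged code:

import math
import os

import numpy as np
from decord import VideoReader, cpu
from PIL import Image
from scipy.spatial import cKDTree


def load_video_minicpmv4_5(video_path, choose_fps=None, force_packing=None):
    # Sketch only: env-var names follow the test script above; the PR's
    # actual implementation may differ in details.
    max_num_frames = int(os.environ.get('MAX_NUM_FRAMES', '180'))
    max_num_packing = int(os.environ.get('MAX_NUM_PACKING', '3'))
    time_scale = float(os.environ.get('TIME_SCALE', '0.1'))
    choose_fps = choose_fps or int(os.environ.get('CHOOSE_FPS', '3'))

    vr = VideoReader(video_path, ctx=cpu(0))
    fps = vr.get_avg_fps()
    duration = len(vr) / fps

    # Decide how many frames to sample and how many to pack per group.
    if choose_fps * int(duration) <= max_num_frames:
        packing_nums = 1
        choose_frames = round(min(choose_fps, round(fps)) * min(max_num_frames, duration))
    else:
        packing_nums = math.ceil(duration * choose_fps / max_num_frames)
        if packing_nums <= max_num_packing:
            choose_frames = round(duration * choose_fps)
        else:
            choose_frames = round(max_num_frames * max_num_packing)
            packing_nums = max_num_packing
    if force_packing:
        packing_nums = min(force_packing, max_num_packing)

    # Uniformly sample frame indices, snap their timestamps to a TIME_SCALE
    # grid, and split the resulting timestamp ids into packing groups.
    gap = len(vr) / choose_frames
    frame_idx = np.array([int(i * gap + gap / 2) for i in range(choose_frames)])
    frames = vr.get_batch(frame_idx).asnumpy()

    scale = np.arange(0, duration, time_scale)
    tree = cKDTree(scale[:, None])
    _, nearest = tree.query((frame_idx / fps)[:, None])
    frame_ts_id = (scale[nearest] / time_scale).astype(np.int32)

    frames = [Image.fromarray(f.astype('uint8')).convert('RGB') for f in frames]
    groups = [frame_ts_id[i:i + packing_nums] for i in range(0, len(frame_ts_id), packing_nums)]
    return frames, groups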


Changelog
  • docs/source/Instruction/Command-line-parameters.md
    • Added new command-line parameters for minicpmv4_5, including MAX_SLICE_NUMS, VIDEO_MAX_SLICE_NUMS, MAX_NUM_FRAMES, MAX_NUM_PACKING, TIME_SCALE, CHOOSE_FPS, and FORCE_PACKING (a configuration sketch follows this changelog).
  • docs/source_en/Instruction/Command-line-parameters.md
    • Added new English command-line parameters for minicpmv4_5, mirroring the Chinese documentation.
  • swift/template/templates/minicpm.py
    • Imported the new load_video_minicpmv4_5 utility function.
    • Implemented init_env_args in MiniCPMV4_5Template to load new environment arguments for video processing.
    • Overrode replace_tag to utilize the new video loading function and introduced replace_video2image for video-to-image conversion logic.
    • Modified the _encode method to pass temporal_ids to the image processor, supporting the new frame packing.
  • swift/template/vision_utils.py
    • Extracted _uniform_sample into a global helper function for reuse.
    • Added load_video_minicpmv4_5 function, which encapsulates the core logic for video frame sampling, time scaling, and packing based on various configurable parameters.
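
These parameters are read from environment variables, so a run configures them before building the template, just as the test script above does. A minimal configuration sketch, assuming the variable names from the changelog; the MAX_SLICE_NUMS and FORCE_PACKING values are purely illustrative:

import os

# Variable names follow the documentation changelog above; values mirror
# the verification script. MAX_SLICE_NUMS / FORCE_PACKING are illustrative.
os.environ['MAX_SLICE_NUMS'] = '9'        # image slicing cap (illustrative)
os.environ['VIDEO_MAX_SLICE_NUMS'] = '1'  # disable slicing for video frames
os.environ['MAX_NUM_FRAMES'] = '180'
os.environ['MAX_NUM_PACKING'] = '3'
os.environ['TIME_SCALE'] = '0.1'
os.environ['CHOOSE_FPS'] = '5'
os.environ['FORCE_PACKING'] = '2'         # optional: force packing group size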

@gemini-code-assist (Contributor) left a comment

Code Review

This pull request adds support for frames packing in video processing for the MiniCPM-V 4.5 model. The changes include adding a new video loading utility, updating the corresponding model template, and documenting the new command-line parameters.

My review focuses on improving code maintainability and documentation consistency. I've pointed out some code duplication in the template class and suggested refactoring opportunities in the vision utilities. I also found some minor formatting inconsistencies in the documentation files. Overall, the implementation of the new feature seems correct and follows the project's patterns.

Comment on lines +311 to +317
def map_to_nearest_scale(values, scale):
    tree = cKDTree(np.asarray(scale)[:, None])
    _, indices = tree.query(np.asarray(values)[:, None])
    return np.asarray(scale)[indices]

def group_array(arr, size):
    return [arr[i:i+size] for i in range(0, len(arr), size)]
Severity: medium

The helper functions map_to_nearest_scale and group_array are defined inside load_video_minicpmv4_5. To improve code organization and potential reusability, consider moving them to the module level as private functions (e.g., _map_to_nearest_scale and _group_array), similar to how _uniform_sample was refactored.
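
A sketch of the suggested refactor; the underscore-prefixed module-level names are the reviewer's proposal, not necessarily the merged code:

import numpy as np
from scipy.spatial import cKDTree


def _map_to_nearest_scale(values, scale):
    # Snap each value to the nearest point on the scale grid.
    tree = cKDTree(np.asarray(scale)[:, None])
    _, indices = tree.query(np.asarray(values)[:, None])
    return np.asarray(scale)[indices]


def _group_array(arr, size):
    # Split arr into consecutive chunks of at most `size` elements.
    return [arr[i:i + size] for i in range(0, len(arr), size)]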

fanqiNO1 and others added 4 commits February 13, 2026 17:53
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
