9 changes: 9 additions & 0 deletions docs/source/Instruction/Command-line-parameters.md
@@ -846,6 +846,15 @@ Besides the model-specific arguments of qwen2_5_vl and qwen2_audio, qwen2_5_omni also
- VIDEO_MAX_SLICE_NUMS: Default is 1, the MAX_SLICE_NUMS used for videos; refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6).
- MAX_NUM_FRAMES: Default is 64; refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6).

### minicpmv4_5
- MAX_SLICE_NUMS: Default is None; refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5/file/view/master?fileName=config.json&status=1).
- VIDEO_MAX_SLICE_NUMS: Default is 1, the MAX_SLICE_NUMS used for videos; refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5).
- MAX_NUM_FRAMES: Default is 180; refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5).
- MAX_NUM_PACKING: Default is 3, with a valid range of 1 to 6; refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5).
- TIME_SCALE: Default is 0.1; refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5).
- CHOOSE_FPS: Default is 3; refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5).
- FORCE_PACKING: Default is None; refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5).

### minicpmo
- INIT_TTS: Default is False.
- INIT_AUDIO: Default is False.
9 changes: 9 additions & 0 deletions docs/source_en/Instruction/Command-line-parameters.md
@@ -871,6 +871,15 @@ For the meaning of the arguments, please refer to [here](https://modelscope.cn/m
- VIDEO_MAX_SLICE_NUMS: Default is 1, the MAX_SLICE_NUMS used for videos; refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6)
- MAX_NUM_FRAMES: Default is 64; refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6)

### minicpmv4_5
- MAX_SLICE_NUMS: Default is None; refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5/file/view/master?fileName=config.json&status=1)
- VIDEO_MAX_SLICE_NUMS: Default is 1, the MAX_SLICE_NUMS used for videos; refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)
- MAX_NUM_FRAMES: Default is 180; refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)
- MAX_NUM_PACKING: Default is 3, with a valid range of 1 to 6; refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)
- TIME_SCALE: Default is 0.1; refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)
- CHOOSE_FPS: Default is 3; refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)
- FORCE_PACKING: Default is None; refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)
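
A minimal sketch of overriding these from Python before the template initializes (the names come from the list above; the values shown are just the documented defaults):

```python
import os

# Hypothetical overrides for minicpmv4_5 video sampling; set these before
# swift reads them via get_env_args.
os.environ['MAX_NUM_FRAMES'] = '180'   # cap on sampled video frames
os.environ['MAX_NUM_PACKING'] = '3'    # must lie in [1, 6]
os.environ['TIME_SCALE'] = '0.1'       # timestamp quantization step, in seconds
os.environ['CHOOSE_FPS'] = '3'         # target sampling rate (frames per second)
```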

### minicpmo
- INIT_TTS: Default is False
- INIT_AUDIO: Default is False
58 changes: 53 additions & 5 deletions swift/template/templates/minicpm.py
@@ -12,7 +12,7 @@
from ..register import TemplateMeta, register_template
from ..template_inputs import StdTemplateInputs
from ..utils import Context, Prompt, findall
from ..vision_utils import load_video_minicpmv_mplug_owl3, load_video_minicpmv4_5
from .llama import Llama3TemplateMeta
from .qwen import Qwen2_5TemplateMeta, Qwen3MixedTemplateMeta, QwenTemplateMeta
from .utils import ChatmlTemplateMeta
@@ -251,9 +251,50 @@ def _get_new_tokens(i):

class MiniCPMV4_5Template(MiniCPMV2_6Template):

def init_env_args(self):
super().init_env_args()
self.max_num_frames = get_env_args('max_num_frames', int, 180)
self.max_num_packing = get_env_args('max_num_packing', int, 3)
        assert 1 <= self.max_num_packing <= 6, 'MAX_NUM_PACKING must be in the range [1, 6]'
self.time_scale = get_env_args('time_scale', float, 0.1)
self.choose_fps = get_env_args('choose_fps', float, 3)
self.force_packing = get_env_args('force_packing', int, None)

def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index,
inputs: StdTemplateInputs) -> List[Context]:
assert media_type in {'image', 'video'}
load_video = partial(load_video_minicpmv4_5,
max_num_frames=self.max_num_frames,
max_num_packing=self.max_num_packing,
time_scale=self.time_scale,
choose_fps=self.choose_fps,
force_packing=self.force_packing)
image_context = super().replace_tag('image', index, inputs)
if media_type == 'image':
return image_context
elif media_type == 'video':
return self.replace_video2image(load_video, inputs, lambda i: image_context)

def replace_video2image(self, load_video_func, inputs: StdTemplateInputs, replace_tag) -> List[Context]:
context_list = []
if self.mode in {'vllm', 'lmdeploy'}:
video = inputs.videos.pop(inputs.video_idx)
inputs.video_idx -= 1
else:
video = inputs.videos[inputs.video_idx]
images = inputs.images
new_images, temporal_ids = load_video_func(video)
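        # Splice the sampled frames into inputs.images at the current image cursor,
        # then emit one image tag per frame.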
inputs.images = images[:inputs.image_idx] + new_images + images[inputs.image_idx:]
for i in range(len(new_images)):
context_list += replace_tag(i)
inputs.image_idx += len(new_images)
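        # Hand the per-frame temporal group ids to _encode via extra_kwargs.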
inputs.extra_kwargs['temporal_ids'] = temporal_ids
return context_list

def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
encoded = Template._encode(self, inputs)
images = inputs.images
temporal_ids = inputs.extra_kwargs.get('temporal_ids', None)
use_video = bool(inputs.videos)
use_image_id = True
max_slice_nums = self.max_slice_nums
Expand All @@ -267,12 +308,16 @@ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:

image_processor = self.processor.image_processor
image_inputs = image_processor([images], return_tensors='pt',
                                       max_slice_nums=max_slice_nums,
                                       temporal_ids=temporal_ids).to(self.model_info.torch_dtype)

def _get_new_tokens(i):
            if i in image_inputs.skip_image_idx[0]:
                # Images flagged in skip_image_idx receive no placeholder tokens.
                placeholder = ''
            else:
                placeholder = image_processor.get_slice_image_placeholder(
                    image_inputs.image_sizes[0][i], image_idx=i, max_slice_nums=max_slice_nums, use_image_id=use_image_id)
                placeholder += '\n'
return self.processor.encode(placeholder, add_special_tokens=False)

input_ids, labels, loss_scale = self._extend_tokens(input_ids, labels, loss_scale, idx_list, _get_new_tokens)
@@ -311,6 +356,9 @@ def _data_collator(self, batch: List[Dict[str, Any]], *, padding_to: Optional[in
res.update(Template._data_collator(self, batch, padding_to=padding_to))
return res

    def _post_encode(self, model, inputs: Dict[str, Any]) -> Dict[str, Any]:
        # Intentionally a no-op: the encoded inputs are passed through unchanged.
        return inputs


register_template(
Qwen3MixedTemplateMeta(
72 changes: 66 additions & 6 deletions swift/template/vision_utils.py
@@ -276,27 +276,87 @@ def load_video_llava(video: Union[str, bytes]) -> np.ndarray:
return np.stack([x.to_ndarray(format='rgb24') for x in frames])


def _uniform_sample(_l, _n):
    # Uniformly sample _n items from _l by taking the centre of each of _n equal-width bins.
    gap = len(_l) / _n
    idxs = [int(i * gap + gap / 2) for i in range(_n)]
    return [_l[i] for i in idxs]
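
# Hand-worked example: _uniform_sample(list(range(10)), 5) -> [1, 3, 5, 7, 9]
# (gap = 10 / 5 = 2.0; each picked index is int(i * 2.0 + 1.0), the centre of bin i).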


def load_video_minicpmv_mplug_owl3(video: Union[str, bytes], max_num_frames):

from decord import VideoReader, cpu # pip install decord

video_io = load_file(video)
vr = VideoReader(video_io, ctx=cpu(0))
sample_fps = round(vr.get_avg_fps() / 1) # FPS
frame_idx = [i for i in range(0, len(vr), sample_fps)]

if len(frame_idx) > max_num_frames:
frame_idx = _uniform_sample(frame_idx, max_num_frames)
frames = vr.get_batch(frame_idx).asnumpy()
frames = [Image.fromarray(v.astype('uint8')) for v in frames]
return frames


def load_video_minicpmv4_5(video: Union[str, bytes],
max_num_frames: int,
max_num_packing: int,
time_scale: float,
choose_fps: float = None,
force_packing: int = None):

from decord import VideoReader, cpu # pip install decord
from scipy.spatial import cKDTree

    def map_to_nearest_scale(values, scale):
        # Snap each value to its nearest grid point in `scale` via a 1-D KD-tree query.
        tree = cKDTree(np.asarray(scale)[:, None])
        _, indices = tree.query(np.asarray(values)[:, None])
        return np.asarray(scale)[indices]

    def group_array(arr, size):
        # Split `arr` into consecutive chunks of length `size`; the last chunk may be shorter.
        return [arr[i:i + size] for i in range(0, len(arr), size)]
> **Review comment (Contributor, on lines +311 to +317, severity: medium):** The helper functions `map_to_nearest_scale` and `group_array` are defined inside `load_video_minicpmv4_5`. To improve code organization and potential reusability, consider moving them to the module level as private functions (e.g., `_map_to_nearest_scale` and `_group_array`), similar to how `_uniform_sample` was refactored.


video_io = load_file(video)
vr = VideoReader(video_io, ctx=cpu(0))
fps = vr.get_avg_fps()
duration = len(vr) / fps

if choose_fps is None:
choose_fps = round(fps / 1) # Get choose FPS based on the original FPS

    # Prepare packing: decide how many frames to sample (choose_frames) and how many
    # frames share a temporal group (packing_nums).
    if choose_fps * int(duration) <= max_num_frames:
        # Everything fits without packing.
        packing_nums = 1
        choose_frames = round(min(choose_fps, round(fps)) * min(max_num_frames, duration))
    else:
        packing_nums = math.ceil(duration * choose_fps / max_num_frames)
        if packing_nums <= max_num_packing:
            choose_frames = round(duration * choose_fps)
        else:
            # e.g. duration=600, choose_fps=3, max_num_frames=180: ceil(1800 / 180) = 10
            # exceeds max_num_packing, so clamp both values.
            choose_frames = round(max_num_frames * max_num_packing)
            packing_nums = max_num_packing

    frame_idx = list(range(len(vr)))
frame_idx = np.array(_uniform_sample(frame_idx, choose_frames))

if force_packing:
packing_nums = min(force_packing, max_num_packing)

frames = vr.get_batch(frame_idx).asnumpy()

    # Quantize each sampled frame's timestamp onto the time_scale grid, expressed
    # as an integer number of ticks.
    frame_idx_ts = frame_idx / fps
    scale = np.arange(0, duration, time_scale)

    frame_ts_id = map_to_nearest_scale(frame_idx_ts, scale) / time_scale
    frame_ts_id = frame_ts_id.astype(np.int32)
    assert len(frames) == len(frame_ts_id)

frames = [Image.fromarray(v.astype('uint8')).convert('RGB') for v in frames]
frame_ts_id_group = group_array(frame_ts_id, packing_nums)

return frames, frame_ts_id_group
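
A rough usage sketch (the video path is hypothetical; keyword values mirror the documented env-arg defaults):

```python
# Illustrative only: 'clip.mp4' is a placeholder path.
frames, temporal_groups = load_video_minicpmv4_5(
    'clip.mp4',
    max_num_frames=180,
    max_num_packing=3,
    time_scale=0.1,
    choose_fps=3)
# frames: list of RGB PIL.Image frames.
# temporal_groups: list of np.int32 arrays, one per packing group, holding the
# quantized timestamp ids that the template later passes to the image
# processor as `temporal_ids`.
```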


def load_audio(audio: Union[str, bytes], sampling_rate: int, return_sr: bool = False):
import librosa
audio_io = load_file(audio)