From bdfb15113c89bb0f0fde618e52d70cc66bab23e6 Mon Sep 17 00:00:00 2001
From: fanqiNO1 <1848839264@qq.com>
Date: Fri, 13 Feb 2026 17:42:33 +0800
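Subject: [PATCH 1/7] support frames packing for minicpmv4_5 video processing

Add load_video_minicpmv4_5, which samples video frames and returns them
together with per-frame temporal ids grouped by the packing size, and use
it in MiniCPMV4_5Template. The template reads the new MAX_NUM_FRAMES,
MAX_NUM_PACKING, TIME_SCALE, CHOOSE_FPS and FORCE_PACKING arguments and
forwards the temporal ids to the image processor in _encode.
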
---
 .../Instruction/Command-line-parameters.md    |  9 +++
 .../Instruction/Command-line-parameters.md    |  9 +++
 swift/template/templates/minicpm.py           | 46 +++++++++++-
 swift/template/vision_utils.py                | 72 +++++++++++++++++--
 4 files changed, 128 insertions(+), 8 deletions(-)

diff --git a/docs/source/Instruction/Command-line-parameters.md b/docs/source/Instruction/Command-line-parameters.md
index 7dd4c43025..b4b844c33c 100644
--- a/docs/source/Instruction/Command-line-parameters.md
+++ b/docs/source/Instruction/Command-line-parameters.md
@@ -846,6 +846,15 @@ qwen2_5_omni除了包含qwen2_5_vl和qwen2_audio的模型特定参数外，还
 - VIDEO_MAX_SLICE_NUMS: 默认为1，视频的MAX_SLICE_NUMS，参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6)。
 - MAX_NUM_FRAMES: 默认为64，参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6)。
 
+### minicpmv4_5
+- MAX_SLICE_NUMS: 默认为None，参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6/file/view/master?fileName=config.json&status=1)。
+- VIDEO_MAX_SLICE_NUMS: 默认为1，视频的MAX_SLICE_NUMS，参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。
+- MAX_NUM_FRAMES：默认为180，参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。
+- MAX_NUM_PACKING：默认为3，合法范围为1到6，参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。
+- TIME_SCALE：默认为0.1，参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。
+- CHOOSE_FPS，默认为3，参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。
+- FORCE_PACKING，默认为None，参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。
+
 ### minicpmo
 - INIT_TTS: 默认为False。
 - INIT_AUDIO: 默认为False。
diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md
index 7ad244c061..77d8782d10 100644
--- a/docs/source_en/Instruction/Command-line-parameters.md
+++ b/docs/source_en/Instruction/Command-line-parameters.md
@@ -871,6 +871,15 @@ For the meaning of the arguments, please refer to [here](https://modelscope.cn/m
 - VIDEO_MAX_SLICE_NUMS: Default is 1, which is the MAX_SLICE_NUMS for videos, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6)
 - MAX_NUM_FRAMES: Default is 64, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6)
 
+### minicpmv4_5
+- MAX_SLICE_NUMS: Default is None, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6/file/view/master?fileName=config.json&status=1)
+- VIDEO_MAX_SLICE_NUMS: Default is 1, which is the MAX_SLICE_NUMS for videos, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)
+- MAX_NUM_FRAMES：Default is 180, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)
+- MAX_NUM_PACKING：Default is 3, where the valid range is 1-6, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)
+- TIME_SCALE：Default is 0.1, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)
+- CHOOSE_FPS, Default is 3, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)
+- FORCE_PACKING, Default is None, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)
+
 ### minicpmo
 - INIT_TTS: Default is False
 - INIT_AUDIO: Default is False
diff --git a/swift/template/templates/minicpm.py b/swift/template/templates/minicpm.py
index 491d2c5a23..a8059b039b 100644
--- a/swift/template/templates/minicpm.py
+++ b/swift/template/templates/minicpm.py
@@ -12,7 +12,7 @@
 from ..register import TemplateMeta, register_template
 from ..template_inputs import StdTemplateInputs
 from ..utils import Context, Prompt, findall
-from ..vision_utils import load_video_minicpmv_mplug_owl3
+from ..vision_utils import load_video_minicpmv_mplug_owl3, load_video_minicpmv4_5
 from .llama import Llama3TemplateMeta
 from .qwen import Qwen2_5TemplateMeta, Qwen3MixedTemplateMeta, QwenTemplateMeta
 from .utils import ChatmlTemplateMeta
@@ -251,9 +251,50 @@ def _get_new_tokens(i):
 
 class MiniCPMV4_5Template(MiniCPMV2_6Template):
 
+    def init_env_args(self):
+        super().init_env_args()
+        self.max_num_frames = get_env_args('max_num_frames', int, 180)
+        self.max_num_packing = get_env_args('max_num_packing', int, 3)
+        assert 1 <= self.max_num_packing <= 6
+        self.time_scale = get_env_args('time_scale', float, 0.1)
+        self.choose_fps = get_env_args('choose_fps', float, 3)
+        self.force_packing = get_env_args('force_packing', int, None)
+
+    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index,
+                    inputs: StdTemplateInputs) -> List[Context]:
+        assert media_type in {'image', 'video'}
+        load_video = partial(load_video_minicpmv4_5,
+                             max_num_frames=self.max_num_frames,
+                             max_num_packing=self.max_num_packing,
+                             time_scale=self.time_scale,
+                             choose_fps=self.choose_fps,
+                             force_packing=self.force_packing)
+        image_context = super().replace_tag('image', index, inputs)
+        if media_type == 'image':
+            return image_context
+        elif media_type == 'video':
+            return self.replace_video2image(load_video, inputs, lambda i: image_context)
+
+    def replace_video2image(self, load_video_func, inputs: StdTemplateInputs, replace_tag) -> List[Context]:
+        context_list = []
+        if self.mode in {'vllm', 'lmdeploy'}:
+            video = inputs.videos.pop(inputs.video_idx)
+            inputs.video_idx -= 1
+        else:
+            video = inputs.videos[inputs.video_idx]
+        images = inputs.images
+        new_images, temporal_ids = load_video_func(video)
+        inputs.images = images[:inputs.image_idx] + new_images + images[inputs.image_idx:]
+        for i in range(len(new_images)):
+            context_list += replace_tag(i)
+        inputs.image_idx += len(new_images)
+        inputs.extra_kwargs['temporal_ids'] = temporal_ids
+        return context_list
+
     def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
         encoded = Template._encode(self, inputs)
         images = inputs.images
+        temporal_ids = inputs.extra_kwargs.get('temporal_ids', None)
         use_video = bool(inputs.videos)
         use_image_id = True
         max_slice_nums = self.max_slice_nums
@@ -267,7 +308,8 @@ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
 
         image_processor = self.processor.image_processor
         image_inputs = image_processor([images], return_tensors='pt',
-                                       max_slice_nums=max_slice_nums).to(self.model_info.torch_dtype)
+                                       max_slice_nums=max_slice_nums,
+                                       temporal_ids=temporal_ids).to(self.model_info.torch_dtype)
 
         def _get_new_tokens(i):
             placeholder = image_processor.get_slice_image_placeholder(
diff --git a/swift/template/vision_utils.py b/swift/template/vision_utils.py
index dbd44ef2b5..c53218de27 100644
--- a/swift/template/vision_utils.py
+++ b/swift/template/vision_utils.py
@@ -276,27 +276,87 @@ def load_video_llava(video: Union[str, bytes]) -> np.ndarray:
     return np.stack([x.to_ndarray(format='rgb24') for x in frames])
 
 
+def _uniform_sample(_l, _n):
+    gap = len(_l) / _n
+    idxs = [int(i * gap + gap / 2) for i in range(_n)]
+    return [_l[i] for i in idxs]
+
+
 def load_video_minicpmv_mplug_owl3(video: Union[str, bytes], max_num_frames):
 
     from decord import VideoReader, cpu  # pip install decord
 
-    def uniform_sample(_l, _n):
-        gap = len(_l) / _n
-        idxs = [int(i * gap + gap / 2) for i in range(_n)]
-        return [_l[i] for i in idxs]
-
     video_io = load_file(video)
     vr = VideoReader(video_io, ctx=cpu(0))
     sample_fps = round(vr.get_avg_fps() / 1)  # FPS
     frame_idx = [i for i in range(0, len(vr), sample_fps)]
 
     if len(frame_idx) > max_num_frames:
-        frame_idx = uniform_sample(frame_idx, max_num_frames)
+        frame_idx = _uniform_sample(frame_idx, max_num_frames)
     frames = vr.get_batch(frame_idx).asnumpy()
     frames = [Image.fromarray(v.astype('uint8')) for v in frames]
     return frames
 
 
+def load_video_minicpmv4_5(video: Union[str, bytes],
+                           max_num_frames: int,
+                           max_num_packing: int,
+                           time_scale: float,
+                           choose_fps: float = None,
+                           force_packing: int = None):
+
+    from decord import VideoReader, cpu  # pip install decord
+    from scipy.spatial import cKDTree
+
+    def map_to_nearest_scale(values, scale):
+        tree = cKDTree(np.asarray(scale)[:, None])
+        _, indices = tree.query(np.asarray(values)[:, None])
+        return np.asarray(scale)[indices]
+
+    def group_array(arr, size):
+        return [arr[i:i+size] for i in range(0, len(arr), size)]
+
+    video_io = load_file(video)
+    vr = VideoReader(video_io, ctx=cpu(0))
+    fps = vr.get_avg_fps()
+    duration = len(vr) / fps
+
+    if choose_fps is None:
+        choose_fps = round(fps / 1)  # Get choose FPS based on the original FPS
+
+    # Prepare packing
+    if choose_fps * int(duration) <= max_num_frames:
+        packing_nums = 1
+        choose_frames = round(min(choose_fps, round(fps)) * min(max_num_frames, duration))
+    else:
+        packing_nums = math.ceil(duration * choose_fps / max_num_frames)
+        if packing_nums <= max_num_packing:
+            choose_frames = round(duration * choose_fps)
+        else:
+            choose_frames = round(max_num_frames * max_num_packing)
+            packing_nums = max_num_packing
+
+    frame_idx = [i for i in range(0, len(vr))]
+    frame_idx =  np.array(_uniform_sample(frame_idx, choose_frames))
+
+    if force_packing:
+        packing_nums = min(force_packing, max_num_packing)
+
+    frames = vr.get_batch(frame_idx).asnumpy()
+
+    frame_idx_ts = frame_idx / fps
+    scale = np.arange(0, duration, time_scale)
+
+    frame_ts_id = map_to_nearest_scale(frame_idx_ts, scale) / time_scale
+    frame_ts_id = frame_ts_id.astype(np.int32)
+    assert len(frames) == len(frame_ts_id)
+
+    frames = [Image.fromarray(v.astype('uint8')).convert('RGB') for v in frames]
+    frame_ts_id_group = group_array(frame_ts_id, packing_nums)
+    
+    return frames, frame_ts_id_group
+
+
 def load_audio(audio: Union[str, bytes], sampling_rate: int, return_sr: bool = False):
     import librosa
     audio_io = load_file(audio)

From d88e07cb81481a632d15276a22bf84db7ff08bbf Mon Sep 17 00:00:00 2001
From: fanqiNO1 <1848839264@qq.com>
Date: Fri, 13 Feb 2026 17:53:55 +0800
Subject: [PATCH 2/7] docs: replace full-width colons with ASCII colons

---
 docs/source/Instruction/Command-line-parameters.md    | 6 +++---
 docs/source_en/Instruction/Command-line-parameters.md | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/docs/source/Instruction/Command-line-parameters.md b/docs/source/Instruction/Command-line-parameters.md
index b4b844c33c..79267a306d 100644
--- a/docs/source/Instruction/Command-line-parameters.md
+++ b/docs/source/Instruction/Command-line-parameters.md
@@ -849,9 +849,9 @@ qwen2_5_omni除了包含qwen2_5_vl和qwen2_audio的模型特定参数外，还
 ### minicpmv4_5
 - MAX_SLICE_NUMS: 默认为None，参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6/file/view/master?fileName=config.json&status=1)。
 - VIDEO_MAX_SLICE_NUMS: 默认为1，视频的MAX_SLICE_NUMS，参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。
-- MAX_NUM_FRAMES：默认为180，参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。
-- MAX_NUM_PACKING：默认为3，合法范围为1到6，参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。
-- TIME_SCALE：默认为0.1，参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。
+- MAX_NUM_FRAMES: 默认为180，参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。
+- MAX_NUM_PACKING: 默认为3，合法范围为1到6，参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。
+- TIME_SCALE: 默认为0.1，参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。
 - CHOOSE_FPS，默认为3，参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。
 - FORCE_PACKING，默认为None，参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。
 
diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md
index 77d8782d10..e79cf25704 100644
--- a/docs/source_en/Instruction/Command-line-parameters.md
+++ b/docs/source_en/Instruction/Command-line-parameters.md
@@ -874,9 +874,9 @@ For the meaning of the arguments, please refer to [here](https://modelscope.cn/m
 ### minicpmv4_5
 - MAX_SLICE_NUMS: Default is None, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6/file/view/master?fileName=config.json&status=1)
 - VIDEO_MAX_SLICE_NUMS: Default is 1, which is the MAX_SLICE_NUMS for videos, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)
-- MAX_NUM_FRAMES：Default is 180, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)
-- MAX_NUM_PACKING：Default is 3, where the valid range is 1-6, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)
-- TIME_SCALE：Default is 0.1, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)
+- MAX_NUM_FRAMES: Default is 180, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)
+- MAX_NUM_PACKING: Default is 3, where the valid range is 1-6, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)
+- TIME_SCALE: Default is 0.1, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)
 - CHOOSE_FPS, Default is 3, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)
 - FORCE_PACKING, Default is None, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)
 

From c3a2949a38dfc87b08e707b7cff8a2c2e3c4ceba Mon Sep 17 00:00:00 2001
From: fanqiNO1 <1848839264@qq.com>
Date: Fri, 13 Feb 2026 17:56:13 +0800
Subject: [PATCH 3/7] docs: replace commas after parameter names with colons

---
 docs/source/Instruction/Command-line-parameters.md    | 4 ++--
 docs/source_en/Instruction/Command-line-parameters.md | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/source/Instruction/Command-line-parameters.md b/docs/source/Instruction/Command-line-parameters.md
index 79267a306d..232f65819f 100644
--- a/docs/source/Instruction/Command-line-parameters.md
+++ b/docs/source/Instruction/Command-line-parameters.md
@@ -852,8 +852,8 @@ qwen2_5_omni除了包含qwen2_5_vl和qwen2_audio的模型特定参数外，还
 - MAX_NUM_FRAMES: 默认为180，参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。
 - MAX_NUM_PACKING: 默认为3，合法范围为1到6，参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。
 - TIME_SCALE: 默认为0.1，参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。
-- CHOOSE_FPS，默认为3，参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。
-- FORCE_PACKING，默认为None，参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。
+- CHOOSE_FPS: 默认为3，参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。
+- FORCE_PACKING: 默认为None，参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。
 
 ### minicpmo
 - INIT_TTS: 默认为False。
diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md
index e79cf25704..6449517e30 100644
--- a/docs/source_en/Instruction/Command-line-parameters.md
+++ b/docs/source_en/Instruction/Command-line-parameters.md
@@ -877,8 +877,8 @@ For the meaning of the arguments, please refer to [here](https://modelscope.cn/m
 - MAX_NUM_FRAMES: Default is 180, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)
 - MAX_NUM_PACKING: Default is 3, where the valid range is 1-6, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)
 - TIME_SCALE: Default is 0.1, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)
-- CHOOSE_FPS, Default is 3, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)
-- FORCE_PACKING, Default is None, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)
+- CHOOSE_FPS: Default is 3, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)
+- FORCE_PACKING: Default is None, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)
 
 ### minicpmo
 - INIT_TTS: Default is False

From 69ad49c5137969d2286b79eeabddf4c82bf55333 Mon Sep 17 00:00:00 2001
From: Qi Fan <75657629+fanqiNO1@users.noreply.github.com>
Date: Fri, 13 Feb 2026 17:59:00 +0800
Subject: [PATCH 4/7] remove whitespace to enhance code style consistency

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 swift/template/vision_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/swift/template/vision_utils.py b/swift/template/vision_utils.py
index c53218de27..97641217fc 100644
--- a/swift/template/vision_utils.py
+++ b/swift/template/vision_utils.py
@@ -337,7 +337,7 @@ def group_array(arr, size):
             packing_nums = max_num_packing
 
     frame_idx = [i for i in range(0, len(vr))]
-    frame_idx =  np.array(_uniform_sample(frame_idx, choose_frames))
+    frame_idx = np.array(_uniform_sample(frame_idx, choose_frames))
 
     if force_packing:
         packing_nums = min(force_packing, max_num_packing)
@@ -353,7 +353,7 @@ def group_array(arr, size):
 
     frames = [Image.fromarray(v.astype('uint8')).convert('RGB') for v in frames]
     frame_ts_id_group = group_array(frame_ts_id, packing_nums)
-    
+
     return frames, frame_ts_id_group
 
 

From 3c462d090f39181bd2a27eafac412dd18e680251 Mon Sep 17 00:00:00 2001
From: fanqiNO1 <1848839264@qq.com>
Date: Sun, 15 Feb 2026 19:56:33 +0800
Subject: [PATCH 5/7] use empty placeholder for images in skip_image_idx

---
 swift/template/templates/minicpm.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/swift/template/templates/minicpm.py b/swift/template/templates/minicpm.py
index a8059b039b..3ff41e2c84 100644
--- a/swift/template/templates/minicpm.py
+++ b/swift/template/templates/minicpm.py
@@ -312,9 +312,12 @@ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
                                        temporal_ids=temporal_ids).to(self.model_info.torch_dtype)
 
         def _get_new_tokens(i):
-            placeholder = image_processor.get_slice_image_placeholder(
-                image_inputs.image_sizes[0][i], image_idx=i, max_slice_nums=max_slice_nums, use_image_id=use_image_id)
-            placeholder += '\n'
+            if i in image_inputs.skip_image_idx[0]:
+                placeholder = ''
+            else:
+                placeholder = image_processor.get_slice_image_placeholder(
+                    image_inputs.image_sizes[0][i], image_idx=i, max_slice_nums=max_slice_nums, use_image_id=use_image_id)
+                placeholder += '\n'
             return self.processor.encode(placeholder, add_special_tokens=False)
 
         input_ids, labels, loss_scale = self._extend_tokens(input_ids, labels, loss_scale, idx_list, _get_new_tokens)

From d9f4b23fc957c8af8a37fabe4803d50e724a53a4 Mon Sep 17 00:00:00 2001
From: fanqiNO1 <1848839264@qq.com>
Date: Mon, 16 Feb 2026 17:29:34 +0800
Subject: [PATCH 6/7] override _post_encode to return inputs unchanged

---
 swift/template/templates/minicpm.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/swift/template/templates/minicpm.py b/swift/template/templates/minicpm.py
index 3ff41e2c84..3ff83dc1a3 100644
--- a/swift/template/templates/minicpm.py
+++ b/swift/template/templates/minicpm.py
@@ -356,6 +356,9 @@ def _data_collator(self, batch: List[Dict[str, Any]], *, padding_to: Optional[in
         res.update(Template._data_collator(self, batch, padding_to=padding_to))
         return res
 
+    def _post_encode(self, model, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs
+
 
 register_template(
     Qwen3MixedTemplateMeta(

From aa178b62864c9e59e9720388ba7d0c94665f17ec Mon Sep 17 00:00:00 2001
From: fanqiNO1 <1848839264@qq.com>
Date: Thu, 19 Feb 2026 20:52:42 +0800
Subject: [PATCH 7/7] fix lint

---
 swift/template/templates/minicpm.py | 23 ++++++++++++++---------
 swift/template/vision_utils.py      |  2 +-
 2 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/swift/template/templates/minicpm.py b/swift/template/templates/minicpm.py
index 3ff83dc1a3..8b554e62f7 100644
--- a/swift/template/templates/minicpm.py
+++ b/swift/template/templates/minicpm.py
@@ -12,7 +12,7 @@
 from ..register import TemplateMeta, register_template
 from ..template_inputs import StdTemplateInputs
 from ..utils import Context, Prompt, findall
-from ..vision_utils import load_video_minicpmv_mplug_owl3, load_video_minicpmv4_5
+from ..vision_utils import load_video_minicpmv4_5, load_video_minicpmv_mplug_owl3
 from .llama import Llama3TemplateMeta
 from .qwen import Qwen2_5TemplateMeta, Qwen3MixedTemplateMeta, QwenTemplateMeta
 from .utils import ChatmlTemplateMeta
@@ -263,12 +263,13 @@ def init_env_args(self):
     def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index,
                     inputs: StdTemplateInputs) -> List[Context]:
         assert media_type in {'image', 'video'}
-        load_video = partial(load_video_minicpmv4_5,
-                             max_num_frames=self.max_num_frames,
-                             max_num_packing=self.max_num_packing,
-                             time_scale=self.time_scale,
-                             choose_fps=self.choose_fps,
-                             force_packing=self.force_packing)
+        load_video = partial(
+            load_video_minicpmv4_5,
+            max_num_frames=self.max_num_frames,
+            max_num_packing=self.max_num_packing,
+            time_scale=self.time_scale,
+            choose_fps=self.choose_fps,
+            force_packing=self.force_packing)
         image_context = super().replace_tag('image', index, inputs)
         if media_type == 'image':
             return image_context
@@ -307,7 +308,8 @@ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
         idx_list = findall(input_ids, -100)
 
         image_processor = self.processor.image_processor
-        image_inputs = image_processor([images], return_tensors='pt',
+        image_inputs = image_processor([images],
+                                       return_tensors='pt',
                                        max_slice_nums=max_slice_nums,
                                        temporal_ids=temporal_ids).to(self.model_info.torch_dtype)
 
@@ -316,7 +318,10 @@ def _get_new_tokens(i):
                 placeholder = ''
             else:
                 placeholder = image_processor.get_slice_image_placeholder(
-                    image_inputs.image_sizes[0][i], image_idx=i, max_slice_nums=max_slice_nums, use_image_id=use_image_id)
+                    image_inputs.image_sizes[0][i],
+                    image_idx=i,
+                    max_slice_nums=max_slice_nums,
+                    use_image_id=use_image_id)
                 placeholder += '\n'
             return self.processor.encode(placeholder, add_special_tokens=False)
 
diff --git a/swift/template/vision_utils.py b/swift/template/vision_utils.py
index 97641217fc..dbd4ec14f6 100644
--- a/swift/template/vision_utils.py
+++ b/swift/template/vision_utils.py
@@ -314,7 +314,7 @@ def map_to_nearest_scale(values, scale):
         return np.asarray(scale)[indices]
 
     def group_array(arr, size):
-        return [arr[i:i+size] for i in range(0, len(arr), size)]
+        return [arr[i:i + size] for i in range(0, len(arr), size)]
 
     video_io = load_file(video)
     vr = VideoReader(video_io, ctx=cpu(0))