From bdfb15113c89bb0f0fde618e52d70cc66bab23e6 Mon Sep 17 00:00:00 2001 From: fanqiNO1 <1848839264@qq.com> Date: Fri, 13 Feb 2026 17:42:33 +0800 Subject: [PATCH 1/7] support frames packing for minicpmv4_5 video processing --- .../Instruction/Command-line-parameters.md | 9 +++ .../Instruction/Command-line-parameters.md | 9 +++ swift/template/templates/minicpm.py | 46 +++++++++++- swift/template/vision_utils.py | 72 +++++++++++++++++-- 4 files changed, 128 insertions(+), 8 deletions(-) diff --git a/docs/source/Instruction/Command-line-parameters.md b/docs/source/Instruction/Command-line-parameters.md index 7dd4c43025..b4b844c33c 100644 --- a/docs/source/Instruction/Command-line-parameters.md +++ b/docs/source/Instruction/Command-line-parameters.md @@ -846,6 +846,15 @@ qwen2_5_omni除了包含qwen2_5_vl和qwen2_audio的模型特定参数外,还 - VIDEO_MAX_SLICE_NUMS: 默认为1,视频的MAX_SLICE_NUMS,参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6)。 - MAX_NUM_FRAMES: 默认为64,参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6)。 +### minicpmv4_5 +- MAX_SLICE_NUMS: 默认为None,参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6/file/view/master?fileName=config.json&status=1)。 +- VIDEO_MAX_SLICE_NUMS: 默认为1,视频的MAX_SLICE_NUMS,参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。 +- MAX_NUM_FRAMES:默认为180,参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。 +- MAX_NUM_PACKING:默认为3,合法范围为1到6,参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。 +- TIME_SCALE:默认为0.1,参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。 +- CHOOSE_FPS,默认为3,参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。 +- FORCE_PACKING,默认为None,参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。 + ### minicpmo - INIT_TTS: 默认为False。 - INIT_AUDIO: 默认为False。 diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md index 7ad244c061..77d8782d10 100644 --- a/docs/source_en/Instruction/Command-line-parameters.md +++ b/docs/source_en/Instruction/Command-line-parameters.md @@ -871,6 +871,15 @@ For the meaning of the arguments, please refer to [here](https://modelscope.cn/m - VIDEO_MAX_SLICE_NUMS: Default is 1, which is the MAX_SLICE_NUMS for videos, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6) - MAX_NUM_FRAMES: Default is 64, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6) +### minicpmv4_5 +- MAX_SLICE_NUMS: Default is None, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6/file/view/master?fileName=config.json&status=1) +- VIDEO_MAX_SLICE_NUMS: Default is 1, which is the MAX_SLICE_NUMS for videos, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5) +- MAX_NUM_FRAMES:Default is 180, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5) +- MAX_NUM_PACKING:Default is 3, where the valid range is 1-6, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5) +- TIME_SCALE:Default is 0.1, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5) +- CHOOSE_FPS, Default is 3, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5) +- FORCE_PACKING, Default is None, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5) + ### minicpmo - INIT_TTS: Default is False - INIT_AUDIO: Default is False diff --git a/swift/template/templates/minicpm.py b/swift/template/templates/minicpm.py index 491d2c5a23..a8059b039b 100644 --- a/swift/template/templates/minicpm.py +++ b/swift/template/templates/minicpm.py @@ -12,7 +12,7 @@ from ..register import TemplateMeta, register_template from ..template_inputs import StdTemplateInputs from ..utils import Context, Prompt, findall -from ..vision_utils import load_video_minicpmv_mplug_owl3 +from ..vision_utils import load_video_minicpmv_mplug_owl3, load_video_minicpmv4_5 from .llama import Llama3TemplateMeta from .qwen import Qwen2_5TemplateMeta, Qwen3MixedTemplateMeta, QwenTemplateMeta from .utils import ChatmlTemplateMeta @@ -251,9 +251,50 @@ def _get_new_tokens(i): class MiniCPMV4_5Template(MiniCPMV2_6Template): + def init_env_args(self): + super().init_env_args() + self.max_num_frames = get_env_args('max_num_frames', int, 180) + self.max_num_packing = get_env_args('max_num_packing', int, 3) + assert 1 <= self.max_num_packing <= 6 + self.time_scale = get_env_args('time_scale', float, 0.1) + self.choose_fps = get_env_args('choose_fps', float, 3) + self.force_packing = get_env_args('force_packing', int, None) + + def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, + inputs: StdTemplateInputs) -> List[Context]: + assert media_type in {'image', 'video'} + load_video = partial(load_video_minicpmv4_5, + max_num_frames=self.max_num_frames, + max_num_packing=self.max_num_packing, + time_scale=self.time_scale, + choose_fps=self.choose_fps, + force_packing=self.force_packing) + image_context = super().replace_tag('image', index, inputs) + if media_type == 'image': + return image_context + elif media_type == 'video': + return self.replace_video2image(load_video, inputs, lambda i: image_context) + + def replace_video2image(self, load_video_func, inputs: StdTemplateInputs, replace_tag) -> List[Context]: + context_list = [] + if self.mode in {'vllm', 'lmdeploy'}: + video = inputs.videos.pop(inputs.video_idx) + inputs.video_idx -= 1 + else: + video = inputs.videos[inputs.video_idx] + images = inputs.images + new_images, temporal_ids = load_video_func(video) + inputs.images = images[:inputs.image_idx] + new_images + images[inputs.image_idx:] + for i in range(len(new_images)): + context_list += replace_tag(i) + inputs.image_idx += len(new_images) + inputs.extra_kwargs['temporal_ids'] = temporal_ids + return context_list + def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]: encoded = Template._encode(self, inputs) images = inputs.images + temporal_ids = inputs.extra_kwargs.get('temporal_ids', None) use_video = bool(inputs.videos) use_image_id = True max_slice_nums = self.max_slice_nums @@ -267,7 +308,8 @@ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]: image_processor = self.processor.image_processor image_inputs = image_processor([images], return_tensors='pt', - max_slice_nums=max_slice_nums).to(self.model_info.torch_dtype) + max_slice_nums=max_slice_nums, + temporal_ids=temporal_ids).to(self.model_info.torch_dtype) def _get_new_tokens(i): placeholder = image_processor.get_slice_image_placeholder( diff --git a/swift/template/vision_utils.py b/swift/template/vision_utils.py index dbd44ef2b5..c53218de27 100644 --- a/swift/template/vision_utils.py +++ b/swift/template/vision_utils.py @@ -276,27 +276,87 @@ def load_video_llava(video: Union[str, bytes]) -> np.ndarray: return np.stack([x.to_ndarray(format='rgb24') for x in frames]) +def _uniform_sample(_l, _n): + gap = len(_l) / _n + idxs = [int(i * gap + gap / 2) for i in range(_n)] + return [_l[i] for i in idxs] + + def load_video_minicpmv_mplug_owl3(video: Union[str, bytes], max_num_frames): from decord import VideoReader, cpu # pip install decord - def uniform_sample(_l, _n): - gap = len(_l) / _n - idxs = [int(i * gap + gap / 2) for i in range(_n)] - return [_l[i] for i in idxs] - video_io = load_file(video) vr = VideoReader(video_io, ctx=cpu(0)) sample_fps = round(vr.get_avg_fps() / 1) # FPS frame_idx = [i for i in range(0, len(vr), sample_fps)] if len(frame_idx) > max_num_frames: - frame_idx = uniform_sample(frame_idx, max_num_frames) + frame_idx = _uniform_sample(frame_idx, max_num_frames) frames = vr.get_batch(frame_idx).asnumpy() frames = [Image.fromarray(v.astype('uint8')) for v in frames] return frames +def load_video_minicpmv4_5(video: Union[str, bytes], + max_num_frames: int, + max_num_packing: int, + time_scale: float, + choose_fps: float = None, + force_packing: int = None): + + from decord import VideoReader, cpu # pip install decord + from scipy.spatial import cKDTree + + def map_to_nearest_scale(values, scale): + tree = cKDTree(np.asarray(scale)[:, None]) + _, indices = tree.query(np.asarray(values)[:, None]) + return np.asarray(scale)[indices] + + def group_array(arr, size): + return [arr[i:i+size] for i in range(0, len(arr), size)] + + video_io = load_file(video) + vr = VideoReader(video_io, ctx=cpu(0)) + fps = vr.get_avg_fps() + duration = len(vr) / fps + + if choose_fps is None: + choose_fps = round(fps / 1) # Get choose FPS based on the original FPS + + # Prepare packing + if choose_fps * int(duration) <= max_num_frames: + packing_nums = 1 + choose_frames = round(min(choose_fps, round(fps)) * min(max_num_frames, duration)) + else: + packing_nums = math.ceil(duration * choose_fps / max_num_frames) + if packing_nums <= max_num_packing: + choose_frames = round(duration * choose_fps) + else: + choose_frames = round(max_num_frames * max_num_packing) + packing_nums = max_num_packing + + frame_idx = [i for i in range(0, len(vr))] + frame_idx = np.array(_uniform_sample(frame_idx, choose_frames)) + + if force_packing: + packing_nums = min(force_packing, max_num_packing) + + frames = vr.get_batch(frame_idx).asnumpy() + + frame_idx_ts = frame_idx / fps + scale = np.arange(0, duration, time_scale) + + frame_ts_id = map_to_nearest_scale(frame_idx_ts, scale) / time_scale + frame_ts_id = frame_ts_id.astype(np.int32) + assert len(frames) == len(frame_ts_id) + + frames = [Image.fromarray(v.astype('uint8')).convert('RGB') for v in frames] + frame_ts_id_group = group_array(frame_ts_id, packing_nums) + + return frames, frame_ts_id_group + + def load_audio(audio: Union[str, bytes], sampling_rate: int, return_sr: bool = False): import librosa audio_io = load_file(audio) From d88e07cb81481a632d15276a22bf84db7ff08bbf Mon Sep 17 00:00:00 2001 From: fanqiNO1 <1848839264@qq.com> Date: Fri, 13 Feb 2026 17:53:55 +0800 Subject: [PATCH 2/7] fix inconsistent colons --- docs/source/Instruction/Command-line-parameters.md | 6 +++--- docs/source_en/Instruction/Command-line-parameters.md | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/source/Instruction/Command-line-parameters.md b/docs/source/Instruction/Command-line-parameters.md index b4b844c33c..79267a306d 100644 --- a/docs/source/Instruction/Command-line-parameters.md +++ b/docs/source/Instruction/Command-line-parameters.md @@ -849,9 +849,9 @@ qwen2_5_omni除了包含qwen2_5_vl和qwen2_audio的模型特定参数外,还 ### minicpmv4_5 - MAX_SLICE_NUMS: 默认为None,参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6/file/view/master?fileName=config.json&status=1)。 - VIDEO_MAX_SLICE_NUMS: 默认为1,视频的MAX_SLICE_NUMS,参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。 -- MAX_NUM_FRAMES:默认为180,参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。 -- MAX_NUM_PACKING:默认为3,合法范围为1到6,参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。 -- TIME_SCALE:默认为0.1,参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。 +- MAX_NUM_FRAMES: 默认为180,参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。 +- MAX_NUM_PACKING: 默认为3,合法范围为1到6,参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。 +- TIME_SCALE: 默认为0.1,参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。 - CHOOSE_FPS,默认为3,参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。 - FORCE_PACKING,默认为None,参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。 diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md index 77d8782d10..e79cf25704 100644 --- a/docs/source_en/Instruction/Command-line-parameters.md +++ b/docs/source_en/Instruction/Command-line-parameters.md @@ -874,9 +874,9 @@ For the meaning of the arguments, please refer to [here](https://modelscope.cn/m ### minicpmv4_5 - MAX_SLICE_NUMS: Default is None, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6/file/view/master?fileName=config.json&status=1) - VIDEO_MAX_SLICE_NUMS: Default is 1, which is the MAX_SLICE_NUMS for videos, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5) -- MAX_NUM_FRAMES:Default is 180, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5) -- MAX_NUM_PACKING:Default is 3, where the valid range is 1-6, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5) -- TIME_SCALE:Default is 0.1, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5) +- MAX_NUM_FRAMES: Default is 180, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5) +- MAX_NUM_PACKING: Default is 3, where the valid range is 1-6, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5) +- TIME_SCALE: Default is 0.1, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5) - CHOOSE_FPS, Default is 3, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5) - FORCE_PACKING, Default is None, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5) From c3a2949a38dfc87b08e707b7cff8a2c2e3c4ceba Mon Sep 17 00:00:00 2001 From: fanqiNO1 <1848839264@qq.com> Date: Fri, 13 Feb 2026 17:56:13 +0800 Subject: [PATCH 3/7] fix commas --- docs/source/Instruction/Command-line-parameters.md | 4 ++-- docs/source_en/Instruction/Command-line-parameters.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/Instruction/Command-line-parameters.md b/docs/source/Instruction/Command-line-parameters.md index 79267a306d..232f65819f 100644 --- a/docs/source/Instruction/Command-line-parameters.md +++ b/docs/source/Instruction/Command-line-parameters.md @@ -852,8 +852,8 @@ qwen2_5_omni除了包含qwen2_5_vl和qwen2_audio的模型特定参数外,还 - MAX_NUM_FRAMES: 默认为180,参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。 - MAX_NUM_PACKING: 默认为3,合法范围为1到6,参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。 - TIME_SCALE: 默认为0.1,参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。 -- CHOOSE_FPS,默认为3,参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。 -- FORCE_PACKING,默认为None,参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。 +- CHOOSE_FPS: 默认为3,参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。 +- FORCE_PACKING: 默认为None,参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。 ### minicpmo - INIT_TTS: 默认为False。 diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md index e79cf25704..6449517e30 100644 --- a/docs/source_en/Instruction/Command-line-parameters.md +++ b/docs/source_en/Instruction/Command-line-parameters.md @@ -877,8 +877,8 @@ For the meaning of the arguments, please refer to [here](https://modelscope.cn/m - MAX_NUM_FRAMES: Default is 180, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5) - MAX_NUM_PACKING: Default is 3, where the valid range is 1-6, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5) - TIME_SCALE: Default is 0.1, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5) -- CHOOSE_FPS, Default is 3, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5) -- FORCE_PACKING, Default is None, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5) +- CHOOSE_FPS: Default is 3, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5) +- FORCE_PACKING: Default is None, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5) ### minicpmo - INIT_TTS: Default is False From 69ad49c5137969d2286b79eeabddf4c82bf55333 Mon Sep 17 00:00:00 2001 From: Qi Fan <75657629+fanqiNO1@users.noreply.github.com> Date: Fri, 13 Feb 2026 17:59:00 +0800 Subject: [PATCH 4/7] remove whitespace to enhance code style consistency Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- swift/template/vision_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/swift/template/vision_utils.py b/swift/template/vision_utils.py index c53218de27..97641217fc 100644 --- a/swift/template/vision_utils.py +++ b/swift/template/vision_utils.py @@ -337,7 +337,7 @@ def group_array(arr, size): packing_nums = max_num_packing frame_idx = [i for i in range(0, len(vr))] - frame_idx = np.array(_uniform_sample(frame_idx, choose_frames)) + frame_idx = np.array(_uniform_sample(frame_idx, choose_frames)) if force_packing: packing_nums = min(force_packing, max_num_packing) @@ -353,7 +353,7 @@ def group_array(arr, size): frames = [Image.fromarray(v.astype('uint8')).convert('RGB') for v in frames] frame_ts_id_group = group_array(frame_ts_id, packing_nums) - + return frames, frame_ts_id_group From 3c462d090f39181bd2a27eafac412dd18e680251 Mon Sep 17 00:00:00 2001 From: fanqiNO1 <1848839264@qq.com> Date: Sun, 15 Feb 2026 19:56:33 +0800 Subject: [PATCH 5/7] fix get_new_tokens with skip_image_idx --- swift/template/templates/minicpm.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/swift/template/templates/minicpm.py b/swift/template/templates/minicpm.py index a8059b039b..3ff41e2c84 100644 --- a/swift/template/templates/minicpm.py +++ b/swift/template/templates/minicpm.py @@ -312,9 +312,12 @@ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]: temporal_ids=temporal_ids).to(self.model_info.torch_dtype) def _get_new_tokens(i): - placeholder = image_processor.get_slice_image_placeholder( - image_inputs.image_sizes[0][i], image_idx=i, max_slice_nums=max_slice_nums, use_image_id=use_image_id) - placeholder += '\n' + if i in image_inputs.skip_image_idx[0]: + placeholder = '' + else: + placeholder = image_processor.get_slice_image_placeholder( + image_inputs.image_sizes[0][i], image_idx=i, max_slice_nums=max_slice_nums, use_image_id=use_image_id) + placeholder += '\n' return self.processor.encode(placeholder, add_special_tokens=False) input_ids, labels, loss_scale = self._extend_tokens(input_ids, labels, loss_scale, idx_list, _get_new_tokens) From d9f4b23fc957c8af8a37fabe4803d50e724a53a4 Mon Sep 17 00:00:00 2001 From: fanqiNO1 <1848839264@qq.com> Date: Mon, 16 Feb 2026 17:29:34 +0800 Subject: [PATCH 6/7] hack post_encode --- swift/template/templates/minicpm.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/swift/template/templates/minicpm.py b/swift/template/templates/minicpm.py index 3ff41e2c84..3ff83dc1a3 100644 --- a/swift/template/templates/minicpm.py +++ b/swift/template/templates/minicpm.py @@ -356,6 +356,9 @@ def _data_collator(self, batch: List[Dict[str, Any]], *, padding_to: Optional[in res.update(Template._data_collator(self, batch, padding_to=padding_to)) return res + def _post_encode(self, model, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs + register_template( Qwen3MixedTemplateMeta( From aa178b62864c9e59e9720388ba7d0c94665f17ec Mon Sep 17 00:00:00 2001 From: fanqiNO1 <1848839264@qq.com> Date: Thu, 19 Feb 2026 20:52:42 +0800 Subject: [PATCH 7/7] fix lint --- swift/template/templates/minicpm.py | 23 ++++++++++++++--------- swift/template/vision_utils.py | 2 +- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/swift/template/templates/minicpm.py b/swift/template/templates/minicpm.py index 3ff83dc1a3..8b554e62f7 100644 --- a/swift/template/templates/minicpm.py +++ b/swift/template/templates/minicpm.py @@ -12,7 +12,7 @@ from ..register import TemplateMeta, register_template from ..template_inputs import StdTemplateInputs from ..utils import Context, Prompt, findall -from ..vision_utils import load_video_minicpmv_mplug_owl3, load_video_minicpmv4_5 +from ..vision_utils import load_video_minicpmv4_5, load_video_minicpmv_mplug_owl3 from .llama import Llama3TemplateMeta from .qwen import Qwen2_5TemplateMeta, Qwen3MixedTemplateMeta, QwenTemplateMeta from .utils import ChatmlTemplateMeta @@ -263,12 +263,13 @@ def init_env_args(self): def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, inputs: StdTemplateInputs) -> List[Context]: assert media_type in {'image', 'video'} - load_video = partial(load_video_minicpmv4_5, - max_num_frames=self.max_num_frames, - max_num_packing=self.max_num_packing, - time_scale=self.time_scale, - choose_fps=self.choose_fps, - force_packing=self.force_packing) + load_video = partial( + load_video_minicpmv4_5, + max_num_frames=self.max_num_frames, + max_num_packing=self.max_num_packing, + time_scale=self.time_scale, + choose_fps=self.choose_fps, + force_packing=self.force_packing) image_context = super().replace_tag('image', index, inputs) if media_type == 'image': return image_context @@ -307,7 +308,8 @@ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]: idx_list = findall(input_ids, -100) image_processor = self.processor.image_processor - image_inputs = image_processor([images], return_tensors='pt', + image_inputs = image_processor([images], + return_tensors='pt', max_slice_nums=max_slice_nums, temporal_ids=temporal_ids).to(self.model_info.torch_dtype) @@ -316,7 +318,10 @@ def _get_new_tokens(i): placeholder = '' else: placeholder = image_processor.get_slice_image_placeholder( - image_inputs.image_sizes[0][i], image_idx=i, max_slice_nums=max_slice_nums, use_image_id=use_image_id) + image_inputs.image_sizes[0][i], + image_idx=i, + max_slice_nums=max_slice_nums, + use_image_id=use_image_id) placeholder += '\n' return self.processor.encode(placeholder, add_special_tokens=False) diff --git a/swift/template/vision_utils.py b/swift/template/vision_utils.py index 97641217fc..dbd4ec14f6 100644 --- a/swift/template/vision_utils.py +++ b/swift/template/vision_utils.py @@ -314,7 +314,7 @@ def map_to_nearest_scale(values, scale): return np.asarray(scale)[indices] def group_array(arr, size): - return [arr[i:i+size] for i in range(0, len(arr), size)] + return [arr[i:i + size] for i in range(0, len(arr), size)] video_io = load_file(video) vr = VideoReader(video_io, ctx=cpu(0))