diff --git a/docs/source/Instruction/Command-line-parameters.md b/docs/source/Instruction/Command-line-parameters.md
index 7dd4c43025..232f65819f 100644
--- a/docs/source/Instruction/Command-line-parameters.md
+++ b/docs/source/Instruction/Command-line-parameters.md
@@ -846,6 +846,15 @@ qwen2_5_omni除了包含qwen2_5_vl和qwen2_audio的模型特定参数外,还
 - VIDEO_MAX_SLICE_NUMS: 默认为1,视频的MAX_SLICE_NUMS,参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6)。
 - MAX_NUM_FRAMES: 默认为64,参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6)。
 
+### minicpmv4_5
+- MAX_SLICE_NUMS: 默认为None,参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5/file/view/master?fileName=config.json&status=1)。
+- VIDEO_MAX_SLICE_NUMS: 默认为1,视频的MAX_SLICE_NUMS,参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。
+- MAX_NUM_FRAMES: 默认为180,参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。
+- MAX_NUM_PACKING: 默认为3,合法范围为1到6,参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。
+- TIME_SCALE: 默认为0.1,参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。
+- CHOOSE_FPS: 默认为3,参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。
+- FORCE_PACKING: 默认为None,参考[这里](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)。
+
 ### minicpmo
 - INIT_TTS: 默认为False。
 - INIT_AUDIO: 默认为False。
diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md
index 7ad244c061..6449517e30 100644
--- a/docs/source_en/Instruction/Command-line-parameters.md
+++ b/docs/source_en/Instruction/Command-line-parameters.md
@@ -871,6 +871,15 @@ For the meaning of the arguments, please refer to [here](https://modelscope.cn/m
 - VIDEO_MAX_SLICE_NUMS: Default is 1, which is the MAX_SLICE_NUMS for videos, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6)
 - MAX_NUM_FRAMES: Default is 64, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6)
 
+### minicpmv4_5
+- MAX_SLICE_NUMS: Default is None, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5/file/view/master?fileName=config.json&status=1)
+- VIDEO_MAX_SLICE_NUMS: Default is 1, which is the MAX_SLICE_NUMS for videos, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)
+- MAX_NUM_FRAMES: Default is 180, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)
+- MAX_NUM_PACKING: Default is 3; the valid range is 1 to 6, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)
+- TIME_SCALE: Default is 0.1, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)
+- CHOOSE_FPS: Default is 3, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)
+- FORCE_PACKING: Default is None, refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)
+
 ### minicpmo
 - INIT_TTS: Default is False
 - INIT_AUDIO: Default is False
diff --git a/swift/template/templates/minicpm.py b/swift/template/templates/minicpm.py
index 491d2c5a23..3ff83dc1a3 100644
--- a/swift/template/templates/minicpm.py
+++ b/swift/template/templates/minicpm.py
@@ -12,7 +12,7 @@
 from ..register import TemplateMeta, register_template
 from ..template_inputs import StdTemplateInputs
 from ..utils import Context, Prompt, findall
-from ..vision_utils import load_video_minicpmv_mplug_owl3
+from ..vision_utils import load_video_minicpmv_mplug_owl3, load_video_minicpmv4_5
 from .llama import Llama3TemplateMeta
 from .qwen import Qwen2_5TemplateMeta, Qwen3MixedTemplateMeta, QwenTemplateMeta
 from .utils import ChatmlTemplateMeta
@@ -251,9 +251,50 @@ def _get_new_tokens(i):
 class MiniCPMV4_5Template(MiniCPMV2_6Template):
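+    # Temporal packing for MiniCPM-V 4.5: sampled video frames are packed
+    # several-per-image-slot and aligned through `temporal_ids`. The env args
+    # read below (documented in Command-line-parameters.md above) bound how
+    # many frames are sampled (MAX_NUM_FRAMES), how many frames may share one
+    # slot (MAX_NUM_PACKING, 1-6), the timestamp granularity in seconds
+    # (TIME_SCALE) and the target sampling rate (CHOOSE_FPS); FORCE_PACKING,
+    # when set, pins the packing factor.
+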
+    def init_env_args(self):
+        super().init_env_args()
+        self.max_num_frames = get_env_args('max_num_frames', int, 180)
+        self.max_num_packing = get_env_args('max_num_packing', int, 3)
+        assert 1 <= self.max_num_packing <= 6
+        self.time_scale = get_env_args('time_scale', float, 0.1)
+        self.choose_fps = get_env_args('choose_fps', float, 3)
+        self.force_packing = get_env_args('force_packing', int, None)
+
+    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index,
+                    inputs: StdTemplateInputs) -> List[Context]:
+        assert media_type in {'image', 'video'}
+        load_video = partial(load_video_minicpmv4_5,
+                             max_num_frames=self.max_num_frames,
+                             max_num_packing=self.max_num_packing,
+                             time_scale=self.time_scale,
+                             choose_fps=self.choose_fps,
+                             force_packing=self.force_packing)
+        image_context = super().replace_tag('image', index, inputs)
+        if media_type == 'image':
+            return image_context
+        elif media_type == 'video':
+            return self.replace_video2image(load_video, inputs, lambda i: image_context)
+
+    def replace_video2image(self, load_video_func, inputs: StdTemplateInputs, replace_tag) -> List[Context]:
+        context_list = []
+        if self.mode in {'vllm', 'lmdeploy'}:
+            video = inputs.videos.pop(inputs.video_idx)
+            inputs.video_idx -= 1
+        else:
+            video = inputs.videos[inputs.video_idx]
+        images = inputs.images
+        new_images, temporal_ids = load_video_func(video)
+        inputs.images = images[:inputs.image_idx] + new_images + images[inputs.image_idx:]
+        for i in range(len(new_images)):
+            context_list += replace_tag(i)
+        inputs.image_idx += len(new_images)
+        inputs.extra_kwargs['temporal_ids'] = temporal_ids
+        return context_list
+
     def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
         encoded = Template._encode(self, inputs)
         images = inputs.images
+        temporal_ids = inputs.extra_kwargs.get('temporal_ids', None)
         use_video = bool(inputs.videos)
         use_image_id = True
         max_slice_nums = self.max_slice_nums
@@ -267,12 +308,16 @@ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
             image_processor = self.processor.image_processor
             image_inputs = image_processor([images],
                                            return_tensors='pt',
-                                           max_slice_nums=max_slice_nums).to(self.model_info.torch_dtype)
+                                           max_slice_nums=max_slice_nums,
+                                           temporal_ids=temporal_ids).to(self.model_info.torch_dtype)
 
             def _get_new_tokens(i):
-                placeholder = image_processor.get_slice_image_placeholder(
-                    image_inputs.image_sizes[0][i], image_idx=i, max_slice_nums=max_slice_nums, use_image_id=use_image_id)
-                placeholder += '\n'
+                if i in image_inputs.skip_image_idx[0]:
+                    placeholder = ''
+                else:
+                    placeholder = image_processor.get_slice_image_placeholder(
+                        image_inputs.image_sizes[0][i], image_idx=i, max_slice_nums=max_slice_nums, use_image_id=use_image_id)
+                    placeholder += '\n'
                 return self.processor.encode(placeholder, add_special_tokens=False)
 
             input_ids, labels, loss_scale = self._extend_tokens(input_ids, labels, loss_scale, idx_list, _get_new_tokens)
@@ -311,6 +356,9 @@ def _data_collator(self, batch: List[Dict[str, Any]], *, padding_to: Optional[in
         res.update(Template._data_collator(self, batch, padding_to=padding_to))
         return res
 
+    def _post_encode(self, model, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        return inputs
+
 
 register_template(
     Qwen3MixedTemplateMeta(
diff --git a/swift/template/vision_utils.py b/swift/template/vision_utils.py
index dbd44ef2b5..97641217fc 100644
--- a/swift/template/vision_utils.py
+++ b/swift/template/vision_utils.py
@@ -276,27 +276,87 @@ def load_video_llava(video: Union[str, bytes]) -> np.ndarray:
     return np.stack([x.to_ndarray(format='rgb24') for x in frames])
 
 
+def _uniform_sample(_l, _n):
+    gap = len(_l) / _n
+    idxs = [int(i * gap + gap / 2) for i in range(_n)]
+    return [_l[i] for i in idxs]
+
+
 def load_video_minicpmv_mplug_owl3(video: Union[str, bytes], max_num_frames):
     from decord import VideoReader, cpu  # pip install decord
 
-    def uniform_sample(_l, _n):
-        gap = len(_l) / _n
-        idxs = [int(i * gap + gap / 2) for i in range(_n)]
-        return [_l[i] for i in idxs]
-
     video_io = load_file(video)
     vr = VideoReader(video_io, ctx=cpu(0))
     sample_fps = round(vr.get_avg_fps() / 1)  # FPS
     frame_idx = [i for i in range(0, len(vr), sample_fps)]
     if len(frame_idx) > max_num_frames:
-        frame_idx = uniform_sample(frame_idx, max_num_frames)
+        frame_idx = _uniform_sample(frame_idx, max_num_frames)
     frames = vr.get_batch(frame_idx).asnumpy()
     frames = [Image.fromarray(v.astype('uint8')) for v in frames]
     return frames
 
 
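+# Worked example of the packing arithmetic below, assuming a 30 fps source and
+# the documented defaults (choose_fps=3, max_num_frames=180, max_num_packing=3):
+#   * 60 s clip:  3 * 60 = 180 <= 180, so packing_nums=1 and
+#     round(min(3, 30) * min(180, 60)) = 180 frames fill 180 slots.
+#   * 120 s clip: 3 * 120 = 360 > 180, so packing_nums=ceil(360/180)=2 and
+#     round(120 * 3) = 360 frames are sampled, packed two per slot.
+#   * 600 s clip: ceil(600*3/180)=10 > 3, so packing is capped at 3 and
+#     180 * 3 = 540 frames are sampled, packed three per slot.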
+def load_video_minicpmv4_5(video: Union[str, bytes],
+                           max_num_frames: int,
+                           max_num_packing: int,
+                           time_scale: float,
+                           choose_fps: float = None,
+                           force_packing: int = None):
+
+    from decord import VideoReader, cpu  # pip install decord
+    from scipy.spatial import cKDTree
+
+    def map_to_nearest_scale(values, scale):
+        # Snap each timestamp to the nearest grid point via a 1-D KD-tree lookup.
+        tree = cKDTree(np.asarray(scale)[:, None])
+        _, indices = tree.query(np.asarray(values)[:, None])
+        return np.asarray(scale)[indices]
+
+    def group_array(arr, size):
+        return [arr[i:i + size] for i in range(0, len(arr), size)]
+
+    video_io = load_file(video)
+    vr = VideoReader(video_io, ctx=cpu(0))
+    fps = vr.get_avg_fps()
+    duration = len(vr) / fps
+
+    if choose_fps is None:
+        choose_fps = round(fps / 1)  # Get choose FPS based on the original FPS
+
+    # Prepare packing
+    if choose_fps * int(duration) <= max_num_frames:
+        packing_nums = 1
+        choose_frames = round(min(choose_fps, round(fps)) * min(max_num_frames, duration))
+    else:
+        packing_nums = math.ceil(duration * choose_fps / max_num_frames)
+        if packing_nums <= max_num_packing:
+            choose_frames = round(duration * choose_fps)
+        else:
+            choose_frames = round(max_num_frames * max_num_packing)
+            packing_nums = max_num_packing
+
+    frame_idx = [i for i in range(0, len(vr))]
+    frame_idx = np.array(_uniform_sample(frame_idx, choose_frames))
+
+    if force_packing:
+        packing_nums = min(force_packing, max_num_packing)
+
+    frames = vr.get_batch(frame_idx).asnumpy()
+
+    # Quantize frame timestamps (seconds) to integer ids on a time_scale grid.
+    frame_idx_ts = frame_idx / fps
+    scale = np.arange(0, duration, time_scale)
+
+    frame_ts_id = map_to_nearest_scale(frame_idx_ts, scale) / time_scale
+    frame_ts_id = frame_ts_id.astype(np.int32)
+    assert len(frames) == len(frame_ts_id)
+
+    frames = [Image.fromarray(v.astype('uint8')).convert('RGB') for v in frames]
+    frame_ts_id_group = group_array(frame_ts_id, packing_nums)
+
+    return frames, frame_ts_id_group
+
+
 def load_audio(audio: Union[str, bytes], sampling_rate: int, return_sr: bool = False):
     import librosa
     audio_io = load_file(audio)
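
For a quick sanity check of the new sampler once this patch is applied, something like the following should work (a minimal sketch: `demo.mp4` is a placeholder path, and the import path follows the file layout in this diff):

```python
from swift.template.vision_utils import load_video_minicpmv4_5

# Sample a video with the documented defaults; returns the RGB frames and the
# temporal-id groups that the MiniCPM-V 4.5 processor packs into image slots.
frames, temporal_ids = load_video_minicpmv4_5(
    'demo.mp4', max_num_frames=180, max_num_packing=3, time_scale=0.1, choose_fps=3)
print(len(frames), [len(group) for group in temporal_ids])
```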