9 changes: 9 additions & 0 deletions docs/source/Instruction/Command-line-parameters.md
@@ -846,6 +846,15 @@ Besides the model-specific arguments of qwen2_5_vl and qwen2_audio, qwen2_5_omni also
- VIDEO_MAX_SLICE_NUMS: Default is 1, the MAX_SLICE_NUMS used for videos; refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6).
- MAX_NUM_FRAMES: Default is 64; refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6).

### minicpmv4_5
- MAX_SLICE_NUMS: Default is None; refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5/file/view/master?fileName=config.json&status=1).
- VIDEO_MAX_SLICE_NUMS: Default is 1, the MAX_SLICE_NUMS used for videos; refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5).
- MAX_NUM_FRAMES: Default is 180; refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5).
- MAX_NUM_PACKING: Default is 3, with a valid range of 1 to 6; refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5).
- TIME_SCALE: Default is 0.1; refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5).
- CHOOSE_FPS: Default is 3; refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5).
- FORCE_PACKING: Default is None; refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5).

### minicpmo
- INIT_TTS: Default is False.
- INIT_AUDIO: Default is False.
9 changes: 9 additions & 0 deletions docs/source_en/Instruction/Command-line-parameters.md
@@ -871,6 +871,15 @@ For the meaning of the arguments, please refer to [here](https://modelscope.cn/m
- VIDEO_MAX_SLICE_NUMS: Default is 1, the MAX_SLICE_NUMS used for videos; refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6)
- MAX_NUM_FRAMES: Default is 64; refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6)

### minicpmv4_5
- MAX_SLICE_NUMS: Default is None; refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5/file/view/master?fileName=config.json&status=1)
- VIDEO_MAX_SLICE_NUMS: Default is 1, the MAX_SLICE_NUMS used for videos; refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)
- MAX_NUM_FRAMES: Default is 180; refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)
- MAX_NUM_PACKING: Default is 3, with a valid range of 1 to 6; refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)
- TIME_SCALE: Default is 0.1; refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)
- CHOOSE_FPS: Default is 3; refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)
- FORCE_PACKING: Default is None; refer to [here](https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5)
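
A minimal sketch of overriding these from Python before the template initializes (the names come from the list above; the values shown are just the documented defaults):

```python
import os

# Hypothetical overrides for minicpmv4_5 video sampling; set these before
# swift reads them via get_env_args.
os.environ['MAX_NUM_FRAMES'] = '180'   # cap on sampled video frames
os.environ['MAX_NUM_PACKING'] = '3'    # must lie in [1, 6]
os.environ['TIME_SCALE'] = '0.1'       # timestamp quantization step, in seconds
os.environ['CHOOSE_FPS'] = '3'         # target sampling rate (frames per second)
```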

### minicpmo
- INIT_TTS: Default is False
- INIT_AUDIO: Default is False
58 changes: 53 additions & 5 deletions swift/template/templates/minicpm.py
@@ -12,7 +12,7 @@
from ..register import TemplateMeta, register_template
from ..template_inputs import StdTemplateInputs
from ..utils import Context, Prompt, findall
from ..vision_utils import load_video_minicpmv_mplug_owl3, load_video_minicpmv4_5
from .llama import Llama3TemplateMeta
from .qwen import Qwen2_5TemplateMeta, Qwen3MixedTemplateMeta, QwenTemplateMeta
from .utils import ChatmlTemplateMeta
@@ -251,9 +251,50 @@ def _get_new_tokens(i):

class MiniCPMV4_5Template(MiniCPMV2_6Template):

def init_env_args(self):
super().init_env_args()
self.max_num_frames = get_env_args('max_num_frames', int, 180)
self.max_num_packing = get_env_args('max_num_packing', int, 3)
        assert 1 <= self.max_num_packing <= 6, 'MAX_NUM_PACKING must be in the range [1, 6]'
self.time_scale = get_env_args('time_scale', float, 0.1)
self.choose_fps = get_env_args('choose_fps', float, 3)
self.force_packing = get_env_args('force_packing', int, None)

def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index,
inputs: StdTemplateInputs) -> List[Context]:
assert media_type in {'image', 'video'}
load_video = partial(load_video_minicpmv4_5,
max_num_frames=self.max_num_frames,
max_num_packing=self.max_num_packing,
time_scale=self.time_scale,
choose_fps=self.choose_fps,
force_packing=self.force_packing)
image_context = super().replace_tag('image', index, inputs)
if media_type == 'image':
return image_context
elif media_type == 'video':
return self.replace_video2image(load_video, inputs, lambda i: image_context)

def replace_video2image(self, load_video_func, inputs: StdTemplateInputs, replace_tag) -> List[Context]:
context_list = []
if self.mode in {'vllm', 'lmdeploy'}:
video = inputs.videos.pop(inputs.video_idx)
inputs.video_idx -= 1
else:
video = inputs.videos[inputs.video_idx]
images = inputs.images
new_images, temporal_ids = load_video_func(video)
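        # Splice the sampled frames into inputs.images at the current image cursor,
        # then emit one image tag per frame.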
inputs.images = images[:inputs.image_idx] + new_images + images[inputs.image_idx:]
for i in range(len(new_images)):
context_list += replace_tag(i)
inputs.image_idx += len(new_images)
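        # Hand the per-frame temporal group ids to _encode via extra_kwargs.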
inputs.extra_kwargs['temporal_ids'] = temporal_ids
return context_list

def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
encoded = Template._encode(self, inputs)
images = inputs.images
temporal_ids = inputs.extra_kwargs.get('temporal_ids', None)
use_video = bool(inputs.videos)
use_image_id = True
max_slice_nums = self.max_slice_nums
Expand All @@ -267,12 +308,16 @@ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:

image_processor = self.processor.image_processor
image_inputs = image_processor([images], return_tensors='pt',
                                       max_slice_nums=max_slice_nums,
                                       temporal_ids=temporal_ids).to(self.model_info.torch_dtype)

def _get_new_tokens(i):
            if i in image_inputs.skip_image_idx[0]:
                # Images flagged in skip_image_idx receive no placeholder tokens.
                placeholder = ''
            else:
                placeholder = image_processor.get_slice_image_placeholder(
                    image_inputs.image_sizes[0][i], image_idx=i, max_slice_nums=max_slice_nums, use_image_id=use_image_id)
                placeholder += '\n'
return self.processor.encode(placeholder, add_special_tokens=False)

input_ids, labels, loss_scale = self._extend_tokens(input_ids, labels, loss_scale, idx_list, _get_new_tokens)
@@ -311,6 +356,9 @@ def _data_collator(self, batch: List[Dict[str, Any]], *, padding_to: Optional[in
res.update(Template._data_collator(self, batch, padding_to=padding_to))
return res

    def _post_encode(self, model, inputs: Dict[str, Any]) -> Dict[str, Any]:
        # Intentionally a no-op: the encoded inputs are passed through unchanged.
        return inputs


register_template(
Qwen3MixedTemplateMeta(
72 changes: 66 additions & 6 deletions swift/template/vision_utils.py
@@ -276,27 +276,87 @@ def load_video_llava(video: Union[str, bytes]) -> np.ndarray:
return np.stack([x.to_ndarray(format='rgb24') for x in frames])


def _uniform_sample(_l, _n):
    # Uniformly sample _n items from _l by taking the centre of each of _n equal-width bins.
    gap = len(_l) / _n
    idxs = [int(i * gap + gap / 2) for i in range(_n)]
    return [_l[i] for i in idxs]
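
# Hand-worked example: _uniform_sample(list(range(10)), 5) -> [1, 3, 5, 7, 9]
# (gap = 10 / 5 = 2.0; each picked index is int(i * 2.0 + 1.0), the centre of bin i).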


def load_video_minicpmv_mplug_owl3(video: Union[str, bytes], max_num_frames):

from decord import VideoReader, cpu # pip install decord

video_io = load_file(video)
vr = VideoReader(video_io, ctx=cpu(0))
sample_fps = round(vr.get_avg_fps() / 1) # FPS
frame_idx = [i for i in range(0, len(vr), sample_fps)]

if len(frame_idx) > max_num_frames:
frame_idx = _uniform_sample(frame_idx, max_num_frames)
frames = vr.get_batch(frame_idx).asnumpy()
frames = [Image.fromarray(v.astype('uint8')) for v in frames]
return frames


def load_video_minicpmv4_5(video: Union[str, bytes],
max_num_frames: int,
max_num_packing: int,
time_scale: float,
choose_fps: float = None,
force_packing: int = None):

from decord import VideoReader, cpu # pip install decord
from scipy.spatial import cKDTree

    def map_to_nearest_scale(values, scale):
        # Snap each value to its nearest grid point in `scale` via a 1-D KD-tree query.
        tree = cKDTree(np.asarray(scale)[:, None])
        _, indices = tree.query(np.asarray(values)[:, None])
        return np.asarray(scale)[indices]

    def group_array(arr, size):
        # Split `arr` into consecutive chunks of length `size`; the last chunk may be shorter.
        return [arr[i:i + size] for i in range(0, len(arr), size)]
> **Review comment (Contributor, on lines +311 to +317, severity: medium):** The helper functions `map_to_nearest_scale` and `group_array` are defined inside `load_video_minicpmv4_5`. To improve code organization and potential reusability, consider moving them to the module level as private functions (e.g., `_map_to_nearest_scale` and `_group_array`), similar to how `_uniform_sample` was refactored.


video_io = load_file(video)
vr = VideoReader(video_io, ctx=cpu(0))
fps = vr.get_avg_fps()
duration = len(vr) / fps

if choose_fps is None:
choose_fps = round(fps / 1) # Get choose FPS based on the original FPS

    # Prepare packing: decide how many frames to sample (choose_frames) and how many
    # frames share a temporal group (packing_nums).
    if choose_fps * int(duration) <= max_num_frames:
        # Everything fits without packing.
        packing_nums = 1
        choose_frames = round(min(choose_fps, round(fps)) * min(max_num_frames, duration))
    else:
        packing_nums = math.ceil(duration * choose_fps / max_num_frames)
        if packing_nums <= max_num_packing:
            choose_frames = round(duration * choose_fps)
        else:
            # e.g. duration=600, choose_fps=3, max_num_frames=180: ceil(1800 / 180) = 10
            # exceeds max_num_packing, so clamp both values.
            choose_frames = round(max_num_frames * max_num_packing)
            packing_nums = max_num_packing

    frame_idx = list(range(len(vr)))
frame_idx = np.array(_uniform_sample(frame_idx, choose_frames))

if force_packing:
packing_nums = min(force_packing, max_num_packing)

frames = vr.get_batch(frame_idx).asnumpy()

    # Quantize each sampled frame's timestamp onto the time_scale grid, expressed
    # as an integer number of ticks.
    frame_idx_ts = frame_idx / fps
    scale = np.arange(0, duration, time_scale)

    frame_ts_id = map_to_nearest_scale(frame_idx_ts, scale) / time_scale
    frame_ts_id = frame_ts_id.astype(np.int32)
    assert len(frames) == len(frame_ts_id)

frames = [Image.fromarray(v.astype('uint8')).convert('RGB') for v in frames]
frame_ts_id_group = group_array(frame_ts_id, packing_nums)

return frames, frame_ts_id_group
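
A rough usage sketch (the video path is hypothetical; keyword values mirror the documented env-arg defaults):

```python
# Illustrative only: 'clip.mp4' is a placeholder path.
frames, temporal_groups = load_video_minicpmv4_5(
    'clip.mp4',
    max_num_frames=180,
    max_num_packing=3,
    time_scale=0.1,
    choose_fps=3)
# frames: list of RGB PIL.Image frames.
# temporal_groups: list of np.int32 arrays, one per packing group, holding the
# quantized timestamp ids that the template later passes to the image
# processor as `temporal_ids`.
```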


def load_audio(audio: Union[str, bytes], sampling_rate: int, return_sr: bool = False):
import librosa
audio_io = load_file(audio)