使用 Qwen3VL 输入视频进行训练时会报错。经查发现是因为 async 模式下还不支持视频模态(代码中没有视频模态的处理)。为此需要修改 verltool_agent_loop.py、vllm_async_server.py 和 agent_loop.py 三个文件,但修改后还是会报错,想问还有没有什么解决方案?
When training Qwen3VL with video input, an error is raised. After investigating, I found that this is because async mode does not yet support the video modality (the code has no handling for video). Three files need to be modified to add it: verltool_agent_loop.py, vllm_async_server.py and agent_loop.py. However, the error still occurs after these modifications. Is there any other solution?
运行环境:
My environment
vllm 0.11.0
torch 2.8.0
torch-geometric 2.6.1
torch_memory_saver 0.0.9rc1
torchao 0.9.0
torchaudio 2.8.0
torchdata 0.11.0
torchprofile 0.0.4
torchvision 0.23.0
transformers 4.57.1
sentence-transformers 5.1.1
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2025 NVIDIA Corporation
Built on Fri_Feb_21_20:23:50_PST_2025
Cuda compilation tools, release 12.8, V12.8.93
Build cuda_12.8.r12.8/compiler.35583870_0
|
async def run(self, sampling_params: dict[str, Any], **kwargs) -> AgentLoopOutput: |
|
prompt_ids = list(kwargs["raw_prompt_ids"]) |
|
multi_modal_data = kwargs.get("multi_modal_data") or {} |
|
image_data = multi_modal_data.get("image") |
|
encoded_image_data = [encode_image_url(img) for img in image_data] if image_data is not None else None |
|
audio_data = multi_modal_data.get("audio") |
|
encoded_audio_data = [encode_audio_data(audio) for audio in audio_data] if audio_data is not None else None |
|
use_tool = kwargs.get("use_tool", self.agent_config.enable_agent) |
|
async def generate( |
|
self, |
|
prompt_ids: list[int], |
|
sampling_params: dict[str, Any], |
|
request_id: str, |
|
image_data: Optional[list[Any]] = None, |
|
audio_data: Optional[list[Any]] = None, |
|
) -> VerlToolTokenOutput: |
|
"""Generate sequence with token-in-token-out.""" |
|
# TODO(@wuxibin): switch to `/generate` http endpoint once multi-modal support ready. |
|
max_tokens = min(self.config.max_model_len - len(prompt_ids), sampling_params.get("max_tokens", self.config.response_length)) |
|
sampling_params["max_tokens"] = max_tokens |
|
sampling_params["logprobs"] = 0 if sampling_params.pop("logprobs", False) else None |
|
sampling_params.setdefault("repetition_penalty", self.config.get("repetition_penalty", 1.0)) |
|
sampling_params = SamplingParams(**sampling_params) |
|
prompt_ids = _qwen2_5_dedup_multimodal_tokens(prompt_ids, self.model_config.processor) |
|
|
|
multi_modal_data: dict[str, Any] = {} |
|
if image_data: |
|
multi_modal_data["image"] = image_data |
|
if audio_data: |
|
multi_modal_data["audio"] = audio_data |
|
prompt = TokensPrompt( |
|
prompt_token_ids=prompt_ids, multi_modal_data=multi_modal_data or None |
|
) |
|
async def generate( |
|
self, |
|
request_id, |
|
*, |
|
prompt_ids: list[int], |
|
sampling_params: dict[str, Any], |
|
image_data: Optional[list[Any]] = None, |
|
audio_data: Optional[list[Any]] = None, |
|
) -> TokenOutput: |
|
"""Generate tokens from prompt ids. |
|
|
|
Args: |
|
request_id (str): request id for sticky session. |
|
prompt_ids (List[int]): List of prompt token ids. |
|
sampling_params (Dict[str, Any]): Sampling parameters for the chat completion. |
|
|
|
Returns: |
|
TokenOutput: token output |
|
""" |
|
server = self._choose_server(request_id) |
|
output = await server.generate.remote( |
|
request_id=str(uuid.uuid4()), |
|
prompt_ids=prompt_ids, |
|
sampling_params=sampling_params, |
|
image_data=image_data, |
|
audio_data=audio_data, |
|
) |
|
return output |
verltool_agent_loop.py修改如下
verltool_agent_loop.py modified as follows
async def run(self, sampling_params: dict[str, Any], **kwargs) -> AgentLoopOutput:
prompt_ids = list(kwargs["raw_prompt_ids"])
multi_modal_data = kwargs.get("multi_modal_data") or {}
image_data = multi_modal_data.get("image")
encoded_image_data = [encode_image_url(img) for img in image_data] if image_data is not None else None
audio_data = multi_modal_data.get("audio")
encoded_audio_data = [encode_audio_data(audio) for audio in audio_data] if audio_data is not None else None
video_data = multi_modal_data.get("video")
if video_data is not None:
videos, video_metadatas = zip(*video_data, strict=False)
videos, video_metadatas = list(videos), list(video_metadatas)
......
if image_data or audio_data or video_data:
raw_prompt_text = self.tokenizer.decode(prompt_ids, skip_special_tokens=False)
processor_kwargs = {"text": [raw_prompt_text], "return_tensors": "pt"}
if image_data:
processor_kwargs["images"] = image_data
if audio_data:
processor_kwargs["audio"] = audio_data
if video_data:
processor_kwargs["videos"] = videos
processor_kwargs["video_metadatas"] = video_metadatas
......
with simple_timer("generate_sequences", metrics):
output = await self.server_manager.generate(
request_id=request_id,
prompt_ids=running_prompt_ids,
sampling_params=agent_sampling_params,
image_data=running_image_data,
audio_data=running_audio_data,
video_data=running_video_data,
) # request_id here should be unique for each generate call, otherwise vllm can generate empty response
vllm_async_server.py修改如下
vllm_async_server.py modified as follows
async def generate(
self,
prompt_ids: list[int],
sampling_params: dict[str, Any],
request_id: str,
image_data: Optional[list[Any]] = None,
audio_data: Optional[list[Any]] = None,
video_data: Optional[list[Any]] = None,
) -> VerlToolTokenOutput:
"""Generate sequence with token-in-token-out."""
# TODO(@wuxibin): switch to `/generate` http endpoint once multi-modal support ready.
max_tokens = min(self.config.max_model_len - len(prompt_ids), sampling_params.get("max_tokens", self.config.response_length))
sampling_params["max_tokens"] = max_tokens
sampling_params["logprobs"] = 0 if sampling_params.pop("logprobs", False) else None
sampling_params.setdefault("repetition_penalty", self.config.get("repetition_penalty", 1.0))
sampling_params = SamplingParams(**sampling_params)
prompt_ids = _qwen2_5_dedup_multimodal_tokens(prompt_ids, self.model_config.processor)
multi_modal_data: dict[str, Any] = {}
if image_data:
multi_modal_data["image"] = image_data
if audio_data:
multi_modal_data["audio"] = audio_data
if video_data:
multi_modal_data["video"] = video_data
prompt = TokensPrompt(
prompt_token_ids=prompt_ids, multi_modal_data=multi_modal_data or None
)
agent_loop.py修改如下
agent_loop.py modified as follows
@rollout_trace_op
async def generate(
    self,
    request_id,
    *,
    prompt_ids: list[int],
    sampling_params: dict[str, Any],
    image_data: Optional[list[Any]] = None,
    audio_data: Optional[list[Any]] = None,
    video_data: Optional[list[Any]] = None,
) -> TokenOutput:
    """Generate tokens from prompt ids.

    Selects a server with ``self._choose_server(request_id)`` and awaits that
    server's remote ``generate`` call, forwarding the prompt ids, sampling
    parameters and the optional media lists unchanged.

    Args:
        request_id (str): request id for sticky session. It is only used to
            choose the server; it is not forwarded to the server.
        prompt_ids (List[int]): List of prompt token ids.
        sampling_params (Dict[str, Any]): Sampling parameters for the chat completion.
        image_data (Optional[List[Any]]): Image inputs passed through to the server.
        audio_data (Optional[List[Any]]): Audio inputs passed through to the server.
        video_data (Optional[List[Any]]): Video inputs passed through to the server.
            NOTE(review): the element format the server expects (e.g. bare
            frames vs. (frames, metadata) pairs) is not visible here; confirm
            against the server-side ``generate``.

    Returns:
        TokenOutput: token output
    """
    server = self._choose_server(request_id)
    # A freshly generated UUID is sent as the server-side request id rather
    # than the caller's ``request_id``, so every remote call carries a
    # distinct id.
    output = await server.generate.remote(
        request_id=str(uuid.uuid4()),
        prompt_ids=prompt_ids,
        sampling_params=sampling_params,
        image_data=image_data,
        audio_data=audio_data,
        video_data=video_data,
    )
    return output
然后出现报错提示:
After these modifications, the following error is raised:
File "/zideliu/code/verl-tool/verl_tool/agent_loop/agent_loop.py", line 533, in _run_agent_loop
output: AgentLoopOutput = await agent_loop.run(sampling_params, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/zideliu/code/verl-tool/verl_tool/agent_loop/verltool_agent_loop.py", line 535, in run
output = await self.server_manager.generate(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/zideliu/code/verl-tool/verl/verl/utils/rollout_trace.py", line 146, in async_wrapper
return await func(self, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/zideliu/code/verl-tool/verl_tool/agent_loop/agent_loop.py", line 109, in generate
output = await server.generate.remote(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ray.exceptions.RayTaskError(EngineGenerateError): ray::VerlToolvLLMHttpServer.generate() (pid=169750, ip=10.72.0.110, actor_id=6df22e8152d95916d2cc78f902000000, repr=<verl_tool.workers.rollout.vllm_rollout.vllm_async_server.VerlToolvLLMHttpServer object at 0x7ed264855580>)
^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 284, in add_request
prompt_str, request = self.processor.process_inputs(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/processor.py", line 377, in process_inputs
processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/inputs/preprocess.py", line 644, in preprocess
return self._process_decoder_only_prompt(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/inputs/preprocess.py", line 614, in _process_decoder_only_prompt
prompt_comps = self._prompt_to_llm_inputs(
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/inputs/preprocess.py", line 388, in _prompt_to_llm_inputs
return self._process_tokens(
^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/inputs/preprocess.py", line 317, in _process_tokens
inputs = self._process_multimodal(
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/inputs/preprocess.py", line 242, in _process_multimodal
mm_input = mm_processor.apply(
^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/multimodal/processing.py", line 2045, in apply
prompt_ids, prompt, mm_placeholders = self._maybe_apply_prompt_updates(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/multimodal/processing.py", line 1997, in _maybe_apply_prompt_updates
) = self._apply_prompt_updates(
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/multimodal/processing.py", line 1919, in _apply_prompt_updates
assert update_idx is not None, (
^^^^^^^^^^^^^^^^^^^^^^
AssertionError: Failed to apply prompt replacement for mm_items['video'][0]
使用Qwen3VL,输入视频进行训练时会报错,经查发现是因为async模式下还不支持视频模态(没有视频模态)verltool_agent_loop.py 、vllm_async_server.py和agent_loop.py三个文件需要修改,但修改后还是会报错,想问还有没有什么解决方案?
运行环境:
verl-tool/verl_tool/agent_loop/verltool_agent_loop.py
Lines 451 to 458 in 09103b0
verl-tool/verl_tool/workers/rollout/vllm_rollout/vllm_async_server.py
Lines 48 to 72 in 09103b0
verl-tool/verl_tool/agent_loop/agent_loop.py
Lines 88 to 115 in 09103b0
verltool_agent_loop.py修改如下
vllm_async_server.py修改如下
agent_loop.py修改如下
然后出现报错提示: