Skip to content

Qwen3VL 视频输入有问题(有升级到verl0.7.0正式版的计划吗) #142

@zideliu

Description

@zideliu

使用 Qwen3VL 输入视频进行训练时会报错。经查发现是因为 async 模式下还不支持视频模态(没有视频模态),需要修改 verltool_agent_loop.py、vllm_async_server.py 和 agent_loop.py 三个文件;但修改后仍然会报错,想问还有没有其他解决方案?

When training Qwen3VL with video input, an error is raised. After investigating, I found that async mode does not support the video modality yet (there is no video handling at all), so verltool_agent_loop.py, vllm_async_server.py and agent_loop.py all need to be modified. However, the error persists even after these modifications. Is there any other solution?

运行环境:

My environment

vllm                                     0.11.0
torch                                    2.8.0
torch-geometric                          2.6.1
torch_memory_saver                       0.0.9rc1
torchao                                  0.9.0
torchaudio                               2.8.0
torchdata                                0.11.0
torchprofile                             0.0.4
torchvision                              0.23.0
transformers                             4.57.1
sentence-transformers                    5.1.1


nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2025 NVIDIA Corporation
Built on Fri_Feb_21_20:23:50_PST_2025
Cuda compilation tools, release 12.8, V12.8.93
Build cuda_12.8.r12.8/compiler.35583870_0

# Excerpt (truncated) of the original agent-loop entry point. Only the "image"
# and "audio" entries of multi_modal_data are read here; there is no "video" lookup.
async def run(self, sampling_params: dict[str, Any], **kwargs) -> AgentLoopOutput:
# Copy the raw prompt token ids into a fresh list.
prompt_ids = list(kwargs["raw_prompt_ids"])
# A missing or None multi_modal_data is treated as an empty dict.
multi_modal_data = kwargs.get("multi_modal_data") or {}
image_data = multi_modal_data.get("image")
# Each image is passed through encode_image_url; stays None when no images were given.
encoded_image_data = [encode_image_url(img) for img in image_data] if image_data is not None else None
audio_data = multi_modal_data.get("audio")
# Each audio item is passed through encode_audio_data; stays None when no audio was given.
encoded_audio_data = [encode_audio_data(audio) for audio in audio_data] if audio_data is not None else None
# A per-call "use_tool" kwarg overrides the agent_config.enable_agent default.
use_tool = kwargs.get("use_tool", self.agent_config.enable_agent)

# Excerpt (truncated) of the original vLLM server generate(): it normalises the
# sampling parameters and builds a TokensPrompt. The excerpt ends before the
# prompt is submitted, so the return path is not shown.
async def generate(
self,
prompt_ids: list[int],
sampling_params: dict[str, Any],
request_id: str,
image_data: Optional[list[Any]] = None,
audio_data: Optional[list[Any]] = None,
) -> VerlToolTokenOutput:
"""Generate sequence with token-in-token-out."""
# TODO(@wuxibin): switch to `/generate` http endpoint once multi-modal support ready.
# Cap max_tokens by the context left after the prompt (max_model_len - len(prompt_ids));
# falls back to config.response_length when the caller gave no max_tokens.
# NOTE(review): the result is not clamped, so it looks like it can be <= 0 for an over-long prompt — confirm.
max_tokens = min(self.config.max_model_len - len(prompt_ids), sampling_params.get("max_tokens", self.config.response_length))
# The caller-supplied dict is updated in place by the next three statements.
sampling_params["max_tokens"] = max_tokens
# A truthy "logprobs" flag becomes 0; anything else becomes None.
sampling_params["logprobs"] = 0 if sampling_params.pop("logprobs", False) else None
sampling_params.setdefault("repetition_penalty", self.config.get("repetition_penalty", 1.0))
# From here on the name refers to a SamplingParams object, not the input dict.
sampling_params = SamplingParams(**sampling_params)
prompt_ids = _qwen2_5_dedup_multimodal_tokens(prompt_ids, self.model_config.processor)
# Only non-empty image/audio inputs are forwarded; there is no "video" key.
multi_modal_data: dict[str, Any] = {}
if image_data:
multi_modal_data["image"] = image_data
if audio_data:
multi_modal_data["audio"] = audio_data
# An empty multi_modal_data dict is passed as None.
prompt = TokensPrompt(
prompt_token_ids=prompt_ids, multi_modal_data=multi_modal_data or None
)

async def generate(
    self,
    request_id,
    *,
    prompt_ids: list[int],
    sampling_params: dict[str, Any],
    image_data: Optional[list[Any]] = None,
    audio_data: Optional[list[Any]] = None,
) -> TokenOutput:
    """Forward a token-level generation request to a rollout server.

    The server is picked from ``request_id`` via ``_choose_server``; the
    request sent to that server carries a freshly generated UUID as its id.

    Args:
        request_id (str): request id for sticky session.
        prompt_ids (List[int]): List of prompt token ids.
        sampling_params (Dict[str, Any]): Sampling parameters for the chat completion.
        image_data (Optional[List[Any]]): Image inputs, forwarded unchanged.
        audio_data (Optional[List[Any]]): Audio inputs, forwarded unchanged.

    Returns:
        TokenOutput: token output
    """
    # Select the server first, then build the per-call payload.
    target = self._choose_server(request_id)
    remote_kwargs = {
        "request_id": str(uuid.uuid4()),
        "prompt_ids": prompt_ids,
        "sampling_params": sampling_params,
        "image_data": image_data,
        "audio_data": audio_data,
    }
    return await target.generate.remote(**remote_kwargs)

verltool_agent_loop.py修改如下

verltool_agent_loop.py modified as follows

    # Excerpt of the reporter's modified run(). The "......" lines mark code the
    # reporter left out, so names such as running_prompt_ids and
    # running_video_data are defined in parts that are not shown here.
    async def run(self, sampling_params: dict[str, Any], **kwargs) -> AgentLoopOutput:
        prompt_ids = list(kwargs["raw_prompt_ids"])
        # A missing or None multi_modal_data is treated as an empty dict.
        multi_modal_data = kwargs.get("multi_modal_data") or {}
        image_data = multi_modal_data.get("image")
        encoded_image_data = [encode_image_url(img) for img in image_data] if image_data is not None else None
        audio_data = multi_modal_data.get("audio")
        encoded_audio_data = [encode_audio_data(audio) for audio in audio_data] if audio_data is not None else None
        
        # New in this modification: read the "video" entry as well.
        video_data = multi_modal_data.get("video")
        if video_data is not None:
            # Transpose the items into two parallel lists.
            # Assumes every item is a (video, metadata) pair — TODO confirm against the dataset code.
            videos, video_metadatas = zip(*video_data, strict=False)
            videos, video_metadatas = list(videos), list(video_metadatas)
......
        if image_data or audio_data or video_data:
            # Decode the prompt ids back to text (special tokens kept) for the processor call.
            raw_prompt_text = self.tokenizer.decode(prompt_ids, skip_special_tokens=False)
            processor_kwargs = {"text": [raw_prompt_text], "return_tensors": "pt"}
            if image_data:
                processor_kwargs["images"] = image_data
            if audio_data:
                processor_kwargs["audio"] = audio_data

            if video_data: 
                processor_kwargs["videos"] = videos
                # NOTE(review): the key is spelled "video_metadatas" here; verify the
                # processor accepts that name (the Hugging Face video processors
                # appear to document "video_metadata") — TODO confirm.
                processor_kwargs["video_metadatas"] = video_metadatas
......
           # NOTE(review): running_video_data is built in the omitted code. Whether it
           # carries the split videos or the original (video, metadata) items is not
           # visible here — confirm it matches what the vLLM server expects.
           with simple_timer("generate_sequences", metrics):
                output = await self.server_manager.generate(
                    request_id=request_id,
                    prompt_ids=running_prompt_ids,
                    sampling_params=agent_sampling_params,
                    image_data=running_image_data,
                    audio_data=running_audio_data,
                    video_data=running_video_data,
                ) # request_id here should be unique for each generate call, otherwise vllm can generate empty response
                

vllm_async_server.py修改如下

vllm_async_server.py modified as follows

    # Excerpt (truncated) of the reporter's modified vLLM server generate(): the
    # same as the original, plus a video_data parameter that is forwarded as
    # multi_modal_data["video"]. The excerpt ends before the prompt is submitted.
    async def generate(
        self,
        prompt_ids: list[int],
        sampling_params: dict[str, Any],
        request_id: str,
        image_data: Optional[list[Any]] = None,
        audio_data: Optional[list[Any]] = None,
        video_data: Optional[list[Any]] = None,
    ) -> VerlToolTokenOutput:
        """Generate sequence with token-in-token-out."""
        # TODO(@wuxibin): switch to `/generate` http endpoint once multi-modal support ready.
        # Cap max_tokens by the context left after the prompt; falls back to
        # config.response_length when the caller gave no max_tokens.
        max_tokens = min(self.config.max_model_len - len(prompt_ids), sampling_params.get("max_tokens", self.config.response_length))
        # The caller-supplied dict is updated in place by the next three statements.
        sampling_params["max_tokens"] = max_tokens
        # A truthy "logprobs" flag becomes 0; anything else becomes None.
        sampling_params["logprobs"] = 0 if sampling_params.pop("logprobs", False) else None
        sampling_params.setdefault("repetition_penalty", self.config.get("repetition_penalty", 1.0))
        sampling_params = SamplingParams(**sampling_params)
        # NOTE(review): the reported AssertionError ("Failed to apply prompt replacement
        # for mm_items['video'][0]") is raised while vLLM processes this prompt. Check
        # that the ids returned by this helper still hold the video placeholder in the
        # form vLLM expects for the model — TODO confirm.
        prompt_ids = _qwen2_5_dedup_multimodal_tokens(prompt_ids, self.model_config.processor)
        
        # Only non-empty modalities are forwarded.
        multi_modal_data: dict[str, Any] = {}
        if image_data:
            multi_modal_data["image"] = image_data
        if audio_data:
            multi_modal_data["audio"] = audio_data
        if video_data:
            # NOTE(review): the per-item format vLLM needs for video (for example,
            # whether metadata must accompany each video) is not visible here — confirm.
            multi_modal_data["video"] = video_data
        # An empty multi_modal_data dict is passed as None.
        prompt = TokensPrompt(
            prompt_token_ids=prompt_ids, multi_modal_data=multi_modal_data or None
        )

agent_loop.py修改如下

agent_loop.py modified as follows

    @rollout_trace_op
    async def generate(
        self,
        request_id,
        *,
        prompt_ids: list[int],
        sampling_params: dict[str, Any],
        image_data: Optional[list[Any]] = None,
        audio_data: Optional[list[Any]] = None,
        video_data: Optional[list[Any]] = None,
    ) -> TokenOutput:
        """Dispatch a token-level generation request to a rollout server.

        ``request_id`` is used only to pick the server through
        ``_choose_server``; the call sent to that server is tagged with a
        newly generated UUID instead.

        Args:
            request_id (str): request id for sticky session.
            prompt_ids (List[int]): List of prompt token ids.
            sampling_params (Dict[str, Any]): Sampling parameters for the chat completion.
            image_data (Optional[List[Any]]): Image inputs, forwarded unchanged.
            audio_data (Optional[List[Any]]): Audio inputs, forwarded unchanged.
            video_data (Optional[List[Any]]): Video inputs, forwarded unchanged.

        Returns:
            TokenOutput: token output
        """
        # Resolve the target server before generating the per-call id.
        chosen = self._choose_server(request_id)
        payload = dict(
            request_id=str(uuid.uuid4()),
            prompt_ids=prompt_ids,
            sampling_params=sampling_params,
            image_data=image_data,
            audio_data=audio_data,
            video_data=video_data,
        )
        result = await chosen.generate.remote(**payload)
        return result

然后出现报错提示:

Then, an error message appeared:

  File "/zideliu/code/verl-tool/verl_tool/agent_loop/agent_loop.py", line 533, in _run_agent_loop
    output: AgentLoopOutput = await agent_loop.run(sampling_params, **kwargs)                                                                                                             
                              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^                                                                                                             
  File "/zideliu/code/verl-tool/verl_tool/agent_loop/verltool_agent_loop.py", line 535, in run                                                                  
    output = await self.server_manager.generate(                                                                                                                                          
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^                                                                                                                                          
  File "/zideliu/code/verl-tool/verl/verl/utils/rollout_trace.py", line 146, in async_wrapper                                                                   
    return await func(self, *args, **kwargs)                                                                                                                                              
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^                                                                                                                                              
  File "/zideliu/code/verl-tool/verl_tool/agent_loop/agent_loop.py", line 109, in generate                                                                      
    output = await server.generate.remote(                                                                                                                                                
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^                                                                                                                                                
ray.exceptions.RayTaskError(EngineGenerateError): ray::VerlToolvLLMHttpServer.generate() (pid=169750, ip=10.72.0.110, actor_id=6df22e8152d95916d2cc78f902000000, repr=<verl_tool.workers.rollout.vllm_rollout.vllm_async_server.VerlToolvLLMHttpServer object at 0x7ed264855580>)
        ^^^^^^^^^^^^^^^^^^^^^^^                                                                                                                                                           
  File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 284, in add_request                                                                                    
    prompt_str, request = self.processor.process_inputs(                                                                                                                                  
                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^                                                                                                                                  
  File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/processor.py", line 377, in process_inputs                                                                                 
    processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess(                                                                                                               
                                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^                                                                                                               
  File "/usr/local/lib/python3.12/dist-packages/vllm/inputs/preprocess.py", line 644, in preprocess                                                                                       
    return self._process_decoder_only_prompt(                                                                                                                                             
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^                                                                                                                                             
  File "/usr/local/lib/python3.12/dist-packages/vllm/inputs/preprocess.py", line 614, in _process_decoder_only_prompt                                                                     
    prompt_comps = self._prompt_to_llm_inputs(                                                                                                                                            
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^                                                                                                                                            
  File "/usr/local/lib/python3.12/dist-packages/vllm/inputs/preprocess.py", line 388, in _prompt_to_llm_inputs                                                                            
    return self._process_tokens(                                                                                                                                                          
           ^^^^^^^^^^^^^^^^^^^^^                                                                                                                                                          
  File "/usr/local/lib/python3.12/dist-packages/vllm/inputs/preprocess.py", line 317, in _process_tokens                                                                                  
    inputs = self._process_multimodal(
             ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/vllm/inputs/preprocess.py", line 242, in _process_multimodal
    mm_input = mm_processor.apply(
               ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/vllm/multimodal/processing.py", line 2045, in apply
    prompt_ids, prompt, mm_placeholders = self._maybe_apply_prompt_updates(
                                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/vllm/multimodal/processing.py", line 1997, in _maybe_apply_prompt_updates
    ) = self._apply_prompt_updates(
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/vllm/multimodal/processing.py", line 1919, in _apply_prompt_updates
    assert update_idx is not None, (
           ^^^^^^^^^^^^^^^^^^^^^^
AssertionError: Failed to apply prompt replacement for mm_items['video'][0]

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions