Checklist
- I have searched existing issues, and this is a new bug report.
Bug Description
Error message:
[rank0]: Traceback (most recent call last):
[rank0]: File "/root/miniconda3/lib/python3.11/site-packages/swift/utils/utils.py", line 379, in __new_getattr__
[rank0]: return super(self.__class__, self).__getattr__(key)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: AttributeError: 'super' object has no attribute '__getattr__'. Did you mean: '__setattr__'?
[rank0]: During handling of the above exception, another exception occurred:
[rank0]: Traceback (most recent call last):
[rank0]: File "/root/miniconda3/lib/python3.11/site-packages/swift/cli/_megatron/rlhf.py", line 5, in <module>
[rank0]: megatron_rlhf_main()
[rank0]: File "/root/miniconda3/lib/python3.11/site-packages/swift/megatron/train/rlhf.py", line 71, in megatron_rlhf_main
[rank0]: return MegatronRLHF(args).main()
[rank0]: ^^^^^^^^^^^^^^^^^^
[rank0]: File "/root/miniconda3/lib/python3.11/site-packages/swift/megatron/train/sft.py", line 57, in __init__
[rank0]: self.trainer = self.prepare_trainer()
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/root/miniconda3/lib/python3.11/site-packages/swift/megatron/train/rlhf.py", line 32, in prepare_trainer
[rank0]: return trainer_cls(args, self.template, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/root/miniconda3/lib/python3.11/site-packages/swift/megatron/trainers/grpo_trainer.py", line 59, in __init__
[rank0]: self._prepare_rollout_engine()
[rank0]: File "/root/miniconda3/lib/python3.11/site-packages/swift/megatron/trainers/grpo_trainer.py", line 172, in _prepare_rollout_engine
[rank0]: self.engine = self.prepare_vllm()
[rank0]: ^^^^^^^^^^^^^^^^^^^
[rank0]: File "/root/miniconda3/lib/python3.11/site-packages/swift/megatron/trainers/grpo_trainer.py", line 188, in prepare_vllm
[rank0]: engine = GRPOVllmEngine(
[rank0]: ^^^^^^^^^^^^^^^
[rank0]: File "/root/miniconda3/lib/python3.11/site-packages/swift/llm/infer/infer_engine/grpo_vllm_engine.py", line 63, in __init__
[rank0]: super().__init__(
[rank0]: File "/root/miniconda3/lib/python3.11/site-packages/swift/llm/infer/infer_engine/vllm_engine.py", line 144, in __init__
[rank0]: self._prepare_engine()
[rank0]: File "/root/miniconda3/lib/python3.11/site-packages/swift/llm/infer/infer_engine/vllm_engine.py", line 154, in _prepare_engine
[rank0]: engine = llm_engine_cls.from_engine_args(self.engine_args)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/root/miniconda3/lib/python3.11/site-packages/vllm/v1/engine/llm_engine.py", line 177, in from_engine_args
[rank0]: return cls(vllm_config=vllm_config,
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/root/miniconda3/lib/python3.11/site-packages/vllm/v1/engine/llm_engine.py", line 114, in __init__
[rank0]: self.engine_core = EngineCoreClient.make_client(
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/root/miniconda3/lib/python3.11/site-packages/vllm/v1/engine/core_client.py", line 82, in make_client
[rank0]: return InprocClient(vllm_config, executor_class, log_stats)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/root/miniconda3/lib/python3.11/site-packages/vllm/v1/engine/core_client.py", line 245, in __init__
[rank0]: self.engine_core = EngineCore(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/root/miniconda3/lib/python3.11/site-packages/vllm/v1/engine/core.py", line 83, in __init__
[rank0]: self.model_executor = executor_class(vllm_config)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/root/miniconda3/lib/python3.11/site-packages/vllm/executor/executor_base.py", line 54, in __init__
[rank0]: self._init_executor()
[rank0]: File "/root/miniconda3/lib/python3.11/site-packages/vllm/executor/uniproc_executor.py", line 144, in _init_executor
[rank0]: super()._init_executor()
[rank0]: File "/root/miniconda3/lib/python3.11/site-packages/vllm/executor/uniproc_executor.py", line 54, in _init_executor
[rank0]: self.collective_rpc("init_device")
[rank0]: File "/root/miniconda3/lib/python3.11/site-packages/vllm/executor/uniproc_executor.py", line 83, in collective_rpc
[rank0]: return [run_method(self.driver_worker, method, args, kwargs)]
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/root/miniconda3/lib/python3.11/site-packages/vllm/utils/__init__.py", line 3122, in run_method
[rank0]: return func(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/root/miniconda3/lib/python3.11/site-packages/vllm/worker/worker_base.py", line 259, in init_device
[rank0]: self.worker.init_device() # type: ignore
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/root/miniconda3/lib/python3.11/site-packages/vllm/v1/worker/gpu_worker.py", line 201, in init_device
[rank0]: self.model_runner: GPUModelRunner = GPUModelRunner(
[rank0]: ^^^^^^^^^^^^^^^
[rank0]: File "/root/miniconda3/lib/python3.11/site-packages/vllm/v1/worker/gpu_model_runner.py", line 421, in __init__
[rank0]: self.mm_budget = MultiModalBudget(
[rank0]: ^^^^^^^^^^^^^^^^^
[rank0]: File "/root/miniconda3/lib/python3.11/site-packages/vllm/v1/worker/utils.py", line 48, in __init__
[rank0]: .get_max_tokens_per_item_by_nonzero_modality(model_config,
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/root/miniconda3/lib/python3.11/site-packages/vllm/multimodal/registry.py", line 167, in get_max_tokens_per_item_by_nonzero_modality
[rank0]: max_tokens_per_item = self.get_max_tokens_per_item_by_modality(
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/root/miniconda3/lib/python3.11/site-packages/vllm/multimodal/registry.py", line 143, in get_max_tokens_per_item_by_modality
[rank0]: return profiler.get_mm_max_contiguous_tokens(
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/root/miniconda3/lib/python3.11/site-packages/vllm/multimodal/profiling.py", line 282, in get_mm_max_contiguous_tokens
[rank0]: return self._get_mm_max_tokens(seq_len,
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/root/miniconda3/lib/python3.11/site-packages/vllm/multimodal/profiling.py", line 255, in _get_mm_max_tokens
[rank0]: max_tokens_per_item = self.processing_info.get_mm_max_tokens_per_item(
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/root/miniconda3/lib/python3.11/site-packages/vllm/model_executor/models/transformers.py", line 226, in get_mm_max_tokens_per_item
[rank0]: return {"image": self.get_max_image_tokens()}
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/root/miniconda3/lib/python3.11/site-packages/vllm/model_executor/models/transformers.py", line 233, in get_max_image_tokens
[rank0]: mm_tokens = processor._get_num_multimodal_tokens(
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/root/miniconda3/lib/python3.11/site-packages/swift/utils/utils.py", line 383, in __new_getattr__
[rank0]: return getattr(item, key)
[rank0]: ^^^^^^^^^^^^^^^^^^
[rank0]: File "/root/miniconda3/lib/python3.11/site-packages/transformers/tokenization_utils_base.py", line 1128, in __getattr__
[rank0]: raise AttributeError(f"{self.__class__.__name__} has no attribute {key}")
[rank0]: AttributeError: CachedQwen2TokenizerFast has no attribute _get_num_multimodal_tokens
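Reading the traceback bottom-up: vLLM loads the model through its generic Transformers backend (vllm/model_executor/models/transformers.py), whose multimodal profiling calls processor._get_num_multimodal_tokens(...). swift's attribute-forwarding patch (__new_getattr__ in swift/utils/utils.py) delegates the missing attribute lookup, and it ultimately lands on a CachedQwen2TokenizerFast, a plain tokenizer that does not implement the method, hence the AttributeError. A minimal diagnostic sketch (my assumption about where to look, not part of the original report) to see which processor class transformers resolves for this checkpoint and whether it exposes the hook:

# Hypothetical diagnostic (not from the report): inspect the processor that
# transformers resolves for this checkpoint and check whether it implements
# the _get_num_multimodal_tokens hook that vLLM's Transformers backend calls.
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("Qwen/Qwen3-Omni-30B-A3B-Instruct")
print(type(processor).__name__)                          # expected: a multimodal processor, not a bare tokenizer
print(hasattr(processor, "_get_num_multimodal_tokens"))  # False means the profiling path above must crash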
How to Reproduce
Version information:
transformers 4.57.6
vllm 0.11.0
ms_swift 3.11.0
flash_attn 2.8.3
python 3.11.13
torch 2.8.0
transformer_engine_torch 2.5.0
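For completeness, a version list like the one above can be regenerated with a small importlib.metadata loop (a sketch; the PyPI distribution names are my assumption, not taken from the report):

# Print installed versions of the relevant packages (distribution names assumed).
from importlib.metadata import PackageNotFoundError, version

for dist in ("transformers", "vllm", "ms-swift", "flash-attn",
             "torch", "transformer-engine-torch"):
    try:
        print(dist, version(dist))
    except PackageNotFoundError:
        print(dist, "not installed")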
Run script:
#!/bin/bash
export MEGATRON_LM_PATH='./Megatron-LM'
export HF_ENDPOINT=https://hf-mirror.com
export NPROC_PER_NODE=8
export IMAGE_MAX_TOKEN_NUM=49
export IMAGE_MIN_TOKEN_NUM=49
export LOG_LEVEL="ERROR"
# export NCCL_P2P_DISABLE=1
# export NCCL_SHM_DISABLE=1
# export VLLM_ATTENTION_BACKEND=TORCH_SDPA
# ===== List of JSON dataset paths =====
DATASETS=(
)
# SAMPLING_RATE: defaults to 16000.
LOGDIR
LOGFILE
# USE_FLASH_ATTENTION=0 \
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
NPROC_PER_NODE=8 \
PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
megatron rlhf \
--rlhf_type grpo \
--model Qwen/Qwen3-Omni-30B-A3B-Instruct \
--model_type qwen3_omni \
--save ./grpo \
--add_version false \
--load_safetensors true \
--save_safetensors true \
--context_parallel_size 1 \
--tensor_model_parallel_size 1 \
--expert_model_parallel_size 1 \
--pipeline_model_parallel_size 1 \
--dataset xxx.json \
--max_epochs 1 \
--global_batch_size 64 \
--micro_batch_size 1 \
--steps_per_generation 1 \
--num_generations 8 \
--num_iterations 1 \
--beta 0.03 \
--importance_sampling_level sequence \
--epsilon 3e-4 \
--epsilon_high 4e-4 \
--dynamic_sample false \
--overlong_filter true \
--loss_type grpo \
--external_plugins examples/train/grpo/plugin/plugin.py \
--reward_funcs diffcult_audio \
--use_vllm true \
--vllm_mode colocate \
--vllm_gpu_memory_utilization 0.3 \
--vllm_tensor_parallel_size 8 \
--vllm_max_model_len 16384 \
--max_length 16384 \
--max_completion_length 2048 \
--train_type full \
--freeze_vit true \
--freeze_aligner true \
--freeze_parameters talker code2wav \
--lr 1e-6 \
--lr_warmup_fraction 0.05 \
--min_lr 1e-7 \
--bf16 true \
--use_precision_aware_optimizer \
--moe_permute_fusion true \
--moe_grouped_gemm true \
--moe_shared_expert_overlap true \
--moe_aux_loss_coeff 1e-3 \
--sleep_level 2 \
--offload_model true \
--optimizer_cpu_offload true \
--recompute_granularity selective \
--vit_gradient_checkpointing true \
--padding_free true \
--sequence_parallel true \
--save_interval 50 \
--no_save_optim true \
--no_save_rng true \
--log_interval 1 \
--num_workers 8 \
--dataset_num_proc 8 \
--attention_backend flash \
--temperature 1.0 \
--torch_dtype bfloat16 \
--no_gradient_accumulation_fusion true \
2>&1 | tee "$LOGFILE"
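Note that the crashing frames sit in vllm/model_executor/models/transformers.py, i.e. vLLM's generic Transformers fallback rather than a native Qwen3-Omni implementation. A quick way to check whether the installed vLLM build registers the architecture natively (a sketch; ModelRegistry is vLLM's public model registry, but the "Qwen3Omni" architecture substring is my assumption):

# Check for a native vLLM implementation of the architecture (diagnostic
# sketch, not part of the reproduction script).
from vllm import ModelRegistry

archs = ModelRegistry.get_supported_archs()
print([a for a in archs if "Qwen3Omni" in a])  # empty -> the generic Transformers fallback is used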
Additional Information
No response