diff --git a/examples/vllm_serve/fakequant_worker.py b/examples/vllm_serve/fakequant_worker.py
index 4f84df0581..55c4e3398a 100644
--- a/examples/vllm_serve/fakequant_worker.py
+++ b/examples/vllm_serve/fakequant_worker.py
@@ -20,7 +20,6 @@
 
 import torch
 from transformers import AutoTokenizer
-from vllm.v1.worker.gpu_worker import Worker as BaseWorker
 from vllm_ptq_utils import calibrate_fun, get_quant_config
 from vllm_reload_utils import (
     convert_dict_to_vllm,
@@ -38,6 +37,7 @@
 )
 from modelopt.torch.utils import safe_load
 from modelopt.torch.utils.dataset_utils import get_dataset_dataloader
+from vllm.v1.worker.gpu_worker import Worker as BaseWorker
 
 quant_config: dict[str, Any] = {
     "dataset": os.environ.get("QUANT_DATASET", "cnn_dailymail"),
diff --git a/examples/vllm_serve/vllm_ptq_utils.py b/examples/vllm_serve/vllm_ptq_utils.py
index 88b31d54a7..55616df314 100644
--- a/examples/vllm_serve/vllm_ptq_utils.py
+++ b/examples/vllm_serve/vllm_ptq_utils.py
@@ -20,11 +20,11 @@
 import torch
 from torch.utils.data import DataLoader
 from tqdm import tqdm
-from vllm.sampling_params import SamplingParams
-from vllm.v1.core.sched.output import CachedRequestData, NewRequestData, SchedulerOutput
 
 import modelopt.torch.quantization as mtq
 from modelopt.recipe import ModelOptPTQRecipe, load_recipe
+from vllm.sampling_params import SamplingParams
+from vllm.v1.core.sched.output import CachedRequestData, NewRequestData, SchedulerOutput
 
 
 def _create_new_data_cls(data_cls, **kwargs):
diff --git a/examples/vllm_serve/vllm_reload_utils.py b/examples/vllm_serve/vllm_reload_utils.py
index 6b658551f1..0a2e95cdf0 100644
--- a/examples/vllm_serve/vllm_reload_utils.py
+++ b/examples/vllm_serve/vllm_reload_utils.py
@@ -20,7 +20,6 @@
 from typing import Any
 
 import torch
-from vllm.distributed.parallel_state import get_tp_group
 
 from modelopt.torch.export.plugins.vllm_fakequant_hf import (
     infer_quantizer_prefix_remap,
@@ -38,6 +37,7 @@
 )
 from modelopt.torch.quantization.nn import SequentialQuantizer, TensorQuantizer
 from modelopt.torch.quantization.utils import is_quantized
+from vllm.distributed.parallel_state import get_tp_group
 
 
 def _union_quantizer_keys_across_ranks(local_quantizer_keys: list[str]) -> set[str]:
diff --git a/examples/vllm_serve/vllm_serve_fakequant.py b/examples/vllm_serve/vllm_serve_fakequant.py
index 458aaa908a..aa4ef3e4da 100644
--- a/examples/vllm_serve/vllm_serve_fakequant.py
+++ b/examples/vllm_serve/vllm_serve_fakequant.py
@@ -55,8 +55,9 @@
 from pathlib import Path
 
 import uvloop
-import vllm
 from packaging import version
+
+import vllm
 from vllm.entrypoints.openai.api_server import run_server
 from vllm.entrypoints.openai.cli_args import make_arg_parser
 
diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py
index a76783ac17..ed6ed2fcf2 100644
--- a/modelopt/torch/export/unified_export_hf.py
+++ b/modelopt/torch/export/unified_export_hf.py
@@ -1189,12 +1189,25 @@ def export_hf_checkpoint(
     try:
         post_state_dict, hf_quant_config = _export_transformers_checkpoint(model, dtype)
 
-        if hf_quant_config is not None:
+        # Only treat the export as quantized when at least one quant_algo field is set.
+        # get_quant_config always returns a dict (even for sparsity-only or unmodified models),
+        # so emitting hf_quant_config.json unconditionally produces a file with
+        # "quant_algo": null that downstream loaders (e.g. TensorRT-LLM) reject as a
+        # malformed pre-quantized checkpoint.
+        quantization_details = (hf_quant_config or {}).get("quantization", {})
+        is_quantized_export = (
+            quantization_details.get("quant_algo") is not None
+            or quantization_details.get("kv_cache_quant_algo") is not None
+        )
+
+        if is_quantized_export:
             # Save hf_quant_config.json for backward compatibility
             with open(f"{export_dir}/hf_quant_config.json", "w") as file:
                 json.dump(hf_quant_config, file, indent=4)
 
             hf_quant_config = convert_hf_quant_config_format(hf_quant_config)
+        else:
+            hf_quant_config = None
 
         # Remove hf_quantizer from model so post_state_dict can be exported.
         if getattr(model, "hf_quantizer", None) is not None: