45 commits
1d8ac33
Add the Skip softmax diffusion
jingyu-ml Apr 2, 2026
1f8f0d3
Add test case
jingyu-ml Apr 2, 2026
5873652
Fixed error
jingyu-ml Apr 2, 2026
4c179a3
Fixed the test case
jingyu-ml Apr 2, 2026
2c323df
Merge branch 'main' into jingyux/diffusion-skip-softmax
jingyu-ml Apr 2, 2026
8702b7b
Removed the token import
jingyu-ml Apr 6, 2026
bbe2123
Merge branch 'main' into jingyux/diffusion-skip-softmax
jingyu-ml Apr 6, 2026
70099a5
removed the unused code
jingyu-ml Apr 6, 2026
6cc96a4
Update the README
jingyu-ml Apr 6, 2026
4de0d3b
Updated the example script
jingyu-ml Apr 7, 2026
b3d3d4d
Update the readme
jingyu-ml Apr 7, 2026
8dc6162
Update the calibration kernel
jingyu-ml Apr 7, 2026
8aa32cc
Add the readme
jingyu-ml Apr 7, 2026
fbeabcf
Update the example script
jingyu-ml Apr 7, 2026
6a4ab8b
Update the code
jingyu-ml Apr 7, 2026
d7dd15c
Update the calibration loop
jingyu-ml Apr 8, 2026
b86d311
Remove the eager attention
jingyu-ml Apr 8, 2026
f5a9af9
Merge branch 'main' into jingyux/diffusion-skip-softmax
jingyu-ml Apr 8, 2026
45bcad6
Update the calibration, fixed some bugs
jingyu-ml Apr 9, 2026
22c5b85
Add the test case
jingyu-ml Apr 9, 2026
aa44a9d
Fixed the lint error
jingyu-ml Apr 9, 2026
e5293de
Updated the README
jingyu-ml Apr 9, 2026
40fdd44
Merge branch 'main' into jingyux/diffusion-skip-softmax
jingyu-ml Apr 9, 2026
40d61dd
Update the test case
jingyu-ml Apr 9, 2026
3845b47
Fixed the CICD
jingyu-ml Apr 9, 2026
560015c
Merge branch 'main' into jingyux/diffusion-skip-softmax
jingyu-ml Apr 13, 2026
f86580c
Added the ltx2 warning
jingyu-ml Apr 13, 2026
ee162b3
addressed the ltx2 issue and the import issue
jingyu-ml Apr 13, 2026
eef0577
Merge branch 'main' into jingyux/diffusion-skip-softmax
jingyu-ml Apr 13, 2026
5219ab0
Merge branch 'main' into jingyux/diffusion-skip-softmax
jingyu-ml Apr 14, 2026
5ba76ac
Address comments
jingyu-ml Apr 14, 2026
d2d6d83
Update the readme
jingyu-ml Apr 15, 2026
5ab4ebb
Merge branch 'main' into jingyux/diffusion-skip-softmax
jingyu-ml Apr 15, 2026
8d71522
Update
jingyu-ml Apr 15, 2026
79b5f2a
Merge branch 'main' into jingyux/diffusion-skip-softmax
jingyu-ml Apr 15, 2026
8787d47
Added the test case
jingyu-ml Apr 15, 2026
7bae5d7
Merge branch 'main' into jingyux/diffusion-skip-softmax
jingyu-ml Apr 16, 2026
a21cac2
Remove the eager import
jingyu-ml Apr 16, 2026
849cc1a
Merge branch 'jingyux/diffusion-skip-softmax' into jingyux/diffusion-…
jingyu-ml Apr 16, 2026
a1b002e
Update the format
jingyu-ml Apr 16, 2026
b3f4cab
Add the change to the changelog
jingyu-ml Apr 16, 2026
4dbab66
Merge branch 'main' into jingyux/diffusion-skip-softmax
jingyu-ml Apr 16, 2026
b77d098
Merge branch 'jingyux/diffusion-skip-softmax' into jingyux/diffusion-…
jingyu-ml Apr 16, 2026
65f380d
Merge branch 'main' into jingyux/diffusion-skip-softmax-2
jingyu-ml Apr 27, 2026
5eb9352
Merge branch 'main' into jingyux/diffusion-skip-softmax-2
jingyu-ml May 4, 2026
2 changes: 1 addition & 1 deletion CHANGELOG.rst
@@ -31,7 +31,7 @@ Changelog
- Add Puzzletron - a new algorithm for heterogeneous pruning of LLM and VLM models. See `examples/puzzletron/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/puzzletron>`_ for more details.
- Added iterator interface using CalibrationDataReader in ONNX quantization workflow.
- Add N:M sparse softmax support to the Triton flash attention kernel (``modelopt.torch.kernels.common.attention.triton_fa``). See `examples/llm_sparsity/attention_sparsity/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/llm_sparsity/attention_sparsity>`_ for usage.
- Add skip-softmax skipping to the Triton flash attention kernel (``modelopt.torch.kernels.common.attention.triton_fa``). See `examples/llm_sparsity/attention_sparsity/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/llm_sparsity/attention_sparsity>`_ for usage.
- Add skip-softmax skipping to the Triton flash attention kernel for both language models and video diffusion models (``modelopt.torch.kernels.common.attention.triton_fa``). See `examples/llm_sparsity/attention_sparsity/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/llm_sparsity/attention_sparsity>`_ and `examples/diffusers/sparsity/ <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/diffusers/sparsity>`_ for usage.
- Add Video Sparse Attention (VSA) method for video diffusion models (``modelopt.torch.sparsity.attention_sparsity``). VSA uses 3D block tiling with a two-branch architecture for attention speedup.
- Enable PTQ workflow for the Step3.5-Flash MoE model with NVFP4 W4A4 + FP8 KV cache quantization. See `modelopt_recipes/models/Step3.5-Flash/nvfp4-mlp-only.yaml <https://github.com/NVIDIA/Model-Optimizer/blob/main/modelopt_recipes/models/Step3.5-Flash/nvfp4-mlp-only.yaml>`_ for more details.
- Add support for vLLM fakequant reload using ModelOpt state for HF models. See `examples/vllm_serve/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/vllm_serve#load-qatptq-model-and-serve-in-vllm-wip>`_ for more details.
16 changes: 16 additions & 0 deletions examples/diffusers/sparsity/wan22_skip_softmax.py
@@ -59,6 +59,7 @@
from diffusers.utils import export_to_video

import modelopt.torch.sparsity.attention_sparsity as mtsa
from modelopt.torch.export import export_hf_checkpoint
from modelopt.torch.sparsity.attention_sparsity.sparse_attention import SparseAttentionModule

DEFAULT_MODEL_PATH = os.environ.get("WAN22_MODEL_PATH", "Wan-AI/Wan2.2-TI2V-5B-Diffusers")
@@ -199,6 +200,16 @@ def parse_args() -> argparse.Namespace:
default=4,
help="Number of calibration prompts from OpenVid-1M dataset",
)

# Export options
parser.add_argument(
"--export-dir",
type=str,
default=None,
help="Export sparsified model as a HuggingFace checkpoint to this directory. "
"The sparse_attention_config (calibration params, disabled layers, etc.) "
"is written into each component's config.json.",
)
return parser.parse_args()


@@ -442,6 +453,11 @@ def main() -> None:
torch.cuda.empty_cache()
print("Cleared CUDA cache after calibration")

# ---- Export (optional) ----
if args.export_dir and not args.baseline:
print(f"Exporting sparsified checkpoint to {args.export_dir}...")
export_hf_checkpoint(pipe, export_dir=args.export_dir)

# ---- Generate (optional) ----
if args.prompt:
# Enable runtime sparsity measurement before generation
36 changes: 36 additions & 0 deletions modelopt/torch/export/unified_export_hf.py
@@ -28,6 +28,7 @@

import torch
import torch.nn as nn
import yaml
from safetensors.torch import save_file

try:
@@ -949,6 +950,9 @@ def _export_diffusers_checkpoint(
is_diffusers_pipe = False

# Step 3: Export each nn.Module component with quantization handling
# Collect sparse attention configs across all components for a unified sparse.yaml
pipeline_sparse_configs: dict[str, Any] = {}

for component_name, component in module_components.items():
is_quantized = has_quantized_modules(component)
status = "quantized" if is_quantized else "non-quantized"
@@ -1015,8 +1019,33 @@
model_type=model_type,
)

# Step 8: Update config.json with sparse attention info (both quantized and non-quantized)
if export_sparse_attention_config is not None:
sparse_attn_config = export_sparse_attention_config(component)
if sparse_attn_config is not None:
config_path = component_export_dir / "config.json"
if config_path.exists():
with open(config_path) as file:
config_data = json.load(file)
config_data["sparse_attention_config"] = sparse_attn_config
with open(config_path, "w") as file:
json.dump(config_data, file, indent=4)
print(f" Added sparse_attention_config to {config_path.name}")

# Collect for the unified sparse.yaml (keyed by component name)
pipeline_sparse_configs[component_name] = sparse_attn_config

print(f" Saved to: {component_export_dir}")

# Step 8.5: Write unified sparse.yaml at the top-level export directory.
# Combines sparse configs from all components (keyed by pipeline component name)
# so downstream consumers get the full pipeline's sparse config in one file.
if pipeline_sparse_configs:
yaml_path = export_dir / "sparse.yaml"
with open(yaml_path, "w") as file:
yaml.dump(pipeline_sparse_configs, file, default_flow_style=False, sort_keys=False)
print(f"Saved unified sparse config to {yaml_path}")

# Step 4: Export non-nn.Module components (tokenizers, schedulers, feature extractors, etc.)
if is_diffusers_pipe:
for component_name, component in all_components.items():
@@ -1249,6 +1278,13 @@ def export_hf_checkpoint(
if sparse_attn_config is not None:
config_data["sparse_attention_config"] = sparse_attn_config

# Also save as standalone YAML for easy inspection and reuse
import yaml

yaml_path = Path(export_dir) / "sparse.yaml"
with open(yaml_path, "w") as file:
yaml.dump(sparse_attn_config, file, default_flow_style=False, sort_keys=False)

with open(original_config, "w") as file:
json.dump(config_data, file, indent=4)

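For reference, a minimal sketch of how a downstream consumer might read the artifacts written above: the unified sparse.yaml at the top-level export directory and the sparse_attention_config embedded in each component's config.json. The export path and the "transformer" component name are illustrative assumptions, not part of this diff.

import json
from pathlib import Path

import yaml

export_dir = Path("./wan22-sparse-export")  # hypothetical export directory

# Pipeline-level view: sparse configs for every component, keyed by component name
with open(export_dir / "sparse.yaml") as f:
    pipeline_sparse_configs = yaml.safe_load(f)
print(list(pipeline_sparse_configs))  # e.g. ["transformer"]

# Per-component view: the same config is embedded in that component's config.json
with open(export_dir / "transformer" / "config.json") as f:
    config_data = json.load(f)
print(config_data.get("sparse_attention_config", {}).get("config_groups"))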
modelopt/torch/sparsity/attention_sparsity/calibration/calibrate.py
@@ -346,6 +346,9 @@ def calibrate_sparse_attention(
"a": result["a"],
"b": result["b"],
}
if result.get("fit_logspace"):
params["log_a"] = result["log_a"]
params["fit_logspace"] = True
Comment on lines +349 to +351
⚠️ Potential issue | 🟡 Minor

Guard log_a access when fit_logspace is enabled.

Line [350] directly indexes result["log_a"]; if calibration output is incomplete, this throws and drops the whole calibration flow.

🔧 Suggested fix
             if result.get("fit_logspace"):
-                params["log_a"] = result["log_a"]
-                params["fit_logspace"] = True
+                if "log_a" not in result:
+                    warnings.warn(f"{phase} calibration marked fit_logspace=True but missing log_a")
+                else:
+                    params["log_a"] = result["log_a"]
+                    params["fit_logspace"] = True
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@modelopt/torch/sparsity/attention_sparsity/calibration/calibrate.py` around
lines 349 - 351, The code currently assumes result contains "log_a" when
result.get("fit_logspace") is true, which can raise a KeyError; update the block
that sets params["log_a"] and params["fit_logspace"] to first confirm "log_a"
exists (e.g., if "fit_logspace" in result and "log_a" in result) or use
result.get("log_a") and only assign params["log_a"] when that value is not None,
and still set params["fit_logspace"]=True when fitting was attempted; locate
this logic around the params/result handling in calibrate.py (the result dict,
params dict, and the "fit_logspace"/"log_a" keys) and add the guard so
incomplete calibration output cannot break the flow.

if "min_observed_sparsity" in result:
params["min_observed_sparsity"] = result["min_observed_sparsity"]
if "max_observed_sparsity" in result:
@@ -275,17 +275,21 @@ def exponential(sparsity, a, b):
avg_s = np.mean([p["sparsity"] for p in points])
print(f" {threshold:<12.4f} {avg_sf:<12.2f} {avg_s:<12.2%} {len(points):<8}")

return {
result = {
"phase": phase,
"a": float(a),
"b": float(b),
"r_squared": float(r_squared),
"num_data_points": int(np.sum(valid_mask)),
"total_samples": len(all_data_points),
"calibration_type": "exponential",
"fit_logspace": self.fit_logspace,
"min_observed_sparsity": min_observed_sparsity,
"max_observed_sparsity": max_observed_sparsity,
}
if self.fit_logspace:
result["log_a"] = float(log_a)
return result

def _enable_calibration_mode(self, modules: list[nn.Module]):
"""Enable calibration mode on sparse attention modules."""
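For context, a minimal sketch of the two fitting modes this result distinguishes, assuming the calibrator has collected per-sample (sparsity, scale_factor) pairs. Only the exponential model and the a / b / log_a / fit_logspace keys come from the diff; the helper name and the use of SciPy here are illustrative.

import numpy as np
from scipy.optimize import curve_fit

def exponential(sparsity, a, b):
    # Model used above: scale_factor = a * exp(b * sparsity)
    return a * np.exp(b * sparsity)

def fit_scale_factor_curve(sparsities, scale_factors, fit_logspace=False):
    s = np.asarray(sparsities, dtype=float)
    y = np.asarray(scale_factors, dtype=float)
    if fit_logspace:
        # Log-space fit: log(scale_factor) = log_a + b * sparsity (linear least squares)
        b, log_a = np.polyfit(s, np.log(y), 1)
        return {"a": float(np.exp(log_a)), "b": float(b),
                "log_a": float(log_a), "fit_logspace": True}
    # Linear-space fit: nonlinear least squares directly on the exponential model
    (a, b), _ = curve_fit(exponential, s, y, maxfev=10000)
    return {"a": float(a), "b": float(b), "fit_logspace": False}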
137 changes: 106 additions & 31 deletions modelopt/torch/sparsity/attention_sparsity/conversion.py
@@ -349,73 +349,148 @@ def update_sparse_attention_metadata(
def export_sparse_attention_config(model: nn.Module) -> dict[str, Any] | None:
"""Extract sparse attention config for export to config.json.

Extracts the calibration parameters (a, b) for the exponential threshold model
from the first sparse attention module that has calibrated thresholds.
Extracts calibration parameters, method metadata, and per-layer enable/disable
state from sparse attention modules. Supports both LLM and diffusion models.

The exported config allows computing threshold at runtime:
scale_factor = a * exp(b * target_sparsity)
threshold = scale_factor / seqlen
Algorithm-specific parameters (``threshold_scale_factor``, ``raw_threshold``,
``disabled_layers``) are nested inside the config group that owns them.
This allows future sparse attention methods to define their own parameter
schemas in separate groups without collision.

The formula in the export reflects the actual fitting mode used during
calibration:

- **Linear-space fit** (default, LLMs): ``scale_factor = a * exp(b * S)``;
exports ``a`` and ``b``.
- **Log-space fit** (diffusion): ``log(scale_factor) = log_a + b * S``;
exports ``log_a`` and ``b``.

At runtime: ``threshold = scale_factor / seqlen``.

Args:
model: Model with sparse attention applied

Returns:
Dictionary with sparse attention config for HuggingFace config.json export.
Returns None if no calibrated sparse attention modules found.
Returns None if no sparse attention modules are found, or if no calibration
parameters and no raw threshold are available.

Example output::
Example output (LLM, linear-space fit)::

{
"config_groups": {
"group_0": {"sparse_algo": "softmax_skip", "targets": ["LlamaAttention"]}
"group_0": {
"sparse_algo": "softmax_skip",
"targets": ["LlamaAttention"],
"threshold_scale_factor": {
"formula": "a * exp(b * target_sparsity)",
"prefill": {"a": 7.93, "b": 8.61},
"decode": {"a": 0.12, "b": 9.85},
},
}
},
"threshold_scale_factor": {
"formula": "a * exp(b * target_sparsity)",
"prefill": {"a": 7.93, "b": 8.61},
"decode": {"a": 0.12, "b": 9.85},
"producer": {"name": "modelopt", "version": "0.37.0"},
}

Example output (diffusion, log-space fit)::

{
"config_groups": {
"group_0": {
"sparse_algo": "softmax_skip",
"targets": ["Attention"],
"threshold_scale_factor": {
"formula": "log_a + b * target_sparsity",
"prefill": {"log_a": 0.21, "b": 3.45},
},
"disabled_layers": ["blocks.0.attn1", "blocks.39.attn1"],
}
},
"producer": {"name": "modelopt", "version": "0.37.0"},
}
"""
# Collect sparse attention module info
calibration_params = None
raw_threshold = None
target_classes: set[str] = set()
disabled_layer_names: list[str] = []

for module in get_sparse_attention_modules(model):
for name, module in get_named_sparse_attention_modules(model):
# Get the original wrapped module's class name
if hasattr(module, "get_original_cls_by_level"):
original_cls = module.get_original_cls_by_level(level=0)
if original_cls is not None:
target_classes.add(original_cls.__name__)

# Get calibration params from first module that has them
if not module.is_enabled:
disabled_layer_names.append(get_unwrapped_name(name, model))
continue

# Get calibration params from first enabled module that has them
if calibration_params is None:
calibration_params = getattr(module._sparse_method_instance, "calibration_params", None)

# Return None if no calibration params found
if calibration_params is None:
# Get raw threshold from first enabled module that has one
if raw_threshold is None:
raw_threshold = getattr(
module._sparse_method_instance, "skip_softmax_raw_threshold", None
)

# Nothing exportable if no calibration params and no raw threshold
if calibration_params is None and raw_threshold is None:
return None
Comment on lines +439 to 441
⚠️ Potential issue | 🟡 Minor

Don’t drop disabled_layers-only exports.

Lines [439]-[441] return None even when disabled_layer_names was populated, so per-layer disable metadata is lost for non-calibrated/static-threshold runs.

🔧 Suggested fix
-    if calibration_params is None and raw_threshold is None:
+    if calibration_params is None and raw_threshold is None and not disabled_layer_names:
         return None
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@modelopt/torch/sparsity/attention_sparsity/conversion.py` around lines 439 -
441, The early-return drops per-layer disable metadata: instead of returning
None when calibration_params and raw_threshold are both None, check if
disabled_layer_names (or the variable storing disabled layers) is non-empty and,
if so, return an export payload that contains the disabled_layer_names metadata
(even if calibration_params and raw_threshold are absent); otherwise keep
returning None. Update the conditional around calibration_params and
raw_threshold to preserve/emit disabled_layer_names in the exported structure so
disabled-layer-only exports are retained.


# Build threshold_scale_factor with model parameters
threshold_scale_factor: dict[str, Any] = {
"formula": "a * exp(b * target_sparsity)",
# Build the config group for softmax_skip.
# All algorithm-specific parameters live inside the group so that future
# sparse attention methods can define their own parameter schemas in
# separate groups without collision.
group_0: dict[str, Any] = {
"sparse_algo": "softmax_skip",
"targets": sorted(target_classes) if target_classes else ["Attention"],
}
for phase in ["prefill", "decode"]:
if phase in calibration_params:
threshold_scale_factor[phase] = {
"a": calibration_params[phase]["a"],
"b": calibration_params[phase]["b"],

# Build threshold_scale_factor from calibration params.
# The formula depends on the fitting mode used during calibration:
# - Linear-space fit: scale_factor = a * exp(b * target_sparsity)
# - Log-space fit: log(scale_factor) = log_a + b * target_sparsity
if calibration_params is not None:
first_phase = next((p for p in ["prefill", "decode"] if p in calibration_params), None)
fit_logspace = first_phase is not None and calibration_params[first_phase].get(
"fit_logspace", False
)

if fit_logspace:
threshold_scale_factor: dict[str, Any] = {
"formula": "log_a + b * target_sparsity",
}
for phase in ["prefill", "decode"]:
if phase in calibration_params and "log_a" in calibration_params[phase]:
threshold_scale_factor[phase] = {
"log_a": calibration_params[phase]["log_a"],
"b": calibration_params[phase]["b"],
}
else:
threshold_scale_factor = {
"formula": "a * exp(b * target_sparsity)",
}
for phase in ["prefill", "decode"]:
if phase in calibration_params:
threshold_scale_factor[phase] = {
"a": calibration_params[phase]["a"],
"b": calibration_params[phase]["b"],
}

group_0["threshold_scale_factor"] = threshold_scale_factor

if raw_threshold is not None:
group_0["raw_threshold"] = raw_threshold

if disabled_layer_names:
group_0["disabled_layers"] = disabled_layer_names

# Build the export config
export_config: dict[str, Any] = {
"config_groups": {
"group_0": {
"sparse_algo": "softmax_skip",
"targets": sorted(target_classes) if target_classes else ["Attention"],
}
},
"threshold_scale_factor": threshold_scale_factor,
"config_groups": {"group_0": group_0},
"producer": {
"name": "modelopt",
"version": mo_version,
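Finally, a hedged sketch of how a consumer of the exported sparse_attention_config might recover the runtime threshold for either fitting mode, following the formulas and example outputs in the conversion.py docstring above; the helper name and the group selection are illustrative.

import math

def threshold_from_export(sparse_attention_config: dict, phase: str,
                          target_sparsity: float, seqlen: int) -> float:
    group = sparse_attention_config["config_groups"]["group_0"]
    tsf = group["threshold_scale_factor"]
    params = tsf[phase]
    if tsf["formula"] == "log_a + b * target_sparsity":
        # Log-space fit (diffusion): log(scale_factor) = log_a + b * S
        scale_factor = math.exp(params["log_a"] + params["b"] * target_sparsity)
    else:
        # Linear-space fit (LLMs): scale_factor = a * exp(b * S)
        scale_factor = params["a"] * math.exp(params["b"] * target_sparsity)
    # Runtime threshold as described in the docstring: threshold = scale_factor / seqlen
    return scale_factor / seqlen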