diff --git a/modelopt/torch/nas/subblock_stats/__init__.py b/modelopt/torch/nas/subblock_stats/__init__.py
new file mode 100644
index 00000000000..aeac903f8f4
--- /dev/null
+++ b/modelopt/torch/nas/subblock_stats/__init__.py
@@ -0,0 +1,23 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Subblock runtime statistics API for ModelOpt NAS.
+
+This module provides utilities for measuring and calculating runtime statistics
+of subblocks (e.g., Attention, FFN) within transformer architectures.
+
+Primary API:
+    - calc_runtime_for_subblocks: Empirically measures runtime for candidate subblock configurations
+"""
+from .calc_runtime_stats import calc_runtime_for_subblocks
diff --git a/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py b/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py
new file mode 100644
index 00000000000..d3b997f4525
--- /dev/null
+++ b/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py
@@ -0,0 +1,291 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# mypy: ignore-errors
+
+"""Runtime statistics calculation for NAS subblock benchmarking via vLLM."""
+
+import json
+import os
+import subprocess
+import tempfile
+from dataclasses import dataclass, replace
+from pathlib import Path
+
+import torch
+from omegaconf import DictConfig
+from tqdm import tqdm
+from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaConfig, LlamaForCausalLM
+
+from modelopt.torch.puzzletron.anymodel.converter import Converter
+from modelopt.torch.puzzletron.anymodel.models.llama import LlamaModelDescriptor
+from modelopt.torch.puzzletron.anymodel.puzzformer import deci_x_patcher
+from modelopt.torch.puzzletron.block_config import (
+    AttentionConfig,
+    BlockConfig,
+    FFNConfig,
+    SubblockConfig,
+)
+
+
+def _make_standard_block_config(hidden_size: int, num_attention_heads: int) -> BlockConfig:
+    return BlockConfig(
+        attention=AttentionConfig(no_op=False, num_key_value_heads=num_attention_heads),
+        ffn=FFNConfig(no_op=False, intermediate_size=hidden_size, moe=None),
+        parallel_blocks=None,
+    )
+
+
+def create_benchmark_model(
+    vocab_size: int,
+    hidden_size: int,
+    num_attention_heads: int,
+    prefill_seq_len: int,
+    generation_seq_len: int,
+    block_config: BlockConfig | None,
+    repeat_block_n_times: int = 10,
+) -> LlamaForCausalLM:
+    """Build a small Llama model with repeated subblocks for latency benchmarking."""
+    block_configs = [_make_standard_block_config(hidden_size, num_attention_heads)]
+
+    if block_config:
+        block_configs.extend([block_config] * repeat_block_n_times)
+
+    model_config = LlamaConfig(
+        max_position_embeddings=prefill_seq_len + generation_seq_len,
+        vocab_size=vocab_size,
+        hidden_size=hidden_size,
+        num_attention_heads=num_attention_heads,
+        num_hidden_layers=len(block_configs),
+        head_dim=None,  # Compute from hidden_size // num_attention_heads instead of using default 128
+        # This is required for TRT-LLM conversion to know which model classes to use for the checkpoint.
+        auto_map={
+            "AutoConfig": "transformers.models.llama.configuration_llama.LlamaConfig",
+            "AutoModelForCausalLM": "transformers.models.llama.modeling_llama.LlamaForCausalLM",
+        },
+    )
+
+    for idx, bc in enumerate(block_configs):
+        block_configs[idx] = bc.to_dict()
+    model_config.block_configs = block_configs
+
+    with deci_x_patcher(LlamaModelDescriptor, block_configs):
+        model = AutoModelForCausalLM.from_config(model_config)
+
+    model.config.architectures = ["AnyModel"]
+    model.config.base_architecture = "LlamaForCausalLM"
+
+    return model
+
+
+def save_model_as_anymodel(model, output_dir: Path, descriptor, num_hidden_layers: int):
+    """Save a model checkpoint in AnyModel subblock-safetensors format."""
+    # Save standard model checkpoint (as safetensors, HF format)
+    model.save_pretrained(output_dir, safe_serialization=True)
+
+    # Convert/slice weights into AnyModel subblock_safetensors format
+    Converter.convert_model_weights(
+        input_dir=output_dir,
+        output_dir=output_dir,
+        descriptor=descriptor,
+        num_hidden_layers=num_hidden_layers,
+    )
+    # Load the model config.json, update "architectures" to ["AnyModel"], and write back to disk.
+    config_path = output_dir / "config.json"
+    if config_path.exists():
+        with open(config_path) as f:
+            config_data = json.load(f)
+        config_data["architectures"] = ["AnyModel"]
+        with open(config_path, "w") as f:
+            json.dump(config_data, f, indent=2)
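+
+
+# Illustrative sketch (comments only; values below are hypothetical placeholders):
+# building a benchmark checkpoint for a single FFN candidate with the helpers above.
+#
+#   candidate = FFNConfig(no_op=False, intermediate_size=8192, moe=None)
+#   model = create_benchmark_model(
+#       vocab_size=32000,
+#       hidden_size=2048,
+#       num_attention_heads=16,
+#       prefill_seq_len=1024,
+#       generation_seq_len=128,
+#       block_config=candidate.to_blockconfig(),
+#       repeat_block_n_times=10,  # 1 standard block + 10 copies of the candidate
+#   )
+#   save_model_as_anymodel(model, Path("/tmp/bench"), LlamaModelDescriptor, num_hidden_layers=11)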
+def save_model(
+    model: LlamaForCausalLM, tokenizer_path: Path, output_path: Path, num_hidden_layers: int
+) -> None:
+    """Save model weights as AnyModel and copy the tokenizer to ``output_path``."""
+    model.to(dtype=torch.bfloat16).save_pretrained(output_path)
+    save_model_as_anymodel(model, output_path, LlamaModelDescriptor, num_hidden_layers)
+
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
+    tokenizer.save_pretrained(output_path)
+
+
+@dataclass(frozen=True)
+class RuntimeConfig:
+    """Configuration for a vLLM latency benchmark run."""
+
+    vocab_size: int
+    hidden_size: int
+    num_attention_heads: int
+    master_puzzle_dir: str
+    tokenizer_path: str
+    synth_dataset_num_requests: int
+    repeat_block_n_times: int
+    prefill_seq_len: int
+    generation_seq_len: int
+    batch_size: int
+    num_iters: int
+    num_warmup_iters: int
+
+
+def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig):
+    """Run ``vllm bench latency`` and return the average latency in milliseconds."""
+    output_json_path = model_path / "vllm_latency_benchmark.json"
+
+    cmd = [
+        "vllm",
+        "bench",
+        "latency",
+        "--model",
+        str(model_path),
+        "--input-len",
+        str(runtime_config.prefill_seq_len),
+        "--output-len",
+        str(runtime_config.generation_seq_len),
+        "--batch-size",
+        str(runtime_config.batch_size),
+        "--output-json",
+        str(output_json_path),
+        "--max-model-len",
+        str(runtime_config.prefill_seq_len + runtime_config.generation_seq_len),
+        "--num-iters-warmup",
+        str(runtime_config.num_warmup_iters),
+        "--num-iters",
+        str(runtime_config.num_iters),
+        "--max-num-seqs",
+        "1",
+        "--distributed-executor-backend",
+        "external_launcher",
+        "--tensor-parallel-size",
+        "1",
+        "--pipeline-parallel-size",
+        "1",
+    ]
+    os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
+    subprocess.run(cmd, check=True)
+
+    with open(output_json_path) as f:
+        vllm_results = json.load(f)
+    print(vllm_results)
+    return vllm_results["avg_latency"] * 1000  # convert to milliseconds
+
+
+def calc_subblock_runtime(
+    runtime_config: RuntimeConfig,
+    subblock_config: SubblockConfig | None,
+) -> float:
+    """Measure total runtime of a repeated subblock via vLLM latency benchmark."""
+    block_config: BlockConfig | None = None
+
+    if subblock_config is not None:
+        if isinstance(subblock_config, BlockConfig):
+            block_config = subblock_config
+        elif isinstance(subblock_config, (AttentionConfig, FFNConfig)):
+            block_config = subblock_config.to_blockconfig()
+        else:
+            raise TypeError(f"Runtime stats: unsupported subblock type: {subblock_config}")
+
+    model = create_benchmark_model(
+        runtime_config.vocab_size,
+        runtime_config.hidden_size,
+        runtime_config.num_attention_heads,
+        runtime_config.prefill_seq_len,
+        runtime_config.generation_seq_len,
+        block_config=block_config,
+        repeat_block_n_times=runtime_config.repeat_block_n_times,
+    )
+    with tempfile.TemporaryDirectory() as model_tmpdir:
+        save_model(
+            model,
+            Path(runtime_config.tokenizer_path),
+            Path(model_tmpdir),
+            num_hidden_layers=runtime_config.repeat_block_n_times + 1,
+        )
+        subblock_total_runtime_ms = run_vllm_latency_benchmark(Path(model_tmpdir), runtime_config)
+
+    return subblock_total_runtime_ms
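+
+
+# The "no block" overhead below is isolated by linear extrapolation: with L decoder
+# layers, latency(L) ~= overhead + L * per_layer_latency. Measuring L=1 and L=10
+# gives per_layer_latency = (latency(10) - latency(1)) / 9, and therefore
+# overhead = latency(1) - per_layer_latency.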
+def calc_no_block_runtime(runtime_config: RuntimeConfig) -> float:
+    """Estimate the overhead runtime (embedding + LM head) with no decoder blocks."""
+    runtime_config1 = replace(runtime_config, repeat_block_n_times=0)
+    runtime_config10 = replace(runtime_config, repeat_block_n_times=9)
+
+    block_config = _make_standard_block_config(
+        runtime_config.hidden_size, runtime_config.num_attention_heads
+    )
+
+    runtime_ms1 = calc_subblock_runtime(runtime_config1, None)
+    runtime_ms10 = calc_subblock_runtime(runtime_config10, block_config)
+
+    no_block_runtime_ms = runtime_ms1 - (runtime_ms10 - runtime_ms1) / 9
+
+    return no_block_runtime_ms
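+
+
+# Illustrative driver sketch (not executed in this module): how a caller is expected
+# to invoke the public API below. Paths, sizes, and the candidate set are hypothetical.
+#
+#   runtimes, overhead_ms = calc_runtime_for_subblocks(
+#       subblock_config_set={ffn_candidate, attention_candidate},
+#       runtime_stats_config=cfg.calc_subblock_stats.runtime_stats,
+#       vocab_size=32000,
+#       hidden_size=2048,
+#       num_attention_heads=16,
+#       master_puzzle_dir="/path/to/puzzle_dir",
+#       tokenizer_path="/path/to/tokenizer",
+#       synth_dataset_num_requests=200,
+#       prefill_seq_len=1024,
+#       generation_seq_len=128,
+#   )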
- "load_moe_stats", - "estimate_num_active_experts", + "calculate_ffn_memory", "calculate_mamba_memory", "calculate_mamba_state_size", - "calculate_ffn_memory", "calculate_non_block_memory", "calculate_non_block_params", + "calculate_subblock_memory", + "calculate_subblock_params", + "estimate_num_active_experts", + "load_moe_stats", ] @@ -73,9 +72,29 @@ def calculate_subblock_memory( kv_cache_dtype: torch.dtype, allocate_prefill_query: bool, model_config: PretrainedConfig, - descriptor: Type[ModelDescriptor], + descriptor: type[ModelDescriptor], ) -> float | dict[str, float]: - """``model_config`` / ``descriptor`` are required (puzzletron-style); FFN uses them for meta init.""" + """Calculate the memory usage of a single subblock (FFN or Attention). + + Given its configuration and runtime dimensions, returns bytes or a detailed dict. + + Args: + subblock_config (FFNConfig | AttentionConfig): Subblock configuration dataclass. + batch_size (int): Batch size for memory estimate. + prefill_seq_len (int): Sequence length for prefill phase. + generation_seq_len (int): Sequence length for generation phase (token-by-token). + prefill_queue_size (int): Token queue size for prefill attention memory allocation. + n_embd (int): Embedding (hidden) dimension. + n_head (int): Number of attention heads (used for non-FFN). + weights_dtype (torch.dtype): PyTorch dtype for model weights. + kv_cache_dtype (torch.dtype): PyTorch dtype for KV cache. + allocate_prefill_query (bool): Whether to allocate query cache for prefill tokens. + model_config (PretrainedConfig): HuggingFace-style config instance describing the model. + descriptor (type[ModelDescriptor]): Model descriptor type (for puzzletron model types). + + Returns: + float | dict[str, float]: Memory usage in bytes (float), or a dictionary by memory type. + """ if subblock_config.no_op: return 0 if isinstance(subblock_config, FFNConfig): @@ -116,7 +135,7 @@ def calculate_subblock_memory( def calculate_subblock_params( config: PretrainedConfig, layer_config: BlockConfig | FFNConfig | AttentionConfig, - descriptor: Type[ModelDescriptor], + descriptor: type[ModelDescriptor], ) -> int: """Count parameters on one meta decoder layer. @@ -124,9 +143,7 @@ def calculate_subblock_params( ``hybrid_override_pattern``) before passing ``config``; see ``ModelDescriptor.truncate_pattern_for_subblock``. """ - if isinstance(layer_config, FFNConfig): - block_config = layer_config.to_blockconfig() - elif isinstance(layer_config, AttentionConfig): + if isinstance(layer_config, (FFNConfig, AttentionConfig)): block_config = layer_config.to_blockconfig() else: block_config = layer_config @@ -189,12 +206,31 @@ def calculate_subblock_params( def calc_subblock_active_params( sublayer_config: FFNConfig | AttentionConfig, model_config: PretrainedConfig, - descriptor: Type[ModelDescriptor], + descriptor: type[ModelDescriptor], n_embd: int, moe_stats_file: str, batch_size: int, block_idx: int, ) -> int: + """Calculate the number of "active" parameters for a subblock (FFN, Attention, or MoE). + + For non-MoE subblocks, simply calls `calculate_subblock_params` to count all parameters. + For MoE (Mixture-of-Experts) FFN subblocks, estimates the expected number of active parameters + per batch by leveraging expert activation statistics (from a given stats file) and calculating + the expected number of active experts, then multiplies by the number of parameters per expert. + + Args: + sublayer_config: The subblock configuration (either FFNConfig or AttentionConfig). 
+    """
     if subblock_config.no_op:
         return 0
     if isinstance(subblock_config, FFNConfig):
@@ -116,7 +135,7 @@ def calculate_subblock_memory(
 def calculate_subblock_params(
     config: PretrainedConfig,
     layer_config: BlockConfig | FFNConfig | AttentionConfig,
-    descriptor: Type[ModelDescriptor],
+    descriptor: type[ModelDescriptor],
 ) -> int:
     """Count parameters on one meta decoder layer.
 
@@ -124,9 +143,7 @@
     ``hybrid_override_pattern``) before passing ``config``; see
     ``ModelDescriptor.truncate_pattern_for_subblock``.
     """
-    if isinstance(layer_config, FFNConfig):
-        block_config = layer_config.to_blockconfig()
-    elif isinstance(layer_config, AttentionConfig):
+    if isinstance(layer_config, (FFNConfig, AttentionConfig)):
         block_config = layer_config.to_blockconfig()
     else:
         block_config = layer_config
@@ -189,12 +206,31 @@
 def calc_subblock_active_params(
     sublayer_config: FFNConfig | AttentionConfig,
     model_config: PretrainedConfig,
-    descriptor: Type[ModelDescriptor],
+    descriptor: type[ModelDescriptor],
     n_embd: int,
     moe_stats_file: str,
     batch_size: int,
     block_idx: int,
 ) -> int:
+    """Calculate the number of "active" parameters for a subblock (FFN, Attention, or MoE).
+
+    For non-MoE subblocks, simply calls `calculate_subblock_params` to count all parameters.
+    For MoE (Mixture-of-Experts) FFN subblocks, estimates the expected number of active parameters
+    per batch by leveraging expert activation statistics (from a given stats file) and calculating
+    the expected number of active experts, then multiplies by the number of parameters per expert.
+
+    Args:
+        sublayer_config: The subblock configuration (either FFNConfig or AttentionConfig).
+        model_config: The Hugging Face model configuration.
+        descriptor: The ModelDescriptor class corresponding to this model family.
+        n_embd: The embedding size (hidden dimension).
+        moe_stats_file: Path to file containing expert activation probabilities.
+        batch_size: The batch size used for the estimate.
+        block_idx: The index of the block/subblock within the network, used to index into the stats.
+
+    Returns:
+        int: The expected number of "active" parameters for the given subblock.
+    """
     if not (isinstance(sublayer_config, FFNConfig) and sublayer_config.is_moe):
         return calculate_subblock_params(model_config, sublayer_config, descriptor)
     return estimate_moe_active_params(
@@ -203,14 +239,45 @@
 def load_moe_stats(stats_file: str) -> dict:
+    """Load MoE (Mixture-of-Experts) routing statistics from a file.
+
+    This function reads a JSON file containing expert activation probabilities or counts for each MoE block.
+    It returns the normalized probability distributions over experts for each block, as a list of numpy arrays.
+
+    Args:
+        stats_file (str): Path to the JSON file containing expert routing statistics for each block.
+
+    Returns:
+        list[np.ndarray]: A list where each element is a numpy array containing the normalized probability
+            distribution over experts for the corresponding block. If a block's expert list is empty,
+            its entry is 0.
+    """
     with open(stats_file) as f:
         stats = json.load(f)
-    return [np.array(l) / np.sum(l) if len(l) > 0 else 0 for l in stats]
+    return [
+        np.array(expert_probs) / np.sum(expert_probs) if len(expert_probs) > 0 else 0
+        for expert_probs in stats
+    ]
 
 
 def estimate_num_active_experts(
     dist_over_experts: np.ndarray, batch_size: int, num_experts: int
 ) -> int:
+    """Estimate the expected number of active experts in a Mixture-of-Experts (MoE) layer.
+
+    This function computes the expected number of unique experts that are selected at least once when performing
+    inference with a given batch size. It assumes, for each input in the batch, an expert is chosen with probability
+    given by `dist_over_experts` (typically a vector of probabilities for each expert). For a batch of size B, the
+    expected number of active (i.e., selected at least once) experts is computed.
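+    Assuming each of the ``B`` selections is independent, this expectation equals
+    ``sum_e (1 - (1 - p_e) ** B)``: for every expert ``e``, one minus the probability
+    that it is never selected across the batch.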
+
+    Args:
+        dist_over_experts (np.ndarray): A 1D array of probabilities for each expert.
+        batch_size (int): The number of samples in the batch.
+        num_experts (int): The maximum number of experts to consider (fewer if `dist_over_experts` is shorter).
+
+    Returns:
+        int: The expected number of experts selected at least once across the batch.
+    """
     # cut the tail and renormalize
     dist_over_experts = np.sort(dist_over_experts)[::-1][:num_experts]
     dist_over_experts = dist_over_experts / (dist_over_experts.sum())
@@ -226,6 +293,18 @@ def estimate_moe_active_params(
     batch_size: int,
     block_idx: int,
 ) -> int:
+    """Estimate the expected number of active (used) parameters for a Mixture-of-Experts (MoE) FFN subblock.
+
+    Args:
+        subblock_config (FFNConfig): The FFNConfig for the MoE subblock (with .moe field configured).
+        n_embd (int): The embedding dimension (input and output size per expert).
+        moe_stats_file (Path | str): Path to the JSON file containing routing/selection probabilities for experts.
+        batch_size (int): Batch size to simulate/extrapolate expected expert use.
+        block_idx (int): The index of the block/layer whose expert routing statistics should be used.
+
+    Returns:
+        int: Estimated number of parameters actively used for the current batch and expert selection statistics.
+    """
     assert Path(moe_stats_file).exists()
     # if not Path(moe_stats_file).exists():  # if path is not provided, should we assume uniform distribution?
     #     return calculate_subblock_params(subblock_config, n_embd, n_head=None)
@@ -255,7 +334,7 @@
 def calculate_attention_memory(
     attention_config: AttentionConfig,
     model_config: PretrainedConfig,
-    descriptor: Type[ModelDescriptor],
+    descriptor: type[ModelDescriptor],
     batch_size: int,
     prefill_seq_len: int,
     generation_seq_len: int,
@@ -267,6 +346,7 @@
     allocate_prefill_query: bool,
 ) -> dict[str, float]:
     """allocate_prefill_query: infery-llm style.
+
     Infery used a unified Wqkv matrix, so before extracting the kv-cache, the query also had
     to be kept in-memory, once per layer.
     """
@@ -294,11 +374,25 @@
 def calculate_mamba_memory(
     attention_config: AttentionConfig,
     model_config: PretrainedConfig,
-    descriptor: Type[ModelDescriptor],
+    descriptor: type[ModelDescriptor],
     batch_size: int,
     weights_dtype: torch.dtype,
     kv_cache_dtype: torch.dtype,
 ) -> int:
+    """Calculate memory usage (MiB) for a Mamba attention subblock.
+
+    Args:
+        attention_config (AttentionConfig): Mamba attention configuration,
+            including Mamba-specific settings.
+        model_config (PretrainedConfig): Model configuration.
+        descriptor (type[ModelDescriptor]): Model descriptor class.
+        batch_size (int): Batch size for memory estimate.
+        weights_dtype (torch.dtype): Data type for model weights.
+        kv_cache_dtype (torch.dtype): Data type for state/kv-cache.
+
+    Returns:
+        int: Estimated memory usage in mebibytes (MiB) for the Mamba subblock.
+    """
     assert attention_config.mamba is not None
     mamba_config = attention_config.mamba
     num_params = calculate_subblock_params(model_config, attention_config, descriptor)
@@ -312,7 +406,16 @@
 def calculate_mamba_state_size(
     mamba_config: MambaConfig,
     batch_size: int,
 ) -> int:
-    d_inner, in_proj_dim, conv_dim, kernel_size = _calculate_mamba_intermediates(mamba_config)
+    """Calculate the total state size for a Mamba attention subblock.
+
+    Args:
+        mamba_config (MambaConfig): Configuration object containing Mamba subblock parameters.
+        batch_size (int): Batch size to estimate the memory/state requirements for.
+
+    Returns:
+        int: Total state size (number of elements) required for the Mamba subblock, including convolution and SSM state.
+    """
+    _, _, conv_dim, kernel_size = _calculate_mamba_intermediates(mamba_config)
     conv_state_size = math.prod((batch_size, conv_dim, kernel_size))
     ssm_state_size = math.prod(
         (batch_size, mamba_config.num_heads, mamba_config.head_dim, mamba_config.state_dim)
     )
@@ -333,10 +436,23 @@ def _calculate_mamba_intermediates(mamba_config: MambaConfig) -> tuple[int, ...]:
 def calculate_ffn_memory(
     ffn_config: FFNConfig,
     model_config: PretrainedConfig,
-    descriptor: Type[ModelDescriptor],
+    descriptor: type[ModelDescriptor],
     weights_dtype: torch.dtype | str,
     experts_dtype: torch.dtype | str | None = None,
 ) -> float:
+    """Estimate the memory usage in MiB of a feed-forward network (FFN) subblock.
+
+    Args:
+        ffn_config (FFNConfig): FFN configuration for the block.
+        model_config (PretrainedConfig): The parent model configuration.
+        descriptor (type[ModelDescriptor]): Model descriptor class.
+        weights_dtype (torch.dtype | str): Data type for FFN weights.
+        experts_dtype (torch.dtype | str | None, optional): Data type for expert weights
+            (for MoE layers, if present). Defaults to None.
+
+    Returns:
+        float: Estimated FFN memory usage in mebibytes (MiB).
+    """
     # TODO: How to separate between expert weights and the rest for any model (same as puzzletron).
     num_params = calculate_subblock_params(model_config, ffn_config, descriptor)
     return num_params * sizeof_dtype(weights_dtype) / 2**20
@@ -347,6 +463,16 @@ def calculate_non_block_memory(
     vocab_size: int,
     weight_dtype: torch.dtype,
 ) -> float:
+    """Estimate the memory usage in MiB of non-subblock components (e.g., embeddings, output projection).
+
+    Args:
+        n_embd (int): Embedding dimension (hidden size).
+        vocab_size (int): Vocabulary size.
+        weight_dtype (torch.dtype): Data type for model weights.
+
+    Returns:
+        float: Estimated non-subblock memory usage in mebibytes (MiB).
+    """
     return calculate_non_block_params(n_embd, vocab_size) * sizeof_dtype(weight_dtype) / 2**20
@@ -354,4 +480,13 @@ def calculate_non_block_params(
     n_embd: int,
     vocab_size: int,
 ) -> int:
+    """Calculate the number of parameters for non-subblock components (e.g., embeddings, output projection).
+
+    Args:
+        n_embd (int): Embedding dimension (hidden size).
+        vocab_size (int): Vocabulary size.
+
+    Returns:
+        int: Estimated non-subblock parameter count.
+    """
     return vocab_size * n_embd * 2 + n_embd
diff --git a/modelopt/torch/puzzletron/subblock_stats/__init__.py b/modelopt/torch/puzzletron/subblock_stats/__init__.py
index fbbeb3ff709..4964dba0cfa 100644
--- a/modelopt/torch/puzzletron/subblock_stats/__init__.py
+++ b/modelopt/torch/puzzletron/subblock_stats/__init__.py
@@ -15,5 +15,4 @@
 
 """Subblock statistics collection for Puzzletron."""
 
-from .calc_subblock_params_and_memory import *
 from .calc_subblock_stats import *
diff --git a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py
index dc89a1f6450..f36a71710a3 100644
--- a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py
+++ b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py
@@ -24,7 +24,7 @@
 from functools import partial
 from itertools import product
 from pathlib import Path
-from typing import Iterable, Optional, Type, TypeVar
+from typing import Iterable, Type, TypeVar
 
 import pandas as pd
 import torch
@@ -41,7 +41,7 @@
 from ..tools.checkpoint_utils import load_model_config
 from ..tools.logger import mprint
 from ..utils.parsing import format_global_config
-from .calc_subblock_params_and_memory import (
+from modelopt.torch.nas.subblock_stats.calc_subblock_params_and_memory import (
     calc_subblock_active_params,
     calculate_non_block_memory,
     calculate_non_block_params,
@@ -52,7 +52,6 @@
 __all__ = [
     "calculate_subblock_stats",
     "launch_calc_subblock_stats",
-    "add_int8_runtime_estimates",
 ]
 
 # Type variable for dataclasses
@@ -60,10 +59,10 @@
 """
 Usage:
-python -m modelopt.torch.puzzletron.subblock_stats.calc_subblock_stats PUZZLE_DIR [ --benchmark_iterations 1000 ]
+python -m modelopt.torch.puzzletron.subblock_stats.calc_subblock_stats PUZZLE_DIR [ --runtime_stats_enabled ]
 
---benchmark_iterations=None (the default) means that the code won't use infery to benchmark runtime,
-    only memory stats will be calculated. If you want to benchmark runtime, run inside an infery-llm docker.
+--runtime_stats_enabled=False (the default) means that the code won't benchmark runtime;
+    only memory stats will be calculated. To benchmark runtime, run inside a container with vLLM installed.
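+
+Runtime benchmarking settings are read from the Hydra config. These are the keys this
+module looks up (values shown are the code's fallback defaults; set ``enabled: true``
+to turn benchmarking on):
+
+    calc_subblock_stats:
+      runtime_stats:
+        enabled: false
+        batch_size: 1
+        num_iters: 30
+        num_warmup_iters: 10
+        synth_dataset_num_requests: 200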
""" @@ -82,7 +81,7 @@ def calculate_subblock_stats( n_embd: int, n_head: int, vocab_size: int, - benchmark_iterations: Optional[int], + runtime_stats_enabled: bool, use_cuda_graph: bool, weights_dtype: torch.dtype, activations_dtype: torch.dtype, @@ -90,14 +89,12 @@ def calculate_subblock_stats( allocate_prefill_query: bool, moe_stats_file: str | Path | None = None, ) -> dict: - is_calc_runtime = benchmark_iterations is not None - if is_calc_runtime: - raise NotImplementedError("Runtime stats calculation is not implemented yet") + if runtime_stats_enabled: + from modelopt.torch.nas.subblock_stats.calc_runtime_stats import calc_runtime_for_subblocks gpu = None if not torch.cuda.is_available() else torch.cuda.get_device_name() subblock_stats = { "args": dict( - is_calc_runtime=is_calc_runtime, gpu=gpu, batch_size=batch_size, prefill_seq_len=prefill_seq_len, @@ -106,7 +103,7 @@ def calculate_subblock_stats( n_embd=n_embd, n_head=n_head, vocab_size=vocab_size, - benchmark_iterations=benchmark_iterations, + runtime_stats=runtime_stats_enabled, use_cuda_graph=use_cuda_graph, weights_dtype=str(weights_dtype), activations_dtype=str(activations_dtype), @@ -116,8 +113,7 @@ def calculate_subblock_stats( "subblocks": list(), } # Compute runtime stats for unique subblocks only - if is_calc_runtime: - raise NotImplementedError("Runtime stats calculation is not implemented yet") + if runtime_stats_enabled: subblock_configs_nolayerindex = set( [subblock_config["subblock_config"] for subblock_config in subblock_configs] ) @@ -127,16 +123,19 @@ def calculate_subblock_stats( synth_dataset_num_requests = calc_subblock_stats_config.get("runtime_stats", {}).get( "synth_dataset_num_requests", 200 ) - backend = calc_subblock_stats_config.get("runtime_stats", {}).get("backend", "trt_torch") - runtime_by_subblock_dict, non_block_runtime_ms = calc_runtime_ms_for_subblocks( + runtime_stats_config = calc_subblock_stats_config.get("runtime_stats", {}) + + runtime_by_subblock_dict, non_block_runtime_ms = calc_runtime_for_subblocks( subblock_configs_nolayerindex, + runtime_stats_config, vocab_size, n_embd, n_head, master_puzzle_dir, teacher_dir, synth_dataset_num_requests, - backend, + prefill_seq_len, + generation_seq_len, ) sorted_subblock_config = sorted( @@ -144,7 +143,7 @@ def calculate_subblock_stats( ) it = ( tqdm(sorted_subblock_config, desc="Measuring subblock runtimes") - if is_calc_runtime + if runtime_stats_enabled else sorted_subblock_config ) for subblock_config_indexed in it: @@ -156,7 +155,7 @@ def calculate_subblock_stats( descriptor.get_language_model_config(layer_model_config), parent_layer_indices[0] ) - if is_calc_runtime: + if runtime_stats_enabled: total_runtime_ms = runtime_by_subblock_dict[subblock_config] prefill_runtime_ms = None decode_runtime_ms = None @@ -207,25 +206,13 @@ def calculate_subblock_stats( } ) - if is_calc_runtime: - # TODO: fix - # from puzzle_tools.calc_subblock_runtime import measure_non_block_runtime_ms - # non_block_runtime_ms, embedding_runtime_ms, lm_head_runtime_ms = \ - # measure_non_block_runtime_ms(batch_size, prefill_seq_len, generation_seq_len, n_embd, vocab_size, - # benchmark_iterations, use_cuda_graph) - embedding_runtime_ms, lm_head_runtime_ms = None, None - else: - non_block_runtime_ms, embedding_runtime_ms, lm_head_runtime_ms = None, None, None + if not runtime_stats_enabled: + non_block_runtime_ms = None non_block_memory = calculate_non_block_memory(n_embd, vocab_size, weights_dtype) non_block_params = calculate_non_block_params(n_embd, vocab_size) - # TODO - 
-    # TODO -
-    # the semantics here is wrong why do we refer, prefill_runtime_ms as embedding_runtime_ms and lm_head_runtime_ms as decode_runtime_ms ?
-    # Prefill is the first the user prompt inference, and Decode refer to the next generation process. both processes use all the model layers.
     subblock_stats["non_block"] = {
         "runtime_ms": non_block_runtime_ms,
-        "prefill_runtime_ms": embedding_runtime_ms,
-        "decode_runtime_ms": lm_head_runtime_ms,
         "memory_mib": non_block_memory,
         "num_params": non_block_params,
     }
@@ -256,7 +243,9 @@ def launch_calc_subblock_stats(cfg: DictConfig) -> None:
         num_active_tokens_override=cfg.calc_subblock_stats.get("num_active_tokens_override", None),
         prefill_queue_size=cfg.calc_subblock_stats.prefill_queue_size,
         allocate_prefill_query=cfg.calc_subblock_stats.get("allocate_prefill_query", False),
-        benchmark_iterations=cfg.calc_subblock_stats.get("benchmark_iterations", None),
+        runtime_stats_enabled=cfg.calc_subblock_stats.get("runtime_stats", {}).get(
+            "enabled", False
+        ),
         merge_with_existing_stats=cfg.calc_subblock_stats.merge_with_existing_stats,
         subblock_stats_filename=cfg.calc_subblock_stats.subblock_stats_filename,
         moe_stats_filename=cfg.calc_subblock_stats.moe_stats_filename,
@@ -276,9 +265,7 @@ def calculate_subblock_stats_for_puzzle_dir(
     num_active_tokens_override: int | None = None,
     prefill_queue_size: int = 0,  # it's an infery-llm thing
     allocate_prefill_query: bool = False,
-    benchmark_iterations: (
-        int | None
-    ) = None,  # If set then compute runtime performance statistics. TODO: recommend default value, is 1000 good?
+    runtime_stats_enabled: bool = False,  # Compute runtime statistics.
     merge_with_existing_stats: bool = False,
     subblock_stats_filename: str = "subblock_stats.json",
     moe_stats_filename: str = "moe_stats.json",
@@ -344,8 +331,8 @@
         if num_active_tokens_override is not None:
             prefill_seq_len = generation_seq_len = int(num_active_tokens_override / batch_size / 2)
 
-        curr_benchmark_iterations = (
-            benchmark_iterations if weights_dtype == torch.bfloat16 else None
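+        # NOTE: runtime benchmarking is only attempted for bf16 weights; the vLLM
+        # benchmark checkpoint built in calc_runtime_stats is saved in bf16.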
+        curr_runtime_stats_enabled = (
+            runtime_stats_enabled if weights_dtype == torch.bfloat16 else False
         )
 
         curr_subblock_stats = calculate_subblock_stats(
@@ -362,7 +349,7 @@
             n_embd=model_hidden_size,
             n_head=lm_config.num_attention_heads,
             vocab_size=lm_config.vocab_size,
-            benchmark_iterations=curr_benchmark_iterations,
+            runtime_stats_enabled=curr_runtime_stats_enabled,
             use_cuda_graph=True,
             weights_dtype=weights_dtype,
             activations_dtype=activations_dtype,
@@ -378,8 +365,6 @@
         subblock_stats.append(curr_subblock_stats)
 
-    # TODO fix: add_int8_runtime_estimates(subblock_stats)
-
     json_dump(subblock_stats, subblock_stats_file)
     mprint(subblock_stats_file)
@@ -503,65 +488,3 @@
         raise ValueError(f"_dataclass_from_dict: unrecognized {type(d)=} {d=}")
 
 
-def add_int8_runtime_estimates(subblock_stats: list[dict]) -> None:
-    for curr_subblock_stats in subblock_stats:
-        args = curr_subblock_stats["args"]
-        if args["weights_dtype"] == "torch.int8":
-            assert args["activations_dtype"] == "torch.int8"
-            ffn_factor = 0.5
-            attention_factor = 0.5 if args["kv_cache_dtype"] == "torch.int8" else 0.8
-
-            bf16_stats = _find_corresponding_bf16_stats(args, subblock_stats)
-            if bf16_stats is not None:
-                curr_subblocks = curr_subblock_stats["subblocks"] + [
-                    curr_subblock_stats["non_block"]
-                ]
-                bf16_subblocks = bf16_stats["subblocks"] + [bf16_stats["non_block"]]
-                for curr_subblock, bf16_subblock in zip(curr_subblocks, bf16_subblocks):
-                    assert curr_subblock.get("subblock_config", None) == bf16_subblock.get(
-                        "subblock_config", None
-                    )
-                    is_attention = False
-                    if (subblock_config := curr_subblock.get("subblock_config")) is not None:
-                        if hasattr(subblock_config, "__dataclass_fields__"):
-                            subblock_config = dataclasses.asdict(subblock_config)
-                        is_attention = subblock_config.get("num_key_value_heads", None) is not None
-                    runtime_factor = attention_factor if is_attention else ffn_factor
-                    for stat_name, stat_value in bf16_subblock.items():
-                        if "runtime" in stat_name:
-                            curr_subblock[stat_name] = stat_value * runtime_factor
-
-
-def _find_corresponding_bf16_stats(args: dict, subblock_stats: list[dict]) -> dict | None:
-    scenario_keys = [
-        "batch_size",
-        "prefill_seq_len",
-        "generation_seq_len",
-        "prefill_queue_size",
-        "gpu",
-        "n_embd",
-        "n_head",
-        "vocab_size",
-    ]
-    corresponding_bf16_args = {
-        **{k: v for k, v in args.items() if k in scenario_keys},
-        "is_calc_runtime": True,
-        "weights_dtype": "torch.bfloat16",
-        "activations_dtype": "torch.bfloat16",
-        "kv_cache_dtype": "torch.bfloat16",
-    }
-    matching_bf16_stats = [
-        stats
-        for stats in subblock_stats
-        if all(
-            [
-                stats["args"][key] == corresponding_bf16_args[key]
-                for key in corresponding_bf16_args.keys()
-            ]
-        )
-    ]
-    if len(matching_bf16_stats) == 0:
-        return None
-    if len(matching_bf16_stats) == 1:
-        return matching_bf16_stats[0]
-    raise ValueError(f"Found more than 1 matching bf16 stats for {args=}")