diff --git a/modelopt/torch/nas/subblock_stats/__init__.py b/modelopt/torch/nas/subblock_stats/__init__.py
new file mode 100644
index 00000000000..aeac903f8f4
--- /dev/null
+++ b/modelopt/torch/nas/subblock_stats/__init__.py
@@ -0,0 +1,23 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Subblock runtime statistics API for ModelOpt NAS.
+
+This module provides utilities for measuring and calculating runtime statistics
+of subblocks (e.g., Attention, FFN) within transformer architectures.
+
+Primary API:
+    - calc_runtime_for_subblocks: Empirically measures runtime for candidate subblock configurations
+"""
+from .calc_runtime_stats import calc_runtime_for_subblocks
diff --git a/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py b/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py
new file mode 100644
index 00000000000..d3b997f4525
--- /dev/null
+++ b/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py
@@ -0,0 +1,291 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# mypy: ignore-errors
+
+"""Runtime statistics calculation for NAS subblock benchmarking via vLLM."""
+
+import json
+import os
+import subprocess
+import tempfile
+from dataclasses import dataclass, replace
+from pathlib import Path
+
+import torch
+from omegaconf import DictConfig
+from tqdm import tqdm
+from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaConfig, LlamaForCausalLM
+
+from modelopt.torch.puzzletron.anymodel.converter import Converter
+from modelopt.torch.puzzletron.anymodel.models.llama import LlamaModelDescriptor
+from modelopt.torch.puzzletron.anymodel.puzzformer import deci_x_patcher
+from modelopt.torch.puzzletron.block_config import (
+    AttentionConfig,
+    BlockConfig,
+    FFNConfig,
+    SubblockConfig,
+)
+
+
+def _make_standard_block_config(hidden_size: int, num_attention_heads: int) -> BlockConfig:
+    return BlockConfig(
+        attention=AttentionConfig(no_op=False, num_key_value_heads=num_attention_heads),
+        ffn=FFNConfig(no_op=False, intermediate_size=hidden_size, moe=None),
+        parallel_blocks=None,
+    )
+
+
+def create_benchmark_model(
+    vocab_size: int,
+    hidden_size: int,
+    num_attention_heads: int,
+    prefill_seq_len: int,
+    generation_seq_len: int,
+    block_config: BlockConfig | None,
+    repeat_block_n_times: int = 10,
+) -> LlamaForCausalLM:
+    """Build a small Llama model with repeated subblocks for latency benchmarking."""
+    block_configs = [_make_standard_block_config(hidden_size, num_attention_heads)]
+
+    if block_config:
+        block_configs.extend([block_config] * repeat_block_n_times)
+
+    model_config = LlamaConfig(
+        max_position_embeddings=prefill_seq_len + generation_seq_len,
+        vocab_size=vocab_size,
+        hidden_size=hidden_size,
+        num_attention_heads=num_attention_heads,
+        num_hidden_layers=len(block_configs),
+        head_dim=None,  # Compute from hidden_size // num_attention_heads instead of using default 128
+        # This is required for TRT-LLM conversion to know which model classes to use for the checkpoint.
+        auto_map={
+            "AutoConfig": "transformers.models.llama.configuration_llama.LlamaConfig",
+            "AutoModelForCausalLM": "transformers.models.llama.modeling_llama.LlamaForCausalLM",
+        },
+    )
+
+    for idx, bc in enumerate(block_configs):
+        block_configs[idx] = bc.to_dict()
+    model_config.block_configs = block_configs
+
+    with deci_x_patcher(LlamaModelDescriptor, block_configs):
+        model = AutoModelForCausalLM.from_config(model_config)
+
+    model.config.architectures = ["AnyModel"]
+    model.config.base_architecture = "LlamaForCausalLM"
+
+    return model
+
+
+def save_model_as_anymodel(model, output_dir: Path, descriptor, num_hidden_layers: int):
+    """Save a model checkpoint in AnyModel subblock-safetensors format."""
+    # Save standard model checkpoint (as safetensors, HF format)
+    model.save_pretrained(output_dir, safe_serialization=True)
+
+    # Convert/slice weights into AnyModel subblock_safetensors format
+    Converter.convert_model_weights(
+        input_dir=output_dir,
+        output_dir=output_dir,
+        descriptor=descriptor,
+        num_hidden_layers=num_hidden_layers,
+    )
+    # Load the model config.json, update "architectures" to ["AnyModel"], and write back to disk.
+    config_path = output_dir / "config.json"
+    if config_path.exists():
+        with open(config_path) as f:
+            config_data = json.load(f)
+        config_data["architectures"] = ["AnyModel"]
+        with open(config_path, "w") as f:
+            json.dump(config_data, f, indent=2)
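+
+
+# Illustrative sketch (comments only; values below are hypothetical placeholders):
+# building a benchmark checkpoint for a single FFN candidate with the helpers above.
+#
+#   candidate = FFNConfig(no_op=False, intermediate_size=8192, moe=None)
+#   model = create_benchmark_model(
+#       vocab_size=32000,
+#       hidden_size=2048,
+#       num_attention_heads=16,
+#       prefill_seq_len=1024,
+#       generation_seq_len=128,
+#       block_config=candidate.to_blockconfig(),
+#       repeat_block_n_times=10,  # 1 standard block + 10 copies of the candidate
+#   )
+#   save_model_as_anymodel(model, Path("/tmp/bench"), LlamaModelDescriptor, num_hidden_layers=11)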
+def save_model(
+    model: LlamaForCausalLM, tokenizer_path: Path, output_path: Path, num_hidden_layers: int
+) -> None:
+    """Save model weights as AnyModel and copy the tokenizer to ``output_path``."""
+    model.to(dtype=torch.bfloat16).save_pretrained(output_path)
+    save_model_as_anymodel(model, output_path, LlamaModelDescriptor, num_hidden_layers)
+
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
+    tokenizer.save_pretrained(output_path)
+
+
+@dataclass(frozen=True)
+class RuntimeConfig:
+    """Configuration for a vLLM latency benchmark run."""
+
+    vocab_size: int
+    hidden_size: int
+    num_attention_heads: int
+    master_puzzle_dir: str
+    tokenizer_path: str
+    synth_dataset_num_requests: int
+    repeat_block_n_times: int
+    prefill_seq_len: int
+    generation_seq_len: int
+    batch_size: int
+    num_iters: int
+    num_warmup_iters: int
+
+
+def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig):
+    """Run ``vllm bench latency`` and return the average latency in milliseconds."""
+    output_json_path = model_path / "vllm_latency_benchmark.json"
+
+    cmd = [
+        "vllm",
+        "bench",
+        "latency",
+        "--model",
+        str(model_path),
+        "--input-len",
+        str(runtime_config.prefill_seq_len),
+        "--output-len",
+        str(runtime_config.generation_seq_len),
+        "--batch-size",
+        str(runtime_config.batch_size),
+        "--output-json",
+        str(output_json_path),
+        "--max-model-len",
+        str(runtime_config.prefill_seq_len + runtime_config.generation_seq_len),
+        "--num-iters-warmup",
+        str(runtime_config.num_warmup_iters),
+        "--num-iters",
+        str(runtime_config.num_iters),
+        "--max-num-seqs",
+        "1",
+        "--distributed-executor-backend",
+        "external_launcher",
+        "--tensor-parallel-size",
+        "1",
+        "--pipeline-parallel-size",
+        "1",
+    ]
+    os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
+    subprocess.run(cmd, check=True)
+
+    with open(output_json_path) as f:
+        vllm_results = json.load(f)
+    print(vllm_results)
+    return vllm_results["avg_latency"] * 1000  # convert to milliseconds
+
+
+def calc_subblock_runtime(
+    runtime_config: RuntimeConfig,
+    subblock_config: SubblockConfig | None,
+) -> float:
+    """Measure total runtime of a repeated subblock via vLLM latency benchmark."""
+    block_config: BlockConfig | None = None
+
+    if subblock_config is not None:
+        if isinstance(subblock_config, BlockConfig):
+            block_config = subblock_config
+        elif isinstance(subblock_config, (AttentionConfig, FFNConfig)):
+            block_config = subblock_config.to_blockconfig()
+        else:
+            raise TypeError(f"Runtime stats: unsupported subblock type: {subblock_config}")
+
+    model = create_benchmark_model(
+        runtime_config.vocab_size,
+        runtime_config.hidden_size,
+        runtime_config.num_attention_heads,
+        runtime_config.prefill_seq_len,
+        runtime_config.generation_seq_len,
+        block_config=block_config,
+        repeat_block_n_times=runtime_config.repeat_block_n_times,
+    )
+    with tempfile.TemporaryDirectory() as model_tmpdir:
+        save_model(
+            model,
+            Path(runtime_config.tokenizer_path),
+            Path(model_tmpdir),
+            num_hidden_layers=runtime_config.repeat_block_n_times + 1,
+        )
+        subblock_total_runtime_ms = run_vllm_latency_benchmark(Path(model_tmpdir), runtime_config)
+
+    return subblock_total_runtime_ms
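+
+
+# The "no block" overhead below is isolated by linear extrapolation: with L decoder
+# layers, latency(L) ~= overhead + L * per_layer_latency. Measuring L=1 and L=10
+# gives per_layer_latency = (latency(10) - latency(1)) / 9, and therefore
+# overhead = latency(1) - per_layer_latency.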
+def calc_no_block_runtime(runtime_config: RuntimeConfig) -> float:
+    """Estimate the overhead runtime (embedding + LM head) with no decoder blocks."""
+    runtime_config1 = replace(runtime_config, repeat_block_n_times=0)
+    runtime_config10 = replace(runtime_config, repeat_block_n_times=9)
+
+    block_config = _make_standard_block_config(
+        runtime_config.hidden_size, runtime_config.num_attention_heads
+    )
+
+    runtime_ms1 = calc_subblock_runtime(runtime_config1, None)
+    runtime_ms10 = calc_subblock_runtime(runtime_config10, block_config)
+
+    no_block_runtime_ms = runtime_ms1 - (runtime_ms10 - runtime_ms1) / 9
+
+    return no_block_runtime_ms
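+
+
+# Illustrative driver sketch (not executed in this module): how a caller is expected
+# to invoke the public API below. Paths, sizes, and the candidate set are hypothetical.
+#
+#   runtimes, overhead_ms = calc_runtime_for_subblocks(
+#       subblock_config_set={ffn_candidate, attention_candidate},
+#       runtime_stats_config=cfg.calc_subblock_stats.runtime_stats,
+#       vocab_size=32000,
+#       hidden_size=2048,
+#       num_attention_heads=16,
+#       master_puzzle_dir="/path/to/puzzle_dir",
+#       tokenizer_path="/path/to/tokenizer",
+#       synth_dataset_num_requests=200,
+#       prefill_seq_len=1024,
+#       generation_seq_len=128,
+#   )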
- "load_moe_stats", - "estimate_num_active_experts", + "calculate_ffn_memory", "calculate_mamba_memory", "calculate_mamba_state_size", - "calculate_ffn_memory", "calculate_non_block_memory", "calculate_non_block_params", + "calculate_subblock_memory", + "calculate_subblock_params", + "estimate_num_active_experts", + "load_moe_stats", ] @@ -73,9 +72,29 @@ def calculate_subblock_memory( kv_cache_dtype: torch.dtype, allocate_prefill_query: bool, model_config: PretrainedConfig, - descriptor: Type[ModelDescriptor], + descriptor: type[ModelDescriptor], ) -> float | dict[str, float]: - """``model_config`` / ``descriptor`` are required (puzzletron-style); FFN uses them for meta init.""" + """Calculate the memory usage of a single subblock (FFN or Attention). + + Given its configuration and runtime dimensions, returns bytes or a detailed dict. + + Args: + subblock_config (FFNConfig | AttentionConfig): Subblock configuration dataclass. + batch_size (int): Batch size for memory estimate. + prefill_seq_len (int): Sequence length for prefill phase. + generation_seq_len (int): Sequence length for generation phase (token-by-token). + prefill_queue_size (int): Token queue size for prefill attention memory allocation. + n_embd (int): Embedding (hidden) dimension. + n_head (int): Number of attention heads (used for non-FFN). + weights_dtype (torch.dtype): PyTorch dtype for model weights. + kv_cache_dtype (torch.dtype): PyTorch dtype for KV cache. + allocate_prefill_query (bool): Whether to allocate query cache for prefill tokens. + model_config (PretrainedConfig): HuggingFace-style config instance describing the model. + descriptor (type[ModelDescriptor]): Model descriptor type (for puzzletron model types). + + Returns: + float | dict[str, float]: Memory usage in bytes (float), or a dictionary by memory type. + """ if subblock_config.no_op: return 0 if isinstance(subblock_config, FFNConfig): @@ -116,7 +135,7 @@ def calculate_subblock_memory( def calculate_subblock_params( config: PretrainedConfig, layer_config: BlockConfig | FFNConfig | AttentionConfig, - descriptor: Type[ModelDescriptor], + descriptor: type[ModelDescriptor], ) -> int: """Count parameters on one meta decoder layer. @@ -124,9 +143,7 @@ def calculate_subblock_params( ``hybrid_override_pattern``) before passing ``config``; see ``ModelDescriptor.truncate_pattern_for_subblock``. """ - if isinstance(layer_config, FFNConfig): - block_config = layer_config.to_blockconfig() - elif isinstance(layer_config, AttentionConfig): + if isinstance(layer_config, (FFNConfig, AttentionConfig)): block_config = layer_config.to_blockconfig() else: block_config = layer_config @@ -189,12 +206,31 @@ def calculate_subblock_params( def calc_subblock_active_params( sublayer_config: FFNConfig | AttentionConfig, model_config: PretrainedConfig, - descriptor: Type[ModelDescriptor], + descriptor: type[ModelDescriptor], n_embd: int, moe_stats_file: str, batch_size: int, block_idx: int, ) -> int: + """Calculate the number of "active" parameters for a subblock (FFN, Attention, or MoE). + + For non-MoE subblocks, simply calls `calculate_subblock_params` to count all parameters. + For MoE (Mixture-of-Experts) FFN subblocks, estimates the expected number of active parameters + per batch by leveraging expert activation statistics (from a given stats file) and calculating + the expected number of active experts, then multiplies by the number of parameters per expert. + + Args: + sublayer_config: The subblock configuration (either FFNConfig or AttentionConfig). 
+    """
     if subblock_config.no_op:
         return 0
     if isinstance(subblock_config, FFNConfig):
@@ -116,7 +135,7 @@ def calculate_subblock_memory(
 def calculate_subblock_params(
     config: PretrainedConfig,
     layer_config: BlockConfig | FFNConfig | AttentionConfig,
-    descriptor: Type[ModelDescriptor],
+    descriptor: type[ModelDescriptor],
 ) -> int:
     """Count parameters on one meta decoder layer.
 
@@ -124,9 +143,7 @@
     ``hybrid_override_pattern``) before passing ``config``; see
     ``ModelDescriptor.truncate_pattern_for_subblock``.
     """
-    if isinstance(layer_config, FFNConfig):
-        block_config = layer_config.to_blockconfig()
-    elif isinstance(layer_config, AttentionConfig):
+    if isinstance(layer_config, (FFNConfig, AttentionConfig)):
         block_config = layer_config.to_blockconfig()
     else:
         block_config = layer_config
@@ -189,12 +206,31 @@
 def calc_subblock_active_params(
     sublayer_config: FFNConfig | AttentionConfig,
     model_config: PretrainedConfig,
-    descriptor: Type[ModelDescriptor],
+    descriptor: type[ModelDescriptor],
     n_embd: int,
     moe_stats_file: str,
     batch_size: int,
     block_idx: int,
 ) -> int:
+    """Calculate the number of "active" parameters for a subblock (FFN, Attention, or MoE).
+
+    For non-MoE subblocks, simply calls `calculate_subblock_params` to count all parameters.
+    For MoE (Mixture-of-Experts) FFN subblocks, estimates the expected number of active parameters
+    per batch by leveraging expert activation statistics (from a given stats file) and calculating
+    the expected number of active experts, then multiplies by the number of parameters per expert.
+
+    Args:
+        sublayer_config: The subblock configuration (either FFNConfig or AttentionConfig).
+        model_config: The Hugging Face model configuration.
+        descriptor: The ModelDescriptor class corresponding to this model family.
+        n_embd: The embedding size (hidden dimension).
+        moe_stats_file: Path to file containing expert activation probabilities.
+        batch_size: The batch size used for the estimate.
+        block_idx: The index of the block/subblock within the network, used to index into the stats.
+
+    Returns:
+        int: The expected number of "active" parameters for the given subblock.
+    """
     if not (isinstance(sublayer_config, FFNConfig) and sublayer_config.is_moe):
         return calculate_subblock_params(model_config, sublayer_config, descriptor)
     return estimate_moe_active_params(
@@ -203,14 +239,45 @@
 def load_moe_stats(stats_file: str) -> dict:
+    """Load MoE (Mixture-of-Experts) routing statistics from a file.
+
+    This function reads a JSON file containing expert activation probabilities or counts for each MoE block.
+    It returns the normalized probability distributions over experts for each block, as a list of numpy arrays.
+
+    Args:
+        stats_file (str): Path to the JSON file containing expert routing statistics for each block.
+
+    Returns:
+        list[np.ndarray]: A list where each element is a numpy array containing the normalized probability
+            distribution over experts for the corresponding block. If a block's expert list is empty,
+            its entry is 0.
+    """
     with open(stats_file) as f:
         stats = json.load(f)
-    return [np.array(l) / np.sum(l) if len(l) > 0 else 0 for l in stats]
+    return [
+        np.array(expert_probs) / np.sum(expert_probs) if len(expert_probs) > 0 else 0
+        for expert_probs in stats
+    ]
 
 
 def estimate_num_active_experts(
     dist_over_experts: np.ndarray, batch_size: int, num_experts: int
 ) -> int:
+    """Estimate the expected number of active experts in a Mixture-of-Experts (MoE) layer.
+
+    This function computes the expected number of unique experts that are selected at least once when performing
+    inference with a given batch size. It assumes, for each input in the batch, an expert is chosen with probability
+    given by `dist_over_experts` (typically a vector of probabilities for each expert). For a batch of size B, the
+    expected number of active (i.e., selected at least once) experts is computed.
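+    Assuming each of the ``B`` selections is independent, this expectation equals
+    ``sum_e (1 - (1 - p_e) ** B)``: for every expert ``e``, one minus the probability
+    that it is never selected across the batch.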
+
+    Args:
+        dist_over_experts (np.ndarray): A 1D array of probabilities for each expert.
+        batch_size (int): The number of samples in the batch.
+        num_experts (int): The maximum number of experts to consider (fewer if `dist_over_experts` is shorter).
+
+    Returns:
+        int: The expected number of experts selected at least once across the batch.
+    """
     # cut the tail and renormalize
     dist_over_experts = np.sort(dist_over_experts)[::-1][:num_experts]
     dist_over_experts = dist_over_experts / (dist_over_experts.sum())
@@ -226,6 +293,18 @@ def estimate_moe_active_params(
     batch_size: int,
     block_idx: int,
 ) -> int:
+    """Estimate the expected number of active (used) parameters for a Mixture-of-Experts (MoE) FFN subblock.
+
+    Args:
+        subblock_config (FFNConfig): The FFNConfig for the MoE subblock (with .moe field configured).
+        n_embd (int): The embedding dimension (input and output size per expert).
+        moe_stats_file (Path | str): Path to the JSON file containing routing/selection probabilities for experts.
+        batch_size (int): Batch size to simulate/extrapolate expected expert use.
+        block_idx (int): The index of the block/layer whose expert routing statistics should be used.
+
+    Returns:
+        int: Estimated number of parameters actively used for the current batch and expert selection statistics.
+    """
     assert Path(moe_stats_file).exists()
     # if not Path(moe_stats_file).exists():  # if path is not provided, should we assume uniform distribution?
     #     return calculate_subblock_params(subblock_config, n_embd, n_head=None)
@@ -255,7 +334,7 @@
 def calculate_attention_memory(
     attention_config: AttentionConfig,
     model_config: PretrainedConfig,
-    descriptor: Type[ModelDescriptor],
+    descriptor: type[ModelDescriptor],
     batch_size: int,
     prefill_seq_len: int,
     generation_seq_len: int,
@@ -267,6 +346,7 @@
     allocate_prefill_query: bool,
 ) -> dict[str, float]:
     """allocate_prefill_query: infery-llm style.
+
     Infery used a unified Wqkv matrix, so before extracting the kv-cache, the query also had
     to be kept in-memory, once per layer.
     """
@@ -294,11 +374,25 @@
 def calculate_mamba_memory(
     attention_config: AttentionConfig,
     model_config: PretrainedConfig,
-    descriptor: Type[ModelDescriptor],
+    descriptor: type[ModelDescriptor],
     batch_size: int,
     weights_dtype: torch.dtype,
     kv_cache_dtype: torch.dtype,
 ) -> int:
+    """Calculate memory usage (MiB) for a Mamba attention subblock.
+
+    Args:
+        attention_config (AttentionConfig): Mamba attention configuration,
+            including Mamba-specific settings.
+        model_config (PretrainedConfig): Model configuration.
+        descriptor (type[ModelDescriptor]): Model descriptor class.
+        batch_size (int): Batch size for memory estimate.
+        weights_dtype (torch.dtype): Data type for model weights.
+        kv_cache_dtype (torch.dtype): Data type for state/kv-cache.
+
+    Returns:
+        int: Estimated memory usage in mebibytes (MiB) for the Mamba subblock.
+    """
     assert attention_config.mamba is not None
     mamba_config = attention_config.mamba
     num_params = calculate_subblock_params(model_config, attention_config, descriptor)
@@ -312,7 +406,16 @@
 def calculate_mamba_state_size(
     mamba_config: MambaConfig,
     batch_size: int,
 ) -> int:
-    d_inner, in_proj_dim, conv_dim, kernel_size = _calculate_mamba_intermediates(mamba_config)
+    """Calculate the total state size for a Mamba attention subblock.
+
+    Args:
+        mamba_config (MambaConfig): Configuration object containing Mamba subblock parameters.
+        batch_size (int): Batch size to estimate the memory/state requirements for.
+
+    Returns:
+        int: Total state size (number of elements) required for the Mamba subblock, including convolution and SSM state.
+    """
+    _, _, conv_dim, kernel_size = _calculate_mamba_intermediates(mamba_config)
     conv_state_size = math.prod((batch_size, conv_dim, kernel_size))
     ssm_state_size = math.prod(
         (batch_size, mamba_config.num_heads, mamba_config.head_dim, mamba_config.state_dim)
     )
@@ -333,10 +436,23 @@ def _calculate_mamba_intermediates(mamba_config: MambaConfig) -> tuple[int, ...]:
 def calculate_ffn_memory(
     ffn_config: FFNConfig,
     model_config: PretrainedConfig,
-    descriptor: Type[ModelDescriptor],
+    descriptor: type[ModelDescriptor],
     weights_dtype: torch.dtype | str,
     experts_dtype: torch.dtype | str | None = None,
 ) -> float:
+    """Estimate the memory usage in MiB of a feed-forward network (FFN) subblock.
+
+    Args:
+        ffn_config (FFNConfig): FFN configuration for the block.
+        model_config (PretrainedConfig): The parent model configuration.
+        descriptor (type[ModelDescriptor]): Model descriptor class.
+        weights_dtype (torch.dtype | str): Data type for FFN weights.
+        experts_dtype (torch.dtype | str | None, optional): Data type for expert weights
+            (for MoE layers, if present). Defaults to None.
+
+    Returns:
+        float: Estimated FFN memory usage in mebibytes (MiB).
+    """
     # TODO: How to separate between expert weights and the rest for any model (same as puzzletron).
     num_params = calculate_subblock_params(model_config, ffn_config, descriptor)
     return num_params * sizeof_dtype(weights_dtype) / 2**20
@@ -347,6 +463,16 @@ def calculate_non_block_memory(
     vocab_size: int,
     weight_dtype: torch.dtype,
 ) -> float:
+    """Estimate the memory usage in MiB of non-subblock components (e.g., embeddings, output projection).
+
+    Args:
+        n_embd (int): Embedding dimension (hidden size).
+        vocab_size (int): Vocabulary size.
+        weight_dtype (torch.dtype): Data type for model weights.
+
+    Returns:
+        float: Estimated non-subblock memory usage in mebibytes (MiB).
+    """
     return calculate_non_block_params(n_embd, vocab_size) * sizeof_dtype(weight_dtype) / 2**20
@@ -354,4 +480,13 @@ def calculate_non_block_params(
     n_embd: int,
     vocab_size: int,
 ) -> int:
+    """Calculate the number of parameters for non-subblock components (e.g., embeddings, output projection).
+
+    Args:
+        n_embd (int): Embedding dimension (hidden size).
+        vocab_size (int): Vocabulary size.
+
+    Returns:
+        int: Estimated non-subblock parameter count.
+    """
     return vocab_size * n_embd * 2 + n_embd
diff --git a/modelopt/torch/puzzletron/subblock_stats/__init__.py b/modelopt/torch/puzzletron/subblock_stats/__init__.py
index fbbeb3ff709..4964dba0cfa 100644
--- a/modelopt/torch/puzzletron/subblock_stats/__init__.py
+++ b/modelopt/torch/puzzletron/subblock_stats/__init__.py
@@ -15,5 +15,4 @@
 
 """Subblock statistics collection for Puzzletron."""
 
-from .calc_subblock_params_and_memory import *
 from .calc_subblock_stats import *
diff --git a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py
index dc89a1f6450..f36a71710a3 100644
--- a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py
+++ b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py
@@ -24,7 +24,7 @@
 from functools import partial
 from itertools import product
 from pathlib import Path
-from typing import Iterable, Optional, Type, TypeVar
+from typing import Iterable, Type, TypeVar
 
 import pandas as pd
 import torch
@@ -41,7 +41,7 @@
 from ..tools.checkpoint_utils import load_model_config
 from ..tools.logger import mprint
 from ..utils.parsing import format_global_config
-from .calc_subblock_params_and_memory import (
+from modelopt.torch.nas.subblock_stats.calc_subblock_params_and_memory import (
     calc_subblock_active_params,
     calculate_non_block_memory,
     calculate_non_block_params,
@@ -52,7 +52,6 @@
 __all__ = [
     "calculate_subblock_stats",
     "launch_calc_subblock_stats",
-    "add_int8_runtime_estimates",
 ]
 
 # Type variable for dataclasses
@@ -60,10 +59,10 @@
 """
 Usage:
-python -m modelopt.torch.puzzletron.subblock_stats.calc_subblock_stats PUZZLE_DIR [ --benchmark_iterations 1000 ]
+python -m modelopt.torch.puzzletron.subblock_stats.calc_subblock_stats PUZZLE_DIR [ --runtime_stats_enabled ]
 
---benchmark_iterations=None (the default) means that the code won't use infery to benchmark runtime,
-    only memory stats will be calculated. If you want to benchmark runtime, run inside an infery-llm docker.
+--runtime_stats_enabled=False (the default) means that the code won't benchmark runtime;
+    only memory stats will be calculated. To benchmark runtime, run inside a container with vLLM installed.
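+
+Runtime benchmarking settings are read from the Hydra config. These are the keys this
+module looks up (values shown are the code's fallback defaults; set ``enabled: true``
+to turn benchmarking on):
+
+    calc_subblock_stats:
+      runtime_stats:
+        enabled: false
+        batch_size: 1
+        num_iters: 30
+        num_warmup_iters: 10
+        synth_dataset_num_requests: 200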
""" @@ -82,7 +81,7 @@ def calculate_subblock_stats( n_embd: int, n_head: int, vocab_size: int, - benchmark_iterations: Optional[int], + runtime_stats_enabled: bool, use_cuda_graph: bool, weights_dtype: torch.dtype, activations_dtype: torch.dtype, @@ -90,14 +89,12 @@ def calculate_subblock_stats( allocate_prefill_query: bool, moe_stats_file: str | Path | None = None, ) -> dict: - is_calc_runtime = benchmark_iterations is not None - if is_calc_runtime: - raise NotImplementedError("Runtime stats calculation is not implemented yet") + if runtime_stats_enabled: + from modelopt.torch.nas.subblock_stats.calc_runtime_stats import calc_runtime_for_subblocks gpu = None if not torch.cuda.is_available() else torch.cuda.get_device_name() subblock_stats = { "args": dict( - is_calc_runtime=is_calc_runtime, gpu=gpu, batch_size=batch_size, prefill_seq_len=prefill_seq_len, @@ -106,7 +103,7 @@ def calculate_subblock_stats( n_embd=n_embd, n_head=n_head, vocab_size=vocab_size, - benchmark_iterations=benchmark_iterations, + runtime_stats=runtime_stats_enabled, use_cuda_graph=use_cuda_graph, weights_dtype=str(weights_dtype), activations_dtype=str(activations_dtype), @@ -116,8 +113,7 @@ def calculate_subblock_stats( "subblocks": list(), } # Compute runtime stats for unique subblocks only - if is_calc_runtime: - raise NotImplementedError("Runtime stats calculation is not implemented yet") + if runtime_stats_enabled: subblock_configs_nolayerindex = set( [subblock_config["subblock_config"] for subblock_config in subblock_configs] ) @@ -127,16 +123,19 @@ def calculate_subblock_stats( synth_dataset_num_requests = calc_subblock_stats_config.get("runtime_stats", {}).get( "synth_dataset_num_requests", 200 ) - backend = calc_subblock_stats_config.get("runtime_stats", {}).get("backend", "trt_torch") - runtime_by_subblock_dict, non_block_runtime_ms = calc_runtime_ms_for_subblocks( + runtime_stats_config = calc_subblock_stats_config.get("runtime_stats", {}) + + runtime_by_subblock_dict, non_block_runtime_ms = calc_runtime_for_subblocks( subblock_configs_nolayerindex, + runtime_stats_config, vocab_size, n_embd, n_head, master_puzzle_dir, teacher_dir, synth_dataset_num_requests, - backend, + prefill_seq_len, + generation_seq_len, ) sorted_subblock_config = sorted( @@ -144,7 +143,7 @@ def calculate_subblock_stats( ) it = ( tqdm(sorted_subblock_config, desc="Measuring subblock runtimes") - if is_calc_runtime + if runtime_stats_enabled else sorted_subblock_config ) for subblock_config_indexed in it: @@ -156,7 +155,7 @@ def calculate_subblock_stats( descriptor.get_language_model_config(layer_model_config), parent_layer_indices[0] ) - if is_calc_runtime: + if runtime_stats_enabled: total_runtime_ms = runtime_by_subblock_dict[subblock_config] prefill_runtime_ms = None decode_runtime_ms = None @@ -207,25 +206,13 @@ def calculate_subblock_stats( } ) - if is_calc_runtime: - # TODO: fix - # from puzzle_tools.calc_subblock_runtime import measure_non_block_runtime_ms - # non_block_runtime_ms, embedding_runtime_ms, lm_head_runtime_ms = \ - # measure_non_block_runtime_ms(batch_size, prefill_seq_len, generation_seq_len, n_embd, vocab_size, - # benchmark_iterations, use_cuda_graph) - embedding_runtime_ms, lm_head_runtime_ms = None, None - else: - non_block_runtime_ms, embedding_runtime_ms, lm_head_runtime_ms = None, None, None + if not runtime_stats_enabled: + non_block_runtime_ms = None non_block_memory = calculate_non_block_memory(n_embd, vocab_size, weights_dtype) non_block_params = calculate_non_block_params(n_embd, vocab_size) - # TODO - 
-    # TODO -
-    # the semantics here is wrong why do we refer, prefill_runtime_ms as embedding_runtime_ms and lm_head_runtime_ms as decode_runtime_ms ?
-    # Prefill is the first the user prompt inference, and Decode refer to the next generation process. both processes use all the model layers.
     subblock_stats["non_block"] = {
         "runtime_ms": non_block_runtime_ms,
-        "prefill_runtime_ms": embedding_runtime_ms,
-        "decode_runtime_ms": lm_head_runtime_ms,
         "memory_mib": non_block_memory,
         "num_params": non_block_params,
     }
@@ -256,7 +243,9 @@ def launch_calc_subblock_stats(cfg: DictConfig) -> None:
         num_active_tokens_override=cfg.calc_subblock_stats.get("num_active_tokens_override", None),
         prefill_queue_size=cfg.calc_subblock_stats.prefill_queue_size,
         allocate_prefill_query=cfg.calc_subblock_stats.get("allocate_prefill_query", False),
-        benchmark_iterations=cfg.calc_subblock_stats.get("benchmark_iterations", None),
+        runtime_stats_enabled=cfg.calc_subblock_stats.get("runtime_stats", {}).get(
+            "enabled", False
+        ),
         merge_with_existing_stats=cfg.calc_subblock_stats.merge_with_existing_stats,
         subblock_stats_filename=cfg.calc_subblock_stats.subblock_stats_filename,
         moe_stats_filename=cfg.calc_subblock_stats.moe_stats_filename,
@@ -276,9 +265,7 @@ def calculate_subblock_stats_for_puzzle_dir(
     num_active_tokens_override: int | None = None,
     prefill_queue_size: int = 0,  # it's an infery-llm thing
     allocate_prefill_query: bool = False,
-    benchmark_iterations: (
-        int | None
-    ) = None,  # If set then compute runtime performance statistics. TODO: recommend default value, is 1000 good?
+    runtime_stats_enabled: bool = False,  # Compute runtime statistics.
     merge_with_existing_stats: bool = False,
     subblock_stats_filename: str = "subblock_stats.json",
     moe_stats_filename: str = "moe_stats.json",
@@ -344,8 +331,8 @@
         if num_active_tokens_override is not None:
             prefill_seq_len = generation_seq_len = int(num_active_tokens_override / batch_size / 2)
 
-        curr_benchmark_iterations = (
-            benchmark_iterations if weights_dtype == torch.bfloat16 else None
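+        # NOTE: runtime benchmarking is only attempted for bf16 weights; the vLLM
+        # benchmark checkpoint built in calc_runtime_stats is saved in bf16.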
+        curr_runtime_stats_enabled = (
+            runtime_stats_enabled if weights_dtype == torch.bfloat16 else False
         )
 
         curr_subblock_stats = calculate_subblock_stats(
@@ -362,7 +349,7 @@
             n_embd=model_hidden_size,
             n_head=lm_config.num_attention_heads,
             vocab_size=lm_config.vocab_size,
-            benchmark_iterations=curr_benchmark_iterations,
+            runtime_stats_enabled=curr_runtime_stats_enabled,
             use_cuda_graph=True,
             weights_dtype=weights_dtype,
             activations_dtype=activations_dtype,
@@ -378,8 +365,6 @@
         subblock_stats.append(curr_subblock_stats)
 
-    # TODO fix: add_int8_runtime_estimates(subblock_stats)
-
     json_dump(subblock_stats, subblock_stats_file)
     mprint(subblock_stats_file)
@@ -503,65 +488,3 @@
         raise ValueError(f"_dataclass_from_dict: unrecognized {type(d)=} {d=}")
 
 
-def add_int8_runtime_estimates(subblock_stats: list[dict]) -> None:
-    for curr_subblock_stats in subblock_stats:
-        args = curr_subblock_stats["args"]
-        if args["weights_dtype"] == "torch.int8":
-            assert args["activations_dtype"] == "torch.int8"
-            ffn_factor = 0.5
-            attention_factor = 0.5 if args["kv_cache_dtype"] == "torch.int8" else 0.8
-
-            bf16_stats = _find_corresponding_bf16_stats(args, subblock_stats)
-            if bf16_stats is not None:
-                curr_subblocks = curr_subblock_stats["subblocks"] + [
-                    curr_subblock_stats["non_block"]
-                ]
-                bf16_subblocks = bf16_stats["subblocks"] + [bf16_stats["non_block"]]
-                for curr_subblock, bf16_subblock in zip(curr_subblocks, bf16_subblocks):
-                    assert curr_subblock.get("subblock_config", None) == bf16_subblock.get(
-                        "subblock_config", None
-                    )
-                    is_attention = False
-                    if (subblock_config := curr_subblock.get("subblock_config")) is not None:
-                        if hasattr(subblock_config, "__dataclass_fields__"):
-                            subblock_config = dataclasses.asdict(subblock_config)
-                        is_attention = subblock_config.get("num_key_value_heads", None) is not None
-                    runtime_factor = attention_factor if is_attention else ffn_factor
-                    for stat_name, stat_value in bf16_subblock.items():
-                        if "runtime" in stat_name:
-                            curr_subblock[stat_name] = stat_value * runtime_factor
-
-
-def _find_corresponding_bf16_stats(args: dict, subblock_stats: list[dict]) -> dict | None:
-    scenario_keys = [
-        "batch_size",
-        "prefill_seq_len",
-        "generation_seq_len",
-        "prefill_queue_size",
-        "gpu",
-        "n_embd",
-        "n_head",
-        "vocab_size",
-    ]
-    corresponding_bf16_args = {
-        **{k: v for k, v in args.items() if k in scenario_keys},
-        "is_calc_runtime": True,
-        "weights_dtype": "torch.bfloat16",
-        "activations_dtype": "torch.bfloat16",
-        "kv_cache_dtype": "torch.bfloat16",
-    }
-    matching_bf16_stats = [
-        stats
-        for stats in subblock_stats
-        if all(
-            [
-                stats["args"][key] == corresponding_bf16_args[key]
-                for key in corresponding_bf16_args.keys()
-            ]
-        )
-    ]
-    if len(matching_bf16_stats) == 0:
-        return None
-    if len(matching_bf16_stats) == 1:
-        return matching_bf16_stats[0]
-    raise ValueError(f"Found more than 1 matching bf16 stats for {args=}")