From 5b470e51e858e005406387526c233b5312aef2bd Mon Sep 17 00:00:00 2001
From: Siddharth Singh
Date: Fri, 20 Feb 2026 16:43:38 -0800
Subject: [PATCH 1/4] --inference-dynamic-batching-num-cuda-graphs -1 sets num
 cuda graphs automatically

---
 .../core/inference/batch_dimensions_utils.py | 23 ++++++++++++++++++-
 megatron/training/arguments.py               |  9 +++++++-
 2 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/megatron/core/inference/batch_dimensions_utils.py b/megatron/core/inference/batch_dimensions_utils.py
index 77354d59320..3ced9553ea0 100644
--- a/megatron/core/inference/batch_dimensions_utils.py
+++ b/megatron/core/inference/batch_dimensions_utils.py
@@ -235,6 +235,22 @@ def _calculate_cuda_graph_token_counts(
     (tp_size=2, num_cuda_graphs=4, cuda_graph_max_tokens=1000)
     [1000, 752, 504, 256]
     """
+    if num_cuda_graphs == -1:
+        # automatically determine the number of CUDA graphs to capture based on the `max_requests` value
+        cuda_graph_token_counts = [1, 2, 4] + list(range(8, 256, 8)) + list(
+            range(256, cuda_graph_max_tokens + 1, 16)
+        )
+        # Align each entry to TP size
+        cuda_graph_token_counts = list(dict.fromkeys(
+            math.ceil(s / tp_size) * tp_size for s in cuda_graph_token_counts
+        ))
+        # Clamp to max tokens
+        cuda_graph_token_counts = [s for s in cuda_graph_token_counts if s <= cuda_graph_max_tokens]
+        if not cuda_graph_token_counts or cuda_graph_token_counts[-1] != cuda_graph_max_tokens:
+            cuda_graph_token_counts.append(cuda_graph_max_tokens)
+        cuda_graph_token_counts.reverse()
+        return cuda_graph_token_counts
+
     assert num_cuda_graphs >= 1, f"num_cuda_graphs must be >= 1, got {num_cuda_graphs}"
     assert (
         cuda_graph_max_tokens > 0
@@ -340,7 +356,12 @@ def add_if_valid(token_count: int, prefill_req_count: int, decode_req_count: int
         or cuda_graph_max_tokens <= 0
     ):
         cuda_graph_max_tokens = max_tokens
-    num_cuda_graphs = min(max(num_cuda_graphs, 1), cuda_graph_max_tokens)
+
+    if num_cuda_graphs != -1:
+        # if -1, no need to adjust. This will ne taken care of in
+        # the _calculate_cuda_graph_token_counts function where we will generate
+        # the token counts based on the max_tokens value and the step size.
+        num_cuda_graphs = min(max(num_cuda_graphs, 1), cuda_graph_max_tokens)
 
     # Calculate token counts for prefill and mixed graphs.
     # These need the full cuda_graph_max_tokens to handle variable-length sequences.
diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py
index da9861d1b54..04cb2bb3979 100644
--- a/megatron/training/arguments.py
+++ b/megatron/training/arguments.py
@@ -857,6 +857,11 @@ def validate_args(args, defaults={}):
         assert args.fp8 is None, \
             "fp8 is not supported with inference dynamic batching and full_iteration CUDA graph"
 
+    if args.cuda_graph_impl == 'local':
+        assert args.inference_dynamic_batching_num_cuda_graphs > 0 or args.inference_dynamic_batching_num_cuda_graphs == -1, \
+            'inference_dynamic_batching_num_cuda_graphs should be a positive integer or -1. ' \
+            '-1 means that we will automatically determine the number of CUDA graphs to capture based on the `max_requests` value.'
+
     print_rank_0('using {} for parameters ...'.format(args.params_dtype))
 
     if args.dataloader_type is None:
@@ -1654,7 +1659,9 @@ def _add_inference_args(parser):
                        'cuda graph batch sizes range from 1 to `max_requests`. '
                        '(See `dynamic_context.py` for details on how '
                        '`max_requests` is computed). Due to rounding, the actual '
-                       'number of cuda graphs may not equal this argument.')
+                       'number of cuda graphs may not equal this argument. '
+                       'The user can also pass -1, in which case we automatically determine '
+                       'the number of cuda graphs to capture based on `max_requests`.')
     group.add_argument('--inference-dynamic-batching-track-paused-request-events',
                        action='store_true',
                        help='Track paused request ids by adding \'paused\' events '

From 9a4cb1962bcec2f2eb23dadb6ebe3d60bc022507 Mon Sep 17 00:00:00 2001
From: Siddharth Singh
Date: Fri, 20 Feb 2026 16:46:12 -0800
Subject: [PATCH 2/4] format

---
 megatron/core/inference/batch_dimensions_utils.py | 23 +++++++++++--------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/megatron/core/inference/batch_dimensions_utils.py b/megatron/core/inference/batch_dimensions_utils.py
index 3ced9553ea0..8ff73345acf 100644
--- a/megatron/core/inference/batch_dimensions_utils.py
+++ b/megatron/core/inference/batch_dimensions_utils.py
@@ -236,16 +236,19 @@ def _calculate_cuda_graph_token_counts(
     [1000, 752, 504, 256]
     """
     if num_cuda_graphs == -1:
-        # automatically determine the number of CUDA graphs to capture based on the `max_requests` value
-        cuda_graph_token_counts = [1, 2, 4] + list(range(8, 256, 8)) + list(
-            range(256, cuda_graph_max_tokens + 1, 16)
+        # automatically determine the number of CUDA graphs to
+        # capture based on the `max_requests` value
+        cuda_graph_token_counts = (
+            [1, 2, 4] + list(range(8, 256, 8)) + list(range(256, cuda_graph_max_tokens + 1, 16))
         )
         # Align each entry to TP size
-        cuda_graph_token_counts = list(dict.fromkeys(
-            math.ceil(s / tp_size) * tp_size for s in cuda_graph_token_counts
-        ))
+        cuda_graph_token_counts = list(
+            dict.fromkeys(math.ceil(s / tp_size) * tp_size for s in cuda_graph_token_counts)
+        )
         # Clamp to max tokens
-        cuda_graph_token_counts = [s for s in cuda_graph_token_counts if s <= cuda_graph_max_tokens]
+        cuda_graph_token_counts = [
+            s for s in cuda_graph_token_counts if s <= cuda_graph_max_tokens
+        ]
         if not cuda_graph_token_counts or cuda_graph_token_counts[-1] != cuda_graph_max_tokens:
             cuda_graph_token_counts.append(cuda_graph_max_tokens)
         cuda_graph_token_counts.reverse()
         return cuda_graph_token_counts
@@ -356,10 +359,10 @@ def add_if_valid(token_count: int, prefill_req_count: int, decode_req_count: int
         or cuda_graph_max_tokens <= 0
     ):
         cuda_graph_max_tokens = max_tokens
-
+
     if num_cuda_graphs != -1:
-        # if -1, no need to adjust. This will ne taken care of in
-        # the _calculate_cuda_graph_token_counts function where we will generate
+        # if -1, no need to adjust. This will ne taken care of in
+        # the _calculate_cuda_graph_token_counts function where we will generate
         # the token counts based on the max_tokens value and the step size.
         num_cuda_graphs = min(max(num_cuda_graphs, 1), cuda_graph_max_tokens)

From 5e5d999681f6ef9ad54294d74c4fe9ad1c457088 Mon Sep 17 00:00:00 2001
From: Siddharth Singh
Date: Fri, 20 Feb 2026 16:58:02 -0800
Subject: [PATCH 3/4] format

---
 megatron/core/inference/batch_dimensions_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/megatron/core/inference/batch_dimensions_utils.py b/megatron/core/inference/batch_dimensions_utils.py
index 8ff73345acf..1a202c35af5 100644
--- a/megatron/core/inference/batch_dimensions_utils.py
+++ b/megatron/core/inference/batch_dimensions_utils.py
@@ -236,7 +236,7 @@ def _calculate_cuda_graph_token_counts(
     [1000, 752, 504, 256]
     """
     if num_cuda_graphs == -1:
-        # automatically determine the number of CUDA graphs to
+        # automatically determine the number of CUDA graphs to
         # capture based on the `max_requests` value
         cuda_graph_token_counts = (
             [1, 2, 4] + list(range(8, 256, 8)) + list(range(256, cuda_graph_max_tokens + 1, 16))
@@ -361,7 +361,7 @@ def add_if_valid(token_count: int, prefill_req_count: int, decode_req_count: int
         cuda_graph_max_tokens = max_tokens
 
     if num_cuda_graphs != -1:
-        # if -1, no need to adjust. This will ne taken care of in
+        # if -1, no need to adjust. This will be taken care of in
         # the _calculate_cuda_graph_token_counts function where we will generate
         # the token counts based on the max_tokens value and the step size.
         num_cuda_graphs = min(max(num_cuda_graphs, 1), cuda_graph_max_tokens)

From c5e2abb57640ea98516d2a748731c22d58d1e339 Mon Sep 17 00:00:00 2001
From: Siddharth Singh
Date: Mon, 23 Feb 2026 13:25:28 -0800
Subject: [PATCH 4/4] add unit test

---
 tests/unit_tests/inference/engines/test_dynamic_engine.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/unit_tests/inference/engines/test_dynamic_engine.py b/tests/unit_tests/inference/engines/test_dynamic_engine.py
index 02be5c136fd..e679b5d7c64 100644
--- a/tests/unit_tests/inference/engines/test_dynamic_engine.py
+++ b/tests/unit_tests/inference/engines/test_dynamic_engine.py
@@ -543,7 +543,7 @@ def teardown_method(self, method):
         not is_fa_min_version("2.7.3"), reason="need latest flash attn for dynamic batching"
     )
     @pytest.mark.parametrize("model_provider", ["gpt", "mamba"])
-    @pytest.mark.parametrize("num_cuda_graphs", [None, 1, 4])
+    @pytest.mark.parametrize("num_cuda_graphs", [None, 1, 4, -1])
     @pytest.mark.parametrize("cuda_graph_scope", [[], [CudaGraphScope.full_iteration_inference]])
     def test_simple(self, model_provider, num_cuda_graphs, cuda_graph_scope) -> None:
         """Simple test that runs without errors, and validates output."""
@@ -557,6 +557,7 @@ def test_simple(self, model_provider, num_cuda_graphs, cuda_graph_scope) -> None
             num_cuda_graphs=num_cuda_graphs,
             cuda_graph_scope=cuda_graph_scope,
             force_build_cuda_graphs=True,
+            context_max_requests=128,
         )
 
         # Validate max_requests, max_tokens.
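
Note for reviewers (not part of the series): the standalone sketch below mirrors the bucketing that the new `-1` path in `_calculate_cuda_graph_token_counts` performs, so the resulting graph ladder can be inspected without running the engine. The helper name `auto_token_counts` is invented here, and the sample values (`cuda_graph_max_tokens=1000`, `tp_size=2`) are borrowed from the function's docstring example.

    import math

    def auto_token_counts(cuda_graph_max_tokens, tp_size):
        # Dense steps of 8 below 256 tokens, coarser steps of 16 up to the cap.
        counts = [1, 2, 4] + list(range(8, 256, 8)) + list(
            range(256, cuda_graph_max_tokens + 1, 16)
        )
        # Round each bucket up to a multiple of the TP size; dict.fromkeys
        # drops duplicates while preserving order.
        counts = list(dict.fromkeys(math.ceil(s / tp_size) * tp_size for s in counts))
        # Discard buckets above the cap, then make sure the cap itself is covered.
        counts = [s for s in counts if s <= cuda_graph_max_tokens]
        if not counts or counts[-1] != cuda_graph_max_tokens:
            counts.append(cuda_graph_max_tokens)
        counts.reverse()  # largest first, matching the docstring example's order
        return counts

    ladder = auto_token_counts(cuda_graph_max_tokens=1000, tp_size=2)
    print(len(ladder), ladder[:4], ladder[-4:])
    # 81 [1000, 992, 976, 960] [16, 8, 4, 2]

With these inputs the ladder contains 81 token counts, far more graphs than a typical fixed `num_cuda_graphs`, so `-1` appears to trade additional capture time and memory for much finer-grained batch-size coverage.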