From 5b470e51e858e005406387526c233b5312aef2bd Mon Sep 17 00:00:00 2001
From: Siddharth Singh
Date: Fri, 20 Feb 2026 16:43:38 -0800
Subject: [PATCH 1/4] --inference-dynamic-batching-num-cuda-graphs -1 sets num
 cuda graphs automatically

---
 .../core/inference/batch_dimensions_utils.py | 23 ++++++++++++++++++-
 megatron/training/arguments.py               |  9 +++++++-
 2 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/megatron/core/inference/batch_dimensions_utils.py b/megatron/core/inference/batch_dimensions_utils.py
index 77354d59320..3ced9553ea0 100644
--- a/megatron/core/inference/batch_dimensions_utils.py
+++ b/megatron/core/inference/batch_dimensions_utils.py
@@ -235,6 +235,22 @@ def _calculate_cuda_graph_token_counts(
     (tp_size=2, num_cuda_graphs=4, cuda_graph_max_tokens=1000)
     [1000, 752, 504, 256]
     """
+    if num_cuda_graphs == -1:
+        # automatically determine the number of CUDA graphs to capture based on the `max_requests` value
+        cuda_graph_token_counts = [1, 2, 4] + list(range(8, 256, 8)) + list(
+            range(256, cuda_graph_max_tokens + 1, 16)
+        )
+        # Align each entry to TP size
+        cuda_graph_token_counts = list(dict.fromkeys(
+            math.ceil(s / tp_size) * tp_size for s in cuda_graph_token_counts
+        ))
+        # Clamp to max tokens
+        cuda_graph_token_counts = [s for s in cuda_graph_token_counts if s <= cuda_graph_max_tokens]
+        if not cuda_graph_token_counts or cuda_graph_token_counts[-1] != cuda_graph_max_tokens:
+            cuda_graph_token_counts.append(cuda_graph_max_tokens)
+        cuda_graph_token_counts.reverse()
+        return cuda_graph_token_counts
+
     assert num_cuda_graphs >= 1, f"num_cuda_graphs must be >= 1, got {num_cuda_graphs}"
     assert (
         cuda_graph_max_tokens > 0
@@ -340,7 +356,12 @@ def add_if_valid(token_count: int, prefill_req_count: int, decode_req_count: int
         or cuda_graph_max_tokens <= 0
     ):
         cuda_graph_max_tokens = max_tokens
-    num_cuda_graphs = min(max(num_cuda_graphs, 1), cuda_graph_max_tokens)
+
+    if num_cuda_graphs != -1:
+        # if -1, no need to adjust. This will ne taken care of in
+        # the _calculate_cuda_graph_token_counts function where we will generate
+        # the token counts based on the max_tokens value and the step size.
+        num_cuda_graphs = min(max(num_cuda_graphs, 1), cuda_graph_max_tokens)
 
     # Calculate token counts for prefill and mixed graphs.
     # These need the full cuda_graph_max_tokens to handle variable-length sequences.
diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py
index da9861d1b54..04cb2bb3979 100644
--- a/megatron/training/arguments.py
+++ b/megatron/training/arguments.py
@@ -857,6 +857,11 @@ def validate_args(args, defaults={}):
         assert args.fp8 is None, \
             "fp8 is not supported with inference dynamic batching and full_iteration CUDA graph"
 
+    if args.cuda_graph_impl == 'local':
+        assert args.inference_dynamic_batching_num_cuda_graphs > 0 or args.inference_dynamic_batching_num_cuda_graphs == -1, \
+            'inference_dynamic_batching_num_cuda_graphs should be a positive integer or -1. ' \
+            '-1 means that we will automatically determine the number of CUDA graphs to capture based on the `max_requests` value.'
+
     print_rank_0('using {} for parameters ...'.format(args.params_dtype))
 
     if args.dataloader_type is None:
@@ -1654,7 +1659,9 @@ def _add_inference_args(parser):
                        'cuda graph batch sizes range from 1 to `max_requests`. '
                        '(See `dynamic_context.py` for details on how '
                        '`max_requests` is computed). Due to rounding, the actual '
-                       'number of cuda graphs may not equal this argument.')
+                       'number of cuda graphs may not equal this argument. '
+                       'The user can also pass -1, in which case we automatically determine '
+                       'the number of cuda graphs to capture based on `max_requests`.')
     group.add_argument('--inference-dynamic-batching-track-paused-request-events',
                        action='store_true',
                        help='Track paused request ids by adding \'paused\' events '

From 9a4cb1962bcec2f2eb23dadb6ebe3d60bc022507 Mon Sep 17 00:00:00 2001
From: Siddharth Singh
Date: Fri, 20 Feb 2026 16:46:12 -0800
Subject: [PATCH 2/4] format

---
 megatron/core/inference/batch_dimensions_utils.py | 23 +++++++++++--------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/megatron/core/inference/batch_dimensions_utils.py b/megatron/core/inference/batch_dimensions_utils.py
index 3ced9553ea0..8ff73345acf 100644
--- a/megatron/core/inference/batch_dimensions_utils.py
+++ b/megatron/core/inference/batch_dimensions_utils.py
@@ -236,16 +236,19 @@ def _calculate_cuda_graph_token_counts(
     [1000, 752, 504, 256]
     """
     if num_cuda_graphs == -1:
-        # automatically determine the number of CUDA graphs to capture based on the `max_requests` value
-        cuda_graph_token_counts = [1, 2, 4] + list(range(8, 256, 8)) + list(
-            range(256, cuda_graph_max_tokens + 1, 16)
+        # automatically determine the number of CUDA graphs to
+        # capture based on the `max_requests` value
+        cuda_graph_token_counts = (
+            [1, 2, 4] + list(range(8, 256, 8)) + list(range(256, cuda_graph_max_tokens + 1, 16))
         )
         # Align each entry to TP size
-        cuda_graph_token_counts = list(dict.fromkeys(
-            math.ceil(s / tp_size) * tp_size for s in cuda_graph_token_counts
-        ))
+        cuda_graph_token_counts = list(
+            dict.fromkeys(math.ceil(s / tp_size) * tp_size for s in cuda_graph_token_counts)
+        )
         # Clamp to max tokens
-        cuda_graph_token_counts = [s for s in cuda_graph_token_counts if s <= cuda_graph_max_tokens]
+        cuda_graph_token_counts = [
+            s for s in cuda_graph_token_counts if s <= cuda_graph_max_tokens
+        ]
         if not cuda_graph_token_counts or cuda_graph_token_counts[-1] != cuda_graph_max_tokens:
             cuda_graph_token_counts.append(cuda_graph_max_tokens)
         cuda_graph_token_counts.reverse()
         return cuda_graph_token_counts
@@ -356,10 +359,10 @@ def add_if_valid(token_count: int, prefill_req_count: int, decode_req_count: int
         or cuda_graph_max_tokens <= 0
     ):
         cuda_graph_max_tokens = max_tokens
-
+
     if num_cuda_graphs != -1:
-        # if -1, no need to adjust. This will ne taken care of in
-        # the _calculate_cuda_graph_token_counts function where we will generate
+        # if -1, no need to adjust. This will ne taken care of in
+        # the _calculate_cuda_graph_token_counts function where we will generate
         # the token counts based on the max_tokens value and the step size.
         num_cuda_graphs = min(max(num_cuda_graphs, 1), cuda_graph_max_tokens)

From 5e5d999681f6ef9ad54294d74c4fe9ad1c457088 Mon Sep 17 00:00:00 2001
From: Siddharth Singh
Date: Fri, 20 Feb 2026 16:58:02 -0800
Subject: [PATCH 3/4] format

---
 megatron/core/inference/batch_dimensions_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/megatron/core/inference/batch_dimensions_utils.py b/megatron/core/inference/batch_dimensions_utils.py
index 8ff73345acf..1a202c35af5 100644
--- a/megatron/core/inference/batch_dimensions_utils.py
+++ b/megatron/core/inference/batch_dimensions_utils.py
@@ -236,7 +236,7 @@ def _calculate_cuda_graph_token_counts(
     [1000, 752, 504, 256]
     """
     if num_cuda_graphs == -1:
-        # automatically determine the number of CUDA graphs to
+        # automatically determine the number of CUDA graphs to
         # capture based on the `max_requests` value
         cuda_graph_token_counts = (
             [1, 2, 4] + list(range(8, 256, 8)) + list(range(256, cuda_graph_max_tokens + 1, 16))
@@ -361,7 +361,7 @@ def add_if_valid(token_count: int, prefill_req_count: int, decode_req_count: int
         cuda_graph_max_tokens = max_tokens
 
     if num_cuda_graphs != -1:
-        # if -1, no need to adjust. This will ne taken care of in
+        # if -1, no need to adjust. This will be taken care of in
         # the _calculate_cuda_graph_token_counts function where we will generate
         # the token counts based on the max_tokens value and the step size.
         num_cuda_graphs = min(max(num_cuda_graphs, 1), cuda_graph_max_tokens)

From c5e2abb57640ea98516d2a748731c22d58d1e339 Mon Sep 17 00:00:00 2001
From: Siddharth Singh
Date: Mon, 23 Feb 2026 13:25:28 -0800
Subject: [PATCH 4/4] add unit test

---
 tests/unit_tests/inference/engines/test_dynamic_engine.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/unit_tests/inference/engines/test_dynamic_engine.py b/tests/unit_tests/inference/engines/test_dynamic_engine.py
index 02be5c136fd..e679b5d7c64 100644
--- a/tests/unit_tests/inference/engines/test_dynamic_engine.py
+++ b/tests/unit_tests/inference/engines/test_dynamic_engine.py
@@ -543,7 +543,7 @@ def teardown_method(self, method):
         not is_fa_min_version("2.7.3"), reason="need latest flash attn for dynamic batching"
     )
     @pytest.mark.parametrize("model_provider", ["gpt", "mamba"])
-    @pytest.mark.parametrize("num_cuda_graphs", [None, 1, 4])
+    @pytest.mark.parametrize("num_cuda_graphs", [None, 1, 4, -1])
     @pytest.mark.parametrize("cuda_graph_scope", [[], [CudaGraphScope.full_iteration_inference]])
     def test_simple(self, model_provider, num_cuda_graphs, cuda_graph_scope) -> None:
         """Simple test that runs without errors, and validates output."""
@@ -557,6 +557,7 @@ def test_simple(self, model_provider, num_cuda_graphs, cuda_graph_scope) -> None
             num_cuda_graphs=num_cuda_graphs,
             cuda_graph_scope=cuda_graph_scope,
             force_build_cuda_graphs=True,
+            context_max_requests=128,
         )
 
         # Validate max_requests, max_tokens.
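
Note for reviewers (not part of the series): the standalone sketch below mirrors the bucketing that the new `-1` path in `_calculate_cuda_graph_token_counts` performs, so the resulting graph ladder can be inspected without running the engine. The helper name `auto_token_counts` is invented here, and the sample values (`cuda_graph_max_tokens=1000`, `tp_size=2`) are borrowed from the function's docstring example.

    import math

    def auto_token_counts(cuda_graph_max_tokens, tp_size):
        # Dense steps of 8 below 256 tokens, coarser steps of 16 up to the cap.
        counts = [1, 2, 4] + list(range(8, 256, 8)) + list(
            range(256, cuda_graph_max_tokens + 1, 16)
        )
        # Round each bucket up to a multiple of the TP size; dict.fromkeys
        # drops duplicates while preserving order.
        counts = list(dict.fromkeys(math.ceil(s / tp_size) * tp_size for s in counts))
        # Discard buckets above the cap, then make sure the cap itself is covered.
        counts = [s for s in counts if s <= cuda_graph_max_tokens]
        if not counts or counts[-1] != cuda_graph_max_tokens:
            counts.append(cuda_graph_max_tokens)
        counts.reverse()  # largest first, matching the docstring example's order
        return counts

    ladder = auto_token_counts(cuda_graph_max_tokens=1000, tp_size=2)
    print(len(ladder), ladder[:4], ladder[-4:])
    # 81 [1000, 992, 976, 960] [16, 8, 4, 2]

With these inputs the ladder contains 81 token counts, far more graphs than a typical fixed `num_cuda_graphs`, so `-1` appears to trade additional capture time and memory for much finer-grained batch-size coverage.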