65 changes: 40 additions & 25 deletions megatron/core/inference/batch_dimensions_utils.py
@@ -210,7 +210,7 @@ class CUDAGraphBatchDimensionBuilder:
"""

# Constant for rounding token counts when generating CUDA graph batch dimensions
-    CUDA_GRAPH_ROUNDER = 8
+    CUDA_GRAPH_ROUNDER = 2

@staticmethod
def _calculate_cuda_graph_token_counts(
@@ -219,8 +219,9 @@ def _calculate_cuda_graph_token_counts(
"""
Calculate CUDA graph token counts for a given configuration.

-        This method computes evenly-spaced token counts from step_size up to
-        cuda_graph_max_tokens, ensuring proper rounding and TP alignment.
+        This method computes exponentially decreasing token counts (powers of 2)
+        from cuda_graph_max_tokens down to CUDA_GRAPH_ROUNDER, ensuring proper
+        rounding and TP alignment.

Args:
tp_size: Tensor parallel size (for alignment)
@@ -232,38 +233,52 @@ def _calculate_cuda_graph_token_counts(

Example:
>>> _calculate_cuda_graph_token_counts
-            (tp_size=2, num_cuda_graphs=4, cuda_graph_max_tokens=1000)
-            [1000, 752, 504, 256]
+            (tp_size=1, num_cuda_graphs=8, cuda_graph_max_tokens=128)
+            [128, 64, 32, 16, 8, 4, 2, 1]
"""
assert num_cuda_graphs >= 1, f"num_cuda_graphs must be >= 1, got {num_cuda_graphs}"
assert (
cuda_graph_max_tokens > 0
), f"cuda_graph_max_tokens must be > 0, got {cuda_graph_max_tokens}"

-        # Cuda graph step size.
-        cuda_graph_step_size = cuda_graph_max_tokens / num_cuda_graphs
-        cuda_graph_step_size = CUDAGraphBatchDimensionBuilder.CUDA_GRAPH_ROUNDER * int(
-            math.ceil(int(cuda_graph_step_size) / CUDAGraphBatchDimensionBuilder.CUDA_GRAPH_ROUNDER)
-        )
-        # Make sure divisible by TP size
-        cuda_graph_step_size = math.ceil(cuda_graph_step_size / tp_size) * tp_size
+        rounder = CUDAGraphBatchDimensionBuilder.CUDA_GRAPH_ROUNDER

-        # round down cuda graph max tokens to be multiple of TP size
+        # Round down cuda graph max tokens to be multiple of TP size
cuda_graph_max_tokens = (cuda_graph_max_tokens // tp_size) * tp_size

# Cuda graph token counts.
if num_cuda_graphs == 1:
-            cuda_graph_token_counts = [cuda_graph_max_tokens]
-        else:
-            cuda_graph_token_counts = list(
-                range(cuda_graph_step_size, cuda_graph_max_tokens, cuda_graph_step_size)
-            )
-            if (
-                len(cuda_graph_token_counts) == 0
-                or cuda_graph_token_counts[-1] != cuda_graph_max_tokens
-            ):
-                cuda_graph_token_counts.append(cuda_graph_max_tokens)
-            cuda_graph_token_counts.reverse()
+            return [cuda_graph_max_tokens]

+        # Exponentially decreasing, stops after num_cuda_graphs entries
+        # or when below the minimum size.
+        # TODO(helenn/lmcafee): Extend upper range of distribution to be linearly-spaced.
+        cuda_graph_token_counts = []

Comment thread:

Contributor: I also vote to leave the linear-spaced CGs as an option; there's no harm in doing so since the code is already set up, and we can just default to exponential in the arguments. One reason for keeping this setting and code is that vLLM uses linear spacing; they just create a ton more graphs than we do because they can create them so quickly and efficiently, and I think that speaks to how unoptimized our CG system is. So I would personally keep the old option and plan to use it in the future.

Contributor Author (@mathemakitten, Feb 23, 2026): I would be against adding a new flag to toggle the distribution of inference CUDA graphs. Inference already has a lot of flags, users don't know how to combine them effectively, and there is no empirical case for keeping the linear distribution around at the moment. I will leave a TODO to re-enable it when someone wants to take it on.

Contributor Author: Also, #3527 already implements the vLLM strategy, orthogonal to this.

Contributor: @mathemakitten, on second thought I am also in favor of slowly phasing out the older strategy, mostly because it's the most stress-tested one we have right now. We could make yours the default, while keeping the option to fall back onto it.

Contributor: Re: #3527, it does use a linear function, but it builds a lot more CUDA graphs compared to our default strategy.
+        val = cuda_graph_max_tokens
+        for _ in range(num_cuda_graphs):
+            # Round down to multiple of rounder, then up to multiple of TP size
+            rounded = max(rounder, (val // rounder) * rounder)
+            rounded = math.ceil(rounded / tp_size) * tp_size
+            if rounded not in cuda_graph_token_counts:
+                cuda_graph_token_counts.append(rounded)
+            val //= 2
+            if val < 1:
+                break

Comment thread:

Contributor: The old code guaranteed that cuda_graph_max_tokens is in the list, but now we don't have that guarantee anymore. Do we care about this, e.g., if someone wants to set the max cuda graph size very strictly?

Contributor Author: I will bring this back and sub out a middle graph for it.

+        # Ensure cuda_graph_max_tokens is always included
+        if cuda_graph_token_counts[0] != cuda_graph_max_tokens:
+            cuda_graph_token_counts.insert(0, cuda_graph_max_tokens)
+
+        # Include a (possibly extra) size-1 graph
+        if cuda_graph_token_counts[-1] != tp_size:
+            cuda_graph_token_counts.append(tp_size)
+
+        # Trim from the middle if we exceed num_cuda_graphs requested by the user
+        # Since num_cuda_graphs >= 1, this only runs when we have at least 2 elements.
+        while len(cuda_graph_token_counts) > num_cuda_graphs:
+            cuda_graph_token_counts.pop(-2)

Comment thread:

Contributor: Does this line need to be

    while len(cuda_graph_token_counts) > num_cuda_graphs and len(cuda_graph_token_counts) >= 2

Otherwise pop(-2) might give an index error or, even worse, silently wrap around?

Contributor Author: We have a guarantee that num_cuda_graphs >= 1 at the top of the block, and we also check while len(cuda_graph_token_counts) > num_cuda_graphs, so we are already guaranteed that len(cuda_graph_token_counts) >= 2 when this block runs.
Comment on lines +277 to +278:

Contributor: Can you add these lines afterwards:

    assert len(cuda_graph_token_counts) == num_cuda_graphs
    assert cuda_graph_max_tokens in num_cuda_graphs

+
+        assert len(cuda_graph_token_counts) <= num_cuda_graphs
+        assert cuda_graph_max_tokens in cuda_graph_token_counts
+
        return cuda_graph_token_counts

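For readers skimming the diff, here is a minimal standalone sketch of the schedule the new code produces. The helper name token_count_ladder and the free-function form are illustrative only and not part of the PR; the body simply mirrors the halving logic shown above.

import math

def token_count_ladder(tp_size, num_cuda_graphs, cuda_graph_max_tokens, rounder=2):
    """Illustrative copy of the halving schedule: start at the TP-aligned max
    token count, halve until reaching the rounder, then pad/trim to the budget."""
    max_tokens = (cuda_graph_max_tokens // tp_size) * tp_size
    if num_cuda_graphs == 1:
        return [max_tokens]
    counts, val = [], max_tokens
    for _ in range(num_cuda_graphs):
        rounded = max(rounder, (val // rounder) * rounder)  # round down to rounder
        rounded = math.ceil(rounded / tp_size) * tp_size    # round up to TP multiple
        if rounded not in counts:
            counts.append(rounded)
        val //= 2
        if val < 1:
            break
    if counts[0] != max_tokens:   # always keep the max-size graph
        counts.insert(0, max_tokens)
    if counts[-1] != tp_size:     # always keep a smallest (tp_size) graph
        counts.append(tp_size)
    while len(counts) > num_cuda_graphs:
        counts.pop(-2)            # trim middle sizes first
    return counts

print(token_count_ladder(tp_size=1, num_cuda_graphs=8, cuda_graph_max_tokens=80))
# [80, 40, 20, 10, 4, 2, 1]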
44 changes: 39 additions & 5 deletions tests/unit_tests/inference/engines/test_dynamic_engine.py
@@ -15,6 +15,7 @@
from transformer_engine.pytorch.fp8 import check_fp8_support

from megatron.core import parallel_state
+from megatron.core.inference.batch_dimensions_utils import CUDAGraphBatchDimensionBuilder
from megatron.core.inference.config import (
    InferenceConfig,
    KVCacheManagementMode,
@@ -706,11 +707,11 @@ def test_cuda_graph_token_counts(self) -> None:
for num_cuda_graphs, expected_cuda_graph_token_counts in [
(0, [80]),
(1, [80]),
-            (2, [80, 40]),
-            (4, [80, 72, 48, 24]),
-            (8, [80, 64, 48, 32, 16]),
-            (16, [80, 72, 64, 56, 48, 40, 32, 24, 16, 8]),
-            (32, [80, 72, 64, 56, 48, 40, 32, 24, 16, 8]),
+            (2, [80, 1]),
+            (4, [80, 40, 20, 1]),
+            (8, [80, 40, 20, 10, 4, 2, 1]),
+            (16, [80, 40, 20, 10, 4, 2, 1]),
+            (32, [80, 40, 20, 10, 4, 2, 1]),
]:

# Build cuda graphs (inside dynamic engine).
actual_cuda_graph_token_counts,
)

@pytest.mark.internal
@pytest.mark.parametrize(
"tp_size, num_cuda_graphs, cuda_graph_max_tokens, expected",
[
# TP=1
(1, 1, 80, [80]),
(1, 2, 80, [80, 1]),
(1, 4, 80, [80, 40, 20, 1]),
(1, 8, 80, [80, 40, 20, 10, 4, 2, 1]),
(1, 16, 80, [80, 40, 20, 10, 4, 2, 1]),
# TP=2
(2, 1, 80, [80]),
(2, 2, 80, [80, 2]),
(2, 4, 80, [80, 40, 20, 2]),
(2, 8, 80, [80, 40, 20, 10, 4, 2]),
(2, 16, 80, [80, 40, 20, 10, 4, 2]),
],
)
def test_calculate_cuda_graph_token_counts(
self, tp_size, num_cuda_graphs, cuda_graph_max_tokens, expected
):
"""Test _calculate_cuda_graph_token_counts for various TP sizes."""
actual = CUDAGraphBatchDimensionBuilder._calculate_cuda_graph_token_counts(
tp_size=tp_size,
num_cuda_graphs=num_cuda_graphs,
cuda_graph_max_tokens=cuda_graph_max_tokens,
)
assert actual == expected, (
f"tp_size={tp_size}, num_cuda_graphs={num_cuda_graphs}, "
f"cuda_graph_max_tokens={cuda_graph_max_tokens}: "
f"expected {expected}, got {actual}"
)

@pytest.mark.internal
@pytest.mark.skipif(
not is_fa_min_version("2.7.3"), reason="need latest flash attn for dynamic batching"
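Assuming a standard pytest setup for the repository (and that the internal marker is enabled in your local configuration), the new parametrized test can be exercised on its own with:

pytest tests/unit_tests/inference/engines/test_dynamic_engine.py -k test_calculate_cuda_graph_token_counts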