From 95d225ff5261d82d02778aa867c8258e11c850e9 Mon Sep 17 00:00:00 2001 From: Liqiang Lu Date: Mon, 9 Feb 2026 07:15:28 -0800 Subject: [PATCH 01/10] limit bcast aliasing to io tensors --- csrc/alias_analysis.cpp | 12 +++++++++++- tests/cpp/test_persistent_buffer.cpp | 7 +++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/csrc/alias_analysis.cpp b/csrc/alias_analysis.cpp index 81c5cb3cabf..7b1e54095ba 100644 --- a/csrc/alias_analysis.cpp +++ b/csrc/alias_analysis.cpp @@ -288,12 +288,22 @@ void AliasFinder::handle(const SliceOp* slice) { out, in, Layout(std::move(out_allocation), std::move(out_contiguity))); } +// Only consider broadcast aliasing when input is a fusion input and output is +// a fusion output. Intermediate broadcasts will be fused with other ops and +// don't need explicit alias handling. Limiting to fusion boundaries avoids +// unnecessary allocation domain changes on intermediate tensors, which may +// trigger transpose scheduler when pointwise is preferred. For example, when a +// normalization kernel is segmented, we prefer reduction + pointwise instead of +// reduction + transpose. See SmemPersistentNotSupportedIn3DReduction. void AliasFinder::handle(const BroadcastOp* bcast) { auto* in = dynamic_cast(bcast->in()); - if (in == nullptr) { + if (in == nullptr || !in->isFusionInput()) { return; } auto* out = bcast->out()->as(); + if (!out->isFusionOutput()) { + return; + } std::optional out_layout = mapInLayoutToOutRoot(analysis_.preferredLayout(in), in, out); diff --git a/tests/cpp/test_persistent_buffer.cpp b/tests/cpp/test_persistent_buffer.cpp index da9c5c906dd..9a0923b3f11 100644 --- a/tests/cpp/test_persistent_buffer.cpp +++ b/tests/cpp/test_persistent_buffer.cpp @@ -1257,6 +1257,13 @@ TEST_F(PersistentBufferTest, SmemPersistentNotSupportedIn3DReduction) { // persistent is not supported yet for 3D reduction. 
EXPECT_TRUE(executor_cache.getMostRecentKernelRuntime()->isSegmented()); + // expect reduction and pointwise scheduler + EXPECT_THAT( + executor_cache.getMostRecentKernelRuntime()->fusionSegments()->groups(), + UnorderedElementsAre( + HeuristicIs(SchedulerType::PointWise), + HeuristicIs(SchedulerType::Reduction))); + testValidate(executor_cache.fusion(), cg_outputs, {t0}, __LINE__, __FILE__); } From 2663513a1c7391d16e39b25799a251abc4bcf64e Mon Sep 17 00:00:00 2001 From: Liqiang Lu Date: Tue, 10 Feb 2026 06:27:48 -0800 Subject: [PATCH 02/10] revise --- csrc/alias_analysis.cpp | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/csrc/alias_analysis.cpp b/csrc/alias_analysis.cpp index 7b1e54095ba..1cf3221a43b 100644 --- a/csrc/alias_analysis.cpp +++ b/csrc/alias_analysis.cpp @@ -288,20 +288,22 @@ void AliasFinder::handle(const SliceOp* slice) { out, in, Layout(std::move(out_allocation), std::move(out_contiguity))); } -// Only consider broadcast aliasing when input is a fusion input and output is -// a fusion output. Intermediate broadcasts will be fused with other ops and -// don't need explicit alias handling. Limiting to fusion boundaries avoids -// unnecessary allocation domain changes on intermediate tensors, which may -// trigger transpose scheduler when pointwise is preferred. For example, when a -// normalization kernel is segmented, we prefer reduction + pointwise instead of -// reduction + transpose. See SmemPersistentNotSupportedIn3DReduction. +// Only consider broadcast aliasing when IO tensor is involved. +// Intermediate broadcasts will be fused with other ops and don't need explicit +// alias handling. Limiting to fusion boundaries avoids unnecessary allocation +// domain changes on intermediate tensors, which may trigger transpose scheduler +// when pointwise is preferred. For example, when a normalization kernel is +// segmented, we prefer reduction + pointwise instead of reduction + transpose. 
+// See SmemPersistentNotSupportedIn3DReduction. void AliasFinder::handle(const BroadcastOp* bcast) { auto* in = dynamic_cast(bcast->in()); - if (in == nullptr || !in->isFusionInput()) { + if (in == nullptr) { return; } auto* out = bcast->out()->as(); - if (!out->isFusionOutput()) { + + // No alias analysis needed if no IO tensors are involved + if (!out->isFusionOutput() && !in->isFusionInput()) { return; } From 61c70c18b46c419eeaa7117d8a6b22d2f5356244 Mon Sep 17 00:00:00 2001 From: Liqiang Lu Date: Tue, 10 Feb 2026 12:54:51 -0800 Subject: [PATCH 03/10] only skip reduction+bcast --- csrc/alias_analysis.cpp | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/csrc/alias_analysis.cpp b/csrc/alias_analysis.cpp index 1cf3221a43b..d278a052d6a 100644 --- a/csrc/alias_analysis.cpp +++ b/csrc/alias_analysis.cpp @@ -7,6 +7,7 @@ // clang-format on #include #include +#include "ir/internal_nodes.h" #include #include @@ -288,25 +289,20 @@ void AliasFinder::handle(const SliceOp* slice) { out, in, Layout(std::move(out_allocation), std::move(out_contiguity))); } -// Only consider broadcast aliasing when IO tensor is involved. -// Intermediate broadcasts will be fused with other ops and don't need explicit -// alias handling. Limiting to fusion boundaries avoids unnecessary allocation -// domain changes on intermediate tensors, which may trigger transpose scheduler -// when pointwise is preferred. For example, when a normalization kernel is -// segmented, we prefer reduction + pointwise instead of reduction + transpose. -// See SmemPersistentNotSupportedIn3DReduction. void AliasFinder::handle(const BroadcastOp* bcast) { auto* in = dynamic_cast(bcast->in()); if (in == nullptr) { return; } - auto* out = bcast->out()->as(); - - // No alias analysis needed if no IO tensors are involved - if (!out->isFusionOutput() && !in->isFusionInput()) { + // Skip reduction + broadcast to avoid unnecessary allocation-domain changes + // on broadcast tensors. 
For example, when a normalization kernel is + // segmented, prefer reduction + pointwise over reduction + transpose. + // See SmemPersistentNotSupportedIn3DReduction. + if (in->definition() && in->definition()->isA()) { return; } + auto* out = bcast->out()->as(); std::optional out_layout = mapInLayoutToOutRoot(analysis_.preferredLayout(in), in, out); if (!out_layout.has_value()) { From 1766073387ed0778701320f86acfae5405755234 Mon Sep 17 00:00:00 2001 From: Liqiang Lu Date: Thu, 12 Feb 2026 06:01:12 -0800 Subject: [PATCH 04/10] bcast alias, alloc domain follow logical domain --- csrc/alias_analysis.cpp | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/csrc/alias_analysis.cpp b/csrc/alias_analysis.cpp index d278a052d6a..e27a9f09227 100644 --- a/csrc/alias_analysis.cpp +++ b/csrc/alias_analysis.cpp @@ -7,7 +7,6 @@ // clang-format on #include #include -#include "ir/internal_nodes.h" #include #include @@ -294,29 +293,25 @@ void AliasFinder::handle(const BroadcastOp* bcast) { if (in == nullptr) { return; } - // Skip reduction + broadcast to avoid unnecessary allocation-domain changes - // on broadcast tensors. For example, when a normalization kernel is - // segmented, prefer reduction + pointwise over reduction + transpose. - // See SmemPersistentNotSupportedIn3DReduction. - if (in->definition() && in->definition()->isA()) { - return; - } - auto* out = bcast->out()->as(); + std::optional out_layout = mapInLayoutToOutRoot(analysis_.preferredLayout(in), in, out); if (!out_layout.has_value()) { return; } - // Put new, broadcast dimensions to the end. - std::vector out_allocation = out_layout->allocation_domain(); - std::vector> out_contiguity = out_layout->contiguity(); + // Let the allocation domain follow the logical domain. When a normalization + // kernel is segmented, prefer reduction + pointwise over reduction + + // transpose. See SmemPersistentNotSupportedIn3DReduction. 
const std::vector out_logical = out->getLogicalDomain(); + std::vector out_allocation(out_logical); + std::vector> out_contiguity; for (const auto i : arange(out_logical.size())) { if (bcast->isBroadcastDim(i)) { - out_allocation.push_back(out_logical[i]); out_contiguity.push_back(std::nullopt); + } else { + out_contiguity.push_back(true); } } From 018057078a81f139cd49c92b50b9c5320d4f19fa Mon Sep 17 00:00:00 2001 From: Liqiang Lu Date: Thu, 12 Feb 2026 08:08:12 -0800 Subject: [PATCH 05/10] fix --- csrc/alias_analysis.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/alias_analysis.cpp b/csrc/alias_analysis.cpp index e27a9f09227..a828f0befa3 100644 --- a/csrc/alias_analysis.cpp +++ b/csrc/alias_analysis.cpp @@ -308,7 +308,7 @@ void AliasFinder::handle(const BroadcastOp* bcast) { std::vector out_allocation(out_logical); std::vector> out_contiguity; for (const auto i : arange(out_logical.size())) { - if (bcast->isBroadcastDim(i)) { + if (out_logical.at(i)->isBroadcast()) { out_contiguity.push_back(std::nullopt); } else { out_contiguity.push_back(true); From 037792491dfd901a2cae3ebb80a4ab9d126bff0c Mon Sep 17 00:00:00 2001 From: Liqiang Lu Date: Thu, 12 Feb 2026 09:32:58 -0800 Subject: [PATCH 06/10] clean --- csrc/alias_analysis.cpp | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/csrc/alias_analysis.cpp b/csrc/alias_analysis.cpp index a828f0befa3..0f67640f28c 100644 --- a/csrc/alias_analysis.cpp +++ b/csrc/alias_analysis.cpp @@ -304,16 +304,8 @@ void AliasFinder::handle(const BroadcastOp* bcast) { // Let the allocation domain follow the logical domain. When a normalization // kernel is segmented, prefer reduction + pointwise over reduction + // transpose. See SmemPersistentNotSupportedIn3DReduction. 
- const std::vector out_logical = out->getLogicalDomain(); - std::vector out_allocation(out_logical); - std::vector> out_contiguity; - for (const auto i : arange(out_logical.size())) { - if (out_logical.at(i)->isBroadcast()) { - out_contiguity.push_back(std::nullopt); - } else { - out_contiguity.push_back(true); - } - } + std::vector out_allocation = out->getLogicalDomain(); + std::vector> out_contiguity = out->getContiguity(); aliasIfCompliant( out, in, Layout(std::move(out_allocation), std::move(out_contiguity))); From 02b844fa470d5f0bee0bdfbbc6ac42617f78ca95 Mon Sep 17 00:00:00 2001 From: Liqiang Lu Date: Thu, 12 Feb 2026 11:29:43 -0800 Subject: [PATCH 07/10] Revert "clean" This reverts commit 037792491dfd901a2cae3ebb80a4ab9d126bff0c. --- csrc/alias_analysis.cpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/csrc/alias_analysis.cpp b/csrc/alias_analysis.cpp index 0f67640f28c..a828f0befa3 100644 --- a/csrc/alias_analysis.cpp +++ b/csrc/alias_analysis.cpp @@ -304,8 +304,16 @@ void AliasFinder::handle(const BroadcastOp* bcast) { // Let the allocation domain follow the logical domain. When a normalization // kernel is segmented, prefer reduction + pointwise over reduction + // transpose. See SmemPersistentNotSupportedIn3DReduction. 
- std::vector out_allocation = out->getLogicalDomain(); - std::vector> out_contiguity = out->getContiguity(); + const std::vector out_logical = out->getLogicalDomain(); + std::vector out_allocation(out_logical); + std::vector> out_contiguity; + for (const auto i : arange(out_logical.size())) { + if (out_logical.at(i)->isBroadcast()) { + out_contiguity.push_back(std::nullopt); + } else { + out_contiguity.push_back(true); + } + } aliasIfCompliant( out, in, Layout(std::move(out_allocation), std::move(out_contiguity))); From 359608825bc82a86d601fa5f579dceb44df9191c Mon Sep 17 00:00:00 2001 From: Liqiang Lu Date: Thu, 12 Feb 2026 11:32:52 -0800 Subject: [PATCH 08/10] remove preferredLayout --- csrc/alias_analysis.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/csrc/alias_analysis.cpp b/csrc/alias_analysis.cpp index a828f0befa3..bf88b4d01d1 100644 --- a/csrc/alias_analysis.cpp +++ b/csrc/alias_analysis.cpp @@ -295,12 +295,6 @@ void AliasFinder::handle(const BroadcastOp* bcast) { } auto* out = bcast->out()->as(); - std::optional out_layout = - mapInLayoutToOutRoot(analysis_.preferredLayout(in), in, out); - if (!out_layout.has_value()) { - return; - } - // Let the allocation domain follow the logical domain. When a normalization // kernel is segmented, prefer reduction + pointwise over reduction + // transpose. See SmemPersistentNotSupportedIn3DReduction. 
From 50c3a636e1f249c7241db391c6a6e1d1784f08bf Mon Sep 17 00:00:00 2001 From: Liqiang Lu Date: Fri, 13 Feb 2026 06:04:03 -0800 Subject: [PATCH 09/10] insert bcast domain at the position corresponding to bcast op --- csrc/alias_analysis.cpp | 16 +++++++++++----- tests/cpp/test_alias_analysis.cpp | 31 +++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 5 deletions(-) diff --git a/csrc/alias_analysis.cpp b/csrc/alias_analysis.cpp index 0f67640f28c..69e391fb5b2 100644 --- a/csrc/alias_analysis.cpp +++ b/csrc/alias_analysis.cpp @@ -301,11 +301,17 @@ void AliasFinder::handle(const BroadcastOp* bcast) { return; } - // Let the allocation domain follow the logical domain. When a normalization - // kernel is segmented, prefer reduction + pointwise over reduction + - // transpose. See SmemPersistentNotSupportedIn3DReduction. - std::vector out_allocation = out->getLogicalDomain(); - std::vector> out_contiguity = out->getContiguity(); + // Insert the new broadcast dimensions at their logical positions so that the + // allocation domain stays close to the logical domain. + std::vector out_allocation = out_layout->allocation_domain(); + std::vector> out_contiguity = out_layout->contiguity(); + const std::vector out_logical = out->getLogicalDomain(); + for (const auto i : arange(out_logical.size())) { + if (bcast->isBroadcastDim(i)) { + out_allocation.insert(out_allocation.begin() + i, out_logical[i]); + out_contiguity.insert(out_contiguity.begin() + i, std::nullopt); + } + } aliasIfCompliant( out, in, Layout(std::move(out_allocation), std::move(out_contiguity))); diff --git a/tests/cpp/test_alias_analysis.cpp b/tests/cpp/test_alias_analysis.cpp index 9e0854b01c2..6c7470d8f0a 100644 --- a/tests/cpp/test_alias_analysis.cpp +++ b/tests/cpp/test_alias_analysis.cpp @@ -271,6 +271,37 @@ TEST_F(AliasAnalysisTest, BroadcastExpandDimensions) { EXPECT_EQ(analysis.getRoot(expanded_tv), in); } +// Broadcast with input that has a reordered layout (subset of out_logical).
The +// preferred layout should keep that reorder for non-broadcast dims and insert +// broadcast dims at their logical positions so allocation stays close to +// logical. +TEST_F(AliasAnalysisTest, Broadcast_OutLayoutReorderOfSubsetOfOutLogical) { + Fusion fusion; + FusionGuard fg(&fusion); + + // logical domain: [i0, i1] + // allocation domain: [i1, i0] + TensorView* in = makeContigConcreteTensor({2, 3}); + fusion.addInput(in); + in->setAllocationDomain({in->axis(1), in->axis(0)}, true); + + // logical domain: [i0, b, i1] + TensorView* out = broadcast(in, {false, true, false}); + fusion.addOutput(out); + + fusion.print(); + + // preferred layout for output is [i1, i0] + // we want to insert bcast dimension between i1 and i0, to match its original + // position in logical domain, so the final allocation domain should be [i1, + // b, i0] which is a permutation of its logical domain. + AliasAnalysisResult analysis = findAliases(&fusion); + auto preferred_layout = analysis.preferredLayout(out); + auto allocation_domain = preferred_layout->allocation_domain(); + EXPECT_THAT( + allocation_domain, ElementsAre(out->axis(2), out->axis(1), out->axis(0))); +} + // See PR: https://github.com/NVIDIA/Fuser/pull/4274 // for alias analysis for resharding exprs TEST_F(AliasAnalysisTest, AliasForReshardingExprs) { From 53e9033b9edde7b5b51415672c53eb94a4afff0d Mon Sep 17 00:00:00 2001 From: Liqiang Lu Date: Mon, 16 Feb 2026 17:04:54 -0800 Subject: [PATCH 10/10] avoid insert in vect --- csrc/alias_analysis.cpp | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/csrc/alias_analysis.cpp b/csrc/alias_analysis.cpp index 69e391fb5b2..9f0abc4df14 100644 --- a/csrc/alias_analysis.cpp +++ b/csrc/alias_analysis.cpp @@ -306,11 +306,26 @@ void AliasFinder::handle(const BroadcastOp* bcast) { std::vector out_allocation = out_layout->allocation_domain(); std::vector> out_contiguity = out_layout->contiguity(); const std::vector out_logical =
out->getLogicalDomain(); - for (const auto i : arange(out_logical.size())) { - if (bcast->isBroadcastDim(i)) { - out_allocation.insert(out_allocation.begin() + i, out_logical[i]); - out_contiguity.insert(out_contiguity.begin() + i, std::nullopt); + { + std::vector new_allocation; + std::vector> new_contiguity; + new_allocation.reserve(out_logical.size()); + new_contiguity.reserve(out_logical.size()); + + size_t alloc_idx = 0; + for (const auto i : arange(out_logical.size())) { + if (bcast->isBroadcastDim(i)) { + new_allocation.push_back(out_logical[i]); + new_contiguity.push_back(std::nullopt); + } else { + new_allocation.push_back(out_allocation.at(alloc_idx)); + new_contiguity.push_back(out_contiguity.at(alloc_idx)); + ++alloc_idx; + } } + + out_allocation = std::move(new_allocation); + out_contiguity = std::move(new_contiguity); } aliasIfCompliant(