26 changes: 21 additions & 5 deletions csrc/alias_analysis.cpp
@@ -301,15 +301,31 @@ void AliasFinder::handle(const BroadcastOp* bcast) {
     return;
   }
 
-  // Put new, broadcast dimensions to the end.
+  // Put new, broadcast dimensions at their corresponding logical positions,
+  // so the allocation domain stays close to the logical domain.
   std::vector<IterDomain*> out_allocation = out_layout->allocation_domain();
   std::vector<std::optional<bool>> out_contiguity = out_layout->contiguity();
   const std::vector<IterDomain*> out_logical = out->getLogicalDomain();
-  for (const auto i : arange(out_logical.size())) {
-    if (bcast->isBroadcastDim(i)) {
-      out_allocation.push_back(out_logical[i]);
-      out_contiguity.push_back(std::nullopt);
+  {
+    std::vector<IterDomain*> new_allocation;
+    std::vector<std::optional<bool>> new_contiguity;
+    new_allocation.reserve(out_logical.size());
+    new_contiguity.reserve(out_logical.size());
+
+    size_t alloc_idx = 0;
+    for (const auto i : arange(out_logical.size())) {
+      if (bcast->isBroadcastDim(i)) {
+        new_allocation.push_back(out_logical[i]);
+        new_contiguity.push_back(std::nullopt);
+      } else {
+        new_allocation.push_back(out_allocation.at(alloc_idx));
+        new_contiguity.push_back(out_contiguity.at(alloc_idx));
+        ++alloc_idx;
+      }
     }
+
+    out_allocation = std::move(new_allocation);
+    out_contiguity = std::move(new_contiguity);
   }
 
   aliasIfCompliant(
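For reference, the interleaving above can be illustrated with a small standalone sketch. This is not nvFuser code: plain strings stand in for IterDomain*, and a bool vector stands in for BroadcastOp::isBroadcastDim. Non-broadcast dimensions keep the input's preferred order, while broadcast dimensions are inserted at their logical positions.

#include <cassert>
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

// Standalone illustration of the interleaving logic above. Strings stand in
// for IterDomain*, and `is_broadcast` stands in for BroadcastOp::isBroadcastDim.
std::vector<std::string> buildAllocation(
    const std::vector<std::string>& out_logical,
    const std::vector<bool>& is_broadcast,
    const std::vector<std::string>& in_allocation) {
  std::vector<std::string> new_allocation;
  new_allocation.reserve(out_logical.size());
  size_t alloc_idx = 0;
  for (size_t i = 0; i < out_logical.size(); ++i) {
    if (is_broadcast[i]) {
      // Broadcast dims are placed at their logical positions.
      new_allocation.push_back(out_logical[i]);
    } else {
      // Non-broadcast dims keep the input's (possibly reordered) order.
      new_allocation.push_back(in_allocation.at(alloc_idx++));
    }
  }
  return new_allocation;
}

int main() {
  // Mirrors the new test below: logical [i0, b, i1], input allocation [i1, i0].
  const auto alloc =
      buildAllocation({"i0", "b", "i1"}, {false, true, false}, {"i1", "i0"});
  assert((alloc == std::vector<std::string>{"i1", "b", "i0"}));
  for (const auto& d : alloc) {
    std::cout << d << ' ';
  }
  std::cout << '\n'; // prints: i1 b i0
  return 0;
}

On the layout exercised by the new test below (logical [i0, b, i1], input allocation [i1, i0]), this produces [i1, b, i0].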
31 changes: 31 additions & 0 deletions tests/cpp/test_alias_analysis.cpp
@@ -271,6 +271,37 @@ TEST_F(AliasAnalysisTest, BroadcastExpandDimensions) {
   EXPECT_EQ(analysis.getRoot(expanded_tv), in);
 }
 
+// Broadcast with an input that has a reordered layout (a subset of
+// out_logical). The preferred layout should keep that reorder for the
+// non-broadcast dims and insert broadcast dims at their logical positions, so
+// the allocation domain stays close to the logical domain.
+TEST_F(AliasAnalysisTest, Broadcast_OutLayoutReorderOfSubsetOfOutLogical) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // logical domain: [i0, i1]
+  // allocation domain: [i1, i0]
+  TensorView* in = makeContigConcreteTensor({2, 3});
+  fusion.addInput(in);
+  in->setAllocationDomain({in->axis(1), in->axis(0)}, true);
+
+  // logical domain: [i0, b, i1]
+  TensorView* out = broadcast(in, {false, true, false});
+  fusion.addOutput(out);
+
+  fusion.print();
Review comment (Contributor): Debug print left in test

fusion.print() appears to be a debugging artifact. Other tests in this file don't use unconditional print calls. In test_combined_inner_outer_reduction.cpp, similar calls are guarded by if (verbose). Consider removing this to avoid noisy test output.

Suggested change: remove the fusion.print(); line.
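For reference, a minimal sketch of the verbose-guard pattern the comment refers to; the flag name and its wiring here are assumptions rather than the exact code in test_combined_inner_outer_reduction.cpp:

  // Sketch only: keep debug printing out of default test runs.
  // `verbose` is an assumed local flag; wire it to a test option or env var as needed.
  const bool verbose = false;
  if (verbose) {
    fusion.print();
  }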

+  // The preferred layout for the output is [i1, i0]. We want to insert the
+  // bcast dimension between i1 and i0, to match its original position in the
+  // logical domain, so the final allocation domain should be [i1, b, i0],
+  // which is a permutation of the output's logical domain.
+  AliasAnalysisResult analysis = findAliases(&fusion);
+  auto preferred_layout = analysis.preferredLayout(out);
+  auto allocation_domain = preferred_layout->allocation_domain();
+  EXPECT_THAT(
+      allocation_domain, ElementsAre(out->axis(2), out->axis(1), out->axis(0)));
+}

// See PR: https://github.com/NVIDIA/Fuser/pull/4274
// for alias analysis for resharding exprs
TEST_F(AliasAnalysisTest, AliasForReshardingExprs) {
7 changes: 7 additions & 0 deletions tests/cpp/test_persistent_buffer.cpp
@@ -1257,6 +1257,13 @@ TEST_F(PersistentBufferTest, SmemPersistentNotSupportedIn3DReduction) {
   // persistent is not supported yet for 3D reduction.
   EXPECT_TRUE(executor_cache.getMostRecentKernelRuntime()->isSegmented());
 
+  // expect reduction and pointwise scheduler
+  EXPECT_THAT(
+      executor_cache.getMostRecentKernelRuntime()->fusionSegments()->groups(),
+      UnorderedElementsAre(
+          HeuristicIs(SchedulerType::PointWise),
+          HeuristicIs(SchedulerType::Reduction)));
+
   testValidate(executor_cache.fusion(), cg_outputs, {t0}, __LINE__, __FILE__);
 }
