diff --git a/csrc/alias_analysis.cpp b/csrc/alias_analysis.cpp
index 81c5cb3cabf..9f0abc4df14 100644
--- a/csrc/alias_analysis.cpp
+++ b/csrc/alias_analysis.cpp
@@ -301,15 +301,31 @@ void AliasFinder::handle(const BroadcastOp* bcast) {
     return;
   }
 
-  // Put new, broadcast dimensions to the end.
+  // Put new broadcast dimensions at their corresponding positions in the
+  // logical domain, so the allocation domain stays close to the logical domain.
   std::vector<IterDomain*> out_allocation = out_layout->allocation_domain();
   std::vector<std::optional<bool>> out_contiguity = out_layout->contiguity();
   const std::vector<IterDomain*> out_logical = out->getLogicalDomain();
-  for (const auto i : arange(out_logical.size())) {
-    if (bcast->isBroadcastDim(i)) {
-      out_allocation.push_back(out_logical[i]);
-      out_contiguity.push_back(std::nullopt);
+  {
+    std::vector<IterDomain*> new_allocation;
+    std::vector<std::optional<bool>> new_contiguity;
+    new_allocation.reserve(out_logical.size());
+    new_contiguity.reserve(out_logical.size());
+
+    size_t alloc_idx = 0;
+    for (const auto i : arange(out_logical.size())) {
+      if (bcast->isBroadcastDim(i)) {
+        new_allocation.push_back(out_logical[i]);
+        new_contiguity.push_back(std::nullopt);
+      } else {
+        new_allocation.push_back(out_allocation.at(alloc_idx));
+        new_contiguity.push_back(out_contiguity.at(alloc_idx));
+        ++alloc_idx;
+      }
     }
+
+    out_allocation = std::move(new_allocation);
+    out_contiguity = std::move(new_contiguity);
   }
 
   aliasIfCompliant(
diff --git a/tests/cpp/test_alias_analysis.cpp b/tests/cpp/test_alias_analysis.cpp
index 9e0854b01c2..6c7470d8f0a 100644
--- a/tests/cpp/test_alias_analysis.cpp
+++ b/tests/cpp/test_alias_analysis.cpp
@@ -271,6 +271,37 @@ TEST_F(AliasAnalysisTest, BroadcastExpandDimensions) {
   EXPECT_EQ(analysis.getRoot(expanded_tv), in);
 }
 
+// Broadcast with input that has a reordered layout (subset of out_logical). The
+// preferred layout should keep that reorder for non-broadcast dims and insert
+// broadcast dims at their logical positions so allocation stays close to
+// logical.
+TEST_F(AliasAnalysisTest, Broadcast_OutLayoutReorderOfSubsetOfOutLogical) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // logical domain: [i0, i1]
+  // allocation domain: [i1, i0]
+  TensorView* in = makeContigConcreteTensor({2, 3});
+  fusion.addInput(in);
+  in->setAllocationDomain({in->axis(1), in->axis(0)}, true);
+
+  // logical domain: [i0, b, i1]
+  TensorView* out = broadcast(in, {false, true, false});
+  fusion.addOutput(out);
+
+  fusion.print();
+
+  // preferred layout for output is [i1, i0]
+  // we want to insert the bcast dimension between i1 and i0, to match its
+  // original position in the logical domain, so the final allocation domain
+  // should be [i1, b, i0], which is a permutation of its logical domain.
+  AliasAnalysisResult analysis = findAliases(&fusion);
+  auto preferred_layout = analysis.preferredLayout(out);
+  auto allocation_domain = preferred_layout->allocation_domain();
+  EXPECT_THAT(
+      allocation_domain, ElementsAre(out->axis(2), out->axis(1), out->axis(0)));
+}
+
 // See PR: https://github.com/NVIDIA/Fuser/pull/4274
 // for alias analysis for resharding exprs
 TEST_F(AliasAnalysisTest, AliasForReshardingExprs) {
diff --git a/tests/cpp/test_persistent_buffer.cpp b/tests/cpp/test_persistent_buffer.cpp
index da9c5c906dd..9a0923b3f11 100644
--- a/tests/cpp/test_persistent_buffer.cpp
+++ b/tests/cpp/test_persistent_buffer.cpp
@@ -1257,6 +1257,13 @@ TEST_F(PersistentBufferTest, SmemPersistentNotSupportedIn3DReduction) {
 
   // persistent is not supported yet for 3D reduction.
   EXPECT_TRUE(executor_cache.getMostRecentKernelRuntime()->isSegmented());
+  // expect reduction and pointwise scheduler
+  EXPECT_THAT(
+      executor_cache.getMostRecentKernelRuntime()->fusionSegments()->groups(),
+      UnorderedElementsAre(
+          HeuristicIs(SchedulerType::PointWise),
+          HeuristicIs(SchedulerType::Reduction)));
+
   testValidate(executor_cache.fusion(), cg_outputs, {t0}, __LINE__, __FILE__);
 }