26 changes: 21 additions & 5 deletions csrc/alias_analysis.cpp
@@ -301,15 +301,31 @@ void AliasFinder::handle(const BroadcastOp* bcast) {
     return;
   }
 
-  // Put new, broadcast dimensions to the end.
+  // Put new, broadcast dimensions at their corresponding logical positions,
+  // so the allocation domain stays close to the logical domain.
   std::vector<IterDomain*> out_allocation = out_layout->allocation_domain();
   std::vector<std::optional<bool>> out_contiguity = out_layout->contiguity();
   const std::vector<IterDomain*> out_logical = out->getLogicalDomain();
-  for (const auto i : arange(out_logical.size())) {
-    if (bcast->isBroadcastDim(i)) {
-      out_allocation.push_back(out_logical[i]);
-      out_contiguity.push_back(std::nullopt);
+  {
+    std::vector<IterDomain*> new_allocation;
+    std::vector<std::optional<bool>> new_contiguity;
+    new_allocation.reserve(out_logical.size());
+    new_contiguity.reserve(out_logical.size());
+
+    size_t alloc_idx = 0;
+    for (const auto i : arange(out_logical.size())) {
+      if (bcast->isBroadcastDim(i)) {
+        new_allocation.push_back(out_logical[i]);
+        new_contiguity.push_back(std::nullopt);
+      } else {
+        new_allocation.push_back(out_allocation.at(alloc_idx));
+        new_contiguity.push_back(out_contiguity.at(alloc_idx));
+        ++alloc_idx;
+      }
     }
+
+    out_allocation = std::move(new_allocation);
+    out_contiguity = std::move(new_contiguity);
   }
 
   aliasIfCompliant(
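For reference, the interleaving above can be illustrated with a small standalone sketch. This is not nvFuser code: plain strings stand in for IterDomain*, and a bool vector stands in for BroadcastOp::isBroadcastDim. Non-broadcast dimensions keep the input's preferred order, while broadcast dimensions are inserted at their logical positions.

#include <cassert>
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

// Standalone illustration of the interleaving logic above. Strings stand in
// for IterDomain*, and `is_broadcast` stands in for BroadcastOp::isBroadcastDim.
std::vector<std::string> buildAllocation(
    const std::vector<std::string>& out_logical,
    const std::vector<bool>& is_broadcast,
    const std::vector<std::string>& in_allocation) {
  std::vector<std::string> new_allocation;
  new_allocation.reserve(out_logical.size());
  size_t alloc_idx = 0;
  for (size_t i = 0; i < out_logical.size(); ++i) {
    if (is_broadcast[i]) {
      // Broadcast dims are placed at their logical positions.
      new_allocation.push_back(out_logical[i]);
    } else {
      // Non-broadcast dims keep the input's (possibly reordered) order.
      new_allocation.push_back(in_allocation.at(alloc_idx++));
    }
  }
  return new_allocation;
}

int main() {
  // Mirrors the new test below: logical [i0, b, i1], input allocation [i1, i0].
  const auto alloc =
      buildAllocation({"i0", "b", "i1"}, {false, true, false}, {"i1", "i0"});
  assert((alloc == std::vector<std::string>{"i1", "b", "i0"}));
  for (const auto& d : alloc) {
    std::cout << d << ' ';
  }
  std::cout << '\n'; // prints: i1 b i0
  return 0;
}

On the layout exercised by the new test below (logical [i0, b, i1], input allocation [i1, i0]), this produces [i1, b, i0].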
31 changes: 31 additions & 0 deletions tests/cpp/test_alias_analysis.cpp
@@ -271,6 +271,37 @@ TEST_F(AliasAnalysisTest, BroadcastExpandDimensions) {
   EXPECT_EQ(analysis.getRoot(expanded_tv), in);
 }
 
+// Broadcast with an input that has a reordered layout (a subset of
+// out_logical). The preferred layout should keep that reorder for the
+// non-broadcast dims and insert broadcast dims at their logical positions, so
+// the allocation domain stays close to the logical domain.
+TEST_F(AliasAnalysisTest, Broadcast_OutLayoutReorderOfSubsetOfOutLogical) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  // logical domain: [i0, i1]
+  // allocation domain: [i1, i0]
+  TensorView* in = makeContigConcreteTensor({2, 3});
+  fusion.addInput(in);
+  in->setAllocationDomain({in->axis(1), in->axis(0)}, true);
+
+  // logical domain: [i0, b, i1]
+  TensorView* out = broadcast(in, {false, true, false});
+  fusion.addOutput(out);
+
+  fusion.print();
Review comment (Contributor): Debug print left in test

fusion.print() appears to be a debugging artifact. Other tests in this file don't use unconditional print calls. In test_combined_inner_outer_reduction.cpp, similar calls are guarded by if (verbose). Consider removing this to avoid noisy test output.

Suggested change: remove the fusion.print(); line.
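For reference, a minimal sketch of the verbose-guard pattern the comment refers to; the flag name and its wiring here are assumptions rather than the exact code in test_combined_inner_outer_reduction.cpp:

  // Sketch only: keep debug printing out of default test runs.
  // `verbose` is an assumed local flag; wire it to a test option or env var as needed.
  const bool verbose = false;
  if (verbose) {
    fusion.print();
  }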

+  // The preferred layout for the output is [i1, i0]. We want to insert the
+  // bcast dimension between i1 and i0, to match its original position in the
+  // logical domain, so the final allocation domain should be [i1, b, i0],
+  // which is a permutation of the output's logical domain.
+  AliasAnalysisResult analysis = findAliases(&fusion);
+  auto preferred_layout = analysis.preferredLayout(out);
+  auto allocation_domain = preferred_layout->allocation_domain();
+  EXPECT_THAT(
+      allocation_domain, ElementsAre(out->axis(2), out->axis(1), out->axis(0)));
+}

// See PR: https://github.com/NVIDIA/Fuser/pull/4274
// for alias analysis for resharding exprs
TEST_F(AliasAnalysisTest, AliasForReshardingExprs) {
7 changes: 7 additions & 0 deletions tests/cpp/test_persistent_buffer.cpp
@@ -1257,6 +1257,13 @@ TEST_F(PersistentBufferTest, SmemPersistentNotSupportedIn3DReduction) {
   // persistent is not supported yet for 3D reduction.
   EXPECT_TRUE(executor_cache.getMostRecentKernelRuntime()->isSegmented());
 
+  // expect reduction and pointwise scheduler
+  EXPECT_THAT(
+      executor_cache.getMostRecentKernelRuntime()->fusionSegments()->groups(),
+      UnorderedElementsAre(
+          HeuristicIs(SchedulerType::PointWise),
+          HeuristicIs(SchedulerType::Reduction)));
+
   testValidate(executor_cache.fusion(), cg_outputs, {t0}, __LINE__, __FILE__);
 }
