From c23730204f3ee506205e4e1f4b0bf64d4b268367 Mon Sep 17 00:00:00 2001 From: Yakkhini <59007159+Yakkhini@users.noreply.github.com> Date: Thu, 2 Apr 2026 22:58:00 +0800 Subject: [PATCH 01/15] cpu-o3: add full resolve train plumbing Change-Id: Id3ddfead4fd87cf04c650f368f6cfb75ff19b81b --- src/cpu/o3/comm.hh | 19 ++ src/cpu/o3/dyn_inst.cc | 3 +- src/cpu/o3/dyn_inst.hh | 36 ++++ src/cpu/o3/fetch.cc | 317 ++++++++++++++++++++++++---- src/cpu/o3/fetch.hh | 57 +++++ src/cpu/o3/iew.cc | 44 +++- src/cpu/pred/BranchPredictor.py | 4 + src/cpu/pred/btb/common.hh | 63 +++++- src/cpu/pred/btb/decoupled_bpred.cc | 130 +++++++++++- src/cpu/pred/btb/decoupled_bpred.hh | 20 ++ src/cpu/pred/btb/ftq.cc | 1 + src/cpu/pred/btb/ftq.hh | 17 ++ src/cpu/pred/btb/timed_base_pred.hh | 5 + 13 files changed, 665 insertions(+), 51 deletions(-) diff --git a/src/cpu/o3/comm.hh b/src/cpu/o3/comm.hh index cb88ad769f..4b647534cc 100644 --- a/src/cpu/o3/comm.hh +++ b/src/cpu/o3/comm.hh @@ -246,6 +246,25 @@ struct TimeStruct }; /** Resolved control-flow PCs produced this cycle (fetch buffers/merges). */ std::vector resolvedCFIs; // *F + + struct ResolveTrainEntry + { + uint64_t ftqId; + uint64_t ftqGeneration; + uint64_t pc; + uint64_t target; + bool taken; + bool mispredict; + uint8_t ftqOffset; + bool isCond; + bool isDirect; + bool isIndirect; + bool isCall; + bool isReturn; + bool isRVC; + }; + /** Full resolve-train entries produced this cycle (rollout plumbing). 
*/ + std::vector resolveTrainEntries; // *F }; IewComm iewInfo[MaxThreads]; // iew to rename, fetch diff --git a/src/cpu/o3/dyn_inst.cc b/src/cpu/o3/dyn_inst.cc index 2b572ee556..9faeb7ad2e 100644 --- a/src/cpu/o3/dyn_inst.cc +++ b/src/cpu/o3/dyn_inst.cc @@ -68,7 +68,8 @@ DynInst::DynInst(const Arrays &arrays, const StaticInstPtr &static_inst, _numSrcs(arrays.numSrcs), _numDests(arrays.numDests), _flatDestIdx(arrays.flatDestIdx), _destIdx(arrays.destIdx), _prevDestIdx(arrays.prevDestIdx), _srcIdx(arrays.srcIdx), - _readySrcIdx(arrays.readySrcIdx), macroop(_macroop) + _readySrcIdx(arrays.readySrcIdx), ftqGeneration(0), ftqOffset(0), + macroop(_macroop) { std::fill(_readySrcIdx, _readySrcIdx + (numSrcs() + 7) / 8, 0); diff --git a/src/cpu/o3/dyn_inst.hh b/src/cpu/o3/dyn_inst.hh index 8a6a1088c1..015b6e4051 100644 --- a/src/cpu/o3/dyn_inst.hh +++ b/src/cpu/o3/dyn_inst.hh @@ -418,6 +418,10 @@ class DynInst : public ExecContext, public RefCounted /** ftqId is used for squashing and committing */ /** The fetch stream queue ID of the instruction. */ unsigned ftqId; + /** FTQ target generation captured at fetch time. */ + uint64_t ftqGeneration; + /** Halfword offset of the instruction within its FTQ target block. */ + uint8_t ftqOffset; /** The number of loop iteration within an fsq entry of the instruction. */ unsigned loopIteration; @@ -1589,12 +1593,24 @@ class DynInst : public ExecContext, public RefCounted ftqId = id; } + void + setFtqGeneration(uint64_t generation) + { + ftqGeneration = generation; + } + unsigned getFtqId() { return ftqId; } + uint64_t + getFtqGeneration() const + { + return ftqGeneration; + } + void setLoopIteration(unsigned iter) { @@ -1613,6 +1629,26 @@ class DynInst : public ExecContext, public RefCounted return rpc.compressed() ? 2 : 4; } + bool isRVC() const + { + return pc->as().compressed(); + } + + Addr getControlTarget() + { + return branching() ? 
getNPC() : pcState().getFallThruPC(); + } + + void setFtqOffset(uint8_t offset) + { + ftqOffset = offset; + } + + uint8_t getFtqOffset() const + { + return ftqOffset; + } + protected: SquashVersion squashVer; diff --git a/src/cpu/o3/fetch.cc b/src/cpu/o3/fetch.cc index 21c9cec4e6..46db1cffc3 100644 --- a/src/cpu/o3/fetch.cc +++ b/src/cpu/o3/fetch.cc @@ -81,6 +81,27 @@ namespace gem5 namespace o3 { +namespace +{ + +constexpr uint8_t RvcInstBytes = 2; +constexpr uint8_t BaseInstBytes = 4; + +size_t +resolveTrainMetaCount( + const branch_prediction::btb_pred::FetchTarget &target) +{ + size_t num_pred_metas = 0; + for (size_t i = 0; i < target.predMetas.size(); ++i) { + if (target.predMetas[i] != nullptr) { + num_pred_metas = i + 1; + } + } + return num_pred_metas; +} + +} // anonymous namespace + Fetch::IcachePort::IcachePort(Fetch *_fetch, CPU *_cpu) : RequestPort(_cpu->name() + ".icache_port", _cpu), fetch(_fetch) {} @@ -91,6 +112,8 @@ Fetch::Fetch(CPU *_cpu, const BaseO3CPUParams ¶ms) cpu(_cpu), branchPred(nullptr), resolveQueueSize(params.resolveQueueSize), + enableFullResolveTrain(params.enableFullResolveTrain), + enableLegacyResolveUpdate(params.enableLegacyResolveUpdate), decodeToFetchDelay(params.decodeToFetchDelay), renameToFetchDelay(params.renameToFetchDelay), iewToFetchDelay(params.iewToFetchDelay), @@ -270,6 +293,21 @@ Fetch::FetchStatGroup::FetchStatGroup(CPU *cpu, Fetch *fetch) "Number of times an entry is enqueued to the resolve queue"), ADD_STAT(resolveQueueOccupancy, statistics::units::Count::get(), "Number of entries in the resolve queue"), + ADD_STAT(fullResolveEntriesReceived, statistics::units::Count::get(), + "Number of full resolve entries received by fetch"), + ADD_STAT(fullResolveEntriesMerged, statistics::units::Count::get(), + "Number of full resolve entries merged by fetch"), + ADD_STAT(fullResolveEntriesDroppedQueueFull, + statistics::units::Count::get(), + "Number of full resolve entries dropped because the queue is full"), + 
ADD_STAT(fullResolveEntriesDroppedStaleTarget, + statistics::units::Count::get(), + "Number of full resolve entries dropped because the target is stale"), + ADD_STAT(fullResolveEntriesDroppedGenerationMismatch, + statistics::units::Count::get(), + "Number of full resolve entries dropped because the generation mismatched"), + ADD_STAT(fullResolvePacketsSent, statistics::units::Count::get(), + "Number of full resolve packets sent to the predictor"), ADD_STAT(traceMetaStores, statistics::units::Count::get(), "Number of stored trace metadata records (seqNum -> traceInst)"), ADD_STAT(traceMetaCleanupSquashCalls, statistics::units::Count::get(), @@ -351,6 +389,18 @@ Fetch::FetchStatGroup::FetchStatGroup(CPU *cpu, Fetch *fetch) .init(1, 8, 1); resolveQueueOccupancy .init(0, 32, 1); + fullResolveEntriesReceived + .prereq(fullResolveEntriesReceived); + fullResolveEntriesMerged + .prereq(fullResolveEntriesMerged); + fullResolveEntriesDroppedQueueFull + .prereq(fullResolveEntriesDroppedQueueFull); + fullResolveEntriesDroppedStaleTarget + .prereq(fullResolveEntriesDroppedStaleTarget); + fullResolveEntriesDroppedGenerationMismatch + .prereq(fullResolveEntriesDroppedGenerationMismatch); + fullResolvePacketsSent + .prereq(fullResolvePacketsSent); traceMetaStores .prereq(traceMetaStores); traceMetaCleanupSquashCalls @@ -539,6 +589,7 @@ Fetch::processMultiCacheLineCompletion(ThreadID tid, PacketPtr pkt) DPRINTF(Fetch, "req[%d]=0x%lx ", i, threads[tid].cacheReq.requests[i]->getVaddr()); } DPRINTF(Fetch, "\n"); + delete pkt; return false; } @@ -1454,67 +1505,256 @@ Fetch::checkSignalsAndUpdate(ThreadID tid) void Fetch::handleIEWSignals() { + if (lastIewSignalHandleTick == curTick()) { + return; + } + lastIewSignalHandleTick = curTick(); + // Currently resolve stage training is a btb-only feature if (!isBTBPred()) { return; } auto &incoming = fromIEW->iewInfo->resolvedCFIs; - const bool had_pending_resolve = !resolveQueue.empty(); - uint8_t enqueueSize = 
fromIEW->iewInfo->resolvedCFIs.size(); - uint8_t enqueueCount = 0; - if (resolveQueueSize && resolveQueue.size() > resolveQueueSize - 4) { - fetchStats.resolveQueueFullEvents++; - fetchStats.resolveEnqueueFailEvent += enqueueSize; - } else { + if (!enableLegacyResolveUpdate) { + for (ThreadID tid = 0; tid < numThreads; ++tid) { + fromIEW->iewInfo[tid].resolvedCFIs.clear(); + } + } + + if (enableLegacyResolveUpdate) { + const bool had_pending_resolve = !resolveQueue.empty(); + uint8_t enqueueSize = fromIEW->iewInfo->resolvedCFIs.size(); + uint8_t enqueueCount = 0; + + if (resolveQueueSize && resolveQueue.size() > resolveQueueSize - 4) { + fetchStats.resolveQueueFullEvents++; + fetchStats.resolveEnqueueFailEvent += enqueueSize; + } else { - for (const auto &resolved : incoming) { - bool merged = false; - for (auto &queued : resolveQueue) { - if (queued.resolvedFTQId == resolved.ftqId) { - queued.resolvedInstPC.push_back(resolved.pc); - merged = true; - break; + for (const auto &resolved : incoming) { + bool merged = false; + for (auto &queued : resolveQueue) { + if (queued.resolvedFTQId == resolved.ftqId) { + queued.resolvedInstPC.push_back(resolved.pc); + merged = true; + break; + } } + + if (merged) { + continue; + } + + ResolveQueueEntry new_entry; + new_entry.resolvedFTQId = resolved.ftqId; + new_entry.resolvedInstPC.push_back(resolved.pc); + resolveQueue.push_back(std::move(new_entry)); + enqueueCount++; } + fetchStats.resolveEnqueueCount.sample(enqueueCount); + } - if (merged) { - continue; + fetchStats.resolveQueueOccupancy.sample(resolveQueue.size()); + + // Process only entries that were already pending before this cycle. + // This preserves a cycle of separation between IEW producing resolved + // CFIs and fetch consuming them as predictor resolved updates. 
+ if (had_pending_resolve && !resolveQueue.empty()) { + auto &entry = resolveQueue.front(); + unsigned int stream_id = entry.resolvedFTQId; + dbpbtb->prepareResolveUpdateEntries(stream_id, 0); + for (const auto resolvedInstPC : entry.resolvedInstPC) { + dbpbtb->markCFIResolved(stream_id, resolvedInstPC, 0); } + bool success = dbpbtb->resolveUpdate(stream_id, 0); + if (success) { + dbpbtb->notifyResolveSuccess(); + resolveQueue.pop_front(); + fetchStats.resolveDequeueCount++; + } else { + dbpbtb->notifyResolveFailure(); + } + } + } - ResolveQueueEntry new_entry; - new_entry.resolvedFTQId = resolved.ftqId; - new_entry.resolvedInstPC.push_back(resolved.pc); - resolveQueue.push_back(std::move(new_entry)); - enqueueCount++; + if (!enableFullResolveTrain) { + for (ThreadID tid = 0; tid < numThreads; ++tid) { + fromIEW->iewInfo[tid].resolveTrainEntries.clear(); } - fetchStats.resolveEnqueueCount.sample(enqueueCount); + return; } - fetchStats.resolveQueueOccupancy.sample(resolveQueue.size()); + filterResolveTrainQueue(); + const bool had_pending_resolve_train = !resolveTrainQueue.empty(); + + for (ThreadID tid = 0; tid < numThreads; ++tid) { + auto &resolve_entries = fromIEW->iewInfo[tid].resolveTrainEntries; + for (const auto &resolved : resolve_entries) { + fetchStats.fullResolveEntriesReceived++; + + if (!dbpbtb->ftqHasTarget(resolved.ftqId, tid)) { + fetchStats.fullResolveEntriesDroppedStaleTarget++; + continue; + } + + if (!dbpbtb->ftqMatchTargetIdentity( + resolved.ftqId, resolved.ftqGeneration, tid)) { + fetchStats.fullResolveEntriesDroppedGenerationMismatch++; + continue; + } + + auto queued = std::find_if( + resolveTrainQueue.begin(), resolveTrainQueue.end(), + [tid, &resolved](const ResolveTrainQueueEntry &entry) { + return entry.tid == tid && entry.ftqId == resolved.ftqId && + entry.generation == resolved.ftqGeneration; + }); + if (queued != resolveTrainQueue.end()) { + appendResolveTrainInst(*queued, makeResolveTrainInstData( + resolved.pc, resolved.target, 
resolved.taken, + resolved.mispredict, resolved.ftqOffset, resolved.isCond, + resolved.isDirect, resolved.isIndirect, resolved.isCall, + resolved.isReturn, resolved.isRVC)); + fetchStats.fullResolveEntriesMerged++; + continue; + } + + if (resolveQueueSize && resolveTrainQueue.size() >= resolveQueueSize) { + fetchStats.fullResolveEntriesDroppedQueueFull++; + continue; + } - // Process only entries that were already pending before this cycle. - // This preserves a cycle of separation between IEW producing resolved CFIs - // and fetch consuming them as predictor resolved updates. - if (had_pending_resolve && !resolveQueue.empty()) { - auto &entry = resolveQueue.front(); - unsigned int stream_id = entry.resolvedFTQId; - dbpbtb->prepareResolveUpdateEntries(stream_id, 0); - for (const auto resolvedInstPC : entry.resolvedInstPC) { - dbpbtb->markCFIResolved(stream_id, resolvedInstPC, 0); + ResolveTrainQueueEntry entry; + entry.tid = tid; + entry.ftqId = resolved.ftqId; + entry.generation = resolved.ftqGeneration; + appendResolveTrainInst(entry, makeResolveTrainInstData( + resolved.pc, resolved.target, resolved.taken, + resolved.mispredict, resolved.ftqOffset, resolved.isCond, + resolved.isDirect, resolved.isIndirect, resolved.isCall, + resolved.isReturn, resolved.isRVC)); + resolveTrainQueue.push_back(std::move(entry)); } - bool success = dbpbtb->resolveUpdate(stream_id, 0); - if (success) { + + resolve_entries.clear(); + } + + filterResolveTrainQueue(); + + if (had_pending_resolve_train && !resolveTrainQueue.empty()) { + const auto &entry = resolveTrainQueue.front(); + auto packet = buildResolvedTrainPacket(entry); + if (dbpbtb->resolveTrain(packet, entry.tid)) { dbpbtb->notifyResolveSuccess(); - resolveQueue.pop_front(); - fetchStats.resolveDequeueCount++; + fetchStats.fullResolvePacketsSent++; + resolveTrainQueue.pop_front(); } else { dbpbtb->notifyResolveFailure(); } } } +Fetch::ResolveTrainInstData +Fetch::makeResolveTrainInstData( + Addr pc, Addr target, bool taken, 
bool mispredict, uint8_t ftqOffset, + bool isCond, bool isDirect, bool isIndirect, bool isCall, bool isReturn, + bool isRVC) const +{ + ResolveTrainInstData inst_data; + inst_data.pc = pc; + inst_data.target = target; + inst_data.taken = taken; + inst_data.mispredict = mispredict; + inst_data.ftqOffset = ftqOffset; + inst_data.isCond = isCond; + inst_data.isDirect = isDirect; + inst_data.isIndirect = isIndirect; + inst_data.isCall = isCall; + inst_data.isReturn = isReturn; + inst_data.isRVC = isRVC; + return inst_data; +} + +branch_prediction::btb_pred::ResolvedTrainPacket +Fetch::buildResolvedTrainPacket(const ResolveTrainQueueEntry &entry) const +{ + const auto &target = dbpbtb->ftqTarget(entry.ftqId, entry.tid); + + branch_prediction::btb_pred::ResolvedTrainPacket packet; + packet.tid = entry.tid; + packet.target = {entry.ftqId, entry.generation}; + packet.startPC = target.startPC; + packet.numPredMetas = resolveTrainMetaCount(target); + packet.predMetas = target.predMetas; + packet.realBranches.reserve(entry.insts.size()); + + for (const auto &inst_data : entry.insts) { + branch_prediction::btb_pred::BranchInfo branch; + branch.pc = inst_data.pc; + branch.target = inst_data.target; + branch.resolved = true; + branch.isCond = inst_data.isCond; + branch.isDirect = inst_data.isDirect; + branch.isIndirect = inst_data.isIndirect; + branch.isCall = inst_data.isCall; + branch.isReturn = inst_data.isReturn; + branch.size = inst_data.isRVC ? 
RvcInstBytes : BaseInstBytes; + + packet.realBranches.emplace_back( + branch, inst_data.taken, inst_data.mispredict, inst_data.ftqOffset); + } + + return packet; +} + +void +Fetch::appendResolveTrainInst( + ResolveTrainQueueEntry &entry, const ResolveTrainInstData &inst_data) +{ + auto existing = std::find_if( + entry.insts.begin(), entry.insts.end(), + [&inst_data](const ResolveTrainInstData &queued_inst) { + return queued_inst.ftqOffset == inst_data.ftqOffset && + queued_inst.pc == inst_data.pc; + }); + if (existing != entry.insts.end()) { + *existing = inst_data; + } else { + entry.insts.push_back(inst_data); + } + + std::sort(entry.insts.begin(), entry.insts.end(), + [](const ResolveTrainInstData &lhs, const ResolveTrainInstData &rhs) { + if (lhs.ftqOffset != rhs.ftqOffset) { + return lhs.ftqOffset < rhs.ftqOffset; + } + return lhs.pc < rhs.pc; + }); +} + +void +Fetch::filterResolveTrainQueue() +{ + for (auto it = resolveTrainQueue.begin(); it != resolveTrainQueue.end();) { + if (!dbpbtb->ftqHasTarget(it->ftqId, it->tid)) { + fetchStats.fullResolveEntriesDroppedStaleTarget++; + it = resolveTrainQueue.erase(it); + continue; + } + + if (!dbpbtb->ftqMatchTargetIdentity( + it->ftqId, it->generation, it->tid)) { + fetchStats.fullResolveEntriesDroppedGenerationMismatch++; + it = resolveTrainQueue.erase(it); + continue; + } + + ++it; + } +} + bool Fetch::handleCommitSignals(ThreadID tid) { @@ -1660,6 +1900,11 @@ Fetch::buildInst(ThreadID tid, StaticInstPtr staticInst, DPRINTF(DecoupleBP, "Set instruction %lu with fetch id %lu\n", instruction->seqNum, dbpbtb->ftqHeadId(0)); instruction->setFtqId(dbpbtb->ftqHeadId(0)); + const auto &fetch_target = dbpbtb->ftqFetchingTarget(tid); + instruction->setFtqGeneration(fetch_target.generation); + constexpr unsigned instShiftAmt = 1; + instruction->setFtqOffset(static_cast( + (this_pc.instAddr() - fetch_target.startPC) >> instShiftAmt)); #if TRACING_ON if (trace) { diff --git a/src/cpu/o3/fetch.hh b/src/cpu/o3/fetch.hh index 
19091ef30e..8f2f7aaba5 100644 --- a/src/cpu/o3/fetch.hh +++ b/src/cpu/o3/fetch.hh @@ -470,6 +470,39 @@ class Fetch */ void handleIEWSignals(); + struct ResolveTrainInstData + { + Addr pc; + Addr target; + bool taken; + bool mispredict; + uint8_t ftqOffset; + bool isCond; + bool isDirect; + bool isIndirect; + bool isCall; + bool isReturn; + bool isRVC; + }; + + struct ResolveTrainQueueEntry + { + ThreadID tid; + branch_prediction::btb_pred::FetchTargetId ftqId; + uint64_t generation; + std::vector insts; + }; + + ResolveTrainInstData makeResolveTrainInstData( + Addr pc, Addr target, bool taken, bool mispredict, uint8_t ftqOffset, + bool isCond, bool isDirect, bool isIndirect, bool isCall, + bool isReturn, bool isRVC) const; + branch_prediction::btb_pred::ResolvedTrainPacket buildResolvedTrainPacket( + const ResolveTrainQueueEntry &entry) const; + void appendResolveTrainInst( + ResolveTrainQueueEntry &entry, const ResolveTrainInstData &inst_data); + void filterResolveTrainQueue(); + /** Handles decode squash signals. * @return: Returns true if squash occurred and immediate return needed. */ @@ -603,9 +636,21 @@ class Fetch /** Maximum number of resolve entries buffered in fetch before training. */ const unsigned resolveQueueSize; + /** Enable packet-based resolve training rollout plumbing. */ + const bool enableFullResolveTrain; + + /** Keep legacy PC-only resolve updates enabled. */ + const bool enableLegacyResolveUpdate; + /** FIFO storing resolve entries waiting for BPU training. */ std::deque resolveQueue; + /** FIFO storing aggregated full resolve-train entries. */ + std::deque resolveTrainQueue; + + /** Ensures IEW signals are consumed only once per cycle. */ + Tick lastIewSignalHandleTick = Tick(-1); + /** Trace-mode implementation owner (optional, enabled by params). */ std::unique_ptr traceFetch; @@ -1095,6 +1140,18 @@ class Fetch statistics::Distribution resolveEnqueueCount; /** Stat for entry occupancy distribution of the resolve queue. 
*/ statistics::Distribution resolveQueueOccupancy; + /** Full resolve entries observed at fetch. */ + statistics::Scalar fullResolveEntriesReceived; + /** Full resolve entries merged with an existing target. */ + statistics::Scalar fullResolveEntriesMerged; + /** Full resolve entries dropped because the queue is full. */ + statistics::Scalar fullResolveEntriesDroppedQueueFull; + /** Full resolve entries dropped because the target went stale. */ + statistics::Scalar fullResolveEntriesDroppedStaleTarget; + /** Full resolve entries dropped because generation did not match. */ + statistics::Scalar fullResolveEntriesDroppedGenerationMismatch; + /** Full resolve packets sent to the predictor. */ + statistics::Scalar fullResolvePacketsSent; // Trace metadata accounting (trace mode) /** Number of stored trace metadata records (seqNum -> traceInst). */ diff --git a/src/cpu/o3/iew.cc b/src/cpu/o3/iew.cc index 81c261bc40..74394292f8 100644 --- a/src/cpu/o3/iew.cc +++ b/src/cpu/o3/iew.cc @@ -1443,14 +1443,9 @@ void IEW::SquashCheckAfterExe(DynInstPtr inst) { ThreadID tid = inst->threadNumber; + const auto ¶ms = static_cast(cpu->params()); - if (inst->isControl()) { - auto &resolved_cfis = toFetch->iewInfo[tid].resolvedCFIs; - TimeStruct::IewComm::ResolvedCFIEntry entry; - entry.ftqId = inst->getFtqId(); - entry.pc = inst->getPC(); - resolved_cfis.push_back(entry); - } + const bool is_control = inst->isControl(); if (!fetchRedirect[tid] || !toCommit->squash[tid] || @@ -1466,7 +1461,39 @@ IEW::SquashCheckAfterExe(DynInstPtr inst) inst->pcState(*new_pc); } - if (inst->mispredicted() && !loadNotExecuted) { + const bool control_mispredict = + is_control && !loadNotExecuted && inst->mispredicted(); + + if (is_control) { + if (params.enableLegacyResolveUpdate) { + auto &resolved_cfis = toFetch->iewInfo[tid].resolvedCFIs; + TimeStruct::IewComm::ResolvedCFIEntry entry; + entry.ftqId = inst->getFtqId(); + entry.pc = inst->getPC(); + resolved_cfis.push_back(entry); + } + + if 
(params.enableFullResolveTrain) { + auto &resolve_entries = toFetch->iewInfo[tid].resolveTrainEntries; + TimeStruct::IewComm::ResolveTrainEntry entry; + entry.ftqId = inst->getFtqId(); + entry.ftqGeneration = inst->getFtqGeneration(); + entry.pc = inst->getPC(); + entry.target = inst->getControlTarget(); + entry.taken = inst->branching(); + entry.mispredict = control_mispredict; + entry.ftqOffset = inst->getFtqOffset(); + entry.isCond = inst->isCondCtrl(); + entry.isDirect = inst->isDirectCtrl(); + entry.isIndirect = inst->isIndirectCtrl(); + entry.isCall = inst->isCall(); + entry.isReturn = inst->isReturn(); + entry.isRVC = inst->isRVC(); + resolve_entries.push_back(entry); + } + } + + if (control_mispredict) { fetchRedirect[tid] = true; DPRINTF(IEW, "[tid:%i] [sn:%llu] Execute: " @@ -1555,6 +1582,7 @@ IEW::executeInsts() // Clear resolvedFSQId and resolvedInstPC since they are already handled in frontend ThreadID tid = *activeThreads->begin(); toFetch->iewInfo[tid].resolvedCFIs.clear(); + toFetch->iewInfo[tid].resolveTrainEntries.clear(); // Execute/writeback any instructions that are available. 
int insts_to_execute = fromIssue->size; diff --git a/src/cpu/pred/BranchPredictor.py b/src/cpu/pred/BranchPredictor.py index 0dc9570a15..eddbaa5844 100644 --- a/src/cpu/pred/BranchPredictor.py +++ b/src/cpu/pred/BranchPredictor.py @@ -1206,3 +1206,7 @@ class DecoupledBPUWithBTB(BranchPredictor): bpDBSwitches = VectorParam.String([], "Enable which traces in the form of database") resolveBlockThreshold = Param.Unsigned(8, "Consecutive resolve dequeue failures before blocking prediction once") + enableFullResolveTrain = Param.Bool(Parent.enableFullResolveTrain, + "Enable packet-based resolve training rollout plumbing") + enableLegacyResolveUpdate = Param.Bool(Parent.enableLegacyResolveUpdate, + "Enable legacy PC-only resolve update") diff --git a/src/cpu/pred/btb/common.hh b/src/cpu/pred/btb/common.hh index e00e7fbcf7..ed6dd5f967 100644 --- a/src/cpu/pred/btb/common.hh +++ b/src/cpu/pred/btb/common.hh @@ -1,8 +1,11 @@ #ifndef __CPU_PRED_BTB_STREAM_STRUCT_HH__ #define __CPU_PRED_BTB_STREAM_STRUCT_HH__ +#include +#include #include #include +#include #include @@ -95,7 +98,9 @@ struct BranchInfo bool isUncond() const { return !this->isCond; } Addr getEnd() { return this->pc + this->size; } BranchInfo() - : pc(0), target(0), resolved(false), isCond(false), isIndirect(false), isCall(false), isReturn(false), size(0) + : pc(0), target(0), resolved(false), isCond(false), + isIndirect(false), isDirect(false), isCall(false), + isReturn(false), size(0) { } // BranchInfo(const Addr &pc, const Addr &target_pc, bool is_cond) : @@ -248,6 +253,56 @@ struct LFSR64 }; using FetchTargetId = uint64_t; +constexpr size_t MaxPredictorComponents = 8; + +struct FetchTargetIdentity +{ + FetchTargetId id; + uint64_t generation; + + FetchTargetIdentity() : id(0), generation(0) {} + FetchTargetIdentity(FetchTargetId id, uint64_t generation) + : id(id), generation(generation) + { + } +}; + +struct ResolvedBranch +{ + BranchInfo branch; + bool taken; + bool mispredict; + uint8_t ftqOffset; + + 
ResolvedBranch() + : branch(), taken(false), mispredict(false), ftqOffset(0) + { + } + + ResolvedBranch(const BranchInfo &branch, bool taken, bool mispredict, + uint8_t ftqOffset) + : branch(branch), taken(taken), mispredict(mispredict), + ftqOffset(ftqOffset) + { + } +}; + +struct ResolvedTrainPacket +{ + ThreadID tid; + FetchTargetIdentity target; + Addr startPC; + size_t numPredMetas; + std::array, MaxPredictorComponents> predMetas; + std::vector realBranches; + + ResolvedTrainPacket() + : tid(0), target(), startPC(0), numPredMetas(0), predMetas(), + realBranches() + { + predMetas.fill(nullptr); + } +}; // {branch pc -> istaken} maps using CondTakens = std::vector>; @@ -276,6 +331,7 @@ using IndirectTargets = std::vector>; struct FetchTarget { ThreadID tid; + uint64_t generation; Addr startPC; // start pc of the stream bool predTaken; // whether the FetchTarget has taken branch Addr predEndPC; // predicted stream end pc (fall through pc) @@ -306,7 +362,7 @@ struct FetchTarget // prediction metas // FIXME: use vec - std::array, 8> predMetas; // each component has a meta, TODO + std::array, MaxPredictorComponents> predMetas; Tick predTick; // tick of the prediction boost::dynamic_bitset<> history; // record GHR/s0History @@ -323,7 +379,8 @@ struct FetchTarget int s3Source; // which stage the prediction comes from FetchTarget() - : startPC(0), + : generation(0), + startPC(0), predTaken(false), predEndPC(0), predBranchInfo(BranchInfo()), diff --git a/src/cpu/pred/btb/decoupled_bpred.cc b/src/cpu/pred/btb/decoupled_bpred.cc index 37cf705814..b7bad787ef 100644 --- a/src/cpu/pred/btb/decoupled_bpred.cc +++ b/src/cpu/pred/btb/decoupled_bpred.cc @@ -21,6 +21,66 @@ namespace branch_prediction namespace btb_pred { +namespace +{ + +bool +validateResolvedTrainPacket(const ResolvedTrainPacket &packet, + const FetchTarget &target, + unsigned numComponents) +{ + if (packet.numPredMetas > packet.predMetas.size()) { + return false; + } + + if (numComponents > 
packet.predMetas.size() || + packet.numPredMetas != numComponents) { + return false; + } + + for (unsigned i = 0; i < numComponents; ++i) { + if (packet.predMetas[i] != target.predMetas[i]) { + return false; + } + } + + uint8_t lastOffset = 0; + Addr lastPc = 0; + bool firstBranch = true; + bool seenTaken = false; + for (const auto &resolved : packet.realBranches) { + if (resolved.branch.pc < packet.startPC) { + return false; + } + + if (resolved.branch.size == 0) { + return false; + } + + if (seenTaken) { + return false; + } + + if (!firstBranch) { + if (resolved.ftqOffset < lastOffset) { + return false; + } + if (resolved.ftqOffset == lastOffset && resolved.branch.pc <= lastPc) { + return false; + } + } + + lastOffset = resolved.ftqOffset; + lastPc = resolved.branch.pc; + seenTaken = resolved.taken; + firstBranch = false; + } + + return true; +} + +} // anonymous namespace + void DecoupledBPUWithBTB::consumeFetchTarget(unsigned fetched_inst_num, ThreadID tid) { @@ -48,6 +108,7 @@ DecoupledBPUWithBTB::DecoupledBPUWithBTB(const DecoupledBPUWithBTBParams &p) ftq(2, p.ftq_size), historyManager(16), // TODO: fix this resolveBlockThreshold(p.resolveBlockThreshold), + enableFullResolveTrain(p.enableFullResolveTrain), dbpBtbStats(this, p.numStages, p.fsq_size, maxInstsNum) { if (bpDBSwitches.size() > 0) { @@ -598,6 +659,11 @@ DecoupledBPUWithBTB::resolveUpdate(unsigned &target_id, ThreadID tid) // Phase 1: probe all resolved-update components to ensure no blocker for (int i = 0; i < numComponents; ++i) { if (components[i]->getResolvedUpdate()) { + if (enableFullResolveTrain && + (components[i] == mbtb || components[i] == tage || + components[i] == ittage)) { + continue; + } if (!components[i]->canResolveUpdate(target)) { return false; } @@ -607,6 +673,11 @@ DecoupledBPUWithBTB::resolveUpdate(unsigned &target_id, ThreadID tid) // Phase 2: all clear, perform updates once for (int i = 0; i < numComponents; ++i) { if (components[i]->getResolvedUpdate()) { + if 
(enableFullResolveTrain && + (components[i] == mbtb || components[i] == tage || + components[i] == ittage)) { + continue; + } components[i]->doResolveUpdate(target); } } @@ -620,6 +691,58 @@ DecoupledBPUWithBTB::notifyResolveSuccess() resolveDequeueFailCounter = 0; } +bool +DecoupledBPUWithBTB::resolveTrain( + const ResolvedTrainPacket &packet, ThreadID tid) +{ + if (packet.tid != tid) { + DPRINTF(DecoupleBP, + "Resolve-train packet tid mismatch: packet=%u arg=%u\n", + packet.tid, tid); + return false; + } + + if (!ftq.matchTargetIdentity(packet.target.id, packet.target.generation, + tid)) { + DPRINTF(DecoupleBP, + "Resolve-train packet target mismatch: id=%lu generation=%lu tid=%u\n", + packet.target.id, packet.target.generation, tid); + return false; + } + + const auto &target = ftq.get(packet.target.id, tid); + if (packet.startPC != target.startPC) { + DPRINTF(DecoupleBP, + "Resolve-train packet startPC mismatch: packet=%#lx ftq=%#lx id=%lu tid=%u\n", + packet.startPC, target.startPC, packet.target.id, tid); + return false; + } + + if (!validateResolvedTrainPacket(packet, target, numComponents)) { + DPRINTF(DecoupleBP, + "Resolve-train packet validation failed: id=%lu generation=%lu tid=%u\n", + packet.target.id, packet.target.generation, tid); + return false; + } + + DPRINTF(DecoupleBP, + "Resolve-train packet accepted: id=%lu generation=%lu tid=%u startPC=%#lx branches=%zu\n", + packet.target.id, packet.target.generation, tid, packet.startPC, + packet.realBranches.size()); + + for (int i = 0; i < numComponents; ++i) { + if (!components[i]->canResolveTrain(packet)) { + return false; + } + } + + for (int i = 0; i < numComponents; ++i) { + components[i]->resolveTrain(packet); + } + + return true; +} + void DecoupledBPUWithBTB::notifyResolveFailure() { @@ -652,7 +775,7 @@ DecoupledBPUWithBTB::prepareResolveUpdateEntries(unsigned &target_id, ThreadID t target.setUpdateBTBEntries(); // only mbtb can generate new entry - if (mbtb->isEnabled()) { + if 
(mbtb->isEnabled() && !enableFullResolveTrain) { mbtb->getAndSetNewBTBEntry(target); } } @@ -668,7 +791,8 @@ DecoupledBPUWithBTB::markCFIResolved(unsigned &target_id, uint64_t resolvedInstP } auto &target = ftq.get(target_id, tid); - if (target.updateNewBTBEntry.pc == resolvedInstPC) { + if (!enableFullResolveTrain && + target.updateNewBTBEntry.pc == resolvedInstPC) { target.updateNewBTBEntry.resolved = true; } @@ -685,7 +809,7 @@ DecoupledBPUWithBTB::updatePredictorComponents(FetchTarget &target) target.setUpdateBTBEntries(); // only mbtb can generate new entry - if (mbtb->isEnabled()) { + if (mbtb->isEnabled() && !enableFullResolveTrain) { mbtb->getAndSetNewBTBEntry(target); } diff --git a/src/cpu/pred/btb/decoupled_bpred.hh b/src/cpu/pred/btb/decoupled_bpred.hh index 288450001f..6a157a610a 100644 --- a/src/cpu/pred/btb/decoupled_bpred.hh +++ b/src/cpu/pred/btb/decoupled_bpred.hh @@ -144,6 +144,7 @@ class DecoupledBPUWithBTB : public BPredUnit HistoryManager historyManager; unsigned resolveDequeueFailCounter{0}; const unsigned resolveBlockThreshold; + const bool enableFullResolveTrain; ThreadID scheduleThread() { return 0; } @@ -404,6 +405,25 @@ class DecoupledBPUWithBTB : public BPredUnit bool ftqHasFetching(ThreadID tid) const { return ftq.hasTarget(ftq.fetchId(tid), tid); } FetchTargetId ftqHeadId(ThreadID tid) const { assert(ftqHasFetching(tid)); return ftq.fetchId(tid); } const FetchTarget &ftqFetchingTarget(ThreadID tid) { assert(ftqHasFetching(tid)); return ftq.fetching(tid); } + bool ftqHasTarget(FetchTargetId targetId, ThreadID tid) const + { + return ftq.hasTarget(targetId, tid); + } + uint64_t ftqTargetGeneration(FetchTargetId targetId, ThreadID tid) const + { + return ftq.getTargetGeneration(targetId, tid); + } + const FetchTarget &ftqTarget(FetchTargetId targetId, ThreadID tid) const + { + assert(ftq.hasTarget(targetId, tid)); + return ftq.get(targetId, tid); + } + bool ftqMatchTargetIdentity( + FetchTargetId targetId, uint64_t generation, ThreadID 
tid) const + { + return ftq.matchTargetIdentity(targetId, generation, tid); + } + bool resolveTrain(const ResolvedTrainPacket &packet, ThreadID tid); void dumpFsq(const char *when); diff --git a/src/cpu/pred/btb/ftq.cc b/src/cpu/pred/btb/ftq.cc index 3642ef7162..ac20333b6c 100644 --- a/src/cpu/pred/btb/ftq.cc +++ b/src/cpu/pred/btb/ftq.cc @@ -28,6 +28,7 @@ FetchTargetQueue::insert(FetchTarget& target) { ThreadID tid = target.tid; assert(queue[tid].cap.size() < ftqSize[tid]); + target.generation = queue[tid].nextGeneration++; queue[tid].cap.push_back(target); } diff --git a/src/cpu/pred/btb/ftq.hh b/src/cpu/pred/btb/ftq.hh index c43d071447..23de9c9baf 100644 --- a/src/cpu/pred/btb/ftq.hh +++ b/src/cpu/pred/btb/ftq.hh @@ -28,6 +28,7 @@ class FetchTargetQueue std::deque cap; FetchTargetId baseTargetId = 1; FetchTargetId fetchptr = 1; + uint64_t nextGeneration = 1; } queue[MaxThreads]; uint32_t roundRobinPtr = 0; @@ -48,6 +49,11 @@ public: inline FetchTarget& front(ThreadID tid) { return queue[tid].cap.front(); } inline FetchTarget& back(ThreadID tid) { return queue[tid].cap.back(); } inline FetchTarget& fetching(ThreadID tid) { return get(queue[tid].fetchptr, tid); } + inline const FetchTarget& get(FetchTargetId targetId, ThreadID tid) const { + assert(targetId >= queue[tid].baseTargetId && + targetId < queue[tid].baseTargetId + queue[tid].cap.size()); + return queue[tid].cap[targetId - queue[tid].baseTargetId]; + } inline FetchTarget& get(FetchTargetId targetId, ThreadID tid) { assert(targetId >= queue[tid].baseTargetId && targetId < queue[tid].baseTargetId + queue[tid].cap.size()); @@ -57,6 +63,17 @@ public: return targetId >= queue[tid].baseTargetId && targetId < queue[tid].baseTargetId + queue[tid].cap.size(); } + // Plumbing-only in Task 2: end-to-end stale resolve rejection is added + // when resolve entries carry generation in later tasks. 
+ inline uint64_t getTargetGeneration(FetchTargetId targetId, ThreadID tid) const { + return get(targetId, tid).generation; + } + inline bool matchTargetIdentity( + FetchTargetId targetId, uint64_t generation, ThreadID tid) const + { + return hasTarget(targetId, tid) && + getTargetGeneration(targetId, tid) == generation; + } inline bool empty(ThreadID tid) const { return queue[tid].cap.empty(); } inline bool full(ThreadID tid) const { return queue[tid].cap.size() >= ftqSize[tid]; } inline bool anyEmpty() const { diff --git a/src/cpu/pred/btb/timed_base_pred.hh b/src/cpu/pred/btb/timed_base_pred.hh index fce1a6aef1..2f1b04cba2 100644 --- a/src/cpu/pred/btb/timed_base_pred.hh +++ b/src/cpu/pred/btb/timed_base_pred.hh @@ -79,6 +79,11 @@ class TimedBaseBTBPredictor: public SimObject // Two-phase resolved update: probe first, then apply virtual bool canResolveUpdate(const FetchTarget &entry) { return true; } virtual void doResolveUpdate(const FetchTarget &entry) { update(entry); } + virtual bool canResolveTrain(const ResolvedTrainPacket &packet) + { + return true; + } + virtual void resolveTrain(const ResolvedTrainPacket &packet) {} #ifndef UNIT_TEST // do some statistics on a per-branch and per-predictor basis virtual void commitBranch(const FetchTarget &entry, const DynInstPtr &inst) {} From a7fa5978b9ee18f7c293974944ca30e7c362165c Mon Sep 17 00:00:00 2001 From: Yakkhini <59007159+Yakkhini@users.noreply.github.com> Date: Thu, 2 Apr 2026 22:59:14 +0800 Subject: [PATCH 02/15] cpu: migrate resolved updates to train packets Move MBTB, BTBTAGE, and BTBITTAGE to packet-based resolve training so they consume execution truth directly while preserving legacy fallback for non-migrated components. 
Change-Id: I4682438d18796127542b9c9c6a23efcb1415a413 --- src/cpu/pred/btb/btb_ittage.cc | 170 ++++++++++++++++++++++- src/cpu/pred/btb/btb_ittage.hh | 2 + src/cpu/pred/btb/btb_tage.cc | 159 +++++++++++++++++++-- src/cpu/pred/btb/btb_tage.hh | 15 +- src/cpu/pred/btb/mbtb.cc | 185 +++++++++++++++++++++++++ src/cpu/pred/btb/mbtb.hh | 24 ++++ src/cpu/pred/btb/test/btb_tage.test.cc | 87 ++++++++++++ 7 files changed, 632 insertions(+), 10 deletions(-) diff --git a/src/cpu/pred/btb/btb_ittage.cc b/src/cpu/pred/btb/btb_ittage.cc index 58828467cd..f0095e0d7a 100644 --- a/src/cpu/pred/btb/btb_ittage.cc +++ b/src/cpu/pred/btb/btb_ittage.cc @@ -288,7 +288,7 @@ BTBITTAGE::update(const FetchTarget &stream) } ittageStats.updateTableHits.sample(main_info.table, 1); - if (used_alt && mispred) { + if (used_alt && alt_info.found && mispred) { auto &alt_way = tageTable[pred.altInfo.table][pred.altInfo.index]; updateCounter(false, 2, alt_way.counter); if (alt_way.counter == 0) { @@ -390,6 +390,174 @@ BTBITTAGE::update(const FetchTarget &stream) debugFlag = false; } +bool +BTBITTAGE::canResolveTrain(const ResolvedTrainPacket &packet) +{ + return true; +} + +void +BTBITTAGE::resolveTrain(const ResolvedTrainPacket &packet) +{ + auto predMeta = std::static_pointer_cast( + packet.predMetas[getComponentIdx()]); + if (!predMeta) { + DPRINTF(ITTAGE, "resolveTrain: no prediction meta, skip\n"); + return; + } + + auto preds = predMeta->preds; + auto updateTagFoldedHist = predMeta->tagFoldedHist; + auto updateAltTagFoldedHist = predMeta->altTagFoldedHist; + auto updateIndexFoldedHist = predMeta->indexFoldedHist; + + for (const auto &resolved : packet.realBranches) { + const auto &branch = resolved.branch; + if (!(branch.isIndirect && !branch.isReturn)) { + continue; + } + + BTBEntry btb_entry(branch); + auto pred_it = preds.find(branch.pc); + TagePrediction pred; + if (pred_it != preds.end()) { + pred = pred_it->second; + } + + bool mispred = resolved.mispredict; + Addr exe_target = 
branch.target; + auto &main_info = pred.mainInfo; + + if (mispred) { + ittageStats.updateMispred++; + } + bool &main_found = main_info.found; + auto &main_counter = main_info.entry.counter; + bool main_taken = main_info.taken(); + Addr main_target = main_info.entry.target; + + bool &used_alt = pred.useAlt; + auto &alt_info = pred.altInfo; + if (main_found) { + DPRINTF(ITTAGE, + "resolveTrain: provider table %d, idx %d, updating corresponding entry\n", + main_info.table, main_info.index); + auto &way = tageTable[main_info.table][main_info.index]; + updateCounter(exe_target == main_target, 2, way.counter); + if (way.counter == 0) { + way.target = exe_target; + } + bool alt_taken = + (alt_info.found && alt_info.taken()) || !pred.altInfo.found; + bool alt_diff = alt_taken != main_taken; + if (alt_diff) { + way.useful = exe_target == main_target; + } + + if (main_target == exe_target) { + ittageStats.predTargetHit++; + } + ittageStats.updateTableHits.sample(main_info.table, 1); + + if (used_alt && alt_info.found && mispred) { + auto &alt_way = tageTable[pred.altInfo.table][pred.altInfo.index]; + updateCounter(false, 2, alt_way.counter); + if (alt_way.counter == 0) { + alt_way.target = exe_target; + } + } else if (used_alt && alt_info.found && + alt_info.entry.target == exe_target) { + ittageStats.updateUseAltCorrect++; + } + DPRINTF(ITTAGE, "resolveTrain: useful bit set to %d\n", way.useful); + } + + bool use_alt_on_main_found_correct = + used_alt && main_found && main_target == exe_target; + bool needToAllocate = mispred && !use_alt_on_main_found_correct; + DPRINTF(ITTAGE, + "resolveTrain: mispred %d, use_alt_on_main_found_correct %d, needToAllocate %d\n", + mispred, use_alt_on_main_found_correct, needToAllocate); + + auto useful_mask = predMeta->usefulMask; + int alloc_table_num = numPredictors - (main_info.found ? 
main_info.table + 1 : 0); + if (main_found) { + useful_mask >>= main_info.table + 1; + useful_mask.resize(alloc_table_num); + } + int num_tables_can_allocate = (~useful_mask).count(); + bool canAllocate = num_tables_can_allocate > 0; + if (needToAllocate) { + if (canAllocate) { + usefulResetCnt -= 1; + if (usefulResetCnt <= 0) { + usefulResetCnt = 0; + } + DPRINTF(ITTAGE, + "resolveTrain: can allocate, usefulResetCnt %d\n", + usefulResetCnt); + } else { + usefulResetCnt += 1; + if (usefulResetCnt >= 256) { + usefulResetCnt = 256; + } + DPRINTF(ITTAGE, + "resolveTrain: can not allocate, usefulResetCnt %d\n", + usefulResetCnt); + } + if (usefulResetCnt == 256) { + DPRINTF(ITTAGE, "resolveTrain: reset useful bit of all entries\n"); + for (auto &table : tageTable) { + for (auto &entry : table) { + entry.useful = 0; + } + } + ittageStats.updateResetU++; + usefulResetCnt = 0; + } + } + + if (needToAllocate) { + unsigned maskMaxNum = std::pow(2, alloc_table_num); + unsigned mask = allocLFSR.get() % maskMaxNum; + bitset allocateLFSR(alloc_table_num, mask); + + auto flipped_usefulMask = useful_mask.flip(); + bitset masked = allocateLFSR & flipped_usefulMask; + bitset allocate = masked.any() ? masked : flipped_usefulMask; + + bool allocateValid = flipped_usefulMask.any(); + if (allocateValid) { + DPRINTF(ITTAGE, "resolveTrain: allocate new entry\n"); + unsigned startTable = main_found ? 
main_info.table + 1 : 0; + + for (int ti = startTable; ti < numPredictors; ti++) { + Addr newIndex = getTageIndex( + packet.startPC, ti, updateIndexFoldedHist[ti].get()); + Addr newTag = getTageTag(packet.startPC, ti, + updateTagFoldedHist[ti].get(), + updateAltTagFoldedHist[ti].get()); + assert(newIndex < tageTable[ti].size()); + auto &newEntry = tageTable[ti][newIndex]; + + if (allocate[ti - startTable]) { + DPRINTF(ITTAGE, + "resolveTrain: found allocatable entry, table %d, index %d, tag %d, counter %d\n", + ti, newIndex, newTag, 2); + newEntry = TageEntry(newTag, exe_target, 2, btb_entry.pc); + ittageStats.updateAllocSuccess++; + break; + } + } + } else { + ittageStats.updateAllocFailure++; + } + } + } + + DPRINTF(ITTAGE, "end resolveTrain\n"); +} + void BTBITTAGE::updateCounter(bool taken, unsigned width, short &counter) { int max = (1 << (width)) - 1; diff --git a/src/cpu/pred/btb/btb_ittage.hh b/src/cpu/pred/btb/btb_ittage.hh index e86b45817b..fdc9b01326 100644 --- a/src/cpu/pred/btb/btb_ittage.hh +++ b/src/cpu/pred/btb/btb_ittage.hh @@ -111,6 +111,8 @@ class BTBITTAGE : public TimedBaseBTBPredictor const FetchTarget &entry,int shamt, bool cond_taken) override; void update(const FetchTarget &entry) override; + bool canResolveTrain(const ResolvedTrainPacket &packet) override; + void resolveTrain(const ResolvedTrainPacket &packet) override; void commitBranch(const FetchTarget &stream, const DynInstPtr &inst) override; diff --git a/src/cpu/pred/btb/btb_tage.cc b/src/cpu/pred/btb/btb_tage.cc index 9f5405c878..2312c7ed5b 100644 --- a/src/cpu/pred/btb/btb_tage.cc +++ b/src/cpu/pred/btb/btb_tage.cc @@ -324,6 +324,7 @@ BTBTAGE::lookupHelper(const Addr &startPC, const std::vector &btbEntri if (btb_entry.isCond && btb_entry.valid) { auto pred = generateSinglePrediction(btb_entry, startPC); meta->preds[btb_entry.pc] = pred; + meta->btbEntries[btb_entry.pc] = btb_entry; tageStats.updateStatsWithTagePrediction(pred, true); results.push_back({btb_entry.pc, pred.taken || 
btb_entry.alwaysTaken}); tageInfoForMgscs[btb_entry.pc].tage_pred_taken = pred.taken; @@ -447,16 +448,15 @@ BTBTAGE::prepareUpdateEntries(const FetchTarget &stream) { */ bool BTBTAGE::updatePredictorStateAndCheckAllocation(const BTBEntry &entry, - bool actual_taken, - const TagePrediction &pred, - const FetchTarget &stream) { + bool actual_taken, + const TagePrediction &pred, + bool this_fb_mispred) { tageStats.updateStatsWithTagePrediction(pred, false); auto &main_info = pred.mainInfo; auto &alt_info = pred.altInfo; bool used_alt = pred.useAlt; // Use base table instead of entry.ctr for fallback prediction - Addr startPC = stream.getRealStartPC(); bool base_taken = entry.ctr >= 0; bool alt_taken = alt_info.found ? alt_info.taken() : base_taken; @@ -534,9 +534,6 @@ BTBTAGE::updatePredictorStateAndCheckAllocation(const BTBEntry &entry, } } - // Check if misprediction occurred - bool this_fb_mispred = stream.squashType == SquashType::SQUASH_CTRL && - stream.squashPC == entry.pc; if (getDelay() == 2){ if (this_fb_mispred) { tageStats.updateMispred++; @@ -682,6 +679,32 @@ BTBTAGE::canResolveUpdate(const FetchTarget &stream) { return true; } +bool +BTBTAGE::canResolveTrain(const ResolvedTrainPacket &packet) +{ + Addr startAddr = packet.startPC; + unsigned updateBank = getBankId(startAddr); + +#ifndef UNIT_TEST + tageStats.updateAccessPerBank[updateBank]++; +#endif + + if (enableBankConflict && predBankValid && updateBank == lastPredBankId) { + tageStats.updateBankConflict++; + tageStats.updateDeferredDueToConflict++; +#ifndef UNIT_TEST + tageStats.updateBankConflictPerBank[updateBank]++; +#endif + DPRINTF(TAGE, "Bank conflict detected: resolve-train bank %u conflicts with prediction bank %u, " + "deferring this packet\n", + updateBank, lastPredBankId); + predBankValid = false; + return false; + } + + return true; +} + /** * @brief Perform resolved update after probe success. 
*/ @@ -694,6 +717,123 @@ BTBTAGE::doResolveUpdate(const FetchTarget &stream) { update(stream); } +std::vector +BTBTAGE::prepareResolveTrainEntries(const ResolvedTrainPacket &packet, + const std::shared_ptr &predMeta) +{ + std::vector updates; + + for (const auto &resolved : packet.realBranches) { + if (!resolved.branch.isCond) { + continue; + } + + auto pred_it = predMeta->btbEntries.find(resolved.branch.pc); + BTBEntry entry; + if (pred_it != predMeta->btbEntries.end()) { + entry = pred_it->second; + } else { + entry = BTBEntry(resolved.branch); + entry.valid = true; + entry.alwaysTaken = false; + } + + if (entry.alwaysTaken) { + continue; + } + + updates.push_back({entry, resolved}); + } + + return updates; +} + +void +BTBTAGE::resolveTrain(const ResolvedTrainPacket &packet) +{ + if (enableBankConflict && predBankValid) { + predBankValid = false; + } + + auto predMeta = std::static_pointer_cast( + packet.predMetas[getComponentIdx()]); + if (!predMeta) { + DPRINTF(TAGE, "resolveTrain: no prediction meta, skip\n"); + return; + } + + auto entries_to_update = prepareResolveTrainEntries(packet, predMeta); + + bool hasRecomputedVsActualDiff = false; + bool hasRecomputedVsOriginalDiff = false; + for (const auto &update : entries_to_update) { + const auto &btb_entry = update.entry; + const auto &resolved = update.resolved; + auto orig_it = predMeta->preds.find(btb_entry.pc); + const bool has_original_pred = orig_it != predMeta->preds.end(); + TagePrediction original_pred; + if (has_original_pred) { + original_pred = orig_it->second; + } + bool actual_taken = resolved.taken; + +#ifndef UNIT_TEST + if (has_original_pred && original_pred.finalProviderTable >= 0) { + if (original_pred.taken == actual_taken) { + tageStats.updateFinalSourceTableCorrect[ + original_pred.finalProviderTable]++; + } else { + tageStats.updateFinalSourceTableWrong[ + original_pred.finalProviderTable]++; + } + } else if (has_original_pred && original_pred.taken == actual_taken) { + 
tageStats.updateFinalSourceBaseCorrect++; + } else if (has_original_pred) { + tageStats.updateFinalSourceBaseWrong++; + } +#endif + + TagePrediction recomputed = updateOnRead ? + generateSinglePrediction(btb_entry, packet.startPC, predMeta) : + original_pred; + + if (has_original_pred && updateOnRead && + recomputed.taken != original_pred.taken) { + hasRecomputedVsOriginalDiff = true; + } + if (recomputed.taken != actual_taken) { + hasRecomputedVsActualDiff = true; + } + + bool need_allocate = updatePredictorStateAndCheckAllocation( + btb_entry, actual_taken, recomputed, resolved.mispredict); + + if (need_allocate) { + uint start_table = 0; + auto &main_info = recomputed.mainInfo; + if (main_info.found) { + start_table = main_info.table + 1; + } + + uint64_t allocated_table = 0; + uint64_t allocated_index = 0; + uint64_t allocated_way = 0; + handleNewEntryAllocation(packet.startPC, btb_entry, actual_taken, + start_table, predMeta, allocated_table, + allocated_index, allocated_way); + } + } + + if (hasRecomputedVsActualDiff) { + tageStats.recomputedVsActualDiff++; + } + if (hasRecomputedVsOriginalDiff) { + tageStats.recomputedVsOriginalDiff++; + } + + DPRINTF(TAGE, "end resolveTrain\n"); +} + /** * @brief Updates the TAGE predictor state based on actual branch execution results * @@ -767,7 +907,10 @@ BTBTAGE::update(const FetchTarget &stream) { } // Update predictor state and check if need to allocate new entry - bool need_allocate = updatePredictorStateAndCheckAllocation(btb_entry, actual_taken, recomputed, stream); + bool need_allocate = updatePredictorStateAndCheckAllocation( + btb_entry, actual_taken, recomputed, + stream.squashType == SquashType::SQUASH_CTRL && + stream.squashPC == btb_entry.pc); // Handle new entry allocation if needed bool alloc_success = false; diff --git a/src/cpu/pred/btb/btb_tage.hh b/src/cpu/pred/btb/btb_tage.hh index f7dd76814c..d442ed8623 100644 --- a/src/cpu/pred/btb/btb_tage.hh +++ b/src/cpu/pred/btb/btb_tage.hh @@ -161,6 +161,8 @@ 
class BTBTAGE : public TimedBaseBTBPredictor void update(const FetchTarget &entry) override; bool canResolveUpdate(const FetchTarget &entry) override; void doResolveUpdate(const FetchTarget &entry) override; + bool canResolveTrain(const ResolvedTrainPacket &packet) override; + void resolveTrain(const ResolvedTrainPacket &packet) override; #ifndef UNIT_TEST void commitBranch(const FetchTarget &stream, const DynInstPtr &inst) override; @@ -413,6 +415,7 @@ public: typedef struct TageMeta { std::unordered_map preds; + std::unordered_map btbEntries; std::vector tagFoldedHist; std::vector altTagFoldedHist; std::vector indexFoldedHist; @@ -436,7 +439,17 @@ private: bool updatePredictorStateAndCheckAllocation(const BTBEntry &entry, bool actual_taken, const TagePrediction &pred, - const FetchTarget &stream); + bool this_fb_mispred); + + struct ResolveTrainUpdate + { + BTBEntry entry; + ResolvedBranch resolved; + }; + + std::vector + prepareResolveTrainEntries(const ResolvedTrainPacket &packet, + const std::shared_ptr &predMeta); // Helper method to handle new entry allocation bool handleNewEntryAllocation(const Addr &startPC, diff --git a/src/cpu/pred/btb/mbtb.cc b/src/cpu/pred/btb/mbtb.cc index 4ab8445677..4a3b29bfd6 100644 --- a/src/cpu/pred/btb/mbtb.cc +++ b/src/cpu/pred/btb/mbtb.cc @@ -29,6 +29,8 @@ #include "cpu/pred/btb/mbtb.hh" +#include + #include "base/intmath.hh" // Additional conditional includes based on build mode @@ -487,6 +489,39 @@ MBTB::checkPredictionHit(const FetchTarget &stream, const BTBMeta* meta) } +void +MBTB::checkPredictionHit(const ResolvedTrainPacket &packet, const BTBMeta *meta) +{ + const ResolvedBranch *taken_branch = nullptr; + for (const auto &resolved : packet.realBranches) { + if (resolved.taken) { + taken_branch = &resolved; + break; + } + } + + if (!taken_branch) { + btbStats.updateHit++; + return; + } + + bool pred_branch_hit = false; + for (const auto &e : meta->hit_entries) { + if (taken_branch->branch == e) { + pred_branch_hit = 
true; + break; + } + } + + if (!pred_branch_hit) { + DPRINTF(BTB, "resolve-train miss detected, pc %#lx\n", + taken_branch->branch.pc); + btbStats.updateMiss++; + } else { + btbStats.updateHit++; + } +} + /** * Update or replace BTB entry @@ -553,6 +588,58 @@ MBTB::updateBTBEntry(const BTBEntry& entry, const FetchTarget &stream) } } +void +MBTB::updateBTBEntry(const BTBEntry &entry, const ResolvedBranch &resolved) +{ + btbStats.updateTotal++; + Addr alignedPC = entry.pc & ~(blockSize - 1); + int sram_id = getSRAMId(alignedPC); + auto &target_sram = (sram_id == 0) ? sram0 : sram1; + auto &target_mru = (sram_id == 0) ? mru0 : mru1; + + Addr btb_idx = getIndex(entry.pc); + + bool found = false; + auto it = target_sram[btb_idx].begin(); + for (; it != target_sram[btb_idx].end(); ++it) { + if (*it == entry) { + found = true; + break; + } + } + + bool found_in_vc = false; + int vc_idx = -1; + for (int i = 0; i < (int)victimCache.size(); ++i) { + auto &vc_entry = victimCache[i]; + if (vc_entry.valid && vc_entry.pc == entry.pc) { + found_in_vc = true; + vc_idx = i; + break; + } + } + + const BTBEntry *existing_ptr = nullptr; + if (found) { + existing_ptr = static_cast(&(*it)); + } else if (found_in_vc) { + existing_ptr = static_cast(&victimCache[vc_idx]); + } + + auto entry_to_write = buildUpdatedEntry(entry, existing_ptr, resolved); + auto ticked_entry = TickedBTBEntry(entry_to_write, curTick()); + + if (found) { + updateExistingInSRAMSet(btb_idx, target_mru[btb_idx], it, ticked_entry); + } else if (found_in_vc) { + commitToVictimCache(vc_idx, ticked_entry); + return; + } else { + replaceOldestInSRAMSet(sram_id, btb_idx, target_mru[btb_idx], + ticked_entry); + } +} + BTBEntry MBTB::buildUpdatedEntry(const BTBEntry& req_entry, const BTBEntry* existing_entry, @@ -586,6 +673,38 @@ MBTB::buildUpdatedEntry(const BTBEntry& req_entry, return entry_to_write; } +BTBEntry +MBTB::buildUpdatedEntry(const BTBEntry &req_entry, + const BTBEntry *existing_entry, + const ResolvedBranch 
&resolved) +{ + auto entry_to_write = (req_entry.isCond && existing_entry) + ? BTBEntry(*existing_entry) + : req_entry; + entry_to_write.tag = getTag(entry_to_write.pc); + entry_to_write.resolved = false; + + if (entry_to_write.isCond) { + bool this_cond_taken = resolved.taken && + resolved.branch.pc == entry_to_write.pc; + if (!this_cond_taken) { + entry_to_write.alwaysTaken = false; + DPRINTF(BTB, "BTB: unset alwaysTaken, pc %#lx, alwaysTaken %d\n", + entry_to_write.pc, entry_to_write.alwaysTaken); + } + if (!entry_to_write.alwaysTaken) { + updateCtr(entry_to_write.ctr, this_cond_taken); + } + } + + if (entry_to_write.isIndirect && resolved.taken && + resolved.branch.pc == entry_to_write.pc) { + entry_to_write.target = resolved.branch.target; + } + + return entry_to_write; +} + void MBTB::updateExistingInSRAMSet(Addr btb_idx, BTBHeap &heap, @@ -694,6 +813,29 @@ MBTB::update(const FetchTarget &stream) } } +bool +MBTB::canResolveTrain(const ResolvedTrainPacket &packet) +{ + return getComponentIdx() < packet.numPredMetas; +} + +void +MBTB::resolveTrain(const ResolvedTrainPacket &packet) +{ + auto meta = std::static_pointer_cast( + packet.predMetas[getComponentIdx()]); + if (!meta) { + return; + } + + checkPredictionHit(packet, meta.get()); + + auto entries_need_update = prepareResolveTrainEntries(packet, meta.get()); + for (const auto &update : entries_need_update) { + updateBTBEntry(update.entry, update.resolved); + } +} + std::vector MBTB::prepareUpdateEntries(const FetchTarget &stream) { auto all_entries = stream.updateBTBEntries; @@ -718,6 +860,49 @@ MBTB::prepareUpdateEntries(const FetchTarget &stream) { return all_entries; } +std::vector +MBTB::prepareResolveTrainEntries(const ResolvedTrainPacket &packet, + const BTBMeta *meta) +{ + std::vector updates; + std::unordered_set predicted_hits; + for (const auto &entry : meta->hit_entries) { + predicted_hits.insert(entry.pc); + } + + for (const auto &resolved : packet.realBranches) { + auto hit_it = 
std::find_if(meta->hit_entries.begin(), meta->hit_entries.end(), + [&resolved](const BTBEntry &entry) { + return entry.pc == resolved.branch.pc; + }); + if (hit_it != meta->hit_entries.end()) { + updates.push_back({*hit_it, resolved}); + continue; + } + + if (!resolved.taken) { + continue; + } + + DPRINTF(BTB, "Creating resolve-train BTB entry for pc %#lx\n", + resolved.branch.pc); + BTBEntry new_entry(resolved.branch); + new_entry.valid = true; + if (new_entry.isCond) { + new_entry.alwaysTaken = true; + new_entry.ctr = 0; + btbStats.newEntryWithCond++; + } else { + btbStats.newEntryWithUncond++; + } + btbStats.newEntry++; + new_entry.tag = getTag(new_entry.pc); + updates.push_back({new_entry, resolved}); + } + + return updates; +} + /** * Victim cache operations implementation */ diff --git a/src/cpu/pred/btb/mbtb.hh b/src/cpu/pred/btb/mbtb.hh index d736d0f55c..c6990c7ccb 100644 --- a/src/cpu/pred/btb/mbtb.hh +++ b/src/cpu/pred/btb/mbtb.hh @@ -171,6 +171,10 @@ class MBTB : public TimedBaseBTBPredictor */ void update(const FetchTarget &stream) override; + bool canResolveTrain(const ResolvedTrainPacket &packet) override; + + void resolveTrain(const ResolvedTrainPacket &packet) override; + std::vector prepareUpdateEntries(const FetchTarget &stream); void printBTBEntry(const BTBEntry &e, uint64_t tick = 0) { @@ -277,17 +281,37 @@ class MBTB : public TimedBaseBTBPredictor void checkPredictionHit(const FetchTarget &stream, const BTBMeta* meta); + void checkPredictionHit(const ResolvedTrainPacket &packet, + const BTBMeta *meta); + + struct ResolveTrainUpdate + { + BTBEntry entry; + ResolvedBranch resolved; + }; + + std::vector + prepareResolveTrainEntries(const ResolvedTrainPacket &packet, + const BTBMeta *meta); + /** Update or replace BTB entry * @param entry Entry to update/replace (PC used to select SRAM and calculate index/tag) * @param stream Fetch stream with update info */ void updateBTBEntry(const BTBEntry& entry, const FetchTarget &stream); + void 
updateBTBEntry(const BTBEntry &entry, + const ResolvedBranch &resolved); + // Helper: build updated entry (ctr/alwaysTaken/indirect target/tag) BTBEntry buildUpdatedEntry(const BTBEntry& req_entry, const BTBEntry* existing_entry, const FetchTarget &stream); + BTBEntry buildUpdatedEntry(const BTBEntry &req_entry, + const BTBEntry *existing_entry, + const ResolvedBranch &resolved); + // Helper: update an existing entry in SRAM set void updateExistingInSRAMSet(Addr btb_idx, BTBHeap &heap, diff --git a/src/cpu/pred/btb/test/btb_tage.test.cc b/src/cpu/pred/btb/test/btb_tage.test.cc index c7cea1b469..6a33227b19 100644 --- a/src/cpu/pred/btb/test/btb_tage.test.cc +++ b/src/cpu/pred/btb/test/btb_tage.test.cc @@ -223,6 +223,45 @@ void setupTageEntry(BTBTAGE* tage, Addr pc, int table_idx, entry.pc = pc; } +void setupTageEntryForFetchBlock(BTBTAGE *tage, Addr startPC, Addr branchPC, + int table_idx, short counter, + bool useful = false, int way = 0) +{ + Addr index = tage->getTageIndex(startPC, table_idx); + unsigned position = tage->getBranchIndexInBlock(branchPC, startPC); + Addr tag = tage->getTageTag(startPC, table_idx, + tage->tagFoldedHist[table_idx].get(), + tage->altTagFoldedHist[table_idx].get(), position); + + auto &entry = tage->tageTable[table_idx][index][way]; + entry.valid = true; + entry.tag = tag; + entry.counter = counter; + entry.useful = useful; + entry.pc = branchPC; +} + +ResolvedBranch createResolvedBranch(const BTBEntry &entry, bool taken, + bool mispredict, uint8_t ftqOffset) +{ + BranchInfo branch(entry); + branch.resolved = true; + branch.size = 4; + return ResolvedBranch(branch, taken, mispredict, ftqOffset); +} + +ResolvedTrainPacket createResolvedTrainPacket(Addr startPC, + std::shared_ptr meta, + std::vector realBranches) +{ + ResolvedTrainPacket packet; + packet.startPC = startPC; + packet.numPredMetas = 1; + packet.predMetas[0] = meta; + packet.realBranches = std::move(realBranches); + return packet; +} + /** * @brief Verify TAGE table entries 
* @@ -975,6 +1014,54 @@ TEST_F(BTBTAGETest, BankConflict) { } } +TEST_F(BTBTAGETest, ResolveTrainBankConflict) { + BTBTAGE bankTage(4, 2, 1024, 4); + memset(&bankTage.tageStats, 0, sizeof(BTBTAGE::TageStats)); + boost::dynamic_bitset<> testHistory(128); + std::vector testStagePreds(5); + + bankTage.enableBankConflict = true; + testStagePreds[1].btbEntries = {createBTBEntry(0x20)}; + bankTage.putPCHistory(0x20, testHistory, testStagePreds); + EXPECT_TRUE(bankTage.predBankValid); + + auto meta = bankTage.getPredictionMeta(); + auto packet = createResolvedTrainPacket( + 0xa0, meta, {createResolvedBranch(createBTBEntry(0xa0), true, false, 0)}); + + uint64_t conflicts_before = bankTage.tageStats.updateBankConflict; + bool can_train = bankTage.canResolveTrain(packet); + + EXPECT_FALSE(can_train); + EXPECT_EQ(bankTage.tageStats.updateBankConflict, conflicts_before + 1); + EXPECT_FALSE(bankTage.predBankValid); +} + +TEST_F(BTBTAGETest, ResolveTrainUsesPacketTruthForConditionalSelection) { + const Addr startPC = 0x1000; + BTBEntry first = createBTBEntry(0x1000); + BTBEntry second = createBTBEntry(0x1004); + + setupTageEntryForFetchBlock(tage, startPC, first.pc, 3, 0); + setupTageEntryForFetchBlock(tage, startPC, second.pc, 3, 0, false, 1); + + predictTAGE(tage, startPC, {first, second}, history, stagePreds); + auto meta = tage->getPredictionMeta(); + + Addr first_index = tage->getTageIndex(startPC, 3); + EXPECT_EQ(tage->tageTable[3][first_index][0].counter, 0); + EXPECT_EQ(tage->tageTable[3][first_index][1].counter, 0); + + auto packet = createResolvedTrainPacket( + startPC, meta, {createResolvedBranch(first, false, true, 0)}); + + ASSERT_TRUE(tage->canResolveTrain(packet)); + tage->resolveTrain(packet); + + EXPECT_EQ(tage->tageTable[3][first_index][0].counter, -1); + EXPECT_EQ(tage->tageTable[3][first_index][1].counter, 0); +} + class BTBTAGEUpperBoundTest : public ::testing::Test { protected: From c8a4be71333bd10da225b3e43f329c8ea6b27992 Mon Sep 17 00:00:00 2001 From: 
Yakkhini <59007159+Yakkhini@users.noreply.github.com> Date: Thu, 2 Apr 2026 22:59:26 +0800 Subject: [PATCH 03/15] cpu: default to full resolve train rollout Enable packet-based resolve training by default while keeping legacy resolve updates enabled as fallback until the remaining components are migrated. Change-Id: I124f9657b46ad9e19bb0bf81f3d3ec85592d25ab --- src/cpu/o3/BaseO3CPU.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/cpu/o3/BaseO3CPU.py b/src/cpu/o3/BaseO3CPU.py index d971502565..ac292c467f 100644 --- a/src/cpu/o3/BaseO3CPU.py +++ b/src/cpu/o3/BaseO3CPU.py @@ -247,6 +247,10 @@ def support_take_over(cls): branchPred = Param.BranchPredictor(DecoupledBPUWithBTB(), "Branch Predictor") resolveQueueSize = Param.Unsigned(16, "Number of entries in the branch resolution queue") + enableFullResolveTrain = Param.Bool(True, + "Enable packet-based resolve training rollout plumbing") + enableLegacyResolveUpdate = Param.Bool(True, + "Enable legacy PC-only resolve update") needsTSO = Param.Bool(False, "Enable TSO Memory model") scheduler = Param.Scheduler("") From bf8ccb44ac04b2f711ef78d6cf9d9537f7b706f0 Mon Sep 17 00:00:00 2001 From: Yakkhini <59007159+Yakkhini@users.noreply.github.com> Date: Tue, 7 Apr 2026 11:19:29 +0800 Subject: [PATCH 04/15] misc: document full resolve train rollout Add a review guide for the full resolve train branch covering the new dataflow, migrated components, default behavior, and the main files and invariants reviewers should check. 
Change-Id: Idfa11844ec1b46f59339e589bfd88c0d17d3e47f --- .../full_resolve_train_review_guide.md | 271 ++++++++++++++++++ 1 file changed, 271 insertions(+) create mode 100644 docs/Gem5_Docs/frontend/full_resolve_train_review_guide.md diff --git a/docs/Gem5_Docs/frontend/full_resolve_train_review_guide.md b/docs/Gem5_Docs/frontend/full_resolve_train_review_guide.md new file mode 100644 index 0000000000..95ecba5683 --- /dev/null +++ b/docs/Gem5_Docs/frontend/full_resolve_train_review_guide.md @@ -0,0 +1,271 @@ +# Full Resolve Train Review Guide + +## 1. Why this branch exists + +This branch changes GEM5 frontend resolved-branch training from a squash-assisted, +PC-only update model to an RTL-aligned full resolve-train model. + +Before this branch: + +- IEW only sent `{ftqId, pc}` through `resolvedCFIs` +- Fetch reconstructed resolved updates from `FetchTarget` state +- correct training truth depended on squash writing back `exeTaken` / + `exeBranchInfo` before resolve update was consumed + +After this branch: + +- IEW sends full per-branch resolve truth +- Fetch aggregates real resolved branches by FTQ target identity +- Fetch builds an explicit `ResolvedTrainPacket` +- migrated predictors train from packet truth plus prediction-time metadata + +The main motivation is to remove the correctness and performance risk caused by +resolve training depending on squash timing. + +## 2. 
High-level architecture + +The new dataflow is: + +```text +IEW + -> resolveTrainEntries[{ftqId, generation, pc, target, taken, ...}] +Fetch + -> resolveTrainQueue keyed by {tid, ftqId, generation} + -> ResolvedTrainPacket{startPC, predMetas, realBranches} +DecoupledBPUWithBTB + -> resolveTrain(packet) + -> per-component canResolveTrain/resolveTrain +``` + +The old path still exists as fallback: + +```text +IEW + -> resolvedCFIs[{ftqId, pc}] +Fetch + -> legacy resolveQueue +DecoupledBPUWithBTB + -> prepareResolveUpdateEntries/markCFIResolved/resolveUpdate +``` + +Current default mode is: + +- `enableFullResolveTrain = True` +- `enableLegacyResolveUpdate = True` + +This means: + +- migrated components use the full packet path +- non-migrated `resolvedUpdate` components still keep legacy fallback + +It is not a double-update mode for migrated components. + +## 3. Review map by file group + +### 3.1 O3 protocol and Fetch plumbing + +Relevant files: + +- `src/cpu/o3/BaseO3CPU.py` +- `src/cpu/o3/comm.hh` +- `src/cpu/o3/dyn_inst.hh` +- `src/cpu/o3/dyn_inst.cc` +- `src/cpu/o3/iew.cc` +- `src/cpu/o3/fetch.hh` +- `src/cpu/o3/fetch.cc` +- `src/cpu/pred/BranchPredictor.py` + +Key changes: + +- adds rollout params: `enableFullResolveTrain`, `enableLegacyResolveUpdate` +- adds `ResolveTrainEntry` to `IewComm` +- records `ftqGeneration` and `ftqOffset` in `DynInst` +- emits full resolve truth from IEW +- adds `resolveTrainQueue` in Fetch +- builds `ResolvedTrainPacket` from queued truth plus `FetchTarget` metadata +- only pops packet queue on explicit predictor acceptance + +Main review questions: + +- does `DynInst` carry enough fetch-time identity for stable FTQ matching? +- does Fetch aggregate by `{tid, ftqId, generation}` correctly? +- are stale, squashed, committed, and reused FTQ targets rejected safely? +- does full-resolve retry feed the same throttle path as legacy resolve update? 
+ +### 3.2 FTQ identity protection and predictor top-level API + +Relevant files: + +- `src/cpu/pred/btb/common.hh` +- `src/cpu/pred/btb/ftq.hh` +- `src/cpu/pred/btb/ftq.cc` +- `src/cpu/pred/btb/decoupled_bpred.hh` +- `src/cpu/pred/btb/decoupled_bpred.cc` +- `src/cpu/pred/btb/timed_base_pred.hh` + +Key changes: + +- adds `generation` to `FetchTarget` +- allocates a fresh generation when a logical FTQ target is created +- adds FTQ identity helpers: generation lookup and identity matching +- adds packet types: + - `FetchTargetIdentity` + - `ResolvedBranch` + - `ResolvedTrainPacket` +- adds `DecoupledBPUWithBTB::resolveTrain()` +- adds default component hooks: + - `canResolveTrain(packet)` + - `resolveTrain(packet)` +- adds top-level packet validation before fan-out + +Main review questions: + +- is FTQ generation sufficient to reject stale resolve traffic? +- does packet validation reject malformed branch lists and stale metadata? +- does `resolveTrain()` preserve the old probe/apply contract semantics? + +### 3.3 Migrated predictors + +Relevant files: + +- `src/cpu/pred/btb/mbtb.hh` +- `src/cpu/pred/btb/mbtb.cc` +- `src/cpu/pred/btb/btb_tage.hh` +- `src/cpu/pred/btb/btb_tage.cc` +- `src/cpu/pred/btb/btb_ittage.hh` +- `src/cpu/pred/btb/btb_ittage.cc` + +Migrated components: + +- `MBTB` +- `BTBTAGE` +- `BTBITTAGE` + +Current split: + +- in full-resolve mode, these three no longer consume legacy resolved-update +- other non-migrated components can still use legacy resolved-update if enabled + +Main review questions: + +- does each component now train only from packet truth on the new path? +- are legacy side effects cleanly disabled for migrated components? +- do bank conflict / readiness semantics still behave the same? + +## 4. 
Component-by-component summary + +### 4.1 MBTB + +What changed: + +- packet-based `canResolveTrain()` / `resolveTrain()` were added +- new path no longer depends on: + - `updateBTBEntries` + - `updateNewBTBEntry` + - per-entry resolved bits inside `FetchTarget` +- packet updates reuse existing SRAM / victim-cache update machinery + +Important review points: + +- MBTB legacy resolved-update is skipped in full-resolve mode +- MBTB-specific legacy prepare/mark side effects are also gated off in + full-resolve mode + +### 4.2 BTBTAGE + +What changed: + +- packet-based bank-conflict probe added in `canResolveTrain()` +- packet path now trains using prediction snapshots, not squash-populated state +- metadata now retains predicted conditional `BTBEntry` per branch PC +- missing-meta conditional branches on the packet path are now materialized and + still trained, matching the legacy new-entry behavior + +Important review points: + +- packet path only trains intended conditional branches +- packet conflict failures now drive `notifyResolveFailure()` so retry and + prediction throttling still work + +### 4.3 BTBITTAGE + +What changed: + +- packet-based `resolveTrain()` now uses indirect branch truth from packet data +- no longer depends on squash-derived `exeBranchInfo` on the new path +- legacy resolved-update is skipped in full-resolve mode + +Important review points: + +- training remains scoped to indirect non-return branches +- alternate-provider update now only happens when `alt_info.found` is true, + fixing an existing corruption hazard in both the legacy and new paths + +## 5. Current default behavior + +Current defaults are set in `src/cpu/o3/BaseO3CPU.py`: + +- `enableFullResolveTrain = True` +- `enableLegacyResolveUpdate = True` + +This is intentional. 
+ +Reason: + +- migrated components already use the packet path +- some other `resolvedUpdate` users may still exist outside this migration set +- keeping legacy enabled avoids silent loss of resolve-stage training during the + rollout period + +So current behavior is: + +- `MBTB`, `BTBTAGE`, `BTBITTAGE` -> full packet path +- non-migrated resolved-update components -> legacy path +- commit-time predictors -> unchanged commit path + +## 6. Verification done on this branch + +Verification re-run before creating the final commit: + +- build: `scons build/RISCV/gem5.opt --gold-linker -j60` +- unit test: `build/RISCV/cpu/pred/btb/test/tage.test.debug` + +Observed result: + +- `gem5.opt` builds successfully +- `tage.test.debug` passes `21/21` + +New or extended test coverage includes packet-mode BTBTAGE cases for: + +- bank-conflict probe behavior +- packet-truth conditional selection +- new conditional entry training without prediction metadata + +## 7. Suggested review order + +For fastest review, read in this order: + +1. `src/cpu/o3/comm.hh` +2. `src/cpu/o3/iew.cc` +3. `src/cpu/o3/fetch.hh` +4. `src/cpu/o3/fetch.cc` +5. `src/cpu/pred/btb/common.hh` +6. `src/cpu/pred/btb/ftq.hh` +7. `src/cpu/pred/btb/decoupled_bpred.hh` +8. `src/cpu/pred/btb/decoupled_bpred.cc` +9. `src/cpu/pred/btb/mbtb.cc` +10. `src/cpu/pred/btb/btb_tage.cc` +11. `src/cpu/pred/btb/btb_ittage.cc` +12. `src/cpu/pred/btb/test/btb_tage.test.cc` + +## 8. Known follow-up work + +This branch does not yet remove the legacy path. 
+ +Natural next steps after review: + +- migrate any remaining `resolvedUpdate` components if needed +- once all needed components are packetized, turn `enableLegacyResolveUpdate` + default off +- then delete legacy `resolvedCFIs` / `prepareResolveUpdateEntries()` / + `markCFIResolved()`-based training path From ca46f62d8906d6f590505e829217436b2f6adfe4 Mon Sep 17 00:00:00 2001 From: Yakkhini <59007159+Yakkhini@users.noreply.github.com> Date: Tue, 7 Apr 2026 20:37:49 +0800 Subject: [PATCH 05/15] cpu: do not save metas another time Change-Id: I736783b6b3bb469c7a02d060e9317ee24879d31a --- src/cpu/o3/fetch.cc | 15 --------------- src/cpu/o3/fetch.hh | 1 - src/cpu/pred/btb/btb_ittage.cc | 10 ++++++---- src/cpu/pred/btb/btb_ittage.hh | 6 ++++-- src/cpu/pred/btb/btb_tage.cc | 10 ++++++---- src/cpu/pred/btb/btb_tage.hh | 6 ++++-- src/cpu/pred/btb/common.hh | 6 +----- src/cpu/pred/btb/decoupled_bpred.cc | 25 ++++--------------------- src/cpu/pred/btb/mbtb.cc | 11 +++++++---- src/cpu/pred/btb/mbtb.hh | 6 ++++-- src/cpu/pred/btb/timed_base_pred.hh | 6 ++++-- 11 files changed, 40 insertions(+), 62 deletions(-) diff --git a/src/cpu/o3/fetch.cc b/src/cpu/o3/fetch.cc index 46db1cffc3..c416555880 100644 --- a/src/cpu/o3/fetch.cc +++ b/src/cpu/o3/fetch.cc @@ -87,19 +87,6 @@ namespace constexpr uint8_t RvcInstBytes = 2; constexpr uint8_t BaseInstBytes = 4; -size_t -resolveTrainMetaCount( - const branch_prediction::btb_pred::FetchTarget &target) -{ - size_t num_pred_metas = 0; - for (size_t i = 0; i < target.predMetas.size(); ++i) { - if (target.predMetas[i] != nullptr) { - num_pred_metas = i + 1; - } - } - return num_pred_metas; -} - } // anonymous namespace Fetch::IcachePort::IcachePort(Fetch *_fetch, CPU *_cpu) : @@ -1686,8 +1673,6 @@ Fetch::buildResolvedTrainPacket(const ResolveTrainQueueEntry &entry) const packet.tid = entry.tid; packet.target = {entry.ftqId, entry.generation}; packet.startPC = target.startPC; - packet.numPredMetas = resolveTrainMetaCount(target); - 
packet.predMetas = target.predMetas; packet.realBranches.reserve(entry.insts.size()); for (const auto &inst_data : entry.insts) { diff --git a/src/cpu/o3/fetch.hh b/src/cpu/o3/fetch.hh index 8f2f7aaba5..e27213d7a1 100644 --- a/src/cpu/o3/fetch.hh +++ b/src/cpu/o3/fetch.hh @@ -1152,7 +1152,6 @@ class Fetch statistics::Scalar fullResolveEntriesDroppedGenerationMismatch; /** Full resolve packets sent to the predictor. */ statistics::Scalar fullResolvePacketsSent; - // Trace metadata accounting (trace mode) /** Number of stored trace metadata records (seqNum -> traceInst). */ statistics::Scalar traceMetaStores; diff --git a/src/cpu/pred/btb/btb_ittage.cc b/src/cpu/pred/btb/btb_ittage.cc index f0095e0d7a..ec9104b3bf 100644 --- a/src/cpu/pred/btb/btb_ittage.cc +++ b/src/cpu/pred/btb/btb_ittage.cc @@ -391,18 +391,20 @@ BTBITTAGE::update(const FetchTarget &stream) } bool -BTBITTAGE::canResolveTrain(const ResolvedTrainPacket &packet) +BTBITTAGE::canResolveTrain(const ResolvedTrainPacket &packet, + const FetchTarget &target) { return true; } void -BTBITTAGE::resolveTrain(const ResolvedTrainPacket &packet) +BTBITTAGE::resolveTrain(const ResolvedTrainPacket &packet, + const FetchTarget &target) { auto predMeta = std::static_pointer_cast( - packet.predMetas[getComponentIdx()]); + target.predMetas[getComponentIdx()]); if (!predMeta) { - DPRINTF(ITTAGE, "resolveTrain: no prediction meta, skip\n"); + DPRINTF(ITTAGE, "resolveTrain: no live prediction meta, skip\n"); return; } diff --git a/src/cpu/pred/btb/btb_ittage.hh b/src/cpu/pred/btb/btb_ittage.hh index fdc9b01326..dc4723d982 100644 --- a/src/cpu/pred/btb/btb_ittage.hh +++ b/src/cpu/pred/btb/btb_ittage.hh @@ -111,8 +111,10 @@ class BTBITTAGE : public TimedBaseBTBPredictor const FetchTarget &entry,int shamt, bool cond_taken) override; void update(const FetchTarget &entry) override; - bool canResolveTrain(const ResolvedTrainPacket &packet) override; - void resolveTrain(const ResolvedTrainPacket &packet) override; + bool 
canResolveTrain(const ResolvedTrainPacket &packet, + const FetchTarget &target) override; + void resolveTrain(const ResolvedTrainPacket &packet, + const FetchTarget &target) override; void commitBranch(const FetchTarget &stream, const DynInstPtr &inst) override; diff --git a/src/cpu/pred/btb/btb_tage.cc b/src/cpu/pred/btb/btb_tage.cc index 2312c7ed5b..0084cb30bc 100644 --- a/src/cpu/pred/btb/btb_tage.cc +++ b/src/cpu/pred/btb/btb_tage.cc @@ -680,7 +680,8 @@ BTBTAGE::canResolveUpdate(const FetchTarget &stream) { } bool -BTBTAGE::canResolveTrain(const ResolvedTrainPacket &packet) +BTBTAGE::canResolveTrain(const ResolvedTrainPacket &packet, + const FetchTarget &target) { Addr startAddr = packet.startPC; unsigned updateBank = getBankId(startAddr); @@ -749,16 +750,17 @@ BTBTAGE::prepareResolveTrainEntries(const ResolvedTrainPacket &packet, } void -BTBTAGE::resolveTrain(const ResolvedTrainPacket &packet) +BTBTAGE::resolveTrain(const ResolvedTrainPacket &packet, + const FetchTarget &target) { if (enableBankConflict && predBankValid) { predBankValid = false; } auto predMeta = std::static_pointer_cast( - packet.predMetas[getComponentIdx()]); + target.predMetas[getComponentIdx()]); if (!predMeta) { - DPRINTF(TAGE, "resolveTrain: no prediction meta, skip\n"); + DPRINTF(TAGE, "resolveTrain: no live prediction meta, skip\n"); return; } diff --git a/src/cpu/pred/btb/btb_tage.hh b/src/cpu/pred/btb/btb_tage.hh index d442ed8623..94313c9f2a 100644 --- a/src/cpu/pred/btb/btb_tage.hh +++ b/src/cpu/pred/btb/btb_tage.hh @@ -161,8 +161,10 @@ class BTBTAGE : public TimedBaseBTBPredictor void update(const FetchTarget &entry) override; bool canResolveUpdate(const FetchTarget &entry) override; void doResolveUpdate(const FetchTarget &entry) override; - bool canResolveTrain(const ResolvedTrainPacket &packet) override; - void resolveTrain(const ResolvedTrainPacket &packet) override; + bool canResolveTrain(const ResolvedTrainPacket &packet, + const FetchTarget &target) override; + void 
resolveTrain(const ResolvedTrainPacket &packet, + const FetchTarget &target) override; #ifndef UNIT_TEST void commitBranch(const FetchTarget &stream, const DynInstPtr &inst) override; diff --git a/src/cpu/pred/btb/common.hh b/src/cpu/pred/btb/common.hh index ed6dd5f967..2d8b26e6a3 100644 --- a/src/cpu/pred/btb/common.hh +++ b/src/cpu/pred/btb/common.hh @@ -292,15 +292,11 @@ struct ResolvedTrainPacket ThreadID tid; FetchTargetIdentity target; Addr startPC; - size_t numPredMetas; - std::array, MaxPredictorComponents> predMetas; std::vector realBranches; ResolvedTrainPacket() - : tid(0), target(), startPC(0), numPredMetas(0), predMetas(), - realBranches() + : tid(0), target(), startPC(0), realBranches() { - predMetas.fill(nullptr); } }; diff --git a/src/cpu/pred/btb/decoupled_bpred.cc b/src/cpu/pred/btb/decoupled_bpred.cc index b7bad787ef..33ff4784ae 100644 --- a/src/cpu/pred/btb/decoupled_bpred.cc +++ b/src/cpu/pred/btb/decoupled_bpred.cc @@ -25,25 +25,8 @@ namespace { bool -validateResolvedTrainPacket(const ResolvedTrainPacket &packet, - const FetchTarget &target, - unsigned numComponents) +validateResolvedTrainPacket(const ResolvedTrainPacket &packet) { - if (packet.numPredMetas > packet.predMetas.size()) { - return false; - } - - if (numComponents > packet.predMetas.size() || - packet.numPredMetas != numComponents) { - return false; - } - - for (unsigned i = 0; i < numComponents; ++i) { - if (packet.predMetas[i] != target.predMetas[i]) { - return false; - } - } - uint8_t lastOffset = 0; Addr lastPc = 0; bool firstBranch = true; @@ -718,7 +701,7 @@ DecoupledBPUWithBTB::resolveTrain( return false; } - if (!validateResolvedTrainPacket(packet, target, numComponents)) { + if (!validateResolvedTrainPacket(packet)) { DPRINTF(DecoupleBP, "Resolve-train packet validation failed: id=%lu generation=%lu tid=%u\n", packet.target.id, packet.target.generation, tid); @@ -731,13 +714,13 @@ DecoupledBPUWithBTB::resolveTrain( packet.realBranches.size()); for (int i = 0; i < 
numComponents; ++i) { - if (!components[i]->canResolveTrain(packet)) { + if (!components[i]->canResolveTrain(packet, target)) { return false; } } for (int i = 0; i < numComponents; ++i) { - components[i]->resolveTrain(packet); + components[i]->resolveTrain(packet, target); } return true; diff --git a/src/cpu/pred/btb/mbtb.cc b/src/cpu/pred/btb/mbtb.cc index 4a3b29bfd6..e77bb53649 100644 --- a/src/cpu/pred/btb/mbtb.cc +++ b/src/cpu/pred/btb/mbtb.cc @@ -814,17 +814,20 @@ MBTB::update(const FetchTarget &stream) } bool -MBTB::canResolveTrain(const ResolvedTrainPacket &packet) +MBTB::canResolveTrain(const ResolvedTrainPacket &packet, + const FetchTarget &target) { - return getComponentIdx() < packet.numPredMetas; + return true; } void -MBTB::resolveTrain(const ResolvedTrainPacket &packet) +MBTB::resolveTrain(const ResolvedTrainPacket &packet, + const FetchTarget &target) { auto meta = std::static_pointer_cast( - packet.predMetas[getComponentIdx()]); + target.predMetas[getComponentIdx()]); if (!meta) { + DPRINTF(BTB, "resolveTrain: no live prediction meta, skip\n"); return; } diff --git a/src/cpu/pred/btb/mbtb.hh b/src/cpu/pred/btb/mbtb.hh index c6990c7ccb..9364bbf7f2 100644 --- a/src/cpu/pred/btb/mbtb.hh +++ b/src/cpu/pred/btb/mbtb.hh @@ -171,9 +171,11 @@ class MBTB : public TimedBaseBTBPredictor */ void update(const FetchTarget &stream) override; - bool canResolveTrain(const ResolvedTrainPacket &packet) override; + bool canResolveTrain(const ResolvedTrainPacket &packet, + const FetchTarget &target) override; - void resolveTrain(const ResolvedTrainPacket &packet) override; + void resolveTrain(const ResolvedTrainPacket &packet, + const FetchTarget &target) override; std::vector prepareUpdateEntries(const FetchTarget &stream); diff --git a/src/cpu/pred/btb/timed_base_pred.hh b/src/cpu/pred/btb/timed_base_pred.hh index 2f1b04cba2..c0431a78ae 100644 --- a/src/cpu/pred/btb/timed_base_pred.hh +++ b/src/cpu/pred/btb/timed_base_pred.hh @@ -79,11 +79,13 @@ class 
TimedBaseBTBPredictor: public SimObject // Two-phase resolved update: probe first, then apply virtual bool canResolveUpdate(const FetchTarget &entry) { return true; } virtual void doResolveUpdate(const FetchTarget &entry) { update(entry); } - virtual bool canResolveTrain(const ResolvedTrainPacket &packet) + virtual bool canResolveTrain(const ResolvedTrainPacket &packet, + const FetchTarget &target) { return true; } - virtual void resolveTrain(const ResolvedTrainPacket &packet) {} + virtual void resolveTrain(const ResolvedTrainPacket &packet, + const FetchTarget &target) {} #ifndef UNIT_TEST // do some statistics on a per-branch and per-predictor basis virtual void commitBranch(const FetchTarget &entry, const DynInstPtr &inst) {} From e8e8dafddf63f6a02077ee474e6d18b4405308c9 Mon Sep 17 00:00:00 2001 From: Yakkhini <59007159+Yakkhini@users.noreply.github.com> Date: Tue, 7 Apr 2026 23:18:59 +0800 Subject: [PATCH 06/15] cpu: find and alloc new entries in TAGE correctly Change-Id: Icda8729682a3d1112280dbc1a3b120e922bc830b --- src/cpu/o3/fetch.cc | 4 +- src/cpu/pred/btb/btb_tage.cc | 35 +++- src/cpu/pred/btb/btb_tage.hh | 7 + src/cpu/pred/btb/test/btb_tage.test.cc | 232 ++++++++++++++++++++++++- 4 files changed, 262 insertions(+), 16 deletions(-) diff --git a/src/cpu/o3/fetch.cc b/src/cpu/o3/fetch.cc index c416555880..f4b9fa750a 100644 --- a/src/cpu/o3/fetch.cc +++ b/src/cpu/o3/fetch.cc @@ -1595,10 +1595,10 @@ Fetch::handleIEWSignals() auto queued = std::find_if( resolveTrainQueue.begin(), resolveTrainQueue.end(), [tid, &resolved](const ResolveTrainQueueEntry &entry) { - return entry.tid == tid && entry.ftqId == resolved.ftqId && - entry.generation == resolved.ftqGeneration; + return entry.tid == tid && entry.ftqId == resolved.ftqId; }); if (queued != resolveTrainQueue.end()) { + queued->generation = resolved.ftqGeneration; appendResolveTrainInst(*queued, makeResolveTrainInstData( resolved.pc, resolved.target, resolved.taken, resolved.mispredict, resolved.ftqOffset, 
resolved.isCond, diff --git a/src/cpu/pred/btb/btb_tage.cc b/src/cpu/pred/btb/btb_tage.cc index 0084cb30bc..7a4975cf6f 100644 --- a/src/cpu/pred/btb/btb_tage.cc +++ b/src/cpu/pred/btb/btb_tage.cc @@ -731,19 +731,23 @@ BTBTAGE::prepareResolveTrainEntries(const ResolvedTrainPacket &packet, auto pred_it = predMeta->btbEntries.find(resolved.branch.pc); BTBEntry entry; + ResolveTrainUpdate::EntryClass entry_class; if (pred_it != predMeta->btbEntries.end()) { entry = pred_it->second; + entry_class = ResolveTrainUpdate::EntryClass::ExistingPredictedEntry; } else { entry = BTBEntry(resolved.branch); entry.valid = true; entry.alwaysTaken = false; + entry.ctr = -1; + entry_class = ResolveTrainUpdate::EntryClass::NewEntryCandidate; } if (entry.alwaysTaken) { continue; } - updates.push_back({entry, resolved}); + updates.push_back({entry, resolved, entry_class}); } return updates; @@ -771,11 +775,23 @@ BTBTAGE::resolveTrain(const ResolvedTrainPacket &packet, for (const auto &update : entries_to_update) { const auto &btb_entry = update.entry; const auto &resolved = update.resolved; + const bool is_new_entry = + update.entryClass == + ResolveTrainUpdate::EntryClass::NewEntryCandidate; auto orig_it = predMeta->preds.find(btb_entry.pc); const bool has_original_pred = orig_it != predMeta->preds.end(); TagePrediction original_pred; if (has_original_pred) { original_pred = orig_it->second; + } else if (!is_new_entry) { + DPRINTF(TAGE, + "resolveTrain: missing original prediction for old entry pc %#lx, skip\n", + btb_entry.pc); + continue; + } else { + DPRINTF(TAGE, + "resolveTrain: reconstruct prediction for new entry pc %#lx from snapshot\n", + btb_entry.pc); } bool actual_taken = resolved.taken; @@ -795,14 +811,17 @@ BTBTAGE::resolveTrain(const ResolvedTrainPacket &packet, } #endif - TagePrediction recomputed = updateOnRead ? 
- generateSinglePrediction(btb_entry, packet.startPC, predMeta) : - original_pred; - - if (has_original_pred && updateOnRead && - recomputed.taken != original_pred.taken) { - hasRecomputedVsOriginalDiff = true; + TagePrediction recomputed; + if (updateOnRead || is_new_entry) { + recomputed = generateSinglePrediction(btb_entry, packet.startPC, + predMeta); + if (has_original_pred && recomputed.taken != original_pred.taken) { + hasRecomputedVsOriginalDiff = true; + } + } else { + recomputed = original_pred; } + if (recomputed.taken != actual_taken) { hasRecomputedVsActualDiff = true; } diff --git a/src/cpu/pred/btb/btb_tage.hh b/src/cpu/pred/btb/btb_tage.hh index 94313c9f2a..30e3d9c1e8 100644 --- a/src/cpu/pred/btb/btb_tage.hh +++ b/src/cpu/pred/btb/btb_tage.hh @@ -445,8 +445,15 @@ private: struct ResolveTrainUpdate { + enum class EntryClass + { + ExistingPredictedEntry, + NewEntryCandidate, + }; + BTBEntry entry; ResolvedBranch resolved; + EntryClass entryClass; }; std::vector diff --git a/src/cpu/pred/btb/test/btb_tage.test.cc b/src/cpu/pred/btb/test/btb_tage.test.cc index 6a33227b19..fff528f090 100644 --- a/src/cpu/pred/btb/test/btb_tage.test.cc +++ b/src/cpu/pred/btb/test/btb_tage.test.cc @@ -97,7 +97,7 @@ void applyPathHistoryTaken(boost::dynamic_bitset<>& history, Addr pc, Addr targe * @param pc Branch PC to search for * @return Pair of (found, prediction) where found indicates if PC was found */ -std::pair findCondTaken(const gem5::branch_prediction::btb_pred::CondTakens& condTakens, Addr pc) { +std::pair findCondTaken(const CondTakens& condTakens, Addr pc) { auto it = CondTakens_find(condTakens, pc); if (it != condTakens.end()) { return {true, it->second}; @@ -254,14 +254,109 @@ ResolvedTrainPacket createResolvedTrainPacket(Addr startPC, std::shared_ptr meta, std::vector realBranches) { + (void)meta; ResolvedTrainPacket packet; packet.startPC = startPC; - packet.numPredMetas = 1; - packet.predMetas[0] = meta; packet.realBranches = std::move(realBranches); 
return packet; } +FetchTarget createResolvedTrainTarget(Addr startPC, std::shared_ptr meta) +{ + FetchTarget target; + target.startPC = startPC; + target.predMetas[0] = meta; + return target; +} + +void advanceActualHistory(BTBTAGE *tage, + boost::dynamic_bitset<> &history, + const std::vector &entries, + const std::vector &actual_takens) +{ + ASSERT_EQ(entries.size(), actual_takens.size()); + for (size_t i = 0; i < entries.size(); ++i) { + tage->doUpdateHist(history, actual_takens[i], entries[i].pc, entries[i].target); + if (actual_takens[i]) { + applyPathHistoryTaken(history, entries[i].pc, entries[i].target); + } + } + tage->checkFoldedHist(history, "actual history advance"); +} + +void legacyTrainSequence(BTBTAGE *tage, Addr startPC, + const std::vector &entries, + const std::vector &actual_takens, + boost::dynamic_bitset<> &history, + std::vector &stagePreds) +{ + ASSERT_EQ(entries.size(), actual_takens.size()); + stagePreds[1].btbEntries = entries; + tage->putPCHistory(startPC, history, stagePreds); + auto meta = tage->getPredictionMeta(); + + for (size_t i = 0; i < entries.size(); ++i) { + auto pred = findCondTaken(stagePreds[1].condTakens, entries[i].pc); + ASSERT_TRUE(pred.first) << "Missing legacy prediction for PC " + << std::hex << entries[i].pc; + + FetchTarget stream = createStream(startPC, entries[i], actual_takens[i], meta); + if (pred.second != actual_takens[i]) { + stream = setMispredStream(stream); + } + tage->update(stream); + } + + advanceActualHistory(tage, history, entries, actual_takens); +} + +void resolveTrainSequence(BTBTAGE *tage, Addr startPC, + const std::vector &entries, + const std::vector &actual_takens, + boost::dynamic_bitset<> &history, + std::vector &stagePreds) +{ + ASSERT_EQ(entries.size(), actual_takens.size()); + stagePreds[1].btbEntries = entries; + tage->putPCHistory(startPC, history, stagePreds); + auto meta = tage->getPredictionMeta(); + + std::vector resolved_branches; + resolved_branches.reserve(entries.size()); + for 
(size_t i = 0; i < entries.size(); ++i) { + auto pred = findCondTaken(stagePreds[1].condTakens, entries[i].pc); + ASSERT_TRUE(pred.first) << "Missing resolve-train prediction for PC " + << std::hex << entries[i].pc; + resolved_branches.push_back(createResolvedBranch( + entries[i], actual_takens[i], pred.second != actual_takens[i], i)); + } + + auto packet = createResolvedTrainPacket(startPC, meta, resolved_branches); + auto target = createResolvedTrainTarget(startPC, meta); + ASSERT_TRUE(tage->canResolveTrain(packet, target)) + << "resolveTrain should be accepted for the constructed packet"; + tage->resolveTrain(packet, target); + + advanceActualHistory(tage, history, entries, actual_takens); +} + +BTBTAGE::TagePrediction predictBranch(BTBTAGE *tage, Addr startPC, + const std::vector &entries, + boost::dynamic_bitset<> &history, + std::vector &stagePreds, + Addr branchPC) +{ + stagePreds[1].btbEntries = entries; + tage->putPCHistory(startPC, history, stagePreds); + auto meta = std::static_pointer_cast(tage->getPredictionMeta()); + auto it = meta->preds.find(branchPC); + if (it == meta->preds.end()) { + ADD_FAILURE() << "Missing probe prediction for PC " << std::hex << branchPC; + return BTBTAGE::TagePrediction(); + } + return it->second; +} + /** * @brief Verify TAGE table entries * @@ -310,6 +405,38 @@ int findTableWithEntry(BTBTAGE* tage, Addr startPC, Addr branchPC) { return -1; } +int findTableWithEntry(BTBTAGE* tage, Addr startPC, Addr branchPC, + const std::shared_ptr& meta) { + for (int t = 0; t < tage->numPredictors; t++) { + Addr index = tage->getTageIndex(startPC, t, meta->indexFoldedHist[t].get()); + for (unsigned way = 0; way < tage->numWays[t]; way++) { + auto &entry = tage->tageTable[t][index][way]; + if (entry.valid && entry.pc == branchPC) { + return t; + } + } + } + return -1; +} + +std::vector findTablesWithEntry( + BTBTAGE* tage, Addr startPC, Addr branchPC, + const std::shared_ptr& meta) +{ + std::vector tables; + for (int t = 0; t < 
tage->numPredictors; t++) { + Addr index = tage->getTageIndex(startPC, t, meta->indexFoldedHist[t].get()); + for (unsigned way = 0; way < tage->numWays[t]; way++) { + auto &entry = tage->tageTable[t][index][way]; + if (entry.valid && entry.pc == branchPC) { + tables.push_back(t); + break; + } + } + } + return tables; +} + class BTBTAGETest : public ::testing::Test { protected: @@ -1028,9 +1155,10 @@ TEST_F(BTBTAGETest, ResolveTrainBankConflict) { auto meta = bankTage.getPredictionMeta(); auto packet = createResolvedTrainPacket( 0xa0, meta, {createResolvedBranch(createBTBEntry(0xa0), true, false, 0)}); + auto target = createResolvedTrainTarget(0xa0, meta); uint64_t conflicts_before = bankTage.tageStats.updateBankConflict; - bool can_train = bankTage.canResolveTrain(packet); + bool can_train = bankTage.canResolveTrain(packet, target); EXPECT_FALSE(can_train); EXPECT_EQ(bankTage.tageStats.updateBankConflict, conflicts_before + 1); @@ -1054,14 +1182,106 @@ TEST_F(BTBTAGETest, ResolveTrainUsesPacketTruthForConditionalSelection) { auto packet = createResolvedTrainPacket( startPC, meta, {createResolvedBranch(first, false, true, 0)}); + auto target = createResolvedTrainTarget(startPC, meta); - ASSERT_TRUE(tage->canResolveTrain(packet)); - tage->resolveTrain(packet); + ASSERT_TRUE(tage->canResolveTrain(packet, target)); + tage->resolveTrain(packet, target); EXPECT_EQ(tage->tageTable[3][first_index][0].counter, -1); EXPECT_EQ(tage->tageTable[3][first_index][1].counter, 0); } +TEST_F(BTBTAGETest, ResolveTrainRepeatedShortPatternMatchesLegacyProviderGrowth) { + const Addr bodyStartPC = 0x1000; + const Addr loopStartPC = 0x1100; + const BTBEntry body = createBTBEntry(0x1004, true, true, false, -1, 0x100c); + const BTBEntry loop = createBTBEntry(loopStartPC, true, true, false, -1, bodyStartPC); + const int iterations = 160; + + BTBTAGE legacyTage; + BTBTAGE fullTage; + memset(&legacyTage.tageStats, 0, sizeof(BTBTAGE::TageStats)); + memset(&fullTage.tageStats, 0, 
sizeof(BTBTAGE::TageStats)); + + boost::dynamic_bitset<> legacyHistory(64, false); + boost::dynamic_bitset<> fullHistory(64, false); + std::vector legacyStagePreds(2); + std::vector fullStagePreds(2); + + auto legacyTrainNewEntry = [&](BTBTAGE *tage, + boost::dynamic_bitset<> &curHistory, + std::vector &curStagePreds, + bool taken) { + curStagePreds[1].btbEntries.clear(); + tage->putPCHistory(bodyStartPC, curHistory, curStagePreds); + auto meta = tage->getPredictionMeta(); + + FetchTarget stream; + stream.startPC = bodyStartPC; + stream.exeBranchInfo = body; + stream.exeTaken = taken; + stream.resolved = true; + stream.predBranchInfo = body; + stream.updateBTBEntries.clear(); + stream.updateIsOldEntry = false; + stream.updateNewBTBEntry = body; + stream.predMetas[0] = meta; + if (taken) { + stream = setMispredStream(stream); + } + + tage->update(stream); + advanceActualHistory(tage, curHistory, {body}, {taken}); + }; + + auto resolveTrainNewEntry = [&](BTBTAGE *tage, + boost::dynamic_bitset<> &curHistory, + std::vector &curStagePreds, + bool taken) { + curStagePreds[1].btbEntries.clear(); + tage->putPCHistory(bodyStartPC, curHistory, curStagePreds); + auto meta = tage->getPredictionMeta(); + + auto packet = createResolvedTrainPacket( + bodyStartPC, meta, {createResolvedBranch(body, taken, taken, 0)}); + auto target = createResolvedTrainTarget(bodyStartPC, meta); + ASSERT_TRUE(tage->canResolveTrain(packet, target)); + tage->resolveTrain(packet, target); + advanceActualHistory(tage, curHistory, {body}, {taken}); + }; + + for (int i = 0; i < iterations; ++i) { + const bool bodyTaken = (i % 2) == 0; + + legacyTrainNewEntry(&legacyTage, legacyHistory, legacyStagePreds, bodyTaken); + resolveTrainNewEntry(&fullTage, fullHistory, fullStagePreds, bodyTaken); + + legacyTrainSequence(&legacyTage, loopStartPC, {loop}, {true}, + legacyHistory, legacyStagePreds); + resolveTrainSequence(&fullTage, loopStartPC, {loop}, {true}, + fullHistory, fullStagePreds); + } + + auto 
legacyPred = predictBranch(&legacyTage, bodyStartPC, {body}, + legacyHistory, legacyStagePreds, body.pc); + auto fullPred = predictBranch(&fullTage, bodyStartPC, {body}, + fullHistory, fullStagePreds, body.pc); + + auto legacyMeta = std::static_pointer_cast(legacyTage.getPredictionMeta()); + auto fullMeta = std::static_pointer_cast(fullTage.getPredictionMeta()); + auto legacyTables = findTablesWithEntry(&legacyTage, bodyStartPC, body.pc, legacyMeta); + auto fullTables = findTablesWithEntry(&fullTage, bodyStartPC, body.pc, fullMeta); + + ASSERT_GT(legacyPred.mainInfo.table, 0) + << "Legacy training should grow beyond table 0 for the short repeated pattern"; + EXPECT_EQ(fullPred.mainInfo.table, legacyPred.mainInfo.table) + << "Full resolve-train should activate the same provider depth as legacy update"; + EXPECT_EQ(fullPred.finalProviderTable, legacyPred.finalProviderTable) + << "Full resolve-train should converge to the same final provider as legacy update"; + EXPECT_EQ(fullTables, legacyTables) + << "Full resolve-train should build the same set of TAGE tables as legacy update"; +} + class BTBTAGEUpperBoundTest : public ::testing::Test { protected: From 9acf432afa6cf830c30b230e1e6c08c9036030c8 Mon Sep 17 00:00:00 2001 From: Yakkhini <59007159+Yakkhini@users.noreply.github.com> Date: Wed, 8 Apr 2026 14:00:00 +0800 Subject: [PATCH 07/15] cpu: filter packet rather than block it with fake path branch Change-Id: I2ca72fa9dadeca3ed0b369ba2ce21c7664768753 --- src/cpu/o3/fetch.cc | 17 +++++++++++ src/cpu/pred/btb/decoupled_bpred.cc | 35 ++++++++++++++--------- src/cpu/pred/btb/decoupled_bpred.hh | 28 ++++++++++++++++++ src/cpu/pred/btb/decoupled_bpred_stats.cc | 22 ++++++++++++++ 4 files changed, 88 insertions(+), 14 deletions(-) diff --git a/src/cpu/o3/fetch.cc b/src/cpu/o3/fetch.cc index f4b9fa750a..1ea4df95cd 100644 --- a/src/cpu/o3/fetch.cc +++ b/src/cpu/o3/fetch.cc @@ -87,6 +87,21 @@ namespace constexpr uint8_t RvcInstBytes = 2; constexpr uint8_t BaseInstBytes = 4; 
+void +trimResolveTrainInstsAfterTaken(Fetch::ResolveTrainQueueEntry &entry) +{ + auto first_taken = std::find_if( + entry.insts.begin(), entry.insts.end(), + [](const Fetch::ResolveTrainInstData &inst_data) { + return inst_data.taken; + }); + if (first_taken == entry.insts.end()) { + return; + } + + entry.insts.erase(std::next(first_taken), entry.insts.end()); +} + } // anonymous namespace Fetch::IcachePort::IcachePort(Fetch *_fetch, CPU *_cpu) : @@ -1717,6 +1732,8 @@ Fetch::appendResolveTrainInst( } return lhs.pc < rhs.pc; }); + + trimResolveTrainInstsAfterTaken(entry); } void diff --git a/src/cpu/pred/btb/decoupled_bpred.cc b/src/cpu/pred/btb/decoupled_bpred.cc index 33ff4784ae..c9e333a760 100644 --- a/src/cpu/pred/btb/decoupled_bpred.cc +++ b/src/cpu/pred/btb/decoupled_bpred.cc @@ -21,11 +21,8 @@ namespace branch_prediction namespace btb_pred { -namespace -{ - -bool -validateResolvedTrainPacket(const ResolvedTrainPacket &packet) +DecoupledBPUWithBTB::ResolveTrainValidationReason +DecoupledBPUWithBTB::validateResolvedTrainPacket(const ResolvedTrainPacket &packet) { uint8_t lastOffset = 0; Addr lastPc = 0; @@ -33,23 +30,28 @@ validateResolvedTrainPacket(const ResolvedTrainPacket &packet) bool seenTaken = false; for (const auto &resolved : packet.realBranches) { if (resolved.branch.pc < packet.startPC) { - return false; + dbpBtbStats.fullResolveTrainValidationPcBeforeStart++; + return ResolveTrainValidationReason::PcBeforeStart; } if (resolved.branch.size == 0) { - return false; + dbpBtbStats.fullResolveTrainValidationZeroSize++; + return ResolveTrainValidationReason::ZeroSize; } if (seenTaken) { - return false; + dbpBtbStats.fullResolveTrainValidationAfterTaken++; + return ResolveTrainValidationReason::AfterTaken; } if (!firstBranch) { if (resolved.ftqOffset < lastOffset) { - return false; + dbpBtbStats.fullResolveTrainValidationOffsetReversed++; + return ResolveTrainValidationReason::OffsetReversed; } if (resolved.ftqOffset == lastOffset && resolved.branch.pc <= 
lastPc) { - return false; + dbpBtbStats.fullResolveTrainValidationPcOrderSameOffset++; + return ResolveTrainValidationReason::PcOrderSameOffset; } } @@ -59,11 +61,9 @@ validateResolvedTrainPacket(const ResolvedTrainPacket &packet) firstBranch = false; } - return true; + return ResolveTrainValidationReason::Accepted; } -} // anonymous namespace - void DecoupledBPUWithBTB::consumeFetchTarget(unsigned fetched_inst_num, ThreadID tid) { @@ -682,6 +682,7 @@ DecoupledBPUWithBTB::resolveTrain( DPRINTF(DecoupleBP, "Resolve-train packet tid mismatch: packet=%u arg=%u\n", packet.tid, tid); + dbpBtbStats.fullResolveTrainRejectTidMismatch++; return false; } @@ -690,6 +691,7 @@ DecoupledBPUWithBTB::resolveTrain( DPRINTF(DecoupleBP, "Resolve-train packet target mismatch: id=%lu generation=%lu tid=%u\n", packet.target.id, packet.target.generation, tid); + dbpBtbStats.fullResolveTrainRejectTargetMismatch++; return false; } @@ -698,13 +700,15 @@ DecoupledBPUWithBTB::resolveTrain( DPRINTF(DecoupleBP, "Resolve-train packet startPC mismatch: packet=%#lx ftq=%#lx id=%lu tid=%u\n", packet.startPC, target.startPC, packet.target.id, tid); + dbpBtbStats.fullResolveTrainRejectStartPCMismatch++; return false; } - if (!validateResolvedTrainPacket(packet)) { + if (validateResolvedTrainPacket(packet) != ResolveTrainValidationReason::Accepted) { DPRINTF(DecoupleBP, "Resolve-train packet validation failed: id=%lu generation=%lu tid=%u\n", packet.target.id, packet.target.generation, tid); + dbpBtbStats.fullResolveTrainRejectPacketValidation++; return false; } @@ -715,6 +719,7 @@ DecoupledBPUWithBTB::resolveTrain( for (int i = 0; i < numComponents; ++i) { if (!components[i]->canResolveTrain(packet, target)) { + dbpBtbStats.fullResolveTrainRejectComponent++; return false; } } @@ -723,6 +728,8 @@ DecoupledBPUWithBTB::resolveTrain( components[i]->resolveTrain(packet, target); } + dbpBtbStats.fullResolveTrainAccepted++; + return true; } diff --git a/src/cpu/pred/btb/decoupled_bpred.hh 
b/src/cpu/pred/btb/decoupled_bpred.hh index 6a157a610a..eddd59d3f7 100644 --- a/src/cpu/pred/btb/decoupled_bpred.hh +++ b/src/cpu/pred/btb/decoupled_bpred.hh @@ -150,6 +150,19 @@ class DecoupledBPUWithBTB : public BPredUnit void processNewPrediction(ThreadID tid); + enum class ResolveTrainValidationReason + { + Accepted = 0, + PcBeforeStart, + ZeroSize, + AfterTaken, + OffsetReversed, + PcOrderSameOffset, + }; + + ResolveTrainValidationReason validateResolvedTrainPacket( + const ResolvedTrainPacket &packet); + FetchTarget createFetchTargetEntry(ThreadID tid); void updateHistoryForPrediction(FetchTarget &entry); @@ -299,6 +312,21 @@ class DecoupledBPUWithBTB : public BPredUnit // Window blocking statistics statistics::Scalar predictionBlockedForUpdate; // Times prediction was blocked for update priority + // Full resolve train reject statistics + statistics::Scalar fullResolveTrainAccepted; + statistics::Scalar fullResolveTrainRejectTidMismatch; + statistics::Scalar fullResolveTrainRejectTargetMismatch; + statistics::Scalar fullResolveTrainRejectStartPCMismatch; + statistics::Scalar fullResolveTrainRejectPacketValidation; + statistics::Scalar fullResolveTrainRejectComponent; + + // Full resolve train validation sub-reasons + statistics::Scalar fullResolveTrainValidationPcBeforeStart; + statistics::Scalar fullResolveTrainValidationZeroSize; + statistics::Scalar fullResolveTrainValidationAfterTaken; + statistics::Scalar fullResolveTrainValidationOffsetReversed; + statistics::Scalar fullResolveTrainValidationPcOrderSameOffset; + statistics::Scalar s1PredWrongFallthrough; statistics::Scalar s1PredWrongUbtb; statistics::Scalar s1PredWrongAbtb; diff --git a/src/cpu/pred/btb/decoupled_bpred_stats.cc b/src/cpu/pred/btb/decoupled_bpred_stats.cc index a64d943428..fb1bc846e2 100644 --- a/src/cpu/pred/btb/decoupled_bpred_stats.cc +++ b/src/cpu/pred/btb/decoupled_bpred_stats.cc @@ -458,6 +458,28 @@ DecoupledBPUWithBTB::DBPBTBStats::DBPBTBStats( ADD_STAT(predFalseHit, 
statistics::units::Count::get(), "false hit detected at pred"), ADD_STAT(commitFalseHit, statistics::units::Count::get(), "false hit detected at commit"), ADD_STAT(predictionBlockedForUpdate, statistics::units::Count::get(), "prediction blocked for update priority"), + ADD_STAT(fullResolveTrainAccepted, statistics::units::Count::get(), + "accepted full resolve-train packets"), + ADD_STAT(fullResolveTrainRejectTidMismatch, statistics::units::Count::get(), + "full resolve-train packets rejected due to thread id mismatch"), + ADD_STAT(fullResolveTrainRejectTargetMismatch, statistics::units::Count::get(), + "full resolve-train packets rejected due to FTQ target mismatch"), + ADD_STAT(fullResolveTrainRejectStartPCMismatch, statistics::units::Count::get(), + "full resolve-train packets rejected due to startPC mismatch"), + ADD_STAT(fullResolveTrainRejectPacketValidation, statistics::units::Count::get(), + "full resolve-train packets rejected due to packet validation failure"), + ADD_STAT(fullResolveTrainRejectComponent, statistics::units::Count::get(), + "full resolve-train packets rejected by predictor components"), + ADD_STAT(fullResolveTrainValidationPcBeforeStart, statistics::units::Count::get(), + "packet validation failures because branch pc is before startPC"), + ADD_STAT(fullResolveTrainValidationZeroSize, statistics::units::Count::get(), + "packet validation failures because branch size is zero"), + ADD_STAT(fullResolveTrainValidationAfterTaken, statistics::units::Count::get(), + "packet validation failures because another branch appears after a taken branch"), + ADD_STAT(fullResolveTrainValidationOffsetReversed, statistics::units::Count::get(), + "packet validation failures because ftq offsets are not monotonic"), + ADD_STAT(fullResolveTrainValidationPcOrderSameOffset, statistics::units::Count::get(), + "packet validation failures because pcs at same offset are not strictly increasing"), ADD_STAT(s1PredWrongFallthrough, statistics::units::Count::get(), "S1pred 
wrong full throughs"), ADD_STAT(s1PredWrongUbtb, statistics::units::Count::get(),"S1pred wrong using ubtb "), ADD_STAT(s1PredWrongAbtb, statistics::units::Count::get(), "S1pred wrong using abtb "), From b44f30887d2797bf086f52d04d3a937cd813c564 Mon Sep 17 00:00:00 2001 From: Yakkhini <59007159+Yakkhini@users.noreply.github.com> Date: Wed, 8 Apr 2026 16:34:51 +0800 Subject: [PATCH 08/15] cpu: remove legacy resolve update path Collapse resolved-stage BTB training onto the packet-based full resolve path, remove the old resolvedCFI/resolveQueue helper chain, and reinterpret config intent around full-resolve participation versus commit fallback. Change-Id: I777f46178bc7b1ce891bb2242cefff0b166e48a2 --- configs/example/idealkmhv3.py | 1 + configs/example/kmhv3.py | 5 +- src/cpu/o3/BaseO3CPU.py | 4 +- src/cpu/o3/comm.hh | 14 ----- src/cpu/o3/fetch.cc | 79 --------------------------- src/cpu/o3/fetch.hh | 20 +------ src/cpu/o3/iew.cc | 16 ++---- src/cpu/pred/BranchPredictor.py | 13 +++-- src/cpu/pred/btb/btb_ittage.cc | 17 +++--- src/cpu/pred/btb/btb_tage.cc | 13 ++--- src/cpu/pred/btb/decoupled_bpred.cc | 83 ----------------------------- src/cpu/pred/btb/decoupled_bpred.hh | 3 -- src/cpu/pred/btb/mbtb.cc | 9 ++-- 13 files changed, 29 insertions(+), 248 deletions(-) diff --git a/configs/example/idealkmhv3.py b/configs/example/idealkmhv3.py index 76d6d4800f..51b6c8f702 100644 --- a/configs/example/idealkmhv3.py +++ b/configs/example/idealkmhv3.py @@ -83,6 +83,7 @@ def setKmhV3IdealParams(args, system): # branch predictor if args.bp_type == 'DecoupledBPUWithBTB': + cpu.enableFullResolveTrain = False cpu.branchPred.ftq_size = 64 cpu.branchPred.fsq_size = 64 # cpu.branchPred.microtage.enabled = False diff --git a/configs/example/kmhv3.py b/configs/example/kmhv3.py index d82c6b98bd..722b973660 100644 --- a/configs/example/kmhv3.py +++ b/configs/example/kmhv3.py @@ -93,6 +93,7 @@ def setKmhV3Params(args, system): # branch predictor if args.bp_type == 'DecoupledBPUWithBTB': + 
cpu.enableFullResolveTrain = True cpu.branchPred.ftq_size = 64 cpu.branchPred.fsq_size = 64 @@ -100,10 +101,6 @@ def setKmhV3Params(args, system): cpu.branchPred.tage = BTBTAGEUpperBound( usePathHashHistory=True) - cpu.branchPred.mbtb.resolvedUpdate = True - cpu.branchPred.tage.resolvedUpdate = True - cpu.branchPred.ittage.resolvedUpdate = True - cpu.branchPred.ubtb.enabled = True cpu.branchPred.abtb.enabled = True cpu.branchPred.microtage.enabled = True diff --git a/src/cpu/o3/BaseO3CPU.py b/src/cpu/o3/BaseO3CPU.py index ac292c467f..35e5fe655f 100644 --- a/src/cpu/o3/BaseO3CPU.py +++ b/src/cpu/o3/BaseO3CPU.py @@ -248,9 +248,7 @@ def support_take_over(cls): "Branch Predictor") resolveQueueSize = Param.Unsigned(16, "Number of entries in the branch resolution queue") enableFullResolveTrain = Param.Bool(True, - "Enable packet-based resolve training rollout plumbing") - enableLegacyResolveUpdate = Param.Bool(True, - "Enable legacy PC-only resolve update") + "Train eligible BTB components from full resolve packets instead of commit fallback") needsTSO = Param.Bool(False, "Enable TSO Memory model") scheduler = Param.Scheduler("") diff --git a/src/cpu/o3/comm.hh b/src/cpu/o3/comm.hh index 4b647534cc..0b092b32a2 100644 --- a/src/cpu/o3/comm.hh +++ b/src/cpu/o3/comm.hh @@ -197,12 +197,6 @@ struct SquashVersion SquashVersion() : version(0) {} }; -struct ResolveQueueEntry -{ - uint64_t resolvedFTQId; - std::vector resolvedInstPC; -}; - /** Struct that defines all backwards communication. */ struct TimeStruct { @@ -239,14 +233,6 @@ struct TimeStruct StallReason lqHeadStallReason; StallReason sqHeadStallReason; - struct ResolvedCFIEntry - { - uint64_t ftqId; - uint64_t pc; - }; - /** Resolved control-flow PCs produced this cycle (fetch buffers/merges). 
*/ - std::vector resolvedCFIs; // *F - struct ResolveTrainEntry { uint64_t ftqId; diff --git a/src/cpu/o3/fetch.cc b/src/cpu/o3/fetch.cc index 1ea4df95cd..6d43b1fb1f 100644 --- a/src/cpu/o3/fetch.cc +++ b/src/cpu/o3/fetch.cc @@ -115,7 +115,6 @@ Fetch::Fetch(CPU *_cpu, const BaseO3CPUParams ¶ms) branchPred(nullptr), resolveQueueSize(params.resolveQueueSize), enableFullResolveTrain(params.enableFullResolveTrain), - enableLegacyResolveUpdate(params.enableLegacyResolveUpdate), decodeToFetchDelay(params.decodeToFetchDelay), renameToFetchDelay(params.renameToFetchDelay), iewToFetchDelay(params.iewToFetchDelay), @@ -285,16 +284,6 @@ Fetch::FetchStatGroup::FetchStatGroup(CPU *cpu, Fetch *fetch) statistics::units::Count, statistics::units::Cycle>::get(), "Frontend Bandwidth Bound", frontendBound - frontendLatencyBound), - ADD_STAT(resolveQueueFullEvents, statistics::units::Count::get(), - "Number of events the resolve queue becomes full"), - ADD_STAT(resolveEnqueueFailEvent, statistics::units::Count::get(), - "Number of times an entry could not be enqueued to the resolve queue"), - ADD_STAT(resolveDequeueCount, statistics::units::Count::get(), - "Number of times an entry is dequeued from the resolve queue"), - ADD_STAT(resolveEnqueueCount, statistics::units::Count::get(), - "Number of times an entry is enqueued to the resolve queue"), - ADD_STAT(resolveQueueOccupancy, statistics::units::Count::get(), - "Number of entries in the resolve queue"), ADD_STAT(fullResolveEntriesReceived, statistics::units::Count::get(), "Number of full resolve entries received by fetch"), ADD_STAT(fullResolveEntriesMerged, statistics::units::Count::get(), @@ -387,10 +376,6 @@ Fetch::FetchStatGroup::FetchStatGroup(CPU *cpu, Fetch *fetch) .flags(statistics::total); frontendBandwidthBound .flags(statistics::total); - resolveEnqueueCount - .init(1, 8, 1); - resolveQueueOccupancy - .init(0, 32, 1); fullResolveEntriesReceived .prereq(fullResolveEntriesReceived); fullResolveEntriesMerged @@ -1517,70 
+1502,6 @@ Fetch::handleIEWSignals() return; } - auto &incoming = fromIEW->iewInfo->resolvedCFIs; - - if (!enableLegacyResolveUpdate) { - for (ThreadID tid = 0; tid < numThreads; ++tid) { - fromIEW->iewInfo[tid].resolvedCFIs.clear(); - } - } - - if (enableLegacyResolveUpdate) { - const bool had_pending_resolve = !resolveQueue.empty(); - uint8_t enqueueSize = fromIEW->iewInfo->resolvedCFIs.size(); - uint8_t enqueueCount = 0; - - if (resolveQueueSize && resolveQueue.size() > resolveQueueSize - 4) { - fetchStats.resolveQueueFullEvents++; - fetchStats.resolveEnqueueFailEvent += enqueueSize; - } else { - - for (const auto &resolved : incoming) { - bool merged = false; - for (auto &queued : resolveQueue) { - if (queued.resolvedFTQId == resolved.ftqId) { - queued.resolvedInstPC.push_back(resolved.pc); - merged = true; - break; - } - } - - if (merged) { - continue; - } - - ResolveQueueEntry new_entry; - new_entry.resolvedFTQId = resolved.ftqId; - new_entry.resolvedInstPC.push_back(resolved.pc); - resolveQueue.push_back(std::move(new_entry)); - enqueueCount++; - } - fetchStats.resolveEnqueueCount.sample(enqueueCount); - } - - fetchStats.resolveQueueOccupancy.sample(resolveQueue.size()); - - // Process only entries that were already pending before this cycle. - // This preserves a cycle of separation between IEW producing resolved - // CFIs and fetch consuming them as predictor resolved updates. 
- if (had_pending_resolve && !resolveQueue.empty()) { - auto &entry = resolveQueue.front(); - unsigned int stream_id = entry.resolvedFTQId; - dbpbtb->prepareResolveUpdateEntries(stream_id, 0); - for (const auto resolvedInstPC : entry.resolvedInstPC) { - dbpbtb->markCFIResolved(stream_id, resolvedInstPC, 0); - } - bool success = dbpbtb->resolveUpdate(stream_id, 0); - if (success) { - dbpbtb->notifyResolveSuccess(); - resolveQueue.pop_front(); - fetchStats.resolveDequeueCount++; - } else { - dbpbtb->notifyResolveFailure(); - } - } - } - if (!enableFullResolveTrain) { for (ThreadID tid = 0; tid < numThreads; ++tid) { fromIEW->iewInfo[tid].resolveTrainEntries.clear(); diff --git a/src/cpu/o3/fetch.hh b/src/cpu/o3/fetch.hh index e27213d7a1..5cf65cebb0 100644 --- a/src/cpu/o3/fetch.hh +++ b/src/cpu/o3/fetch.hh @@ -465,8 +465,7 @@ class Fetch */ bool handleCommitSignals(ThreadID tid); - /** Handles iew signals including resolved cfi, mark their btb entries - * and train predictors if they are configured to update in resolve stage. + /** Handles IEW signals for full resolve-train predictor updates. */ void handleIEWSignals(); @@ -639,12 +638,6 @@ class Fetch /** Enable packet-based resolve training rollout plumbing. */ const bool enableFullResolveTrain; - /** Keep legacy PC-only resolve updates enabled. */ - const bool enableLegacyResolveUpdate; - - /** FIFO storing resolve entries waiting for BPU training. */ - std::deque resolveQueue; - /** FIFO storing aggregated full resolve-train entries. */ std::deque resolveTrainQueue; @@ -1129,17 +1122,6 @@ class Fetch statistics::Formula frontendLatencyBound; /** Frontend Bandwidth Bound */ statistics::Formula frontendBandwidthBound; - /** Stat for total cycles the resolve queue is full. */ - statistics::Scalar resolveQueueFullEvents; - /** Stat for total number of resolve enqueue fail events. */ - statistics::Scalar resolveEnqueueFailEvent; - - /** Stat for total number of resolve dequeue events. 
*/ - statistics::Scalar resolveDequeueCount; - /** Stat for total number of resolve enqueue events. */ - statistics::Distribution resolveEnqueueCount; - /** Stat for entry occupancy distribution of the resolve queue. */ - statistics::Distribution resolveQueueOccupancy; /** Full resolve entries observed at fetch. */ statistics::Scalar fullResolveEntriesReceived; /** Full resolve entries merged with an existing target. */ diff --git a/src/cpu/o3/iew.cc b/src/cpu/o3/iew.cc index 74394292f8..b77ac88778 100644 --- a/src/cpu/o3/iew.cc +++ b/src/cpu/o3/iew.cc @@ -1465,14 +1465,6 @@ IEW::SquashCheckAfterExe(DynInstPtr inst) is_control && !loadNotExecuted && inst->mispredicted(); if (is_control) { - if (params.enableLegacyResolveUpdate) { - auto &resolved_cfis = toFetch->iewInfo[tid].resolvedCFIs; - TimeStruct::IewComm::ResolvedCFIEntry entry; - entry.ftqId = inst->getFtqId(); - entry.pc = inst->getPC(); - resolved_cfis.push_back(entry); - } - if (params.enableFullResolveTrain) { auto &resolve_entries = toFetch->iewInfo[tid].resolveTrainEntries; TimeStruct::IewComm::ResolveTrainEntry entry; @@ -1579,10 +1571,10 @@ IEW::executeInsts() // @todo This doesn't actually work anymore, we should fix it. // printAvailableInsts(); - // Clear resolvedFSQId and resolvedInstPC since they are already handled in frontend - ThreadID tid = *activeThreads->begin(); - toFetch->iewInfo[tid].resolvedCFIs.clear(); - toFetch->iewInfo[tid].resolveTrainEntries.clear(); + // Clear resolve-train entries before producing this cycle's updates. + for (ThreadID active_tid : *activeThreads) { + toFetch->iewInfo[active_tid].resolveTrainEntries.clear(); + } // Execute/writeback any instructions that are available. 
int insts_to_execute = fromIssue->size; diff --git a/src/cpu/pred/BranchPredictor.py b/src/cpu/pred/BranchPredictor.py index eddbaa5844..772654024f 100644 --- a/src/cpu/pred/BranchPredictor.py +++ b/src/cpu/pred/BranchPredictor.py @@ -968,7 +968,8 @@ class TimedBaseBTBPredictor(SimObject): blockSize = Param.Unsigned(Parent.predictWidth, "Block size in bytes") predictWidth = Param.Unsigned(Parent.predictWidth, "Maximum range in bytes that a single prediction can cover") numDelay = Param.Unsigned(1000, "Number of bubbles to put on a prediction") - resolvedUpdate = Param.Bool(False, "Enable resolved update, no need to wait until commit") + resolvedUpdate = Param.Bool(False, + "Train from full resolve packets instead of commit fallback") enabled = Param.Bool(True, "Enable this predictor component") class MBTB(TimedBaseBTBPredictor): @@ -982,6 +983,8 @@ class MBTB(TimedBaseBTBPredictor): numThreads = Param.Unsigned(1, "Number of threads") numWays = Param.Unsigned(4, "Number of ways per set") # for 2 SRAMs, 4 ways per SRAM numDelay = 2 + resolvedUpdate = Param.Bool(Parent.enableFullResolveTrain, + "Train MBTB from full resolve packets instead of commit fallback") blockSize = 32 # max 64 byte block, 32 byte aligned # MBTB is always half-aligned - no parameter needed victimCacheSize = Param.Unsigned(0, "Number of entries in the victim cache") @@ -1059,6 +1062,8 @@ class BTBTAGE(TimedBaseBTBPredictor): numBanks = Param.Unsigned(4, "Number of banks for bank conflict simulation") enableBankConflict = Param.Bool(False, "Enable bank conflict simulation") numDelay = 2 + resolvedUpdate = Param.Bool(Parent.enableFullResolveTrain, + "Train BTBTAGE from full resolve packets instead of commit fallback") class BTBTAGEUpperBound(BTBTAGE): type = 'BTBTAGEUpperBound' @@ -1114,6 +1119,8 @@ class BTBITTAGE(TimedBaseBTBPredictor): maxHistLen = Param.Unsigned(970, "The length of history passed from DBP") numTablesToAlloc = Param.Unsigned(1,"The number of table to allocated each time") 
numDelay = 2 + resolvedUpdate = Param.Bool(Parent.enableFullResolveTrain, + "Train BTBITTAGE from full resolve packets instead of commit fallback") class BTBMGSC(TimedBaseBTBPredictor): type = 'BTBMGSC' @@ -1207,6 +1214,4 @@ class DecoupledBPUWithBTB(BranchPredictor): bpDBSwitches = VectorParam.String([], "Enable which traces in the form of database") resolveBlockThreshold = Param.Unsigned(8, "Consecutive resolve dequeue failures before blocking prediction once") enableFullResolveTrain = Param.Bool(Parent.enableFullResolveTrain, - "Enable packet-based resolve training rollout plumbing") - enableLegacyResolveUpdate = Param.Bool(Parent.enableLegacyResolveUpdate, - "Enable legacy PC-only resolve update") + "Train eligible BTB components from full resolve packets instead of commit fallback") diff --git a/src/cpu/pred/btb/btb_ittage.cc b/src/cpu/pred/btb/btb_ittage.cc index ec9104b3bf..972167bef0 100644 --- a/src/cpu/pred/btb/btb_ittage.cc +++ b/src/cpu/pred/btb/btb_ittage.cc @@ -223,17 +223,12 @@ BTBITTAGE::update(const FetchTarget &stream) all_entries_to_update.push_back(stream.updateNewBTBEntry); } - // // only update indirect branches that are not returns - if (getResolvedUpdate()) { - auto remove_it = - std::remove_if(all_entries_to_update.begin(), all_entries_to_update.end(), - [](const BTBEntry &e) { return !(e.isIndirect && !e.isReturn && e.resolved); }); - all_entries_to_update.erase(remove_it, all_entries_to_update.end()); - } else { - auto remove_it = std::remove_if(all_entries_to_update.begin(), all_entries_to_update.end(), - [](const BTBEntry &e) { return !(e.isIndirect && !e.isReturn); }); - all_entries_to_update.erase(remove_it, all_entries_to_update.end()); - } + auto remove_it = std::remove_if(all_entries_to_update.begin(), + all_entries_to_update.end(), + [](const BTBEntry &e) { + return !(e.isIndirect && !e.isReturn); + }); + all_entries_to_update.erase(remove_it, all_entries_to_update.end()); // get tage predictions from meta // TODO: use component 
idx diff --git a/src/cpu/pred/btb/btb_tage.cc b/src/cpu/pred/btb/btb_tage.cc index 7a4975cf6f..52877d7dd8 100644 --- a/src/cpu/pred/btb/btb_tage.cc +++ b/src/cpu/pred/btb/btb_tage.cc @@ -423,16 +423,9 @@ BTBTAGE::prepareUpdateEntries(const FetchTarget &stream) { all_entries.push_back(potential_new_entry); } - // Filter: only keep conditional branches that are not always taken - if (getResolvedUpdate()) { - auto remove_it = std::remove_if(all_entries.begin(), all_entries.end(), - [](const BTBEntry &e) { return !(e.isCond && !e.alwaysTaken && e.resolved); }); - all_entries.erase(remove_it, all_entries.end()); - } else { - auto remove_it = std::remove_if(all_entries.begin(), all_entries.end(), - [](const BTBEntry &e) { return !(e.isCond && !e.alwaysTaken); }); - all_entries.erase(remove_it, all_entries.end()); - } + auto remove_it = std::remove_if(all_entries.begin(), all_entries.end(), + [](const BTBEntry &e) { return !(e.isCond && !e.alwaysTaken); }); + all_entries.erase(remove_it, all_entries.end()); return all_entries; } diff --git a/src/cpu/pred/btb/decoupled_bpred.cc b/src/cpu/pred/btb/decoupled_bpred.cc index c9e333a760..18aab26ebb 100644 --- a/src/cpu/pred/btb/decoupled_bpred.cc +++ b/src/cpu/pred/btb/decoupled_bpred.cc @@ -624,50 +624,6 @@ DecoupledBPUWithBTB::commit(unsigned target_id, ThreadID tid) historyManager.commit(target_id); } -bool -DecoupledBPUWithBTB::resolveUpdate(unsigned &target_id, ThreadID tid) -{ - if (!ftq.hasTarget(target_id, tid)) { - DPRINTF(DecoupleBP, "Target id %u not found in fetchTargetQueue, cannot update predictors\n", target_id); - return true; - } - - auto &target = ftq.get(target_id, tid); - - // Update predictor components only if the target is hit or taken - if (!(target.isHit || target.exeTaken)) { - return true; - } - - // Phase 1: probe all resolved-update components to ensure no blocker - for (int i = 0; i < numComponents; ++i) { - if (components[i]->getResolvedUpdate()) { - if (enableFullResolveTrain && - (components[i] 
== mbtb || components[i] == tage || - components[i] == ittage)) { - continue; - } - if (!components[i]->canResolveUpdate(target)) { - return false; - } - } - } - - // Phase 2: all clear, perform updates once - for (int i = 0; i < numComponents; ++i) { - if (components[i]->getResolvedUpdate()) { - if (enableFullResolveTrain && - (components[i] == mbtb || components[i] == tage || - components[i] == ittage)) { - continue; - } - components[i]->doResolveUpdate(target); - } - } - - return true; -} - void DecoupledBPUWithBTB::notifyResolveSuccess() { @@ -750,45 +706,6 @@ DecoupledBPUWithBTB::blockPredictionOnce() threads[0].blockPredictionPending = true; } -void -DecoupledBPUWithBTB::prepareResolveUpdateEntries(unsigned &target_id, ThreadID tid) -{ - if (!ftq.hasTarget(target_id, tid)) { - DPRINTF(DecoupleBP, "Target id %u not found in fetchTargetQueue, cannot update predictors\n", target_id); - return; - } - auto &target = ftq.get(target_id, tid); - - if (target.isHit || target.exeTaken) { - // Prepare target for update - target.setUpdateInstEndPC(predictWidth); - target.setUpdateBTBEntries(); - - // only mbtb can generate new entry - if (mbtb->isEnabled() && !enableFullResolveTrain) { - mbtb->getAndSetNewBTBEntry(target); - } - } -} - -void -DecoupledBPUWithBTB::markCFIResolved(unsigned &target_id, uint64_t resolvedInstPC, ThreadID tid) -{ - - if (!ftq.hasTarget(target_id, tid)) { - DPRINTF(DecoupleBP, "Target id %u not found in fetchTargetQueue, cannot update predictors\n", target_id); - return; - } - auto &target = ftq.get(target_id, tid); - - if (!enableFullResolveTrain && - target.updateNewBTBEntry.pc == resolvedInstPC) { - target.updateNewBTBEntry.resolved = true; - } - - target.markBTBEntryResolved(resolvedInstPC); -} - void DecoupledBPUWithBTB::updatePredictorComponents(FetchTarget &target) { diff --git a/src/cpu/pred/btb/decoupled_bpred.hh b/src/cpu/pred/btb/decoupled_bpred.hh index eddd59d3f7..95b4597017 100644 --- a/src/cpu/pred/btb/decoupled_bpred.hh +++ 
b/src/cpu/pred/btb/decoupled_bpred.hh @@ -754,9 +754,6 @@ class DecoupledBPUWithBTB : public BPredUnit void resetPC(Addr new_pc); // Helper functions for update - bool resolveUpdate(unsigned &target_id, ThreadID tid); - void prepareResolveUpdateEntries(unsigned &target_id, ThreadID tid); - void markCFIResolved(unsigned &target, uint64_t resolvedInstPC, ThreadID tid); void updatePredictorComponents(FetchTarget &target); void updateStatistics(const FetchTarget &target); void notifyResolveSuccess(); diff --git a/src/cpu/pred/btb/mbtb.cc b/src/cpu/pred/btb/mbtb.cc index e77bb53649..1f41a73ad5 100644 --- a/src/cpu/pred/btb/mbtb.cc +++ b/src/cpu/pred/btb/mbtb.cc @@ -853,12 +853,9 @@ MBTB::prepareUpdateEntries(const FetchTarget &stream) { all_entries.push_back(potential_new_entry); } - // Filter: only keep conditional branches that are not always taken - if (getResolvedUpdate()) { - auto remove_it = std::remove_if(all_entries.begin(), all_entries.end(), - [](const BTBEntry &e) { return !e.resolved; }); - all_entries.erase(remove_it, all_entries.end()); - } + auto remove_it = std::remove_if(all_entries.begin(), all_entries.end(), + [](const BTBEntry &e) { return !e.resolved; }); + all_entries.erase(remove_it, all_entries.end()); return all_entries; } From b833d771a0a5d9be3dc39a5f14c1bcdfe7b22541 Mon Sep 17 00:00:00 2001 From: Yakkhini <59007159+Yakkhini@users.noreply.github.com> Date: Wed, 8 Apr 2026 16:42:07 +0800 Subject: [PATCH 09/15] misc: document update Change-Id: Icbf82e480f0fec8c1119c36851c9f6367835c745 --- .../full_resolve_train_review_guide.md | 321 ++++++++---------- 1 file changed, 143 insertions(+), 178 deletions(-) diff --git a/docs/Gem5_Docs/frontend/full_resolve_train_review_guide.md b/docs/Gem5_Docs/frontend/full_resolve_train_review_guide.md index 95ecba5683..195d256674 100644 --- a/docs/Gem5_Docs/frontend/full_resolve_train_review_guide.md +++ b/docs/Gem5_Docs/frontend/full_resolve_train_review_guide.md @@ -2,270 +2,235 @@ ## 1. 
Why this branch exists -This branch changes GEM5 frontend resolved-branch training from a squash-assisted, -PC-only update model to an RTL-aligned full resolve-train model. +This branch replaces the old squash-assisted resolved-update path with a single +packet-based full resolve-train path for the migrated BTB predictors. -Before this branch: +Historically GEM5 used: -- IEW only sent `{ftqId, pc}` through `resolvedCFIs` -- Fetch reconstructed resolved updates from `FetchTarget` state -- correct training truth depended on squash writing back `exeTaken` / - `exeBranchInfo` before resolve update was consumed +- `IEW -> resolvedCFIs` +- `Fetch.resolveQueue` +- `prepareResolveUpdateEntries()` +- `markCFIResolved()` +- `resolveUpdate()` -After this branch: +That flow depended on squash-populated execution truth and mixed together: -- IEW sends full per-branch resolve truth -- Fetch aggregates real resolved branches by FTQ target identity -- Fetch builds an explicit `ResolvedTrainPacket` -- migrated predictors train from packet truth plus prediction-time metadata +- resolve notification +- new-entry discovery +- predictor-specific update preparation -The main motivation is to remove the correctness and performance risk caused by -resolve training depending on squash timing. +The new branch instead uses explicit resolve truth from IEW and packet-based +training. -## 2. High-level architecture +## 2. 
Current architecture -The new dataflow is: +### 2.1 Resolved-stage training path + +Current resolved-stage dataflow is: ```text IEW -> resolveTrainEntries[{ftqId, generation, pc, target, taken, ...}] Fetch - -> resolveTrainQueue keyed by {tid, ftqId, generation} - -> ResolvedTrainPacket{startPC, predMetas, realBranches} + -> resolveTrainQueue keyed by FTQ target identity + -> ResolvedTrainPacket{tid, target, startPC, realBranches} DecoupledBPUWithBTB -> resolveTrain(packet) - -> per-component canResolveTrain/resolveTrain + -> MBTB / BTBTAGE / BTBITTAGE ``` -The old path still exists as fallback: +Important current semantics: -```text -IEW - -> resolvedCFIs[{ftqId, pc}] -Fetch - -> legacy resolveQueue -DecoupledBPUWithBTB - -> prepareResolveUpdateEntries/markCFIResolved/resolveUpdate -``` +- packets are truth-only; they do not own frozen predictor metadata +- training uses live FTQ metadata from the current `FetchTarget` +- fetch-side packet formation trims branches after the first taken branch, + matching RTL-style training-prefix semantics -Current default mode is: +### 2.2 Legacy path status -- `enableFullResolveTrain = True` -- `enableLegacyResolveUpdate = True` +The old legacy resolved-update chain has been removed from the active training +architecture: -This means: +- no `resolvedCFIs` predictor training path +- no `Fetch.resolveQueue`-based predictor training path +- no predictor-top `prepareResolveUpdateEntries()` / `markCFIResolved()` / + `resolveUpdate()` helper chain -- migrated components use the full packet path -- non-migrated `resolvedUpdate` components still keep legacy fallback +Resolved-stage BTB training is now single-path: full resolve train. -It is not a double-update mode for migrated components. +### 2.3 Commit/update path -## 3. Review map by file group +Commit/update behavior remains intact. -### 3.1 O3 protocol and Fetch plumbing +This matters because not every predictor needs resolved-stage packet training. 
+Current design is: -Relevant files: +- migrated BTB predictors use full resolve-train +- other components may still rely on commit/update behavior + +## 3. Component map + +### 3.1 Components on full resolve-train + +- `MBTB` +- `BTBTAGE` +- `BTBITTAGE` + +### 3.2 Components not migrated to full resolve-train in this branch + +These are not treated as active users of the removed legacy BTB resolved-update +chain in current configs: + +- `MicroTAGE` (still has local old-style logic in code, but not enabled as an + active resolved-stage user in current configs) +- `SC` remains follow-up work if full RTL parity is desired + +### 3.3 Non-resolved-stage structures + +These are not intended to use full resolve-train in the RTL-aligned model: + +- commit/update-only or redirect/recover structures +- components whose role is speculative / fast-train / commit-time only + +## 4. File map + +### O3 / frontend integration -- `src/cpu/o3/BaseO3CPU.py` - `src/cpu/o3/comm.hh` - `src/cpu/o3/dyn_inst.hh` - `src/cpu/o3/dyn_inst.cc` - `src/cpu/o3/iew.cc` - `src/cpu/o3/fetch.hh` - `src/cpu/o3/fetch.cc` -- `src/cpu/pred/BranchPredictor.py` - -Key changes: - -- adds rollout params: `enableFullResolveTrain`, `enableLegacyResolveUpdate` -- adds `ResolveTrainEntry` to `IewComm` -- records `ftqGeneration` and `ftqOffset` in `DynInst` -- emits full resolve truth from IEW -- adds `resolveTrainQueue` in Fetch -- builds `ResolvedTrainPacket` from queued truth plus `FetchTarget` metadata -- only pops packet queue on explicit predictor acceptance - -Main review questions: +- `src/cpu/o3/BaseO3CPU.py` -- does `DynInst` carry enough fetch-time identity for stable FTQ matching? -- does Fetch aggregate by `{tid, ftqId, generation}` correctly? -- are stale, squashed, committed, and reused FTQ targets rejected safely? -- does full-resolve retry feed the same throttle path as legacy resolve update? 
+These files now: -### 3.2 FTQ identity protection and predictor top-level API +- carry full resolve truth from IEW +- maintain `resolveTrainQueue` +- build truth-only packets +- trim branches after the first taken branch before training -Relevant files: +### Predictor top / FTQ integration - `src/cpu/pred/btb/common.hh` - `src/cpu/pred/btb/ftq.hh` - `src/cpu/pred/btb/ftq.cc` - `src/cpu/pred/btb/decoupled_bpred.hh` - `src/cpu/pred/btb/decoupled_bpred.cc` +- `src/cpu/pred/btb/decoupled_bpred_stats.cc` - `src/cpu/pred/btb/timed_base_pred.hh` +- `src/cpu/pred/btb/timed_base_pred.cc` -Key changes: - -- adds `generation` to `FetchTarget` -- allocates a fresh generation when a logical FTQ target is created -- adds FTQ identity helpers: generation lookup and identity matching -- adds packet types: - - `FetchTargetIdentity` - - `ResolvedBranch` - - `ResolvedTrainPacket` -- adds `DecoupledBPUWithBTB::resolveTrain()` -- adds default component hooks: - - `canResolveTrain(packet)` - - `resolveTrain(packet)` -- adds top-level packet validation before fan-out - -Main review questions: +These files now: -- is FTQ generation sufficient to reject stale resolve traffic? -- does packet validation reject malformed branch lists and stale metadata? -- does `resolveTrain()` preserve the old probe/apply contract semantics? 
+- track FTQ target generation identity +- validate full resolve packets using structural checks +- dispatch full resolve packets to migrated components +- preserve commit/update path for non-resolved-stage training -### 3.3 Migrated predictors +### Migrated predictor components -Relevant files: - -- `src/cpu/pred/btb/mbtb.hh` - `src/cpu/pred/btb/mbtb.cc` -- `src/cpu/pred/btb/btb_tage.hh` - `src/cpu/pred/btb/btb_tage.cc` -- `src/cpu/pred/btb/btb_ittage.hh` - `src/cpu/pred/btb/btb_ittage.cc` -Migrated components: - -- `MBTB` -- `BTBTAGE` -- `BTBITTAGE` - -Current split: - -- in full-resolve mode, these three no longer consume legacy resolved-update -- other non-migrated components can still use legacy resolved-update if enabled - -Main review questions: +## 5. Key behavior changes since early migration commits -- does each component now train only from packet truth on the new path? -- are legacy side effects cleanly disabled for migrated components? -- do bank conflict / readiness semantics still behave the same? +The branch has moved beyond the earlier intermediate state described in older +notes. -## 4. Component-by-component summary +Current important fixes include: -### 4.1 MBTB +### 5.1 Truth-only packets -What changed: +`ResolvedTrainPacket` no longer stores duplicated predictor metadata. Training +reads metadata from the live `FetchTarget` instead. 
-- packet-based `canResolveTrain()` / `resolveTrain()` were added -- new path no longer depends on: - - `updateBTBEntries` - - `updateNewBTBEntry` - - per-entry resolved bits inside `FetchTarget` -- packet updates reuse existing SRAM / victim-cache update machinery +### 5.2 BTBTAGE new-entry handling -Important review points: +Full resolve-train now distinguishes: -- MBTB legacy resolved-update is skipped in full-resolve mode -- MBTB-specific legacy prepare/mark side effects are also gated off in - full-resolve mode +- existing predicted entries +- new-entry candidates -### 4.2 BTBTAGE +so short-pattern conditional branches can allocate and grow similarly to the +legacy `update()` path without depending on squash-derived helper state. -What changed: +### 5.3 RTL-style prefix trimming -- packet-based bank-conflict probe added in `canResolveTrain()` -- packet path now trains using prediction snapshots, not squash-populated state -- metadata now retains predicted conditional `BTBEntry` per branch PC -- missing-meta conditional branches on the packet path are now materialized and - still trained, matching old new-entry behavior +When a packet contains multiple resolved branches, fetch trims the branch list to +the prefix up to and including the first taken branch. -Important review points: +This avoids the previous failure mode where packet validation rejected the whole +packet because branches existed after a taken branch. -- packet path only trains intended conditional branches -- packet conflict failures now drive `notifyResolveFailure()` so retry and - prediction throttling still work +## 6. Configuration semantics -### 4.3 BTBITTAGE +Current configs are centered on `enableFullResolveTrain`. 
-What changed: +The old `resolvedUpdate` concept used to mean: -- packet-based `resolveTrain()` now uses indirect branch truth from packet data -- no longer depends on squash-derived `exeBranchInfo` on the new path -- legacy resolved-update is skipped in full-resolve mode +- this component trains at resolve stage rather than only at commit -Important review points: +That intent is now expressed through the new full resolve-train path for the +migrated BTB predictors, not through a legacy helper chain. -- training remains scoped to indirect non-return branches -- alternate-provider update now only happens when `alt_info.found` is true, - fixing an existing corruption hazard in both old and new paths +### Current intent -## 5. Current default behavior +- `kmhv3`: full resolve-train enabled for the migrated BTB set +- `idealkmhv3`: explicit control is still available through the top-level switch -Current defaults are set in `src/cpu/o3/BaseO3CPU.py`: +## 7. What reviewers should focus on -- `enableFullResolveTrain = True` -- `enableLegacyResolveUpdate = True` +### For architecture review -This is intentional. +- Is the resolved-stage path now single-source and coherent? +- Is FTQ generation sufficient for stale-target filtering? +- Does fetch-side branch trimming match intended RTL behavior? -Reason: +### For predictor review -- migrated components already use the packet path -- some other `resolvedUpdate` users may still exist outside this migration set -- keeping legacy enabled avoids silent loss of resolve-stage training during the - rollout period +- Do `MBTB`, `BTBTAGE`, and `BTBITTAGE` consume truth-only packets correctly? +- Is new-entry handling independent from squash-era legacy helper state? +- Does commit/update behavior remain intact where it should? 
-So current behavior is: +### For cleanup review -- `MBTB`, `BTBTAGE`, `BTBITTAGE` -> full packet path -- non-migrated resolved-update components -> legacy path -- commit-time predictors -> unchanged commit path +- Was the old `resolvedCFIs -> resolveQueue -> resolveUpdate()` chain removed + cleanly? +- Did config semantics stop pointing at deleted legacy machinery? -## 6. Verification done on this branch +## 8. Verification currently used on this branch -Fresh verification used before final commit creation: +### Build -- build: `scons build/RISCV/gem5.opt --gold-linker -j60` -- unit test: `build/RISCV/cpu/pred/btb/test/tage.test.debug` +- `scons build/RISCV/gem5.opt --gold-linker -j60` -Observed result: +### Unit tests -- `gem5.opt` builds successfully -- `tage.test.debug` passes `21/21` +- `build/RISCV/cpu/pred/btb/test/tage.test.debug` -New or extended test coverage includes packet-mode BTBTAGE cases for: +Current result: -- bank-conflict probe behavior -- packet-truth conditional selection -- new conditional entry training without prediction metadata +- `22/22` passing -## 7. Suggested review order +### Targeted workloads -For fastest review, read in this order: +Used repeatedly during this branch: -1. `src/cpu/o3/comm.hh` -2. `src/cpu/o3/iew.cc` -3. `src/cpu/o3/fetch.hh` -4. `src/cpu/o3/fetch.cc` -5. `src/cpu/pred/btb/common.hh` -6. `src/cpu/pred/btb/ftq.hh` -7. `src/cpu/pred/btb/decoupled_bpred.hh` -8. `src/cpu/pred/btb/decoupled_bpred.cc` -9. `src/cpu/pred/btb/mbtb.cc` -10. `src/cpu/pred/btb/btb_tage.cc` -11. `src/cpu/pred/btb/btb_ittage.cc` -12. `src/cpu/pred/btb/test/btb_tage.test.cc` +- `tage1` +- `usefulbit` +- `tage2` -## 8. Known follow-up work +The current branch state keeps these in the recovered performance range after the +packet-trimming fix and legacy-path cleanup. -This branch does not yet remove the legacy path. +## 9. 
Remaining follow-up work -Natural next steps after review: +Likely next steps after this branch stabilizes: -- migrate any remaining `resolvedUpdate` components if needed -- once all needed components are packetized, turn `enableLegacyResolveUpdate` - default off -- then delete legacy `resolvedCFIs` / `prepareResolveUpdateEntries()` / - `markCFIResolved()`-based training path +- decide whether `SC` should be migrated to full resolve-train +- decide whether `MicroTAGE` should be migrated to full resolve-train for closer + RTL parity +- continue reducing stale-drop rate if workload-level gaps remain From a817c9cf66027cd33ced933f75a6a270a94dab9a Mon Sep 17 00:00:00 2001 From: Yakkhini <59007159+Yakkhini@users.noreply.github.com> Date: Wed, 8 Apr 2026 16:59:20 +0800 Subject: [PATCH 10/15] cpu: only filter entries in resolve BTB update Change-Id: I371e8e306f55cb7eca8991e77db53f8825d10874 --- src/cpu/pred/btb/mbtb.cc | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/cpu/pred/btb/mbtb.cc b/src/cpu/pred/btb/mbtb.cc index 1f41a73ad5..88535301e0 100644 --- a/src/cpu/pred/btb/mbtb.cc +++ b/src/cpu/pred/btb/mbtb.cc @@ -853,9 +853,11 @@ MBTB::prepareUpdateEntries(const FetchTarget &stream) { all_entries.push_back(potential_new_entry); } - auto remove_it = std::remove_if(all_entries.begin(), all_entries.end(), - [](const BTBEntry &e) { return !e.resolved; }); - all_entries.erase(remove_it, all_entries.end()); + if (getResolvedUpdate()) { + auto remove_it = std::remove_if(all_entries.begin(), all_entries.end(), + [](const BTBEntry &e) { return !e.resolved; }); + all_entries.erase(remove_it, all_entries.end()); + } return all_entries; } From 03981d979310c75257583dfe8f8635806e497299 Mon Sep 17 00:00:00 2001 From: Yakkhini <59007159+Yakkhini@users.noreply.github.com> Date: Wed, 8 Apr 2026 17:38:22 +0800 Subject: [PATCH 11/15] misc: cleanup code Change-Id: If72220a598a23ca2aad8fe50a390b80fdce11a7b --- src/cpu/o3/dyn_inst.hh | 10 ---------- 
src/cpu/o3/iew.cc | 7 +++++-- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/src/cpu/o3/dyn_inst.hh b/src/cpu/o3/dyn_inst.hh index 015b6e4051..76b28446d8 100644 --- a/src/cpu/o3/dyn_inst.hh +++ b/src/cpu/o3/dyn_inst.hh @@ -1629,16 +1629,6 @@ class DynInst : public ExecContext, public RefCounted return rpc.compressed() ? 2 : 4; } - bool isRVC() const - { - return pc->as().compressed(); - } - - Addr getControlTarget() - { - return branching() ? getNPC() : pcState().getFallThruPC(); - } - void setFtqOffset(uint8_t offset) { ftqOffset = offset; diff --git a/src/cpu/o3/iew.cc b/src/cpu/o3/iew.cc index b77ac88778..57d34000c7 100644 --- a/src/cpu/o3/iew.cc +++ b/src/cpu/o3/iew.cc @@ -1468,10 +1468,13 @@ IEW::SquashCheckAfterExe(DynInstPtr inst) if (params.enableFullResolveTrain) { auto &resolve_entries = toFetch->iewInfo[tid].resolveTrainEntries; TimeStruct::IewComm::ResolveTrainEntry entry; + const auto control_target = inst->branching() ? + inst->getNPC() : inst->pcState().getFallThruPC(); + const auto is_rvc = inst->pcState().as().compressed(); entry.ftqId = inst->getFtqId(); entry.ftqGeneration = inst->getFtqGeneration(); entry.pc = inst->getPC(); - entry.target = inst->getControlTarget(); + entry.target = control_target; entry.taken = inst->branching(); entry.mispredict = control_mispredict; entry.ftqOffset = inst->getFtqOffset(); @@ -1480,7 +1483,7 @@ IEW::SquashCheckAfterExe(DynInstPtr inst) entry.isIndirect = inst->isIndirectCtrl(); entry.isCall = inst->isCall(); entry.isReturn = inst->isReturn(); - entry.isRVC = inst->isRVC(); + entry.isRVC = is_rvc; resolve_entries.push_back(entry); } } From dba66368fcbab10fa67727ca064b93a85fb14fbc Mon Sep 17 00:00:00 2001 From: Yakkhini <59007159+Yakkhini@users.noreply.github.com> Date: Wed, 8 Apr 2026 18:16:11 +0800 Subject: [PATCH 12/15] misc: move trim func out of anon namespace Change-Id: I16d6b8d495ef16e9e20f4b5ca5a8262484674ae6 --- src/cpu/o3/fetch.cc | 35 +++++++++++++++-------------------- 
src/cpu/o3/fetch.hh | 1 + 2 files changed, 16 insertions(+), 20 deletions(-) diff --git a/src/cpu/o3/fetch.cc b/src/cpu/o3/fetch.cc index 6d43b1fb1f..6a04d6e6a5 100644 --- a/src/cpu/o3/fetch.cc +++ b/src/cpu/o3/fetch.cc @@ -81,29 +81,9 @@ namespace gem5 namespace o3 { -namespace -{ - constexpr uint8_t RvcInstBytes = 2; constexpr uint8_t BaseInstBytes = 4; -void -trimResolveTrainInstsAfterTaken(Fetch::ResolveTrainQueueEntry &entry) -{ - auto first_taken = std::find_if( - entry.insts.begin(), entry.insts.end(), - [](const Fetch::ResolveTrainInstData &inst_data) { - return inst_data.taken; - }); - if (first_taken == entry.insts.end()) { - return; - } - - entry.insts.erase(std::next(first_taken), entry.insts.end()); -} - -} // anonymous namespace - Fetch::IcachePort::IcachePort(Fetch *_fetch, CPU *_cpu) : RequestPort(_cpu->name() + ".icache_port", _cpu), fetch(_fetch) {} @@ -1657,6 +1637,21 @@ Fetch::appendResolveTrainInst( trimResolveTrainInstsAfterTaken(entry); } +void +Fetch::trimResolveTrainInstsAfterTaken(ResolveTrainQueueEntry &entry) +{ + auto first_taken = std::find_if( + entry.insts.begin(), entry.insts.end(), + [](const ResolveTrainInstData &inst_data) { + return inst_data.taken; + }); + if (first_taken == entry.insts.end()) { + return; + } + + entry.insts.erase(std::next(first_taken), entry.insts.end()); +} + void Fetch::filterResolveTrainQueue() { diff --git a/src/cpu/o3/fetch.hh b/src/cpu/o3/fetch.hh index 5cf65cebb0..90a9e7d1d1 100644 --- a/src/cpu/o3/fetch.hh +++ b/src/cpu/o3/fetch.hh @@ -500,6 +500,7 @@ class Fetch const ResolveTrainQueueEntry &entry) const; void appendResolveTrainInst( ResolveTrainQueueEntry &entry, const ResolveTrainInstData &inst_data); + void trimResolveTrainInstsAfterTaken(ResolveTrainQueueEntry &entry); void filterResolveTrainQueue(); /** Handles decode squash signals. 
From 08425fc68f820ed1fce09fc332d4f555d3f8037c Mon Sep 17 00:00:00 2001 From: Yakkhini <59007159+Yakkhini@users.noreply.github.com> Date: Thu, 9 Apr 2026 11:30:59 +0800 Subject: [PATCH 13/15] misc: remove redundant test & counter Change-Id: I6f1de90ea368558913dc913593ac0f193f54976b --- .../full_resolve_train_review_guide.md | 22 +- src/cpu/o3/fetch.cc | 5 - src/cpu/o3/fetch.hh | 2 - src/cpu/pred/btb/decoupled_bpred.cc | 7 - src/cpu/pred/btb/decoupled_bpred.hh | 9 - src/cpu/pred/btb/decoupled_bpred_stats.cc | 14 -- src/cpu/pred/btb/test/btb_tage.test.cc | 205 +----------------- 7 files changed, 22 insertions(+), 242 deletions(-) diff --git a/docs/Gem5_Docs/frontend/full_resolve_train_review_guide.md b/docs/Gem5_Docs/frontend/full_resolve_train_review_guide.md index 195d256674..5f02c62e03 100644 --- a/docs/Gem5_Docs/frontend/full_resolve_train_review_guide.md +++ b/docs/Gem5_Docs/frontend/full_resolve_train_review_guide.md @@ -127,6 +127,10 @@ These files now: - track FTQ target generation identity - validate full resolve packets using structural checks - dispatch full resolve packets to migrated components +- expose only the retained top-level resolve-train counters: + `fullResolveTrainAccepted`, `fullResolveTrainRejectTargetMismatch`, + `fullResolveTrainRejectPacketValidation`, and + `fullResolveTrainRejectComponent` - preserve commit/update path for non-resolved-stage training ### Migrated predictor components @@ -209,11 +213,23 @@ migrated BTB predictors, not through a legacy helper chain. 
### Unit tests -- `build/RISCV/cpu/pred/btb/test/tage.test.debug` +- build: `scons build/RISCV/cpu/pred/btb/test/tage.test.debug --unit-test -j60` +- run: `build/RISCV/cpu/pred/btb/test/tage.test.debug --gtest_filter=BTBTAGETest.*` -Current result: +Current retained coverage: -- `22/22` passing +- the existing `BTBTAGETest.*` suite remains in place +- within the branch-added cleanup surface, the retained regressions are: + `NewConditionalEntryWithoutPredictionMetaStillTrains`, + `ResolveTrainBankConflict`, + `ResolveTrainUsesPacketTruthForConditionalSelection`, and + `ResolveTrainRepeatedShortPatternMatchesLegacyProviderGrowth` + +Removed from the branch verification surface: + +- rollout-time debug counters +- legacy resolve-update-only `BankConflict` +- exploratory `BTBTAGEUpperBound*` checks added during branch development ### Targeted workloads diff --git a/src/cpu/o3/fetch.cc b/src/cpu/o3/fetch.cc index 6a04d6e6a5..c1db255b9a 100644 --- a/src/cpu/o3/fetch.cc +++ b/src/cpu/o3/fetch.cc @@ -266,8 +266,6 @@ Fetch::FetchStatGroup::FetchStatGroup(CPU *cpu, Fetch *fetch) frontendBound - frontendLatencyBound), ADD_STAT(fullResolveEntriesReceived, statistics::units::Count::get(), "Number of full resolve entries received by fetch"), - ADD_STAT(fullResolveEntriesMerged, statistics::units::Count::get(), - "Number of full resolve entries merged by fetch"), ADD_STAT(fullResolveEntriesDroppedQueueFull, statistics::units::Count::get(), "Number of full resolve entries dropped because the queue is full"), @@ -358,8 +356,6 @@ Fetch::FetchStatGroup::FetchStatGroup(CPU *cpu, Fetch *fetch) .flags(statistics::total); fullResolveEntriesReceived .prereq(fullResolveEntriesReceived); - fullResolveEntriesMerged - .prereq(fullResolveEntriesMerged); fullResolveEntriesDroppedQueueFull .prereq(fullResolveEntriesDroppedQueueFull); fullResolveEntriesDroppedStaleTarget @@ -1520,7 +1516,6 @@ Fetch::handleIEWSignals() resolved.mispredict, resolved.ftqOffset, resolved.isCond, resolved.isDirect, 
resolved.isIndirect, resolved.isCall, resolved.isReturn, resolved.isRVC)); - fetchStats.fullResolveEntriesMerged++; continue; } diff --git a/src/cpu/o3/fetch.hh b/src/cpu/o3/fetch.hh index 90a9e7d1d1..a9b5b28de9 100644 --- a/src/cpu/o3/fetch.hh +++ b/src/cpu/o3/fetch.hh @@ -1125,8 +1125,6 @@ class Fetch statistics::Formula frontendBandwidthBound; /** Full resolve entries observed at fetch. */ statistics::Scalar fullResolveEntriesReceived; - /** Full resolve entries merged with an existing target. */ - statistics::Scalar fullResolveEntriesMerged; /** Full resolve entries dropped because the queue is full. */ statistics::Scalar fullResolveEntriesDroppedQueueFull; /** Full resolve entries dropped because the target went stale. */ diff --git a/src/cpu/pred/btb/decoupled_bpred.cc b/src/cpu/pred/btb/decoupled_bpred.cc index 18aab26ebb..9c9d8e7250 100644 --- a/src/cpu/pred/btb/decoupled_bpred.cc +++ b/src/cpu/pred/btb/decoupled_bpred.cc @@ -30,27 +30,22 @@ DecoupledBPUWithBTB::validateResolvedTrainPacket(const ResolvedTrainPacket &pack bool seenTaken = false; for (const auto &resolved : packet.realBranches) { if (resolved.branch.pc < packet.startPC) { - dbpBtbStats.fullResolveTrainValidationPcBeforeStart++; return ResolveTrainValidationReason::PcBeforeStart; } if (resolved.branch.size == 0) { - dbpBtbStats.fullResolveTrainValidationZeroSize++; return ResolveTrainValidationReason::ZeroSize; } if (seenTaken) { - dbpBtbStats.fullResolveTrainValidationAfterTaken++; return ResolveTrainValidationReason::AfterTaken; } if (!firstBranch) { if (resolved.ftqOffset < lastOffset) { - dbpBtbStats.fullResolveTrainValidationOffsetReversed++; return ResolveTrainValidationReason::OffsetReversed; } if (resolved.ftqOffset == lastOffset && resolved.branch.pc <= lastPc) { - dbpBtbStats.fullResolveTrainValidationPcOrderSameOffset++; return ResolveTrainValidationReason::PcOrderSameOffset; } } @@ -638,7 +633,6 @@ DecoupledBPUWithBTB::resolveTrain( DPRINTF(DecoupleBP, "Resolve-train packet tid 
mismatch: packet=%u arg=%u\n", packet.tid, tid); - dbpBtbStats.fullResolveTrainRejectTidMismatch++; return false; } @@ -656,7 +650,6 @@ DecoupledBPUWithBTB::resolveTrain( DPRINTF(DecoupleBP, "Resolve-train packet startPC mismatch: packet=%#lx ftq=%#lx id=%lu tid=%u\n", packet.startPC, target.startPC, packet.target.id, tid); - dbpBtbStats.fullResolveTrainRejectStartPCMismatch++; return false; } diff --git a/src/cpu/pred/btb/decoupled_bpred.hh b/src/cpu/pred/btb/decoupled_bpred.hh index 95b4597017..5183f406a4 100644 --- a/src/cpu/pred/btb/decoupled_bpred.hh +++ b/src/cpu/pred/btb/decoupled_bpred.hh @@ -314,19 +314,10 @@ class DecoupledBPUWithBTB : public BPredUnit // Full resolve train reject statistics statistics::Scalar fullResolveTrainAccepted; - statistics::Scalar fullResolveTrainRejectTidMismatch; statistics::Scalar fullResolveTrainRejectTargetMismatch; - statistics::Scalar fullResolveTrainRejectStartPCMismatch; statistics::Scalar fullResolveTrainRejectPacketValidation; statistics::Scalar fullResolveTrainRejectComponent; - // Full resolve train validation sub-reasons - statistics::Scalar fullResolveTrainValidationPcBeforeStart; - statistics::Scalar fullResolveTrainValidationZeroSize; - statistics::Scalar fullResolveTrainValidationAfterTaken; - statistics::Scalar fullResolveTrainValidationOffsetReversed; - statistics::Scalar fullResolveTrainValidationPcOrderSameOffset; - statistics::Scalar s1PredWrongFallthrough; statistics::Scalar s1PredWrongUbtb; statistics::Scalar s1PredWrongAbtb; diff --git a/src/cpu/pred/btb/decoupled_bpred_stats.cc b/src/cpu/pred/btb/decoupled_bpred_stats.cc index fb1bc846e2..21c37c0a0b 100644 --- a/src/cpu/pred/btb/decoupled_bpred_stats.cc +++ b/src/cpu/pred/btb/decoupled_bpred_stats.cc @@ -460,26 +460,12 @@ DecoupledBPUWithBTB::DBPBTBStats::DBPBTBStats( ADD_STAT(predictionBlockedForUpdate, statistics::units::Count::get(), "prediction blocked for update priority"), ADD_STAT(fullResolveTrainAccepted, statistics::units::Count::get(), 
"accepted full resolve-train packets"), - ADD_STAT(fullResolveTrainRejectTidMismatch, statistics::units::Count::get(), - "full resolve-train packets rejected due to thread id mismatch"), ADD_STAT(fullResolveTrainRejectTargetMismatch, statistics::units::Count::get(), "full resolve-train packets rejected due to FTQ target mismatch"), - ADD_STAT(fullResolveTrainRejectStartPCMismatch, statistics::units::Count::get(), - "full resolve-train packets rejected due to startPC mismatch"), ADD_STAT(fullResolveTrainRejectPacketValidation, statistics::units::Count::get(), "full resolve-train packets rejected due to packet validation failure"), ADD_STAT(fullResolveTrainRejectComponent, statistics::units::Count::get(), "full resolve-train packets rejected by predictor components"), - ADD_STAT(fullResolveTrainValidationPcBeforeStart, statistics::units::Count::get(), - "packet validation failures because branch pc is before startPC"), - ADD_STAT(fullResolveTrainValidationZeroSize, statistics::units::Count::get(), - "packet validation failures because branch size is zero"), - ADD_STAT(fullResolveTrainValidationAfterTaken, statistics::units::Count::get(), - "packet validation failures because another branch appears after a taken branch"), - ADD_STAT(fullResolveTrainValidationOffsetReversed, statistics::units::Count::get(), - "packet validation failures because ftq offsets are not monotonic"), - ADD_STAT(fullResolveTrainValidationPcOrderSameOffset, statistics::units::Count::get(), - "packet validation failures because pcs at same offset are not strictly increasing"), ADD_STAT(s1PredWrongFallthrough, statistics::units::Count::get(), "S1pred wrong full throughs"), ADD_STAT(s1PredWrongUbtb, statistics::units::Count::get(),"S1pred wrong using ubtb "), ADD_STAT(s1PredWrongAbtb, statistics::units::Count::get(), "S1pred wrong using abtb "), diff --git a/src/cpu/pred/btb/test/btb_tage.test.cc b/src/cpu/pred/btb/test/btb_tage.test.cc index fff528f090..41eb8de123 100644 --- 
a/src/cpu/pred/btb/test/btb_tage.test.cc +++ b/src/cpu/pred/btb/test/btb_tage.test.cc @@ -5,7 +5,6 @@ #include "base/types.hh" #include "cpu/pred/btb/btb_tage.hh" -#include "cpu/pred/btb/btb_tage_ub.hh" #include "cpu/pred/btb/common.hh" #include "cpu/pred/btb/folded_hist.hh" @@ -1064,83 +1063,11 @@ TEST_F(BTBTAGETest, NewConditionalEntryWithoutPredictionMetaStillTrains) { } /** - * @brief Test bank conflict detection + * @brief Test resolve-train bank conflict detection * - * Verifies: - * 1. Same bank access causes conflict and drops update (when enabled) - * 2. Different bank access has no conflict - * 3. Disabled flag prevents conflict detection + * Verifies that a same-bank resolve-train update is rejected when bank + * conflict checking is enabled. */ -TEST_F(BTBTAGETest, BankConflict) { - // Create TAGE with 4 banks - BTBTAGE *bankTage = new BTBTAGE(4, 2, 1024, 4); - boost::dynamic_bitset<> testHistory(128); - std::vector testStagePreds(5); - - // Bank ID derives from bits [2:1] (pc >> 1) & 0x3 when instShiftAmt == 1. - // Bank 0: ..., 0x100, 0x108 ... Bank 1: ..., 0x102, 0x10A ... - // Bank 2: ..., 0x104, 0x10C ... Bank 3: ..., 0x106, 0x10E ... 
- - // Test 1: Same bank conflict (enabled) - bankTage->enableBankConflict = true; - { - // Predict on bank 1 (0x20), then update on bank 1 (0xa0) - testStagePreds[1].btbEntries = {createBTBEntry(0x20)}; - bankTage->putPCHistory(0x20, testHistory, testStagePreds); - EXPECT_TRUE(bankTage->predBankValid); - - auto meta = bankTage->getPredictionMeta(); - FetchTarget stream = createStream(0xa0, createBTBEntry(0xa0), true, meta); - setupTageEntry(bankTage, 0xa0, 0, 1, false); - - uint64_t conflicts_before = bankTage->tageStats.updateBankConflict; - bool can_update = bankTage->canResolveUpdate(stream); - - // Should detect conflict and defer update - EXPECT_EQ(bankTage->tageStats.updateBankConflict, conflicts_before + 1); - EXPECT_FALSE(can_update); - EXPECT_FALSE(bankTage->predBankValid); - } - - // Test 2: Different bank, no conflict - { - // Predict on bank 0 (0x100), update on bank 2 (0x104) - testStagePreds[1].btbEntries = {createBTBEntry(0x100)}; - bankTage->putPCHistory(0x100, testHistory, testStagePreds); - - auto meta = bankTage->getPredictionMeta(); - FetchTarget stream = createStream(0x104, createBTBEntry(0x104), true, meta); - - uint64_t conflicts_before = bankTage->tageStats.updateBankConflict; - bool can_update = bankTage->canResolveUpdate(stream); - ASSERT_TRUE(can_update); - bankTage->doResolveUpdate(stream); - - // Should not detect conflict - EXPECT_EQ(bankTage->tageStats.updateBankConflict, conflicts_before); - } - - // Test 3: Disabled flag prevents conflict - bankTage->enableBankConflict = false; - { - // Same bank (0x20 and 0xa0), but conflict disabled - testStagePreds[1].btbEntries = {createBTBEntry(0x20)}; - bankTage->putPCHistory(0x20, testHistory, testStagePreds); - - auto meta = bankTage->getPredictionMeta(); - FetchTarget stream = createStream(0xa0, createBTBEntry(0xa0), true, meta); - setupTageEntry(bankTage, 0xa0, 0, 1, false); - - uint64_t conflicts_before = bankTage->tageStats.updateBankConflict; - bool can_update = 
bankTage->canResolveUpdate(stream); - ASSERT_TRUE(can_update); - bankTage->doResolveUpdate(stream); - - // No conflict even with same bank - EXPECT_EQ(bankTage->tageStats.updateBankConflict, conflicts_before); - } -} - TEST_F(BTBTAGETest, ResolveTrainBankConflict) { BTBTAGE bankTage(4, 2, 1024, 4); memset(&bankTage.tageStats, 0, sizeof(BTBTAGE::TageStats)); @@ -1282,132 +1209,6 @@ TEST_F(BTBTAGETest, ResolveTrainRepeatedShortPatternMatchesLegacyProviderGrowth) << "Full resolve-train should build the same set of TAGE tables as legacy update"; } -class BTBTAGEUpperBoundTest : public ::testing::Test -{ - protected: - void SetUp() override { - tage = new BTBTAGEUpperBound(); - memset(&tage->tageStats, 0, sizeof(BTBTAGE::TageStats)); - history.resize(128, false); - stagePreds.resize(2); - } - - BTBTAGEUpperBound *tage; - boost::dynamic_bitset<> history; - std::vector stagePreds; -}; - -class BTBTAGEUpperBoundPathHashTest : public ::testing::Test -{ - protected: - void SetUp() override { - tage = new BTBTAGEUpperBound(4, 1024, 4, - BTBTAGEUpperBound::HistorySource::PathHash); - memset(&tage->tageStats, 0, sizeof(BTBTAGE::TageStats)); - outcomeHistory.resize(128, false); - pathHistory.resize(128, false); - stagePreds.resize(2); - } - - BTBTAGEUpperBound *tage; - boost::dynamic_bitset<> outcomeHistory; - boost::dynamic_bitset<> pathHistory; - std::vector stagePreds; -}; - -TEST_F(BTBTAGEUpperBoundTest, ExactContextLookup) { - BTBEntry entry = createBTBEntry(0x1000, true, true, false, -1); - boost::dynamic_bitset<> historyA(128, 0); - boost::dynamic_bitset<> historyB(128, 0); - historyB[0] = true; - - ASSERT_TRUE(tage->insertExactEntry(3, entry.pc, historyA, 2)); - EXPECT_TRUE(tage->hasExactEntry(3, entry.pc, historyA)); - EXPECT_FALSE(tage->hasExactEntry(3, entry.pc, historyB)); - - bool predA = predictTAGE(tage, 0x1000, {entry}, historyA, stagePreds); - bool predB = predictTAGE(tage, 0x1000, {entry}, historyB, stagePreds); - - EXPECT_TRUE(predA); - EXPECT_FALSE(predB); -} 
- -TEST_F(BTBTAGEUpperBoundTest, ProviderAltSelection) { - BTBEntry entry = createBTBEntry(0x1000, true, true, false, -1); - - ASSERT_TRUE(tage->insertExactEntry(3, entry.pc, history, 0)); - ASSERT_TRUE(tage->insertExactEntry(1, entry.pc, history, -2)); - - predictTAGE(tage, 0x1000, {entry}, history, stagePreds); - auto meta = std::static_pointer_cast(tage->getPredictionMeta()); - auto pred = meta->preds[entry.pc]; - - EXPECT_EQ(pred.mainInfo.table, 3u); - EXPECT_EQ(pred.altInfo.table, 1u); - EXPECT_TRUE(pred.useAlt); - EXPECT_FALSE(pred.taken); -} - -TEST_F(BTBTAGEUpperBoundTest, AllocationUsesPredictionTimeHistory) { - BTBEntry entry = createBTBEntry(0x1000, true, true, false, -1); - boost::dynamic_bitset<> historyA(128, 0); - boost::dynamic_bitset<> historyB(128, 0); - historyB[0] = true; - - predictTAGE(tage, 0x1000, {entry}, historyA, stagePreds); - auto meta = tage->getPredictionMeta(); - - FetchTarget stream = createStream(0x1000, entry, true, meta); - stream = setMispredStream(stream); - - tage->recoverHist(historyB, stream, 1, true); - tage->update(stream); - - EXPECT_TRUE(tage->hasExactEntry(0, entry.pc, historyA)); - EXPECT_FALSE(tage->hasExactEntry(0, entry.pc, historyB)); -} - -TEST_F(BTBTAGEUpperBoundTest, NewConditionalEntryWithoutPredictionMetaStillTrains) { - boost::dynamic_bitset<> historyA(128, 0); - stagePreds[1].btbEntries.clear(); - tage->putPCHistory(0x1000, historyA, stagePreds); - auto meta = tage->getPredictionMeta(); - - BTBEntry newEntry = createBTBEntry(0x1010, true, true, false, -1); - FetchTarget stream; - stream.startPC = 0x1000; - stream.exeBranchInfo = newEntry; - stream.exeTaken = true; - stream.resolved = true; - stream.predBranchInfo = newEntry; - stream.updateBTBEntries.clear(); - stream.updateIsOldEntry = false; - stream.updateNewBTBEntry = newEntry; - stream.predMetas[0] = meta; - stream = setMispredStream(stream); - - tage->update(stream); - - EXPECT_TRUE(tage->hasExactEntry(0, newEntry.pc, historyA)); -} - 
-TEST_F(BTBTAGEUpperBoundPathHashTest, PredictionUsesPathHashHistorySnapshot) { - BTBEntry entry = createBTBEntry(0x1000, true, true, false, -1, 0x2000); - boost::dynamic_bitset<> pathHistoryA(128, 0); - boost::dynamic_bitset<> pathHistoryB(128, 0); - applyPathHistoryTaken(pathHistoryB, entry.pc, entry.target); - - ASSERT_TRUE(tage->insertExactEntry(2, entry.pc, pathHistoryB, 2)); - - FullBTBPrediction pred; - pred.btbEntries.push_back(entry); - pred.condTakens.push_back({entry.pc, true}); - tage->specUpdatePHist(pathHistoryA, pred); - - bool predicted = predictTAGE(tage, 0x1000, {entry}, outcomeHistory, stagePreds); - - EXPECT_TRUE(predicted); -} } // namespace test From 21e95cbfb05c231019e6f2a679e5d6ebcb03bc89 Mon Sep 17 00:00:00 2001 From: Yaksis <59007159+Yakkhini@users.noreply.github.com> Date: Thu, 9 Apr 2026 17:10:26 +0800 Subject: [PATCH 14/15] cpu: enable resolve train on ideal config for performance eval --- configs/example/idealkmhv3.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/configs/example/idealkmhv3.py b/configs/example/idealkmhv3.py index 51b6c8f702..a51c75b453 100644 --- a/configs/example/idealkmhv3.py +++ b/configs/example/idealkmhv3.py @@ -27,7 +27,6 @@ def setKmhV3IdealParams(args, system): # fetch cpu.mmu.itb.size = 96 cpu.fetchWidth = 32 - cpu.iewToFetchDelay = 2 # for resolved update, should train branch after squash cpu.commitToFetchDelay = 2 cpu.fetchQueueSize = 64 @@ -83,7 +82,7 @@ def setKmhV3IdealParams(args, system): # branch predictor if args.bp_type == 'DecoupledBPUWithBTB': - cpu.enableFullResolveTrain = False + cpu.enableFullResolveTrain = True cpu.branchPred.ftq_size = 64 cpu.branchPred.fsq_size = 64 # cpu.branchPred.microtage.enabled = False From 1ae2c03f4ddf3a03de9b0ff69e5c16db7e5f0fa3 Mon Sep 17 00:00:00 2001 From: Yakkhini <59007159+Yakkhini@users.noreply.github.com> Date: Wed, 15 Apr 2026 12:30:05 +0800 Subject: [PATCH 15/15] cpu: port MicroTAGE to new resolve train Change-Id: 
I0ff2929bd102656f2e4cce4de4e783b78406cb2c --- src/cpu/pred/BranchPredictor.py | 2 + src/cpu/pred/btb/microtage.cc | 146 +++++++++++- src/cpu/pred/btb/microtage.hh | 29 ++- src/cpu/pred/btb/test/SConscript | 8 + src/cpu/pred/btb/test/microtage.test.cc | 298 ++++++++++++++++++++++++ 5 files changed, 474 insertions(+), 9 deletions(-) create mode 100644 src/cpu/pred/btb/test/microtage.test.cc diff --git a/src/cpu/pred/BranchPredictor.py b/src/cpu/pred/BranchPredictor.py index 772654024f..41cc08ca6e 100644 --- a/src/cpu/pred/BranchPredictor.py +++ b/src/cpu/pred/BranchPredictor.py @@ -1102,6 +1102,8 @@ class MicroTAGE(TimedBaseBTBPredictor): numBanks = Param.Unsigned(4,"Number of banks for bank conflict simulation") enableBankConflict = Param.Bool(False,"Enable bank conflict simulation") numDelay = Param.Unsigned(0,"Prediction latency in cycles") + resolvedUpdate = Param.Bool(Parent.enableFullResolveTrain, + "Train MicroTAGE from full resolve packets instead of commit fallback") class BTBITTAGE(TimedBaseBTBPredictor): type = 'BTBITTAGE' diff --git a/src/cpu/pred/btb/microtage.cc b/src/cpu/pred/btb/microtage.cc index bcc57db899..634cb66984 100644 --- a/src/cpu/pred/btb/microtage.cc +++ b/src/cpu/pred/btb/microtage.cc @@ -267,6 +267,7 @@ MicroTAGE::lookupHelper(const Addr &startPC, const std::vector &btbEnt if (btb_entry.isCond && btb_entry.valid) { auto pred = generateSinglePrediction(btb_entry, startPC); meta->preds[btb_entry.pc] = pred; + meta->btbEntries[btb_entry.pc] = btb_entry; tageStats.updateStatsWithTagePrediction(pred, true); results.push_back({btb_entry.pc, pred.taken || btb_entry.alwaysTaken}); } @@ -387,7 +388,7 @@ bool MicroTAGE::updatePredictorStateAndCheckAllocation(const BTBEntry &entry, bool actual_taken, const TagePrediction &pred, - const FetchTarget &stream) { + bool this_fb_mispred) { tageStats.updateStatsWithTagePrediction(pred, false); auto &main_info = pred.mainInfo; @@ -467,9 +468,6 @@ MicroTAGE::updatePredictorStateAndCheckAllocation(const 
BTBEntry &entry, } } - // Check if misprediction occurred - bool this_fb_mispred = stream.squashType == SquashType::SQUASH_CTRL && - stream.squashPC == entry.pc; // No allocation if no misprediction if (!this_fb_mispred) { return false; @@ -608,6 +606,140 @@ MicroTAGE::doResolveUpdate(const FetchTarget &stream) { update(stream); } +std::vector +MicroTAGE::prepareResolveTrainEntries(const ResolvedTrainPacket &packet, + const std::shared_ptr &predMeta) +{ + std::vector updates; + bool synthesizedNewEntry = false; + + for (const auto &resolved : packet.realBranches) { + if (!resolved.branch.isCond) { + continue; + } + + auto pred_it = predMeta->btbEntries.find(resolved.branch.pc); + BTBEntry entry; + ResolveTrainUpdate::EntryClass entryClass; + if (pred_it != predMeta->btbEntries.end()) { + entry = pred_it->second; + entryClass = ResolveTrainUpdate::EntryClass::ExistingPredictedEntry; + } else { + // Legacy update only exposes at most one MBTB-generated new entry, + // and only for the actually taken unpredicted branch. 
+ if (synthesizedNewEntry || !resolved.taken) { + continue; + } + + entry = BTBEntry(resolved.branch); + entry.valid = true; + entryClass = ResolveTrainUpdate::EntryClass::NewEntryCandidate; + synthesizedNewEntry = true; + } + + if (entry.alwaysTaken) { + continue; + } + + updates.push_back({entry, resolved, entryClass}); + } + + return updates; +} + +bool +MicroTAGE::canResolveTrain(const ResolvedTrainPacket &packet, + const FetchTarget &target) +{ + Addr startAddr = packet.startPC; + unsigned updateBank = getBankId(startAddr); + +#ifndef UNIT_TEST + tageStats.updateAccessPerBank[updateBank]++; +#endif + + if (enableBankConflict && predBankValid && updateBank == lastPredBankId) { + tageStats.updateBankConflict++; + tageStats.updateDeferredDueToConflict++; +#ifndef UNIT_TEST + tageStats.updateBankConflictPerBank[updateBank]++; +#endif + DPRINTF(UTAGE, "Bank conflict detected: resolve-train bank %u conflicts with prediction bank %u, " + "deferring this packet\n", + updateBank, lastPredBankId); + predBankValid = false; + return false; + } + + return true; +} + +void +MicroTAGE::resolveTrain(const ResolvedTrainPacket &packet, + const FetchTarget &target) +{ + if (enableBankConflict && predBankValid) { + predBankValid = false; + } + + auto predMeta = std::static_pointer_cast( + target.predMetas[getComponentIdx()]); + if (!predMeta) { + DPRINTF(UTAGE, "resolveTrain: no live prediction meta, skip\n"); + return; + } + + auto entries_to_update = prepareResolveTrainEntries(packet, predMeta); + + bool utage_hit = false; + for (const auto &update : entries_to_update) { + const auto &btb_entry = update.entry; + const bool isNewEntry = + update.entryClass == ResolveTrainUpdate::EntryClass::NewEntryCandidate; + TagePrediction recomputed; + if (!isNewEntry && !updateOnRead) { + auto pred_it = predMeta->preds.find(btb_entry.pc); + if (pred_it != predMeta->preds.end()) { + recomputed = pred_it->second; + } else { + recomputed = generateSinglePrediction(btb_entry, packet.startPC, + 
predMeta); + } + } else { + recomputed = generateSinglePrediction(btb_entry, packet.startPC, + predMeta); + } + + if (recomputed.mainprovided) { + utage_hit = true; + } + + bool need_allocate = updatePredictorStateAndCheckAllocation( + btb_entry, update.resolved.taken, recomputed, update.resolved.mispredict); + + if (need_allocate) { + uint start_table = 0; + auto &main_info = recomputed.mainInfo; + if (main_info.found) { + start_table = main_info.table + 1; + } + + uint64_t allocated_table = 0; + uint64_t allocated_index = 0; + uint64_t allocated_way = 0; + handleNewEntryAllocation(packet.startPC, btb_entry, + update.resolved.taken, start_table, + predMeta, allocated_table, + allocated_index, allocated_way); + } + } + + if (utage_hit) { + tageStats.updateUtageHit++; + } + DPRINTF(UTAGE, "end resolveTrain\n"); +} + /** * @brief Updates the TAGE predictor state based on actual branch execution results * @@ -652,8 +784,10 @@ MicroTAGE::update(const FetchTarget &stream) { if (recomputed.mainprovided) { utage_hit = true; } - // Update predictor state and check if need to allocate new entry - bool need_allocate = updatePredictorStateAndCheckAllocation(btb_entry, actual_taken, recomputed, stream); + bool this_fb_mispred = stream.squashType == SquashType::SQUASH_CTRL && + stream.squashPC == btb_entry.pc; + bool need_allocate = updatePredictorStateAndCheckAllocation( + btb_entry, actual_taken, recomputed, this_fb_mispred); // Handle new entry allocation if needed bool alloc_success = false; diff --git a/src/cpu/pred/btb/microtage.hh b/src/cpu/pred/btb/microtage.hh index da593f6787..7e9a6b3c85 100644 --- a/src/cpu/pred/btb/microtage.hh +++ b/src/cpu/pred/btb/microtage.hh @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -150,6 +151,10 @@ class MicroTAGE : public TimedBaseBTBPredictor void update(const FetchTarget &entry) override; bool canResolveUpdate(const FetchTarget &entry) override; void doResolveUpdate(const FetchTarget &entry) override; + bool 
canResolveTrain(const ResolvedTrainPacket &packet, + const FetchTarget &target) override; + void resolveTrain(const ResolvedTrainPacket &packet, + const FetchTarget &target) override; #ifndef UNIT_TEST void commitBranch(const FetchTarget &stream, const DynInstPtr &inst) override; @@ -335,6 +340,7 @@ public: typedef struct TageMeta { std::unordered_map preds; + std::unordered_map btbEntries; std::vector tagFoldedHist; std::vector indexFoldedHist; std::vector altTagFoldedHist; @@ -358,9 +364,26 @@ private: // Helper method to update predictor state for a single entry bool updatePredictorStateAndCheckAllocation(const BTBEntry &entry, - bool actual_taken, - const TagePrediction &pred, - const FetchTarget &stream); + bool actual_taken, + const TagePrediction &pred, + bool this_fb_mispred); + + struct ResolveTrainUpdate + { + enum class EntryClass + { + ExistingPredictedEntry, + NewEntryCandidate, + }; + + BTBEntry entry; + ResolvedBranch resolved; + EntryClass entryClass; + }; + + std::vector + prepareResolveTrainEntries(const ResolvedTrainPacket &packet, + const std::shared_ptr &predMeta); // Helper method to handle new entry allocation bool handleNewEntryAllocation(const Addr &startPC, diff --git a/src/cpu/pred/btb/test/SConscript b/src/cpu/pred/btb/test/SConscript index d99632cb81..629c72cd9f 100644 --- a/src/cpu/pred/btb/test/SConscript +++ b/src/cpu/pred/btb/test/SConscript @@ -29,6 +29,13 @@ GTest('tage.test', '../timed_base_pred.cc', ) +GTest('microtage.test', + '../microtage.cc', + '../folded_hist.cc', + '../timed_base_pred.cc', + 'microtage.test.cc', +) + GTest('mgsc.test', '../btb_mgsc.cc', 'btb_mgsc.test.cc', @@ -53,6 +60,7 @@ GTest('folded_hist.test', env.Append(UNITTESTS=['uras.test', 'btb.test', 'tage.test', + 'microtage.test', 'mgsc.test', 'folded_hist.test', 'jump_ahead.test', diff --git a/src/cpu/pred/btb/test/microtage.test.cc b/src/cpu/pred/btb/test/microtage.test.cc new file mode 100644 index 0000000000..185aa8b723 --- /dev/null +++ 
b/src/cpu/pred/btb/test/microtage.test.cc @@ -0,0 +1,298 @@ +#include + +#include +#include + +#include "cpu/pred/btb/common.hh" +#include "cpu/pred/btb/microtage.hh" + +namespace gem5 +{ + +namespace branch_prediction +{ + +namespace btb_pred +{ + +namespace test +{ + +namespace +{ + +BTBEntry +createBTBEntry(Addr pc, int ctr = 0) +{ + BTBEntry entry; + entry.pc = pc; + entry.target = pc + 0x80; + entry.isCond = true; + entry.valid = true; + entry.alwaysTaken = false; + entry.ctr = ctr; + return entry; +} + +FetchTarget +createLegacyStream(Addr startPC, const BTBEntry &entry, bool taken, + const std::shared_ptr &meta) +{ + FetchTarget stream; + stream.startPC = startPC; + stream.resolved = true; + stream.exeTaken = taken; + stream.exeBranchInfo = entry; + stream.predBranchInfo = entry; + stream.updateBTBEntries = {entry}; + stream.updateIsOldEntry = true; + stream.predMetas[0] = meta; + return stream; +} + +ResolvedBranch +createResolvedBranch(const BTBEntry &entry, bool taken, bool mispredict) +{ + BranchInfo branch(entry); + branch.resolved = true; + branch.size = 4; + return ResolvedBranch(branch, taken, mispredict, 0); +} + +ResolvedTrainPacket +createResolvedTrainPacket(Addr startPC, const ResolvedBranch &resolved) +{ + ResolvedTrainPacket packet; + packet.startPC = startPC; + packet.realBranches.push_back(resolved); + return packet; +} + +ResolvedTrainPacket +createResolvedTrainPacket(Addr startPC, + std::vector resolvedBranches) +{ + ResolvedTrainPacket packet; + packet.startPC = startPC; + packet.realBranches = std::move(resolvedBranches); + return packet; +} + +FetchTarget +createResolvedTrainTarget(Addr startPC, const std::shared_ptr &meta) +{ + FetchTarget target; + target.startPC = startPC; + target.predMetas[0] = meta; + return target; +} + +void +applyPathHistoryTaken(boost::dynamic_bitset<> &history, Addr pc, Addr target) +{ + history <<= 2; + uint64_t hash = pathHash(pc, target); + for (std::size_t i = 0; i < pathHashLength && i < history.size(); 
++i) { + history[i] = history[i] ^ (hash & 1); + hash >>= 1; + } +} + +void +advanceActualHistory(MicroTAGE &tage, boost::dynamic_bitset<> &history, + const BTBEntry &entry, bool taken) +{ + tage.doUpdateHist(history, taken, entry.pc, entry.target); + if (taken) { + applyPathHistoryTaken(history, entry.pc, entry.target); + } +} + +bool +predictTaken(MicroTAGE &tage, Addr startPC, const BTBEntry &entry, + boost::dynamic_bitset<> &history, + std::vector &stagePreds) +{ + stagePreds[1].btbEntries = {entry}; + stagePreds[1].condTakens.clear(); + tage.putPCHistory(startPC, history, stagePreds); + Addr branchPC = entry.pc; + auto it = CondTakens_find(stagePreds[1].condTakens, branchPC); + EXPECT_NE(it, stagePreds[1].condTakens.end()); + return it != stagePreds[1].condTakens.end() && it->second; +} + +MicroTAGE::TagePrediction +probePrediction(MicroTAGE &tage, Addr startPC, const BTBEntry &entry, + boost::dynamic_bitset<> &history, + std::vector &stagePreds) +{ + stagePreds[1].btbEntries = {entry}; + stagePreds[1].condTakens.clear(); + tage.putPCHistory(startPC, history, stagePreds); + auto meta = std::static_pointer_cast( + tage.getPredictionMeta()); + auto it = meta->preds.find(entry.pc); + EXPECT_NE(it, meta->preds.end()); + return it != meta->preds.end() ? 
it->second : MicroTAGE::TagePrediction(); +} + +void +legacyTrain(MicroTAGE &tage, Addr startPC, const BTBEntry &entry, bool taken, + boost::dynamic_bitset<> &history, + std::vector &stagePreds) +{ + bool predicted_taken = predictTaken(tage, startPC, entry, history, stagePreds); + auto meta = tage.getPredictionMeta(); + FetchTarget stream = createLegacyStream(startPC, entry, taken, meta); + if (predicted_taken != taken) { + stream.squashType = SquashType::SQUASH_CTRL; + stream.squashPC = entry.pc; + } + tage.update(stream); + advanceActualHistory(tage, history, entry, taken); +} + +void +resolveTrain(MicroTAGE &tage, Addr startPC, const BTBEntry &entry, bool taken, + boost::dynamic_bitset<> &history, + std::vector &stagePreds) +{ + bool predicted_taken = predictTaken(tage, startPC, entry, history, stagePreds); + auto meta = tage.getPredictionMeta(); + auto packet = createResolvedTrainPacket( + startPC, createResolvedBranch(entry, taken, predicted_taken != taken)); + auto target = createResolvedTrainTarget(startPC, meta); + ASSERT_TRUE(tage.canResolveTrain(packet, target)); + tage.resolveTrain(packet, target); + advanceActualHistory(tage, history, entry, taken); +} + +size_t +countValidEntries(const MicroTAGE &tage) +{ + size_t count = 0; + for (const auto &table : tage.tageTable) { + for (const auto &set : table) { + for (const auto &way : set) { + count += way.valid ? 
1 : 0; + } + } + } + return count; +} + +size_t +countEntriesForPc(const MicroTAGE &tage, Addr pc) +{ + size_t count = 0; + for (const auto &table : tage.tageTable) { + for (const auto &set : table) { + for (const auto &way : set) { + count += way.valid && way.pc == pc; + } + } + } + return count; +} + +void +zeroStats(MicroTAGE &tage) +{ + std::memset(&tage.tageStats, 0, sizeof(MicroTAGE::TageStats)); +} + +} // namespace + +TEST(MicroTAGEResolveTrainTest, MatchesLegacyTrainingOnRepeatedConditionalPattern) +{ + MicroTAGE legacy; + MicroTAGE resolved; + zeroStats(legacy); + zeroStats(resolved); + Addr startPC = 0x1000; + Addr branchPC = startPC + 0x10; + BTBEntry entry = createBTBEntry(branchPC, 0); + std::vector pattern = {true, true, false, true, false, false}; + + boost::dynamic_bitset<> legacyHistory(256); + boost::dynamic_bitset<> resolvedHistory(256); + std::vector legacyStagePreds(2); + std::vector resolvedStagePreds(2); + + for (int iter = 0; iter < 12; ++iter) { + for (bool taken : pattern) { + legacyTrain(legacy, startPC, entry, taken, legacyHistory, + legacyStagePreds); + resolveTrain(resolved, startPC, entry, taken, resolvedHistory, + resolvedStagePreds); + } + } + + size_t legacyOccupancy = countValidEntries(legacy); + size_t resolvedOccupancy = countValidEntries(resolved); + EXPECT_GT(legacyOccupancy, 0U) + << "legacy training should allocate at least one MicroTAGE entry"; + EXPECT_EQ(legacyOccupancy, resolvedOccupancy); + + auto legacyProbe = probePrediction(legacy, startPC, entry, legacyHistory, + legacyStagePreds); + auto resolvedProbe = probePrediction(resolved, startPC, entry, resolvedHistory, + resolvedStagePreds); + EXPECT_EQ(legacyProbe.mainprovided, resolvedProbe.mainprovided); + EXPECT_EQ(legacyProbe.mainInfo.found, resolvedProbe.mainInfo.found); + EXPECT_EQ(legacyProbe.mainInfo.table, resolvedProbe.mainInfo.table); + EXPECT_EQ(legacyProbe.taken, resolvedProbe.taken); +} + +TEST(MicroTAGEResolveTrainTest, + 
IgnoresUnpredictedConditionalBranchesOutsideLegacyTrainingSet) +{ + MicroTAGE legacy; + MicroTAGE resolved; + zeroStats(legacy); + zeroStats(resolved); + + const Addr startPC = 0x2000; + const BTBEntry predicted = createBTBEntry(startPC + 0x4, -1); + const BTBEntry missing = createBTBEntry(startPC + 0x10, -1); + + boost::dynamic_bitset<> legacyHistory(256); + boost::dynamic_bitset<> resolvedHistory(256); + std::vector legacyStagePreds(2); + std::vector resolvedStagePreds(2); + + legacyStagePreds[1].btbEntries = {predicted}; + legacy.putPCHistory(startPC, legacyHistory, legacyStagePreds); + auto legacyMeta = legacy.getPredictionMeta(); + + FetchTarget legacyStream = createLegacyStream(startPC, predicted, false, + legacyMeta); + legacy.update(legacyStream); + + resolvedStagePreds[1].btbEntries = {predicted}; + resolved.putPCHistory(startPC, resolvedHistory, resolvedStagePreds); + auto resolvedMeta = resolved.getPredictionMeta(); + + auto packet = createResolvedTrainPacket( + startPC, + {createResolvedBranch(predicted, false, false), + createResolvedBranch(missing, true, true)}); + auto target = createResolvedTrainTarget(startPC, resolvedMeta); + ASSERT_TRUE(resolved.canResolveTrain(packet, target)); + resolved.resolveTrain(packet, target); + + EXPECT_EQ(countEntriesForPc(legacy, missing.pc), 0U) + << "legacy update should not synthesize a MicroTAGE entry for an " + "unpredicted branch in this scenario"; + EXPECT_EQ(countEntriesForPc(resolved, missing.pc), + countEntriesForPc(legacy, missing.pc)); +} + +} // namespace test + +} // namespace btb_pred + +} // namespace branch_prediction + +} // namespace gem5