From 797e2e7cbc507052c9460c14a969450fff258cd7 Mon Sep 17 00:00:00 2001 From: Yan Yue <1131531947@qq.com> Date: Mon, 18 May 2026 16:49:39 +0800 Subject: [PATCH 1/6] cpu: Add FDIP opportunity statistics Change-Id: Idec3aab68bdefcc9256801ba47c8038e618e030e --- src/cpu/pred/btb/decoupled_bpred.cc | 83 ++++++++++++++++++++++- src/cpu/pred/btb/decoupled_bpred.hh | 21 ++++++ src/cpu/pred/btb/decoupled_bpred_stats.cc | 25 +++++++ 3 files changed, 128 insertions(+), 1 deletion(-) diff --git a/src/cpu/pred/btb/decoupled_bpred.cc b/src/cpu/pred/btb/decoupled_bpred.cc index a1fee43d87..fc668212f1 100644 --- a/src/cpu/pred/btb/decoupled_bpred.cc +++ b/src/cpu/pred/btb/decoupled_bpred.cc @@ -24,7 +24,9 @@ namespace btb_pred void DecoupledBPUWithBTB::consumeFetchTarget(unsigned fetched_inst_num, ThreadID tid) { - ftq.fetching(tid).fetchInstNum = fetched_inst_num; + auto &target = ftq.fetching(tid); + target.fetchInstNum = fetched_inst_num; + recordFdipFetchedTarget(target); ftq.finishTarget(tid); } @@ -115,6 +117,77 @@ DecoupledBPUWithBTB::DecoupledBPUWithBTB(const DecoupledBPUWithBTBParams &p) }); } +unsigned +DecoupledBPUWithBTB::fdipCandidateCacheBlocks(const FetchTarget &target) const +{ + const Addr blk_size = cpu ? 
cpu->cacheLineSize() : 64; + Addr end_pc = target.predEndPC; + if (end_pc <= target.startPC) { + end_pc = target.startPC + 1; + } + + const Addr start_blk = target.startPC & ~(blk_size - 1); + const Addr end_blk = (end_pc - 1) & ~(blk_size - 1); + return (end_blk - start_blk) / blk_size + 1; +} + +uint64_t +DecoupledBPUWithBTB::fdipTargetAgeCycles(const FetchTarget &target) const +{ + if (!cpu || curTick() < target.predTick) { + return 0; + } + return static_cast(cpu->ticksToCycles(curTick() - target.predTick)); +} + +void +DecoupledBPUWithBTB::recordFdipCandidateTarget(const FetchTarget &target) +{ + const unsigned blocks = fdipCandidateCacheBlocks(target); + dbpBtbStats.fdipCandidateTargets++; + dbpBtbStats.fdipCandidateCacheBlocks.sample(blocks, 1); + dbpBtbStats.fdipCandidateCacheBlocksTotal += blocks; +} + +void +DecoupledBPUWithBTB::recordFdipFetchedTarget(const FetchTarget &target) +{ + dbpBtbStats.fdipTargetFetched++; + dbpBtbStats.fdipTargetFetchLatency.sample(fdipTargetAgeCycles(target), 1); +} + +void +DecoupledBPUWithBTB::recordFdipCommittedTarget(const FetchTarget &target) +{ + dbpBtbStats.fdipTargetCommitted++; + dbpBtbStats.fdipTargetCommitLatency.sample(fdipTargetAgeCycles(target), 1); +} + +void +DecoupledBPUWithBTB::recordFdipSquashedTargets(ThreadID tid, + FetchTargetId firstTargetId, + FetchTargetId lastTargetId) +{ + if (lastTargetId < firstTargetId) { + return; + } + + unsigned squashed = 0; + for (auto id = firstTargetId; id <= lastTargetId; ++id) { + if (!ftq.hasTarget(id, tid)) { + continue; + } + const auto &target = ftq.get(id, tid); + dbpBtbStats.fdipTargetsSquashed++; + dbpBtbStats.fdipTargetSquashLatency.sample(fdipTargetAgeCycles(target), 1); + squashed++; + } + + if (squashed > 0) { + dbpBtbStats.fdipSquashBatchSize.sample(squashed, 1); + } +} + void DecoupledBPUWithBTB::tick() @@ -380,6 +453,8 @@ DecoupledBPUWithBTB::processNewPrediction(ThreadID tid) predTraceManager->write_record(PredictionTrace(ftq.backId(tid), entry)); } + 
recordFdipCandidateTarget(entry); + // 5. Add entry to fetch target queue ftq.insert(entry); threads[tid].validprediction = false; @@ -449,6 +524,10 @@ DecoupledBPUWithBTB::handleSquash(ThreadID tid, unsigned target_id, dumpFsq("Before control squash"); } + if (ftq.backId(tid) > target_id) { + recordFdipSquashedTargets(tid, target_id + 1, ftq.backId(tid)); + } + // Remove targets after the squashed one ftq.squashAfter(target_id, tid); @@ -562,6 +641,8 @@ DecoupledBPUWithBTB::commit(unsigned target_id, ThreadID tid) target.startPC, target.exeBranchInfo.pc, target.exeBranchInfo.target, target.predBranchInfo.pc, target.predBranchInfo.target); + recordFdipCommittedTarget(target); + // Update statistics updateStatistics(target); diff --git a/src/cpu/pred/btb/decoupled_bpred.hh b/src/cpu/pred/btb/decoupled_bpred.hh index 288450001f..8628e2fc4e 100644 --- a/src/cpu/pred/btb/decoupled_bpred.hh +++ b/src/cpu/pred/btb/decoupled_bpred.hh @@ -187,6 +187,14 @@ class DecoupledBPUWithBTB : public BPredUnit */ void generateFinalPredAndCreateBubbles(ThreadID tid); + unsigned fdipCandidateCacheBlocks(const FetchTarget &target) const; + uint64_t fdipTargetAgeCycles(const FetchTarget &target) const; + void recordFdipCandidateTarget(const FetchTarget &target); + void recordFdipFetchedTarget(const FetchTarget &target); + void recordFdipCommittedTarget(const FetchTarget &target); + void recordFdipSquashedTargets(ThreadID tid, FetchTargetId firstTargetId, + FetchTargetId lastTargetId); + void clearPreds(ThreadID tid) { for (auto &stagePred : threads[tid].predsOfEachStage) { stagePred.condTakens.clear(); @@ -287,6 +295,19 @@ class DecoupledBPUWithBTB : public BPredUnit statistics::Distribution commitFsqEntryFetchedInsts; statistics::Scalar commitFsqEntryOnlyHasOneJump; + // FDIP opportunity statistics. These do not issue prefetches; they + // measure whether predicted FSQ targets live long enough to be useful. 
+ statistics::Scalar fdipCandidateTargets; + statistics::Distribution fdipCandidateCacheBlocks; + statistics::Scalar fdipCandidateCacheBlocksTotal; + statistics::Scalar fdipTargetFetched; + statistics::Distribution fdipTargetFetchLatency; + statistics::Scalar fdipTargetCommitted; + statistics::Distribution fdipTargetCommitLatency; + statistics::Scalar fdipTargetsSquashed; + statistics::Distribution fdipTargetSquashLatency; + statistics::Distribution fdipSquashBatchSize; + statistics::Scalar btbHit; statistics::Scalar btbMiss; statistics::Scalar btbEntriesWithDifferentStart; diff --git a/src/cpu/pred/btb/decoupled_bpred_stats.cc b/src/cpu/pred/btb/decoupled_bpred_stats.cc index a64d943428..2c8255c010 100644 --- a/src/cpu/pred/btb/decoupled_bpred_stats.cc +++ b/src/cpu/pred/btb/decoupled_bpred_stats.cc @@ -451,6 +451,26 @@ DecoupledBPUWithBTB::DBPBTBStats::DBPBTBStats( ADD_STAT(commitFsqEntryHasInsts, statistics::units::Count::get(), "number of insts that commit fsq entries have"), ADD_STAT(commitFsqEntryFetchedInsts, statistics::units::Count::get(), "number of insts that commit fsq entries fetched"), ADD_STAT(commitFsqEntryOnlyHasOneJump, statistics::units::Count::get(), "number of fsq entries with only one instruction (jump)"), + ADD_STAT(fdipCandidateTargets, statistics::units::Count::get(), + "number of FSQ entries that could seed FDIP"), + ADD_STAT(fdipCandidateCacheBlocks, statistics::units::Count::get(), + "cache blocks covered by each FDIP candidate FSQ entry"), + ADD_STAT(fdipCandidateCacheBlocksTotal, statistics::units::Count::get(), + "total cache blocks covered by FDIP candidate FSQ entries"), + ADD_STAT(fdipTargetFetched, statistics::units::Count::get(), + "number of FDIP candidate FSQ entries consumed by fetch"), + ADD_STAT(fdipTargetFetchLatency, statistics::units::Cycle::get(), + "cycles from prediction to fetch consuming the FSQ entry"), + ADD_STAT(fdipTargetCommitted, statistics::units::Count::get(), + "number of FDIP candidate FSQ entries 
committed"), + ADD_STAT(fdipTargetCommitLatency, statistics::units::Cycle::get(), + "cycles from prediction to committing the FSQ entry"), + ADD_STAT(fdipTargetsSquashed, statistics::units::Count::get(), + "number of younger FDIP candidate FSQ entries removed by squash"), + ADD_STAT(fdipTargetSquashLatency, statistics::units::Cycle::get(), + "cycles from prediction to squash removal for FDIP candidates"), + ADD_STAT(fdipSquashBatchSize, statistics::units::Count::get(), + "number of younger FSQ entries removed by each squash"), ADD_STAT(btbHit, statistics::units::Count::get(), "btb hits (in predict block)"), ADD_STAT(btbMiss, statistics::units::Count::get(), "btb misses (in predict block)"), ADD_STAT(btbEntriesWithDifferentStart, statistics::units::Count::get(), "number of btb entries with different start PC"), @@ -474,6 +494,11 @@ DecoupledBPUWithBTB::DBPBTBStats::DBPBTBStats( fsqEntryDist.init(0, fsqSize, 20).flags(statistics::total); commitFsqEntryHasInsts.init(0, maxInstsNum >> 1, 1); commitFsqEntryFetchedInsts.init(0, maxInstsNum >> 1, 1); + fdipCandidateCacheBlocks.init(0, 8, 1); + fdipTargetFetchLatency.init(0, 4096, 16); + fdipTargetCommitLatency.init(0, 4096, 16); + fdipTargetSquashLatency.init(0, 4096, 16); + fdipSquashBatchSize.init(0, fsqSize, 1); branchClassCounts.init(NumBranchClasses); branchClassMisses.init(NumBranchClasses); controlSquashByClass.init(NumBranchClasses); From a7624bfc779daa8fb089e6b875afbc620c321dc2 Mon Sep 17 00:00:00 2001 From: Yan Yue <1131531947@qq.com> Date: Tue, 19 May 2026 16:11:40 +0800 Subject: [PATCH 2/6] cpu: Add off-by-default FDIP prototype Change-Id: Id5d5d004a72d7a5a8cec6ac877a5de5e89a78835 --- src/cpu/o3/BaseO3CPU.py | 16 ++ src/cpu/o3/fetch.cc | 304 +++++++++++++++++++++++++++- src/cpu/o3/fetch.hh | 74 +++++++ src/cpu/pred/btb/decoupled_bpred.hh | 3 + src/mem/cache/cache.cc | 3 + 5 files changed, 398 insertions(+), 2 deletions(-) diff --git a/src/cpu/o3/BaseO3CPU.py b/src/cpu/o3/BaseO3CPU.py index 
f6f46d85b8..3b3dac27eb 100644 --- a/src/cpu/o3/BaseO3CPU.py +++ b/src/cpu/o3/BaseO3CPU.py @@ -255,6 +255,22 @@ def support_take_over(cls): store_prefetch_train = Param.Bool(True, "Training store prefetcher with store addresses") + fdip = Param.Bool(False, "Enable fetch-directed instruction prefetch") + fdipLookaheadTargets = Param.Unsigned( + 4, "Number of future FSQ targets to scan for FDIP") + fdipMaxPrefetchesPerCycle = Param.Unsigned( + 2, "Maximum FDIP prefetch requests generated per cycle") + fdipMaxBlocksPerTarget = Param.Unsigned( + 1, "Maximum cache blocks to prefetch for each FDIP target") + fdipMinTargetDistance = Param.Unsigned( + 1, "Minimum distance from the current fetch target to prefetch") + fdipMinTargetAgeCycles = Param.Unsigned( + 0, "Minimum predicted-target age before FDIP prefetches it") + fdipSkipTargetStartBlock = Param.Bool( + False, "Skip the first cache block of each FDIP target") + fdipMaxPendingTranslations = Param.Unsigned( + 32, "Maximum FDIP translations in flight") + # value predictor valuePred = Param.ValuePredictor(NULL, "valuepred unit") enableSelectiveVPFlush = Param.Bool(False, diff --git a/src/cpu/o3/fetch.cc b/src/cpu/o3/fetch.cc index 21c9cec4e6..f31e2a5fb0 100644 --- a/src/cpu/o3/fetch.cc +++ b/src/cpu/o3/fetch.cc @@ -100,6 +100,17 @@ Fetch::Fetch(CPU *_cpu, const BaseO3CPUParams ¶ms) retryPkt(), retryTid(InvalidThreadID), cacheBlkSize(cpu->cacheLineSize()), + fdip(params.fdip), + fdipLookaheadTargets(params.fdipLookaheadTargets), + fdipMaxPrefetchesPerCycle(params.fdipMaxPrefetchesPerCycle), + fdipMaxBlocksPerTarget(params.fdipMaxBlocksPerTarget), + fdipMinTargetDistance(params.fdipMinTargetDistance), + fdipMinTargetAgeCycles(params.fdipMinTargetAgeCycles), + fdipSkipTargetStartBlock(params.fdipSkipTargetStartBlock), + fdipMaxPendingTranslations(params.fdipMaxPendingTranslations), + fdipPendingTranslations(0), + fdipPendingPrefetches(0), + fdipGeneration(0), fetchBufferSize(params.fetchBufferSize), 
fetchQueueSize(params.fetchQueueSize), numThreads(params.numThreads), @@ -277,7 +288,33 @@ Fetch::FetchStatGroup::FetchStatGroup(CPU *cpu, Fetch *fetch) ADD_STAT(traceMetaCleanupSquashEntries, statistics::units::Count::get(), "Total entries erased by squash/rollback cleanups"), ADD_STAT(traceMetaCleanupCommitCalls, statistics::units::Count::get(), - "Number of times cleanup was called on successful commit") + "Number of times cleanup was called on successful commit"), + ADD_STAT(fdipTargetsIdentified, statistics::units::Count::get(), + "Number of future FSQ targets considered by FDIP"), + ADD_STAT(fdipTargetsAlreadyIssued, statistics::units::Count::get(), + "Number of future FSQ targets skipped because FDIP already issued them"), + ADD_STAT(fdipBlocksIdentified, statistics::units::Count::get(), + "Number of cache blocks identified by FDIP"), + ADD_STAT(fdipTranslationsStarted, statistics::units::Count::get(), + "Number of FDIP translations started"), + ADD_STAT(fdipTranslationThrottled, statistics::units::Count::get(), + "Number of FDIP translations blocked by the pending translation limit"), + ADD_STAT(fdipTranslationFaults, statistics::units::Count::get(), + "Number of FDIP translations that faulted"), + ADD_STAT(fdipPrefetchesIssued, statistics::units::Count::get(), + "Number of FDIP prefetch packets issued to the I-cache"), + ADD_STAT(fdipPrefetchesDropped, statistics::units::Count::get(), + "Number of queued FDIP prefetch packets dropped before reaching the I-cache"), + ADD_STAT(fdipPrefetchRetriesQueued, statistics::units::Count::get(), + "Number of FDIP prefetch packets queued for I-cache retry"), + ADD_STAT(fdipPrefetchRetriesSent, statistics::units::Count::get(), + "Number of FDIP prefetch retry packets accepted by the I-cache"), + ADD_STAT(fdipPrefetchResponses, statistics::units::Count::get(), + "Number of FDIP prefetch responses received"), + ADD_STAT(fdipStaleTranslations, statistics::units::Count::get(), + "Number of FDIP translations discarded after 
squash or reset"), + ADD_STAT(fdipStalePrefetchResponses, statistics::units::Count::get(), + "Number of FDIP prefetch responses discarded after squash or reset") { icacheStallCycles .prereq(icacheStallCycles); @@ -315,6 +352,32 @@ Fetch::FetchStatGroup::FetchStatGroup(CPU *cpu, Fetch *fetch) .prereq(icacheSquashes); tlbSquashes .prereq(tlbSquashes); + fdipTargetsIdentified + .prereq(fdipTargetsIdentified); + fdipTargetsAlreadyIssued + .prereq(fdipTargetsAlreadyIssued); + fdipBlocksIdentified + .prereq(fdipBlocksIdentified); + fdipTranslationsStarted + .prereq(fdipTranslationsStarted); + fdipTranslationThrottled + .prereq(fdipTranslationThrottled); + fdipTranslationFaults + .prereq(fdipTranslationFaults); + fdipPrefetchesIssued + .prereq(fdipPrefetchesIssued); + fdipPrefetchesDropped + .prereq(fdipPrefetchesDropped); + fdipPrefetchRetriesQueued + .prereq(fdipPrefetchRetriesQueued); + fdipPrefetchRetriesSent + .prereq(fdipPrefetchRetriesSent); + fdipPrefetchResponses + .prereq(fdipPrefetchResponses); + fdipStaleTranslations + .prereq(fdipStaleTranslations); + fdipStalePrefetchResponses + .prereq(fdipStalePrefetchResponses); nisnDist .init(/* base value */ 0, /* last value */ fetch->fetchWidth, @@ -424,6 +487,8 @@ Fetch::resetStage() numInst = 0; interruptPending = false; cacheBlocked = false; + ++fdipGeneration; + discardFdipRetryPackets(); priorityList.clear(); @@ -438,6 +503,7 @@ Fetch::resetStage() threads[tid].reset(); ftqEntryFetchedInsts[tid] = 0; + fdipIssuedTargets[tid].clear(); fetchQueue[tid].clear(); @@ -523,6 +589,220 @@ Fetch::handleMultiCacheLineFetch(Addr vaddr, ThreadID tid, Addr pc) return true; } +void +Fetch::issueFdipPrefetches(ThreadID tid) +{ + if (!fdip || isTraceMode() || cacheBlocked || !fdipRetryPkt.empty() || + fetchStatus[tid] != Running || !dbpbtb->ftqHasFetching(tid)) { + return; + } + + const CacheRequestStatus cache_status = + threads[tid].cacheReq.getOverallStatus(); + if (cache_status == TlbWait || cache_status == CacheWaitRetry || + 
cache_status == CacheWaitResponse) { + return; + } + + auto &issued_targets = fdipIssuedTargets[tid]; + if (issued_targets.size() > 4096) { + issued_targets.clear(); + } + + const auto fetch_id = dbpbtb->ftqHeadId(tid); + const auto back_id = dbpbtb->ftqBackId(tid); + if (back_id <= fetch_id || fdipMaxPrefetchesPerCycle == 0 || + fdipMaxBlocksPerTarget == 0) { + return; + } + + unsigned issued_this_cycle = 0; + const auto first_id = fetch_id + fdipMinTargetDistance; + const auto last_id = std::min(back_id, fetch_id + fdipLookaheadTargets); + + for (auto target_id = first_id; target_id <= last_id; ++target_id) { + if (!dbpbtb->ftqHasTarget(target_id, tid)) { + continue; + } + + if (issued_targets.count(target_id)) { + ++fetchStats.fdipTargetsAlreadyIssued; + continue; + } + + const auto &target = dbpbtb->ftqTarget(target_id, tid); + if (fdipMinTargetAgeCycles > 0) { + const uint64_t target_age = curTick() < target.predTick ? 0 : + static_cast( + cpu->ticksToCycles(curTick() - target.predTick)); + if (target_age < fdipMinTargetAgeCycles) { + continue; + } + } + + Addr end_pc = target.predEndPC; + if (end_pc <= target.startPC) { + end_pc = target.startPC + 1; + } + + Addr block = target.startPC & ~(Addr(cacheBlkSize) - 1); + const Addr end_block = (end_pc - 1) & ~(Addr(cacheBlkSize) - 1); + unsigned blocks_for_target = 0; + + ++fetchStats.fdipTargetsIdentified; + bool started_for_target = false; + if (fdipSkipTargetStartBlock) { + block += cacheBlkSize; + } + + while (block <= end_block && + blocks_for_target < fdipMaxBlocksPerTarget && + issued_this_cycle < fdipMaxPrefetchesPerCycle) { + ++fetchStats.fdipBlocksIdentified; + if (!startFdipTranslation(tid, block, target.startPC)) { + break; + } + issued_this_cycle++; + blocks_for_target++; + started_for_target = true; + block += cacheBlkSize; + } + + if (started_for_target) { + issued_targets.insert(target_id); + } else if (fdipSkipTargetStartBlock) { + issued_targets.insert(target_id); + } + + if (issued_this_cycle >= 
fdipMaxPrefetchesPerCycle) { + break; + } + } +} + +bool +Fetch::startFdipTranslation(ThreadID tid, Addr vaddr, Addr pc) +{ + if (fdipPendingTranslations >= fdipMaxPendingTranslations) { + ++fetchStats.fdipTranslationThrottled; + return false; + } + + RequestPtr req = std::make_shared( + vaddr, cacheBlkSize, Request::INST_FETCH | Request::PREFETCH, + cpu->instRequestorId(), pc, cpu->thread[tid]->contextId()); + req->taskId(context_switch_task_id::Prefetcher); + + ++fdipPendingTranslations; + ++fetchStats.fdipTranslationsStarted; + + auto *translation = new FdipTranslation(this, fdipGeneration); + cpu->mmu->translateTiming(req, cpu->thread[tid]->getTC(), translation, + BaseMMU::Execute); + return true; +} + +void +Fetch::finishFdipTranslation(const Fault &fault, const RequestPtr &mem_req, + uint64_t generation) +{ + assert(fdipPendingTranslations > 0); + --fdipPendingTranslations; + + if (generation != fdipGeneration || cpu->switchedOut()) { + ++fetchStats.fdipStaleTranslations; + return; + } + + if (fault != NoFault) { + ++fetchStats.fdipTranslationFaults; + return; + } + + if (!cpu->system->isMemAddr(mem_req->getPaddr())) { + ++fetchStats.fdipPrefetchesDropped; + return; + } + + PacketPtr pkt = new Packet(mem_req, MemCmd::SoftPFReq); + pkt->allocate(); + pkt->setSendRightAway(); + fdipPacketGenerations[mem_req] = generation; + + if (cacheBlocked || !fdipRetryPkt.empty() || !icachePort.sendTimingReq(pkt)) { + ++fetchStats.fdipPrefetchRetriesQueued; + fdipRetryPkt.push_back(pkt); + return; + } + + ++fetchStats.fdipPrefetchesIssued; + ++fdipPendingPrefetches; +} + +void +Fetch::completeFdipPrefetch(PacketPtr pkt) +{ + auto it = fdipPacketGenerations.find(pkt->req); + const bool stale = it == fdipPacketGenerations.end() || + it->second != fdipGeneration; + if (it != fdipPacketGenerations.end()) { + fdipPacketGenerations.erase(it); + } + + assert(fdipPendingPrefetches > 0); + --fdipPendingPrefetches; + + if (stale) { + ++fetchStats.fdipStalePrefetchResponses; + } else 
{ + ++fetchStats.fdipPrefetchResponses; + } + delete pkt; +} + +void +Fetch::retryFdipPrefetches() +{ + if (!fdip || cacheBlocked || fdipRetryPkt.empty()) { + return; + } + + for (auto it = fdipRetryPkt.begin(); it != fdipRetryPkt.end();) { + PacketPtr pkt = *it; + auto gen_it = fdipPacketGenerations.find(pkt->req); + if (gen_it == fdipPacketGenerations.end() || + gen_it->second != fdipGeneration) { + if (gen_it != fdipPacketGenerations.end()) { + fdipPacketGenerations.erase(gen_it); + } + ++fetchStats.fdipStaleTranslations; + delete pkt; + it = fdipRetryPkt.erase(it); + continue; + } + + if (!icachePort.sendTimingReq(pkt)) { + break; + } + + ++fetchStats.fdipPrefetchesIssued; + ++fetchStats.fdipPrefetchRetriesSent; + ++fdipPendingPrefetches; + it = fdipRetryPkt.erase(it); + } +} + +void +Fetch::discardFdipRetryPackets() +{ + for (PacketPtr pkt : fdipRetryPkt) { + fdipPacketGenerations.erase(pkt->req); + ++fetchStats.fdipPrefetchesDropped; + delete pkt; + } + fdipRetryPkt.clear(); +} + bool Fetch::processMultiCacheLineCompletion(ThreadID tid, PacketPtr pkt) { @@ -650,9 +930,12 @@ Fetch::drainSanityCheck() const { assert(isDrained()); assert(retryPkt.size() == 0); + assert(fdipRetryPkt.size() == 0); assert(retryTid == InvalidThreadID); assert(!cacheBlocked); assert(!interruptPending); + assert(fdipPendingTranslations == 0); + assert(fdipPendingPrefetches == 0); for (ThreadID i = 0; i < numThreads; ++i) { assert(threads[i].cacheReq.packets.empty()); @@ -686,7 +969,10 @@ Fetch::isDrained() const * cycle if the finish translation event is scheduled, so make * sure that's not the case. */ - return !finishTranslationEvent.scheduled(); + return !finishTranslationEvent.scheduled() && + fdipPendingTranslations == 0 && + fdipPendingPrefetches == 0 && + fdipRetryPkt.empty(); } void @@ -1105,6 +1391,9 @@ Fetch::doSquash(PCStateBase &new_pc, const DynInstPtr squashInst, const InstSeqN // Force a new I-cache request for the next FTQ head after squash. 
threads[tid].valid = false; ftqEntryFetchedInsts[tid] = 0; + ++fdipGeneration; + discardFdipRetryPackets(); + fdipIssuedTargets[tid].clear(); if (traceFetch) { traceFetch->handleTraceSquash(tid, new_pc, squashInst, seqNum); @@ -1207,6 +1496,10 @@ Fetch::tick() // Perform fetch operations and instruction delivery fetchAndProcessInstructions(status_change); + + for (auto tid : *activeThreads) { + issueFdipPrefetches(tid); + } } bool @@ -2019,6 +2312,7 @@ Fetch::recvReqRetry() // Access has been squashed since it was sent out. Just clear // the cache being blocked. cacheBlocked = false; + retryFdipPrefetches(); return; } assert(cacheBlocked); @@ -2043,6 +2337,7 @@ Fetch::recvReqRetry() if (retryPkt.size() == 0) { retryTid = InvalidThreadID; cacheBlocked = false; + retryFdipPrefetches(); } } @@ -2108,6 +2403,11 @@ Fetch::IcachePort::recvTimingResp(PacketPtr pkt) DPRINTF(Fetch, "received pkt addr=%#lx, req addr=%#lx\n", pkt->getAddr(), pkt->req->getVaddr()); + if (pkt->cmd == MemCmd::SoftPFResp || pkt->cmd == MemCmd::HardPFResp) { + fetch->completeFdipPrefetch(pkt); + return true; + } + fetch->processCacheCompletion(pkt); return true; diff --git a/src/cpu/o3/fetch.hh b/src/cpu/o3/fetch.hh index 19091ef30e..72977f89ee 100644 --- a/src/cpu/o3/fetch.hh +++ b/src/cpu/o3/fetch.hh @@ -44,6 +44,8 @@ #include #include #include +#include +#include #include #include "arch/generic/decoder.hh" @@ -132,6 +134,29 @@ class Fetch } }; + class FdipTranslation : public BaseMMU::Translation + { + protected: + Fetch *fetch; + uint64_t generation; + + public: + FdipTranslation(Fetch *_fetch, uint64_t _generation) + : fetch(_fetch), generation(_generation) + {} + + void markDelayed() {} + + void + finish(const Fault &fault, const RequestPtr &req, + gem5::ThreadContext *tc, BaseMMU::Mode mode) + { + assert(mode == BaseMMU::Execute); + fetch->finishFdipTranslation(fault, req, generation); + delete this; + } + }; + private: /* Event to delay delivery of a fetch translation result in case of * a 
fault and the nop to carry the fault cannot be generated @@ -421,6 +446,25 @@ class Fetch */ bool processMultiCacheLineCompletion(ThreadID tid, PacketPtr pkt); + /** Try to issue fetch-directed instruction prefetches from queued FSQ targets. */ + void issueFdipPrefetches(ThreadID tid); + + /** Start an address translation for a single FDIP cache block. */ + bool startFdipTranslation(ThreadID tid, Addr vaddr, Addr pc); + + /** Finish an FDIP translation and send the prefetch if it succeeded. */ + void finishFdipTranslation(const Fault &fault, const RequestPtr &mem_req, + uint64_t generation); + + /** Complete and discard an FDIP response packet. */ + void completeFdipPrefetch(PacketPtr pkt); + + /** Try to send queued FDIP prefetches after the I-cache port retries. */ + void retryFdipPrefetches(); + + /** Drop queued FDIP retry packets that have not reached the cache. */ + void discardFdipRetryPackets(); + /** Check if an interrupt is pending and that we need to handle */ @@ -666,6 +710,22 @@ class Fetch /** Cache block size. */ unsigned int cacheBlkSize; + const bool fdip; + const unsigned fdipLookaheadTargets; + const unsigned fdipMaxPrefetchesPerCycle; + const unsigned fdipMaxBlocksPerTarget; + const unsigned fdipMinTargetDistance; + const unsigned fdipMinTargetAgeCycles; + const bool fdipSkipTargetStartBlock; + const unsigned fdipMaxPendingTranslations; + unsigned fdipPendingTranslations; + unsigned fdipPendingPrefetches; + uint64_t fdipGeneration; + std::vector fdipRetryPkt; + std::unordered_map fdipPacketGenerations; + std::unordered_set + fdipIssuedTargets[MaxThreads]; + // Constants for misaligned fetch handling static constexpr unsigned CACHE_LINE_SIZE_BYTES = 64; @@ -1105,6 +1165,20 @@ class Fetch statistics::Scalar traceMetaCleanupSquashEntries; /** Number of times cleanup was called on successful commit. 
*/ statistics::Scalar traceMetaCleanupCommitCalls; + + statistics::Scalar fdipTargetsIdentified; + statistics::Scalar fdipTargetsAlreadyIssued; + statistics::Scalar fdipBlocksIdentified; + statistics::Scalar fdipTranslationsStarted; + statistics::Scalar fdipTranslationThrottled; + statistics::Scalar fdipTranslationFaults; + statistics::Scalar fdipPrefetchesIssued; + statistics::Scalar fdipPrefetchesDropped; + statistics::Scalar fdipPrefetchRetriesQueued; + statistics::Scalar fdipPrefetchRetriesSent; + statistics::Scalar fdipPrefetchResponses; + statistics::Scalar fdipStaleTranslations; + statistics::Scalar fdipStalePrefetchResponses; } fetchStats; SquashVersion localSquashVer; diff --git a/src/cpu/pred/btb/decoupled_bpred.hh b/src/cpu/pred/btb/decoupled_bpred.hh index 8628e2fc4e..3096abe46d 100644 --- a/src/cpu/pred/btb/decoupled_bpred.hh +++ b/src/cpu/pred/btb/decoupled_bpred.hh @@ -424,6 +424,9 @@ class DecoupledBPUWithBTB : public BPredUnit // Fetch-facing interface: consume FSQ head directly (RTL-like single queue). 
bool ftqHasFetching(ThreadID tid) const { return ftq.hasTarget(ftq.fetchId(tid), tid); } FetchTargetId ftqHeadId(ThreadID tid) const { assert(ftqHasFetching(tid)); return ftq.fetchId(tid); } + FetchTargetId ftqBackId(ThreadID tid) const { assert(ftqHasFetching(tid)); return ftq.backId(tid); } + bool ftqHasTarget(FetchTargetId target_id, ThreadID tid) const { return ftq.hasTarget(target_id, tid); } + const FetchTarget &ftqTarget(FetchTargetId target_id, ThreadID tid) { return ftq.get(target_id, tid); } const FetchTarget &ftqFetchingTarget(ThreadID tid) { assert(ftqHasFetching(tid)); return ftq.fetching(tid); } void dumpFsq(const char *when); diff --git a/src/mem/cache/cache.cc b/src/mem/cache/cache.cc index f59d0731df..acc9827335 100644 --- a/src/mem/cache/cache.cc +++ b/src/mem/cache/cache.cc @@ -404,6 +404,9 @@ Cache::handleTimingReqMiss(PacketPtr pkt, CacheBlk *blk, Tick forward_time, // If an outstanding request is in progress (we found an // MSHR) this is set to null pkt = pf; + if (pkt == nullptr) { + return; + } } WriteQueueEntry *wb_entry = writeBuffer.findMatch(pkt->getAddr(), From 58c7bb780c3fed076908c97c29bbe54636a3db96 Mon Sep 17 00:00:00 2001 From: Yan Yue <1131531947@qq.com> Date: Tue, 19 May 2026 16:56:39 +0800 Subject: [PATCH 3/6] mem-cache: Add cache-side fetch directed prefetcher Change-Id: If14984c470393aa6cb22e89f2c1572401232183d --- configs/common/CacheConfig.py | 3 + configs/common/PrefetcherConfig.py | 7 +- ...decoupled-fe-fdip-comparison-2026-05-18.md | 205 +++++++++ src/cpu/o3/cpu.cc | 6 + src/cpu/o3/cpu.hh | 5 + src/cpu/pred/btb/decoupled_bpred.cc | 40 +- src/cpu/pred/btb/decoupled_bpred.hh | 5 + src/cpu/pred/btb/fdip_target.hh | 53 +++ src/mem/cache/prefetch/Prefetcher.py | 30 ++ src/mem/cache/prefetch/SConscript | 4 +- src/mem/cache/prefetch/fdp.cc | 392 ++++++++++++++++++ src/mem/cache/prefetch/fdp.hh | 179 ++++++++ 12 files changed, 926 insertions(+), 3 deletions(-) create mode 100644 
docs/exec-plans/active/upstream-decoupled-fe-fdip-comparison-2026-05-18.md create mode 100644 src/cpu/pred/btb/fdip_target.hh create mode 100644 src/mem/cache/prefetch/fdp.cc create mode 100644 src/mem/cache/prefetch/fdp.hh diff --git a/configs/common/CacheConfig.py b/configs/common/CacheConfig.py index 3adf07fe8c..cdc7218663 100644 --- a/configs/common/CacheConfig.py +++ b/configs/common/CacheConfig.py @@ -69,6 +69,9 @@ def _get_cache_opts(cpu, level, options): prefetcher_attr = '{}_hwp_type'.format(level) if hasattr(options, prefetcher_attr) and (not options.no_pf): opts['prefetcher'] = create_prefetcher(cpu, level, options) + if level == 'l1i' and getattr(options, prefetcher_attr) == \ + 'FetchDirectedPrefetcher': + opts['demand_mshr_reserve'] = 2 return opts diff --git a/configs/common/PrefetcherConfig.py b/configs/common/PrefetcherConfig.py index bd19bbbe0a..afe5efbd0e 100644 --- a/configs/common/PrefetcherConfig.py +++ b/configs/common/PrefetcherConfig.py @@ -29,7 +29,12 @@ def create_prefetcher(cpu, cache_level, options): return NULL if cpu != NULL: - prefetcher.registerTLB(cpu.mmu.dtb, cpu.mmu.functional) + prefetcher.registerTLB( + cpu.mmu.itb if cache_level == 'l1i' else cpu.mmu.dtb, + cpu.mmu.functional) + + if prefetcher_name == 'FetchDirectedPrefetcher': + prefetcher.cpu = cpu if prefetcher_name == 'XSCompositePrefetcher': if options.l1d_enable_spp: diff --git a/docs/exec-plans/active/upstream-decoupled-fe-fdip-comparison-2026-05-18.md b/docs/exec-plans/active/upstream-decoupled-fe-fdip-comparison-2026-05-18.md new file mode 100644 index 0000000000..19ab20caeb --- /dev/null +++ b/docs/exec-plans/active/upstream-decoupled-fe-fdip-comparison-2026-05-18.md @@ -0,0 +1,205 @@ +# 对比上游 decoupled frontend / FDIP 与当前 Kunminghu 前端 + +## 背景和目标 + +当前 `/nfs/home/yanyue/workspace/GEM5-raw` 是已经更新到 `develop` 的上游 gem5 仓库,和当前主要工作仓库 `/nfs/home/yanyue/workspace/GEM5_review` 分叉超过三年。上游已经合入通用 O3 decoupled frontend、Fetch Directed Prefetcher(FDP/FDIP 类似机制)以及相关配置;本仓库则长期演进出 
Kunminghu v3 对齐的 decoupled BTB 前端、FTQ/FSQ、BTBTAGE、MGSC、AheadBTB、MicroTAGE 和本地 prefetch 体系。 + +本任务的目标不是直接移植上游实现,而是建立一份可持续更新的机制对照,回答: + +- 上游 decoupled frontend 和 FDP 的核心设计是什么,真实入口在哪里 +- 当前 Kunminghu v3 前端已经覆盖了哪些能力,结构上有哪些不同 +- 上游哪些特征值得借鉴,分别适合进入 BPU、Fetch/FTQ、ICache prefetch、统计观测或配置层 +- 哪些特征只适合上游 ARM/O3 通用路径,不适合直接搬到当前 RTL-aligned 路径 +- 后续若要尝试,应如何拆成低风险、可验证的小实验 + +最终产出应是一份分层推荐,而不是单一结论:先给出机制级差异和候选特征,再按收益潜力、接入风险、验证成本排序。 + +## 当前已知信息 + +- 当前仓库状态: + - `GEM5_review` 位于 `xs-dev...origin/xs-dev`,当前未见工作区改动 + - `GEM5-raw` 位于 `develop...origin/develop`,存在用户已有的 `.gitignore` 修改,应保持只读对比 +- 本仓库的架构文档明确指出,当前活跃路径是 `configs/example/kmhv3.py` 选择的 XiangShan/Kunminghu v3 O3 + decoupled BTB frontend;代码真实入口主要在 `src/cpu/o3/fetch.*` 和 `src/cpu/pred/btb/` +- 本仓库已有设计文档索引位于 `docs/design-docs/frontend/README.md`,相关主题包括 `bpu_top_level.md`、`mbtb_design.md`、`btb_tage_design.md`、`mgsc_design.md`、`ubtb_design.md`、`abtb_design.md`、`microtage_design.md` +- 上游 decoupled frontend 关键提交包括: + - `719c799dc3 cpu: Implement decoupled front-end` + - `145efd442b mem-cache: Add fetch directed prefetcher` + - `38c7e348a5 mem-cache: Prefetch for all cache blocks in a Fetch Target` + - `d55f46336f cpu: Branch predictor latency and overriding model for the decoupled frontend` + - `e676195c5f cpu-o3,stdlib: Stdlib configs for decoupled FE` +- 上游实现的初步入口: + - `src/cpu/o3/bac.*`:branch address calculation / FTQ producer + - `src/cpu/o3/ftq.*`:通用 FetchTarget / FTQ + - `src/cpu/o3/fetch.*`:消费 FTQ 的 fetch stage + - `src/mem/cache/prefetch/fdp.*`:Fetch Directed Prefetcher + - `src/cpu/o3/BaseO3CPU.py`:`decoupledFrontEnd`、`numFTQEntries`、`fetchTargetWidth` 等参数 + - `configs/common/cores/arm/neoverse_v2.py`:上游 ARM 配置示例启用 decoupled frontend +- 本仓库当前实现的初步入口: + - `src/cpu/pred/btb/decoupled_bpred.*` + - `src/cpu/pred/btb/ftq.*` + - `src/cpu/pred/btb/mbtb.*` + - `src/cpu/pred/btb/abtb.*` + - `src/cpu/pred/btb/microtage.*` + - `src/cpu/pred/btb/btb_tage.*` + - `src/cpu/pred/btb/btb_mgsc.*` + - `src/cpu/o3/fetch.*` + - 
`configs/example/kmhv3.py` + +## 假设和开放问题 + +- 假设:上游 FDP 最有可能被借鉴的部分不是预测算法本身,而是 `FTQInsert` / `FTQRemove` probe 驱动的生命周期、翻译队列、squash、cache snoop、按 FetchTarget 覆盖所有 cache block 的策略。 +- 假设:上游 decoupled frontend 的 `BAC -> FTQ -> Fetch` 切分,和本仓库的 `BPU 自产 FTQ/FSQ -> Fetch` 切分不同;直接搬上游 BAC 价值有限,但其 surprise branch、prediction latency、override/resteer 建模可能有参考价值。 +- 假设:本仓库已有更强的 Kunminghu 专用 BPU 结构,所以上游 ARM/Neoverse 配置中的 predictor 组合和宽度参数不能直接作为性能方向。 +- 开放问题:本仓库是否已有 FDIP-like 机制或预留接口,只是命名不同;需要从 `fetch.md`、`fetch.cc`、prefetcher 配置和 cache probe 接线确认。 +- 开放问题:上游 FDP 的虚拟地址翻译、跨页 prefetch、cache snoop 和 request 标记,与本仓库 RISC-V FS/checkpoint/difftest 路径是否存在副作用。 +- 开放问题:上游 `d55f46336f` 的 branch predictor latency / overriding model 与当前 `AheadBTB`、`MicroTAGE`、`numOverrideBubbles` 的关系是什么,能否用于改进观测或实验建模。 + +## 计划步骤 + +1. 梳理上游实现骨架,输出文件级结构图和参数表 + - 目标:确认上游 decoupled FE / FDP 的真实控制流 + - 产出:关键文件、类、参数、probe、stats 列表 + +2. 梳理当前 Kunminghu v3 本地基线 + - 目标:确认本仓库已有的 FTQ/FSQ、fetch target 生命周期、BPU update、ICache/prefetch 接口 + - 产出:本地对应文件、已有能力、缺口列表 + +3. 建立机制对照表 + - 目标:把上游特征映射到本地模块边界 + - 产出:`上游特征 -> 本地现状 -> 可借鉴方式 -> 风险 -> 验证` 表格 + +4. 选出第一批候选特征 + - 目标:优先挑不改变预测语义、主要增加观测或可控实验能力的低风险项 + - 产出:推荐优先级和每项最小 patch 思路 + +5. 定义验证路径 + - 目标:给每个候选项配一个最小可复现验证,而不是只靠直觉 + - 产出:本地 smoke/unit 测试、micro-test 或 SPEC slice 方案 + +6. 
如用户确认方向,再拆分实现阶段 + - 目标:把长期任务拆成可 review 的独立提交 + - 产出:每阶段改动范围、预期收益和回退方式 + +## 验证 + +本阶段是分析与方案阶段,完成标准是: + +- 已确认上游和本地各自的关键代码入口和控制流 +- 已明确哪些上游特征可以借鉴,哪些不建议直接搬 +- 每个推荐候选都有本地接入点、风险说明和验证计划 +- 若进入代码阶段,优先从观测/配置/低语义风险 patch 开始,并用本仓库既有 BPU 单测、fetch smoke、代表 SPEC slice 或 CI 统计对照验证 + +## 进展 + +- [x] 2026-05-18 16:20 创建 ExecPlan,记录任务边界、已知入口和初始假设 +- [x] 2026-05-18 16:20 确认两个仓库分支状态:`GEM5_review` 在 `xs-dev`,`GEM5-raw` 在 `develop` +- [x] 2026-05-18 16:20 初步定位上游 decoupled FE/FDP 关键提交和入口文件 +- [x] 2026-05-18 16:35 梳理上游 `BAC/FTQ/Fetch/FDP` 详细控制流,确认 FDP 主要通过 FTQ probe 驱动 +- [x] 2026-05-18 16:40 梳理本地 `DecoupledBPUWithBTB/FTQ/Fetch/Prefetch` 对应能力,确认当前没有 BPU/FTQ 驱动的 FDIP-like instruction prefetch +- [x] 2026-05-18 16:45 形成第一版机制对照和候选特征推荐 +- [x] 2026-05-18 16:55 选择第一项低风险实验:先做 FDIP observability / target lifecycle stats,不直接发 instruction prefetch +- [x] 2026-05-18 16:55 新建分支 `fdip-align` +- [x] 2026-05-18 16:58 提交 `797e2e7cbc cpu: Add FDIP opportunity statistics` +- [x] 2026-05-18 16:59 push 到 `origin/fdip-align`,触发 CI run `26023246359` +- [x] 2026-05-18 17:55 扫描 `gcc15-spec06-0.3c` 最近归档,确认 ICache miss 最高的一批切片主要来自 gcc +- [x] 2026-05-18 18:15 本地完成 `gcc_typeck_4528`、`gcc_expr2_27` 两个切片的 FDIP 观测统计 +- [x] 2026-05-18 18:15 尝试本地跑最高 miss 的 `gcc_s04_7630`,但 `gem5.opt` 30 分钟超时,只保留 reset 前 20M 指令窗口作为参考,不作为最终 ROI 结论 +- [x] 2026-05-18 18:20 调整后续本地实验口径:优先跑短窗口 `5M+5M` 或 `10M+10M`,并发跑多个切片,先看趋势再决定是否扩展到完整 40M +- [x] 2026-05-18 19:15 实现 fetch 侧 FDIP prototype:从未来 FTQ target 取 `startPC -> predEndPC`,经 ITLB 翻译后向 L1I 发 `SoftPFReq` +- [x] 2026-05-18 19:35 修复 prototype 稳定性问题:补 FDIP retry 队列、squash/reset generation、drain 等待、stale response 丢弃,并修复 cache 里 SoftPFReq 合并到已有 MSHR 后 `pkt == nullptr` 的返回路径 +- [x] 2026-05-18 19:40 本地完成 4 个 gcc 高 I-cache-miss 候选切片的 `5M+5M` A/B:`gcc_typeck_4528`、`gcc_expr2_27`、`gcc_200_28`、`gcc_expr_4892` +- [x] 2026-05-18 19:55 试过三类过滤:target distance、跳过 target start block、target age >= 16 cycles;均未形成稳定正收益 +- [x] 2026-05-18 20:05 结论:不 push FDIP 行为 patch 到 CI;保留本地 off-by-default prototype 
和实验数据,下一步应转向 cache-side FDP/snoop 或更强过滤 +- [x] 2026-05-19 16:05 提交 `a7624bfc77 cpu: Add off-by-default FDIP prototype`,把 fetch 侧 prototype 固化为本地备选但默认关闭 +- [x] 2026-05-19 16:40 实现 cache-side `FetchDirectedPrefetcher` prototype:BPU/FTQ 发 `FTQInsert`/`FTQRemove` probe,L1I prefetcher 监听 target 生命周期,经 ITB timing translation 后以 `HardPFReq` 进入 cache 侧 snoop/MSHR 路径 +- [x] 2026-05-19 16:55 完成 cache-side FDP 的稳定化:默认 `pfq_size=1`、`tq_size=1`、`min_target_distance=32`、`latency=64`,并在 L1I 使用 FDP 时把 `demand_mshr_reserve` 提到 2,避免 4-entry L1I MSHR 被 FDP 抢占过多 +- [x] 2026-05-19 17:00 完成本地 `gcc_typeck_4528`、`gcc_expr2_27` 的 `5M+5M` A/B,结果为小幅正向但很接近噪声;准备以单独实验提交启用 `kmhv3.py` 默认 L1I FDP 后 push CI 观察全套 0.3c + +## 发现和意外 + +- 上游实现把通用 O3 decoupled frontend 明确拆成 `BAC` 和 `FTQ`,并用 FTQ probe 驱动 FDP;这和本仓库把预测前端主体放在 `src/cpu/pred/btb/` 内部的组织方式明显不同。 +- 上游 FDP 不是简单 next-line prefetch;它跟随 FetchTarget 生命周期,并且后续提交已经扩展到对 FetchTarget 覆盖的所有 cache block 产生候选。 +- 上游 `BAC` 的基本模型是从当前 PC 按 `minInstSize` 扫描,借助 `BPredUnit::BTBValid()` 找到第一个 BTB hit,然后把最多一个控制流的 FetchTarget 放入 FTQ。这个模型适合通用 O3 解耦,但不适合直接替代本地 `UBTB/AheadBTB/MicroTAGE/MBTB/BTBTAGE/ITTAGE/MGSC/RAS` 的多级块级预测。 +- 上游 `FetchDirectedPrefetcher` 的核心价值在于生命周期接口:FTQ insert 时按 FetchTarget 覆盖的 cache block 产生候选,走 MMU 翻译和 cache/MSHR snoop;FTQ remove/squash 时取消同一 target 的在途翻译和 PFQ 项。 +- 本地 `FetchTarget` 已经比上游通用 FetchTarget 丰富很多:包含预测/执行 branch 信息、BTB entries、pred metas、GHR/PHR/BWHR/LHR、统计字段等。因此若做 FDIP,应复用本地结构,不应搬上游结构。 +- 本地 `FetchTargetQueue` 目前是预测器内部的 deque + target id,`insert()`、`finishTarget()`、`commitTarget()`、`squashAfter()` 没有对外 probe;fetch 侧只有 demand request 的 `FetchRequestSent` probe。这意味着本地缺的不是普通 cache prefetch 框架,而是“预测 target 生命周期事件”。 +- 当前 `kmhv3.py` 对 `DecoupledBPUWithBTB` 设置 `ftq_size=64`、`fsq_size=64`,并启用 UBTB/ABTB/MicroTAGE/MBTB/TAGE/ITTAGE/MGSC/RAS;但 L1I 侧没有默认 FDIP-like prefetcher。 +- 当前 Fetch 每拍先处理 redirect/squash,再 `dbpbtb->tick()` 推进预测流水,最后按 `ftqFetchingTarget()` 的 startPC 发 I-cache demand request。真正接入 FDIP 时需要小心这个时序,避免 prefetch 生命周期和 
resolve/update/squash 顺序不一致。 +- 最近 `gcc15-spec06-0.3c` 归档中,L1I miss 最重的点集中在 gcc:`gcc_s04_7630` 约 20.1 万 misses、2.37% miss rate;`gcc_typeck_4528` 约 10.0 万、1.14%;`gcc_200_28` 约 8.2 万、0.80%;`gcc_expr_4892` 约 6.7 万、0.63%;`gcc_expr2_27` 约 5.8 万、0.61%。 +- 本地 `fdip-align` 观测结果显示,`gcc_typeck_4528` 的 prediction-to-fetch 平均只有约 4.0 cycles,约 5.0% target 超过 15 cycles;`gcc_expr2_27` 平均约 5.5 cycles,约 5.4% target 超过 15 cycles。它们的 L1I miss latency 均值分别约 11.0 和 26.8 cycles。 +- 本地观测也显示潜在污染压力不小:`gcc_typeck_4528` 的 candidate target 约 69.6% commit、30.4% 被 squash;`gcc_expr2_27` 约 55.5% commit、44.5% 被 squash。`gcc_s04_7630` 前半段参考窗口中 squash 比例约 45.8%。 +- 初步判断:当前 BPU/FTQ 提前量对 FDIP 偏短,直接“对所有 target blocks 发 L1I prefetch”可能覆盖有限且错误路径污染较高;若继续做 prototype,应优先加距离/置信过滤,例如只对 ahead >= 16 cycles 的 target、已跨 cache block 的 target、或更可能 commit 的 target 发。 +- 后续本地实验不默认完整 40M 指令。先用 `--warmup-insts-no-switch` / `--maxinsts` 风格的短窗口或等价配置跑 `5M+5M`、`10M+10M`,并发覆盖多个代表切片;只有趋势明确或需要和 CI 对齐时再补完整窗口。 +- Fetch 侧直接发 `SoftPFReq` 的初版可以稳定运行,但收益不稳。默认近距离策略在 4 个短窗口上的 cycles 变化为:`gcc_typeck_4528` -0.041%、`gcc_expr2_27` -0.198%、`gcc_200_28` +0.527%、`gcc_expr_4892` +0.186%。 +- 关键负面信号是 SoftPFReq 绝大多数命中 L1I,几乎没有形成 demand merge:例如默认策略在 `gcc_expr_4892` 发出 231,790 个 FDIP prefetch,其中 SoftPF miss 只有 68;`gcc_expr2_27` 发出 86,680 个,SoftPF miss 只有 73。`system.cpu.icache.demandMergedIntoPfMSHR` 和 `pfMergedWithDemand` 均为 0。 +- 距离过滤能局部改善但不稳定:`fdipMinTargetDistance=3`、`fdipLookaheadTargets=8`、`fdipMaxPrefetchesPerCycle=1` 让 `gcc_expr_4892` 从 +0.186% 改到 -0.052%,但 `gcc_expr2_27` 从 -0.198% 变成 +0.290%。 +- 跳过 FetchTarget 起始 cache block 的策略整体更差:`gcc_typeck_4528` +0.005%、`gcc_expr2_27` +0.344%、`gcc_200_28` +0.472%、`gcc_expr_4892` +0.351%。 +- target age 过滤减小了扰动但仍不够稳:`fdipMinTargetAgeCycles=16` 结果为 `gcc_typeck_4528` +0.132%、`gcc_expr2_27` -0.189%、`gcc_200_28` +0.170%、`gcc_expr_4892` +0.017%。 +- 当前 FDIP prototype 的主要问题不是翻译或 retry 稳定性,而是没有 cache-side snoop/去重,fetch 侧 `SoftPFReq` 会消耗大量已经命中的 L1I 访问。更接近上游 FDP 的下一步应放到 L1I prefetcher/cache 侧,利用 cache tag/MSHR 
snoop 过滤掉已命中的候选,而不是继续在 fetch 侧盲发。 +- Cache-side FDP 确认可以复用上游最重要的生命周期思想,但在本仓库 L1I 上必须非常保守。默认 L1I 只有 4 个 MSHR,cache-side `HardPFReq` 会真实占用 MSHR 和下游端口;初始宽松配置在 `gcc_typeck_4528` 10K smoke 中触发 commit stuck。 +- 把 cache-side FDP 收紧到 `pfq/tq=1`、`min_target_distance=32`、`latency=64`,并把 L1I `demand_mshr_reserve=2` 后,100K smoke 稳定运行:`gcc_typeck_4528` cycles 173602 -> 172758,I-cache demand misses 1941 -> 1903,但 no-MSHR blocked cycles 10849 -> 17098。 +- `gcc_typeck_4528` 的 `5M+5M` 测量段结果为 cycles 1824859 -> 1822698(约 -0.12%),I-cache misses 15411 -> 15428,`pfIssued=113`、`pfUseful=41`、`pfUnused=49`、`demandMergedIntoPfMSHR=10`,no-MSHR blocked cycles 4171 -> 6655。 +- `gcc_expr2_27` 的 `5M+5M` 测量段结果为 cycles 2410537 -> 2410225(约 -0.013%),I-cache misses 13294 -> 13243,`pfIssued=293`、`pfUseful=146`、`pfUnused=115`、`demandMergedIntoPfMSHR=31`,no-MSHR blocked cycles 12493 -> 14211。 +- 目前 cache-side FDP 的局部信号是“有少量有效覆盖,但 MSHR/port 资源代价也可见”。这值得 push 一轮 CI 看全套 SPEC06 0.3c,但还不能说已经是可合入的收益方向。 + +## 第一版机制对照 + +| 主题 | 上游 GEM5-raw | 当前 GEM5_review | 借鉴判断 | +| --- | --- | --- | --- | +| 前端切分 | `BAC -> FTQ -> Fetch`,BAC 是 O3 stage | `DecoupledBPUWithBTB` 内部生成 FSQ/FTQ,Fetch 直接消费 BPU target | 不搬 BAC;只借鉴解耦边界描述和状态统计 | +| FetchTarget 表达 | 简化 basic-block-like target,最多记录 exit branch/pred target/history | 保存块级预测、多个 BTB entry、各组件 meta、history、resolve/update 信息 | 复用本地结构,不降级 | +| BPU 生成方式 | BTBValid 线性扫描到 branch,predict 后插 FTQ | 多组件分 stage 产生 FullBTBPrediction,按 override bubble 延迟入队 | 不搬扫描逻辑;可借鉴 `maxFTPerCycle/maxTakenPredPerCycle` 作为观测维度 | +| FTQ 生命周期 | insert/remove 都有 probe,供 FDP 监听 | insert/finish/commit/squash 都在内部,缺少 target-level probe | 值得补 target-level probe/callback | +| FDIP/FDP | FTQ insert 触发 prefetch,remove/squash 取消,支持 TQ/PFQ/cache snoop/stats | 未见 BPU/FTQ 驱动 instruction prefetch;只有 demand fetch probe 和通用 prefetch 框架 | 最值得借鉴,建议作为独立实验 | +| 配置 | Neoverse/stdlib 示例启用 decoupled FE + L1I FDP/Tagged | `kmhv3.py` 强 RTL-aligned BTB 前端,prefetch 主要在 data/L2/L3 路径 | 只借鉴配置挂法,不借 ARM 数值 | +| 统计 | FTQ 
occupancy、BAC 状态、FDP 队列/翻译/cache-snoop 统计 | BPU stage/override/FSQ/branch stats 很丰富,但缺 FDIP 专项 | 可以低风险补 FDIP 专项 stats | + +## 第一批候选特征 + +1. Target-level lifecycle probe/callback + - 接入点:`src/cpu/pred/btb/ftq.hh`、`src/cpu/pred/btb/ftq.cc`、`src/cpu/pred/btb/decoupled_bpred.cc` + - 做法:为本地 `FetchTargetQueue::insert()`、`finishTarget()`、`squashAfter()`、必要时 `commitTarget()` 增加 target id + FetchTarget 摘要事件,先用于统计或 debug,不改变预测语义。 + - 价值:为 FDIP、FSQ ahead distance、错误路径 prefetch 污染统计提供干净挂点。 + - 风险:低到中。需要注意 `FetchTargetQueue` 当前没有 CPU/probe manager 指针,可能更适合先在 `DecoupledBPUWithBTB` 层发事件。 + +2. Kunminghu-FDIP prototype + - 接入点:本地 L1I prefetcher 或新的 `FetchTarget` listener;配置入口可从 `kmhv3.py` 加显式开关。 + - 做法:参考上游 `FetchDirectedPrefetcher` 的 TQ/PFQ/cache snoop/cancel 语义,但输入改成本地 `FetchTarget` 的 `startPC -> predEndPC` 范围。 + - 价值:最可能带来前端 I-cache latency 改善,尤其对 BTB 已能提前看到后续 target 的场景。 + - 风险:中到高。会跨 BPU target 生命周期、RISC-V MMU 翻译、I-cache MSHR、错误路径 squash;必须从 off-by-default 实验开关开始。 + +3. FDIP observability first + - 接入点:新 stats 或 debug flag。 + - 做法:即使暂不发 prefetch,也先统计 FSQ target 的 ahead distance、覆盖 cache block 数、被 squash/commit/finish 的比例、target 从入队到 fetch 的提前周期。 + - 价值:判断当前 BPU ahead depth 是否足以支撑 FDIP;如果提前量不够,先做 FDIP 发包意义不大。 + - 风险:低,是最适合第一步的实验。 + +4. Surprise branch / no-history taxonomy + - 接入点:当前 `controlSquash`、`topMispredictsByBranch`、BTB miss/false hit stats。 + - 做法:借鉴上游“BTB 没看到但 fetch/decode 发现”的分类思路,细分当前 no-pred / false-hit / target-wrong 场景。 + - 价值:帮助区分 BPU 没学到、BTB 容量不够、方向错、target 错、fetch block 截断等原因。 + - 风险:低,主要是统计口径设计。 + +5. 
Branch predictor latency / override model 对照 + - 接入点:当前 `numOverrideBubbles`、`predsOfEachStage`、`overrideReason`。 + - 做法:不搬上游 latency 模型,但检查上游 `d55f46336f` 后的 `Prediction.latency` 和 override/resteer 分类,看是否能补充本地 stage latency 观测。 + - 价值:让 `AheadBTB/MicroTAGE` 的收益和 override bubble 成本更容易解释。 + - 风险:低到中,取决于是否只补统计还是改行为。 + +## 决策记录 + +- Decision: 本轮先做机制对比和候选特征筛选,不直接移植代码。 +- Reason: 两边前端架构边界不同,直接搬上游 BAC/FTQ 容易破坏本仓库 RTL-aligned 路径;先筛选低风险特征更稳。 +- Date: 2026-05-18 +- Decision: 第一优先级建议从 FDIP observability / target lifecycle event 开始,而不是直接发 I-cache prefetch。 +- Reason: 当前本地缺的是 target-level 生命周期挂点和 ahead-distance/污染统计;先补观测能判断 FDIP 是否有足够提前量,并降低对 Fetch/ResolveQueue 语义的扰动。 +- Date: 2026-05-18 +- Decision: `fdip-align` 首个 patch 只增加统计,不改变预测、fetch 或 cache 行为。 +- Reason: 需要先确认当前 FSQ/FTQ ahead distance、candidate cache block 数、fetch/commit/squash 生命周期比例;如果 SPEC06 的 L1I miss 很少或预测提前量不够,FDIP prototype 的收益预期会很弱。 +- Date: 2026-05-18 +- Decision: 不把 fetch-side FDIP prototype push 到 CI。 +- Reason: 本地 `5M+5M` 短窗口没有稳定收益,且 SoftPFReq 绝大多数是 L1I hit,说明当前直接从 fetch 端发 prefetch 缺少上游 FDP 的 cache-side snoop/去重能力。继续推 CI 大概率浪费完整 SPEC 资源。 +- Date: 2026-05-18 +- Decision: 若继续 FDIP,应优先做 cache-side FDP/snoop 型实现,或至少为 fetch-side prototype 增加“只对 cache/MSHR miss 候选发包”的过滤接口。 +- Reason: 上游 FDP 的 TQ/PFQ/cache snoop 生命周期正是当前 prototype 缺失的关键能力;仅靠 target distance、skip-start-block、target-age 过滤无法稳定避免 L1I-hit prefetch 扰动。 +- Date: 2026-05-18 diff --git a/src/cpu/o3/cpu.cc b/src/cpu/o3/cpu.cc index 502f02c25e..4754cad217 100644 --- a/src/cpu/o3/cpu.cc +++ b/src/cpu/o3/cpu.cc @@ -340,6 +340,12 @@ CPU::regProbePoints() ppDataAccessComplete = new ProbePointArg< std::pair>( getProbeManager(), "DataAccessComplete"); + ppFTQInsert = + new ProbePointArg( + getProbeManager(), "FTQInsert"); + ppFTQRemove = + new ProbePointArg( + getProbeManager(), "FTQRemove"); fetch.regProbePoints(); rename.regProbePoints(); diff --git a/src/cpu/o3/cpu.hh b/src/cpu/o3/cpu.hh index fae5eea4d4..1df4749c57 100644 --- a/src/cpu/o3/cpu.hh +++ 
b/src/cpu/o3/cpu.hh @@ -69,6 +69,7 @@ #include "cpu/o3/rob.hh" #include "cpu/o3/scoreboard.hh" #include "cpu/o3/thread_state.hh" +#include "cpu/pred/btb/fdip_target.hh" #include "cpu/simple_thread.hh" #include "cpu/timebuf.hh" #include "cpu/valuepred/valuepred_unit.hh" @@ -193,6 +194,10 @@ class CPU : public BaseCPU ProbePointArg *ppInstAccessComplete; ProbePointArg > *ppDataAccessComplete; + ProbePointArg + *ppFTQInsert; + ProbePointArg + *ppFTQRemove; /** Register probe points. */ void regProbePoints() override; diff --git a/src/cpu/pred/btb/decoupled_bpred.cc b/src/cpu/pred/btb/decoupled_bpred.cc index fc668212f1..1ee376a809 100644 --- a/src/cpu/pred/btb/decoupled_bpred.cc +++ b/src/cpu/pred/btb/decoupled_bpred.cc @@ -1,6 +1,7 @@ #include "cpu/pred/btb/decoupled_bpred.hh" #include +#include #include "base/debug_helper.hh" #include "base/output.hh" @@ -24,9 +25,11 @@ namespace btb_pred void DecoupledBPUWithBTB::consumeFetchTarget(unsigned fetched_inst_num, ThreadID tid) { + const auto target_id = ftq.fetchId(tid); auto &target = ftq.fetching(tid); target.fetchInstNum = fetched_inst_num; recordFdipFetchedTarget(target); + notifyFdipTargetRemove(target, target_id); ftq.finishTarget(tid); } @@ -180,6 +183,7 @@ DecoupledBPUWithBTB::recordFdipSquashedTargets(ThreadID tid, const auto &target = ftq.get(id, tid); dbpBtbStats.fdipTargetsSquashed++; dbpBtbStats.fdipTargetSquashLatency.sample(fdipTargetAgeCycles(target), 1); + notifyFdipTargetRemove(target, id); squashed++; } @@ -188,6 +192,32 @@ DecoupledBPUWithBTB::recordFdipSquashedTargets(ThreadID tid, } } +void +DecoupledBPUWithBTB::notifyFdipTargetInsert(const FetchTarget &target, + FetchTargetId target_id, + uint64_t distance_from_fetch_head) const +{ + if (!cpu || !cpu->ppFTQInsert) { + return; + } + + cpu->ppFTQInsert->notify( + std::make_shared( + target, target_id, distance_from_fetch_head)); +} + +void +DecoupledBPUWithBTB::notifyFdipTargetRemove(const FetchTarget &target, + FetchTargetId target_id) const +{ + 
if (!cpu || !cpu->ppFTQRemove) { + return; + } + + cpu->ppFTQRemove->notify( + std::make_shared(target, target_id, 0)); +} + void DecoupledBPUWithBTB::tick() @@ -449,14 +479,22 @@ DecoupledBPUWithBTB::processNewPrediction(ThreadID tid) // 4. Fill ahead pipeline fillAheadPipeline(entry); + const FetchTargetId target_id = + ftq.empty(tid) ? ftq.frontId(tid) : ftq.backId(tid) + 1; + const FetchTargetId fetch_id = + ftq.empty(tid) ? target_id : ftq.fetchId(tid); + const uint64_t distance_from_fetch_head = + target_id >= fetch_id ? target_id - fetch_id : 0; + if (enablePredFSQTrace) { - predTraceManager->write_record(PredictionTrace(ftq.backId(tid), entry)); + predTraceManager->write_record(PredictionTrace(target_id, entry)); } recordFdipCandidateTarget(entry); // 5. Add entry to fetch target queue ftq.insert(entry); + notifyFdipTargetInsert(entry, target_id, distance_from_fetch_head); threads[tid].validprediction = false; // 6. Debug output and update statistics diff --git a/src/cpu/pred/btb/decoupled_bpred.hh b/src/cpu/pred/btb/decoupled_bpred.hh index 3096abe46d..138760d9d8 100644 --- a/src/cpu/pred/btb/decoupled_bpred.hh +++ b/src/cpu/pred/btb/decoupled_bpred.hh @@ -194,6 +194,11 @@ class DecoupledBPUWithBTB : public BPredUnit void recordFdipCommittedTarget(const FetchTarget &target); void recordFdipSquashedTargets(ThreadID tid, FetchTargetId firstTargetId, FetchTargetId lastTargetId); + void notifyFdipTargetInsert(const FetchTarget &target, + FetchTargetId target_id, + uint64_t distance_from_fetch_head) const; + void notifyFdipTargetRemove(const FetchTarget &target, + FetchTargetId target_id) const; void clearPreds(ThreadID tid) { for (auto &stagePred : threads[tid].predsOfEachStage) { diff --git a/src/cpu/pred/btb/fdip_target.hh b/src/cpu/pred/btb/fdip_target.hh new file mode 100644 index 0000000000..9354e4f0dd --- /dev/null +++ b/src/cpu/pred/btb/fdip_target.hh @@ -0,0 +1,53 @@ +#ifndef __CPU_PRED_BTB_FDIP_TARGET_HH__ +#define __CPU_PRED_BTB_FDIP_TARGET_HH__ + 
+#include <memory>
+
+#include "base/types.hh"
+#include "cpu/o3/limits.hh"
+#include "cpu/pred/btb/common.hh"
+
+namespace gem5
+{
+
+namespace branch_prediction
+{
+
+namespace btb_pred
+{
+
+/**
+ * Compact, copyable summary of one fetch target.
+ *
+ * It carries only the fields a listener needs (thread, target id, PC range,
+ * prediction tick and queue distance) instead of the full FetchTarget.
+ */
+struct FdipFetchTarget
+{
+    /** Thread the target belongs to. */
+    ThreadID tid;
+    /** Id of the target inside the fetch target queue. */
+    FetchTargetId id;
+    /** First PC covered by the target. */
+    Addr startPC;
+    /** Predicted end PC of the target. */
+    Addr predEndPC;
+    /** Tick recorded in the source FetchTarget's predTick field. */
+    Tick predTick;
+    /** Distance, in targets, between this target and the fetch head. */
+    uint64_t distanceFromFetchHead;
+
+    /** Build a summary from explicit field values. */
+    FdipFetchTarget(ThreadID _tid, FetchTargetId _id, Addr _start_pc,
+                    Addr _pred_end_pc, Tick _pred_tick,
+                    uint64_t _distance_from_fetch_head)
+        : tid(_tid),
+          id(_id),
+          startPC(_start_pc),
+          predEndPC(_pred_end_pc),
+          predTick(_pred_tick),
+          distanceFromFetchHead(_distance_from_fetch_head)
+    {}
+
+    /**
+     * Build a summary from a FetchTarget.
+     *
+     * The id and the distance are passed separately; tid, startPC,
+     * predEndPC and predTick are copied from @p target.
+     */
+    FdipFetchTarget(const FetchTarget &target, FetchTargetId _id,
+                    uint64_t _distance_from_fetch_head)
+        : FdipFetchTarget(target.tid, _id, target.startPC,
+                          target.predEndPC, target.predTick,
+                          _distance_from_fetch_head)
+    {}
+};
+
+/** Shared handle used to pass a target summary to probe listeners. */
+using FdipFetchTargetPtr = std::shared_ptr<FdipFetchTarget>;
+
+} // namespace btb_pred
+} // namespace branch_prediction
+} // namespace gem5
+
+#endif // __CPU_PRED_BTB_FDIP_TARGET_HH__
diff --git a/src/mem/cache/prefetch/Prefetcher.py b/src/mem/cache/prefetch/Prefetcher.py
index eea10b7229..8a3db37c44 100644
--- a/src/mem/cache/prefetch/Prefetcher.py
+++ b/src/mem/cache/prefetch/Prefetcher.py
@@ -932,6 +932,36 @@ def listenFromProbeRetiredInstructions(self, simObj):
         self.addEvent(HWPProbeEventRetiredInsts(self, simObj,"RetiredInstsPC"))
 
 
+class FetchDirectedPrefetcher(BasePrefetcher):
+    type = "FetchDirectedPrefetcher"
+    cxx_class = "gem5::prefetch::FetchDirectedPrefetcher"
+    cxx_header = "mem/cache/prefetch/fdp.hh"
+
+    cpu = Param.BaseCPU(Parent.any, "CPU whose FTQ target probes are tracked")
+
+    latency = Param.Cycles(64, "Latency for generated prefetches")
+    pfq_size = Param.Unsigned(1, "Maximum number of queued prefetches")
+    tq_size = Param.Unsigned(1, "Maximum outstanding translations")
+    mark_req_as_prefetch = Param.Bool(
+        True,
+        "Mark translation requests as prefetches")
+    squash_prefetches = Param.Bool(
+        True,
+
"Squash queued prefetches when the source FTQ target is removed") + cache_snoop = Param.Bool( + True, + "Drop candidates that already hit in the cache or MSHR") + max_blocks_per_target = Param.Unsigned( + 1, + "Maximum cache blocks to prefetch per target; 0 means unlimited") + skip_target_start_block = Param.Bool( + False, + "Skip the cache block containing the target start PC") + min_target_distance = Param.Unsigned( + 32, + "Minimum FTQ distance from fetch head before generating candidates") + + class IPCPrefetcher(QueuedPrefetcher): type = 'IPCPrefetcher' cxx_class = 'gem5::prefetch::IPCP' diff --git a/src/mem/cache/prefetch/SConscript b/src/mem/cache/prefetch/SConscript index 2e074e2fdd..2754e16071 100644 --- a/src/mem/cache/prefetch/SConscript +++ b/src/mem/cache/prefetch/SConscript @@ -39,7 +39,8 @@ SimObject('Prefetcher.py', sim_objects=[ 'IrregularStreamBufferPrefetcher', 'SlimAMPMPrefetcher', 'WorkerPrefetcher', 'DespacitoStreamPrefetcher', 'BOPPrefetcher', 'SBOOEPrefetcher', 'STeMSPrefetcher', 'PIFPrefetcher', 'IPCPrefetcher', - 'CompositeWithWorkerPrefetcher', 'L2CompositeWithWorkerPrefetcher', 'PrefetcherForwarder']) + 'CompositeWithWorkerPrefetcher', 'L2CompositeWithWorkerPrefetcher', + 'PrefetcherForwarder', 'FetchDirectedPrefetcher']) DebugFlag('BOPPrefetcher') @@ -89,4 +90,5 @@ Source('composite_with_worker.cc') Source('l2_composite_with_worker.cc') Source('despacito_stream.cc') Source('forwarder.cc') +Source('fdp.cc') Source('prefetch_filter.cc') diff --git a/src/mem/cache/prefetch/fdp.cc b/src/mem/cache/prefetch/fdp.cc new file mode 100644 index 0000000000..d47d650de0 --- /dev/null +++ b/src/mem/cache/prefetch/fdp.cc @@ -0,0 +1,392 @@ +/* + * Copyright (c) 2022-2023 The University of Edinburgh + * Copyright (c) 2025 Arm Limited + * All rights reserved + * + * The license below extends only to copyright in this file. 
You may use this + * file subject to the license terms below provided that you ensure that this + * notice is replicated unmodified and in its entirety in all distributions, + * modified or unmodified, in source code or in binary form. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer; redistributions in binary + * form must reproduce the above copyright notice, this list of conditions and + * the following disclaimer in the documentation and/or other materials + * provided with the distribution; neither the name of the copyright holders nor + * the names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include "mem/cache/prefetch/fdp.hh"
+
+#include <algorithm>
+
+#include "debug/HWPrefetch.hh"
+#include "mem/request.hh"
+#include "params/FetchDirectedPrefetcher.hh"
+
+namespace gem5
+{
+
+namespace prefetch
+{
+
+/**
+ * Copy the configuration out of the params object. The latency parameter is
+ * given in cycles and converted to ticks once here.
+ */
+FetchDirectedPrefetcher::FetchDirectedPrefetcher(
+    const FetchDirectedPrefetcherParams &p)
+    : Base(p),
+      cpu(p.cpu),
+      markReqAsPrefetch(p.mark_req_as_prefetch),
+      squashPrefetches(p.squash_prefetches),
+      latency(cyclesToTicks(p.latency)),
+      pfqSize(p.pfq_size),
+      tqSize(p.tq_size),
+      cacheSnoop(p.cache_snoop),
+      maxBlocksPerTarget(p.max_blocks_per_target),
+      skipTargetStartBlock(p.skip_target_start_block),
+      minTargetDistance(p.min_target_distance),
+      stats(this, p.pfq_size, p.tq_size)
+{}
+
+/**
+ * Release the probe listeners created in regProbeListeners() and any packet
+ * still attached to a queued request.
+ */
+FetchDirectedPrefetcher::~FetchDirectedPrefetcher()
+{
+    for (auto *listener : listeners) {
+        delete listener;
+    }
+
+    for (auto &pr : pfq) {
+        delete pr.pkt;
+    }
+    for (auto &pr : translationq) {
+        delete pr.pkt;
+    }
+}
+
+/**
+ * @param _parent Prefetcher that receives the forwarded notifications.
+ * @param pm      Probe manager the listener attaches to.
+ * @param name    Name of the probe point to listen on.
+ * @param _insert True to forward to notifyFTQInsert(), false to forward to
+ *                notifyFTQRemove().
+ */
+FetchDirectedPrefetcher::FdipListener::FdipListener(
+    FetchDirectedPrefetcher &_parent, ProbeManager *pm,
+    const std::string &name, bool _insert)
+    : ProbeListenerArgBase<FdipFetchTargetPtr>(pm, name),
+      parent(_parent),
+      insert(_insert)
+{}
+
+/** Forward a probe notification to the matching handler of the parent. */
+void
+FetchDirectedPrefetcher::FdipListener::notify(const FdipFetchTargetPtr &ft)
+{
+    if (insert) {
+        parent.notifyFTQInsert(ft);
+    } else {
+        parent.notifyFTQRemove(ft);
+    }
+}
+
+/**
+ * Attach one listener to the CPU's "FTQInsert" probe and one to its
+ * "FTQRemove" probe. Without a CPU only a warning is printed and the
+ * prefetcher never receives any target.
+ */
+void
+FetchDirectedPrefetcher::regProbeListeners()
+{
+    Base::regProbeListeners();
+
+    if (cpu == nullptr) {
+        warn("FetchDirectedPrefetcher: no CPU to listen from\n");
+        return;
+    }
+
+    listeners.push_back(
+        new FdipListener(*this, cpu->getProbeManager(), "FTQInsert", true));
+    listeners.push_back(
+        new FdipListener(*this, cpu->getProbeManager(), "FTQRemove", false));
+}
+
+/**
+ * Handle a newly inserted fetch target: walk the cache blocks covered by
+ * [startPC, predEndPC) and start an address translation for every block that
+ * is not already tracked.
+ *
+ * Targets closer to the fetch head than minTargetDistance are ignored. A
+ * block counts towards maxBlocksPerTarget even when it is dropped as a
+ * duplicate or because the translation queue is full.
+ */
+void
+FetchDirectedPrefetcher::notifyFTQInsert(const FdipFetchTargetPtr &ft)
+{
+    stats.fdipInsertions++;
+
+    if (ft->distanceFromFetchHead < minTargetDistance) {
+        stats.targetTooNear++;
+        return;
+    }
+
+    // An empty or inverted range is treated as covering one byte at startPC.
+    Addr end_pc = ft->predEndPC;
+    if (end_pc <= ft->startPC) {
+        end_pc = ft->startPC + 1;
+    }
+
+    Addr blk_addr = blockAddress(ft->startPC);
+    const Addr end_blk_addr = blockAddress(end_pc - 1);
+    if (skipTargetStartBlock) {
+        blk_addr += blkSize;
+    }
+
+    // The loop header advances to the next block on every path, including
+    // the early "continue" exits below.
+    for (unsigned blocks = 0;
+         blk_addr <= end_blk_addr &&
+         (maxBlocksPerTarget == 0 || blocks < maxBlocksPerTarget);
+         blk_addr += blkSize, ++blocks) {
+        const auto same_block = [blk_addr](const PrefetchRequest &pr) {
+            return pr.sameBlock(blk_addr);
+        };
+
+        if (std::any_of(pfq.begin(), pfq.end(), same_block)) {
+            DPRINTF(HWPrefetch, "%#x already in FDP prefetch queue\n",
+                    blk_addr);
+            stats.pfInPFQ++;
+            continue;
+        }
+
+        if (std::any_of(translationq.begin(), translationq.end(),
+                        same_block)) {
+            DPRINTF(HWPrefetch, "%#x already in FDP translation queue\n",
+                    blk_addr);
+            stats.pfInTQ++;
+            continue;
+        }
+
+        stats.pfIdentified++;
+
+        if (translationq.size() >= tqSize) {
+            DPRINTF(HWPrefetch,
+                    "FDP translation queue full, dropping %#x\n", blk_addr);
+            stats.tqDrops++;
+            continue;
+        }
+
+        translationq.emplace_back(*this, blk_addr, ft->tid, ft->id);
+        DPRINTF(HWPrefetch, "FDP starts translation for %#x ftq=%lu\n",
+                blk_addr, ft->id);
+        // NOTE(review): if the TLB finishes synchronously the entry is
+        // already erased by translationComplete() when this call returns, so
+        // the entry must not be touched afterwards.
+        translationq.back().startTranslation();
+        stats.tqInserts++;
+        stats.tqSizeDistAtNotify.sample(translationq.size());
+        stats.pfqSizeDistAtNotify.sample(pfq.size());
+    }
+}
+
+/**
+ * Handle the removal of a fetch target. When squashPrefetches is set,
+ * in-flight translations for the target are marked canceled and queued
+ * prefetches for it are deleted.
+ *
+ * NOTE(review): matching uses the target id only, not the thread id; this
+ * presumably relies on ids being unique across threads - confirm for SMT.
+ */
+void
+FetchDirectedPrefetcher::notifyFTQRemove(const FdipFetchTargetPtr &ft)
+{
+    stats.fdipRemovals++;
+
+    if (!squashPrefetches) {
+        return;
+    }
+
+    for (auto &pr : translationq) {
+        // Count a translation only the first time it is canceled, so that a
+        // repeated removal notification for the same target does not inflate
+        // pfSquashed.
+        if (pr.ftid == ft->id && !pr.isCanceled()) {
+            pr.markCanceled();
+            stats.pfSquashed++;
+        }
+    }
+
+    auto it = pfq.begin();
+    while (it != pfq.end()) {
+        if (it->ftid == ft->id) {
+            delete it->pkt;
+            it = pfq.erase(it);
+            stats.pfSquashed++;
+        } else {
+            ++it;
+        }
+    }
+}
+
+/**
+ * Finish a translation started by notifyFTQInsert().
+ *
+ * A successfully translated request is moved to the prefetch queue unless it
+ * was canceled, is uncacheable, maps outside memory, is filtered by the
+ * cache/MSHR snoop, or the prefetch queue is full. In every case the request
+ * is removed from the translation queue.
+ *
+ * @param pfr    Request whose translation finished; must be an element of
+ *               translationq.
+ * @param failed True when the translation raised a fault.
+ */
+void
+FetchDirectedPrefetcher::translationComplete(PrefetchRequest *pfr, bool failed)
+{
+    const auto it = std::find_if(
+        translationq.begin(), translationq.end(),
+        [pfr](const PrefetchRequest &pr) { return &pr == pfr; });
+    assert(it != translationq.end());
+
+    if (failed) {
+        DPRINTF(HWPrefetch, "FDP translation of %#x failed\n", it->addr);
+        stats.translationFail++;
+    } else {
+        stats.translationSuccess++;
+        const Addr paddr = it->req->getPaddr();
+        const bool secure = it->req->isSecure();
+
+        if (it->isCanceled()) {
+            DPRINTF(HWPrefetch,
+                    "FDP drops %#x after FTQ removal during translation\n",
+                    it->addr);
+        } else if (it->req->isUncacheable()) {
+            DPRINTF(HWPrefetch, "FDP drops uncacheable request %#x\n",
+                    it->addr);
+        } else if (!system->isMemAddr(paddr)) {
+            DPRINTF(HWPrefetch, "FDP drops non-memory paddr %#x\n", paddr);
+        } else if (cacheSnoop &&
+                   (inCache(paddr, secure) || inMissQueue(paddr, secure))) {
+            stats.pfInCache++;
+            if (hasBeenPrefetched(paddr, secure)) {
+                stats.pfInCachePrefetched++;
+            }
+            DPRINTF(HWPrefetch,
+                    "FDP drops redundant cache/MSHR candidate %#x\n", paddr);
+        } else if (pfq.size() < pfqSize) {
+            it->createPkt();
+            it->readyTime = curTick() + latency;
+            stats.pfPacketsCreated++;
+            stats.pfCandidatesAdded++;
+            // The queue entry is a copy; the packet pointer now belongs to
+            // the pfq entry because the translationq entry is erased below.
+            pfq.push_back(*it);
+            stats.pfqInserts++;
+            DPRINTF(HWPrefetch,
+                    "FDP queued prefetch VA %#x PA %#x ftq=%lu pfq=%lu\n",
+                    it->addr, paddr, it->ftid, pfq.size());
+        } else {
+            DPRINTF(HWPrefetch,
+                    "FDP prefetch queue full, dropping %#x\n", it->addr);
+            stats.pfqDrops++;
+        }
+    }
+
+    translationq.erase(it);
+    stats.tqPops++;
+}
+
+/**
+ * Pop the oldest queued prefetch.
+ *
+ * @return The packet of the front queue entry, or nullptr when the prefetch
+ *         queue is empty.
+ */
+PacketPtr
+FetchDirectedPrefetcher::getPacket()
+{
+    if (pfq.empty()) {
+        return nullptr;
+    }
+
+    PacketPtr pkt = pfq.front().pkt;
+    DPRINTF(HWPrefetch, "FDP issues prefetch PA %#x VA %#x ftq=%lu\n",
+            pkt->getAddr(), pfq.front().addr, pfq.front().ftid);
+
+    pfq.pop_front();
+    stats.pfqPops++;
+    prefetchStats.pfIssued++;
+    prefetchStats.pfIssued_srcs[pkt->req->getXsMetadata().prefetchSource]++;
+    issuedPrefetches++;
+
+    return pkt;
+}
+
+/**
+ * Create the instruction-fetch request for one cache block.
+ *
+ * @param _owner Prefetcher that owns the request.
+ * @param _addr  Virtual block address to translate and prefetch.
+ * @param tid    Thread whose context id is put into the request.
+ * @param _ftid  Id of the fetch target that produced the candidate.
+ */
+FetchDirectedPrefetcher::PrefetchRequest::PrefetchRequest(
+    FetchDirectedPrefetcher &_owner, Addr _addr, ThreadID tid,
+    FdipTargetId _ftid)
+    : owner(_owner),
+      addr(_addr),
+      ftid(_ftid),
+      req(nullptr),
+      pkt(nullptr),
+      readyTime(MaxTick),
+      canceled(false)
+{
+    auto *tc = owner.cpu->getContext(tid);
+    req = std::make_shared<Request>(addr, owner.blkSize, Request::INST_FETCH,
+                                    owner.requestorId, addr,
+                                    tc->contextId());
+    if (owner.markReqAsPrefetch) {
+        req->setFlags(Request::PREFETCH);
+    }
+    req->setXsMetadata(Request::XsMetadata(PrefetchSourceType::PF_NONE, 0));
+    req->setPFSource(PrefetchSourceType::PF_NONE);
+    req->setPFDepth(0);
+}
+
+/** Allocate the HardPFReq packet for the (already translated) request. */
+void
+FetchDirectedPrefetcher::PrefetchRequest::createPkt()
+{
+    req->taskId(context_switch_task_id::Prefetcher);
+    pkt = new Packet(req, MemCmd::HardPFReq);
+    pkt->allocate();
+}
+
+/**
+ * Start a timing translation in Execute mode through the owner's TLB; the
+ * result is delivered to finish(). Aborts the simulation when the owner has
+ * no TLB.
+ */
+void
+FetchDirectedPrefetcher::PrefetchRequest::startTranslation()
+{
+    fatal_if(owner.tlb == nullptr,
+             "FetchDirectedPrefetcher requires an instruction TLB\n");
+    auto *tc = owner.system->threads[req->contextId()];
+    owner.tlb->translateTiming(req, tc, this, BaseMMU::Execute);
+}
+
+/** Translation callback: report the outcome to the owning prefetcher. */
+void
+FetchDirectedPrefetcher::PrefetchRequest::finish(
+    const Fault &fault, const RequestPtr &req, ThreadContext *tc,
+    BaseMMU::Mode mode)
+{
+    owner.translationComplete(this, fault != NoFault);
+}
+
+/**
+ * Register the FDP statistics.
+ *
+ * @param parent   Statistics group the counters are attached to.
+ * @param pfq_size Upper bound of the prefetch-queue size distribution.
+ * @param tq_size  Upper bound of the translation-queue size distribution.
+ */
+FetchDirectedPrefetcher::Stats::Stats(
+    statistics::Group *parent, int pfq_size, int tq_size)
+    : statistics::Group(parent),
+      ADD_STAT(fdipInsertions, statistics::units::Count::get(),
+               "Number of FTQ insert notifications observed by FDP"),
+      ADD_STAT(fdipRemovals, statistics::units::Count::get(),
+               "Number of FTQ remove notifications observed by FDP"),
+      ADD_STAT(targetTooNear, statistics::units::Count::get(),
+               "Number of FTQ targets skipped by FDP distance filter"),
+      ADD_STAT(pfIdentified, statistics::units::Count::get(),
+               "Number of FDP prefetch candidates identified"),
+      ADD_STAT(pfSquashed, statistics::units::Count::get(),
+               "Number of FDP prefetches squashed by FTQ removal"),
+      ADD_STAT(pfInPFQ, statistics::units::Count::get(),
+               "Number of FDP candidates already in the prefetch queue"),
+      ADD_STAT(pfInTQ, statistics::units::Count::get(),
+               "Number of FDP candidates already in the translation queue"),
+      ADD_STAT(pfInCache, statistics::units::Count::get(),
+               "Number of FDP candidates dropped by cache/MSHR snoop"),
+      ADD_STAT(pfInCachePrefetched, statistics::units::Count::get(),
+               "Number of FDP snoop drops on already-prefetched blocks"),
+      ADD_STAT(pfPacketsCreated, statistics::units::Count::get(),
+               "Number of FDP HardPF packets created"),
+      ADD_STAT(pfCandidatesAdded, statistics::units::Count::get(),
+               "Number of FDP candidates added to the prefetch queue"),
+      ADD_STAT(translationFail, statistics::units::Count::get(),
+               "Number of FDP translations that failed"),
+      ADD_STAT(translationSuccess, statistics::units::Count::get(),
+               "Number of FDP translations that succeeded"),
+      ADD_STAT(pfqSizeDistAtNotify, statistics::units::Count::get(),
+               "Distribution of FDP prefetch queue size at notification"),
+      ADD_STAT(tqSizeDistAtNotify, statistics::units::Count::get(),
+               "Distribution of FDP translation queue size at notification"),
+      ADD_STAT(pfqInserts, statistics::units::Count::get(),
+               "Number of insertions into the FDP prefetch queue"),
+      ADD_STAT(pfqPops, statistics::units::Count::get(),
+               "Number of pops from the FDP prefetch queue"),
+      ADD_STAT(pfqDrops, statistics::units::Count::get(),
+               "Number of FDP candidates dropped because PFQ was full"),
+      ADD_STAT(tqInserts, statistics::units::Count::get(),
+               "Number of insertions into the FDP translation queue"),
+      ADD_STAT(tqPops, statistics::units::Count::get(),
+               "Number of pops from the FDP translation queue"),
+      ADD_STAT(tqDrops, statistics::units::Count::get(),
+               "Number of FDP candidates dropped because TQ was full")
+{
+    pfqSizeDistAtNotify.init(0, pfq_size, 4);
+    tqSizeDistAtNotify.init(0, tq_size, 4);
+}
+
+} // namespace prefetch
+} // namespace
gem5 diff --git a/src/mem/cache/prefetch/fdp.hh b/src/mem/cache/prefetch/fdp.hh new file mode 100644 index 0000000000..607a9217c2 --- /dev/null +++ b/src/mem/cache/prefetch/fdp.hh @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2022-2023 The University of Edinburgh + * Copyright (c) 2025 Arm Limited + * All rights reserved + * + * The license below extends only to copyright in this file. You may use this + * file subject to the license terms below provided that you ensure that this + * notice is replicated unmodified and in its entirety in all distributions, + * modified or unmodified, in source code or in binary form. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer; redistributions in binary + * form must reproduce the above copyright notice, this list of conditions and + * the following disclaimer in the documentation and/or other materials + * provided with the distribution; neither the name of the copyright holders nor + * the names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __MEM_CACHE_PREFETCH_FDP_HH__
+#define __MEM_CACHE_PREFETCH_FDP_HH__
+
+// NOTE(review): the standard header names were lost in extraction; these are
+// the headers needed for the std::list, std::string and std::vector uses
+// below. Confirm against the original commit.
+#include <list>
+#include <string>
+#include <vector>
+
+#include "arch/generic/mmu.hh"
+#include "cpu/base.hh"
+#include "cpu/pred/btb/fdip_target.hh"
+#include "mem/cache/prefetch/base.hh"
+#include "sim/probe/probe.hh"
+
+namespace gem5
+{
+
+struct FetchDirectedPrefetcherParams;
+
+namespace prefetch
+{
+
+/**
+ * Prefetcher that is driven by fetch-target probe notifications instead of
+ * cache accesses: the cache-access hooks inherited from Base (notify, rxHint,
+ * pfHitNotify) are overridden as no-ops, while FdipListener objects deliver
+ * FdipFetchTargetPtr events to notifyFTQInsert() / notifyFTQRemove().
+ *
+ * Candidate requests are PrefetchRequest objects kept in two queues,
+ * translationq and pfq; getPacket() and nextPrefetchReadyTime() are served
+ * from pfq.
+ */
+class FetchDirectedPrefetcher : public Base
+{
+  public:
+    FetchDirectedPrefetcher(const FetchDirectedPrefetcherParams &p);
+    ~FetchDirectedPrefetcher();
+
+    void regProbeListeners() override;
+
+    /** Cache-access notifications are ignored by this prefetcher. */
+    void notify(const PacketPtr &pkt, const PrefetchInfo &pfi) override {}
+    PacketPtr getPacket() override;
+
+    /** @return true while at least one request is waiting in pfq. */
+    bool
+    hasPendingPacket() override
+    {
+        return !pfq.empty();
+    }
+
+    /**
+     * @return readyTime of the request at the head of pfq, or MaxTick when
+     *         pfq is empty.
+     */
+    Tick
+    nextPrefetchReadyTime() const override
+    {
+        return pfq.empty() ? MaxTick : pfq.front().readyTime;
+    }
+
+    /** Hints are ignored by this prefetcher. */
+    void
+    rxHint(BaseMMU::Translation *dpp) override
+    {}
+
+    /** Prefetch-hit feedback is ignored by this prefetcher. */
+    void
+    pfHitNotify(float accuracy, PrefetchSourceType pf_source,
+                const PacketPtr &pkt) override
+    {}
+
+  private:
+    using FdipFetchTargetPtr =
+        branch_prediction::btb_pred::FdipFetchTargetPtr;
+    using FdipTargetId = branch_prediction::btb_pred::FetchTargetId;
+
+    /**
+     * Probe listener receiving FdipFetchTargetPtr notifications on behalf of
+     * the owning prefetcher. The template argument matches the type taken by
+     * the notify() override.
+     * NOTE(review): `insert` presumably selects between notifyFTQInsert()
+     * and notifyFTQRemove(); the dispatch lives in fdp.cc - confirm there.
+     */
+    class FdipListener : public ProbeListenerArgBase<FdipFetchTargetPtr>
+    {
+      public:
+        FdipListener(FetchDirectedPrefetcher &_parent, ProbeManager *pm,
+                     const std::string &name, bool _insert);
+        void notify(const FdipFetchTargetPtr &ft) override;
+
+      private:
+        FetchDirectedPrefetcher &parent;
+        const bool insert;
+    };
+
+    // NOTE(review): the element type was lost in extraction; raw FdipListener
+    // pointers are assumed because the class declares a non-defaulted
+    // destructor. Confirm against fdp.cc (regProbeListeners / destructor).
+    std::vector<FdipListener *> listeners;
+
+    BaseCPU *cpu;
+
+    // Configuration values; their exact semantics are defined by the
+    // constructor and fdp.cc, which are not part of this header.
+    const bool markReqAsPrefetch;
+    const bool squashPrefetches;
+    const Tick latency;
+    const unsigned pfqSize;
+    const unsigned tqSize;
+    const bool cacheSnoop;
+    const unsigned maxBlocksPerTarget;
+    const bool skipTargetStartBlock;
+    const unsigned minTargetDistance;
+
+    /**
+     * One prefetch candidate. It derives from BaseMMU::Translation so the
+     * object itself receives the translation result through finish().
+     */
+    struct PrefetchRequest : public BaseMMU::Translation
+    {
+        PrefetchRequest(FetchDirectedPrefetcher &_owner, Addr _addr,
+                        ThreadID tid, FdipTargetId _ftid);
+
+        FetchDirectedPrefetcher &owner;
+        const Addr addr;
+        const FdipTargetId ftid;
+        RequestPtr req;
+        PacketPtr pkt;
+        Tick readyTime;
+        bool canceled;
+
+        /** @return true when this request's addr equals block_addr. */
+        bool sameBlock(Addr block_addr) const { return addr == block_addr; }
+
+        void createPkt();
+        void startTranslation();
+        void markDelayed() override {}
+        void finish(const Fault &fault, const RequestPtr &req,
+                    ThreadContext *tc, BaseMMU::Mode mode) override;
+
+        /** Flag the request as canceled; queried via isCanceled(). */
+        void markCanceled() { canceled = true; }
+        bool isCanceled() const { return canceled; }
+    };
+
+    /** Prefetch queue; its head is what nextPrefetchReadyTime() reports. */
+    std::list<PrefetchRequest> pfq;
+    /** Translation queue (see the tq* statistics below). */
+    std::list<PrefetchRequest> translationq;
+
+    void notifyFTQInsert(const FdipFetchTargetPtr &ft);
+    void notifyFTQRemove(const FdipFetchTargetPtr &ft);
+    void translationComplete(PrefetchRequest *pf_req, bool failed);
+
+  protected:
+    /**
+     * Statistics group of the prefetcher. The constructor takes the two
+     * queue sizes, which are used to set up the size distributions.
+     */
+    struct Stats : public statistics::Group
+    {
+        Stats(statistics::Group *parent, int pfq_size, int tq_size);
+
+        statistics::Scalar fdipInsertions;
+        statistics::Scalar fdipRemovals;
+        statistics::Scalar targetTooNear;
+        statistics::Scalar pfIdentified;
+        statistics::Scalar pfSquashed;
+        /** Candidates already in the prefetch queue. */
+        statistics::Scalar pfInPFQ;
+        /** Candidates already in the translation queue. */
+        statistics::Scalar pfInTQ;
+        /** Candidates dropped by cache/MSHR snoop. */
+        statistics::Scalar pfInCache;
+        /** Snoop drops on already-prefetched blocks. */
+        statistics::Scalar pfInCachePrefetched;
+        /** HardPF packets created. */
+        statistics::Scalar pfPacketsCreated;
+        /** Candidates added to the prefetch queue. */
+        statistics::Scalar pfCandidatesAdded;
+        statistics::Scalar translationFail;
+        statistics::Scalar translationSuccess;
+        /** Prefetch queue size sampled at notification. */
+        statistics::Distribution pfqSizeDistAtNotify;
+        /** Translation queue size sampled at notification. */
+        statistics::Distribution tqSizeDistAtNotify;
+        statistics::Scalar pfqInserts;
+        statistics::Scalar pfqPops;
+        /** Candidates dropped because the PFQ was full. */
+        statistics::Scalar pfqDrops;
+        statistics::Scalar tqInserts;
+        statistics::Scalar tqPops;
+        /** Candidates dropped because the TQ was full. */
+        statistics::Scalar tqDrops;
+    } stats;
+};
+
+} // namespace prefetch
+} // namespace gem5
+
+#endif // __MEM_CACHE_PREFETCH_FDP_HH__
From 2957b897f40bf657627515b7054913d3d287fc21 Mon Sep 17 00:00:00 2001 From: Yan Yue <1131531947@qq.com> Date: Tue, 19 May 2026 16:57:41 +0800 Subject: [PATCH 4/6] configs: Enable FDIP experiment for kmhv3 Change-Id: I5ad66132b422fb1c6bb2c72c9a0a259e46c6fb83 --- configs/example/kmhv3.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/configs/example/kmhv3.py b/configs/example/kmhv3.py index 48bbe80b75..452bc6fdb1 100644 --- a/configs/example/kmhv3.py +++ b/configs/example/kmhv3.py @@ -176,6 +176,8 @@ def setKmhV3Params(args, system): # Enable prefetch buffers for all hardware prefetchers in this config.
args.enable_pf_buffer = True + if not args.no_pf and args.l1i_hwp_type is None: + args.l1i_hwp_type = 'FetchDirectedPrefetcher' # Set default bp_type based on ideal_kmhv3 flag # If user didn't specify bp_type, set default based on ideal_kmhv3 From aab29b82b63de0dc2d6e537478e4fb27500e1527 Mon Sep 17 00:00:00 2001 From: Yan Yue <1131531947@qq.com> Date: Tue, 19 May 2026 17:01:35 +0800 Subject: [PATCH 5/6] misc: Rebuild DRAMSim3 from a clean cache copy Change-Id: I030e229cb6448ef468c15b838c8c3a85e74b781f --- .github/actions/build-dramsim/action.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/actions/build-dramsim/action.yml b/.github/actions/build-dramsim/action.yml index aeff06114d..d45bd22409 100644 --- a/.github/actions/build-dramsim/action.yml +++ b/.github/actions/build-dramsim/action.yml @@ -12,6 +12,7 @@ runs: if [ ! -d "DRAMsim3" ]; then cp -r /nfs/home/share/gem5_ci/DRAMsim3 . fi + rm -rf DRAMsim3/build cd DRAMsim3 && mkdir -p build cd build cmake .. From 86787a83620bb0b0986c0806b977e3292bb3f2dc Mon Sep 17 00:00:00 2001 From: Yan Yue <1131531947@qq.com> Date: Tue, 19 May 2026 18:10:47 +0800 Subject: [PATCH 6/6] misc: Record FDIP CI findings Change-Id: I85e97dc4a7dbd7ce8f54de8ba6de40a68e189936 --- ...am-decoupled-fe-fdip-comparison-2026-05-18.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/docs/exec-plans/active/upstream-decoupled-fe-fdip-comparison-2026-05-18.md b/docs/exec-plans/active/upstream-decoupled-fe-fdip-comparison-2026-05-18.md index 19ab20caeb..9604a1c2ac 100644 --- a/docs/exec-plans/active/upstream-decoupled-fe-fdip-comparison-2026-05-18.md +++ b/docs/exec-plans/active/upstream-decoupled-fe-fdip-comparison-2026-05-18.md @@ -114,6 +114,10 @@ - [x] 2026-05-19 16:40 实现 cache-side `FetchDirectedPrefetcher` prototype:BPU/FTQ 发 `FTQInsert`/`FTQRemove` probe,L1I prefetcher 监听 target 生命周期,经 ITB timing translation 后以 `HardPFReq` 进入 cache 侧 snoop/MSHR 路径 - [x] 2026-05-19 16:55 完成 cache-side FDP 的稳定化:默认 
`pfq_size=1`、`tq_size=1`、`min_target_distance=32`、`latency=64`,并在 L1I 使用 FDP 时把 `demand_mshr_reserve` 提到 2,避免 4-entry L1I MSHR 被 FDP 抢占过多 - [x] 2026-05-19 17:00 完成本地 `gcc_typeck_4528`、`gcc_expr2_27` 的 `5M+5M` A/B,结果为小幅正向但很接近噪声;准备以单独实验提交启用 `kmhv3.py` 默认 L1I FDP 后 push CI 观察全套 0.3c +- [x] 2026-05-19 17:10 提交并 push cache-side FDP 实验序列:`58c7bb780c mem-cache: Add cache-side fetch directed prefetcher`、`2957b897f4 configs: Enable FDIP experiment for kmhv3`、`aab29b82b6 misc: Rebuild DRAMSim3 from a clean cache copy` +- [x] 2026-05-19 18:20 CI run `26087243838` 成功完成,归档路径为 `/nfs/home/share/gem5_ci/performance_data/gcc15-spec06-0.3c/20260519_170615_aab29b8_kmhv3_run571` +- [x] 2026-05-19 18:35 完成 CI 对比:cache-side FDP 的 SPEC06 0.3c overall score 为 19.921159,对比 stats-only baseline `20260518_201303_797e2e7_kmhv3_run569` 的 19.926709,约 -0.028% +- [x] 2026-05-19 18:40 在高 I-cache-MPKI 切片 `gcc_s04_7630` 上本地测试 `min_target_distance=32/64/96/128/256`:`32` 会发包并变慢,`64+` 基本完全不发 FDP ## 发现和意外 @@ -141,6 +145,12 @@ - `gcc_typeck_4528` 的 `5M+5M` 测量段结果为 cycles 1824859 -> 1822698(约 -0.12%),I-cache misses 15411 -> 15428,`pfIssued=113`、`pfUseful=41`、`pfUnused=49`、`demandMergedIntoPfMSHR=10`,no-MSHR blocked cycles 4171 -> 6655。 - `gcc_expr2_27` 的 `5M+5M` 测量段结果为 cycles 2410537 -> 2410225(约 -0.013%),I-cache misses 13294 -> 13243,`pfIssued=293`、`pfUseful=146`、`pfUnused=115`、`demandMergedIntoPfMSHR=31`,no-MSHR blocked cycles 12493 -> 14211。 - 目前 cache-side FDP 的局部信号是“有少量有效覆盖,但 MSHR/port 资源代价也可见”。这值得 push 一轮 CI 看全套 SPEC06 0.3c,但还不能说已经是可合入的收益方向。 +- 全套 CI 结果确认 cache-side FDP 初版不是可合入收益方向。对比 stats-only baseline,overall score 19.926709 -> 19.921159(约 -0.028%),Int 18.724052 -> 18.697521(约 -0.142%),FP 20.821866 -> 20.832814(约 +0.053%)。 +- benchmark 级主要负向来自 `omnetpp`(score -0.774%)、`libquantum`(-0.537%)、`milc`(-0.343%)、`sjeng`(-0.237%)、`gcc`(-0.172%)、`perlbench`(-0.171%)、`mcf`(-0.152%);主要正向有 `zeusmp`(+0.762%)、`GemsFDTD`(+0.281%)、`gromacs`(+0.256%)。 +- 143 个切片 raw sum 中,FDP 减少了 4422 个 I-cache overall misses,但增加了 
231269 个 L1I `noMshrBlockedCycles` 和 556832 个 cycles。全套发出 `pfIssued=24700`,`pfUseful=12176`,`demandMergedIntoPfMSHR=2780`,`pfOnlyFill=19818`。这说明机制确实覆盖到了一些将来 demand,但共享 L1I MSHR/端口压力抵消了收益。 +- 最明显负向切片包括 `gcc_s04_7630`(cycles +1.921%、miss +1246、noMSHR +12850、pfIssued 1620、pfUseful 776)、`gcc_typeck_4528`(+0.615%、miss +498、noMSHR +697、pfIssued 679、pfUseful 281)、`gcc_expr2_27`(+0.392%、miss +10、noMSHR +14479、pfIssued 944、pfUseful 319)。也存在正向切片,例如 `gcc_g23_8607`(cycles -0.583%、miss -161,但 noMSHR +6434、pfIssued 791、pfUseful 351)。 +- 在 `gcc_s04_7630` 本地 `5M+5M` 参数探测中,`min_target_distance=32`:cycles 1858483、pfIssued 342、pfUseful 161、noMSHR 2146;`64/96/128/256` 全部不发 FDP,cycles 1836924、noMSHR 1146。当前 target 距离分布很窄,阈值从 32 提到 64 就几乎等于关闭 FDP,说明简单调大距离阈值没有可用余地。 +- 机制判断:当前本地 BPU/FTQ 虽然能提供 target lifecycle,但大多数可预取 target 的 lead time 仍太短;cache-side HardPFReq 又与 demand fetch 共用 L1I MSHR 和下游端口。因此这个初版更像“late prefetch / MSHR competitor”,而不是有效隐藏 I-cache miss latency 的 side-channel。 ## 第一版机制对照 @@ -203,3 +213,9 @@ - Decision: 若继续 FDIP,应优先做 cache-side FDP/snoop 型实现,或至少为 fetch-side prototype 增加“只对 cache/MSHR miss 候选发包”的过滤接口。 - Reason: 上游 FDP 的 TQ/PFQ/cache snoop 生命周期正是当前 prototype 缺失的关键能力;仅靠 target distance、skip-start-block、target-age 过滤无法稳定避免 L1I-hit prefetch 扰动。 - Date: 2026-05-18 +- Decision: 不建议把当前 cache-side FDP 初版作为性能优化合入;`kmhv3.py` 默认启用 FDP 的提交只应视作实验开关。 +- Reason: 全套 SPEC06 0.3c CI 轻微负向,且机制计数显示 I-cache miss 减少不足以抵消 L1I MSHR/端口压力。`min_target_distance` 从 32 调到 64 就几乎完全不发 FDP,说明当前 ahead window 太窄,单纯调参很难获得稳定收益。 +- Date: 2026-05-19 +- Decision: 下一步若继续 FDIP,应转向更强 ahead source 或低优先级/不占 demand MSHR 的 cache-side 机制,而不是继续沿用当前 `HardPFReq` 直接进入 L1I MSHR 的路径。 +- Reason: 有效 prefetch 的必要条件是比 demand 早足够多且不明显抢 demand 资源;当前实现满足 lifecycle/snoop,但没有独立资源或足够提前量。 +- Date: 2026-05-19