From 797e2e7cbc507052c9460c14a969450fff258cd7 Mon Sep 17 00:00:00 2001
From: Yan Yue <1131531947@qq.com>
Date: Mon, 18 May 2026 16:49:39 +0800
Subject: [PATCH 1/6] cpu: Add FDIP opportunity statistics

Change-Id: Idec3aab68bdefcc9256801ba47c8038e618e030e
---
 src/cpu/pred/btb/decoupled_bpred.cc       | 83 ++++++++++++++++++++++-
 src/cpu/pred/btb/decoupled_bpred.hh       | 21 ++++++
 src/cpu/pred/btb/decoupled_bpred_stats.cc | 25 +++++++
 3 files changed, 128 insertions(+), 1 deletion(-)

diff --git a/src/cpu/pred/btb/decoupled_bpred.cc b/src/cpu/pred/btb/decoupled_bpred.cc
index a1fee43d87..fc668212f1 100644
--- a/src/cpu/pred/btb/decoupled_bpred.cc
+++ b/src/cpu/pred/btb/decoupled_bpred.cc
@@ -24,7 +24,9 @@ namespace btb_pred
 void
 DecoupledBPUWithBTB::consumeFetchTarget(unsigned fetched_inst_num, ThreadID tid)
 {
-    ftq.fetching(tid).fetchInstNum = fetched_inst_num;
+    auto &target = ftq.fetching(tid);
+    target.fetchInstNum = fetched_inst_num;
+    recordFdipFetchedTarget(target);
     ftq.finishTarget(tid);
 }
 
@@ -115,6 +117,77 @@ DecoupledBPUWithBTB::DecoupledBPUWithBTB(const DecoupledBPUWithBTBParams &p)
     });
 }
 
+unsigned
+DecoupledBPUWithBTB::fdipCandidateCacheBlocks(const FetchTarget &target) const
+{
+    // Count the cache lines spanned by [startPC, predEndPC); an empty or
+    // inverted range is treated as covering the single byte at startPC.
+    const Addr line_bytes = cpu ? cpu->cacheLineSize() : 64;
+    const Addr line_mask = ~(line_bytes - 1);
+    const Addr last_byte = target.predEndPC > target.startPC ?
+        target.predEndPC - 1 : target.startPC;
+    const Addr first_line = target.startPC & line_mask;
+    const Addr last_line = last_byte & line_mask;
+    return (last_line - first_line) / line_bytes + 1;
+}
+
+uint64_t
+DecoupledBPUWithBTB::fdipTargetAgeCycles(const FetchTarget &target) const
+{
+    // Age is reported as 0 without a CPU or if predTick is in the future.
+    const auto now = curTick();
+    return (cpu && now >= target.predTick) ?
+        static_cast<uint64_t>(cpu->ticksToCycles(now - target.predTick)) : 0;
+}
+
+void
+DecoupledBPUWithBTB::recordFdipCandidateTarget(const FetchTarget &target)
+{
+    const unsigned footprint = fdipCandidateCacheBlocks(target);
+    ++dbpBtbStats.fdipCandidateTargets;  // one more predicted FSQ entry
+    dbpBtbStats.fdipCandidateCacheBlocksTotal += footprint;  // running sum
+    dbpBtbStats.fdipCandidateCacheBlocks.sample(footprint, 1);  // per-entry
+}
+
+void
+DecoupledBPUWithBTB::recordFdipFetchedTarget(const FetchTarget &target)
+{
+    dbpBtbStats.fdipTargetFetchLatency.sample(fdipTargetAgeCycles(target), 1);
+    ++dbpBtbStats.fdipTargetFetched;  // one more entry consumed by fetch
+}
+
+void
+DecoupledBPUWithBTB::recordFdipCommittedTarget(const FetchTarget &target)
+{
+    dbpBtbStats.fdipTargetCommitLatency.sample(fdipTargetAgeCycles(target), 1);
+    ++dbpBtbStats.fdipTargetCommitted;  // one more entry committed
+}
+
+void
+DecoupledBPUWithBTB::recordFdipSquashedTargets(ThreadID tid,
+                                               FetchTargetId firstTargetId,
+                                               FetchTargetId lastTargetId)
+{
+    // Walk the inclusive ID range [firstTargetId, lastTargetId]; an inverted
+    // range never enters the loop. IDs for which ftq.hasTarget() is false
+    // are skipped, so only entries present in the FTQ are sampled. Each
+    // sampled entry contributes its age and bumps the squashed counter.
+    unsigned batch = 0;
+    for (auto id = firstTargetId; id <= lastTargetId; ++id) {
+        if (ftq.hasTarget(id, tid)) {
+            const uint64_t age = fdipTargetAgeCycles(ftq.get(id, tid));
+            dbpBtbStats.fdipTargetSquashLatency.sample(age, 1);
+            ++dbpBtbStats.fdipTargetsSquashed;
+            ++batch;
+        }
+    }
+
+    // A call that found no entries does not record a zero-sized batch.
+    if (batch != 0) {
+        dbpBtbStats.fdipSquashBatchSize.sample(batch, 1);
+    }
+}
+
 
 void
 DecoupledBPUWithBTB::tick()
@@ -380,6 +453,8 @@ DecoupledBPUWithBTB::processNewPrediction(ThreadID tid)
         predTraceManager->write_record(PredictionTrace(ftq.backId(tid), entry));
     }
 
+    recordFdipCandidateTarget(entry);
+
     // 5. Add entry to fetch target queue
     ftq.insert(entry);
     threads[tid].validprediction = false;
@@ -449,6 +524,10 @@ DecoupledBPUWithBTB::handleSquash(ThreadID tid, unsigned target_id,
         dumpFsq("Before control squash");
     }
 
+    if (ftq.backId(tid) > target_id) {
+        recordFdipSquashedTargets(tid, target_id + 1, ftq.backId(tid));
+    }
+
     // Remove targets after the squashed one
     ftq.squashAfter(target_id, tid);
 
@@ -562,6 +641,8 @@ DecoupledBPUWithBTB::commit(unsigned target_id, ThreadID tid)
                 target.startPC, target.exeBranchInfo.pc, target.exeBranchInfo.target, target.predBranchInfo.pc,
                 target.predBranchInfo.target);
 
+        recordFdipCommittedTarget(target);
+
         // Update statistics
         updateStatistics(target);
 
diff --git a/src/cpu/pred/btb/decoupled_bpred.hh b/src/cpu/pred/btb/decoupled_bpred.hh
index 288450001f..8628e2fc4e 100644
--- a/src/cpu/pred/btb/decoupled_bpred.hh
+++ b/src/cpu/pred/btb/decoupled_bpred.hh
@@ -187,6 +187,14 @@ class DecoupledBPUWithBTB : public BPredUnit
      */
     void generateFinalPredAndCreateBubbles(ThreadID tid);
 
+    unsigned fdipCandidateCacheBlocks(const FetchTarget &target) const;
+    uint64_t fdipTargetAgeCycles(const FetchTarget &target) const;
+    void recordFdipCandidateTarget(const FetchTarget &target);
+    void recordFdipFetchedTarget(const FetchTarget &target);
+    void recordFdipCommittedTarget(const FetchTarget &target);
+    void recordFdipSquashedTargets(ThreadID tid, FetchTargetId firstTargetId,
+                                   FetchTargetId lastTargetId);
+
     void clearPreds(ThreadID tid) {
         for (auto &stagePred : threads[tid].predsOfEachStage) {
             stagePred.condTakens.clear();
@@ -287,6 +295,19 @@ class DecoupledBPUWithBTB : public BPredUnit
         statistics::Distribution commitFsqEntryFetchedInsts;
         statistics::Scalar commitFsqEntryOnlyHasOneJump;
 
+        // FDIP opportunity statistics. These do not issue prefetches; they
+        // measure whether predicted FSQ targets live long enough to be useful.
+        statistics::Scalar fdipCandidateTargets;
+        statistics::Distribution fdipCandidateCacheBlocks;
+        statistics::Scalar fdipCandidateCacheBlocksTotal;
+        statistics::Scalar fdipTargetFetched;
+        statistics::Distribution fdipTargetFetchLatency;
+        statistics::Scalar fdipTargetCommitted;
+        statistics::Distribution fdipTargetCommitLatency;
+        statistics::Scalar fdipTargetsSquashed;
+        statistics::Distribution fdipTargetSquashLatency;
+        statistics::Distribution fdipSquashBatchSize;
+
         statistics::Scalar btbHit;
         statistics::Scalar btbMiss;
         statistics::Scalar btbEntriesWithDifferentStart;
diff --git a/src/cpu/pred/btb/decoupled_bpred_stats.cc b/src/cpu/pred/btb/decoupled_bpred_stats.cc
index a64d943428..2c8255c010 100644
--- a/src/cpu/pred/btb/decoupled_bpred_stats.cc
+++ b/src/cpu/pred/btb/decoupled_bpred_stats.cc
@@ -451,6 +451,26 @@ DecoupledBPUWithBTB::DBPBTBStats::DBPBTBStats(
     ADD_STAT(commitFsqEntryHasInsts, statistics::units::Count::get(), "number of insts that commit fsq entries have"),
     ADD_STAT(commitFsqEntryFetchedInsts, statistics::units::Count::get(), "number of insts that commit fsq entries fetched"),
     ADD_STAT(commitFsqEntryOnlyHasOneJump, statistics::units::Count::get(), "number of fsq entries with only one instruction (jump)"),
+    ADD_STAT(fdipCandidateTargets, statistics::units::Count::get(),
+             "number of FSQ entries that could seed FDIP"),
+    ADD_STAT(fdipCandidateCacheBlocks, statistics::units::Count::get(),
+             "cache blocks covered by each FDIP candidate FSQ entry"),
+    ADD_STAT(fdipCandidateCacheBlocksTotal, statistics::units::Count::get(),
+             "total cache blocks covered by FDIP candidate FSQ entries"),
+    ADD_STAT(fdipTargetFetched, statistics::units::Count::get(),
+             "number of FDIP candidate FSQ entries consumed by fetch"),
+    ADD_STAT(fdipTargetFetchLatency, statistics::units::Cycle::get(),
+             "cycles from prediction to fetch consuming the FSQ entry"),
+    ADD_STAT(fdipTargetCommitted, statistics::units::Count::get(),
+             "number of FDIP candidate FSQ entries committed"),
+    ADD_STAT(fdipTargetCommitLatency, statistics::units::Cycle::get(),
+             "cycles from prediction to committing the FSQ entry"),
+    ADD_STAT(fdipTargetsSquashed, statistics::units::Count::get(),
+             "number of younger FDIP candidate FSQ entries removed by squash"),
+    ADD_STAT(fdipTargetSquashLatency, statistics::units::Cycle::get(),
+             "cycles from prediction to squash removal for FDIP candidates"),
+    ADD_STAT(fdipSquashBatchSize, statistics::units::Count::get(),
+             "number of younger FSQ entries removed by each squash"),
     ADD_STAT(btbHit, statistics::units::Count::get(), "btb hits (in predict block)"),
     ADD_STAT(btbMiss, statistics::units::Count::get(), "btb misses (in predict block)"),
     ADD_STAT(btbEntriesWithDifferentStart, statistics::units::Count::get(), "number of btb entries with different start PC"),
@@ -474,6 +494,11 @@ DecoupledBPUWithBTB::DBPBTBStats::DBPBTBStats(
     fsqEntryDist.init(0, fsqSize, 20).flags(statistics::total);
     commitFsqEntryHasInsts.init(0, maxInstsNum >> 1, 1);
     commitFsqEntryFetchedInsts.init(0, maxInstsNum >> 1, 1);
+    fdipCandidateCacheBlocks.init(0, 8, 1);
+    fdipTargetFetchLatency.init(0, 4096, 16);
+    fdipTargetCommitLatency.init(0, 4096, 16);
+    fdipTargetSquashLatency.init(0, 4096, 16);
+    fdipSquashBatchSize.init(0, fsqSize, 1);
     branchClassCounts.init(NumBranchClasses);
     branchClassMisses.init(NumBranchClasses);
     controlSquashByClass.init(NumBranchClasses);

From a7624bfc779daa8fb089e6b875afbc620c321dc2 Mon Sep 17 00:00:00 2001
From: Yan Yue <1131531947@qq.com>
Date: Tue, 19 May 2026 16:11:40 +0800
Subject: [PATCH 2/6] cpu: Add off-by-default FDIP prototype

Change-Id: Id5d5d004a72d7a5a8cec6ac877a5de5e89a78835
---
 src/cpu/o3/BaseO3CPU.py             |  16 ++
 src/cpu/o3/fetch.cc                 | 304 +++++++++++++++++++++++++++-
 src/cpu/o3/fetch.hh                 |  74 +++++++
 src/cpu/pred/btb/decoupled_bpred.hh |   3 +
 src/mem/cache/cache.cc              |   3 +
 5 files changed, 398 insertions(+), 2 deletions(-)

diff --git a/src/cpu/o3/BaseO3CPU.py b/src/cpu/o3/BaseO3CPU.py
index f6f46d85b8..3b3dac27eb 100644
--- a/src/cpu/o3/BaseO3CPU.py
+++ b/src/cpu/o3/BaseO3CPU.py
@@ -255,6 +255,22 @@ def support_take_over(cls):
 
     store_prefetch_train = Param.Bool(True, "Training store prefetcher with store addresses")
 
+    fdip = Param.Bool(False, "Enable fetch-directed instruction prefetch")
+    fdipLookaheadTargets = Param.Unsigned(
+        4, "Number of future FSQ targets to scan for FDIP")
+    fdipMaxPrefetchesPerCycle = Param.Unsigned(
+        2, "Maximum FDIP prefetch requests generated per cycle")
+    fdipMaxBlocksPerTarget = Param.Unsigned(
+        1, "Maximum cache blocks to prefetch for each FDIP target")
+    fdipMinTargetDistance = Param.Unsigned(
+        1, "Minimum distance from the current fetch target to prefetch")
+    fdipMinTargetAgeCycles = Param.Unsigned(
+        0, "Minimum predicted-target age before FDIP prefetches it")
+    fdipSkipTargetStartBlock = Param.Bool(
+        False, "Skip the first cache block of each FDIP target")
+    fdipMaxPendingTranslations = Param.Unsigned(
+        32, "Maximum FDIP translations in flight")
+
     # value predictor
     valuePred = Param.ValuePredictor(NULL, "valuepred unit")
     enableSelectiveVPFlush = Param.Bool(False,
diff --git a/src/cpu/o3/fetch.cc b/src/cpu/o3/fetch.cc
index 21c9cec4e6..f31e2a5fb0 100644
--- a/src/cpu/o3/fetch.cc
+++ b/src/cpu/o3/fetch.cc
@@ -100,6 +100,17 @@ Fetch::Fetch(CPU *_cpu, const BaseO3CPUParams &params)
       retryPkt(),
       retryTid(InvalidThreadID),
       cacheBlkSize(cpu->cacheLineSize()),
+      fdip(params.fdip),
+      fdipLookaheadTargets(params.fdipLookaheadTargets),
+      fdipMaxPrefetchesPerCycle(params.fdipMaxPrefetchesPerCycle),
+      fdipMaxBlocksPerTarget(params.fdipMaxBlocksPerTarget),
+      fdipMinTargetDistance(params.fdipMinTargetDistance),
+      fdipMinTargetAgeCycles(params.fdipMinTargetAgeCycles),
+      fdipSkipTargetStartBlock(params.fdipSkipTargetStartBlock),
+      fdipMaxPendingTranslations(params.fdipMaxPendingTranslations),
+      fdipPendingTranslations(0),
+      fdipPendingPrefetches(0),
+      fdipGeneration(0),
       fetchBufferSize(params.fetchBufferSize),
       fetchQueueSize(params.fetchQueueSize),
       numThreads(params.numThreads),
@@ -277,7 +288,33 @@ Fetch::FetchStatGroup::FetchStatGroup(CPU *cpu, Fetch *fetch)
     ADD_STAT(traceMetaCleanupSquashEntries, statistics::units::Count::get(),
              "Total entries erased by squash/rollback cleanups"),
     ADD_STAT(traceMetaCleanupCommitCalls, statistics::units::Count::get(),
-             "Number of times cleanup was called on successful commit")
+             "Number of times cleanup was called on successful commit"),
+    ADD_STAT(fdipTargetsIdentified, statistics::units::Count::get(),
+             "Number of future FSQ targets considered by FDIP"),
+    ADD_STAT(fdipTargetsAlreadyIssued, statistics::units::Count::get(),
+             "Number of future FSQ targets skipped because FDIP already issued them"),
+    ADD_STAT(fdipBlocksIdentified, statistics::units::Count::get(),
+             "Number of cache blocks identified by FDIP"),
+    ADD_STAT(fdipTranslationsStarted, statistics::units::Count::get(),
+             "Number of FDIP translations started"),
+    ADD_STAT(fdipTranslationThrottled, statistics::units::Count::get(),
+             "Number of FDIP translations blocked by the pending translation limit"),
+    ADD_STAT(fdipTranslationFaults, statistics::units::Count::get(),
+             "Number of FDIP translations that faulted"),
+    ADD_STAT(fdipPrefetchesIssued, statistics::units::Count::get(),
+             "Number of FDIP prefetch packets issued to the I-cache"),
+    ADD_STAT(fdipPrefetchesDropped, statistics::units::Count::get(),
+             "Number of queued FDIP prefetch packets dropped before reaching the I-cache"),
+    ADD_STAT(fdipPrefetchRetriesQueued, statistics::units::Count::get(),
+             "Number of FDIP prefetch packets queued for I-cache retry"),
+    ADD_STAT(fdipPrefetchRetriesSent, statistics::units::Count::get(),
+             "Number of FDIP prefetch retry packets accepted by the I-cache"),
+    ADD_STAT(fdipPrefetchResponses, statistics::units::Count::get(),
+             "Number of FDIP prefetch responses received"),
+    ADD_STAT(fdipStaleTranslations, statistics::units::Count::get(),
+             "Number of FDIP translations discarded after squash or reset"),
+    ADD_STAT(fdipStalePrefetchResponses, statistics::units::Count::get(),
+             "Number of FDIP prefetch responses discarded after squash or reset")
 {
         icacheStallCycles
             .prereq(icacheStallCycles);
@@ -315,6 +352,32 @@ Fetch::FetchStatGroup::FetchStatGroup(CPU *cpu, Fetch *fetch)
             .prereq(icacheSquashes);
         tlbSquashes
             .prereq(tlbSquashes);
+        fdipTargetsIdentified
+            .prereq(fdipTargetsIdentified);
+        fdipTargetsAlreadyIssued
+            .prereq(fdipTargetsAlreadyIssued);
+        fdipBlocksIdentified
+            .prereq(fdipBlocksIdentified);
+        fdipTranslationsStarted
+            .prereq(fdipTranslationsStarted);
+        fdipTranslationThrottled
+            .prereq(fdipTranslationThrottled);
+        fdipTranslationFaults
+            .prereq(fdipTranslationFaults);
+        fdipPrefetchesIssued
+            .prereq(fdipPrefetchesIssued);
+        fdipPrefetchesDropped
+            .prereq(fdipPrefetchesDropped);
+        fdipPrefetchRetriesQueued
+            .prereq(fdipPrefetchRetriesQueued);
+        fdipPrefetchRetriesSent
+            .prereq(fdipPrefetchRetriesSent);
+        fdipPrefetchResponses
+            .prereq(fdipPrefetchResponses);
+        fdipStaleTranslations
+            .prereq(fdipStaleTranslations);
+        fdipStalePrefetchResponses
+            .prereq(fdipStalePrefetchResponses);
         nisnDist
             .init(/* base value */ 0,
               /* last value */ fetch->fetchWidth,
@@ -424,6 +487,8 @@ Fetch::resetStage()
     numInst = 0;
     interruptPending = false;
     cacheBlocked = false;
+    ++fdipGeneration;
+    discardFdipRetryPackets();
 
     priorityList.clear();
 
@@ -438,6 +503,7 @@ Fetch::resetStage()
 
         threads[tid].reset();
         ftqEntryFetchedInsts[tid] = 0;
+        fdipIssuedTargets[tid].clear();
 
         fetchQueue[tid].clear();
 
@@ -523,6 +589,220 @@ Fetch::handleMultiCacheLineFetch(Addr vaddr, ThreadID tid, Addr pc)
     return true;
 }
 
+void
+Fetch::issueFdipPrefetches(ThreadID tid)
+{
+    // Idle when FDIP is off, in trace mode, while the I-cache is blocked or
+    // FDIP retries are queued, or when the thread has no fetching target.
+    if (!fdip || isTraceMode() || cacheBlocked || !fdipRetryPkt.empty() ||
+        fetchStatus[tid] != Running || !dbpbtb->ftqHasFetching(tid)) {
+        return;
+    }
+
+    // Hold off while the thread's own cache request is still waiting.
+    const CacheRequestStatus cache_status =
+        threads[tid].cacheReq.getOverallStatus();
+    if (cache_status == TlbWait || cache_status == CacheWaitRetry ||
+        cache_status == CacheWaitResponse) {
+        return;
+    }
+
+    auto &issued_targets = fdipIssuedTargets[tid];
+    if (issued_targets.size() > 4096) {
+        issued_targets.clear();
+    }
+
+    const auto fetch_id = dbpbtb->ftqHeadId(tid);
+    const auto back_id = dbpbtb->ftqBackId(tid);
+    if (back_id <= fetch_id || fdipMaxPrefetchesPerCycle == 0 ||
+        fdipMaxBlocksPerTarget == 0) {
+        return;
+    }
+
+    unsigned issued_this_cycle = 0;
+    const Addr blk_mask = ~(Addr(cacheBlkSize) - 1);
+    const auto first_id = fetch_id + fdipMinTargetDistance;
+    const auto last_id = std::min(back_id, fetch_id + fdipLookaheadTargets);
+
+    for (auto target_id = first_id; target_id <= last_id; ++target_id) {
+        if (!dbpbtb->ftqHasTarget(target_id, tid)) {
+            continue;
+        }
+        if (issued_targets.count(target_id)) {
+            ++fetchStats.fdipTargetsAlreadyIssued;
+            continue;
+        }
+
+        const auto &target = dbpbtb->ftqTarget(target_id, tid);
+        if (fdipMinTargetAgeCycles > 0) {
+            const uint64_t target_age = curTick() < target.predTick ? 0 :
+                static_cast<uint64_t>(
+                    cpu->ticksToCycles(curTick() - target.predTick));
+            if (target_age < fdipMinTargetAgeCycles) {
+                continue;
+            }
+        }
+
+        // An empty or inverted PC range still covers the start block.
+        Addr end_pc = target.predEndPC;
+        if (end_pc <= target.startPC) {
+            end_pc = target.startPC + 1;
+        }
+        Addr block = target.startPC & blk_mask;
+        const Addr end_block = (end_pc - 1) & blk_mask;
+        if (fdipSkipTargetStartBlock) {
+            block += cacheBlkSize;
+        }
+        // Only possible when the start block was skipped just above.
+        const bool nothing_to_prefetch = block > end_block;
+        ++fetchStats.fdipTargetsIdentified;
+        unsigned blocks_for_target = 0;
+        while (block <= end_block &&
+               blocks_for_target < fdipMaxBlocksPerTarget &&
+               issued_this_cycle < fdipMaxPrefetchesPerCycle) {
+            ++fetchStats.fdipBlocksIdentified;
+            if (!startFdipTranslation(tid, block, target.startPC)) {
+                break;
+            }
+            ++issued_this_cycle;
+            ++blocks_for_target;
+            block += cacheBlkSize;
+        }
+
+        // Mark served or empty targets. A target whose first translation
+        // was refused stays unmarked so a later cycle can retry it.
+        if (blocks_for_target > 0 || nothing_to_prefetch) {
+            issued_targets.insert(target_id);
+        }
+        if (issued_this_cycle >= fdipMaxPrefetchesPerCycle) {
+            break;
+        }
+    }
+}
+
+bool
+Fetch::startFdipTranslation(ThreadID tid, Addr vaddr, Addr pc)
+{
+    if (fdipPendingTranslations >= fdipMaxPendingTranslations) {
+        ++fetchStats.fdipTranslationThrottled;
+        return false;
+    }
+
+    // Build a block-sized instruction prefetch request tagged as prefetcher.
+    auto req = std::make_shared<Request>(
+        vaddr, cacheBlkSize, Request::INST_FETCH | Request::PREFETCH,
+        cpu->instRequestorId(), pc, cpu->thread[tid]->contextId());
+    req->taskId(context_switch_task_id::Prefetcher);
+    // Account for the translation before handing it to the MMU.
+    ++fetchStats.fdipTranslationsStarted;
+    ++fdipPendingTranslations;
+    cpu->mmu->translateTiming(
+        req, cpu->thread[tid]->getTC(),
+        new FdipTranslation(this, fdipGeneration), BaseMMU::Execute);
+    return true;
+}
+
+void
+Fetch::finishFdipTranslation(const Fault &fault, const RequestPtr &mem_req,
+                             uint64_t generation)
+{
+    assert(fdipPendingTranslations > 0);
+    --fdipPendingTranslations;
+
+    // Discard results that are stale, faulted, or not backed by memory.
+    if (generation != fdipGeneration || cpu->switchedOut()) {
+        ++fetchStats.fdipStaleTranslations;
+        return;
+    }
+    if (fault != NoFault) {
+        ++fetchStats.fdipTranslationFaults;
+        return;
+    }
+    if (!cpu->system->isMemAddr(mem_req->getPaddr())) {
+        ++fetchStats.fdipPrefetchesDropped;
+        return;
+    }
+
+    PacketPtr pkt = new Packet(mem_req, MemCmd::SoftPFReq);
+    pkt->allocate();
+    pkt->setSendRightAway();
+    fdipPacketGenerations[mem_req] = generation;
+
+    const bool sent = !cacheBlocked && fdipRetryPkt.empty() &&
+        icachePort.sendTimingReq(pkt);
+    if (sent) {
+        ++fetchStats.fdipPrefetchesIssued;
+        ++fdipPendingPrefetches;
+    } else {
+        ++fetchStats.fdipPrefetchRetriesQueued;
+        fdipRetryPkt.push_back(pkt);
+    }
+}
+
+void
+Fetch::completeFdipPrefetch(PacketPtr pkt)
+{
+    // A response is current only if its request is tracked under the
+    // present generation; a tracking entry that is found is always erased.
+    bool current = false;
+    const auto tracked = fdipPacketGenerations.find(pkt->req);
+    if (tracked != fdipPacketGenerations.end()) {
+        current = tracked->second == fdipGeneration;
+        fdipPacketGenerations.erase(tracked);
+    }
+
+    assert(fdipPendingPrefetches > 0);
+    --fdipPendingPrefetches;
+
+    auto &counter = current ? fetchStats.fdipPrefetchResponses :
+        fetchStats.fdipStalePrefetchResponses;
+    ++counter;
+    delete pkt;
+}
+
+void
+Fetch::retryFdipPrefetches()
+{
+    if (!fdip || cacheBlocked || fdipRetryPkt.empty()) {
+        return;
+    }
+
+    // Resend queued packets in order until the I-cache rejects one.
+    for (auto it = fdipRetryPkt.begin(); it != fdipRetryPkt.end();) {
+        PacketPtr pkt = *it;
+        auto gen_it = fdipPacketGenerations.find(pkt->req);
+        if (gen_it == fdipPacketGenerations.end() ||
+            gen_it->second != fdipGeneration) {
+            // Stale packet never reaches the I-cache: count it as dropped.
+            if (gen_it != fdipPacketGenerations.end()) {
+                fdipPacketGenerations.erase(gen_it);
+            }
+            ++fetchStats.fdipPrefetchesDropped;
+            delete pkt;
+            it = fdipRetryPkt.erase(it);
+            continue;
+        }
+        if (!icachePort.sendTimingReq(pkt)) {
+            break;
+        }
+        ++fetchStats.fdipPrefetchesIssued;
+        ++fetchStats.fdipPrefetchRetriesSent;
+        ++fdipPendingPrefetches;
+        it = fdipRetryPkt.erase(it);
+    }
+}
+
+void
+Fetch::discardFdipRetryPackets()
+{
+    fetchStats.fdipPrefetchesDropped += fdipRetryPkt.size();  // all dropped
+    for (PacketPtr queued : fdipRetryPkt) {
+        fdipPacketGenerations.erase(queued->req);
+        delete queued;
+    }
+    fdipRetryPkt.clear();
+}
+
 bool
 Fetch::processMultiCacheLineCompletion(ThreadID tid, PacketPtr pkt)
 {
@@ -650,9 +930,12 @@ Fetch::drainSanityCheck() const
 {
     assert(isDrained());
     assert(retryPkt.size() == 0);
+    assert(fdipRetryPkt.size() == 0);
     assert(retryTid == InvalidThreadID);
     assert(!cacheBlocked);
     assert(!interruptPending);
+    assert(fdipPendingTranslations == 0);
+    assert(fdipPendingPrefetches == 0);
 
     for (ThreadID i = 0; i < numThreads; ++i) {
         assert(threads[i].cacheReq.packets.empty());
@@ -686,7 +969,10 @@ Fetch::isDrained() const
      * cycle if the finish translation event is scheduled, so make
      * sure that's not the case.
      */
-    return !finishTranslationEvent.scheduled();
+    return !finishTranslationEvent.scheduled() &&
+        fdipPendingTranslations == 0 &&
+        fdipPendingPrefetches == 0 &&
+        fdipRetryPkt.empty();
 }
 
 void
@@ -1105,6 +1391,9 @@ Fetch::doSquash(PCStateBase &new_pc, const DynInstPtr squashInst, const InstSeqN
     // Force a new I-cache request for the next FTQ head after squash.
     threads[tid].valid = false;
     ftqEntryFetchedInsts[tid] = 0;
+    ++fdipGeneration;
+    discardFdipRetryPackets();
+    fdipIssuedTargets[tid].clear();
 
     if (traceFetch) {
         traceFetch->handleTraceSquash(tid, new_pc, squashInst, seqNum);
@@ -1207,6 +1496,10 @@ Fetch::tick()
 
     // Perform fetch operations and instruction delivery
     fetchAndProcessInstructions(status_change);
+
+    for (auto tid : *activeThreads) {
+        issueFdipPrefetches(tid);
+    }
 }
 
 bool
@@ -2019,6 +2312,7 @@ Fetch::recvReqRetry()
         // Access has been squashed since it was sent out.  Just clear
         // the cache being blocked.
         cacheBlocked = false;
+        retryFdipPrefetches();
         return;
     }
     assert(cacheBlocked);
@@ -2043,6 +2337,7 @@ Fetch::recvReqRetry()
     if (retryPkt.size() == 0) {
         retryTid = InvalidThreadID;
         cacheBlocked = false;
+        retryFdipPrefetches();
     }
 }
 
@@ -2108,6 +2403,11 @@ Fetch::IcachePort::recvTimingResp(PacketPtr pkt)
     DPRINTF(Fetch, "received pkt addr=%#lx, req addr=%#lx\n", pkt->getAddr(),
             pkt->req->getVaddr());
 
+    if (pkt->cmd == MemCmd::SoftPFResp || pkt->cmd == MemCmd::HardPFResp) {
+        fetch->completeFdipPrefetch(pkt);
+        return true;
+    }
+
     fetch->processCacheCompletion(pkt);
 
     return true;
diff --git a/src/cpu/o3/fetch.hh b/src/cpu/o3/fetch.hh
index 19091ef30e..72977f89ee 100644
--- a/src/cpu/o3/fetch.hh
+++ b/src/cpu/o3/fetch.hh
@@ -44,6 +44,8 @@
 #include <cstring>
 #include <deque>
 #include <memory>
+#include <unordered_map>
+#include <unordered_set>
 #include <utility>
 
 #include "arch/generic/decoder.hh"
@@ -132,6 +134,29 @@ class Fetch
         }
     };
 
+    /** One-shot MMU callback: hands an FDIP translation result to Fetch
+     *  with the generation captured at construction, then deletes itself. */
+    class FdipTranslation : public BaseMMU::Translation
+    {
+      protected:
+        Fetch *fetch;
+        uint64_t generation;
+      public:
+        FdipTranslation(Fetch *_fetch, uint64_t _generation)
+            : fetch(_fetch), generation(_generation) {}
+
+        void markDelayed() override {}
+
+        void
+        finish(const Fault &fault, const RequestPtr &req,
+            gem5::ThreadContext *tc, BaseMMU::Mode mode) override
+        {
+            assert(mode == BaseMMU::Execute);
+            fetch->finishFdipTranslation(fault, req, generation);
+            delete this;  // created with new in startFdipTranslation()
+        }
+    };
+
   private:
     /* Event to delay delivery of a fetch translation result in case of
      * a fault and the nop to carry the fault cannot be generated
@@ -421,6 +446,25 @@ class Fetch
      */
     bool processMultiCacheLineCompletion(ThreadID tid, PacketPtr pkt);
 
+    /** Try to issue fetch-directed instruction prefetches from queued FSQ targets. */
+    void issueFdipPrefetches(ThreadID tid);
+
+    /** Start an address translation for a single FDIP cache block. */
+    bool startFdipTranslation(ThreadID tid, Addr vaddr, Addr pc);
+
+    /** Finish an FDIP translation and send the prefetch if it succeeded. */
+    void finishFdipTranslation(const Fault &fault, const RequestPtr &mem_req,
+                               uint64_t generation);
+
+    /** Complete and discard an FDIP response packet. */
+    void completeFdipPrefetch(PacketPtr pkt);
+
+    /** Try to send queued FDIP prefetches after the I-cache port retries. */
+    void retryFdipPrefetches();
+
+    /** Drop queued FDIP retry packets that have not reached the cache. */
+    void discardFdipRetryPackets();
+
 
     /** Check if an interrupt is pending and that we need to handle
      */
@@ -666,6 +710,22 @@ class Fetch
     /** Cache block size. */
     unsigned int cacheBlkSize;
 
+    const bool fdip;
+    const unsigned fdipLookaheadTargets;
+    const unsigned fdipMaxPrefetchesPerCycle;
+    const unsigned fdipMaxBlocksPerTarget;
+    const unsigned fdipMinTargetDistance;
+    const unsigned fdipMinTargetAgeCycles;
+    const bool fdipSkipTargetStartBlock;
+    const unsigned fdipMaxPendingTranslations;
+    unsigned fdipPendingTranslations;
+    unsigned fdipPendingPrefetches;
+    uint64_t fdipGeneration;
+    std::vector<PacketPtr> fdipRetryPkt;
+    std::unordered_map<RequestPtr, uint64_t> fdipPacketGenerations;
+    std::unordered_set<branch_prediction::btb_pred::FetchTargetId>
+        fdipIssuedTargets[MaxThreads];
+
     // Constants for misaligned fetch handling
     static constexpr unsigned CACHE_LINE_SIZE_BYTES = 64;
 
@@ -1105,6 +1165,20 @@ class Fetch
         statistics::Scalar traceMetaCleanupSquashEntries;
         /** Number of times cleanup was called on successful commit. */
         statistics::Scalar traceMetaCleanupCommitCalls;
+
+        statistics::Scalar fdipTargetsIdentified;
+        statistics::Scalar fdipTargetsAlreadyIssued;
+        statistics::Scalar fdipBlocksIdentified;
+        statistics::Scalar fdipTranslationsStarted;
+        statistics::Scalar fdipTranslationThrottled;
+        statistics::Scalar fdipTranslationFaults;
+        statistics::Scalar fdipPrefetchesIssued;
+        statistics::Scalar fdipPrefetchesDropped;
+        statistics::Scalar fdipPrefetchRetriesQueued;
+        statistics::Scalar fdipPrefetchRetriesSent;
+        statistics::Scalar fdipPrefetchResponses;
+        statistics::Scalar fdipStaleTranslations;
+        statistics::Scalar fdipStalePrefetchResponses;
     } fetchStats;
 
     SquashVersion localSquashVer;
diff --git a/src/cpu/pred/btb/decoupled_bpred.hh b/src/cpu/pred/btb/decoupled_bpred.hh
index 8628e2fc4e..3096abe46d 100644
--- a/src/cpu/pred/btb/decoupled_bpred.hh
+++ b/src/cpu/pred/btb/decoupled_bpred.hh
@@ -424,6 +424,9 @@ class DecoupledBPUWithBTB : public BPredUnit
     // Fetch-facing interface: consume FSQ head directly (RTL-like single queue).
     bool ftqHasFetching(ThreadID tid) const { return ftq.hasTarget(ftq.fetchId(tid), tid); }
     FetchTargetId ftqHeadId(ThreadID tid) const { assert(ftqHasFetching(tid)); return ftq.fetchId(tid); }
+    FetchTargetId ftqBackId(ThreadID tid) const { assert(ftqHasFetching(tid)); return ftq.backId(tid); }
+    bool ftqHasTarget(FetchTargetId target_id, ThreadID tid) const { return ftq.hasTarget(target_id, tid); }
+    const FetchTarget &ftqTarget(FetchTargetId target_id, ThreadID tid) { return ftq.get(target_id, tid); }
     const FetchTarget &ftqFetchingTarget(ThreadID tid) { assert(ftqHasFetching(tid)); return ftq.fetching(tid); }
 
     void dumpFsq(const char *when);
diff --git a/src/mem/cache/cache.cc b/src/mem/cache/cache.cc
index f59d0731df..acc9827335 100644
--- a/src/mem/cache/cache.cc
+++ b/src/mem/cache/cache.cc
@@ -404,6 +404,9 @@ Cache::handleTimingReqMiss(PacketPtr pkt, CacheBlk *blk, Tick forward_time,
         // If an outstanding request is in progress (we found an
         // MSHR) this is set to null
         pkt = pf;
+        if (pkt == nullptr) {
+            return;
+        }
     }
 
     WriteQueueEntry *wb_entry = writeBuffer.findMatch(pkt->getAddr(),

From 58c7bb780c3fed076908c97c29bbe54636a3db96 Mon Sep 17 00:00:00 2001
From: Yan Yue <1131531947@qq.com>
Date: Tue, 19 May 2026 16:56:39 +0800
Subject: [PATCH 3/6] mem-cache: Add cache-side fetch directed prefetcher

Change-Id: If14984c470393aa6cb22e89f2c1572401232183d
---
 configs/common/CacheConfig.py                 |   3 +
 configs/common/PrefetcherConfig.py            |   7 +-
 ...decoupled-fe-fdip-comparison-2026-05-18.md | 205 +++++++++
 src/cpu/o3/cpu.cc                             |   6 +
 src/cpu/o3/cpu.hh                             |   5 +
 src/cpu/pred/btb/decoupled_bpred.cc           |  40 +-
 src/cpu/pred/btb/decoupled_bpred.hh           |   5 +
 src/cpu/pred/btb/fdip_target.hh               |  53 +++
 src/mem/cache/prefetch/Prefetcher.py          |  30 ++
 src/mem/cache/prefetch/SConscript             |   4 +-
 src/mem/cache/prefetch/fdp.cc                 | 392 ++++++++++++++++++
 src/mem/cache/prefetch/fdp.hh                 | 179 ++++++++
 12 files changed, 926 insertions(+), 3 deletions(-)
 create mode 100644 docs/exec-plans/active/upstream-decoupled-fe-fdip-comparison-2026-05-18.md
 create mode 100644 src/cpu/pred/btb/fdip_target.hh
 create mode 100644 src/mem/cache/prefetch/fdp.cc
 create mode 100644 src/mem/cache/prefetch/fdp.hh

diff --git a/configs/common/CacheConfig.py b/configs/common/CacheConfig.py
index 3adf07fe8c..cdc7218663 100644
--- a/configs/common/CacheConfig.py
+++ b/configs/common/CacheConfig.py
@@ -69,6 +69,9 @@ def _get_cache_opts(cpu, level, options):
     prefetcher_attr = '{}_hwp_type'.format(level)
     if hasattr(options, prefetcher_attr) and (not options.no_pf):
         opts['prefetcher'] = create_prefetcher(cpu, level, options)
+        if level == 'l1i' and getattr(options, prefetcher_attr) == \
+                'FetchDirectedPrefetcher':
+            opts['demand_mshr_reserve'] = 2
 
     return opts
 
diff --git a/configs/common/PrefetcherConfig.py b/configs/common/PrefetcherConfig.py
index bd19bbbe0a..afe5efbd0e 100644
--- a/configs/common/PrefetcherConfig.py
+++ b/configs/common/PrefetcherConfig.py
@@ -29,7 +29,12 @@ def create_prefetcher(cpu, cache_level, options):
         return NULL
 
     if cpu != NULL:
-        prefetcher.registerTLB(cpu.mmu.dtb, cpu.mmu.functional)
+        prefetcher.registerTLB(
+            cpu.mmu.itb if cache_level == 'l1i' else cpu.mmu.dtb,
+            cpu.mmu.functional)
+
+    if prefetcher_name == 'FetchDirectedPrefetcher':
+        prefetcher.cpu = cpu
 
     if prefetcher_name == 'XSCompositePrefetcher':
         if options.l1d_enable_spp:
diff --git a/docs/exec-plans/active/upstream-decoupled-fe-fdip-comparison-2026-05-18.md b/docs/exec-plans/active/upstream-decoupled-fe-fdip-comparison-2026-05-18.md
new file mode 100644
index 0000000000..19ab20caeb
--- /dev/null
+++ b/docs/exec-plans/active/upstream-decoupled-fe-fdip-comparison-2026-05-18.md
@@ -0,0 +1,205 @@
+# 对比上游 decoupled frontend / FDIP 与当前 Kunminghu 前端
+
+## 背景和目标
+
+当前 `/nfs/home/yanyue/workspace/GEM5-raw` 是已经更新到 `develop` 的上游 gem5 仓库，和当前主要工作仓库 `/nfs/home/yanyue/workspace/GEM5_review` 分叉超过三年。上游已经合入通用 O3 decoupled frontend、Fetch Directed Prefetcher（FDP/FDIP 类似机制）以及相关配置；本仓库则长期演进出 Kunminghu v3 对齐的 decoupled BTB 前端、FTQ/FSQ、BTBTAGE、MGSC、AheadBTB、MicroTAGE 和本地 prefetch 体系。
+
+本任务的目标不是直接移植上游实现，而是建立一份可持续更新的机制对照，回答：
+
+- 上游 decoupled frontend 和 FDP 的核心设计是什么，真实入口在哪里
+- 当前 Kunminghu v3 前端已经覆盖了哪些能力，结构上有哪些不同
+- 上游哪些特征值得借鉴，分别适合进入 BPU、Fetch/FTQ、ICache prefetch、统计观测或配置层
+- 哪些特征只适合上游 ARM/O3 通用路径，不适合直接搬到当前 RTL-aligned 路径
+- 后续若要尝试，应如何拆成低风险、可验证的小实验
+
+最终产出应是一份分层推荐，而不是单一结论：先给出机制级差异和候选特征，再按收益潜力、接入风险、验证成本排序。
+
+## 当前已知信息
+
+- 当前仓库状态：
+  - `GEM5_review` 位于 `xs-dev...origin/xs-dev`，当前未见工作区改动
+  - `GEM5-raw` 位于 `develop...origin/develop`，存在用户已有的 `.gitignore` 修改，应保持只读对比
+- 本仓库的架构文档明确指出，当前活跃路径是 `configs/example/kmhv3.py` 选择的 XiangShan/Kunminghu v3 O3 + decoupled BTB frontend；代码真实入口主要在 `src/cpu/o3/fetch.*` 和 `src/cpu/pred/btb/`
+- 本仓库已有设计文档索引位于 `docs/design-docs/frontend/README.md`，相关主题包括 `bpu_top_level.md`、`mbtb_design.md`、`btb_tage_design.md`、`mgsc_design.md`、`ubtb_design.md`、`abtb_design.md`、`microtage_design.md`
+- 上游 decoupled frontend 关键提交包括：
+  - `719c799dc3 cpu: Implement decoupled front-end`
+  - `145efd442b mem-cache: Add fetch directed prefetcher`
+  - `38c7e348a5 mem-cache: Prefetch for all cache blocks in a Fetch Target`
+  - `d55f46336f cpu: Branch predictor latency and overriding model for the decoupled frontend`
+  - `e676195c5f cpu-o3,stdlib: Stdlib configs for decoupled FE`
+- 上游实现的初步入口：
+  - `src/cpu/o3/bac.*`：branch address calculation / FTQ producer
+  - `src/cpu/o3/ftq.*`：通用 FetchTarget / FTQ
+  - `src/cpu/o3/fetch.*`：消费 FTQ 的 fetch stage
+  - `src/mem/cache/prefetch/fdp.*`：Fetch Directed Prefetcher
+  - `src/cpu/o3/BaseO3CPU.py`：`decoupledFrontEnd`、`numFTQEntries`、`fetchTargetWidth` 等参数
+  - `configs/common/cores/arm/neoverse_v2.py`：上游 ARM 配置示例启用 decoupled frontend
+- 本仓库当前实现的初步入口：
+  - `src/cpu/pred/btb/decoupled_bpred.*`
+  - `src/cpu/pred/btb/ftq.*`
+  - `src/cpu/pred/btb/mbtb.*`
+  - `src/cpu/pred/btb/abtb.*`
+  - `src/cpu/pred/btb/microtage.*`
+  - `src/cpu/pred/btb/btb_tage.*`
+  - `src/cpu/pred/btb/btb_mgsc.*`
+  - `src/cpu/o3/fetch.*`
+  - `configs/example/kmhv3.py`
+
+## 假设和开放问题
+
+- 假设：上游 FDP 最有可能被借鉴的部分不是预测算法本身，而是 `FTQInsert` / `FTQRemove` probe 驱动的生命周期、翻译队列、squash、cache snoop、按 FetchTarget 覆盖所有 cache block 的策略。
+- 假设：上游 decoupled frontend 的 `BAC -> FTQ -> Fetch` 切分，和本仓库的 `BPU 自产 FTQ/FSQ -> Fetch` 切分不同；直接搬上游 BAC 价值有限，但其 surprise branch、prediction latency、override/resteer 建模可能有参考价值。
+- 假设：本仓库已有更强的 Kunminghu 专用 BPU 结构，所以上游 ARM/Neoverse 配置中的 predictor 组合和宽度参数不能直接作为性能方向。
+- 开放问题：本仓库是否已有 FDIP-like 机制或预留接口，只是命名不同；需要从 `fetch.md`、`fetch.cc`、prefetcher 配置和 cache probe 接线确认。
+- 开放问题：上游 FDP 的虚拟地址翻译、跨页 prefetch、cache snoop 和 request 标记，与本仓库 RISC-V FS/checkpoint/difftest 路径是否存在副作用。
+- 开放问题：上游 `d55f46336f` 的 branch predictor latency / overriding model 与当前 `AheadBTB`、`MicroTAGE`、`numOverrideBubbles` 的关系是什么，能否用于改进观测或实验建模。
+
+## 计划步骤
+
+1. 梳理上游实现骨架，输出文件级结构图和参数表
+   - 目标：确认上游 decoupled FE / FDP 的真实控制流
+   - 产出：关键文件、类、参数、probe、stats 列表
+
+2. 梳理当前 Kunminghu v3 本地基线
+   - 目标：确认本仓库已有的 FTQ/FSQ、fetch target 生命周期、BPU update、ICache/prefetch 接口
+   - 产出：本地对应文件、已有能力、缺口列表
+
+3. 建立机制对照表
+   - 目标：把上游特征映射到本地模块边界
+   - 产出：`上游特征 -> 本地现状 -> 可借鉴方式 -> 风险 -> 验证` 表格
+
+4. 选出第一批候选特征
+   - 目标：优先挑不改变预测语义、主要增加观测或可控实验能力的低风险项
+   - 产出：推荐优先级和每项最小 patch 思路
+
+5. 定义验证路径
+   - 目标：给每个候选项配一个最小可复现验证，而不是只靠直觉
+   - 产出：本地 smoke/unit 测试、micro-test 或 SPEC slice 方案
+
+6. 如用户确认方向，再拆分实现阶段
+   - 目标：把长期任务拆成可 review 的独立提交
+   - 产出：每阶段改动范围、预期收益和回退方式
+
+## 验证
+
+本阶段是分析与方案阶段，完成标准是：
+
+- 已确认上游和本地各自的关键代码入口和控制流
+- 已明确哪些上游特征可以借鉴，哪些不建议直接搬
+- 每个推荐候选都有本地接入点、风险说明和验证计划
+- 若进入代码阶段，优先从观测/配置/低语义风险 patch 开始，并用本仓库既有 BPU 单测、fetch smoke、代表 SPEC slice 或 CI 统计对照验证
+
+## 进展
+
+- [x] 2026-05-18 16:20 创建 ExecPlan，记录任务边界、已知入口和初始假设
+- [x] 2026-05-18 16:20 确认两个仓库分支状态：`GEM5_review` 在 `xs-dev`，`GEM5-raw` 在 `develop`
+- [x] 2026-05-18 16:20 初步定位上游 decoupled FE/FDP 关键提交和入口文件
+- [x] 2026-05-18 16:35 梳理上游 `BAC/FTQ/Fetch/FDP` 详细控制流，确认 FDP 主要通过 FTQ probe 驱动
+- [x] 2026-05-18 16:40 梳理本地 `DecoupledBPUWithBTB/FTQ/Fetch/Prefetch` 对应能力，确认当前没有 BPU/FTQ 驱动的 FDIP-like instruction prefetch
+- [x] 2026-05-18 16:45 形成第一版机制对照和候选特征推荐
+- [x] 2026-05-18 16:55 选择第一项低风险实验：先做 FDIP observability / target lifecycle stats，不直接发 instruction prefetch
+- [x] 2026-05-18 16:55 新建分支 `fdip-align`
+- [x] 2026-05-18 16:58 提交 `797e2e7cbc cpu: Add FDIP opportunity statistics`
+- [x] 2026-05-18 16:59 push 到 `origin/fdip-align`，触发 CI run `26023246359`
+- [x] 2026-05-18 17:55 扫描 `gcc15-spec06-0.3c` 最近归档，确认 ICache miss 最高的一批切片主要来自 gcc
+- [x] 2026-05-18 18:15 本地完成 `gcc_typeck_4528`、`gcc_expr2_27` 两个切片的 FDIP 观测统计
+- [x] 2026-05-18 18:15 尝试本地跑最高 miss 的 `gcc_s04_7630`，但 `gem5.opt` 30 分钟超时，只保留 reset 前 20M 指令窗口作为参考，不作为最终 ROI 结论
+- [x] 2026-05-18 18:20 调整后续本地实验口径：优先跑短窗口 `5M+5M` 或 `10M+10M`，并发跑多个切片，先看趋势再决定是否扩展到完整 40M
+- [x] 2026-05-18 19:15 实现 fetch 侧 FDIP prototype：从未来 FTQ target 取 `startPC -> predEndPC`，经 ITLB 翻译后向 L1I 发 `SoftPFReq`
+- [x] 2026-05-18 19:35 修复 prototype 稳定性问题：补 FDIP retry 队列、squash/reset generation、drain 等待、stale response 丢弃，并修复 cache 里 SoftPFReq 合并到已有 MSHR 后 `pkt == nullptr` 的返回路径
+- [x] 2026-05-18 19:40 本地完成 4 个 gcc 高 I-cache-miss 候选切片的 `5M+5M` A/B：`gcc_typeck_4528`、`gcc_expr2_27`、`gcc_200_28`、`gcc_expr_4892`
+- [x] 2026-05-18 19:55 试过三类过滤：target distance、跳过 target start block、target age >= 16 cycles；均未形成稳定正收益
+- [x] 2026-05-18 20:05 结论：不 push FDIP 行为 patch 到 CI；保留本地 off-by-default prototype 和实验数据，下一步应转向 cache-side FDP/snoop 或更强过滤
+- [x] 2026-05-19 16:05 提交 `a7624bfc77 cpu: Add off-by-default FDIP prototype`，把 fetch 侧 prototype 固化为本地备选但默认关闭
+- [x] 2026-05-19 16:40 实现 cache-side `FetchDirectedPrefetcher` prototype：BPU/FTQ 发 `FTQInsert`/`FTQRemove` probe，L1I prefetcher 监听 target 生命周期，经 ITB timing translation 后以 `HardPFReq` 进入 cache 侧 snoop/MSHR 路径
+- [x] 2026-05-19 16:55 完成 cache-side FDP 的稳定化：默认 `pfq_size=1`、`tq_size=1`、`min_target_distance=32`、`latency=64`，并在 L1I 使用 FDP 时把 `demand_mshr_reserve` 提到 2，避免 4-entry L1I MSHR 被 FDP 抢占过多
+- [x] 2026-05-19 17:00 完成本地 `gcc_typeck_4528`、`gcc_expr2_27` 的 `5M+5M` A/B，结果为小幅正向但很接近噪声；准备以单独实验提交启用 `kmhv3.py` 默认 L1I FDP 后 push CI 观察全套 0.3c
+
+## 发现和意外
+
+- 上游实现把通用 O3 decoupled frontend 明确拆成 `BAC` 和 `FTQ`，并用 FTQ probe 驱动 FDP；这和本仓库把预测前端主体放在 `src/cpu/pred/btb/` 内部的组织方式明显不同。
+- 上游 FDP 不是简单 next-line prefetch；它跟随 FetchTarget 生命周期，并且后续提交已经扩展到对 FetchTarget 覆盖的所有 cache block 产生候选。
+- 上游 `BAC` 的基本模型是从当前 PC 按 `minInstSize` 扫描，借助 `BPredUnit::BTBValid()` 找到第一个 BTB hit，然后把最多一个控制流的 FetchTarget 放入 FTQ。这个模型适合通用 O3 解耦，但不适合直接替代本地 `UBTB/AheadBTB/MicroTAGE/MBTB/BTBTAGE/ITTAGE/MGSC/RAS` 的多级块级预测。
+- 上游 `FetchDirectedPrefetcher` 的核心价值在于生命周期接口：FTQ insert 时按 FetchTarget 覆盖的 cache block 产生候选，走 MMU 翻译和 cache/MSHR snoop；FTQ remove/squash 时取消同一 target 的在途翻译和 PFQ 项。
+- 本地 `FetchTarget` 已经比上游通用 FetchTarget 丰富很多：包含预测/执行 branch 信息、BTB entries、pred metas、GHR/PHR/BWHR/LHR、统计字段等。因此若做 FDIP，应复用本地结构，不应搬上游结构。
+- 本地 `FetchTargetQueue` 目前是预测器内部的 deque + target id，`insert()`、`finishTarget()`、`commitTarget()`、`squashAfter()` 没有对外 probe；fetch 侧只有 demand request 的 `FetchRequestSent` probe。这意味着本地缺的不是普通 cache prefetch 框架，而是“预测 target 生命周期事件”。
+- 当前 `kmhv3.py` 对 `DecoupledBPUWithBTB` 设置 `ftq_size=64`、`fsq_size=64`，并启用 UBTB/ABTB/MicroTAGE/MBTB/TAGE/ITTAGE/MGSC/RAS；但 L1I 侧没有默认 FDIP-like prefetcher。
+- 当前 Fetch 每拍先处理 redirect/squash，再 `dbpbtb->tick()` 推进预测流水，最后按 `ftqFetchingTarget()` 的 startPC 发 I-cache demand request。真正接入 FDIP 时需要小心这个时序，避免 prefetch 生命周期和 resolve/update/squash 顺序不一致。
+- 最近 `gcc15-spec06-0.3c` 归档中，L1I miss 最重的点集中在 gcc：`gcc_s04_7630` 约 20.1 万 misses、2.37% miss rate；`gcc_typeck_4528` 约 10.0 万、1.14%；`gcc_200_28` 约 8.2 万、0.80%；`gcc_expr_4892` 约 6.7 万、0.63%；`gcc_expr2_27` 约 5.8 万、0.61%。
+- 本地 `fdip-align` 观测结果显示，`gcc_typeck_4528` 的 prediction-to-fetch 平均只有约 4.0 cycles，约 5.0% target 超过 15 cycles；`gcc_expr2_27` 平均约 5.5 cycles，约 5.4% target 超过 15 cycles。它们的 L1I miss latency 均值分别约 11.0 和 26.8 cycles。
+- 本地观测也显示潜在污染压力不小：`gcc_typeck_4528` 的 candidate target 约 69.6% commit、30.4% 被 squash；`gcc_expr2_27` 约 55.5% commit、44.5% 被 squash。`gcc_s04_7630` 前半段参考窗口中 squash 比例约 45.8%。
+- 初步判断：当前 BPU/FTQ 提前量对 FDIP 偏短，直接“对所有 target blocks 发 L1I prefetch”可能覆盖有限且错误路径污染较高；若继续做 prototype，应优先加距离/置信过滤，例如只对 ahead >= 16 cycles 的 target、已跨 cache block 的 target、或更可能 commit 的 target 发。
+- 后续本地实验不默认完整 40M 指令。先用 `--warmup-insts-no-switch` / `--maxinsts` 风格的短窗口或等价配置跑 `5M+5M`、`10M+10M`，并发覆盖多个代表切片；只有趋势明确或需要和 CI 对齐时再补完整窗口。
+- Fetch 侧直接发 `SoftPFReq` 的初版可以稳定运行，但收益不稳。默认近距离策略在 4 个短窗口上的 cycles 变化为：`gcc_typeck_4528` -0.041%、`gcc_expr2_27` -0.198%、`gcc_200_28` +0.527%、`gcc_expr_4892` +0.186%。
+- 关键负面信号是 SoftPFReq 绝大多数命中 L1I，几乎没有形成 demand merge：例如默认策略在 `gcc_expr_4892` 发出 231,790 个 FDIP prefetch，其中 SoftPF miss 只有 68；`gcc_expr2_27` 发出 86,680 个，SoftPF miss 只有 73。`system.cpu.icache.demandMergedIntoPfMSHR` 和 `pfMergedWithDemand` 均为 0。
+- 距离过滤能局部改善但不稳定：`fdipMinTargetDistance=3`、`fdipLookaheadTargets=8`、`fdipMaxPrefetchesPerCycle=1` 让 `gcc_expr_4892` 从 +0.186% 改到 -0.052%，但 `gcc_expr2_27` 从 -0.198% 变成 +0.290%。
+- 跳过 FetchTarget 起始 cache block 的策略整体更差：`gcc_typeck_4528` +0.005%、`gcc_expr2_27` +0.344%、`gcc_200_28` +0.472%、`gcc_expr_4892` +0.351%。
+- target age 过滤减小了扰动但仍不够稳：`fdipMinTargetAgeCycles=16` 结果为 `gcc_typeck_4528` +0.132%、`gcc_expr2_27` -0.189%、`gcc_200_28` +0.170%、`gcc_expr_4892` +0.017%。
+- 当前 FDIP prototype 的主要问题不是翻译或 retry 稳定性，而是没有 cache-side snoop/去重，fetch 侧 `SoftPFReq` 会消耗大量已经命中的 L1I 访问。更接近上游 FDP 的下一步应放到 L1I prefetcher/cache 侧，利用 cache tag/MSHR snoop 过滤掉已命中的候选，而不是继续在 fetch 侧盲发。
+- Cache-side FDP 确认可以复用上游最重要的生命周期思想，但在本仓库 L1I 上必须非常保守。默认 L1I 只有 4 个 MSHR，cache-side `HardPFReq` 会真实占用 MSHR 和下游端口；初始宽松配置在 `gcc_typeck_4528` 10K smoke 中触发 commit stuck。
+- 把 cache-side FDP 收紧到 `pfq/tq=1`、`min_target_distance=32`、`latency=64`，并把 L1I `demand_mshr_reserve=2` 后，100K smoke 稳定运行：`gcc_typeck_4528` cycles 173602 -> 172758，I-cache demand misses 1941 -> 1903，但 no-MSHR blocked cycles 10849 -> 17098。
+- `gcc_typeck_4528` 的 `5M+5M` 测量段结果为 cycles 1824859 -> 1822698（约 -0.12%），I-cache misses 15411 -> 15428，`pfIssued=113`、`pfUseful=41`、`pfUnused=49`、`demandMergedIntoPfMSHR=10`，no-MSHR blocked cycles 4171 -> 6655。
+- `gcc_expr2_27` 的 `5M+5M` 测量段结果为 cycles 2410537 -> 2410225（约 -0.013%），I-cache misses 13294 -> 13243，`pfIssued=293`、`pfUseful=146`、`pfUnused=115`、`demandMergedIntoPfMSHR=31`，no-MSHR blocked cycles 12493 -> 14211。
+- 目前 cache-side FDP 的局部信号是“有少量有效覆盖，但 MSHR/port 资源代价也可见”。这值得 push 一轮 CI 看全套 SPEC06 0.3c，但还不能说已经是可合入的收益方向。
+
+## 第一版机制对照
+
+| 主题 | 上游 GEM5-raw | 当前 GEM5_review | 借鉴判断 |
+| --- | --- | --- | --- |
+| 前端切分 | `BAC -> FTQ -> Fetch`，BAC 是 O3 stage | `DecoupledBPUWithBTB` 内部生成 FSQ/FTQ，Fetch 直接消费 BPU target | 不搬 BAC；只借鉴解耦边界描述和状态统计 |
+| FetchTarget 表达 | 简化 basic-block-like target，最多记录 exit branch/pred target/history | 保存块级预测、多个 BTB entry、各组件 meta、history、resolve/update 信息 | 复用本地结构，不降级 |
+| BPU 生成方式 | BTBValid 线性扫描到 branch，predict 后插 FTQ | 多组件分 stage 产生 FullBTBPrediction，按 override bubble 延迟入队 | 不搬扫描逻辑；可借鉴 `maxFTPerCycle/maxTakenPredPerCycle` 作为观测维度 |
+| FTQ 生命周期 | insert/remove 都有 probe，供 FDP 监听 | insert/finish/commit/squash 都在内部，缺少 target-level probe | 值得补 target-level probe/callback |
+| FDIP/FDP | FTQ insert 触发 prefetch，remove/squash 取消，支持 TQ/PFQ/cache snoop/stats | 未见 BPU/FTQ 驱动 instruction prefetch；只有 demand fetch probe 和通用 prefetch 框架 | 最值得借鉴，建议作为独立实验 |
+| 配置 | Neoverse/stdlib 示例启用 decoupled FE + L1I FDP/Tagged | `kmhv3.py` 强 RTL-aligned BTB 前端，prefetch 主要在 data/L2/L3 路径 | 只借鉴配置挂法，不借 ARM 数值 |
+| 统计 | FTQ occupancy、BAC 状态、FDP 队列/翻译/cache-snoop 统计 | BPU stage/override/FSQ/branch stats 很丰富，但缺 FDIP 专项 | 可以低风险补 FDIP 专项 stats |
+
+## 第一批候选特征
+
+1. Target-level lifecycle probe/callback
+   - 接入点：`src/cpu/pred/btb/ftq.hh`、`src/cpu/pred/btb/ftq.cc`、`src/cpu/pred/btb/decoupled_bpred.cc`
+   - 做法：为本地 `FetchTargetQueue::insert()`、`finishTarget()`、`squashAfter()`、必要时 `commitTarget()` 增加 target id + FetchTarget 摘要事件，先用于统计或 debug，不改变预测语义。
+   - 价值：为 FDIP、FSQ ahead distance、错误路径 prefetch 污染统计提供干净挂点。
+   - 风险：低到中。需要注意 `FetchTargetQueue` 当前没有 CPU/probe manager 指针，可能更适合先在 `DecoupledBPUWithBTB` 层发事件。
+
+2. Kunminghu-FDIP prototype
+   - 接入点：本地 L1I prefetcher 或新的 `FetchTarget` listener；配置入口可从 `kmhv3.py` 加显式开关。
+   - 做法：参考上游 `FetchDirectedPrefetcher` 的 TQ/PFQ/cache snoop/cancel 语义，但输入改成本地 `FetchTarget` 的 `startPC -> predEndPC` 范围。
+   - 价值：最可能带来前端 I-cache latency 改善，尤其对 BTB 已能提前看到后续 target 的场景。
+   - 风险：中到高。会跨 BPU target 生命周期、RISC-V MMU 翻译、I-cache MSHR、错误路径 squash；必须从 off-by-default 实验开关开始。
+
+3. FDIP observability first
+   - 接入点：新 stats 或 debug flag。
+   - 做法：即使暂不发 prefetch，也先统计 FSQ target 的 ahead distance、覆盖 cache block 数、被 squash/commit/finish 的比例、target 从入队到 fetch 的提前周期。
+   - 价值：判断当前 BPU ahead depth 是否足以支撑 FDIP；如果提前量不够，先做 FDIP 发包意义不大。
+   - 风险：低，是最适合第一步的实验。
+
+4. Surprise branch / no-history taxonomy
+   - 接入点：当前 `controlSquash`、`topMispredictsByBranch`、BTB miss/false hit stats。
+   - 做法：借鉴上游“BTB 没看到但 fetch/decode 发现”的分类思路，细分当前 no-pred / false-hit / target-wrong 场景。
+   - 价值：帮助区分 BPU 没学到、BTB 容量不够、方向错、target 错、fetch block 截断等原因。
+   - 风险：低，主要是统计口径设计。
+
+5. Branch predictor latency / override model 对照
+   - 接入点：当前 `numOverrideBubbles`、`predsOfEachStage`、`overrideReason`。
+   - 做法：不搬上游 latency 模型，但检查上游 `d55f46336f` 后的 `Prediction.latency` 和 override/resteer 分类，看是否能补充本地 stage latency 观测。
+   - 价值：让 `AheadBTB/MicroTAGE` 的收益和 override bubble 成本更容易解释。
+   - 风险：低到中，取决于是否只补统计还是改行为。
+
+## 决策记录
+
+- Decision: 本轮先做机制对比和候选特征筛选，不直接移植代码。
+- Reason: 两边前端架构边界不同，直接搬上游 BAC/FTQ 容易破坏本仓库 RTL-aligned 路径；先筛选低风险特征更稳。
+- Date: 2026-05-18
+- Decision: 第一优先级建议从 FDIP observability / target lifecycle event 开始，而不是直接发 I-cache prefetch。
+- Reason: 当前本地缺的是 target-level 生命周期挂点和 ahead-distance/污染统计；先补观测能判断 FDIP 是否有足够提前量，并降低对 Fetch/ResolveQueue 语义的扰动。
+- Date: 2026-05-18
+- Decision: `fdip-align` 首个 patch 只增加统计，不改变预测、fetch 或 cache 行为。
+- Reason: 需要先确认当前 FSQ/FTQ ahead distance、candidate cache block 数、fetch/commit/squash 生命周期比例；如果 SPEC06 的 L1I miss 很少或预测提前量不够，FDIP prototype 的收益预期会很弱。
+- Date: 2026-05-18
+- Decision: 不把 fetch-side FDIP prototype push 到 CI。
+- Reason: 本地 `5M+5M` 短窗口没有稳定收益，且 SoftPFReq 绝大多数是 L1I hit，说明当前直接从 fetch 端发 prefetch 缺少上游 FDP 的 cache-side snoop/去重能力。继续推 CI 大概率浪费完整 SPEC 资源。
+- Date: 2026-05-18
+- Decision: 若继续 FDIP，应优先做 cache-side FDP/snoop 型实现，或至少为 fetch-side prototype 增加“只对 cache/MSHR miss 候选发包”的过滤接口。
+- Reason: 上游 FDP 的 TQ/PFQ/cache snoop 生命周期正是当前 prototype 缺失的关键能力；仅靠 target distance、skip-start-block、target-age 过滤无法稳定避免 L1I-hit prefetch 扰动。
+- Date: 2026-05-18
diff --git a/src/cpu/o3/cpu.cc b/src/cpu/o3/cpu.cc
index 502f02c25e..4754cad217 100644
--- a/src/cpu/o3/cpu.cc
+++ b/src/cpu/o3/cpu.cc
@@ -340,6 +340,12 @@ CPU::regProbePoints()
     ppDataAccessComplete = new ProbePointArg<
         std::pair<DynInstPtr, PacketPtr>>(
                 getProbeManager(), "DataAccessComplete");
+    ppFTQInsert =
+        new ProbePointArg<branch_prediction::btb_pred::FdipFetchTargetPtr>(
+            getProbeManager(), "FTQInsert");
+    ppFTQRemove =
+        new ProbePointArg<branch_prediction::btb_pred::FdipFetchTargetPtr>(
+            getProbeManager(), "FTQRemove");
 
     fetch.regProbePoints();
     rename.regProbePoints();
diff --git a/src/cpu/o3/cpu.hh b/src/cpu/o3/cpu.hh
index fae5eea4d4..1df4749c57 100644
--- a/src/cpu/o3/cpu.hh
+++ b/src/cpu/o3/cpu.hh
@@ -69,6 +69,7 @@
 #include "cpu/o3/rob.hh"
 #include "cpu/o3/scoreboard.hh"
 #include "cpu/o3/thread_state.hh"
+#include "cpu/pred/btb/fdip_target.hh"
 #include "cpu/simple_thread.hh"
 #include "cpu/timebuf.hh"
 #include "cpu/valuepred/valuepred_unit.hh"
@@ -193,6 +194,10 @@ class CPU : public BaseCPU
 
     ProbePointArg<PacketPtr> *ppInstAccessComplete;
     ProbePointArg<std::pair<DynInstPtr, PacketPtr> > *ppDataAccessComplete;
+    ProbePointArg<branch_prediction::btb_pred::FdipFetchTargetPtr>
+        *ppFTQInsert;
+    ProbePointArg<branch_prediction::btb_pred::FdipFetchTargetPtr>
+        *ppFTQRemove;
 
     /** Register probe points. */
     void regProbePoints() override;
diff --git a/src/cpu/pred/btb/decoupled_bpred.cc b/src/cpu/pred/btb/decoupled_bpred.cc
index fc668212f1..1ee376a809 100644
--- a/src/cpu/pred/btb/decoupled_bpred.cc
+++ b/src/cpu/pred/btb/decoupled_bpred.cc
@@ -1,6 +1,7 @@
 #include "cpu/pred/btb/decoupled_bpred.hh"
 
 #include <array>
+#include <memory>
 
 #include "base/debug_helper.hh"
 #include "base/output.hh"
@@ -24,9 +25,11 @@ namespace btb_pred
 void
 DecoupledBPUWithBTB::consumeFetchTarget(unsigned fetched_inst_num, ThreadID tid)
 {
+    const auto target_id = ftq.fetchId(tid);
     auto &target = ftq.fetching(tid);
     target.fetchInstNum = fetched_inst_num;
     recordFdipFetchedTarget(target);
+    notifyFdipTargetRemove(target, target_id);
     ftq.finishTarget(tid);
 }
 
@@ -180,6 +183,7 @@ DecoupledBPUWithBTB::recordFdipSquashedTargets(ThreadID tid,
         const auto &target = ftq.get(id, tid);
         dbpBtbStats.fdipTargetsSquashed++;
         dbpBtbStats.fdipTargetSquashLatency.sample(fdipTargetAgeCycles(target), 1);
+        notifyFdipTargetRemove(target, id);
         squashed++;
     }
 
@@ -188,6 +192,32 @@ DecoupledBPUWithBTB::recordFdipSquashedTargets(ThreadID tid,
     }
 }
 
+void
+DecoupledBPUWithBTB::notifyFdipTargetInsert(const FetchTarget &target,
+                                            FetchTargetId target_id,
+                                            uint64_t distance_from_fetch_head) const
+{
+    if (!cpu || !cpu->ppFTQInsert) {
+        return;
+    }
+    // Listeners get a freshly allocated summary, not a reference to `target`.
+    cpu->ppFTQInsert->notify(
+        std::make_shared<FdipFetchTarget>(
+            target, target_id, distance_from_fetch_head));
+}
+
+void
+DecoupledBPUWithBTB::notifyFdipTargetRemove(const FetchTarget &target,
+                                            FetchTargetId target_id) const
+{
+    if (!cpu || !cpu->ppFTQRemove) {
+        return;
+    }
+    // The distance field is always reported as 0 on removal events.
+    cpu->ppFTQRemove->notify(
+        std::make_shared<FdipFetchTarget>(target, target_id, 0));
+}
+
 
 void
 DecoupledBPUWithBTB::tick()
@@ -449,14 +479,22 @@ DecoupledBPUWithBTB::processNewPrediction(ThreadID tid)
     // 4. Fill ahead pipeline
     fillAheadPipeline(entry);
 
+    const FetchTargetId target_id =
+        ftq.empty(tid) ? ftq.frontId(tid) : ftq.backId(tid) + 1;
+    const FetchTargetId fetch_id =
+        ftq.empty(tid) ? target_id : ftq.fetchId(tid);
+    const uint64_t distance_from_fetch_head =
+        target_id >= fetch_id ? target_id - fetch_id : 0;
+
     if (enablePredFSQTrace) {
-        predTraceManager->write_record(PredictionTrace(ftq.backId(tid), entry));
+        predTraceManager->write_record(PredictionTrace(target_id, entry));
     }
 
     recordFdipCandidateTarget(entry);
 
     // 5. Add entry to fetch target queue
     ftq.insert(entry);
+    notifyFdipTargetInsert(entry, target_id, distance_from_fetch_head);
     threads[tid].validprediction = false;
 
     // 6. Debug output and update statistics
diff --git a/src/cpu/pred/btb/decoupled_bpred.hh b/src/cpu/pred/btb/decoupled_bpred.hh
index 3096abe46d..138760d9d8 100644
--- a/src/cpu/pred/btb/decoupled_bpred.hh
+++ b/src/cpu/pred/btb/decoupled_bpred.hh
@@ -194,6 +194,11 @@ class DecoupledBPUWithBTB : public BPredUnit
     void recordFdipCommittedTarget(const FetchTarget &target);
     void recordFdipSquashedTargets(ThreadID tid, FetchTargetId firstTargetId,
                                    FetchTargetId lastTargetId);
+    void notifyFdipTargetInsert(const FetchTarget &target,
+                                FetchTargetId target_id,
+                                uint64_t distance_from_fetch_head) const;
+    void notifyFdipTargetRemove(const FetchTarget &target,
+                                FetchTargetId target_id) const;
 
     void clearPreds(ThreadID tid) {
         for (auto &stagePred : threads[tid].predsOfEachStage) {
diff --git a/src/cpu/pred/btb/fdip_target.hh b/src/cpu/pred/btb/fdip_target.hh
new file mode 100644
index 0000000000..9354e4f0dd
--- /dev/null
+++ b/src/cpu/pred/btb/fdip_target.hh
@@ -0,0 +1,53 @@
+#ifndef __CPU_PRED_BTB_FDIP_TARGET_HH__
+#define __CPU_PRED_BTB_FDIP_TARGET_HH__
+
+#include <memory>
+
+#include "base/types.hh"
+#include "cpu/o3/limits.hh"
+#include "cpu/pred/btb/common.hh"
+
+namespace gem5
+{
+
+namespace branch_prediction
+{
+
+namespace btb_pred
+{
+
+// By-value summary of a FetchTarget, sent with the FTQ probe events.
+struct FdipFetchTarget
+{
+    ThreadID tid;
+    FetchTargetId id;   // FTQ id supplied by the notifier
+    Addr startPC;
+    Addr predEndPC;
+    Tick predTick;
+    uint64_t distanceFromFetchHead;  // value supplied by the notifier
+    FdipFetchTarget(ThreadID _tid, FetchTargetId _id, Addr _start_pc,
+                    Addr _pred_end_pc, Tick _pred_tick,
+                    uint64_t _distance_from_fetch_head)
+        : tid(_tid),
+          id(_id),
+          startPC(_start_pc),
+          predEndPC(_pred_end_pc),
+          predTick(_pred_tick),
+          distanceFromFetchHead(_distance_from_fetch_head)
+    {}
+    // Copies tid/startPC/predEndPC/predTick out of `target`; id is passed in.
+    FdipFetchTarget(const FetchTarget &target, FetchTargetId _id,
+                    uint64_t _distance_from_fetch_head)
+        : FdipFetchTarget(target.tid, _id, target.startPC,
+                          target.predEndPC, target.predTick,
+                          _distance_from_fetch_head)
+    {}
+};
+
+using FdipFetchTargetPtr = std::shared_ptr<FdipFetchTarget>;
+
+} // namespace btb_pred
+} // namespace branch_prediction
+} // namespace gem5
+
+#endif // __CPU_PRED_BTB_FDIP_TARGET_HH__
diff --git a/src/mem/cache/prefetch/Prefetcher.py b/src/mem/cache/prefetch/Prefetcher.py
index eea10b7229..8a3db37c44 100644
--- a/src/mem/cache/prefetch/Prefetcher.py
+++ b/src/mem/cache/prefetch/Prefetcher.py
@@ -932,6 +932,36 @@ def listenFromProbeRetiredInstructions(self, simObj):
         self.addEvent(HWPProbeEventRetiredInsts(self, simObj,"RetiredInstsPC"))
 
 
+class FetchDirectedPrefetcher(BasePrefetcher):
+    type = "FetchDirectedPrefetcher"
+    cxx_class = "gem5::prefetch::FetchDirectedPrefetcher"
+    cxx_header = "mem/cache/prefetch/fdp.hh"
+
+    cpu = Param.BaseCPU(Parent.any, "CPU whose FTQ target probes are tracked")
+
+    latency = Param.Cycles(64, "Latency for generated prefetches")
+    pfq_size = Param.Unsigned(1, "Maximum number of queued prefetches")
+    tq_size = Param.Unsigned(1, "Maximum outstanding translations")
+    mark_req_as_prefetch = Param.Bool(
+        True,
+        "Mark translation requests as prefetches")
+    squash_prefetches = Param.Bool(
+        True,
+        "Squash queued prefetches when the source FTQ target is removed")
+    cache_snoop = Param.Bool(
+        True,
+        "Drop candidates that already hit in the cache or MSHR")
+    max_blocks_per_target = Param.Unsigned(
+        1,
+        "Maximum cache blocks to prefetch per target; 0 means unlimited")
+    skip_target_start_block = Param.Bool(
+        False,
+        "Skip the cache block containing the target start PC")
+    min_target_distance = Param.Unsigned(
+        32,
+        "Minimum FTQ distance from fetch head before generating candidates")
+
+
 class IPCPrefetcher(QueuedPrefetcher):
     type = 'IPCPrefetcher'
     cxx_class = 'gem5::prefetch::IPCP'
diff --git a/src/mem/cache/prefetch/SConscript b/src/mem/cache/prefetch/SConscript
index 2e074e2fdd..2754e16071 100644
--- a/src/mem/cache/prefetch/SConscript
+++ b/src/mem/cache/prefetch/SConscript
@@ -39,7 +39,8 @@ SimObject('Prefetcher.py', sim_objects=[
     'IrregularStreamBufferPrefetcher', 'SlimAMPMPrefetcher',
     'WorkerPrefetcher', 'DespacitoStreamPrefetcher',
     'BOPPrefetcher', 'SBOOEPrefetcher', 'STeMSPrefetcher', 'PIFPrefetcher', 'IPCPrefetcher',
-    'CompositeWithWorkerPrefetcher', 'L2CompositeWithWorkerPrefetcher', 'PrefetcherForwarder'])
+    'CompositeWithWorkerPrefetcher', 'L2CompositeWithWorkerPrefetcher',
+    'PrefetcherForwarder', 'FetchDirectedPrefetcher'])
 
 
 DebugFlag('BOPPrefetcher')
@@ -89,4 +90,5 @@ Source('composite_with_worker.cc')
 Source('l2_composite_with_worker.cc')
 Source('despacito_stream.cc')
 Source('forwarder.cc')
+Source('fdp.cc')
 Source('prefetch_filter.cc')
diff --git a/src/mem/cache/prefetch/fdp.cc b/src/mem/cache/prefetch/fdp.cc
new file mode 100644
index 0000000000..d47d650de0
--- /dev/null
+++ b/src/mem/cache/prefetch/fdp.cc
@@ -0,0 +1,392 @@
+/*
+ * Copyright (c) 2022-2023 The University of Edinburgh
+ * Copyright (c) 2025 Arm Limited
+ * All rights reserved
+ *
+ * The license below extends only to copyright in this file.  You may use this
+ * file subject to the license terms below provided that you ensure that this
+ * notice is replicated unmodified and in its entirety in all distributions,
+ * modified or unmodified, in source code or in binary form.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer; redistributions in binary
+ * form must reproduce the above copyright notice, this list of conditions and
+ * the following disclaimer in the documentation and/or other materials
+ * provided with the distribution; neither the name of the copyright holders nor
+ * the names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "mem/cache/prefetch/fdp.hh"
+
+#include <algorithm>
+
+#include "debug/HWPrefetch.hh"
+#include "mem/request.hh"
+#include "params/FetchDirectedPrefetcher.hh"
+
+namespace gem5
+{
+
+namespace prefetch
+{
+
+FetchDirectedPrefetcher::FetchDirectedPrefetcher(
+    const FetchDirectedPrefetcherParams &p)
+    : Base(p),
+      cpu(p.cpu),
+      markReqAsPrefetch(p.mark_req_as_prefetch),
+      squashPrefetches(p.squash_prefetches),
+      latency(cyclesToTicks(p.latency)),
+      pfqSize(p.pfq_size),
+      tqSize(p.tq_size),
+      cacheSnoop(p.cache_snoop),
+      maxBlocksPerTarget(p.max_blocks_per_target),
+      skipTargetStartBlock(p.skip_target_start_block),
+      minTargetDistance(p.min_target_distance),
+      stats(this, p.pfq_size, p.tq_size)
+{}
+
+FetchDirectedPrefetcher::~FetchDirectedPrefetcher()
+{
+    for (auto *listener : listeners) {
+        delete listener;
+    }
+    // Free the packets still attached to requests left in either queue.
+    for (auto &pr : pfq) {
+        delete pr.pkt;
+    }
+    for (auto &pr : translationq) {
+        delete pr.pkt;
+    }
+}
+
+FetchDirectedPrefetcher::FdipListener::FdipListener(
+    FetchDirectedPrefetcher &_parent, ProbeManager *pm,
+    const std::string &name, bool _insert)
+    : ProbeListenerArgBase<FdipFetchTargetPtr>(pm, name),
+      parent(_parent),
+      insert(_insert)
+{}
+
+void
+FetchDirectedPrefetcher::FdipListener::notify(const FdipFetchTargetPtr &ft)
+{
+    if (insert) {
+        parent.notifyFTQInsert(ft);
+    } else {
+        parent.notifyFTQRemove(ft);
+    }
+}
+
+// Attach one listener each to the CPU's "FTQInsert" and "FTQRemove" probes.
+void
+FetchDirectedPrefetcher::regProbeListeners()
+{
+    Base::regProbeListeners();
+    if (cpu == nullptr) {
+        warn("FetchDirectedPrefetcher: no CPU to listen from\n");
+        return;
+    }
+
+    ProbeManager *const pm = cpu->getProbeManager();
+    // The trailing flag tells the listener which handler to forward to.
+    listeners.push_back(new FdipListener(*this, pm, "FTQInsert", true));
+    listeners.push_back(new FdipListener(*this, pm, "FTQRemove", false));
+}
+
+/**
+ * Handle an FTQInsert probe event.
+ *
+ * Walks the cache blocks covered by [startPC, predEndPC) of the inserted
+ * target and starts an address translation for every block that is not
+ * already present in the prefetch queue or the translation queue. Targets
+ * closer to the fetch head than minTargetDistance are ignored.
+ *
+ * @param ft Summary of the fetch target that entered the FTQ.
+ */
+void
+FetchDirectedPrefetcher::notifyFTQInsert(const FdipFetchTargetPtr &ft)
+{
+    stats.fdipInsertions++;
+
+    if (ft->distanceFromFetchHead < minTargetDistance) {
+        stats.targetTooNear++;
+        return;
+    }
+
+    // A target whose end does not lie past its start is treated as covering
+    // just the byte at startPC.
+    const Addr range_end =
+        ft->predEndPC <= ft->startPC ? ft->startPC + 1 : ft->predEndPC;
+    const Addr first_blk = skipTargetStartBlock
+        ? blockAddress(ft->startPC) + blkSize
+        : blockAddress(ft->startPC);
+    const Addr last_blk = blockAddress(range_end - 1);
+
+    // `visited` counts every examined block, including duplicates and drops,
+    // so maxBlocksPerTarget bounds the walk rather than the insertions.
+    unsigned visited = 0;
+    for (Addr blk = first_blk;
+         blk <= last_blk &&
+         (maxBlocksPerTarget == 0 || visited < maxBlocksPerTarget);
+         blk += blkSize, ++visited) {
+        const auto same_block = [blk](const PrefetchRequest &pr) {
+            return pr.sameBlock(blk);
+        };
+
+        if (std::any_of(pfq.begin(), pfq.end(), same_block)) {
+            DPRINTF(HWPrefetch, "%#x already in FDP prefetch queue\n", blk);
+            stats.pfInPFQ++;
+            continue;
+        }
+
+        if (std::any_of(translationq.begin(), translationq.end(),
+                        same_block)) {
+            DPRINTF(HWPrefetch, "%#x already in FDP translation queue\n",
+                    blk);
+            stats.pfInTQ++;
+            continue;
+        }
+
+        // New candidate: neither queued for issue nor being translated.
+        stats.pfIdentified++;
+
+        // A full translation queue drops the candidate.
+        if (translationq.size() >= tqSize) {
+            DPRINTF(HWPrefetch,
+                    "FDP translation queue full, dropping %#x\n", blk);
+            stats.tqDrops++;
+            continue;
+        }
+
+        // Store the request before starting its translation.
+        translationq.emplace_back(*this, blk, ft->tid, ft->id);
+        DPRINTF(HWPrefetch, "FDP starts translation for %#x ftq=%lu\n",
+                blk, ft->id);
+        translationq.back().startTranslation();
+        stats.tqInserts++;
+        stats.tqSizeDistAtNotify.sample(translationq.size());
+        stats.pfqSizeDistAtNotify.sample(pfq.size());
+    }
+}
+
+// FTQRemove handler: cancel in-flight translations and drop queued
+// prefetches that were generated for the removed fetch target id.
+void
+FetchDirectedPrefetcher::notifyFTQRemove(const FdipFetchTargetPtr &ft)
+{
+    stats.fdipRemovals++;
+    if (!squashPrefetches) {
+        return;
+    }
+    // Flag each in-flight translation once so repeats do not recount it.
+    for (auto &pr : translationq) {
+        if (pr.ftid == ft->id && !pr.isCanceled()) {
+            pr.markCanceled();
+            stats.pfSquashed++;
+        }
+    }
+
+    for (auto it = pfq.begin(); it != pfq.end();) {
+        if (it->ftid != ft->id) {
+            ++it;
+            continue;
+        }
+        delete it->pkt;
+        it = pfq.erase(it);
+        stats.pfSquashed++;
+    }
+}
+
+// Finish the timing translation of a TQ entry: drop it or turn it into a
+// queued prefetch packet, then pop it from the translation queue.
+void
+FetchDirectedPrefetcher::translationComplete(PrefetchRequest *pfr, bool failed)
+{
+    // Match by object identity; panic_if also fires in NDEBUG builds.
+    const auto it = std::find_if(
+        translationq.begin(), translationq.end(),
+        [pfr](const PrefetchRequest &entry) { return &entry == pfr; });
+    panic_if(it == translationq.end(),
+             "FDP translation completed for a request not in the TQ");
+
+    if (failed) {
+        DPRINTF(HWPrefetch, "FDP translation of %#x failed\n", it->addr);
+        stats.translationFail++;
+    } else {
+        stats.translationSuccess++;
+        const Addr paddr = it->req->getPaddr();
+        const bool secure = it->req->isSecure();
+        // Filters run in order; a request is queued only if it passes all.
+        if (it->isCanceled()) {
+            DPRINTF(HWPrefetch,
+                    "FDP drops %#x after FTQ removal during translation\n",
+                    it->addr);
+        } else if (it->req->isUncacheable()) {
+            DPRINTF(HWPrefetch, "FDP drops uncacheable request %#x\n",
+                    it->addr);
+        } else if (!system->isMemAddr(paddr)) {
+            DPRINTF(HWPrefetch, "FDP drops non-memory paddr %#x\n", paddr);
+        } else if (cacheSnoop &&
+                   (inCache(paddr, secure) || inMissQueue(paddr, secure))) {
+            stats.pfInCache++;
+            if (hasBeenPrefetched(paddr, secure)) {
+                stats.pfInCachePrefetched++;
+            }
+            DPRINTF(HWPrefetch,
+                    "FDP drops redundant cache/MSHR candidate %#x\n", paddr);
+        } else if (pfq.size() < pfqSize) {
+            it->createPkt();
+            it->readyTime = curTick() + latency;
+            stats.pfPacketsCreated++;
+            stats.pfCandidatesAdded++;
+            pfq.push_back(*it);
+            stats.pfqInserts++;
+            DPRINTF(HWPrefetch,
+                    "FDP queued prefetch VA %#x PA %#x ftq=%lu pfq=%lu\n",
+                    it->addr, paddr, it->ftid, pfq.size());
+        } else {
+            DPRINTF(HWPrefetch,
+                    "FDP prefetch queue full, dropping %#x\n", it->addr);
+            stats.pfqDrops++;
+        }
+    }
+    // Every outcome retires the entry from the translation queue.
+    translationq.erase(it);
+    stats.tqPops++;
+}
+
+// Hand out the oldest queued prefetch packet, or nullptr if pfq is empty.
+PacketPtr
+FetchDirectedPrefetcher::getPacket()
+{
+    if (pfq.empty())
+        return nullptr;
+
+    // Copy what is needed from the head entry before popping it.
+    const PrefetchRequest &head = pfq.front();
+    PacketPtr pkt = head.pkt;
+    DPRINTF(HWPrefetch, "FDP issues prefetch PA %#x VA %#x ftq=%lu\n",
+            pkt->getAddr(), head.addr, head.ftid);
+    pfq.pop_front();
+    stats.pfqPops++;
+    prefetchStats.pfIssued++;
+    prefetchStats.pfIssued_srcs[pkt->req->getXsMetadata().prefetchSource]++;
+    issuedPrefetches++;
+    return pkt;
+}
+
+// Build the INST_FETCH request that covers one candidate block at addr.
+// Only the request exists after construction; the packet itself is created
+// later by createPkt().
+FetchDirectedPrefetcher::PrefetchRequest::PrefetchRequest(
+    FetchDirectedPrefetcher &_owner, Addr _addr, ThreadID tid,
+    FdipTargetId _ftid)
+    : owner(_owner), addr(_addr), ftid(_ftid),
+      // owner and addr are declared before req, so both are usable here.
+      req(std::make_shared<Request>(
+          addr, owner.blkSize, Request::INST_FETCH, owner.requestorId, addr,
+          owner.cpu->getContext(tid)->contextId())),
+      pkt(nullptr), readyTime(MaxTick), canceled(false)
+{
+    if (owner.markReqAsPrefetch) {
+        req->setFlags(Request::PREFETCH);
+    }
+
+    // Tag the request with the PF_NONE source and a depth of zero.
+    req->setXsMetadata(Request::XsMetadata(PrefetchSourceType::PF_NONE, 0));
+    req->setPFSource(PrefetchSourceType::PF_NONE);
+    req->setPFDepth(0);
+}
+
+void
+FetchDirectedPrefetcher::PrefetchRequest::createPkt()
+{
+    req->taskId(context_switch_task_id::Prefetcher); // tag as prefetcher task
+    pkt = new Packet(req, MemCmd::HardPFReq); // owned through the raw pkt ptr
+    pkt->allocate();
+}
+
+void
+FetchDirectedPrefetcher::PrefetchRequest::startTranslation()
+{
+    fatal_if(owner.tlb == nullptr,
+             "FetchDirectedPrefetcher requires an instruction TLB\n");
+    auto *tc = owner.system->threads[req->contextId()]; // context set in ctor
+    owner.tlb->translateTiming(req, tc, this, BaseMMU::Execute); // this = cb
+}
+
+void
+FetchDirectedPrefetcher::PrefetchRequest::finish(
+    const Fault &fault, const RequestPtr &req, ThreadContext *tc,
+    BaseMMU::Mode mode)
+{
+    owner.translationComplete(this, fault != NoFault); // any fault = failed
+}
+
+FetchDirectedPrefetcher::Stats::Stats(
+    statistics::Group *parent, int pfq_size, int tq_size)
+    : statistics::Group(parent), // counters below register in this group
+      ADD_STAT(fdipInsertions, statistics::units::Count::get(),
+               "Number of FTQ insert notifications observed by FDP"),
+      ADD_STAT(fdipRemovals, statistics::units::Count::get(),
+               "Number of FTQ remove notifications observed by FDP"),
+      ADD_STAT(targetTooNear, statistics::units::Count::get(),
+               "Number of FTQ targets skipped by FDP distance filter"),
+      ADD_STAT(pfIdentified, statistics::units::Count::get(),
+               "Number of FDP prefetch candidates identified"),
+      ADD_STAT(pfSquashed, statistics::units::Count::get(),
+               "Number of FDP prefetches squashed by FTQ removal"),
+      ADD_STAT(pfInPFQ, statistics::units::Count::get(),
+               "Number of FDP candidates already in the prefetch queue"),
+      ADD_STAT(pfInTQ, statistics::units::Count::get(),
+               "Number of FDP candidates already in the translation queue"),
+      ADD_STAT(pfInCache, statistics::units::Count::get(),
+               "Number of FDP candidates dropped by cache/MSHR snoop"),
+      ADD_STAT(pfInCachePrefetched, statistics::units::Count::get(),
+               "Number of FDP snoop drops on already-prefetched blocks"),
+      ADD_STAT(pfPacketsCreated, statistics::units::Count::get(),
+               "Number of FDP HardPF packets created"),
+      ADD_STAT(pfCandidatesAdded, statistics::units::Count::get(),
+               "Number of FDP candidates added to the prefetch queue"),
+      ADD_STAT(translationFail, statistics::units::Count::get(),
+               "Number of FDP translations that failed"),
+      ADD_STAT(translationSuccess, statistics::units::Count::get(),
+               "Number of FDP translations that succeeded"),
+      ADD_STAT(pfqSizeDistAtNotify, statistics::units::Count::get(),
+               "Distribution of FDP prefetch queue size at notification"),
+      ADD_STAT(tqSizeDistAtNotify, statistics::units::Count::get(),
+               "Distribution of FDP translation queue size at notification"),
+      ADD_STAT(pfqInserts, statistics::units::Count::get(),
+               "Number of insertions into the FDP prefetch queue"),
+      ADD_STAT(pfqPops, statistics::units::Count::get(),
+               "Number of pops from the FDP prefetch queue"),
+      ADD_STAT(pfqDrops, statistics::units::Count::get(),
+               "Number of FDP candidates dropped because PFQ was full"),
+      ADD_STAT(tqInserts, statistics::units::Count::get(),
+               "Number of insertions into the FDP translation queue"),
+      ADD_STAT(tqPops, statistics::units::Count::get(),
+               "Number of pops from the FDP translation queue"),
+      ADD_STAT(tqDrops, statistics::units::Count::get(),
+               "Number of FDP candidates dropped because TQ was full")
+{
+    pfqSizeDistAtNotify.init(0, pfq_size, 4); // presumably min, max, bucket
+    tqSizeDistAtNotify.init(0, tq_size, 4); // same layout, sized by tq_size
+}
+
+} // namespace prefetch
+} // namespace gem5
diff --git a/src/mem/cache/prefetch/fdp.hh b/src/mem/cache/prefetch/fdp.hh
new file mode 100644
index 0000000000..607a9217c2
--- /dev/null
+++ b/src/mem/cache/prefetch/fdp.hh
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2022-2023 The University of Edinburgh
+ * Copyright (c) 2025 Arm Limited
+ * All rights reserved
+ *
+ * The license below extends only to copyright in this file.  You may use this
+ * file subject to the license terms below provided that you ensure that this
+ * notice is replicated unmodified and in its entirety in all distributions,
+ * modified or unmodified, in source code or in binary form.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer; redistributions in binary
+ * form must reproduce the above copyright notice, this list of conditions and
+ * the following disclaimer in the documentation and/or other materials
+ * provided with the distribution; neither the name of the copyright holders nor
+ * the names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __MEM_CACHE_PREFETCH_FDP_HH__
+#define __MEM_CACHE_PREFETCH_FDP_HH__
+
+#include <list>
+#include <vector>
+
+#include "arch/generic/mmu.hh"
+#include "cpu/base.hh"
+#include "cpu/pred/btb/fdip_target.hh"
+#include "mem/cache/prefetch/base.hh"
+#include "sim/probe/probe.hh"
+
+namespace gem5
+{
+
+struct FetchDirectedPrefetcherParams;
+
+namespace prefetch
+{
+// Prefetcher driven by fetch-target insert/remove notifications (FDIP).
+class FetchDirectedPrefetcher : public Base
+{
+  public:
+    FetchDirectedPrefetcher(const FetchDirectedPrefetcherParams &p);
+    ~FetchDirectedPrefetcher();
+    // notify() below is an empty override of the Base hook.
+    void regProbeListeners() override;
+    void notify(const PacketPtr &pkt, const PrefetchInfo &pfi) override {}
+    PacketPtr getPacket() override;
+    /** @return true while at least one prefetch waits in pfq. */
+    bool
+    hasPendingPacket() override
+    {
+        return !pfq.empty();
+    }
+    /** @return readyTime of the pfq head, or MaxTick if pfq is empty. */
+    Tick
+    nextPrefetchReadyTime() const override
+    {
+        return pfq.empty() ? MaxTick : pfq.front().readyTime;
+    }
+    /** Empty override: the hint is ignored. */
+    void
+    rxHint(BaseMMU::Translation *dpp) override
+    {}
+    /** Empty override: the notification is ignored. */
+    void
+    pfHitNotify(float accuracy, PrefetchSourceType pf_source,
+                const PacketPtr &pkt) override
+    {}
+
+  private:
+    using FdipFetchTargetPtr =
+        branch_prediction::btb_pred::FdipFetchTargetPtr;
+    using FdipTargetId = branch_prediction::btb_pred::FetchTargetId;
+    /** Probe listener; NOTE(review): insert presumably selects the event. */
+    class FdipListener : public ProbeListenerArgBase<FdipFetchTargetPtr>
+    {
+      public:
+        FdipListener(FetchDirectedPrefetcher &_parent, ProbeManager *pm,
+                     const std::string &name, bool _insert);
+        void notify(const FdipFetchTargetPtr &ft) override;
+
+      private:
+        FetchDirectedPrefetcher &parent;
+        const bool insert;
+    };
+    // NOTE(review): raw pointers; ownership is not visible in this header.
+    std::vector<FdipListener *> listeners;
+
+    BaseCPU *cpu;
+    // Configuration values; const, so fixed at construction.
+    const bool markReqAsPrefetch;  // add Request::PREFETCH to new requests
+    const bool squashPrefetches;  // cancel a target's prefetches on FTQ remove
+    const Tick latency;  // delay added to curTick() for readyTime
+    const unsigned pfqSize;  // pfq capacity
+    const unsigned tqSize;  // translationq capacity
+    const bool cacheSnoop;  // drop candidates found in cache or miss queue
+    const unsigned maxBlocksPerTarget;  // 0 disables the per-target cap
+    const bool skipTargetStartBlock;
+    const unsigned minTargetDistance;
+    /** One candidate block; it is also its own translation callback. */
+    struct PrefetchRequest : public BaseMMU::Translation
+    {
+        PrefetchRequest(FetchDirectedPrefetcher &_owner, Addr _addr,
+                        ThreadID tid, FdipTargetId _ftid);
+
+        FetchDirectedPrefetcher &owner;
+        const Addr addr;  // virtual block address of the candidate
+        const FdipTargetId ftid;  // id of the fetch target that produced it
+        RequestPtr req;
+        PacketPtr pkt;  // nullptr until createPkt() runs
+        Tick readyTime;  // MaxTick until the request enters pfq
+        bool canceled;  // set by markCanceled()
+        /** Exact equality with addr, no masking. */
+        bool sameBlock(Addr block_addr) const { return addr == block_addr; }
+
+        void createPkt();
+        void startTranslation();
+        void markDelayed() override {}
+        void finish(const Fault &fault, const RequestPtr &req,
+                    ThreadContext *tc, BaseMMU::Mode mode) override;
+
+        void markCanceled() { canceled = true; }
+        bool isCanceled() const { return canceled; }
+    };
+    // translationq: awaiting translation; pfq: packets ready to issue.
+    std::list<PrefetchRequest> pfq;
+    std::list<PrefetchRequest> translationq;
+    // Handlers for fetch-target insert/remove and translation completion.
+    void notifyFTQInsert(const FdipFetchTargetPtr &ft);
+    void notifyFTQRemove(const FdipFetchTargetPtr &ft);
+    void translationComplete(PrefetchRequest *pf_req, bool failed);
+
+  protected:
+    struct Stats : public statistics::Group
+    {
+        Stats(statistics::Group *parent, int pfq_size, int tq_size);
+        // Declared in the same order as the ADD_STAT initializers.
+        statistics::Scalar fdipInsertions;
+        statistics::Scalar fdipRemovals;
+        statistics::Scalar targetTooNear;
+        statistics::Scalar pfIdentified;
+        statistics::Scalar pfSquashed;
+        statistics::Scalar pfInPFQ;
+        statistics::Scalar pfInTQ;
+        statistics::Scalar pfInCache;
+        statistics::Scalar pfInCachePrefetched;
+        statistics::Scalar pfPacketsCreated;
+        statistics::Scalar pfCandidatesAdded;
+        statistics::Scalar translationFail;
+        statistics::Scalar translationSuccess;
+        statistics::Distribution pfqSizeDistAtNotify;
+        statistics::Distribution tqSizeDistAtNotify;
+        statistics::Scalar pfqInserts;
+        statistics::Scalar pfqPops;
+        statistics::Scalar pfqDrops;
+        statistics::Scalar tqInserts;
+        statistics::Scalar tqPops;
+        statistics::Scalar tqDrops;
+    } stats;
+};
+
+} // namespace prefetch
+} // namespace gem5
+
+#endif // __MEM_CACHE_PREFETCH_FDP_HH__

From 2957b897f40bf657627515b7054913d3d287fc21 Mon Sep 17 00:00:00 2001
From: Yan Yue <1131531947@qq.com>
Date: Tue, 19 May 2026 16:57:41 +0800
Subject: [PATCH 4/6] configs: Enable FDIP experiment for kmhv3

Change-Id: I5ad66132b422fb1c6bb2c72c9a0a259e46c6fb83
---
 configs/example/kmhv3.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/configs/example/kmhv3.py b/configs/example/kmhv3.py
index 48bbe80b75..452bc6fdb1 100644
--- a/configs/example/kmhv3.py
+++ b/configs/example/kmhv3.py
@@ -176,6 +176,8 @@ def setKmhV3Params(args, system):
 
     # Enable prefetch buffers for all hardware prefetchers in this config.
     args.enable_pf_buffer = True
+    if not args.no_pf and args.l1i_hwp_type is None:
+        args.l1i_hwp_type = 'FetchDirectedPrefetcher'
 
     # Set default bp_type based on ideal_kmhv3 flag
     # If user didn't specify bp_type, set default based on ideal_kmhv3

From aab29b82b63de0dc2d6e537478e4fb27500e1527 Mon Sep 17 00:00:00 2001
From: Yan Yue <1131531947@qq.com>
Date: Tue, 19 May 2026 17:01:35 +0800
Subject: [PATCH 5/6] misc: Rebuild DRAMSim3 from a clean cache copy

Change-Id: I030e229cb6448ef468c15b838c8c3a85e74b781f
---
 .github/actions/build-dramsim/action.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/actions/build-dramsim/action.yml b/.github/actions/build-dramsim/action.yml
index aeff06114d..d45bd22409 100644
--- a/.github/actions/build-dramsim/action.yml
+++ b/.github/actions/build-dramsim/action.yml
@@ -12,6 +12,7 @@ runs:
         if [ ! -d "DRAMsim3" ]; then
           cp -r /nfs/home/share/gem5_ci/DRAMsim3 .
         fi
+        rm -rf DRAMsim3/build
         cd DRAMsim3 && mkdir -p build
         cd build
         cmake ..

From 86787a83620bb0b0986c0806b977e3292bb3f2dc Mon Sep 17 00:00:00 2001
From: Yan Yue <1131531947@qq.com>
Date: Tue, 19 May 2026 18:10:47 +0800
Subject: [PATCH 6/6] misc: Record FDIP CI findings

Change-Id: I85e97dc4a7dbd7ce8f54de8ba6de40a68e189936
---
 ...am-decoupled-fe-fdip-comparison-2026-05-18.md | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/docs/exec-plans/active/upstream-decoupled-fe-fdip-comparison-2026-05-18.md b/docs/exec-plans/active/upstream-decoupled-fe-fdip-comparison-2026-05-18.md
index 19ab20caeb..9604a1c2ac 100644
--- a/docs/exec-plans/active/upstream-decoupled-fe-fdip-comparison-2026-05-18.md
+++ b/docs/exec-plans/active/upstream-decoupled-fe-fdip-comparison-2026-05-18.md
@@ -114,6 +114,10 @@
 - [x] 2026-05-19 16:40 实现 cache-side `FetchDirectedPrefetcher` prototype：BPU/FTQ 发 `FTQInsert`/`FTQRemove` probe，L1I prefetcher 监听 target 生命周期，经 ITB timing translation 后以 `HardPFReq` 进入 cache 侧 snoop/MSHR 路径
 - [x] 2026-05-19 16:55 完成 cache-side FDP 的稳定化：默认 `pfq_size=1`、`tq_size=1`、`min_target_distance=32`、`latency=64`，并在 L1I 使用 FDP 时把 `demand_mshr_reserve` 提到 2，避免 4-entry L1I MSHR 被 FDP 抢占过多
 - [x] 2026-05-19 17:00 完成本地 `gcc_typeck_4528`、`gcc_expr2_27` 的 `5M+5M` A/B，结果为小幅正向但很接近噪声；准备以单独实验提交启用 `kmhv3.py` 默认 L1I FDP 后 push CI 观察全套 0.3c
+- [x] 2026-05-19 17:10 提交并 push cache-side FDP 实验序列：`58c7bb780c mem-cache: Add cache-side fetch directed prefetcher`、`2957b897f4 configs: Enable FDIP experiment for kmhv3`、`aab29b82b6 misc: Rebuild DRAMSim3 from a clean cache copy`
+- [x] 2026-05-19 18:20 CI run `26087243838` 成功完成，归档路径为 `/nfs/home/share/gem5_ci/performance_data/gcc15-spec06-0.3c/20260519_170615_aab29b8_kmhv3_run571`
+- [x] 2026-05-19 18:35 完成 CI 对比：cache-side FDP 的 SPEC06 0.3c overall score 为 19.921159，对比 stats-only baseline `20260518_201303_797e2e7_kmhv3_run569` 的 19.926709，约 -0.028%
+- [x] 2026-05-19 18:40 在高 I-cache-MPKI 切片 `gcc_s04_7630` 上本地测试 `min_target_distance=32/64/96/128/256`：`32` 会发包并变慢，`64+` 基本完全不发 FDP
 
 ## 发现和意外
 
@@ -141,6 +145,12 @@
 - `gcc_typeck_4528` 的 `5M+5M` 测量段结果为 cycles 1824859 -> 1822698（约 -0.12%），I-cache misses 15411 -> 15428，`pfIssued=113`、`pfUseful=41`、`pfUnused=49`、`demandMergedIntoPfMSHR=10`，no-MSHR blocked cycles 4171 -> 6655。
 - `gcc_expr2_27` 的 `5M+5M` 测量段结果为 cycles 2410537 -> 2410225（约 -0.013%），I-cache misses 13294 -> 13243，`pfIssued=293`、`pfUseful=146`、`pfUnused=115`、`demandMergedIntoPfMSHR=31`，no-MSHR blocked cycles 12493 -> 14211。
 - 目前 cache-side FDP 的局部信号是“有少量有效覆盖，但 MSHR/port 资源代价也可见”。这值得 push 一轮 CI 看全套 SPEC06 0.3c，但还不能说已经是可合入的收益方向。
+- 全套 CI 结果确认 cache-side FDP 初版不是可合入收益方向。对比 stats-only baseline，overall score 19.926709 -> 19.921159（约 -0.028%），Int 18.724052 -> 18.697521（约 -0.142%），FP 20.821866 -> 20.832814（约 +0.053%）。
+- benchmark 级主要负向来自 `omnetpp`（score -0.774%）、`libquantum`（-0.537%）、`milc`（-0.343%）、`sjeng`（-0.237%）、`gcc`（-0.172%）、`perlbench`（-0.171%）、`mcf`（-0.152%）；主要正向有 `zeusmp`（+0.762%）、`GemsFDTD`（+0.281%）、`gromacs`（+0.256%）。
+- 143 个切片 raw sum 中，FDP 减少了 4422 个 I-cache overall misses，但增加了 231269 个 L1I `noMshrBlockedCycles` 和 556832 个 cycles。全套发出 `pfIssued=24700`，`pfUseful=12176`，`demandMergedIntoPfMSHR=2780`，`pfOnlyFill=19818`。这说明机制确实覆盖到了一些将来 demand，但共享 L1I MSHR/端口压力抵消了收益。
+- 最明显负向切片包括 `gcc_s04_7630`（cycles +1.921%、miss +1246、noMSHR +12850、pfIssued 1620、pfUseful 776）、`gcc_typeck_4528`（+0.615%、miss +498、noMSHR +697、pfIssued 679、pfUseful 281）、`gcc_expr2_27`（+0.392%、miss +10、noMSHR +14479、pfIssued 944、pfUseful 319）。也存在正向切片，例如 `gcc_g23_8607`（cycles -0.583%、miss -161，但 noMSHR +6434、pfIssued 791、pfUseful 351）。
+- 在 `gcc_s04_7630` 本地 `5M+5M` 参数探测中，`min_target_distance=32`：cycles 1858483、pfIssued 342、pfUseful 161、noMSHR 2146；`64/96/128/256` 全部不发 FDP，cycles 1836924、noMSHR 1146。当前 target 距离分布很窄，阈值从 32 提到 64 就几乎等于关闭 FDP，说明简单调大距离阈值没有可用余地。
+- 机制判断：当前本地 BPU/FTQ 虽然能提供 target lifecycle，但大多数可预取 target 的 lead time 仍太短；cache-side HardPFReq 又与 demand fetch 共用 L1I MSHR 和下游端口。因此这个初版更像“late prefetch / MSHR competitor”，而不是有效隐藏 I-cache miss latency 的 side-channel。
 
 ## 第一版机制对照
 
@@ -203,3 +213,9 @@
 - Decision: 若继续 FDIP，应优先做 cache-side FDP/snoop 型实现，或至少为 fetch-side prototype 增加“只对 cache/MSHR miss 候选发包”的过滤接口。
 - Reason: 上游 FDP 的 TQ/PFQ/cache snoop 生命周期正是当前 prototype 缺失的关键能力；仅靠 target distance、skip-start-block、target-age 过滤无法稳定避免 L1I-hit prefetch 扰动。
 - Date: 2026-05-18
+- Decision: 不建议把当前 cache-side FDP 初版作为性能优化合入；`kmhv3.py` 默认启用 FDP 的提交只应视作实验开关。
+- Reason: 全套 SPEC06 0.3c CI 轻微负向，且机制计数显示 I-cache miss 减少不足以抵消 L1I MSHR/端口压力。`min_target_distance` 从 32 调到 64 就几乎完全不发 FDP，说明当前 ahead window 太窄，单纯调参很难获得稳定收益。
+- Date: 2026-05-19
+- Decision: 下一步若继续 FDIP，应转向更强 ahead source 或低优先级/不占 demand MSHR 的 cache-side 机制，而不是继续沿用当前 `HardPFReq` 直接进入 L1I MSHR 的路径。
+- Reason: 有效 prefetch 的必要条件是比 demand 早足够多且不明显抢 demand 资源；当前实现满足 lifecycle/snoop，但没有独立资源或足够提前量。
+- Date: 2026-05-19