From 1ab1a92584f6797a8286f910ca797a66df4f30dd Mon Sep 17 00:00:00 2001
From: mhnGitHubz <66786667+mhnGitHubz@users.noreply.github.com>
Date: Mon, 16 Mar 2026 10:38:31 +0800
Subject: [PATCH 01/38] add frontend and backend smt (#791)

Co-authored-by: mohaonan <mo.haonan1@sanechips.com.cn>
---
 src/cpu/o3/FuncScheduler.py |   7 +-
 src/cpu/o3/SConscript       |   2 +-
 src/cpu/o3/comm.hh          |  10 ++
 src/cpu/o3/fetch.cc         |  65 ++++++++++++-
 src/cpu/o3/fetch.hh         |  19 ++++
 src/cpu/o3/iew.cc           |   5 +-
 src/cpu/o3/inst_queue.cc    |   9 +-
 src/cpu/o3/inst_queue.hh    |   1 +
 src/cpu/o3/issue_queue.cc   | 177 ++++++++++++++++++++++++++++++------
 src/cpu/o3/issue_queue.hh   |  41 ++++++++-
 src/cpu/o3/smt_sched.hh     |  26 +++++-
 11 files changed, 323 insertions(+), 39 deletions(-)
diff --git a/src/cpu/o3/FuncScheduler.py b/src/cpu/o3/FuncScheduler.py
index 2d088a6032..7676f6d643 100644
--- a/src/cpu/o3/FuncScheduler.py
+++ b/src/cpu/o3/FuncScheduler.py
@@ -75,6 +75,11 @@ class PAgeSelector(BaseSelector):
 
     piece = Param.Int(2, "number of instructions in a group")
 
+class SMTBasedSelector(BaseSelector):  # SMT-aware ready-instruction selector
+    type = 'SMTBasedSelector'
+    cxx_class = 'gem5::o3::SMTBasedSelector'  # C++ implementation class
+    cxx_header = "cpu/o3/issue_queue.hh"  # header that declares the C++ class
+
 class IssueQue(SimObject):
     type = 'IssueQue'
     cxx_class = 'gem5::o3::IssueQue'
@@ -85,7 +90,7 @@ class IssueQue(SimObject):
     inports = Param.Int(2, "")
     scheduleToExecDelay = Param.Cycles(2, "")
     oports = VectorParam.IssuePort("")
-    sel = Param.BaseSelector(BaseSelector(), "Selector for this IQ (default: age first)")
+    sel = Param.BaseSelector(SMTBasedSelector(), "Selector for this IQ (default: SMT thread-priority)")
 
 class Scheduler(SimObject):
     type = 'Scheduler'
diff --git a/src/cpu/o3/SConscript b/src/cpu/o3/SConscript
index 1ee4cf9448..463a8cdfc0 100755
--- a/src/cpu/o3/SConscript
+++ b/src/cpu/o3/SConscript
@@ -32,7 +32,7 @@ Import('*')
 
 if env['CONF']['TARGET_ISA'] != 'null':
     SimObject('FuncScheduler.py', sim_objects=['FUPool', 'SpecWakeupChannel',
-              'IssuePort', 'IssueQue', 'BaseSelector', 'PAgeSelector', 'Scheduler'])
+              'IssuePort', 'IssueQue', 'BaseSelector', 'PAgeSelector', 'SMTBasedSelector', 'Scheduler'])
     SimObject('FuncUnitConfig.py', sim_objects=[])
     SimObject('BaseO3CPU.py', sim_objects=['BaseO3CPU'], enums=[
         'SMTFetchPolicy', 'SMTQueuePolicy', 'CommitPolicy', 'ROBWalkPolicy', 'ROBCompressPolicy', 'PerfRecord'])
diff --git a/src/cpu/o3/comm.hh b/src/cpu/o3/comm.hh
index cb88ad769f..f15257426f 100644
--- a/src/cpu/o3/comm.hh
+++ b/src/cpu/o3/comm.hh
@@ -168,6 +168,12 @@ struct IssueStruct
     DynInstPtr insts[MaxWidth];
 };
 
+struct SquashInfo  // describes one squash request: which thread, from which seqNum
+{
+    InstSeqNum squashSn = 0;   // insts with a larger seqNum are squashed
+    ThreadID   squashTid = 0;  // only insts of this thread are affected
+};
+
 struct SquashVersion
 {
     uint8_t version;
@@ -246,6 +252,10 @@ struct TimeStruct
         };
         /** Resolved control-flow PCs produced this cycle (fetch buffers/merges). */
         std::vector<ResolvedCFIEntry> resolvedCFIs;  // *F
+
+        unsigned iqCount;
+        unsigned ldstqCount;
+        unsigned robCount;
     };
 
     IewComm iewInfo[MaxThreads]; // iew to rename, fetch
diff --git a/src/cpu/o3/fetch.cc b/src/cpu/o3/fetch.cc
index 21c9cec4e6..3c00c5937d 100644
--- a/src/cpu/o3/fetch.cc
+++ b/src/cpu/o3/fetch.cc
@@ -148,6 +148,8 @@ Fetch::Fetch(CPU *_cpu, const BaseO3CPUParams &params)
         threads[tid].data = new uint8_t[fetchBufferSize];
     }
 
+    initDecodeScheduler();
+
     // Get the size of an instruction.
     // stallReason size should be the same as decodeWidth,renameWidth,dispWidth
     stallReason.resize(decodeWidth, StallReason::NoStall);
@@ -372,6 +374,41 @@ Fetch::setTimeBuffer(TimeBuffer<TimeStruct> *time_buffer)
     fromCommit = timeBuffer->getWire(-commitToFetchDelay);
 }
 
+void
+Fetch::initDecodeScheduler()
+{
+    // Create the per-thread LSQ/IQ/ROB counters that feed the decode scheduler.
+    lsqCounter = new InstsCounter();
+    iqCounter  = new InstsCounter();
+    robCounter = new InstsCounter();
+    DPRINTF(Fetch, "Initialized SMT Decode Scheduler: 0\n");
+    // Start every hardware thread with a count of zero in all three counters.
+    for (ThreadID tid = 0; tid < numThreads; tid++) 
+    {
+        lsqCounter->setCounter(tid, 0);
+        iqCounter->setCounter(tid, 0);
+        robCounter->setCounter(tid, 0);
+    }
+    DPRINTF(Fetch, "Initialized SMT Decode Scheduler: 1\n");
+    // Instantiate the scheduler implementation named by smtDecodePolicy.
+    if (smtDecodePolicy == "icount") {
+        // Use ROB as default counter for icount
+        decodeScheduler = new ICountScheduler(numThreads, robCounter);
+    }
+    else if (smtDecodePolicy == "delayed") {
+        decodeScheduler = new DelayedICountScheduler(numThreads, robCounter, delayedSchedulerDelay);
+    }
+    else if (smtDecodePolicy == "multi_priority") {
+        decodeScheduler = new MultiPrioritySched(numThreads, {lsqCounter, iqCounter, robCounter});
+    }
+    else {
+        // Any other policy string: DelayedICountScheduler with delay = numThreads.
+        decodeScheduler = new DelayedICountScheduler(numThreads, robCounter, numThreads);
+    }
+    // NOTE(review): delayedSchedulerDelay is read above but never assigned here - confirm it is initialized.
+    DPRINTF(Fetch, "Initialized SMT Decode Scheduler: %s\n", smtDecodePolicy.c_str());
+}
+
 void
 Fetch::setActiveThreads(std::list<ThreadID> *at_ptr)
 {
@@ -1285,6 +1322,32 @@ Fetch::handleInterrupts()
     }
 }
 
+ThreadID
+Fetch::selectUnstalledThread()
+{
+    // Refresh the per-thread LSQ/IQ/ROB counters from the IEW feedback, then
+    // return whichever thread decodeScheduler picks. Threads whose fetch is
+    // blocked get UINT64_MAX in every counter (presumably so the scheduler
+    // ranks them last - confirm); the caller re-checks blockFetch on the result.
+    for (ThreadID tid = 0; tid < numThreads; ++tid) {
+        if (!stallSig->blockFetch[tid]) {
+            lsqCounter->setCounter(tid, fromIEW->iewInfo[tid].ldstqCount);
+            iqCounter->setCounter(tid, fromIEW->iewInfo[tid].iqCount);
+            robCounter->setCounter(tid, fromIEW->iewInfo[tid].robCount);
+           
+        } else {
+            lsqCounter->setCounter(tid, UINT64_MAX);
+            iqCounter->setCounter(tid, UINT64_MAX);
+            robCounter->setCounter(tid, UINT64_MAX);
+            
+        }
+        DPRINTF(Fetch, "lsqCounter->setCounter: %d iqCounter->setCounter: %d robCounter->setCounter: %d\n",fromIEW->iewInfo[tid].ldstqCount,fromIEW->iewInfo[tid].iqCount,fromIEW->iewInfo[tid].robCount);
+    }
+    // The trace above always prints the raw IEW counts, even for blocked threads.
+    ThreadID selected = decodeScheduler->getThread();
+    return selected;
+}
+
 void
 Fetch::sendInstructionsToDecode()
 {
@@ -1321,7 +1384,7 @@ Fetch::sendInstructionsToDecode()
         return;
     }
 
-    ThreadID tid = 0; // TODO: smt support
+    ThreadID tid =selectUnstalledThread();
 
     // fetch totally stalled
     if (stallSig->blockFetch[tid]) {
diff --git a/src/cpu/o3/fetch.hh b/src/cpu/o3/fetch.hh
index 19091ef30e..6e114487cf 100644
--- a/src/cpu/o3/fetch.hh
+++ b/src/cpu/o3/fetch.hh
@@ -65,6 +65,7 @@
 #include "mem/port.hh"
 #include "sim/eventq.hh"
 #include "sim/probe/probe.hh"
+#include "cpu/o3/smt_sched.hh"
 
 namespace gem5
 {
@@ -233,6 +234,18 @@ class Fetch
     /** To probe when a fetch request is successfully sent. */
     ProbePointArg<RequestPtr> *ppFetchRequestSent;
 
+    // SMT Decode Scheduler
+    SMTScheduler* decodeScheduler;
+
+    // Counters from backend structures (to be passed in)
+    InstsCounter* lsqCounter;
+    InstsCounter* iqCounter;
+    InstsCounter* robCounter;
+
+    // Configuration parameters
+    std::string smtDecodePolicy ="multi_priority";
+    int delayedSchedulerDelay;
+
   public:
     /** Fetch constructor. */
     Fetch(CPU *_cpu, const BaseO3CPUParams &params);
@@ -299,6 +312,12 @@ class Fetch
 
     /** For priority-based fetch policies, need to keep update priorityList */
     void deactivateThread(ThreadID tid);
+
+    // Function to initialize scheduler
+    void initDecodeScheduler();
+
+    // Select a thread that is not fetch-blocked, using scheduler
+    ThreadID selectUnstalledThread();
   private:
     /** Reset this pipeline stage */
     void resetStage();
diff --git a/src/cpu/o3/iew.cc b/src/cpu/o3/iew.cc
index 81c261bc40..d9c815b86c 100644
--- a/src/cpu/o3/iew.cc
+++ b/src/cpu/o3/iew.cc
@@ -528,7 +528,7 @@ IEW::squash(ThreadID tid)
 
     for (auto& dp : dispQue) {
         for (auto& it : dp) {
-            if (it->seqNum > fromCommit->commitInfo[tid].doneSeqNum) {
+            if (it->seqNum > fromCommit->commitInfo[tid].doneSeqNum && (it->threadNumber == tid)) {
                 it->setSquashed();
             }
         }
@@ -1556,6 +1556,9 @@ IEW::executeInsts()
     ThreadID tid = *activeThreads->begin();
     toFetch->iewInfo[tid].resolvedCFIs.clear();
 
+    toFetch->iewInfo[tid].ldstqCount=ldstQueue.getCount(tid);
+    toFetch->iewInfo[tid].robCount= rob->getThreadEntries(tid);
+    toFetch->iewInfo[tid].iqCount= scheduler->getIQInsts(tid);
     // Execute/writeback any instructions that are available.
     int insts_to_execute = fromIssue->size;
     fromIssue->size = 0;
diff --git a/src/cpu/o3/inst_queue.cc b/src/cpu/o3/inst_queue.cc
index 89a027c3b1..29573959cf 100644
--- a/src/cpu/o3/inst_queue.cc
+++ b/src/cpu/o3/inst_queue.cc
@@ -151,7 +151,8 @@ InstructionQueue::InstructionQueue(CPU *cpu_ptr, IEW *iew_ptr,
     scheduler->setCPU(cpu_ptr, &iew_ptr->ldstQueue);
     scheduler->resetDepGraph(numPhysRegs);
     scheduler->setMemDepUnit(memDepUnit);
-
+    scheduler->initIQICountSmtScheduler(numThreads);
+    
     resetState();
 }
 
@@ -1121,7 +1122,9 @@ InstructionQueue::doSquash(ThreadID tid)
 
     DPRINTF(IQ, "[tid:%i] Squashing until sequence number %i!\n",
             tid, squashedSeqNum[tid]);
-    scheduler->doSquash(squashedSeqNum[tid]);
+    squashInfo.squashTid = tid;
+    squashInfo.squashSn  = squashedSeqNum[tid];
+    scheduler->doSquash(squashInfo);
 
     for (auto it = mdpAddrReplayLdInsts.begin(); it != mdpAddrReplayLdInsts.end();) {
         if (!it->inst ||
@@ -1134,7 +1137,7 @@ InstructionQueue::doSquash(ThreadID tid)
     }
 
     for (auto it = nonSpecInsts.begin(); it != nonSpecInsts.end();) {
-        if (it->first > squashedSeqNum[tid]) {
+        if (it->first > squashedSeqNum[tid]  && (it->second->threadNumber == tid)) {
             auto& squashed_inst = it->second;
             if (!squashed_inst->isIssued() ||
                 (squashed_inst->isMemRef() &&
diff --git a/src/cpu/o3/inst_queue.hh b/src/cpu/o3/inst_queue.hh
index db01710da9..f163ebb28e 100644
--- a/src/cpu/o3/inst_queue.hh
+++ b/src/cpu/o3/inst_queue.hh
@@ -427,6 +427,7 @@ class InstructionQueue
 
     /** The sequence number of the squashed instruction. */
     InstSeqNum squashedSeqNum[MaxThreads];
+    SquashInfo    squashInfo;
 
     struct IQStats : public statistics::Group
     {
diff --git a/src/cpu/o3/issue_queue.cc b/src/cpu/o3/issue_queue.cc
index e1ba93a79a..50fa7a5eb5 100644
--- a/src/cpu/o3/issue_queue.cc
+++ b/src/cpu/o3/issue_queue.cc
@@ -140,6 +140,58 @@ PAgeSelector::select(ReadyQue::iterator begin, int portid)
     }
 }
 
+void
+SMTBasedSelector::setparent(Scheduler* scheduler, IssueQue* iq)
+{
+    BaseSelector::setparent(scheduler, iq);
+    // NOTE(review): caches nullptr if this runs before initIndependentIQICountScheduler() - confirm call order.
+    smtScheduler = iq->getIndependentIQICountScheduler();
+}
+
+ReadyQue::iterator
+SMTBasedSelector::select(ReadyQue::iterator begin, int portid)
+{
+    // Return the first entry of [begin, end) owned by the thread the SMT
+    // scheduler prefers; if it owns none, return the first entry of any
+    // other thread. An empty range returns end.
+    if (begin == end) {
+        return end;
+    }
+
+    // Thread 0 is preferred while no SMT scheduler is attached.
+    ThreadID preferred = 0;
+    if (smtScheduler) {
+        preferred = smtScheduler->getThread();
+        DPRINTF(Schedule, "SMTBasedSelector: priority thread = %d\n",
+                preferred);
+    }
+
+    // First pass: an entry that belongs to the preferred thread.
+    for (auto cur = begin; cur != end; ++cur) {
+        if ((*cur)->threadNumber != preferred) {
+            continue;
+        }
+        DPRINTF(Schedule, "[sn:%llu] selected by SMT policy (tid=%d)\n",
+                (*cur)->seqNum, preferred);
+        return cur;
+    }
+
+    // Second pass: the first entry of any other thread.
+    for (auto cur = begin; cur != end; ++cur) {
+        if ((*cur)->threadNumber == preferred) {
+            continue;
+        }
+        DPRINTF(Schedule,
+                "[sn:%llu] selected by default (tid=%d, priority=%d)\n",
+                (*cur)->seqNum, (*cur)->threadNumber, preferred);
+        return cur;
+    }
+
+    // Not reached for a non-empty range: one of the passes always returns.
+    DPRINTF(Schedule, "SMTBasedSelector: no available instruction\n");
+    return begin;
+}
+
 bool
 IssueQue::select_policy::operator()(const DynInstPtr& a, const DynInstPtr& b) const
 {
@@ -312,6 +364,9 @@ IssueQue::IssueQue(const IssueQueParams& params)
         if (storePipeAcc)
             numStorePipe++;
     }
+
+    //Init InstsCounter
+    instsCounter = new InstsCounter();
 }
 
 void
@@ -361,6 +416,9 @@ IssueQue::addToFu(const DynInstPtr& inst)
     }
     inst->setIssued();
     POPINST(inst);
+    if (hasInstsCounter()) {
+        decInIQInstsCounter(inst->threadNumber);
+    }
     scheduler->addToFU(inst);
 }
 
@@ -513,14 +571,16 @@ IssueQue::wakeUpDependents(const DynInstPtr& inst, bool speculative)
         for (auto& it : depgraph) {
             int srcIdx = it.first;
             auto& consumer = it.second;
-            if (consumer->readySrcIdx(srcIdx)) {
-                continue;
-            }
-            consumer->markSrcRegReady(srcIdx);
+            if(consumer->threadNumber == inst->threadNumber){
+                if (consumer->readySrcIdx(srcIdx)) {
+                    continue;
+                }
+                consumer->markSrcRegReady(srcIdx);
 
 
-            DPRINTF(Schedule, "[sn:%llu] src%d was woken\n", consumer->seqNum, srcIdx);
-            addIfReady(consumer);
+                DPRINTF(Schedule, "[sn:%llu] src%d was woken\n", consumer->seqNum, srcIdx);
+                addIfReady(consumer);
+            }
         }
 
         if (!speculative) {
@@ -725,6 +785,9 @@ IssueQue::insert(const DynInstPtr& inst)
     selector->allocate(inst);
     inst->issueQue = this;
     instList.emplace_back(inst);
+    if (hasInstsCounter()) {
+        incInIQInstsCounter(inst->threadNumber);
+    }
     bool addToDepGraph = false;
     for (int i = 0; i < inst->numSrcRegs(); i++) {
         auto src = inst->renamedSrcIdx(i);
@@ -780,11 +843,14 @@ IssueQue::doCommit(const InstSeqNum seqNum)
 }
 
 void
-IssueQue::doSquash(const InstSeqNum seqNum)
+IssueQue::doSquash(SquashInfo squashInfo)
 {
     for (auto it = instList.begin(); it != instList.end();) {
-        if ((*it)->seqNum > seqNum) {
+        if (((*it)->seqNum > squashInfo.squashSn) && ((*it)->threadNumber == squashInfo.squashTid)) {
             if (!(*it)->isIssued()) {
+                if (hasInstsCounter()) {
+                    decInIQInstsCounter((*it)->threadNumber);
+                }
                 POPINST((*it));
                 (*it)->setIssued();
             }
@@ -807,7 +873,7 @@ IssueQue::doSquash(const InstSeqNum seqNum)
         int size = inflightIssues[-i].size;
         for (int j = 0; j < size; j++) {
             auto& inst = inflightIssues[-i].insts[j];
-            if (inst && inst->isSquashed()) {
+            if (inst && inst->isSquashed() && (inst->threadNumber == squashInfo.squashTid)) {
                 inst = nullptr;
             }
         }
@@ -816,7 +882,7 @@ IssueQue::doSquash(const InstSeqNum seqNum)
     // clear in depGraph
     for (auto& entrys : subDepGraph) {
         for (auto it = entrys.begin(); it != entrys.end();) {
-            if ((*it).second->isSquashed()) {
+            if ((*it).second->isSquashed() && ((*it).second->threadNumber == squashInfo.squashTid)) {
                 it = entrys.erase(it);
             } else {
                 it++;
@@ -825,6 +891,33 @@ IssueQue::doSquash(const InstSeqNum seqNum)
     }
 }
 
+void
+IssueQue::incInIQInstsCounter(ThreadID tid)
+{
+    if (!instsCounter)
+        return;  // no counter attached: nothing to update
+    instsCounter->incCounter(tid);
+}
+    
+void
+IssueQue::decInIQInstsCounter(ThreadID tid)
+{
+    if (!instsCounter)
+        return;  // no counter attached: nothing to update
+    instsCounter->decCounter(tid);
+}
+
+void
+IssueQue::initIndependentIQICountScheduler(int numThreads)
+{
+    // The scheduler reads instsCounter, so the counter must already exist.
+    assert(instsCounter != nullptr && "InstsCounter must be set first");
+
+    auto* sched = new IndependentIQICountScheduler(numThreads, instsCounter);
+    independentIQICountScheduler = sched;
+    DPRINTF(Schedule, "[%s] IndependentIQICountScheduler created.\n", iqname);
+}
+
 Scheduler::SpecWakeupCompletion::SpecWakeupCompletion(const DynInstPtr& inst, IssueQue* to,
                                                       PendingWakeEventsType* owner)
     : Event(Stat_Event_Pri, AutoDelete), inst(inst), owner(owner), to_issue_queue(to)
@@ -1451,24 +1544,26 @@ Scheduler::loadCancel(const DynInstPtr& inst)
                 for (auto& it : iq->subDepGraph[dst->flatIndex()]) {
                     int srcIdx = it.first;
                     auto& depInst = it.second;
-                    if (depInst->readySrcIdx(srcIdx)) {
-                        DPRINTF(Schedule, "cancel [sn:%llu], clear src p%d ready\n", depInst->seqNum,
-                                depInst->renamedSrcIdx(srcIdx)->flatIndex());
-                        if (depInst->isIssued()) {
-                            if (inst->vpMisprediction) {
-                                // VP misprediction: consumer may already be in-flight.
-                                // Mark canceled and propagate to its dependents.
-                                depInst->setCancel();
-                                depInst->clearSrcRegReady(srcIdx);
-                                dfs.push(depInst);
-                                needSquashFallback = true;
+                    if (depInst->threadNumber == inst->threadNumber) {
+                        if (depInst->readySrcIdx(srcIdx)) {
+                            DPRINTF(Schedule, "cancel [sn:%llu], clear src p%d ready\n", depInst->seqNum,
+                                    depInst->renamedSrcIdx(srcIdx)->flatIndex());
+                            if (depInst->isIssued()) {
+                                if (inst->vpMisprediction) {
+                                    // VP misprediction: consumer may already be in-flight.
+                                    // Mark canceled and propagate to its dependents.
+                                    depInst->setCancel();
+                                    depInst->clearSrcRegReady(srcIdx);
+                                    dfs.push(depInst);
+                                    needSquashFallback = true;
+                                }
+                                continue;
                             }
-                            continue;
-                        }
 
-                        depInst->issueQue->cancel(depInst);
-                        depInst->clearSrcRegReady(srcIdx);
-                        dfs.push(depInst);
+                            depInst->issueQue->cancel(depInst);
+                            depInst->clearSrcRegReady(srcIdx);
+                            dfs.push(depInst);
+                        }
                     }
                 }
             }
@@ -1591,11 +1686,11 @@ Scheduler::doCommit(const InstSeqNum seqNum)
 }
 
 void
-Scheduler::doSquash(const InstSeqNum seqNum)
+Scheduler::doSquash(SquashInfo squashInfo)
 {
-    DPRINTF(Schedule, "doSquash until seqNum %lu\n", seqNum);
+    DPRINTF(Schedule, "doSquash until seqNum %lu\n", squashInfo.squashSn);
     for (auto it : issueQues) {
-        it->doSquash(seqNum);
+        it->doSquash(squashInfo);
     }
 }
 
@@ -1609,6 +1704,17 @@ Scheduler::getIQInsts()
     return total;
 }
 
+uint32_t
+Scheduler::getIQInsts(ThreadID tid)
+{
+    // Sum thread tid's per-IQ instruction counter over every issue queue.
+    uint32_t total = 0;
+    for (auto iq : issueQues)
+        total += iq->getInstsCounter()->getCounter(tid);
+    return total;
+}
+
+
 void
 Scheduler::setMainRdpOpt(bool enable)
 {
@@ -1617,5 +1723,18 @@ Scheduler::setMainRdpOpt(bool enable)
     }
 }
 
+void
+Scheduler::initIQICountSmtScheduler(int numThreads)
+{
+    // Give every issue queue its own per-thread instruction-count scheduler.
+    DPRINTF(Schedule, "Initializing IQ SMT schedulers for %d thread.\n", numThreads);
+    // TODO: make the per-IQ SMT scheduling policy selectable.
+    for (auto iq : issueQues) {
+        // Checked inline so no assert-only local is left unused under NDEBUG.
+        assert(iq->getInstsCounter());
+        iq->initIndependentIQICountScheduler(numThreads);
+    }
+}
+
 }
 }
diff --git a/src/cpu/o3/issue_queue.hh b/src/cpu/o3/issue_queue.hh
index 0058bbb8df..b1ab4f361a 100644
--- a/src/cpu/o3/issue_queue.hh
+++ b/src/cpu/o3/issue_queue.hh
@@ -16,12 +16,14 @@
 #include "cpu/inst_seq.hh"
 #include "cpu/o3/dyn_inst.hh"
 #include "cpu/o3/dyn_inst_ptr.hh"
+#include "cpu/o3/smt_sched.hh"
 #include "cpu/reg_class.hh"
 #include "cpu/timebuf.hh"
 #include "params/BaseSelector.hh"
 #include "params/IssuePort.hh"
 #include "params/IssueQue.hh"
 #include "params/PAgeSelector.hh"
+#include "params/SMTBasedSelector.hh"
 #include "params/Scheduler.hh"
 #include "params/SpecWakeupChannel.hh"
 #include "sim/sim_object.hh"
@@ -99,11 +101,25 @@ class PAgeSelector : public BaseSelector
     ReadyQue::iterator select(ReadyQue::iterator begin, int portid) override;
 };
 
+/** Selector that prefers ready insts of the thread picked by the IQ's SMT scheduler. */
+class SMTBasedSelector : public BaseSelector
+{
+    IndependentIQICountScheduler* smtScheduler = nullptr;  // assigned in setparent()
+  public:
+    SMTBasedSelector(const SMTBasedSelectorParams& params) : BaseSelector(params) {}
+    void setparent(Scheduler* scheduler, IssueQue* iq) override;
+    ReadyQue::iterator select(ReadyQue::iterator begin, int portid) override;
+    void allocate(const DynInstPtr& inst) override { BaseSelector::allocate(inst); }
+    void deallocate(const DynInstPtr& inst) override { BaseSelector::deallocate(inst); }
+};
+
 class IssueQue : public SimObject
 {
     friend class Scheduler;
     friend class BaseSelector;
     friend class PAgeSelector;
+    friend class InstsCounter;
+    friend class IndependentIQICountScheduler;
 
     std::string _name;
     const int inports;
@@ -172,6 +188,10 @@ class IssueQue : public SimObject
     Scheduler* scheduler = nullptr;
     BaseSelector* selector = nullptr;
 
+    //iq smt scheduler
+    InstsCounter* instsCounter = nullptr;
+    IndependentIQICountScheduler* independentIQICountScheduler = nullptr;
+
     struct IssueQueStats : public statistics::Group
     {
         IssueQueStats(statistics::Group* parent, IssueQue* que, std::string name);
@@ -207,6 +227,21 @@ class IssueQue : public SimObject
     void setMainRdpOpt(bool enable) { enableMainRdpOpt = enable; }
     void resetDepGraph(int numPhysRegs);
 
+    void setInstsCounter(InstsCounter* counter) { instsCounter = counter;}
+
+    InstsCounter* getInstsCounter() const {return instsCounter; }
+
+    void incInIQInstsCounter(ThreadID tid);
+    void decInIQInstsCounter(ThreadID tid);
+    bool hasInstsCounter() const { return instsCounter != nullptr; }
+
+    void initIndependentIQICountScheduler(int numThreads);
+
+    void setIndependentIQICountScheduler( IndependentIQICountScheduler* _independentIQICountScheduler ) {
+      independentIQICountScheduler = _independentIQICountScheduler;
+    }
+    IndependentIQICountScheduler* getIndependentIQICountScheduler() { return independentIQICountScheduler; }
+
     void tick();
     bool ready();
     int emptyEntries() const { return iqsize - instNum; }
@@ -219,7 +254,7 @@ class IssueQue : public SimObject
     bool idle();
 
     void doCommit(const InstSeqNum inst);
-    void doSquash(const InstSeqNum seqNum);
+    void doSquash(SquashInfo squashInfo);
 
     int getIssueStages() { return scheduleToExecDelay; }
     int getId() { return IQID; }
@@ -331,6 +366,7 @@ class Scheduler : public SimObject
     void setAllScoreBoard(PhysRegIdPtr reg);
     void setMemDepUnit(MemDepUnit* memDepUnit) { this->memDepUnit = memDepUnit; }
     void setMainRdpOpt(bool enable);
+    void initIQICountSmtScheduler(int numThreads);
 
     void tick();
     void issueAndSelect();
@@ -360,8 +396,9 @@ class Scheduler : public SimObject
     bool hasReadyInsts();
     bool isDrained();
     void doCommit(const InstSeqNum seqNum);
-    void doSquash(const InstSeqNum seqNum);
+    void doSquash(SquashInfo squashInfo);
     uint32_t getIQInsts();
+    uint32_t getIQInsts(ThreadID tid);
 
     SchedulerStats& getStats() { return stats; }
 };
diff --git a/src/cpu/o3/smt_sched.hh b/src/cpu/o3/smt_sched.hh
index d5222e758d..e6b00ab4d8 100644
--- a/src/cpu/o3/smt_sched.hh
+++ b/src/cpu/o3/smt_sched.hh
@@ -28,6 +28,8 @@ class InstsCounter
 
     uint64_t getCounter(ThreadID tid) { return counter[tid]; }
     void setCounter(ThreadID tid, uint64_t value) { counter[tid] = value; }
+    void incCounter(ThreadID tid, uint64_t value = 1) { counter[tid] += value; }
+    void decCounter(ThreadID tid, uint64_t value = 1) { counter[tid] -= value; }
 };
 
 class SMTScheduler
@@ -36,7 +38,8 @@ class SMTScheduler
     int numThreads;
   public:
     SMTScheduler(int numThreads) : numThreads(numThreads) {}
-    virtual ThreadID getThread();
+    virtual ~SMTScheduler() = default;
+    virtual ThreadID getThread() = 0;
 };
 
 
@@ -124,7 +127,28 @@ class MultiPrioritySched : public SMTScheduler
     }
 };
 
+// Chooses the thread with the largest count in a single IQ's InstsCounter.
+class IndependentIQICountScheduler : public SMTScheduler
+{
 
+    InstsCounter* counter;  // per-thread counts of one IQ; not freed by this class
+  public:
+    IndependentIQICountScheduler(int numThreads, InstsCounter* counter)
+        : SMTScheduler(numThreads), counter(counter)
+    {}
+
+    // Returns the thread with the highest count; ties go to the lowest tid.
+    ThreadID
+    getThread() override
+    {
+        ThreadID best = 0;
+        for (ThreadID tid = 1; tid < numThreads; ++tid) {
+            if (counter->getCounter(tid) > counter->getCounter(best))
+                best = tid;
+        }
+        return best;
+    }
+};
 
 }}
 #endif

From 81b9cc0cb6ce8efdfab3a3b2e35cd9a5019290db Mon Sep 17 00:00:00 2001
From: tastynoob <934348725@qq.com>
Date: Tue, 17 Mar 2026 14:21:10 +0800
Subject: [PATCH 02/38] cpu-o3: fix smt framework

---
 configs/common/Options.py           |  8 ++--
 configs/common/xiangshan.py         |  5 +++
 src/cpu/o3/comm.hh                  | 22 +++++++---
 src/cpu/o3/commit.cc                | 10 +++--
 src/cpu/o3/commit.hh                |  2 +-
 src/cpu/o3/cpu.cc                   | 16 +++----
 src/cpu/o3/decode.cc                |  7 +--
 src/cpu/o3/decode.hh                |  2 +-
 src/cpu/o3/fetch.cc                 | 66 +++++++++++++++++------------
 src/cpu/o3/fetch.hh                 |  2 +-
 src/cpu/o3/iew.cc                   | 12 +++---
 src/cpu/o3/iew.hh                   |  2 +-
 src/cpu/o3/inst_queue.cc            |  2 +-
 src/cpu/o3/issue_queue.cc           | 17 +++++---
 src/cpu/o3/issue_queue.hh           |  4 +-
 src/cpu/o3/lsq.cc                   | 12 ++++++
 src/cpu/o3/lsq.hh                   |  4 ++
 src/cpu/o3/rename.cc                |  7 +--
 src/cpu/o3/rename.hh                |  2 +-
 src/cpu/pred/btb/decoupled_bpred.cc | 35 +++++++++++++--
 src/cpu/pred/btb/decoupled_bpred.hh |  5 +--
 21 files changed, 157 insertions(+), 85 deletions(-)

diff --git a/configs/common/Options.py b/configs/common/Options.py
index 441840b5c8..4c314ddbf9 100644
--- a/configs/common/Options.py
+++ b/configs/common/Options.py
@@ -349,16 +349,14 @@ def addCommonOptions(parser, configure_xiangshan=False):
         "that are present under any of the roots. If not given, dump all "
         "stats. ")
 
+    parser.add_argument("--smt", action="store_true", default=False,
+                        help="""RISC-V SMT support, which requires multi-thread-supported gcpt restore and diff-ref-so""")
+
     if configure_xiangshan:
         return
     # Following options are not available in XiangShan
 
     parser.add_argument("--checker", action="store_true")
-    parser.add_argument("--smt", action="store_true", default=False,
-                        help="""
-                      Only used if multiple programs are specified. If true,
-                      then the number of threads per cpu is same as the
-                      number of programs.""")
     parser.add_argument(
         "--elastic-trace-en", action="store_true",
         help="""Enable capture of data dependency and instruction
diff --git a/configs/common/xiangshan.py b/configs/common/xiangshan.py
index ca5362d449..3e2c0fa4a2 100644
--- a/configs/common/xiangshan.py
+++ b/configs/common/xiangshan.py
@@ -439,7 +439,12 @@ def _finish_xiangshan_system(args, test_sys, TestCPUClass, ruby):
     test_sys.cpu = [TestCPUClass(clk_domain=test_sys.cpu_clk_domain, cpu_id=i)
                     for i in range(np)]
     # Configure MMU for trace-aware FS mode
+    if args.smt:
+        test_sys.multi_thread = True
+
     for cpu in test_sys.cpu:
+        if args.smt:
+            cpu.numThreads = 2
         cpu.mmu.pma_checker = PMAChecker(
             uncacheable=[AddrRange(0, size=0x80000000)])
         cpu.mmu.functional = args.functional_tlb
diff --git a/src/cpu/o3/comm.hh b/src/cpu/o3/comm.hh
index f15257426f..ade70ed5e3 100644
--- a/src/cpu/o3/comm.hh
+++ b/src/cpu/o3/comm.hh
@@ -187,14 +187,23 @@ struct SquashVersion
         return (version + 1) % versionLimit;
     }
     bool largerThan(uint8_t other) const {
-        bool larger = version > other && version - other <= maxInflightSquash;
-        bool wrapped_larger =
-            version + versionLimit > other &&
-            version + versionLimit - other <= maxInflightSquash;
-        if (!(larger || wrapped_larger || (version == other))) {
+        const uint8_t distance = (version + versionLimit - other) % versionLimit;
+        if (distance == 0) {
+            return false;
+        }
+
+        if (distance <= maxInflightSquash) {
+            return true;
+        }
+
+        if (versionLimit - distance <= maxInflightSquash) {
+            return false;
+        }
+
+        if (version != other) {
             panic("SquashVersion: %d, other: %d\n", version, other);
         }
-        return larger || wrapped_larger;
+        return false;
     }
     void update(uint8_t v) {
         version = v;
@@ -205,6 +214,7 @@ struct SquashVersion
 
 struct ResolveQueueEntry
 {
+    ThreadID resolvedTid;
     uint64_t resolvedFTQId;
     std::vector<uint64_t> resolvedInstPC;
 };
diff --git a/src/cpu/o3/commit.cc b/src/cpu/o3/commit.cc
index e7036301b4..f0040c0d4f 100644
--- a/src/cpu/o3/commit.cc
+++ b/src/cpu/o3/commit.cc
@@ -1997,10 +1997,10 @@ Commit::squashInflightAndUpdateVersion(ThreadID tid)
 
     fixedbuffer[tid].clear();
 
-    localSquashVer.update(localSquashVer.nextVersion());
-    toIEW->commitInfo[tid].squashVersion = localSquashVer;
+    localSquashVer[tid].update(localSquashVer[tid].nextVersion());
+    toIEW->commitInfo[tid].squashVersion = localSquashVer[tid];
     DPRINTF(Commit, "Updating squash version to %u\n",
-            localSquashVer.getVersion());
+            localSquashVer[tid].getVersion());
 }
 
 void
@@ -2021,7 +2021,9 @@ Commit::markCompletedInsts()
             fromIEW->insts[inst_num]->setCanCommit();
             auto &inst = fromIEW->insts[inst_num];
 
-            panic_if(!rob->findInst(0, inst->seqNum), "[sn:%llu] Committed instruction not found in ROB",
+            panic_if(!rob->findInst(inst->threadNumber, inst->seqNum),
+                     "[tid:%i] [sn:%llu] Committed instruction not found in ROB",
+                     inst->threadNumber,
                      inst->seqNum);
         }
     }
diff --git a/src/cpu/o3/commit.hh b/src/cpu/o3/commit.hh
index 4cb184af98..465732ea0e 100644
--- a/src/cpu/o3/commit.hh
+++ b/src/cpu/o3/commit.hh
@@ -431,7 +431,7 @@ class Commit
     /** Wire to read information from rename queue. */
     TimeBuffer<RenameStruct>::wire fromRename;
 
-    SquashVersion localSquashVer;
+    SquashVersion localSquashVer[MaxThreads];
 
   public:
     /** ROB interface. */
diff --git a/src/cpu/o3/cpu.cc b/src/cpu/o3/cpu.cc
index 502f02c25e..162a3ad341 100644
--- a/src/cpu/o3/cpu.cc
+++ b/src/cpu/o3/cpu.cc
@@ -135,13 +135,6 @@ CPU::CPU(const BaseO3CPUParams &params)
       cpuStats(this),
       valuePred(params.valuePred)
 {
-    fatal_if(FullSystem && params.numThreads > 1,
-            "SMT is not supported in O3 in full system mode currently.");
-
-    fatal_if(!FullSystem && params.numThreads < params.workload.size(),
-            "More workload items (%d) than threads (%d) on CPU %s.",
-            params.workload.size(), params.numThreads, name());
-
     if (!params.switched_out) {
         _status = Running;
     } else {
@@ -206,7 +199,10 @@ CPU::CPU(const BaseO3CPUParams &params)
 
     ThreadID active_threads;
     if (FullSystem) {
-        active_threads = 1;
+        // FS-SMT still uses one shared workload/system image, but the O3 core
+        // must provision per-thread architectural state for every hardware
+        // thread context exposed by the CPU.
+        active_threads = numThreads;
     } else {
         active_threads = params.workload.size();
 
@@ -283,9 +279,7 @@ CPU::CPU(const BaseO3CPUParams &params)
 
     for (ThreadID tid = 0; tid < numThreads; ++tid) {
         if (FullSystem) {
-            // SMT is not supported in FS mode yet.
-            assert(numThreads == 1);
-            thread[tid] = new ThreadState(this, 0, NULL);
+            thread[tid] = new ThreadState(this, tid, NULL);
         } else {
             if (tid < params.workload.size()) {
                 DPRINTF(O3CPU, "Workload[%i] process is %#x", tid,
diff --git a/src/cpu/o3/decode.cc b/src/cpu/o3/decode.cc
index ecb274b152..d76d0fbc60 100644
--- a/src/cpu/o3/decode.cc
+++ b/src/cpu/o3/decode.cc
@@ -401,7 +401,7 @@ Decode::moveInstsToBuffer()
     for (int i = 0; i < insts_from_fetch; ++i) {
         const DynInstPtr &inst = stallBuffer.front();
         assert(tid == inst->threadNumber);
-        if (localSquashVer.largerThan(inst->getVersion())) {
+        if (localSquashVer[tid].largerThan(inst->getVersion())) {
             inst->setSquashed();
         }
         assert(!fixedbuffer[inst->threadNumber].full());
@@ -419,9 +419,10 @@ Decode::checkSquash()
             DPRINTF(Decode, "[tid:%i] Squashing instructions due to squash "
                     "from commit.\n", i);
             squash(i);
-            localSquashVer.update(fromCommit->commitInfo[i].squashVersion.getVersion());
+            localSquashVer[i].update(
+                fromCommit->commitInfo[i].squashVersion.getVersion());
             DPRINTF(Decode, "Updating squash version to %u\n",
-                    localSquashVer.getVersion());
+                    localSquashVer[i].getVersion());
         }
     }
 }
diff --git a/src/cpu/o3/decode.hh b/src/cpu/o3/decode.hh
index a510d8dd9d..0f55d838b4 100644
--- a/src/cpu/o3/decode.hh
+++ b/src/cpu/o3/decode.hh
@@ -293,7 +293,7 @@ class Decode
 
     void setAllStalls(StallReason decodeStall);
 
-    SquashVersion localSquashVer;
+    SquashVersion localSquashVer[MaxThreads];
 };
 
 } // namespace o3
diff --git a/src/cpu/o3/fetch.cc b/src/cpu/o3/fetch.cc
index 3c00c5937d..fba856f813 100644
--- a/src/cpu/o3/fetch.cc
+++ b/src/cpu/o3/fetch.cc
@@ -656,8 +656,8 @@ Fetch::processCacheCompletion(PacketPtr pkt)
     }
 
     // Verify fetchBufferPC alignment with the supplying FSQ entry.
-    if (threads[tid].valid && dbpbtb->ftqHasFetching(0)) {
-        const auto &stream = dbpbtb->ftqFetchingTarget(0);
+    if (threads[tid].valid && dbpbtb->ftqHasFetching(tid)) {
+        const auto &stream = dbpbtb->ftqFetchingTarget(tid);
         if (threads[tid].startPC != stream.startPC) {
             panic("fetchBufferPC %#x should be aligned with FSQ startPC %#x",
                   threads[tid].startPC, stream.startPC);
@@ -793,7 +793,7 @@ Fetch::lookupAndUpdateNextPC(const DynInstPtr &inst, PCStateBase &next_pc)
     // Decoupled+BTB-only: compute next PC directly from the supplying FSQ entry.
     ThreadID tid = inst->threadNumber;
     assert(dbpbtb);
-    assert(dbpbtb->ftqHasFetching(0));
+    assert(dbpbtb->ftqHasFetching(tid));
     const auto &stream = dbpbtb->ftqFetchingTarget(tid);
 
     const Addr curr_pc = next_pc.instAddr();
@@ -1002,7 +1002,7 @@ Fetch::handleTranslationFault(ThreadID tid, const RequestPtr &mem_req, const Fau
     // We will use a nop in order to carry the fault.
     DynInstPtr instruction = buildInst(tid, nopStaticInstPtr, nullptr,
             fetch_pc, fetch_pc, false);
-    instruction->setVersion(localSquashVer);
+    instruction->setVersion(localSquashVer[tid]);
     instruction->setNotAnInst();
 
     instruction->setPredTarg(fetch_pc);
@@ -1522,35 +1522,42 @@ Fetch::handleIEWSignals()
         return;
     }
 
-    auto &incoming = fromIEW->iewInfo->resolvedCFIs;
     const bool had_pending_resolve = !resolveQueue.empty();
-    uint8_t enqueueSize = fromIEW->iewInfo->resolvedCFIs.size();
     uint8_t enqueueCount = 0;
+    uint8_t enqueueSize = 0;
+
+    for (ThreadID tid = 0; tid < numThreads; ++tid) {
+        enqueueSize += fromIEW->iewInfo[tid].resolvedCFIs.size();
+    }
 
     if (resolveQueueSize && resolveQueue.size() > resolveQueueSize - 4) {
         fetchStats.resolveQueueFullEvents++;
         fetchStats.resolveEnqueueFailEvent += enqueueSize;
     } else {
+        for (ThreadID tid = 0; tid < numThreads; ++tid) {
+            auto &incoming = fromIEW->iewInfo[tid].resolvedCFIs;
+            for (const auto &resolved : incoming) {
+                bool merged = false;
+                for (auto &queued : resolveQueue) {
+                    if (queued.resolvedTid == tid &&
+                        queued.resolvedFTQId == resolved.ftqId) {
+                        queued.resolvedInstPC.push_back(resolved.pc);
+                        merged = true;
+                        break;
+                    }
+                }
 
-        for (const auto &resolved : incoming) {
-            bool merged = false;
-            for (auto &queued : resolveQueue) {
-                if (queued.resolvedFTQId == resolved.ftqId) {
-                    queued.resolvedInstPC.push_back(resolved.pc);
-                    merged = true;
-                    break;
+                if (merged) {
+                    continue;
                 }
-            }
 
-            if (merged) {
-                continue;
+                ResolveQueueEntry new_entry;
+                new_entry.resolvedTid = tid;
+                new_entry.resolvedFTQId = resolved.ftqId;
+                new_entry.resolvedInstPC.push_back(resolved.pc);
+                resolveQueue.push_back(std::move(new_entry));
+                enqueueCount++;
             }
-
-            ResolveQueueEntry new_entry;
-            new_entry.resolvedFTQId = resolved.ftqId;
-            new_entry.resolvedInstPC.push_back(resolved.pc);
-            resolveQueue.push_back(std::move(new_entry));
-            enqueueCount++;
         }
         fetchStats.resolveEnqueueCount.sample(enqueueCount);
     }
@@ -1562,12 +1569,13 @@ Fetch::handleIEWSignals()
     // and fetch consuming them as predictor resolved updates.
     if (had_pending_resolve && !resolveQueue.empty()) {
         auto &entry = resolveQueue.front();
+        ThreadID tid = entry.resolvedTid;
         unsigned int stream_id = entry.resolvedFTQId;
-        dbpbtb->prepareResolveUpdateEntries(stream_id, 0);
+        dbpbtb->prepareResolveUpdateEntries(stream_id, tid);
         for (const auto resolvedInstPC : entry.resolvedInstPC) {
-            dbpbtb->markCFIResolved(stream_id, resolvedInstPC, 0);
+            dbpbtb->markCFIResolved(stream_id, resolvedInstPC, tid);
         }
-        bool success = dbpbtb->resolveUpdate(stream_id, 0);
+        bool success = dbpbtb->resolveUpdate(stream_id, tid);
         if (success) {
             dbpbtb->notifyResolveSuccess();
             resolveQueue.pop_front();
@@ -1612,8 +1620,10 @@ Fetch::handleCommitSignals(ThreadID tid)
     squash(*fromCommit->commitInfo[tid].pc, squash_seq,
            squash_inst, tid);
 
-    localSquashVer.update(fromCommit->commitInfo[tid].squashVersion.getVersion());
-    DPRINTF(Fetch, "Updating squash version to %u\n", localSquashVer.getVersion());
+    localSquashVer[tid].update(
+        fromCommit->commitInfo[tid].squashVersion.getVersion());
+    DPRINTF(Fetch, "Updating squash version to %u\n",
+            localSquashVer[tid].getVersion());
 
     auto mispred_inst = fromCommit->commitInfo[tid].mispredictInst;
 
@@ -1924,7 +1934,7 @@ Fetch::processSingleInstruction(ThreadID tid, PCStateBase &pc,
                 tid, waitForVsetvl);
     }
 
-    instruction->setVersion(localSquashVer);
+    instruction->setVersion(localSquashVer[tid]);
     ppFetch->notify(instruction);
     numInst++;
 
diff --git a/src/cpu/o3/fetch.hh b/src/cpu/o3/fetch.hh
index 6e114487cf..1d8c3e88aa 100644
--- a/src/cpu/o3/fetch.hh
+++ b/src/cpu/o3/fetch.hh
@@ -1126,7 +1126,7 @@ class Fetch
         statistics::Scalar traceMetaCleanupCommitCalls;
     } fetchStats;
 
-    SquashVersion localSquashVer;
+    SquashVersion localSquashVer[MaxThreads];
 
 public:
     const FetchStatGroup &getFetchStats() { return fetchStats; }
diff --git a/src/cpu/o3/iew.cc b/src/cpu/o3/iew.cc
index d9c815b86c..7ea9c872ba 100644
--- a/src/cpu/o3/iew.cc
+++ b/src/cpu/o3/iew.cc
@@ -822,8 +822,10 @@ IEW::checkSquash()
     for (int i = 0; i < numThreads; i++) {
         if (fromCommit->commitInfo[i].squash) {
             squash(i);
-            localSquashVer.update(fromCommit->commitInfo[i].squashVersion.getVersion());
-            DPRINTF(IEW, "Updating squash version to %u\n", localSquashVer.getVersion());
+            localSquashVer[i].update(
+                fromCommit->commitInfo[i].squashVersion.getVersion());
+            DPRINTF(IEW, "Updating squash version to %u\n",
+                    localSquashVer[i].getVersion());
 
             fetchRedirect[i] = false;
             iewStats.stallEvents[ROBWalk]++;
@@ -854,7 +856,7 @@ IEW::moveInstsToBuffer()
     for (int i = 0; i < insts_from_rename; ++i) {
         const DynInstPtr &inst = fromRename->insts[i];
         assert(inst->threadNumber == tid);
-        if (localSquashVer.largerThan(inst->getVersion())) {
+        if (localSquashVer[tid].largerThan(inst->getVersion())) {
             inst->setSquashed();
         } else {
             fixedbuffer[tid].push_back(inst);
@@ -958,9 +960,9 @@ IEW::dispatchInsts()
 
         toRename->iewInfo[tid].robHeadStallReason = checkDispatchStall(tid, NumDQ, nullptr, -1);
         toRename->iewInfo[tid].lqHeadStallReason =
-            ldstQueue.lqEmpty() ? StallReason::NoStall : checkLSQStall(tid, true);
+            ldstQueue.lqEmpty(tid) ? StallReason::NoStall : checkLSQStall(tid, true);
         toRename->iewInfo[tid].sqHeadStallReason =
-            ldstQueue.sqEmpty() ? StallReason::NoStall : checkLSQStall(tid, false);
+            ldstQueue.sqEmpty(tid) ? StallReason::NoStall : checkLSQStall(tid, false);
         toRename->iewInfo[tid].blockReason = blockReason;
     }
 }
diff --git a/src/cpu/o3/iew.hh b/src/cpu/o3/iew.hh
index fc357dfb28..e23d0fb490 100644
--- a/src/cpu/o3/iew.hh
+++ b/src/cpu/o3/iew.hh
@@ -405,7 +405,7 @@ class IEW
     /** Scoreboard pointer. */
     Scoreboard* scoreboard;
 
-    SquashVersion localSquashVer{0};
+    SquashVersion localSquashVer[MaxThreads];
 
     /** Value predictor */
     valuepred::VPUnit *valuePred;
diff --git a/src/cpu/o3/inst_queue.cc b/src/cpu/o3/inst_queue.cc
index 29573959cf..72f99bbb8e 100644
--- a/src/cpu/o3/inst_queue.cc
+++ b/src/cpu/o3/inst_queue.cc
@@ -758,7 +758,7 @@ InstructionQueue::commit(const InstSeqNum &inst, ThreadID tid)
 {
     DPRINTF(IQ, "[tid:%i] Committing instructions older than [sn:%llu]\n",
             tid,inst);
-    scheduler->doCommit(inst);
+    scheduler->doCommit(inst, tid);
 }
 
 int
diff --git a/src/cpu/o3/issue_queue.cc b/src/cpu/o3/issue_queue.cc
index 50fa7a5eb5..f2d09e17de 100644
--- a/src/cpu/o3/issue_queue.cc
+++ b/src/cpu/o3/issue_queue.cc
@@ -834,11 +834,16 @@ IssueQue::insertNonSpec(const DynInstPtr& inst)
 }
 
 void
-IssueQue::doCommit(const InstSeqNum seqNum)
+IssueQue::doCommit(const InstSeqNum seqNum, ThreadID tid)
 {
-    while (!instList.empty() && instList.front()->seqNum <= seqNum) {
-        assert(instList.front()->isIssued());
-        instList.pop_front();
+    for (auto it = instList.begin(); it != instList.end();) {  // full scan
+        const auto &inst = *it;  // not used after the erase below
+        if (inst->threadNumber == tid && inst->seqNum <= seqNum) {
+            assert(inst->isIssued());  // entries being retired must be issued
+            it = instList.erase(it);  // continue from erase()'s result
+        } else {
+            ++it;  // other thread, or seqNum above the commit point: keep
+        }
     }
 }
 
@@ -1678,10 +1683,10 @@ Scheduler::isDrained()
 }
 
 void
-Scheduler::doCommit(const InstSeqNum seqNum)
+Scheduler::doCommit(const InstSeqNum seqNum, ThreadID tid)
 {
     for (auto it : issueQues) {
-        it->doCommit(seqNum);
+        it->doCommit(seqNum, tid);  // same commit point and tid to every IQ
     }
 }
 
diff --git a/src/cpu/o3/issue_queue.hh b/src/cpu/o3/issue_queue.hh
index b1ab4f361a..a91da979db 100644
--- a/src/cpu/o3/issue_queue.hh
+++ b/src/cpu/o3/issue_queue.hh
@@ -253,7 +253,7 @@ class IssueQue : public SimObject
     void retryMem(const DynInstPtr& inst);
     bool idle();
 
-    void doCommit(const InstSeqNum inst);
+    void doCommit(const InstSeqNum inst, ThreadID tid);
     void doSquash(SquashInfo squashInfo);
 
     int getIssueStages() { return scheduleToExecDelay; }
@@ -395,7 +395,7 @@ class Scheduler : public SimObject
     uint32_t getCorrectedOpLat(const DynInstPtr& inst);
     bool hasReadyInsts();
     bool isDrained();
-    void doCommit(const InstSeqNum seqNum);
+    void doCommit(const InstSeqNum seqNum, ThreadID tid);
     void doSquash(SquashInfo squashInfo);
     uint32_t getIQInsts();
     uint32_t getIQInsts(ThreadID tid);
diff --git a/src/cpu/o3/lsq.cc b/src/cpu/o3/lsq.cc
index e7685c6a0b..0f3b005a8f 100644
--- a/src/cpu/o3/lsq.cc
+++ b/src/cpu/o3/lsq.cc
@@ -1393,6 +1393,12 @@ LSQ::lqEmpty() const
     return true;
 }
 
+bool
+LSQ::lqEmpty(ThreadID tid) const  // per-thread form of lqEmpty()
+{
+    return thread[tid].lqEmpty();  // no explicit range check on tid here
+}
+
 bool
 LSQ::sqEmpty() const
 {
@@ -1409,6 +1415,12 @@ LSQ::sqEmpty() const
     return true;
 }
 
+bool
+LSQ::sqEmpty(ThreadID tid) const  // per-thread form of sqEmpty()
+{
+    return thread[tid].sqEmpty();  // no explicit range check on tid here
+}
+
 bool
 LSQ::lqFull()
 {
diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh
index e83794b6b3..504b4d4561 100644
--- a/src/cpu/o3/lsq.hh
+++ b/src/cpu/o3/lsq.hh
@@ -938,8 +938,12 @@ class LSQ
     bool isEmpty() const;
     /** Returns if all of the LQs are empty. */
     bool lqEmpty() const;
+    /** Returns if the LQ of a given thread is empty. */
+    bool lqEmpty(ThreadID tid) const;
     /** Returns if all of the SQs are empty. */
     bool sqEmpty() const;
+    /** Returns if the SQ of a given thread is empty. */
+    bool sqEmpty(ThreadID tid) const;
 
     /** Returns if any of the LQs are full. */
     bool lqFull();
diff --git a/src/cpu/o3/rename.cc b/src/cpu/o3/rename.cc
index 0cdc70f935..0b9a1b47a8 100644
--- a/src/cpu/o3/rename.cc
+++ b/src/cpu/o3/rename.cc
@@ -601,7 +601,7 @@ Rename::moveInstsToBuffer()
     for (int i = 0; i < insts_from_decode; ++i) {
         const DynInstPtr &inst = fromDecode->insts[i];
         assert(inst->threadNumber == tid);
-        if (localSquashVer.largerThan(inst->getVersion())) {
+        if (localSquashVer[tid].largerThan(inst->getVersion())) {
             inst->setSquashed();
         } else {
             assert(!fixedbuffer[tid].full());
@@ -626,9 +626,10 @@ Rename::checkSquash()
 
             squash(fromCommit->commitInfo[i].doneSeqNum, i);
 
-            localSquashVer.update(fromCommit->commitInfo[i].squashVersion.getVersion());
+            localSquashVer[i].update(
+                fromCommit->commitInfo[i].squashVersion.getVersion());
             DPRINTF(Rename, "Updating squash version to %u\n",
-                    localSquashVer.getVersion());
+                    localSquashVer[i].getVersion());
         }
     }
 }
diff --git a/src/cpu/o3/rename.hh b/src/cpu/o3/rename.hh
index 26769e0b5f..4e83cc0919 100644
--- a/src/cpu/o3/rename.hh
+++ b/src/cpu/o3/rename.hh
@@ -451,7 +451,7 @@ class Rename
 
     StallReason checkRenameStallFromIEW(ThreadID tid);
 
-    SquashVersion localSquashVer;
+    SquashVersion localSquashVer[MaxThreads];
 
     /** Value predictor */
     valuepred::VPUnit *valuePred;
diff --git a/src/cpu/pred/btb/decoupled_bpred.cc b/src/cpu/pred/btb/decoupled_bpred.cc
index a1fee43d87..8ed265af90 100644
--- a/src/cpu/pred/btb/decoupled_bpred.cc
+++ b/src/cpu/pred/btb/decoupled_bpred.cc
@@ -7,6 +7,7 @@
 #include "cpu/o3/cpu.hh"
 #include "cpu/o3/dyn_inst.hh"
 #include "cpu/pred/btb/folded_hist.hh"
+#include "cpu/thread_context.hh"
 #include "debug/BTB.hh"
 #include "debug/DecoupleBPHist.hh"
 #include "debug/DecoupleBPVerbose.hh"
@@ -45,7 +46,7 @@ DecoupledBPUWithBTB::DecoupledBPUWithBTB(const DecoupledBPUWithBTBParams &p)
       // uras(p.uras),
       bpDBSwitches(p.bpDBSwitches),
       numStages(p.numStages),
-      ftq(2, p.ftq_size),
+      ftq(p.numThreads, p.ftq_size),
       historyManager(16), // TODO: fix this
       resolveBlockThreshold(p.resolveBlockThreshold),
       dbpBtbStats(this, p.numStages, p.fsq_size, maxInstsNum)
@@ -115,6 +116,26 @@ DecoupledBPUWithBTB::DecoupledBPUWithBTB(const DecoupledBPUWithBTBParams &p)
     });
 }
 
+ThreadID
+DecoupledBPUWithBTB::scheduleThread()  // pick the thread to predict for
+{
+    for (ThreadID offset = 0; offset < numThreads; ++offset) {  // one lap
+        const ThreadID tid = (nextPredictTid + offset) % numThreads;
+
+        if (cpu) {  // status filter is applied only when cpu is non-null
+            auto *tc = cpu->getContext(tid);
+            if (!tc || tc->status() != gem5::ThreadContext::Active) {
+                continue;  // no context, or context not Active: try next
+            }
+        }
+
+        nextPredictTid = (tid + 1) % numThreads;  // next call starts after tid
+        return tid;
+    }
+
+    return InvalidThreadID;  // every candidate was skipped
+}
+
 
 void
 DecoupledBPUWithBTB::tick()
@@ -122,6 +143,9 @@ DecoupledBPUWithBTB::tick()
     DPRINTF(Override, "DecoupledBPUWithBTB::tick()\n");
 
     ThreadID curTid = scheduleThread();
+    if (curTid == InvalidThreadID) {
+        return;
+    }
 
     // On squash, reset state if there was a valid prediction.
     bool squashOccurred = false;
@@ -428,8 +452,13 @@ DecoupledBPUWithBTB::handleSquash(ThreadID tid, unsigned target_id,
 
     // Find the target being squashed
     if (!ftq.hasTarget(target_id, tid)) {
-        assert(!ftq.empty(tid));
-        DPRINTF(DecoupleBP, "The squashing target is insane, ignore squash on it");
+        DPRINTF(DecoupleBP,
+                "Ignore squash for tid %u on missing FTQ target %u; "
+                "recovering predictor state from redirect PC %#lx\n",
+                tid, target_id, redirect_pc);
+        clearPreds(tid);
+        threads[tid].validprediction = false;
+        threads[tid].s0PC = redirect_pc;
         return;
     }
 
diff --git a/src/cpu/pred/btb/decoupled_bpred.hh b/src/cpu/pred/btb/decoupled_bpred.hh
index 288450001f..134258f77c 100644
--- a/src/cpu/pred/btb/decoupled_bpred.hh
+++ b/src/cpu/pred/btb/decoupled_bpred.hh
@@ -75,8 +75,7 @@ class DecoupledBPUWithBTB : public BPredUnit
     // FetchTargetId fetchHeadFtqId{1}; // next FSQ id to be consumed by fetch
 
     CPU *cpu;
-
-    const int numThreads = 2;
+    ThreadID nextPredictTid = 0;
     unsigned predictWidth;  // max predict width, default 64
     unsigned maxInstsNum;
 
@@ -145,7 +144,7 @@ class DecoupledBPUWithBTB : public BPredUnit
     unsigned resolveDequeueFailCounter{0};
     const unsigned resolveBlockThreshold;
 
-    ThreadID scheduleThread() { return 0; }
+    ThreadID scheduleThread();
 
     void processNewPrediction(ThreadID tid);
 

From c96339acd38d48ac851c6ee800bec9d1c32b5807 Mon Sep 17 00:00:00 2001
From: tastynoob <934348725@qq.com>
Date: Wed, 18 Mar 2026 14:28:24 +0800
Subject: [PATCH 03/38] cpu-o3: support shared-address-space fs smt

---
 .gitignore                          |   8 +-
 configs/common/xiangshan.py         |  12 +-
 src/cpu/base.cc                     | 123 +++++++++++-------
 src/cpu/base.hh                     |  17 +--
 src/cpu/difftest.cc                 |  23 ++--
 src/cpu/difftest.hh                 |   7 ++
 src/cpu/o3/commit.cc                |   3 +-
 src/cpu/o3/cpu.cc                   |   9 +-
 src/cpu/o3/cpu.hh                   |   2 +-
 src/cpu/o3/decode.cc                |  83 +++++++++---
 src/cpu/o3/decode.hh                |   3 +
 src/cpu/o3/fetch.cc                 |  90 +++++++------
 src/cpu/o3/fetch.hh                 |   8 +-
 src/cpu/o3/inst_queue.cc            |   1 +
 src/cpu/o3/lsq.hh                   |   1 -
 src/cpu/o3/lsq_unit.cc              |   3 +-
 src/cpu/o3/rename.cc                |  32 +++--
 src/cpu/o3/rename.hh                |   4 +-
 src/cpu/pred/btb/abtb.cc            |   3 +-
 src/cpu/pred/btb/abtb.hh            |   2 +-
 src/cpu/pred/btb/btb_ittage.cc      | 107 ++++++++++++----
 src/cpu/pred/btb/btb_ittage.hh      |  28 +++--
 src/cpu/pred/btb/btb_mgsc.cc        | 187 ++++++++++++++++++++--------
 src/cpu/pred/btb/btb_mgsc.hh        |  39 ++++--
 src/cpu/pred/btb/btb_tage.cc        | 117 +++++++++++------
 src/cpu/pred/btb/btb_tage.hh        |  37 +++---
 src/cpu/pred/btb/btb_tage_ub.cc     |   3 +-
 src/cpu/pred/btb/btb_tage_ub.hh     |   2 +-
 src/cpu/pred/btb/btb_ubtb.hh        |   2 +-
 src/cpu/pred/btb/decoupled_bpred.cc |  69 ++++++----
 src/cpu/pred/btb/decoupled_bpred.hh |  13 +-
 src/cpu/pred/btb/mbtb.cc            |   3 +-
 src/cpu/pred/btb/mbtb.hh            |   2 +-
 src/cpu/pred/btb/microtage.cc       | 154 ++++++++++++++++-------
 src/cpu/pred/btb/microtage.hh       |  37 +++---
 src/cpu/pred/btb/ras.cc             |   3 +-
 src/cpu/pred/btb/ras.hh             |   2 +-
 src/cpu/pred/btb/timed_base_pred.hh |   5 +-
 src/cpu/pred/btb/uras.cc            |   3 +-
 src/cpu/pred/btb/uras.hh            |   4 +-
 src/cpu/simple/base.cc              |   7 +-
 src/cpu/simple/base.hh              |   2 +-
 src/sim/system.cc                   |   4 +-
 src/sim/system.hh                   |   5 +
 44 files changed, 860 insertions(+), 409 deletions(-)

diff --git a/.gitignore b/.gitignore
index 0b3dca3746..6a03374ea1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -70,4 +70,10 @@ package.json
 
 microbench/build/
 microbench/output/
-microbench/dramsim3*
\ No newline at end of file
+microbench/dramsim3*
+
+*.bin
+*.db
+*.log
+*.gz
+*.zstd
\ No newline at end of file
diff --git a/configs/common/xiangshan.py b/configs/common/xiangshan.py
index 3e2c0fa4a2..368f6cd884 100644
--- a/configs/common/xiangshan.py
+++ b/configs/common/xiangshan.py
@@ -290,7 +290,7 @@ def resolve_xiangshan_ref_so(args: argparse.Namespace):
     if args.difftest_ref_so is not None:
         ref_so = args.difftest_ref_so
         print("Obtained ref_so from args.difftest_ref_so: ", ref_so)
-    elif args.num_cpus > 1 and "GCBV_MULTI_CORE_REF_SO" in os.environ:
+    elif (args.num_cpus > 1 or args.smt) and "GCBV_MULTI_CORE_REF_SO" in os.environ:
         ref_so = os.environ["GCBV_MULTI_CORE_REF_SO"]
         print("Obtained ref_so from GCBV_MULTI_CORE_REF_SO: ", ref_so)
     elif "GCBV_REF_SO" in os.environ:
@@ -330,12 +330,12 @@ def config_xiangshan_inputs(args: argparse.Namespace, sys):
         if args.raw_cpt:
             # If using raw binary, no restorer is needed.
             gcpt_restorer = None
-        elif args.num_cpus > 1:
+        elif args.num_cpus > 1 or args.smt:
             if "GCB_MULTI_CORE_RESTORER" in os.environ:
                 gcpt_restorer = os.environ["GCB_MULTI_CORE_RESTORER"]
                 print("Obtained gcpt_restorer from GCB_MULTI_CORE_RESTORER: ", gcpt_restorer)
             else:
-                fatal("Plz set $GCB_MULTI_CORE_RESTORER when model Xiangshan with multi-core")
+                fatal("Plz set $GCB_MULTI_CORE_RESTORER when model Xiangshan with multi-context difftest")
         elif args.restore_rvv_cpt:
             if "GCBV_RESTORER" in os.environ:
                 gcpt_restorer = os.environ["GCBV_RESTORER"]
@@ -355,8 +355,8 @@ def config_xiangshan_inputs(args: argparse.Namespace, sys):
         print("Obtained gcpt_restorer from args.gcpt_restorer: ", args.gcpt_restorer)
         gcpt_restorer = args.gcpt_restorer
 
-    if args.num_cpus > 1:
-        print("Simulating a multi-core system, demanding a larger GCPT restorer size (2M).")
+    if args.num_cpus > 1 or args.smt:
+        print("Simulating a multi-context system, demanding a larger GCPT restorer size (2M).")
         sys.gcpt_restorer_size_limit = 2**20
     elif args.restore_rvv_cpt:
         print("Simulating single core with RVV, demanding GCPT restorer size of 0x1000.")
@@ -403,7 +403,7 @@ def config_difftest(cpu_list, args, sys):
     if not args.enable_difftest:
         return
     else:
-        if len(cpu_list) > 1:
+        if len(cpu_list) > 1 or args.smt:
             sys.enable_mem_dedup = True
             for cpu in cpu_list:
                 cpu.enable_mem_dedup = True
diff --git a/src/cpu/base.cc b/src/cpu/base.cc
index 63c0e7964a..83a2a27686 100644
--- a/src/cpu/base.cc
+++ b/src/cpu/base.cc
@@ -43,6 +43,7 @@
 
 #include "cpu/base.hh"
 
+#include <algorithm>
 #include <iostream>
 #include <sstream>
 #include <string>
@@ -208,40 +209,50 @@ BaseCPU::BaseCPU(const Params &p, bool is_checker)
               "of threads (%i).\n", params().isa.size(), numThreads);
     }
 
-    diffAllStates = std::make_shared<DiffAllStates>();
+    diffAllStates.resize(numThreads);
     if (enableDifftest) {
         assert(params().difftest_ref_so.length() > 2);
-        diffAllStates->diff.nemu_reg = &(diffAllStates->referenceRegFile);
-        diffAllStates->diff.nemu_this_pc = 0x80000000u;
-        diffAllStates->diff.cpu_id = params().cpu_id;
-        warn("cpu_id set to %d\n", params().cpu_id);
-
-        if (params().difftest_ref_so.find("spike") != std::string::npos) {
-            assert(!system->multiCore());
-            diffAllStates->proxy = new SpikeProxy(
-                params().cpu_id, params().difftest_ref_so.c_str(),
-                params().nemuSDimg.size() && params().nemuSDCptBin.size());
-        } else {
-            diffAllStates->proxy =
-                new NemuProxy(params().cpu_id, params().difftest_ref_so.c_str(),
-                              params().nemuSDimg.size() && params().nemuSDCptBin.size(), system->enabledMemDedup(),
-                              system->multiCore());
-        }
+        for (ThreadID tid = 0; tid < numThreads; ++tid) {
+            diffAllStates[tid] = std::make_shared<DiffAllStates>();
+            auto diff_state = diffAllStates[tid];
+            diff_state->diff.nemu_reg = &(diff_state->referenceRegFile);
+            diff_state->diff.nemu_this_pc = 0x80000000u;
+            diff_state->diff.cpu_id = difftestHartId(tid);
+            warn("difftest hart id set to %d for tid %d\n",
+                 diff_state->diff.cpu_id, tid);
+
+            if (params().difftest_ref_so.find("spike") != std::string::npos) {
+                assert(!system->multiContextDifftest());
+                diff_state->proxy = new SpikeProxy(
+                    params().cpu_id, params().difftest_ref_so.c_str(),
+                    params().nemuSDimg.size() && params().nemuSDCptBin.size());
+            } else {
+                diff_state->proxy =
+                    new NemuProxy(params().cpu_id, params().difftest_ref_so.c_str(),
+                                  params().nemuSDimg.size() && params().nemuSDCptBin.size(),
+                                  system->enabledMemDedup(),
+                                  system->multiContextDifftest());
+            }
 
-        warn("Difftest is enabled with ref so: %s.\n", params().difftest_ref_so.c_str());
+            warn("Difftest is enabled with ref so: %s.\n",
+                 params().difftest_ref_so.c_str());
 
-        diffAllStates->proxy->regcpy(&(diffAllStates->gem5RegFile), REF_TO_DUT);
-        diffAllStates->diff.dynamic_config.ignore_illegal_mem_access = false;
-        diffAllStates->diff.dynamic_config.debug_difftest = false;
-        diffAllStates->proxy->update_config(&diffAllStates->diff.dynamic_config);
-        if (params().nemuSDimg.size() && params().nemuSDCptBin.size()) {
-            diffAllStates->proxy->sdcard_init(params().nemuSDimg.c_str(),
-                               params().nemuSDCptBin.c_str());
+            diff_state->proxy->regcpy(&(diff_state->gem5RegFile), REF_TO_DUT);
+            diff_state->diff.dynamic_config.ignore_illegal_mem_access = false;
+            diff_state->diff.dynamic_config.debug_difftest = false;
+            diff_state->proxy->update_config(&diff_state->diff.dynamic_config);
+            if (params().nemuSDimg.size() && params().nemuSDCptBin.size()) {
+                diff_state->proxy->sdcard_init(params().nemuSDimg.c_str(),
+                                   params().nemuSDCptBin.c_str());
+            }
+            diff_state->diff.will_handle_intr = false;
         }
-        diffAllStates->diff.will_handle_intr = false;
     } else {
         warn("Difftest is disabled\n");
-        diffAllStates->hasCommit = true;
+        for (ThreadID tid = 0; tid < numThreads; ++tid) {
+            diffAllStates[tid] = std::make_shared<DiffAllStates>();
+            diffAllStates[tid]->hasCommit = true;
+        }
     }
 
     if (dumpCommitFlag) {
@@ -404,11 +415,14 @@ BaseCPU::startup()
     if (powerState->get() == enums::PwrState::UNDEFINED)
         powerState->set(enums::PwrState::ON);
 
-    if (system->multiCore()) {
+    if (system->multiContextDifftest()) {
         goldenMemPtr = system->getGoldenMemPtr();
         _goldenMemManager = system->getGoldenMemManager();
 
-        diffAllStates->proxy->initState(params().cpu_id, goldenMemPtr);
+        for (ThreadID tid = 0; tid < numThreads; ++tid) {
+            diffAllStates[tid]->proxy->initState(difftestHartId(tid),
+                                                 goldenMemPtr);
+        }
     } else {
         goldenMemPtr = nullptr;
         _goldenMemManager = nullptr;
@@ -702,7 +716,7 @@ BaseCPU::takeOverFrom(BaseCPU *oldCPU)
     if (enable_diff) {
         warn("Take over difftest state to new CPU\n");
         enableDifftest = enable_diff;
-        takeOverDiffAllStates(diff_all);
+        takeOverDiffAllStates(std::move(diff_all));
     }
 }
 
@@ -865,6 +879,12 @@ BaseCPU::GlobalStats::GlobalStats(statistics::Group *parent)
     hostOpRate = simOps / hostSeconds;
 }
 
+int
+BaseCPU::difftestHartId(ThreadID tid) const  // id used for tid's difftest
+{
+    return params().cpu_id * numThreads + tid;  // cpu-major, thread-minor
+}
+
 void
 BaseCPU::csrDiffMessage(uint64_t gem5_val, uint64_t ref_val, int error_num, uint64_t &error_reg, InstSeqNum seq,
                         std::string error_csr_name, int &diff_at)
@@ -883,6 +903,8 @@ BaseCPU::csrDiffMessage(uint64_t gem5_val, uint64_t ref_val, int error_num, uint
 std::pair<int, bool>
 BaseCPU::diffWithNEMU(ThreadID tid, InstSeqNum seq)
 {
+    auto diffAllStates = this->diffAllStates[tid];
+
     int diff_at = DiffAt::NoneDiff;
     bool npc_match = false;
     bool is_mmio = diffInfo.curInstStrictOrdered;
@@ -966,7 +988,7 @@ BaseCPU::diffWithNEMU(ThreadID tid, InstSeqNum seq)
 
     if (enableRVV) {
         if (diffInfo.inst->isVector()) {
-            readGem5Regs();
+            readGem5Regs(tid);
             uint64_t* nemu_val = (uint64_t*)&(diffAllStates->referenceRegFile.vr[0]);
             uint64_t* gem5_val = (uint64_t*)&(diffAllStates->gem5RegFile.vr[0]);
             bool maybe_error = false;
@@ -1431,7 +1453,8 @@ BaseCPU::diffWithNEMU(ThreadID tid, InstSeqNum seq)
                                         diffInfo.physEffAddr, diffInfo.effSize);
                 }
 
-                if (system->multiCore() && (diffInfo.inst->isLoad() || diffInfo.inst->isAtomic()) &&
+                if (system->multiContextDifftest() &&
+                    (diffInfo.inst->isLoad() || diffInfo.inst->isAtomic()) &&
                     _goldenMemManager->inPmem(diffInfo.physEffAddr)) {
                     warn("Difference on %s instr found in multicore mode, check in golden memory\n",
                          diffInfo.inst->isLoad() ? "load" : "amo");
@@ -1517,9 +1540,10 @@ BaseCPU::clearDiffMismatch(ThreadID tid, InstSeqNum seq) {
 void
 BaseCPU::reportDiffMismatch(ThreadID tid, InstSeqNum seq)
 {
+    auto diffAllStates = this->diffAllStates[tid];
     warn("%s", diffMsg.str());
     diffAllStates->proxy->isa_reg_display();
-    displayGem5Regs();
+    displayGem5Regs(tid);
     warn("start dump last %lu committed msg\n", diffInfo.lastCommittedMsg.size());
     while (diffInfo.lastCommittedMsg.size()) {
         auto &inst = diffInfo.lastCommittedMsg.front();
@@ -1531,6 +1555,8 @@ BaseCPU::reportDiffMismatch(ThreadID tid, InstSeqNum seq)
 void
 BaseCPU::difftestStep(ThreadID tid, InstSeqNum seq)
 {
+    auto diffAllStates = this->diffAllStates[tid];
+
     bool should_diff = false;
     DPRINTF(DumpCommit, "[sn:%llu] %#lx, %s\n",
             seq, diffInfo.pc->instAddr(), diffInfo.inst->disassemble(diffInfo.pc->instAddr()));
@@ -1550,10 +1576,10 @@ BaseCPU::difftestStep(ThreadID tid, InstSeqNum seq)
         should_diff = true;
         if (!diffAllStates->hasCommit && diffInfo.pc->instAddr() == 0x80000000u) {
             diffAllStates->hasCommit = true;
-            readGem5Regs();
+            readGem5Regs(tid);
             diffAllStates->gem5RegFile.pc = diffInfo.pc->instAddr();
             if (noHypeMode) {
-                auto start = pmemStart + pmemSize * diffAllStates->diff.cpu_id;
+                auto start = pmemStart + pmemSize * difftestHartId(tid);
                 warn("Start memcpy to NEMU from %#lx, size=%lu\n", (uint64_t)start, pmemSize);
                 diffAllStates->proxy->memcpy(0x80000000u, start, pmemSize, DUT_TO_REF);
             } else if (enableMemDedup) {
@@ -1603,9 +1629,10 @@ BaseCPU::difftestStep(ThreadID tid, InstSeqNum seq)
 }
 
 void
-BaseCPU::displayGem5Regs()
+BaseCPU::displayGem5Regs(ThreadID tid)
 {
-    readGem5Regs();
+    auto diffAllStates = this->diffAllStates[tid];
+    readGem5Regs(tid);
     std::string str;
     //reg
     for (size_t i = 0; i < 32; i++)
@@ -1712,8 +1739,9 @@ BaseCPU::displayGem5Regs()
 }
 
 void
-BaseCPU::difftestRaiseIntr(uint64_t no)
+BaseCPU::difftestRaiseIntr(uint64_t no, ThreadID tid)  // raise interrupt `no` on tid's proxy
 {
+    auto diffAllStates = this->diffAllStates[tid];  // tid's state; shadows the member vector
     diffAllStates->diff.will_handle_intr = true;
     diffAllStates->proxy->raise_intr(no);
 }
@@ -1721,19 +1749,24 @@ BaseCPU::difftestRaiseIntr(uint64_t no)
 void
 BaseCPU::clearGuideExecInfo()
 {
-    diffAllStates->diff.guide.force_raise_exception = false;
-    diffAllStates->diff.guide.force_set_jump_target = false;
+    for (auto &diffAllStates : this->diffAllStates) {  // every per-thread difftest state
+        diffAllStates->diff.guide.force_raise_exception = false;  // drop forced exception
+        diffAllStates->diff.guide.force_set_jump_target = false;  // drop forced jump target
+    }  // NOTE(review): assumes no slot holds a null pointer - confirm where the vector is filled
 }
 
 void
 BaseCPU::enableDiffPrint()
 {
-    diffAllStates->diff.dynamic_config.debug_difftest = true;
-    diffAllStates->proxy->update_config(&diffAllStates->diff.dynamic_config);
+    for (auto &diffAllStates : this->diffAllStates) {  // every per-thread difftest state
+        diffAllStates->diff.dynamic_config.debug_difftest = true;  // set the debug flag
+        diffAllStates->proxy->update_config(&diffAllStates->diff.dynamic_config);  // push to proxy
+    }
 }
 
-void BaseCPU::setSCSuccess(bool success, paddr_t addr)
+void BaseCPU::setSCSuccess(bool success, paddr_t addr, ThreadID tid)  // record SC result for tid
 {
+    auto diffAllStates = this->diffAllStates[tid];  // tid's state; index is not bounds-checked
     diffAllStates->diff.sync.lrscValid = success;
     diffAllStates->diff.sync.lrscAddr = addr; // used for spike diff
 }
@@ -1742,6 +1775,8 @@ void
 BaseCPU::setExceptionGuideExecInfo(uint64_t exception_num, uint64_t mtval, uint64_t stval, bool force_set_jump_target,
                                    uint64_t jump_target, ThreadID tid)
 {
+    auto diffAllStates = this->diffAllStates[tid];
+
     auto &gd = diffAllStates->diff.guide;
     gd.force_raise_exception = true;
     gd.exception_num = exception_num;
@@ -1769,7 +1804,7 @@ BaseCPU::setExceptionGuideExecInfo(uint64_t exception_num, uint64_t mtval, uint6
 void
 BaseCPU::checkL1DRefill(Addr paddr, const uint8_t* refill_data, size_t size) {
     assert(size == 64);
-    if (system->multiCore()) {
+    if (system->multiContextDifftest()) {
         uint8_t *golden_ptr = (uint8_t *)_goldenMemManager->guestToHost(paddr);
         if (memcmp(golden_ptr, refill_data, size)) {
             panic("Refill data diff with Golden addr %#lx with size %d\n", paddr, size);
diff --git a/src/cpu/base.hh b/src/cpu/base.hh
index 8fe6d55d61..3d3e8e5a85 100644
--- a/src/cpu/base.hh
+++ b/src/cpu/base.hh
@@ -693,7 +693,7 @@ class BaseCPU : public ClockedObject
     bool enableRVV{false};
     bool enableRVHDIFF{false};
     bool enableSkipCSR{false};
-    std::shared_ptr<DiffAllStates> diffAllStates{};
+    std::vector<std::shared_ptr<DiffAllStates>> diffAllStates{};
 
     enum  diffRegConfig
     {
@@ -701,7 +701,7 @@ class BaseCPU : public ClockedObject
       diffCsrNum = 36,
     };
 
-    virtual void readGem5Regs()
+    virtual void readGem5Regs(ThreadID tid)  // base stub: panics unless a CPU model overrides it
     {
         panic("difftest:readGem5Regs() is not implemented\n");
     }
@@ -709,6 +709,7 @@ class BaseCPU : public ClockedObject
     void csrDiffMessage(uint64_t gem5_val, uint64_t ref_val, int error_num, uint64_t &error_reg, InstSeqNum seq,
                         std::string error_csr_name,int &diff_at);
     std::pair<int, bool> diffWithNEMU(ThreadID tid, InstSeqNum seq);
+    int difftestHartId(ThreadID tid) const;
 
     std::stringstream diffMsg;
     void reportDiffMismatch(ThreadID tid, InstSeqNum seq);
@@ -779,11 +780,11 @@ class BaseCPU : public ClockedObject
 
     inline bool difftestEnabled() const { return enableDifftest; }
 
-    void displayGem5Regs();
+    void displayGem5Regs(ThreadID tid);
 
-    void difftestRaiseIntr(uint64_t no);
+    void difftestRaiseIntr(uint64_t no, ThreadID tid = 0);
 
-    void setSCSuccess(bool success, paddr_t addr);
+    void setSCSuccess(bool success, paddr_t addr, ThreadID tid);
 
     void setExceptionGuideExecInfo(uint64_t exception_num, uint64_t mtval, uint64_t stval,
                                    // force set jump target
@@ -793,14 +794,14 @@ class BaseCPU : public ClockedObject
 
     void enableDiffPrint();
 
-    std::pair<bool, std::shared_ptr<DiffAllStates>> getDiffAllStates()
+    std::pair<bool, std::vector<std::shared_ptr<DiffAllStates>>> getDiffAllStates()  // {enableDifftest, copy of vector}
     {
         return std::make_pair(enableDifftest, diffAllStates);
     }
 
-    void takeOverDiffAllStates(std::shared_ptr<DiffAllStates> diffAllStates)
+    void takeOverDiffAllStates(std::vector<std::shared_ptr<DiffAllStates>> diffAllStates)  // replaces this CPU's states
     {
-        this->diffAllStates = diffAllStates;
+        this->diffAllStates = std::move(diffAllStates);  // by-value parameter is moved into the member
     }
 
     int committedInstNum = 0;
diff --git a/src/cpu/difftest.cc b/src/cpu/difftest.cc
index 7293e51b9a..63665f194b 100644
--- a/src/cpu/difftest.cc
+++ b/src/cpu/difftest.cc
@@ -149,6 +149,12 @@ NemuProxy::NemuProxy(int coreid, const char *ref_so, bool enable_sdcard_diff, bo
 #endif
 
     multiCore = multi_core;
+    if (multiCore) {
+        nemuSetHartId = (void (*)(int))dlsym(handle, "difftest_set_mhartid");
+        assert(nemuSetHartId);
+        nemuPutGmaddr = (void (*)(uint8_t *))dlsym(handle, "difftest_put_gmaddr");
+        assert(nemuPutGmaddr);
+    }
 
     if (enable_sdcard_diff) {
         sdcard_init = (void (*)(const char *, const char *))dlsym(
@@ -168,15 +174,18 @@ void
 NemuProxy::initState(int coreid, uint8_t *golden_mem)
 {
     if (multiCore) {
-        auto nemu_difftest_set_mhartid = (void (*)(int))dlsym(handle, "difftest_set_mhartid");
         warn("Setting mhartid to %d\n", coreid);
-        assert(nemu_difftest_set_mhartid);
-        nemu_difftest_set_mhartid(coreid);
-
-        auto nemu_difftest_put_gmaddr = (void (*)(uint8_t *ptr))dlsym(handle, "difftest_put_gmaddr");
+        setHartId(coreid);
         warn("Setting gmaddr to %#lx\n", (uint64_t) golden_mem);
-        assert(nemu_difftest_put_gmaddr);
-        nemu_difftest_put_gmaddr(golden_mem);
+        nemuPutGmaddr(golden_mem);
+    }
+}
+
+void
+NemuProxy::setHartId(int coreid)  // forwards coreid to nemuSetHartId; does nothing unless multiCore
+{
+    if (multiCore) {  // the constructor resolves nemuSetHartId only when multiCore is set
+        nemuSetHartId(coreid);  // function pointer bound to "difftest_set_mhartid"
     }
 }
 
diff --git a/src/cpu/difftest.hh b/src/cpu/difftest.hh
index af4eee4d96..7d91201b4f 100644
--- a/src/cpu/difftest.hh
+++ b/src/cpu/difftest.hh
@@ -195,6 +195,7 @@ class RefProxy
     void (*sdcard_init)(const char *img_path,
                         const char *sd_cpt_bin_path) = nullptr;
     virtual void initState(int coreid, uint8_t *golden_mem) = 0;
+    virtual void setHartId(int coreid) = 0;
 
   protected:
     bool multiCore;
@@ -208,6 +209,11 @@ class NemuProxy : public RefProxy
     NemuProxy(int coreid, const char *ref_so, bool enable_sdcard_diff, bool enable_mem_dedup, bool multi_core);
 
     void initState(int coreid, uint8_t *golden_mem) override;
+    void setHartId(int coreid) override;
+
+  private:
+    void (*nemuSetHartId)(int) = nullptr;
+    void (*nemuPutGmaddr)(uint8_t *) = nullptr;
 };
 
 
@@ -217,6 +223,7 @@ class SpikeProxy : public RefProxy
     SpikeProxy(int coreid, const char *ref_so, bool enable_sdcard_diff);
 
     void initState(int coreid, uint8_t *golden_mem) override { panic("Not implemented\n"); }
+    void setHartId(int coreid) override { panic("Not implemented\n"); }
 };
 
 #define DIFFTEST_WIDTH 8
diff --git a/src/cpu/o3/commit.cc b/src/cpu/o3/commit.cc
index f0040c0d4f..ad42b0c7fe 100644
--- a/src/cpu/o3/commit.cc
+++ b/src/cpu/o3/commit.cc
@@ -1843,7 +1843,8 @@ Commit::commitHead(const DynInstPtr &head_inst, unsigned inst_num)
     if (head_inst->isStoreConditional()) {
         DPRINTF(Commit, "[tid:%i] [sn:%llu] Store Conditional success: %i\n", tid, head_inst->seqNum,
                 head_inst->lockedWriteSuccess());
-        cpu->setSCSuccess(head_inst->lockedWriteSuccess(), head_inst->physEffAddr);
+        cpu->setSCSuccess(head_inst->lockedWriteSuccess(),
+                          head_inst->physEffAddr, tid);
     }
 
     // Update the commit rename map
diff --git a/src/cpu/o3/cpu.cc b/src/cpu/o3/cpu.cc
index 162a3ad341..5961aed7b1 100644
--- a/src/cpu/o3/cpu.cc
+++ b/src/cpu/o3/cpu.cc
@@ -1735,12 +1735,13 @@ CPU::htmSendAbortSignal(ThreadID tid, uint64_t htm_uid,
 }
 
 void
-CPU::readGem5Regs()
+CPU::readGem5Regs(ThreadID tid)  // copy tid's arch int/float/vector registers into gem5RegFile
 {
+    auto diffAllStates = this->diffAllStates[tid];  // tid's state; shadows the member vector
     for (int i = 0; i < 32; i++) {
-        diffAllStates->gem5RegFile[i] = readArchIntReg(i, 0);
-        diffAllStates->gem5RegFile[i + 32] = readArchFloatReg(i, 0);
-        readArchVecReg(i, (uint64_t*)&diffAllStates->gem5RegFile.vr[i], 0);
+        diffAllStates->gem5RegFile[i] = readArchIntReg(i, tid);  // int reg i -> slot i
+        diffAllStates->gem5RegFile[i + 32] = readArchFloatReg(i, tid);  // float reg i -> slot i + 32
+        readArchVecReg(i, (uint64_t*)&diffAllStates->gem5RegFile.vr[i], tid);  // vector reg i -> vr[i]
     }
 }
 
diff --git a/src/cpu/o3/cpu.hh b/src/cpu/o3/cpu.hh
index fae5eea4d4..8ca2b276e3 100644
--- a/src/cpu/o3/cpu.hh
+++ b/src/cpu/o3/cpu.hh
@@ -740,7 +740,7 @@ class CPU : public BaseCPU
                             HtmFailureFaultCause cause) override;
 
     //difftest virtual function
-    void readGem5Regs() override;
+    void readGem5Regs(ThreadID tid) override;
 
   private:
     /** Value predictor */
diff --git a/src/cpu/o3/decode.cc b/src/cpu/o3/decode.cc
index d76d0fbc60..93ede3d673 100644
--- a/src/cpu/o3/decode.cc
+++ b/src/cpu/o3/decode.cc
@@ -72,6 +72,7 @@ Decode::Decode(CPU *_cpu, const BaseO3CPUParams &params)
       iewToDecodeDelay(params.iewToDecodeDelay),
       commitToDecodeDelay(params.commitToDecodeDelay),
       fetchToDecodeDelay(params.fetchToDecodeDelay),
+      decodeToFetchDelay(params.decodeToFetchDelay),
       decodeWidth(params.decodeWidth),
       numThreads(params.numThreads),
       enableLoadFusion(params.enable_loadFusion),
@@ -86,8 +87,15 @@ Decode::Decode(CPU *_cpu, const BaseO3CPUParams &params)
     for (int i=0;i<numThreads;i++) {
         fixedbuffer[i] = boost::circular_buffer<DynInstPtr>(decodeWidth);
     }
-    stallBuffer = boost::circular_buffer<DynInstPtr>(decodeWidth * (fetchToDecodeDelay + 1));
-    eachstallSize = boost::circular_buffer<int>(fetchToDecodeDelay + 1);
+    // This buffer preserves the fetch->decode pipeline contents when decode
+    // stalls while TimeBuffer keeps advancing. Its depth matches the original
+    // forward pipeline window; fetch is backpressured before full to absorb
+    // both the decode->fetch feedback delay and the request already issued in
+    // the current cycle before decode computes backpressure.
+    const auto stallGroupDepth = fetchToDecodeDelay + 1;
+    stallBuffer = boost::circular_buffer<DynInstPtr>(
+        decodeWidth * stallGroupDepth);
+    eachstallSize = boost::circular_buffer<int>(stallGroupDepth);
 
 
     decodeStalls.resize(decodeWidth, StallReason::NoStall);
@@ -373,6 +381,38 @@ Decode::updateActivate()
 void
 Decode::moveInstsToBuffer()
 {
+    auto tryMoveHeadGroupToFixedBuffer = [&]() -> bool {  // returns true iff a group was moved
+        if (stallBuffer.empty()) {
+            return false;  // nothing is buffered
+        }
+
+        // stallbuffer moves to fixedbuffer in strict FIFO order.
+        ThreadID tid = stallBuffer.front()->threadNumber;
+        if (!fixedbuffer[tid].empty()) {
+            return false;  // head thread's fixedbuffer is still occupied, so the head waits
+        }
+
+        int insts_from_stall = eachstallSize.front();  // instruction count of the oldest group
+        eachstallSize.pop_front();
+        for (int i = 0; i < insts_from_stall; ++i) {
+            const DynInstPtr &inst = stallBuffer.front();
+            assert(tid == inst->threadNumber);  // the whole group must belong to the head thread
+            if (localSquashVer[tid].largerThan(inst->getVersion())) {
+                inst->setSquashed();  // tid's local squash version is larger than the inst's
+            }
+            assert(!fixedbuffer[inst->threadNumber].full());
+            fixedbuffer[inst->threadNumber].push_back(inst);
+            stallBuffer.pop_front();
+        }
+
+        return true;
+    };
+
+    // Model one stage advance before latching the next cycle's input so a
+    // full stall buffer can still accept a new fetch bundle when its head
+    // group moves forward in the same cycle.
+    const bool moved_group = tryMoveHeadGroupToFixedBuffer();
+
     // do not support mixed thread instructions in one fetch group
     int insts_from_fetch = fromFetch->size;
     if (insts_from_fetch != 0) {
@@ -392,23 +432,12 @@ Decode::moveInstsToBuffer()
     if (stallBuffer.empty()) {
         return;
     }
-    // stallbuffer move to fixedbuffer
-    ThreadID tid = stallBuffer.front()->threadNumber;
-    if (!fixedbuffer[tid].empty())
-        return;
-    insts_from_fetch = eachstallSize.front();
-    eachstallSize.pop_front();
-    for (int i = 0; i < insts_from_fetch; ++i) {
-        const DynInstPtr &inst = stallBuffer.front();
-        assert(tid == inst->threadNumber);
-        if (localSquashVer[tid].largerThan(inst->getVersion())) {
-            inst->setSquashed();
-        }
-        assert(!fixedbuffer[inst->threadNumber].full());
-        fixedbuffer[inst->threadNumber].push_back(inst);
-        stallBuffer.pop_front();
-    }
 
+    // If nothing advanced before latching new input, allow the current head
+    // (possibly the just-arrived group) to fill an empty stage this cycle.
+    if (!moved_group) {
+        tryMoveHeadGroupToFixedBuffer();
+    }
 }
 
 void
@@ -443,13 +472,27 @@ Decode::tick()
     // check threads stall & status
     ThreadID tid = InvalidThreadID;
     ThreadID blocked_tid = InvalidThreadID;
+    const bool fifoBackpressured =
+        !stallBuffer.empty() &&
+        eachstallSize.size() + decodeToFetchDelay + 1 >=
+            eachstallSize.capacity();
+    const ThreadID fifoHeadTid =
+        !stallBuffer.empty() ? stallBuffer.front()->threadNumber : InvalidThreadID;
+    const StallReason fifoBlockReason =
+        (fifoBackpressured && fifoHeadTid != InvalidThreadID &&
+         stallSig->blockDecode[fifoHeadTid]) ?
+            stallSig->decodeBlockReason[fifoHeadTid] :
+            (fifoBackpressured ? StallReason::OtherFragStall :
+                                 StallReason::NoStall);
     for (int i = 0; i < numThreads; i++) {
         bool block = stallSig->blockDecode[i];
         bool active = !block && !fixedbuffer[i].empty();
 
-        stallSig->blockFetch[i] = block;
+        stallSig->blockFetch[i] = block || fifoBackpressured;
         stallSig->fetchBlockReason[i] =
-            block ? stallSig->decodeBlockReason[i] : StallReason::NoStall;
+            stallSig->blockFetch[i] ?
+                (block ? stallSig->decodeBlockReason[i] : fifoBlockReason) :
+                StallReason::NoStall;
         toFetch->decodeInfo[i].blockReason = stallSig->fetchBlockReason[i];
         if (active) {
             if (tid == InvalidThreadID)
diff --git a/src/cpu/o3/decode.hh b/src/cpu/o3/decode.hh
index 0f55d838b4..c548fad3c7 100644
--- a/src/cpu/o3/decode.hh
+++ b/src/cpu/o3/decode.hh
@@ -236,6 +236,9 @@ class Decode
     /** Fetch to decode delay. */
     Cycles fetchToDecodeDelay;
 
+    /** Decode to fetch feedback delay for stage backpressure. */
+    Cycles decodeToFetchDelay;
+
     /** The width of decode, in instructions. */
     unsigned decodeWidth;
 
diff --git a/src/cpu/o3/fetch.cc b/src/cpu/o3/fetch.cc
index fba856f813..d2381123ab 100644
--- a/src/cpu/o3/fetch.cc
+++ b/src/cpu/o3/fetch.cc
@@ -98,7 +98,6 @@ Fetch::Fetch(CPU *_cpu, const BaseO3CPUParams &params)
       fetchWidth(params.fetchWidth),
       decodeWidth(params.decodeWidth),
       retryPkt(),
-      retryTid(InvalidThreadID),
       cacheBlkSize(cpu->cacheLineSize()),
       fetchBufferSize(params.fetchBufferSize),
       fetchQueueSize(params.fetchQueueSize),
@@ -460,6 +459,10 @@ Fetch::resetStage()
 {
     numInst = 0;
     interruptPending = false;
+    for (auto *pkt : retryPkt) {
+        delete pkt;
+    }
+    retryPkt.clear();
     cacheBlocked = false;
 
     priorityList.clear();
@@ -489,7 +492,9 @@ Fetch::resetStage()
     }
 
     assert(dbpbtb);
-    dbpbtb->resetPC(threads[0].fetchpc->instAddr());
+    for (ThreadID tid = 0; tid < numThreads; ++tid) {
+        dbpbtb->resetPC(tid, threads[tid].fetchpc->instAddr());
+    }
 }
 
 bool
@@ -587,8 +592,11 @@ Fetch::processMultiCacheLineCompletion(ThreadID tid, PacketPtr pkt)
         DPRINTF(Fetch, "[tid:%i] Waiting for remaining packets. Completed: %d, Total: %d\n",
                 tid, threads[tid].cacheReq.completedPackets, threads[tid].cacheReq.packets.size());
 
-        // Note: retry is handled completely by the standard gem5 recvReqRetry mechanism
-        // No need to handle retry here to avoid duplicate packet sending
+        if (cacheBlocked && !retryPkt.empty()) {
+            DPRINTF(Fetch, "[tid:%i] Cache response arrived with queued retries pending; "
+                    "trying one response-driven retry pass\n", tid);
+            retryPendingIcacheRequests();
+        }
 
         return false;  // Return false to indicate we're still waiting
     }
@@ -687,7 +695,6 @@ Fetch::drainSanityCheck() const
 {
     assert(isDrained());
     assert(retryPkt.size() == 0);
-    assert(retryTid == InvalidThreadID);
     assert(!cacheBlocked);
     assert(!interruptPending);
 
@@ -939,6 +946,16 @@ Fetch::handleSuccessfulTranslation(ThreadID tid, const RequestPtr &mem_req, Addr
 
     fetchStats.cacheLines++;
 
+    if (cacheBlocked) {
+        DPRINTF(Fetch, "[tid:%i] I-cache port already waiting for retry, queueing %#lx\n",
+                tid, mem_req->getVaddr());
+
+        updateCacheRequestStatusByRequest(tid, mem_req, CacheWaitRetry);
+        setAllFetchStalls(StallReason::IcacheStall);
+        retryPkt.push_back(data_pkt);
+        return;
+    }
+
     // Access the cache.
     if (!icachePort.sendTimingReq(data_pkt)) {
         DPRINTF(Fetch, "[tid:%i] Out of MSHRs!\n", tid);
@@ -950,7 +967,6 @@ Fetch::handleSuccessfulTranslation(ThreadID tid, const RequestPtr &mem_req, Addr
                 mem_req->getVaddr());
         setAllFetchStalls(StallReason::IcacheStall);
         retryPkt.push_back(data_pkt);
-        retryTid = tid;
         cacheBlocked = true;
     } else {
         DPRINTF(Fetch, "[tid:%i] Doing Icache access.\n", tid);
@@ -1110,15 +1126,17 @@ Fetch::doSquash(PCStateBase &new_pc, const DynInstPtr squashInst, const InstSeqN
     // Reset the cache request after cancelling
     threads[tid].cacheReq.reset();
 
-    // Get rid of the retrying packet if it was from this thread.
-    if (retryTid == tid) {
-        assert(cacheBlocked);
-        for (auto it : retryPkt) {
-            delete it;
+    // Drop any retry packets that belong to this squashed thread.
+    for (auto it = retryPkt.begin(); it != retryPkt.end();) {
+        if (cpu->contextToThread((*it)->req->contextId()) == tid) {
+            delete *it;
+            it = retryPkt.erase(it);
+        } else {
+            ++it;
         }
-        retryPkt.clear();
-        retryTid = InvalidThreadID;
-        cacheBlocked = false;   // clear cache blocked
+    }
+    if (retryPkt.empty()) {
+        cacheBlocked = false;
     }
 
     if (squashInst && !squashInst->isControl()) {
@@ -1577,11 +1595,11 @@ Fetch::handleIEWSignals()
         }
         bool success = dbpbtb->resolveUpdate(stream_id, tid);
         if (success) {
-            dbpbtb->notifyResolveSuccess();
+            dbpbtb->notifyResolveSuccess(tid);
             resolveQueue.pop_front();
             fetchStats.resolveDequeueCount++;
         } else {
-            dbpbtb->notifyResolveFailure();
+            dbpbtb->notifyResolveFailure(tid);
         }
     }
 }
@@ -1731,8 +1749,8 @@ Fetch::buildInst(ThreadID tid, StaticInstPtr staticInst,
             instruction->isMov());
     assert(dbpbtb);
     DPRINTF(DecoupleBP, "Set instruction %lu with fetch id %lu\n",
-            instruction->seqNum, dbpbtb->ftqHeadId(0));
-    instruction->setFtqId(dbpbtb->ftqHeadId(0));
+            instruction->seqNum, dbpbtb->ftqHeadId(tid));
+    instruction->setFtqId(dbpbtb->ftqHeadId(tid));
 
 #if TRACING_ON
     if (trace) {
@@ -2087,36 +2105,32 @@ Fetch::sendNextCacheRequest(ThreadID tid, const PCStateBase &pc_state) {
 void
 Fetch::recvReqRetry()
 {
-    if (retryPkt.size() == 0) {
-        assert(retryTid == InvalidThreadID);
+    if (retryPkt.empty()) {  // no packet is queued for retry
         // Access has been squashed since it was sent out.  Just clear
         // the cache being blocked.
         cacheBlocked = false;
         return;
     }
     assert(cacheBlocked);
-    assert(retryTid != InvalidThreadID);
-    // Note: In multi-cacheline fetch, overall status may not be CacheWaitRetry
-    // if some requests have progressed while others still need retry.
-    // The presence of retryPkt itself indicates retry is needed.
+    retryPendingIcacheRequests();  // resend the queued packets; it clears cacheBlocked if all go out
+}
 
-    for (auto it = retryPkt.begin(); it != retryPkt.end();) {
-        if (icachePort.sendTimingReq(*it)) {
-            // Use new cache state management with specific RequestPtr
-            updateCacheRequestStatusByRequest(retryTid, (*it)->req, CacheWaitResponse);
-            // Notify Fetch Request probe when a retryPkt is successfully sent.
-            // Note that notify must be called before retryPkt is set to NULL.
-            ppFetchRequestSent->notify((*it)->req);
-            it = retryPkt.erase(it);
-        } else {
-            it++;
+void
+Fetch::retryPendingIcacheRequests()  // resend queued I-cache packets front to back
+{
+    while (!retryPkt.empty()) {
+        PacketPtr pkt = retryPkt.front();  // oldest queued packet
+        if (!icachePort.sendTimingReq(pkt)) {
+            return;  // rejected: keep pkt queued and leave cacheBlocked unchanged
         }
-    }
 
-    if (retryPkt.size() == 0) {
-        retryTid = InvalidThreadID;
-        cacheBlocked = false;
+        const ThreadID tid = cpu->contextToThread(pkt->req->contextId());  // owner thread of pkt
+        updateCacheRequestStatusByRequest(tid, pkt->req, CacheWaitResponse);
+        ppFetchRequestSent->notify(pkt->req);  // probe fires only after a successful send
+        retryPkt.erase(retryPkt.begin());  // dequeue the packet that was just sent
     }
+
+    cacheBlocked = false;  // reached only when every queued packet was accepted
 }
 
 void
diff --git a/src/cpu/o3/fetch.hh b/src/cpu/o3/fetch.hh
index 1d8c3e88aa..0061b87912 100644
--- a/src/cpu/o3/fetch.hh
+++ b/src/cpu/o3/fetch.hh
@@ -322,6 +322,9 @@ class Fetch
     /** Reset this pipeline stage */
     void resetStage();
 
+    /** Resend queued I-cache packets in FIFO order until one is rejected. */
+    void retryPendingIcacheRequests();
+
     /** Changes the status of this stage to active, and indicates this
      * to the CPU.
      */
@@ -676,12 +679,9 @@ class Fetch
     /** Is the cache blocked?  If so no threads can access it. */
     bool cacheBlocked;
 
-    /** The packet that is waiting to be retried. */
+    /** Packets waiting for the next cache-issued retry callback. */
     std::vector<PacketPtr> retryPkt;
 
-    /** The thread that is waiting on the cache to tell fetch to retry. */
-    ThreadID retryTid;
-
     /** Cache block size. */
     unsigned int cacheBlkSize;
 
diff --git a/src/cpu/o3/inst_queue.cc b/src/cpu/o3/inst_queue.cc
index 72f99bbb8e..db8ec407f4 100644
--- a/src/cpu/o3/inst_queue.cc
+++ b/src/cpu/o3/inst_queue.cc
@@ -53,6 +53,7 @@
 #include "cpu/o3/dyn_inst.hh"
 #include "cpu/o3/dyn_inst_ptr.hh"
 #include "cpu/o3/fu_pool.hh"
+#include "cpu/o3/iew.hh"
 #include "cpu/o3/issue_queue.hh"
 #include "cpu/o3/limits.hh"
 #include "debug/IQ.hh"
diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh
index 504b4d4561..604df7c0f1 100644
--- a/src/cpu/o3/lsq.hh
+++ b/src/cpu/o3/lsq.hh
@@ -1143,7 +1143,6 @@ class LSQ
     std::vector<uint32_t> dcacheRefillDataRead;
     std::vector<uint32_t> dcacheRefillDataWrite;
     std::vector<uint32_t> dcacheRefillTagWrite;
-
     bool isDcacheRefillTagWrite() const
     {
         for (auto stage : dcacheRefillTagWrite) {
diff --git a/src/cpu/o3/lsq_unit.cc b/src/cpu/o3/lsq_unit.cc
index 433f3c17a8..9cfc4d791f 100644
--- a/src/cpu/o3/lsq_unit.cc
+++ b/src/cpu/o3/lsq_unit.cc
@@ -349,7 +349,8 @@ LSQUnit::completeDataAccess(PacketPtr pkt)
         if (inst->isLoad() || inst->isAtomic()) {
             Addr addr = pkt->getAddr();
             auto [enable_diff, diff_all_states] = cpu->getDiffAllStates();
-            if (system->multiCore() && enable_diff && !request->_sbufferBypass &&
+            if (system->multiContextDifftest() && enable_diff &&
+                !request->_sbufferBypass &&
                 cpu->goldenMemManager()->inPmem(addr)) {
                 // check data with golden mem
                 uint8_t *golden_data = (uint8_t *)cpu->goldenMemManager()->guestToHost(addr);
diff --git a/src/cpu/o3/rename.cc b/src/cpu/o3/rename.cc
index 0b9a1b47a8..84e3e0e031 100644
--- a/src/cpu/o3/rename.cc
+++ b/src/cpu/o3/rename.cc
@@ -79,6 +79,8 @@ Rename::Rename(CPU *_cpu, const BaseO3CPUParams &params)
         fixedbuffer[tid] = boost::circular_buffer<DynInstPtr>(renameWidth);
         renameMap[tid] = nullptr;
         stalls[tid] = {false, false};
+        finalCommitSeq[tid] = 0;
+        releaseSeq[tid] = 0;
     }
 
     assert(decodeToRenameDelay == 1);
@@ -261,6 +263,8 @@ Rename::resetStage()
     for (ThreadID tid = 0; tid < numThreads; tid++) {
 
         stalls[tid].iew = false;
+        finalCommitSeq[tid] = 0;
+        releaseSeq[tid] = 0;
     }
 }
 
@@ -416,7 +420,15 @@ Rename::tick()
 
     updateActivate();
 
-    if (wroteToTimeBuffer || releaseSeq < finalCommitSeq) {
+    bool release_pending = false;
+    for (ThreadID tid = 0; tid < numThreads; ++tid) {
+        if (releaseSeq[tid] < finalCommitSeq[tid]) {
+            release_pending = true;
+            break;
+        }
+    }
+
+    if (wroteToTimeBuffer || release_pending) {
         DPRINTF(Activity, "Activity this cycle.\n");
         cpu->activityThisCycle();
     }
@@ -427,21 +439,23 @@ Rename::releasePhysRegs()
 {
     // Release physical registers up to releaseWidth
     auto threads = activeThreads->begin();
-    if (releaseSeq + releaseWidth < finalCommitSeq) {
-        releaseSeq += releaseWidth;
-    } else {
-        releaseSeq = finalCommitSeq;
-    }
     while (threads != activeThreads->end()) {
         ThreadID tid = *threads++;
 
-        removeFromHistory(releaseSeq, tid);
+        if (releaseSeq[tid] + releaseWidth < finalCommitSeq[tid]) {
+            releaseSeq[tid] += releaseWidth;
+        } else {
+            releaseSeq[tid] = finalCommitSeq[tid];
+        }
+
+        removeFromHistory(releaseSeq[tid], tid);
         // If we committed this cycle then doneSeqNum will be > 0
         if (fromCommit->commitInfo[tid].doneSeqNum != 0 &&
             !fromCommit->commitInfo[tid].squash) {
 
-            finalCommitSeq = fromCommit->commitInfo[tid].doneSeqNum;
-            releaseSeq = historyBuffer->empty() ? 0 : historyBuffer[tid].back().instSeqNum;
+            finalCommitSeq[tid] = fromCommit->commitInfo[tid].doneSeqNum;
+            releaseSeq[tid] =
+                historyBuffer[tid].empty() ? 0 : historyBuffer[tid].back().instSeqNum;
         }
     }
 }
diff --git a/src/cpu/o3/rename.hh b/src/cpu/o3/rename.hh
index 4e83cc0919..861b0f82c2 100644
--- a/src/cpu/o3/rename.hh
+++ b/src/cpu/o3/rename.hh
@@ -277,9 +277,9 @@ class Rename
      */
     std::list<RenameHistory> historyBuffer[MaxThreads];
 
-    InstSeqNum finalCommitSeq = 0;
+    InstSeqNum finalCommitSeq[MaxThreads] = {};
 
-    InstSeqNum releaseSeq = 0;
+    InstSeqNum releaseSeq[MaxThreads] = {};
 
     void tryFreePReg(PhysRegIdPtr phys_reg);
 
diff --git a/src/cpu/pred/btb/abtb.cc b/src/cpu/pred/btb/abtb.cc
index c4876e8158..aeafc9bb38 100644
--- a/src/cpu/pred/btb/abtb.cc
+++ b/src/cpu/pred/btb/abtb.cc
@@ -313,8 +313,9 @@ AheadBTB::putPCHistory(Addr startAddr,
 }
 
 std::shared_ptr<void>
-AheadBTB::getPredictionMeta()
+AheadBTB::getPredictionMeta(ThreadID tid)
 {
+    (void)tid;
     // Lazy-initialize meta so callers never observe a null pointer
     // This avoids early-cycle crashes when prediction hasn't populated meta yet
     if (!meta) {
diff --git a/src/cpu/pred/btb/abtb.hh b/src/cpu/pred/btb/abtb.hh
index 9e7abc6260..677f5f7f32 100644
--- a/src/cpu/pred/btb/abtb.hh
+++ b/src/cpu/pred/btb/abtb.hh
@@ -147,7 +147,7 @@ class AheadBTB : public TimedBaseBTBPredictor
     /** Get prediction BTBMeta
      *  @return Returns the prediction meta
      */
-    std::shared_ptr<void> getPredictionMeta() override;
+    std::shared_ptr<void> getPredictionMeta(ThreadID tid = 0) override;
 
     // not used
     void specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred) override;
diff --git a/src/cpu/pred/btb/btb_ittage.cc b/src/cpu/pred/btb/btb_ittage.cc
index aed825e1e3..e625650d10 100644
--- a/src/cpu/pred/btb/btb_ittage.cc
+++ b/src/cpu/pred/btb/btb_ittage.cc
@@ -37,6 +37,8 @@ ittageStats(this, p.numPredictors)
     tableIndexMasks.resize(numPredictors);
     tableTagBits.resize(numPredictors);
     tableTagMasks.resize(numPredictors);
+    threadHistory.resize(MaxThreads);
+    threadMeta.resize(MaxThreads);
     for (unsigned int i = 0; i < p.numPredictors; ++i) {
         //initialize ittage predictor
         assert(tableSizes.size() >= numPredictors);
@@ -52,9 +54,15 @@ ittageStats(this, p.numPredictors)
 
         assert(tablePcShifts.size() >= numPredictors);
 
-        tagFoldedHist.push_back(PathFoldedHist((int)histLengths[i], (int)tableTagBits[i], (int)16));
-        altTagFoldedHist.push_back(PathFoldedHist((int)histLengths[i], (int)tableTagBits[i]-1, (int)16));
-        indexFoldedHist.push_back(PathFoldedHist((int)histLengths[i], (int)tableIndexBits[i], (int)16));
+        for (ThreadID tid = 0; tid < MaxThreads; ++tid) {
+            auto &state = threadHistory[tid];
+            state.tagFoldedHist.emplace_back(
+                (int)histLengths[i], (int)tableTagBits[i], (int)16);
+            state.altTagFoldedHist.emplace_back(
+                (int)histLengths[i], (int)tableTagBits[i] - 1, (int)16);
+            state.indexFoldedHist.emplace_back(
+                (int)histLengths[i], (int)tableIndexBits[i], (int)16);
+        }
     }
     // useAlt.resize(128);
     // for (unsigned i = 0; i < useAlt.size(); ++i) {
@@ -63,6 +71,27 @@ ittageStats(this, p.numPredictors)
     usefulResetCnt = 0;
 }
 
+ThreadID
+BTBITTAGE::predictorTid(const std::vector<FullBTBPrediction> &stagePreds) const
+{
+    assert(!stagePreds.empty());  // front() below needs at least one stage prediction
+    return stagePreds.front().tid;  // NOTE(review): presumably all stages share this tid - confirm
+}
+
+BTBITTAGE::ThreadHistoryState &
+BTBITTAGE::historyState(ThreadID tid)  // mutable folded-history state of thread tid
+{
+    assert(tid < threadHistory.size());  // threadHistory is resized to MaxThreads in the constructor
+    return threadHistory[tid];
+}
+
+const BTBITTAGE::ThreadHistoryState &
+BTBITTAGE::historyState(ThreadID tid) const  // read-only folded-history state of thread tid
+{
+    assert(tid < threadHistory.size());  // threadHistory is resized to MaxThreads in the constructor
+    return threadHistory[tid];
+}
+
 void
 BTBITTAGE::tickStart()
 {
@@ -72,7 +101,8 @@ void
 BTBITTAGE::tick() {}
 
 void
-BTBITTAGE::lookupHelper(Addr startAddr, const std::vector<BTBEntry> &btbEntries, IndirectTargets& results)
+BTBITTAGE::lookupHelper(Addr startAddr, const std::vector<BTBEntry> &btbEntries,
+                        IndirectTargets& results, ThreadID tid)
 {
     DPRINTF(ITTAGE, "lookupHelper startAddr: %#lx\n", startAddr);
     std::vector<TagePrediction> preds;
@@ -149,7 +179,7 @@ BTBITTAGE::lookupHelper(Addr startAddr, const std::vector<BTBEntry> &btbEntries,
             }
             // Note: predTargetHit will be updated in the update phase when we know the actual target
             TagePrediction pred(btb_entry.pc, main_info, alt_info, use_alt, main_target);
-            meta->preds[btb_entry.pc] = pred;
+            threadMeta[tid]->preds[btb_entry.pc] = pred;
         }
     }
 }
@@ -161,17 +191,19 @@ BTBITTAGE::dryRunCycle(Addr startPC) {
 
 void
 BTBITTAGE::putPCHistory(Addr stream_start, const bitset &history, std::vector<FullBTBPrediction> &stagePreds) {
+    const ThreadID tid = predictorTid(stagePreds);
+    const auto &state = historyState(tid);
     if (debugPC == stream_start) {
         debugFlag = true;
     }
     DPRINTF(ITTAGE, "putPCHistory startAddr: %#lx\n", stream_start);
 
     // clear old metas
-    meta = std::make_shared<TageMeta>();
+    threadMeta[tid] = std::make_shared<TageMeta>();
     // assign history for meta
-    meta->tagFoldedHist = tagFoldedHist;
-    meta->altTagFoldedHist = altTagFoldedHist;
-    meta->indexFoldedHist = indexFoldedHist;
+    threadMeta[tid]->tagFoldedHist = state.tagFoldedHist;
+    threadMeta[tid]->altTagFoldedHist = state.altTagFoldedHist;
+    threadMeta[tid]->indexFoldedHist = state.indexFoldedHist;
 
     lookupEntries.clear();
     lookupIndices.clear();
@@ -180,8 +212,9 @@ BTBITTAGE::putPCHistory(Addr stream_start, const bitset &history, std::vector<Fu
     // all btb entries should use the same lookup result
     // but each btb entry can use prediction from different tables
     for (int i = 0; i < numPredictors; ++i) {
-        Addr index = getTageIndex(stream_start, i);
-        Addr tag = getTageTag(stream_start, i);
+        Addr index = getTageIndex(stream_start, i, state.indexFoldedHist[i].get());
+        Addr tag = getTageTag(stream_start, i, state.tagFoldedHist[i].get(),
+                              state.altTagFoldedHist[i].get());
         auto &entry = tageTable[i][index];
         lookupEntries.push_back(entry);
         lookupIndices.push_back(index);
@@ -190,20 +223,24 @@ BTBITTAGE::putPCHistory(Addr stream_start, const bitset &history, std::vector<Fu
         DPRINTF(ITTAGE, "lookup table %d[%d]: valid %d, tag %d, ctr %d, useful %d\n",
             i, index, entry.valid, entry.tag, entry.counter, entry.useful);
     }
-    meta->usefulMask = std::move(useful_mask);
+    threadMeta[tid]->usefulMask = std::move(useful_mask);
 
     for (int s = getDelay(); s < stagePreds.size(); s++) {
         auto &stage_pred = stagePreds[s];
         stage_pred.indirectTargets.clear();
-        lookupHelper(stream_start, stage_pred.btbEntries, stage_pred.indirectTargets);
+        lookupHelper(stream_start, stage_pred.btbEntries,
+                     stage_pred.indirectTargets, tid);
     }
     DPRINTF(ITTAGE, "putPCHistory end\n");
     debugFlag = false;
 }
 
 std::shared_ptr<void>
-BTBITTAGE::getPredictionMeta() {
-    return meta;
+BTBITTAGE::getPredictionMeta(ThreadID tid) {  // meta slot kept for `tid`
+    if (tid >= threadMeta.size()) {  // unknown thread: no meta, not an assert
+        return nullptr;
+    }
+    return threadMeta[tid];  // type-erased; callers cast back to TageMeta
 }
 
 void
@@ -419,7 +456,9 @@ BTBITTAGE::getTageTag(Addr pc, int t, uint64_t foldedHist, uint64_t altFoldedHis
 Addr
 BTBITTAGE::getTageTag(Addr pc, int t)
 {
-    return getTageTag(pc, t, tagFoldedHist[t].get(), altTagFoldedHist[t].get());
+    const auto &state = historyState(0);  // NOTE(review): pinned to thread 0
+    return getTageTag(pc, t, state.tagFoldedHist[t].get(),
+                      state.altTagFoldedHist[t].get());
 }
 
 Addr
@@ -436,7 +475,7 @@ BTBITTAGE::getTageIndex(Addr pc, int t, uint64_t foldedHist)
 Addr
 BTBITTAGE::getTageIndex(Addr pc, int t)
 {
-    return getTageIndex(pc, t, indexFoldedHist[t].get());
+    return getTageIndex(pc, t, historyState(0).indexFoldedHist[t].get());  // NOTE(review): thread 0 only
 }
 
 bool
@@ -477,8 +516,10 @@ BTBITTAGE::satDecrement(int min, short &counter)
  * @param target The target address of the branch
  */
 void
-BTBITTAGE::doUpdateHist(const boost::dynamic_bitset<> &history, bool taken, Addr pc, Addr target)
+BTBITTAGE::doUpdateHist(const boost::dynamic_bitset<> &history, bool taken,
+                        Addr pc, Addr target, ThreadID tid)
 {
+    auto &state = historyState(tid);
     if (debug::ITTAGEHistory) {  // if debug flag is off, do not use to_string since it's too slow
         std::string buf;
         boost::to_string(history, buf);
@@ -491,7 +532,9 @@ BTBITTAGE::doUpdateHist(const boost::dynamic_bitset<> &history, bool taken, Addr
 
     for (int t = 0; t < numPredictors; t++) {
         for (int type = 0; type < 3; type++) {
-            auto &foldedHist = type == 0 ? indexFoldedHist[t] : type == 1 ? tagFoldedHist[t] : altTagFoldedHist[t];
+            auto &foldedHist = type == 0 ? state.indexFoldedHist[t]
+                                         : type == 1 ? state.tagFoldedHist[t]
+                                                     : state.altTagFoldedHist[t];
             // since we have folded path history, we can put arbitrary shamt here, and it wouldn't make a difference
             foldedHist.update(history, 2, taken, pc, target);
             DPRINTF(ITTAGEHistory, "t: %d, type: %d, foldedHist _folded 0x%lx\n", t, type, foldedHist.get());
@@ -502,7 +545,7 @@ BTBITTAGE::doUpdateHist(const boost::dynamic_bitset<> &history, bool taken, Addr
 bool
 BTBITTAGE::tageHit()
 {
-    auto meta = getPredictionMeta();
+    auto meta = getPredictionMeta(0);
     auto preds = std::static_pointer_cast<TageMeta>(meta)->preds;
     bool hit = false;
     for (auto & [pc, pred] : preds) {
@@ -530,7 +573,7 @@ void
 BTBITTAGE::specUpdatePHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred)
 {
     auto [pc, target, taken] = pred.getPHistInfo();
-    doUpdateHist(history, taken, pc, target);
+    doUpdateHist(history, taken, pc, target, pred.tid);
 }
 
 /**
@@ -549,18 +592,28 @@ BTBITTAGE::specUpdatePHist(const boost::dynamic_bitset<> &history, FullBTBPredic
 void
 BTBITTAGE::recoverPHist(const boost::dynamic_bitset<> &history, const FetchTarget &entry, int shamt, bool cond_taken)
 {
+    auto &state = historyState(entry.tid);  // histories of the entry's thread
     std::shared_ptr<TageMeta> predMeta = std::static_pointer_cast<TageMeta>(entry.predMetas[getComponentIdx()]);
     for (int i = 0; i < numPredictors; i++) {
-        tagFoldedHist[i].recover(predMeta->tagFoldedHist[i]);
-        altTagFoldedHist[i].recover(predMeta->altTagFoldedHist[i]);
-        indexFoldedHist[i].recover(predMeta->indexFoldedHist[i]);
+        state.tagFoldedHist[i].recover(predMeta->tagFoldedHist[i]);  // from the entry's saved meta
+        state.altTagFoldedHist[i].recover(predMeta->altTagFoldedHist[i]);
+        state.indexFoldedHist[i].recover(predMeta->indexFoldedHist[i]);
     }
-    doUpdateHist(history, cond_taken, entry.getControlPC(), entry.getTakenTarget());
+    doUpdateHist(history, cond_taken, entry.getControlPC(),
+                 entry.getTakenTarget(), entry.tid);  // then update with cond_taken
 }
 
 void
 BTBITTAGE::checkFoldedHist(const boost::dynamic_bitset<> &hist, const char * when)
 {
+    checkFoldedHist(hist, 0, when);  // tid-less overload: checks thread 0 only
+}
+
+void
+BTBITTAGE::checkFoldedHist(const boost::dynamic_bitset<> &hist, ThreadID tid,
+                           const char * when)
+{
+    auto &state = historyState(tid);
     if (debugFlag) {
         DPRINTF(ITTAGE, "checking folded history when %s\n", when);
         std::string hist_str;
@@ -571,7 +624,9 @@ BTBITTAGE::checkFoldedHist(const boost::dynamic_bitset<> &hist, const char * whe
         for (int type = 0; type < 2; type++) {
             DPRINTF(ITTAGE, "t: %d, type: %d\n", t, type);
             std::string buf2, buf3;
-            auto &foldedHist = type == 0 ? indexFoldedHist[t] : type == 1 ? tagFoldedHist[t] : altTagFoldedHist[t];
+            auto &foldedHist = type == 0 ? state.indexFoldedHist[t]
+                                         : type == 1 ? state.tagFoldedHist[t]
+                                                     : state.altTagFoldedHist[t];
             foldedHist.check(hist);
         }
     }
diff --git a/src/cpu/pred/btb/btb_ittage.hh b/src/cpu/pred/btb/btb_ittage.hh
index e86b45817b..8269fdaeb6 100644
--- a/src/cpu/pred/btb/btb_ittage.hh
+++ b/src/cpu/pred/btb/btb_ittage.hh
@@ -3,6 +3,7 @@
 
 #include <deque>
 #include <map>
+#include <memory>
 #include <utility>
 #include <vector>
 
@@ -10,6 +11,7 @@
 #include "base/statistics.hh"
 #include "base/types.hh"
 #include "cpu/inst_seq.hh"
+#include "cpu/o3/limits.hh"
 #include "cpu/pred/btb/common.hh"
 #include "cpu/pred/btb/folded_hist.hh"
 #include "cpu/pred/btb/timed_base_pred.hh"
@@ -30,6 +32,7 @@ class BTBITTAGE : public TimedBaseBTBPredictor
 {
     using defer = std::shared_ptr<void>;
     using bitset = boost::dynamic_bitset<>;
+    static constexpr unsigned MaxThreads = o3::MaxThreads;
   public:
     typedef BTBITTAGEParams Params;
 
@@ -99,7 +102,7 @@ class BTBITTAGE : public TimedBaseBTBPredictor
                       const boost::dynamic_bitset<> &history,
                       std::vector<FullBTBPrediction> &stagePreds) override;
 
-    std::shared_ptr<void> getPredictionMeta() override;
+    std::shared_ptr<void> getPredictionMeta(ThreadID tid = 0) override;
 
     // speculative update 3 folded history, according history and pred.taken
     // the other specUpdateHist methods are left blank
@@ -116,11 +119,13 @@ class BTBITTAGE : public TimedBaseBTBPredictor
 
     // check folded hists after speculative update and recover
     void checkFoldedHist(const bitset &history, const char *when);
+    void checkFoldedHist(const bitset &history, ThreadID tid, const char *when);
 
   private:
 
     // return provided
-    void lookupHelper(Addr stream_start, const std::vector<BTBEntry> &btbEntries, IndirectTargets& results);
+    void lookupHelper(Addr stream_start, const std::vector<BTBEntry> &btbEntries,
+                      IndirectTargets& results, ThreadID tid);
 
     // use blockPC
     Addr getTageIndex(Addr pc, int table);
@@ -139,7 +144,8 @@ class BTBITTAGE : public TimedBaseBTBPredictor
     }
 
     // Update branch history
-    void doUpdateHist(const bitset &history, bool taken, Addr pc, Addr target);
+    void doUpdateHist(const bitset &history, bool taken, Addr pc, Addr target,
+                      ThreadID tid);
 
     const unsigned numPredictors;
 
@@ -151,9 +157,14 @@ class BTBITTAGE : public TimedBaseBTBPredictor
     std::vector<bitset> tableTagMasks;
     std::vector<unsigned> tablePcShifts;
     std::vector<unsigned> histLengths;
-    std::vector<PathFoldedHist> tagFoldedHist;
-    std::vector<PathFoldedHist> altTagFoldedHist;
-    std::vector<PathFoldedHist> indexFoldedHist;
+    struct ThreadHistoryState  // folded path histories kept for one thread
+    {
+        std::vector<PathFoldedHist> tagFoldedHist;     // tag hash input
+        std::vector<PathFoldedHist> altTagFoldedHist;  // second tag hash input
+        std::vector<PathFoldedHist> indexFoldedHist;   // table index hash input
+    };
+
+    std::vector<ThreadHistoryState> threadHistory;
 
     LFSR64 allocLFSR;
 
@@ -261,7 +272,10 @@ class BTBITTAGE : public TimedBaseBTBPredictor
         }
     } TageMeta;
 
-    std::shared_ptr<TageMeta> meta;
+    std::vector<std::shared_ptr<TageMeta>> threadMeta;
+    ThreadID predictorTid(const std::vector<FullBTBPrediction> &stagePreds) const;
+    ThreadHistoryState &historyState(ThreadID tid);
+    const ThreadHistoryState &historyState(ThreadID tid) const;
 
 public:
 
diff --git a/src/cpu/pred/btb/btb_mgsc.cc b/src/cpu/pred/btb/btb_mgsc.cc
index b2b7726a5f..f0a3837191 100755
--- a/src/cpu/pred/btb/btb_mgsc.cc
+++ b/src/cpu/pred/btb/btb_mgsc.cc
@@ -60,41 +60,64 @@ BTBMGSC::initStorage()
     assert(isPowerOf2(numCtrsPerLine));
     numCtrsPerLineBits = log2i(numCtrsPerLine);
 
+    threadHistory.resize(MaxThreads);
+    threadMeta.resize(MaxThreads);
+
     auto bwTableSize = allocPredTable(bwTable, bwTableNum, bwTableIdxWidth);
-    for (unsigned int i = 0; i < bwTableNum; ++i) {
-        indexBwFoldedHist.push_back(GlobalBwFoldedHist(bwHistLen[i], bwTableIdxWidth - numCtrsPerLineBits, 16));
+    for (ThreadID tid = 0; tid < MaxThreads; ++tid) {
+        auto &state = threadHistory[tid];
+        for (unsigned int i = 0; i < bwTableNum; ++i) {
+            state.indexBwFoldedHist.emplace_back(
+                bwHistLen[i], bwTableIdxWidth - numCtrsPerLineBits, 16);
+        }
     }
     bwIndex.resize(bwTableNum);
 
     auto lTableSize = allocPredTable(lTable, lTableNum, lTableIdxWidth);
-    indexLFoldedHist.resize(numEntriesFirstLocalHistories);
-    for (unsigned int i = 0; i < lTableNum; ++i) {
-        for (unsigned int k = 0; k < numEntriesFirstLocalHistories; ++k) {
-            indexLFoldedHist[k].push_back(LocalFoldedHist(lHistLen[i], lTableIdxWidth - numCtrsPerLineBits, 16));
+    for (ThreadID tid = 0; tid < MaxThreads; ++tid) {
+        auto &state = threadHistory[tid];
+        state.indexLFoldedHist.resize(numEntriesFirstLocalHistories);
+        for (unsigned int i = 0; i < lTableNum; ++i) {
+            for (unsigned int k = 0; k < numEntriesFirstLocalHistories; ++k) {
+                state.indexLFoldedHist[k].push_back(LocalFoldedHist(
+                    lHistLen[i], lTableIdxWidth - numCtrsPerLineBits, 16));
+            }
         }
     }
     lIndex.resize(lTableNum);
 
     auto iTableSize = allocPredTable(iTable, iTableNum, iTableIdxWidth);
-    for (unsigned int i = 0; i < iTableNum; ++i) {
-        assert(iHistLen[i] >= 0);
-        assert(static_cast<unsigned>(iHistLen[i]) < 63);
-        assert(pow2(static_cast<unsigned>(iHistLen[i])) <= iTableSize);
-        indexIFoldedHist.push_back(ImliFoldedHist(iHistLen[i], iTableIdxWidth - numCtrsPerLineBits, 16));
+    for (ThreadID tid = 0; tid < MaxThreads; ++tid) {
+        auto &state = threadHistory[tid];
+        for (unsigned int i = 0; i < iTableNum; ++i) {
+            assert(iHistLen[i] >= 0);
+            assert(static_cast<unsigned>(iHistLen[i]) < 63);
+            assert(pow2(static_cast<unsigned>(iHistLen[i])) <= iTableSize);
+            state.indexIFoldedHist.emplace_back(
+                iHistLen[i], iTableIdxWidth - numCtrsPerLineBits, 16);
+        }
     }
     iIndex.resize(iTableNum);
 
     auto gTableSize = allocPredTable(gTable, gTableNum, gTableIdxWidth);
-    for (unsigned int i = 0; i < gTableNum; ++i) {
-        assert(gTable.size() >= gTableNum);
-        indexGFoldedHist.push_back(GlobalFoldedHist(gHistLen[i], gTableIdxWidth - numCtrsPerLineBits, 16));
+    for (ThreadID tid = 0; tid < MaxThreads; ++tid) {
+        auto &state = threadHistory[tid];
+        for (unsigned int i = 0; i < gTableNum; ++i) {
+            assert(gTable.size() >= gTableNum);
+            state.indexGFoldedHist.emplace_back(
+                gHistLen[i], gTableIdxWidth - numCtrsPerLineBits, 16);
+        }
     }
     gIndex.resize(gTableNum);
 
     auto pTableSize = allocPredTable(pTable, pTableNum, pTableIdxWidth);
-    for (unsigned int i = 0; i < pTableNum; ++i) {
-        assert(pTable.size() >= pTableNum);
-        indexPFoldedHist.push_back(PathFoldedHist(pHistLen[i], pTableIdxWidth - numCtrsPerLineBits, 2));
+    for (ThreadID tid = 0; tid < MaxThreads; ++tid) {
+        auto &state = threadHistory[tid];
+        for (unsigned int i = 0; i < pTableNum; ++i) {
+            assert(pTable.size() >= pTableNum);
+            state.indexPFoldedHist.emplace_back(
+                pHistLen[i], pTableIdxWidth - numCtrsPerLineBits, 2);
+        }
     }
     pIndex.resize(pTableNum);
 
@@ -219,6 +242,27 @@ BTBMGSC::BTBMGSC(const Params &p)
 #endif
 BTBMGSC::~BTBMGSC() {}
 
+ThreadID
+BTBMGSC::predictorTid(const std::vector<FullBTBPrediction> &stagePreds) const
+{   // Thread id of a prediction round, read from the first stage prediction.
+    assert(!stagePreds.empty());  // front() needs at least one stage entry
+    return stagePreds.front().tid;  // NOTE(review): assumes all stages agree
+}
+
+BTBMGSC::ThreadHistoryState &
+BTBMGSC::historyState(ThreadID tid)
+{   // Mutable access to the folded-history set stored for thread `tid`.
+    assert(tid < threadHistory.size());  // tid must index an existing slot
+    return threadHistory[tid];
+}
+
+const BTBMGSC::ThreadHistoryState &
+BTBMGSC::historyState(ThreadID tid) const
+{   // Read-only access to the folded-history set stored for thread `tid`.
+    assert(tid < threadHistory.size());  // tid must index an existing slot
+    return threadHistory[tid];
+}
+
 // Set up tracing for debugging
 void
 BTBMGSC::setTrace()
@@ -357,34 +401,41 @@ BTBMGSC::calculateWeightScaleDiff(int total_sum, int scale_percsum, int percsum)
  * @return TagePrediction containing main and alternative predictions
  */
 BTBMGSC::MgscPrediction
-BTBMGSC::generateSinglePrediction(const BTBEntry &btb_entry, const Addr &startPC, const TageInfoForMGSC &tage_info)
+BTBMGSC::generateSinglePrediction(const BTBEntry &btb_entry, const Addr &startPC,
+                                  const TageInfoForMGSC &tage_info,
+                                  ThreadID tid)
 {
     DPRINTF(MGSC, "generateSinglePrediction for btbEntry: %#lx, always taken %d\n", btb_entry.pc,
             btb_entry.alwaysTaken);
+    const auto &state = historyState(tid);
 
     // Calculate indices for all tables
     for (unsigned int i = 0; i < bwTableNum; ++i) {
-        bwIndex[i] = getHistIndex(startPC, bwTableIdxWidth - numCtrsPerLineBits, indexBwFoldedHist[i].get());
+        bwIndex[i] = getHistIndex(startPC, bwTableIdxWidth - numCtrsPerLineBits,
+                                  state.indexBwFoldedHist[i].get());
     }
 
     for (unsigned int i = 0; i < lTableNum; ++i) {
         lIndex[i] = getHistIndex(startPC, lTableIdxWidth - numCtrsPerLineBits,
-                                 indexLFoldedHist[getPcIndex(startPC, log2(numEntriesFirstLocalHistories))][i].get());
+                                 state.indexLFoldedHist[getPcIndex(startPC, log2(numEntriesFirstLocalHistories))][i].get());
     }
     // std::string buf;
     // boost::to_string(indexLFoldedHist[getPcIndex(startPC, log2(numEntriesFirstLocalHistories))][0].getAsBitset(), buf);
     // DPRINTF(MGSC, "startPC: %#lx, local index: %d, local_folded_hist: %s\n", startPC, lIndex[0], buf.c_str());
 
     for (unsigned int i = 0; i < iTableNum; ++i) {
-        iIndex[i] = getHistIndex(startPC, iTableIdxWidth - numCtrsPerLineBits, indexIFoldedHist[i].get());
+        iIndex[i] = getHistIndex(startPC, iTableIdxWidth - numCtrsPerLineBits,
+                                 state.indexIFoldedHist[i].get());
     }
 
     for (unsigned int i = 0; i < gTableNum; ++i) {
-        gIndex[i] = getHistIndex(startPC, gTableIdxWidth - numCtrsPerLineBits, indexGFoldedHist[i].get());
+        gIndex[i] = getHistIndex(startPC, gTableIdxWidth - numCtrsPerLineBits,
+                                 state.indexGFoldedHist[i].get());
     }
 
     for (unsigned int i = 0; i < pTableNum; ++i) {
-        pIndex[i] = getHistIndex(startPC, pTableIdxWidth - numCtrsPerLineBits, indexPFoldedHist[i].get());
+        pIndex[i] = getHistIndex(startPC, pTableIdxWidth - numCtrsPerLineBits,
+                                 state.indexPFoldedHist[i].get());
     }
 
     for (unsigned int i = 0; i < biasTableNum; ++i) {
@@ -478,7 +529,8 @@ BTBMGSC::generateSinglePrediction(const BTBEntry &btb_entry, const Addr &startPC
  */
 void
 BTBMGSC::lookupHelper(const Addr &startPC, const std::vector<BTBEntry> &btbEntries,
-                      const std::unordered_map<Addr, TageInfoForMGSC> &tageInfoForMgscs, CondTakens &results)
+                      const std::unordered_map<Addr, TageInfoForMGSC> &tageInfoForMgscs,
+                      CondTakens &results, ThreadID tid)
 {
     DPRINTF(MGSC, "lookupHelper startAddr: %#lx\n", startPC);
 
@@ -488,8 +540,9 @@ BTBMGSC::lookupHelper(const Addr &startPC, const std::vector<BTBEntry> &btbEntri
         if (btb_entry.isCond && btb_entry.valid) {
             auto tage_info = tageInfoForMgscs.find(btb_entry.pc);
             if (tage_info != tageInfoForMgscs.end()) {
-                auto pred = generateSinglePrediction(btb_entry, startPC, tage_info->second);
-                meta->preds[btb_entry.pc] = pred;
+                auto pred = generateSinglePrediction(btb_entry, startPC,
+                                                     tage_info->second, tid);
+                threadMeta[tid]->preds[btb_entry.pc] = pred;
                 results.push_back({btb_entry.pc, pred.taken || btb_entry.alwaysTaken});
             } else {
                 assert(false);
@@ -514,6 +567,8 @@ void
 BTBMGSC::putPCHistory(Addr stream_start, const boost::dynamic_bitset<> &history,
                       std::vector<FullBTBPrediction> &stagePreds)
 {
+    const ThreadID tid = predictorTid(stagePreds);
+    const auto &state = historyState(tid);
     DPRINTF(MGSC, "putPCHistory startAddr: %#lx\n", stream_start);
 
     // IMPORTANT: when this function is called,
@@ -525,25 +580,29 @@ BTBMGSC::putPCHistory(Addr stream_start, const boost::dynamic_bitset<> &history,
     }
 
     // Clear old prediction metadata and save current history state
-    meta = std::make_shared<MgscMeta>();
-    meta->indexBwFoldedHist = indexBwFoldedHist;
-    meta->indexLFoldedHist = indexLFoldedHist;
-    meta->indexIFoldedHist = indexIFoldedHist;
-    meta->indexGFoldedHist = indexGFoldedHist;
-    meta->indexPFoldedHist = indexPFoldedHist;
+    threadMeta[tid] = std::make_shared<MgscMeta>();
+    threadMeta[tid]->indexBwFoldedHist = state.indexBwFoldedHist;
+    threadMeta[tid]->indexLFoldedHist = state.indexLFoldedHist;
+    threadMeta[tid]->indexIFoldedHist = state.indexIFoldedHist;
+    threadMeta[tid]->indexGFoldedHist = state.indexGFoldedHist;
+    threadMeta[tid]->indexPFoldedHist = state.indexPFoldedHist;
 
     for (int s = getDelay(); s < stagePreds.size(); s++) {
         // TODO: only lookup once for one btb entry in different stages
         auto &stage_pred = stagePreds[s];
         stage_pred.condTakens.clear();
-        lookupHelper(stream_start, stage_pred.btbEntries, stage_pred.tageInfoForMgscs, stage_pred.condTakens);
+        lookupHelper(stream_start, stage_pred.btbEntries,
+                     stage_pred.tageInfoForMgscs, stage_pred.condTakens, tid);
     }
 }
 
 std::shared_ptr<void>
-BTBMGSC::getPredictionMeta()
+BTBMGSC::getPredictionMeta(ThreadID tid)  // meta slot kept for `tid`
 {
-    return meta;
+    if (tid >= threadMeta.size()) {  // unknown thread: no meta, not an assert
+        return nullptr;
+    }
+    return threadMeta[tid];  // type-erased; callers cast back to MgscMeta
 }
 
 /**
@@ -1092,10 +1151,11 @@ BTBMGSC::doUpdateHist(const boost::dynamic_bitset<> &history, int shamt, bool ta
 void
 BTBMGSC::specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred)
 {
+    auto &state = historyState(pred.tid);  // histories of pred's thread
     int shamt;
     bool cond_taken;
     std::tie(shamt, cond_taken) = pred.getHistInfo();
-    doUpdateHist(history, shamt, cond_taken, indexGFoldedHist);  // use global history to update G folded history
+    doUpdateHist(history, shamt, cond_taken, state.indexGFoldedHist);  // use global history to update G folded history
 }
 
 /**
@@ -1113,8 +1173,9 @@ BTBMGSC::specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPredictio
 void
 BTBMGSC::specUpdatePHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred)
 {
+    auto &state = historyState(pred.tid);  // histories of pred's thread
     auto [pc, target, taken] = pred.getPHistInfo();
-    doUpdateHist(history, 2, taken, indexPFoldedHist, pc, target);  // only path history needs pc!
+    doUpdateHist(history, 2, taken, state.indexPFoldedHist, pc, target);  // only path history needs pc!
 }
 
 
@@ -1133,10 +1194,11 @@ BTBMGSC::specUpdatePHist(const boost::dynamic_bitset<> &history, FullBTBPredicti
 void
 BTBMGSC::specUpdateBwHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred)
 {
+    auto &state = historyState(pred.tid);  // histories of pred's thread
     int shamt;
     bool cond_taken;
     std::tie(shamt, cond_taken) = pred.getBwHistInfo();
-    doUpdateHist(history, shamt, cond_taken, indexBwFoldedHist);
+    doUpdateHist(history, shamt, cond_taken, state.indexBwFoldedHist);  // Bw folded histories only
 }
 
 /**
@@ -1154,12 +1216,13 @@ BTBMGSC::specUpdateBwHist(const boost::dynamic_bitset<> &history, FullBTBPredict
 void
 BTBMGSC::specUpdateIHist(FullBTBPrediction &pred)
 {
+    auto &state = historyState(pred.tid);  // histories of pred's thread
     int shamt;
     bool cond_taken;
     std::tie(shamt, cond_taken) = pred.getBwHistInfo();
     // IMLI uses counter only, pass empty bitset (not used by ImliFoldedHist::update)
     boost::dynamic_bitset<> dummy;
-    doUpdateHist(dummy, shamt, cond_taken, indexIFoldedHist);
+    doUpdateHist(dummy, shamt, cond_taken, state.indexIFoldedHist);  // IMLI folded histories only
 }
 
 /**
@@ -1177,11 +1240,12 @@ BTBMGSC::specUpdateIHist(FullBTBPrediction &pred)
 void
 BTBMGSC::specUpdateLHist(const std::vector<boost::dynamic_bitset<>> &history, FullBTBPrediction &pred)
 {
+    auto &state = historyState(pred.tid);  // histories of pred's thread
     int shamt;
     bool cond_taken;
     std::tie(shamt, cond_taken) = pred.getHistInfo();
     doUpdateHist(history[getPcIndex(pred.bbStart, log2(numEntriesFirstLocalHistories))], shamt, cond_taken,
-                 indexLFoldedHist[getPcIndex(pred.bbStart, log2(numEntriesFirstLocalHistories))]);
+                 state.indexLFoldedHist[getPcIndex(pred.bbStart, log2(numEntriesFirstLocalHistories))]);  // same slot as history[]
 }
 
 /**
@@ -1203,11 +1267,12 @@ BTBMGSC::recoverHist(const boost::dynamic_bitset<> &history, const FetchTarget &
     if (!isEnabled()) {
         return;  // No recover when disabled
     }
+    auto &state = historyState(entry.tid);
     std::shared_ptr<MgscMeta> predMeta = std::static_pointer_cast<MgscMeta>(entry.predMetas[getComponentIdx()]);
     for (int i = 0; i < gTableNum; i++) {
-        indexGFoldedHist[i].recover(predMeta->indexGFoldedHist[i]);
+        state.indexGFoldedHist[i].recover(predMeta->indexGFoldedHist[i]);
     }
-    doUpdateHist(history, shamt, cond_taken, indexGFoldedHist);
+    doUpdateHist(history, shamt, cond_taken, state.indexGFoldedHist);
 }
 
 /**
@@ -1229,11 +1294,13 @@ BTBMGSC::recoverPHist(const boost::dynamic_bitset<> &history, const FetchTarget
     if (!isEnabled()) {
         return;  // No recover when disabled
     }
+    auto &state = historyState(entry.tid);
     std::shared_ptr<MgscMeta> predMeta = std::static_pointer_cast<MgscMeta>(entry.predMetas[getComponentIdx()]);
     for (int i = 0; i < pTableNum; i++) {
-        indexPFoldedHist[i].recover(predMeta->indexPFoldedHist[i]);
+        state.indexPFoldedHist[i].recover(predMeta->indexPFoldedHist[i]);
     }
-    doUpdateHist(history, 2, cond_taken, indexPFoldedHist, entry.getControlPC(), entry.getTakenTarget());
+    doUpdateHist(history, 2, cond_taken, state.indexPFoldedHist,
+                 entry.getControlPC(), entry.getTakenTarget());
 }
 
 /**
@@ -1255,11 +1322,12 @@ BTBMGSC::recoverBwHist(const boost::dynamic_bitset<> &history, const FetchTarget
     if (!isEnabled()) {
         return;  // No recover when disabled
     }
+    auto &state = historyState(entry.tid);
     std::shared_ptr<MgscMeta> predMeta = std::static_pointer_cast<MgscMeta>(entry.predMetas[getComponentIdx()]);
     for (int i = 0; i < bwTableNum; i++) {
-        indexBwFoldedHist[i].recover(predMeta->indexBwFoldedHist[i]);
+        state.indexBwFoldedHist[i].recover(predMeta->indexBwFoldedHist[i]);
     }
-    doUpdateHist(history, shamt, cond_taken, indexBwFoldedHist);
+    doUpdateHist(history, shamt, cond_taken, state.indexBwFoldedHist);
 }
 
 /**
@@ -1281,13 +1349,14 @@ BTBMGSC::recoverIHist(const FetchTarget &entry, int shamt, bool cond_taken)
     if (!isEnabled()) {
         return;  // No recover when disabled
     }
+    auto &state = historyState(entry.tid);
     std::shared_ptr<MgscMeta> predMeta = std::static_pointer_cast<MgscMeta>(entry.predMetas[getComponentIdx()]);
     for (int i = 0; i < iTableNum; i++) {
-        indexIFoldedHist[i].recover(predMeta->indexIFoldedHist[i]);
+        state.indexIFoldedHist[i].recover(predMeta->indexIFoldedHist[i]);
     }
     // IMLI uses counter only, pass empty bitset (not used by ImliFoldedHist::update)
     boost::dynamic_bitset<> dummy;
-    doUpdateHist(dummy, shamt, cond_taken, indexIFoldedHist);
+    doUpdateHist(dummy, shamt, cond_taken, state.indexIFoldedHist);
 }
 
 /**
@@ -1310,14 +1379,15 @@ BTBMGSC::recoverLHist(const std::vector<boost::dynamic_bitset<>> &history, const
     if (!isEnabled()) {
         return;  // No recover when disabled
     }
+    auto &state = historyState(entry.tid);
     std::shared_ptr<MgscMeta> predMeta = std::static_pointer_cast<MgscMeta>(entry.predMetas[getComponentIdx()]);
     for (unsigned int k = 0; k < numEntriesFirstLocalHistories; ++k) {
         for (int i = 0; i < lTableNum; i++) {
-            indexLFoldedHist[k][i].recover(predMeta->indexLFoldedHist[k][i]);
+            state.indexLFoldedHist[k][i].recover(predMeta->indexLFoldedHist[k][i]);
         }
     }
             doUpdateHist(history[getPcIndex(entry.startPC, log2(numEntriesFirstLocalHistories))], shamt, cond_taken,
-                         indexLFoldedHist[getPcIndex(entry.startPC, log2(numEntriesFirstLocalHistories))]);
+                         state.indexLFoldedHist[getPcIndex(entry.startPC, log2(numEntriesFirstLocalHistories))]);
         }
 
 #ifndef UNIT_TEST
@@ -1438,6 +1508,15 @@ void
 BTBMGSC::checkFoldedHist(const boost::dynamic_bitset<> &Ghistory, const boost::dynamic_bitset<> &PHistory,
                          const std::vector<boost::dynamic_bitset<>> &LHistory, const char *when)
 {
+    checkFoldedHist(Ghistory, PHistory, LHistory, 0, when);
+}
+
+void
+BTBMGSC::checkFoldedHist(const boost::dynamic_bitset<> &Ghistory, const boost::dynamic_bitset<> &PHistory,
+                         const std::vector<boost::dynamic_bitset<>> &LHistory,
+                         ThreadID tid, const char *when)
+{
+    auto &state = historyState(tid);
     DPRINTF(MGSC, "checking folded history when %s\n", when);
     if (debug::MGSC) {
         std::string hist_str;
@@ -1445,17 +1524,17 @@ BTBMGSC::checkFoldedHist(const boost::dynamic_bitset<> &Ghistory, const boost::d
         DPRINTF(MGSC, "history:\t%s\n", hist_str.c_str());
     }
     for (int t = 0; t < gTableNum; t++) {
-        auto &foldedHist = indexGFoldedHist[t];
+        auto &foldedHist = state.indexGFoldedHist[t];
         foldedHist.check(Ghistory);
     }
     for (int t = 0; t < pTableNum; t++) {
-        auto &foldedHist = indexPFoldedHist[t];
+        auto &foldedHist = state.indexPFoldedHist[t];
         foldedHist.check(PHistory);
     }
     for (int t = 0; t < lTableNum; t++) {
-        assert(LHistory.size() == indexLFoldedHist.size());
+        assert(LHistory.size() == state.indexLFoldedHist.size());
         for (int i = 0; i < LHistory.size(); i++) {
-            auto &foldedHist = indexLFoldedHist[i][t];
+            auto &foldedHist = state.indexLFoldedHist[i][t];
             foldedHist.check(LHistory[i]);
         }
     }
diff --git a/src/cpu/pred/btb/btb_mgsc.hh b/src/cpu/pred/btb/btb_mgsc.hh
index 100fc639a4..6ff29b13c8 100755
--- a/src/cpu/pred/btb/btb_mgsc.hh
+++ b/src/cpu/pred/btb/btb_mgsc.hh
@@ -14,6 +14,7 @@
 
 #include "base/sat_counter.hh"
 #include "base/types.hh"
+#include "cpu/o3/limits.hh"
 #include "cpu/pred/btb/common.hh"
 #include "cpu/pred/btb/folded_hist.hh"
 #include "cpu/pred/btb/timed_base_pred.hh"
@@ -39,6 +40,7 @@ namespace test {
 
 class BTBMGSC : public TimedBaseBTBPredictor
 {
+    static constexpr unsigned MaxThreads = o3::MaxThreads;
   public:
 #ifdef UNIT_TEST
     BTBMGSC();
@@ -157,7 +159,7 @@ class BTBMGSC : public TimedBaseBTBPredictor
     void putPCHistory(Addr startAddr, const boost::dynamic_bitset<> &history,
                       std::vector<FullBTBPrediction> &stagePreds) override;
 
-    std::shared_ptr<void> getPredictionMeta() override;
+    std::shared_ptr<void> getPredictionMeta(ThreadID tid = 0) override;
 
     // speculative update all folded history, according history and pred.taken
     void specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred) override;
@@ -191,6 +193,9 @@ class BTBMGSC : public TimedBaseBTBPredictor
     // check folded hists after speculative update and recover
     void checkFoldedHist(const boost::dynamic_bitset<> &Ghistory, const boost::dynamic_bitset<> &PHistory,
                          const std::vector<boost::dynamic_bitset<>> &LHistory, const char *when);  // Check GHR folded
+    void checkFoldedHist(const boost::dynamic_bitset<> &Ghistory, const boost::dynamic_bitset<> &PHistory,
+                         const std::vector<boost::dynamic_bitset<>> &LHistory,
+                         ThreadID tid, const char *when);  // Check GHR folded
 
     // Calculate MGSC weight index
     Addr getPcIndex(Addr pc, unsigned tableIndexBits);
@@ -247,7 +252,8 @@ class BTBMGSC : public TimedBaseBTBPredictor
 
     // Look up predictions in MGSC tables for a stream of instructions
     void lookupHelper(const Addr &stream_start, const std::vector<BTBEntry> &btbEntries,
-                      const std::unordered_map<Addr, TageInfoForMGSC> &tageInfoForMgscs, CondTakens &results);
+                      const std::unordered_map<Addr, TageInfoForMGSC> &tageInfoForMgscs,
+                      CondTakens &results, ThreadID tid);
 
     // Calculate MGSC history index with folded history
     Addr getHistIndex(Addr pc, unsigned tableIndexBits, uint64_t foldedHist);
@@ -277,7 +283,8 @@ class BTBMGSC : public TimedBaseBTBPredictor
 
     // Helper method to generate prediction for a single BTB entry
     MgscPrediction generateSinglePrediction(const BTBEntry &btb_entry, const Addr &startPC,
-                                            const TageInfoForMGSC &tage_info);
+                                            const TageInfoForMGSC &tage_info,
+                                            ThreadID tid);
 
     // Helper method to prepare BTB entries for update
     std::vector<BTBEntry> prepareUpdateEntries(const FetchTarget &stream);
@@ -353,12 +360,16 @@ class BTBMGSC : public TimedBaseBTBPredictor
     bool enablePCThreshold;
     Addr focusBranchPC;
 
-    // Folded history for index calculation
-    std::vector<GlobalBwFoldedHist> indexBwFoldedHist;
-    std::vector<std::vector<LocalFoldedHist>> indexLFoldedHist;
-    std::vector<ImliFoldedHist> indexIFoldedHist;
-    std::vector<GlobalFoldedHist> indexGFoldedHist;
-    std::vector<PathFoldedHist> indexPFoldedHist;
+    struct ThreadHistoryState
+    {
+        std::vector<GlobalBwFoldedHist> indexBwFoldedHist;
+        std::vector<std::vector<LocalFoldedHist>> indexLFoldedHist;
+        std::vector<ImliFoldedHist> indexIFoldedHist;
+        std::vector<GlobalFoldedHist> indexGFoldedHist;
+        std::vector<PathFoldedHist> indexPFoldedHist;
+    };
+
+    std::vector<ThreadHistoryState> threadHistory;
 
     // The actual MGSC prediction tables (table x index x line)
     std::vector<std::vector<std::vector<int16_t>>> bwTable;
@@ -552,8 +563,9 @@ class BTBMGSC : public TimedBaseBTBPredictor
 
         static const std::unordered_map<Addr, MgscPrediction> &preds(const BTBMGSC &mgsc)
         {
-            assert(mgsc.meta);
-            return mgsc.meta->preds;
+            assert(!mgsc.threadMeta.empty());
+            assert(mgsc.threadMeta[0]);
+            return mgsc.threadMeta[0]->preds;
         }
     };
 #endif
@@ -594,7 +606,10 @@ class BTBMGSC : public TimedBaseBTBPredictor
         }
     } MgscMeta;
 
-    std::shared_ptr<MgscMeta> meta;
+    std::vector<std::shared_ptr<MgscMeta>> threadMeta;
+    ThreadID predictorTid(const std::vector<FullBTBPrediction> &stagePreds) const;
+    ThreadHistoryState &historyState(ThreadID tid);
+    const ThreadHistoryState &historyState(ThreadID tid) const;
 };
 
 // Close conditional namespace wrapper for testing
diff --git a/src/cpu/pred/btb/btb_tage.cc b/src/cpu/pred/btb/btb_tage.cc
index 85db48441a..c81bfb1a1d 100644
--- a/src/cpu/pred/btb/btb_tage.cc
+++ b/src/cpu/pred/btb/btb_tage.cc
@@ -155,6 +155,9 @@ tageStats(this, p.numPredictors, p.numBanks)
     tableTagBits.resize(numPredictors);
     tableTagMasks.resize(numPredictors);
 
+    threadHistory.resize(MaxThreads);
+    threadMeta.resize(MaxThreads);
+
     for (unsigned int i = 0; i < numPredictors; ++i) {
         //initialize ittage predictor
         assert(tableSizes.size() >= numPredictors);
@@ -173,17 +176,14 @@ tageStats(this, p.numPredictors, p.numBanks)
         tableTagMasks[i].resize(tableTagBits[i], true);
 
         assert(tablePcShifts.size() >= numPredictors);
-
         const auto historyType =
             usePathHistory ? HistoryType::PATH : HistoryType::GLOBAL;
-        tagFoldedHist.emplace_back((int)histLengths[i], (int)tableTagBits[i],
-                                   16, historyType);
-        altTagFoldedHist.emplace_back((int)histLengths[i],
-                                      (int)tableTagBits[i] - 1, 16,
-                                      historyType);
-        indexFoldedHist.emplace_back((int)histLengths[i],
-                                     (int)tableIndexBits[i], 16,
-                                     historyType);
+        for (ThreadID tid = 0; tid < MaxThreads; ++tid) {
+            auto &state = threadHistory[tid];
+            state.tagFoldedHist.emplace_back((int)histLengths[i], (int)tableTagBits[i], 16, historyType);
+            state.altTagFoldedHist.emplace_back((int)histLengths[i], (int)tableTagBits[i] - 1, 16, historyType);
+            state.indexFoldedHist.emplace_back((int)histLengths[i], (int)tableIndexBits[i], 16, historyType);
+        }
     }
     usefulResetCnt = 0;
 
@@ -202,6 +202,27 @@ BTBTAGE::~BTBTAGE()
 {
 }
 
+ThreadID
+BTBTAGE::predictorTid(const std::vector<FullBTBPrediction> &stagePreds) const
+{
+    assert(!stagePreds.empty());
+    return stagePreds.front().tid;
+}
+
+BTBTAGE::ThreadHistoryState &
+BTBTAGE::historyState(ThreadID tid)
+{
+    assert(tid < threadHistory.size());
+    return threadHistory[tid];
+}
+
+const BTBTAGE::ThreadHistoryState &
+BTBTAGE::historyState(ThreadID tid) const
+{
+    assert(tid < threadHistory.size());
+    return threadHistory[tid];
+}
+
 // Set up tracing for debugging
 void
 BTBTAGE::setTrace()
@@ -275,8 +296,10 @@ BTBTAGE::tickStart() {}
 BTBTAGE::TagePrediction
 BTBTAGE::generateSinglePrediction(const BTBEntry &btb_entry,
                                  const Addr &startPC,
-                                 std::shared_ptr<TageMeta> predMeta) {
+                                 std::shared_ptr<TageMeta> predMeta,
+                                 ThreadID tid) {
     DPRINTF(TAGE, "generateSinglePrediction for btbEntry: %#lx\n", btb_entry.pc);
+    const auto &state = historyState(tid);
 
     // Find main and alternative predictions
     bool provided = false;
@@ -292,10 +315,11 @@ BTBTAGE::generateSinglePrediction(const BTBEntry &btb_entry,
         // Calculate index and tag: use snapshot if provided, otherwise use current folded history
         // Tag includes position XOR (like RTL: tag = tempTag ^ cfiPosition)
         Addr index = predMeta ? getTageIndex(startPC, i, predMeta->indexFoldedHist[i].get())
-                          : getTageIndex(startPC, i);
+                          : getTageIndex(startPC, i, state.indexFoldedHist[i].get());
         Addr tag = predMeta ? getTageTag(startPC, i,
                             predMeta->tagFoldedHist[i].get(), predMeta->altTagFoldedHist[i].get(), position)
-                        : getTageTag(startPC, i, position);
+                        : getTageTag(startPC, i, state.tagFoldedHist[i].get(),
+                                     state.altTagFoldedHist[i].get(), position);
 
         bool match = false; // for each table, only one way can be matched
         TageEntry matching_entry;
@@ -391,7 +415,8 @@ BTBTAGE::generateSinglePrediction(const BTBEntry &btb_entry,
  */
 void
 BTBTAGE::lookupHelper(const Addr &startPC, const std::vector<BTBEntry> &btbEntries,
-                      std::unordered_map<Addr, TageInfoForMGSC> &tageInfoForMgscs, CondTakens& results)
+                      std::unordered_map<Addr, TageInfoForMGSC> &tageInfoForMgscs,
+                      CondTakens& results, ThreadID tid)
 {
     DPRINTF(TAGE, "lookupHelper startAddr: %#lx\n", startPC);
 
@@ -399,8 +424,8 @@ BTBTAGE::lookupHelper(const Addr &startPC, const std::vector<BTBEntry> &btbEntri
     for (auto &btb_entry : btbEntries) {
         // Only predict for valid conditional branches
         if (btb_entry.isCond && btb_entry.valid) {
-            auto pred = generateSinglePrediction(btb_entry, startPC);
-            meta->preds[btb_entry.pc] = pred;
+            auto pred = generateSinglePrediction(btb_entry, startPC, nullptr, tid);
+            threadMeta[tid]->preds[btb_entry.pc] = pred;
             tageStats.updateStatsWithTagePrediction(pred, true);
             results.push_back({btb_entry.pc, pred.taken || btb_entry.alwaysTaken});
             tageInfoForMgscs[btb_entry.pc].tage_pred_taken = pred.taken;
@@ -442,6 +467,8 @@ BTBTAGE::dryRunCycle(Addr startPC) {
  */
 void
 BTBTAGE::putPCHistory(Addr startPC, const bitset &history, std::vector<FullBTBPrediction> &stagePreds) {
+    const ThreadID tid = predictorTid(stagePreds);
+    const auto &state = historyState(tid);
     // Record prediction bank for next tick's conflict detection
     lastPredBankId = getBankId(startPC);
     predBankValid = true;
@@ -459,24 +486,28 @@ BTBTAGE::putPCHistory(Addr startPC, const bitset &history, std::vector<FullBTBPr
     // get prediction and save it
 
     // Clear old prediction metadata and save current history state
-    meta = std::make_shared<TageMeta>();
-    meta->tagFoldedHist = tagFoldedHist;
-    meta->altTagFoldedHist = altTagFoldedHist;
-    meta->indexFoldedHist = indexFoldedHist;
-    meta->history = history;
+    threadMeta[tid] = std::make_shared<TageMeta>();
+    threadMeta[tid]->tagFoldedHist = state.tagFoldedHist;
+    threadMeta[tid]->altTagFoldedHist = state.altTagFoldedHist;
+    threadMeta[tid]->indexFoldedHist = state.indexFoldedHist;
+    threadMeta[tid]->history = history;
 
     for (int s = getDelay(); s < stagePreds.size(); s++) {
         // TODO: only lookup once for one btb entry in different stages
         auto &stage_pred = stagePreds[s];
         stage_pred.condTakens.clear();
-        lookupHelper(startPC, stage_pred.btbEntries, stage_pred.tageInfoForMgscs, stage_pred.condTakens);
+        lookupHelper(startPC, stage_pred.btbEntries, stage_pred.tageInfoForMgscs,
+                     stage_pred.condTakens, tid);
     }
 
 }
 
 std::shared_ptr<void>
-BTBTAGE::getPredictionMeta() {
-    return meta;
+BTBTAGE::getPredictionMeta(ThreadID tid) {
+    if (tid >= threadMeta.size()) {
+        return nullptr;
+    }
+    return threadMeta[tid];
 }
 
 /**
@@ -1042,7 +1073,9 @@ BTBTAGE::getTageTag(Addr pc, int t, uint64_t foldedHist, uint64_t altFoldedHist,
 Addr
 BTBTAGE::getTageTag(Addr pc, int t, Addr position)
 {
-    return getTageTag(pc, t, tagFoldedHist[t].get(), altTagFoldedHist[t].get(), position);
+    const auto &thread0 = historyState(0);  // no tid parameter: always thread 0's history
+    return getTageTag(pc, t, thread0.tagFoldedHist[t].get(),
+                      thread0.altTagFoldedHist[t].get(), position);
 }
 
 Addr
@@ -1062,7 +1095,7 @@ BTBTAGE::getTageIndex(Addr pc, int t, uint64_t foldedHist)
 Addr
 BTBTAGE::getTageIndex(Addr pc, int t)
 {
-    return getTageIndex(pc, t, indexFoldedHist[t].get());
+    return getTageIndex(pc, t, historyState(0).indexFoldedHist[t].get());
 }
 
 bool
@@ -1125,8 +1158,9 @@ BTBTAGE::getBankId(Addr pc) const
  */
 void
 BTBTAGE::doUpdateHist(const boost::dynamic_bitset<> &history, int shamt,
-                      bool taken, Addr pc, Addr target)
+                      bool taken, Addr pc, Addr target, ThreadID tid)
 {
+    auto &state = historyState(tid);
     if (debug::TAGEHistory) {   // if debug flag is off, do not use to_string since it's too slow
         std::string buf;
         boost::to_string(history, buf);
@@ -1149,7 +1183,10 @@ BTBTAGE::doUpdateHist(const boost::dynamic_bitset<> &history, int shamt,
 
     for (int t = 0; t < numPredictors; t++) {
         for (int type = 0; type < 3; type++) {
-            auto &foldedHist = type == 0 ? indexFoldedHist[t] : type == 1 ? tagFoldedHist[t] : altTagFoldedHist[t];
+            auto &foldedHist = type == 0 ? state.indexFoldedHist[t]
+                                         : type == 1 ? state.tagFoldedHist[t]
+                                                     : state.altTagFoldedHist[t];
+            // NOTE: PHR callers pass a fixed shamt (2), presumably ignored by path folds; GHR callers pass the real shamt
             foldedHist.update(history, shamt, taken, pc, target);
             DPRINTF(TAGEHistory, "t: %d, type: %d, foldedHist _folded 0x%lx\n", t, type, foldedHist.get());
         }
@@ -1177,7 +1214,7 @@ BTBTAGE::specUpdateHist(const boost::dynamic_bitset<> &history,
     }
 
     auto [shamt, taken] = pred.getHistInfo();
-    doUpdateHist(history, shamt, taken, 0, 0);
+    doUpdateHist(history, shamt, taken, 0, 0, pred.tid);
 }
 
 void
@@ -1188,7 +1225,7 @@ BTBTAGE::specUpdatePHist(const boost::dynamic_bitset<> &history, FullBTBPredicti
     }
 
     auto [pc, target, taken] = pred.getPHistInfo();
-    doUpdateHist(history, 2, taken, pc, target);
+    doUpdateHist(history, 2, taken, pc, target, pred.tid);
 }
 
 void
@@ -1197,9 +1234,9 @@ BTBTAGE::recoverFoldedHist(const FetchTarget &entry)
     auto predMeta =
         std::static_pointer_cast<TageMeta>(entry.predMetas[getComponentIdx()]);
     for (int i = 0; i < numPredictors; i++) {
-        tagFoldedHist[i].recover(predMeta->tagFoldedHist[i]);
-        altTagFoldedHist[i].recover(predMeta->altTagFoldedHist[i]);
-        indexFoldedHist[i].recover(predMeta->indexFoldedHist[i]);
+        historyState(entry.tid).tagFoldedHist[i].recover(predMeta->tagFoldedHist[i]);
+        historyState(entry.tid).altTagFoldedHist[i].recover(predMeta->altTagFoldedHist[i]);
+        historyState(entry.tid).indexFoldedHist[i].recover(predMeta->indexFoldedHist[i]);
     }
 }
 
@@ -1225,7 +1262,7 @@ BTBTAGE::recoverHist(const boost::dynamic_bitset<> &history,
     }
 
     recoverFoldedHist(entry);
-    doUpdateHist(history, shamt, cond_taken, 0, 0);
+    doUpdateHist(history, shamt, cond_taken, 0, 0, entry.tid);
 }
 
 void
@@ -1238,13 +1275,21 @@ BTBTAGE::recoverPHist(const boost::dynamic_bitset<> &history,
 
     recoverFoldedHist(entry);
     doUpdateHist(history, 2, cond_taken, entry.getControlPC(),
-                 entry.getTakenTarget());
+                 entry.getTakenTarget(), entry.tid);
 }
 
 // Check folded history after speculative update and recovery
 void
 BTBTAGE::checkFoldedHist(const boost::dynamic_bitset<> &hist, const char * when)
 {
+    checkFoldedHist(hist, 0, when);
+}
+
+void
+BTBTAGE::checkFoldedHist(const boost::dynamic_bitset<> &hist, ThreadID tid,
+                         const char * when)
+{
+    auto &state = historyState(tid);
     DPRINTF(TAGE, "checking folded history when %s\n", when);
     if (debug::TAGEHistory) {
         std::string hist_str;
@@ -1254,7 +1299,9 @@ BTBTAGE::checkFoldedHist(const boost::dynamic_bitset<> &hist, const char * when)
     for (int t = 0; t < numPredictors; t++) {
         for (int type = 0; type < 3; type++) {
             std::string buf2, buf3;
-            auto &foldedHist = type == 0 ? indexFoldedHist[t] : type == 1 ? tagFoldedHist[t] : altTagFoldedHist[t];
+            auto &foldedHist = type == 0 ? state.indexFoldedHist[t]
+                                         : type == 1 ? state.tagFoldedHist[t]
+                                                     : state.altTagFoldedHist[t];
             foldedHist.check(hist);
         }
     }
diff --git a/src/cpu/pred/btb/btb_tage.hh b/src/cpu/pred/btb/btb_tage.hh
index abd4bf0f49..33bd6826ae 100644
--- a/src/cpu/pred/btb/btb_tage.hh
+++ b/src/cpu/pred/btb/btb_tage.hh
@@ -4,12 +4,14 @@
 #include <cstdint>
 #include <deque>
 #include <map>
+#include <memory>
 #include <utility>
 #include <vector>
 
 #include "base/sat_counter.hh"
 #include "base/types.hh"
 #include "cpu/inst_seq.hh"
+#include "cpu/o3/limits.hh"
 #include "cpu/pred/btb/common.hh"
 #include "cpu/pred/btb/folded_hist.hh"
 #include "cpu/pred/btb/timed_base_pred.hh"
@@ -43,6 +45,7 @@ class BTBTAGE : public TimedBaseBTBPredictor
 {
     using defer = std::shared_ptr<void>;
     using bitset = boost::dynamic_bitset<>;
+    static constexpr unsigned MaxThreads = o3::MaxThreads;
   public:
 #ifdef UNIT_TEST
     // Test constructor
@@ -140,7 +143,7 @@ class BTBTAGE : public TimedBaseBTBPredictor
                       const boost::dynamic_bitset<> &history,
                       std::vector<FullBTBPrediction> &stagePreds) override;
 
-    std::shared_ptr<void> getPredictionMeta() override;
+    std::shared_ptr<void> getPredictionMeta(ThreadID tid = 0) override;
 
     // Update folded history from GHR when configured in direction-history mode.
     void specUpdateHist(const boost::dynamic_bitset<> &history,
@@ -167,6 +170,7 @@ class BTBTAGE : public TimedBaseBTBPredictor
 
     // check folded hists after speculative update and recover
     virtual void checkFoldedHist(const bitset &history, const char *when);
+    void checkFoldedHist(const bitset &history, ThreadID tid, const char *when);
 
 #ifndef UNIT_TEST
   protected:
@@ -174,7 +178,8 @@ class BTBTAGE : public TimedBaseBTBPredictor
 
     // Look up predictions in TAGE tables for a stream of instructions
     void lookupHelper(const Addr &startPC, const std::vector<BTBEntry> &btbEntries,
-                    std::unordered_map<Addr, TageInfoForMGSC> &tageInfoForMgscs, CondTakens& results);
+                    std::unordered_map<Addr, TageInfoForMGSC> &tageInfoForMgscs,
+                    CondTakens& results, ThreadID tid);
 
     // Calculate TAGE index for a given PC and table
     Addr getTageIndex(Addr pc, int table);
@@ -204,7 +209,7 @@ class BTBTAGE : public TimedBaseBTBPredictor
 
     // Update branch history
     void doUpdateHist(const bitset &history, int shamt, bool taken,
-                      Addr pc, Addr target);
+                      Addr pc, Addr target, ThreadID tid);
     void recoverFoldedHist(const FetchTarget &entry);
 
     // Number of TAGE predictor tables
@@ -231,17 +236,16 @@ class BTBTAGE : public TimedBaseBTBPredictor
     // History lengths for each table
     std::vector<unsigned> histLengths;
 
-    // Folded history for tag calculation
-    std::vector<TageFoldedHist> tagFoldedHist;
-
-    // Folded history for alternative tag calculation
-    std::vector<TageFoldedHist> altTagFoldedHist;
+    const bool usePathHistory;
 
-    // Folded history for index calculation
-    std::vector<TageFoldedHist> indexFoldedHist;
+    struct ThreadHistoryState
+    {
+        std::vector<TageFoldedHist> tagFoldedHist;
+        std::vector<TageFoldedHist> altTagFoldedHist;
+        std::vector<TageFoldedHist> indexFoldedHist;
+    };
 
-    // Select whether BTBTAGE consumes PHR or GHR folded history.
-    const bool usePathHistory;
+    std::vector<ThreadHistoryState> threadHistory;
 
     // Linear feedback shift register for allocation
     LFSR64 allocLFSR;
@@ -461,7 +465,8 @@ private:
     // If predMeta is nullptr, use current folded history (prediction path)
     TagePrediction generateSinglePrediction(const BTBEntry &btb_entry,
                                            const Addr &startPC,
-                                           const std::shared_ptr<TageMeta> predMeta = nullptr);
+                                           const std::shared_ptr<TageMeta> predMeta = nullptr,
+                                           ThreadID tid = 0);
 
     // Helper method to prepare BTB entries for update
     std::vector<BTBEntry> prepareUpdateEntries(const FetchTarget &stream);
@@ -486,7 +491,11 @@ private:
     unsigned getLRUVictim(int table, Addr index);
     unsigned getNumWays(unsigned table) const;
 
-    std::shared_ptr<TageMeta> meta;
+    std::vector<std::shared_ptr<TageMeta>> threadMeta;
+
+    ThreadID predictorTid(const std::vector<FullBTBPrediction> &stagePreds) const;
+    ThreadHistoryState &historyState(ThreadID tid);
+    const ThreadHistoryState &historyState(ThreadID tid) const;
 };
 
 // Close conditional namespace wrapper for testing
diff --git a/src/cpu/pred/btb/btb_tage_ub.cc b/src/cpu/pred/btb/btb_tage_ub.cc
index e1a1cf698a..d324bd962b 100644
--- a/src/cpu/pred/btb/btb_tage_ub.cc
+++ b/src/cpu/pred/btb/btb_tage_ub.cc
@@ -322,8 +322,9 @@ BTBTAGEUpperBound::putPCHistory(Addr startAddr, const bitset &history,
 }
 
 std::shared_ptr<void>
-BTBTAGEUpperBound::getPredictionMeta()
+BTBTAGEUpperBound::getPredictionMeta(ThreadID tid)
 {
+    (void)tid;
     return ubMeta;
 }
 
diff --git a/src/cpu/pred/btb/btb_tage_ub.hh b/src/cpu/pred/btb/btb_tage_ub.hh
index f97792c713..b4aae9e7cc 100644
--- a/src/cpu/pred/btb/btb_tage_ub.hh
+++ b/src/cpu/pred/btb/btb_tage_ub.hh
@@ -95,7 +95,7 @@ class BTBTAGEUpperBound : public BTBTAGE
                       const boost::dynamic_bitset<> &history,
                       std::vector<FullBTBPrediction> &stagePreds) override;
 
-    std::shared_ptr<void> getPredictionMeta() override;
+    std::shared_ptr<void> getPredictionMeta(ThreadID tid = 0) override;
 
     void specUpdateHist(const boost::dynamic_bitset<> &history,
                         FullBTBPrediction &pred) override;
diff --git a/src/cpu/pred/btb/btb_ubtb.hh b/src/cpu/pred/btb/btb_ubtb.hh
index 5c394ac9cc..649641b420 100644
--- a/src/cpu/pred/btb/btb_ubtb.hh
+++ b/src/cpu/pred/btb/btb_ubtb.hh
@@ -141,7 +141,7 @@ class UBTB : public TimedBaseBTBPredictor
     /** Get prediction BTBMeta
      *  @return Returns the prediction meta
      */
-    std::shared_ptr<void> getPredictionMeta() override
+    std::shared_ptr<void> getPredictionMeta(ThreadID tid = 0) override
     {
         return meta;
     }
diff --git a/src/cpu/pred/btb/decoupled_bpred.cc b/src/cpu/pred/btb/decoupled_bpred.cc
index 8ed265af90..2e272047eb 100644
--- a/src/cpu/pred/btb/decoupled_bpred.cc
+++ b/src/cpu/pred/btb/decoupled_bpred.cc
@@ -47,7 +47,6 @@ DecoupledBPUWithBTB::DecoupledBPUWithBTB(const DecoupledBPUWithBTBParams &p)
       bpDBSwitches(p.bpDBSwitches),
       numStages(p.numStages),
       ftq(p.numThreads, p.ftq_size),
-      historyManager(16), // TODO: fix this
       resolveBlockThreshold(p.resolveBlockThreshold),
       dbpBtbStats(this, p.numStages, p.fsq_size, maxInstsNum)
 {
@@ -87,6 +86,12 @@ DecoupledBPUWithBTB::DecoupledBPUWithBTB(const DecoupledBPUWithBTBParams &p)
         printf("\n");
     }
 
+    historyManagers.reserve(numThreads);
+    resolveDequeueFailCounters.assign(numThreads, 0);
+    for (ThreadID tid = 0; tid < numThreads; ++tid) {
+        historyManagers.emplace_back(16);
+    }
+
     for (int tid=0;tid<numThreads; tid++) {
         auto& thread = threads[tid];
 
@@ -606,7 +611,7 @@ DecoupledBPUWithBTB::commit(unsigned target_id, ThreadID tid)
     if (!ftq.empty(tid))
         printTarget(ftq.front(tid));
 
-    historyManager.commit(target_id);
+    historyManagers[tid].commit(target_id);
 }
 
 bool
@@ -644,26 +649,26 @@ DecoupledBPUWithBTB::resolveUpdate(unsigned &target_id, ThreadID tid)
 }
 
 void
-DecoupledBPUWithBTB::notifyResolveSuccess()
+DecoupledBPUWithBTB::notifyResolveSuccess(ThreadID tid)
 {
-    resolveDequeueFailCounter = 0;
+    resolveDequeueFailCounters[tid] = 0;
 }
 
 void
-DecoupledBPUWithBTB::notifyResolveFailure()
+DecoupledBPUWithBTB::notifyResolveFailure(ThreadID tid)
 {
-    resolveDequeueFailCounter++;
-    if (resolveDequeueFailCounter >= resolveBlockThreshold) {
-        blockPredictionOnce();
-        resolveDequeueFailCounter = 0;
+    auto &failCounter = resolveDequeueFailCounters[tid];
+    failCounter++;
+    if (failCounter >= resolveBlockThreshold) {
+        blockPredictionOnce(tid);
+        failCounter = 0;
     }
 }
 
 void
-DecoupledBPUWithBTB::blockPredictionOnce()
+DecoupledBPUWithBTB::blockPredictionOnce(ThreadID tid)
 {
-    // smtTODO
-    threads[0].blockPredictionPending = true;
+    threads[tid].blockPredictionPending = true;
 }
 
 void
@@ -808,7 +813,7 @@ DecoupledBPUWithBTB::createFetchTargetEntry(ThreadID tid)
 
     // Save predictors' metadata
     for (int i = 0; i < numComponents; i++) {
-        entry.predMetas[i] = components[i]->getPredictionMeta();
+        entry.predMetas[i] = components[i]->getPredictionMeta(tid);
     }
 
     // Initialize default resolution state
@@ -843,7 +848,8 @@ DecoupledBPUWithBTB::fillAheadPipeline(FetchTarget &entry)
 }
 
 void
-DecoupledBPUWithBTB::checkHistory(const boost::dynamic_bitset<> &history)
+DecoupledBPUWithBTB::checkHistory(const boost::dynamic_bitset<> &history,
+                                  ThreadID tid)
 {
     // This function performs a crucial validation of branch history consistency
     // It rebuilds the "ideal" history from HistoryManager's records and compares
@@ -854,7 +860,7 @@ DecoupledBPUWithBTB::checkHistory(const boost::dynamic_bitset<> &history)
     boost::dynamic_bitset<> ideal_hash_hist(historyBits, 0);
 
     // Iterate through all speculative history entries stored in HistoryManager
-    for (const auto entry: historyManager.getSpeculativeHist()) {
+    for (const auto entry: historyManagers[tid].getSpeculativeHist()) {
         // Only process entries that have non-zero shift amount (actual branches)
         if (entry.shamt != 0) {
             // Accumulate total history bits
@@ -897,6 +903,12 @@ DecoupledBPUWithBTB::resetPC(Addr new_pc)
         threads[i].s0PC = new_pc;
 }
 
+void
+DecoupledBPUWithBTB::resetPC(ThreadID tid, Addr new_pc)
+{
+    threads[tid].s0PC = new_pc;
+}
+
 Addr
 DecoupledBPUWithBTB::getPreservedReturnAddr(const DynInstPtr &dynInst)
 {
@@ -944,7 +956,7 @@ DecoupledBPUWithBTB::updateHistoryForPrediction(FetchTarget &entry)
     histShiftIn(shamt, taken, s0History);
 
     // Update history manager and verify TAGE folded history
-    historyManager.addSpeculativeHist(
+    historyManagers[tid].addSpeculativeHist(
         entry.startPC, shamt, taken, entry.predBranchInfo, ftq.backId(tid) + 1);
 
     // Get prediction information for global backward history updates
@@ -968,17 +980,18 @@ DecoupledBPUWithBTB::updateHistoryForPrediction(FetchTarget &entry)
 #ifndef NDEBUG
     if (tage->isEnabled()) {
         tage->checkFoldedHist(
-            tage->usesPathHistory() ? s0PHistory : s0History,
+            tage->usesPathHistory() ? s0PHistory : s0History, tid,
             "speculative update");
     }
     if (ittage->isEnabled()) {
-        ittage->checkFoldedHist(s0PHistory, "speculative update");
+        ittage->checkFoldedHist(s0PHistory, tid, "speculative update");
     }
     if (microtage->isEnabled()) {
-        microtage->checkFoldedHist(s0PHistory, "speculative update");
+        microtage->checkFoldedHist(s0PHistory, tid, "speculative update");
     }
     if (mgsc->isEnabled()) {
-        mgsc->checkFoldedHist(s0History, s0PHistory, s0LHistory, "speculative update");
+        mgsc->checkFoldedHist(s0History, s0PHistory, s0LHistory, tid,
+                              "speculative update");
     }
 #endif
 }
@@ -1054,32 +1067,34 @@ DecoupledBPUWithBTB::recoverHistoryForSquash(
 
     // Update history manager with appropriate branch info
     if (squash_type == SQUASH_CTRL) {
-        historyManager.squash(target_id, real_shamt, real_taken, target.exeBranchInfo);
+        historyManagers[tid].squash(target_id, real_shamt, real_taken,
+                                    target.exeBranchInfo);
     } else {
-        historyManager.squash(target_id, real_shamt, real_taken, BranchInfo());
+        historyManagers[tid].squash(target_id, real_shamt, real_taken,
+                                    BranchInfo());
     }
 
     // Perform history consistency checks when not a fast build variant
 #ifndef NDEBUG
-    checkHistory(s0History);
+    checkHistory(s0History, tid);
     if (tage->isEnabled()) {
         tage->checkFoldedHist(
-            tage->usesPathHistory() ? s0PHistory : s0History,
+            tage->usesPathHistory() ? s0PHistory : s0History, tid,
             squash_type == SQUASH_CTRL ? "control squash" :
             squash_type == SQUASH_OTHER ? "non control squash" : "trap squash");
     }
     if (ittage->isEnabled()) {
-        ittage->checkFoldedHist(s0PHistory,
+        ittage->checkFoldedHist(s0PHistory, tid,
             squash_type == SQUASH_CTRL ? "control squash" :
             squash_type == SQUASH_OTHER ? "non control squash" : "trap squash");
     }
     if (microtage->isEnabled()) {
-        microtage->checkFoldedHist(s0PHistory,
+        microtage->checkFoldedHist(s0PHistory, tid,
             squash_type == SQUASH_CTRL ? "control squash" :
             squash_type == SQUASH_OTHER ? "non control squash" : "trap squash");
     }
     if (mgsc->isEnabled()) {
-        mgsc->checkFoldedHist(s0History, s0PHistory, s0LHistory,
+        mgsc->checkFoldedHist(s0History, s0PHistory, s0LHistory, tid,
             squash_type == SQUASH_CTRL ? "control squash" :
             squash_type == SQUASH_OTHER ? "non control squash" : "trap squash");
     }
diff --git a/src/cpu/pred/btb/decoupled_bpred.hh b/src/cpu/pred/btb/decoupled_bpred.hh
index 134258f77c..38adad4115 100644
--- a/src/cpu/pred/btb/decoupled_bpred.hh
+++ b/src/cpu/pred/btb/decoupled_bpred.hh
@@ -140,8 +140,8 @@ class DecoupledBPUWithBTB : public BPredUnit
         bool blockPredictionPending{false};
     } threads[MaxThreads];
 
-    HistoryManager historyManager;
-    unsigned resolveDequeueFailCounter{0};
+    std::vector<HistoryManager> historyManagers;
+    std::vector<unsigned> resolveDequeueFailCounters;
     const unsigned resolveBlockThreshold;
 
     ThreadID scheduleThread();
@@ -424,7 +424,7 @@ class DecoupledBPUWithBTB : public BPredUnit
 
     void overrideStats(OverrideReason overrideReason);
 
-    void checkHistory(const boost::dynamic_bitset<> &history);
+    void checkHistory(const boost::dynamic_bitset<> &history, ThreadID tid);
 
     Addr getPreservedReturnAddr(const DynInstPtr &dynInst);
 
@@ -703,6 +703,7 @@ class DecoupledBPUWithBTB : public BPredUnit
                       unsigned control_inst_size = 0);
 
     void resetPC(Addr new_pc);
+    void resetPC(ThreadID tid, Addr new_pc);
 
     // Helper functions for update
     bool resolveUpdate(unsigned &target_id, ThreadID tid);
@@ -710,9 +711,9 @@ class DecoupledBPUWithBTB : public BPredUnit
     void markCFIResolved(unsigned &target, uint64_t resolvedInstPC, ThreadID tid);
     void updatePredictorComponents(FetchTarget &target);
     void updateStatistics(const FetchTarget &target);
-    void notifyResolveSuccess();
-    void notifyResolveFailure();
-    void blockPredictionOnce();
+    void notifyResolveSuccess(ThreadID tid);
+    void notifyResolveFailure(ThreadID tid);
+    void blockPredictionOnce(ThreadID tid);
 
     /**
      * @brief Types of control flow instructions for misprediction tracking
diff --git a/src/cpu/pred/btb/mbtb.cc b/src/cpu/pred/btb/mbtb.cc
index 4ab8445677..abd2923739 100644
--- a/src/cpu/pred/btb/mbtb.cc
+++ b/src/cpu/pred/btb/mbtb.cc
@@ -313,8 +313,9 @@ MBTB::putPCHistory(Addr startAddr,
 }
 
 std::shared_ptr<void>
-MBTB::getPredictionMeta()
+MBTB::getPredictionMeta(ThreadID tid)
 {
+    (void)tid;
     return meta;
 }
 
diff --git a/src/cpu/pred/btb/mbtb.hh b/src/cpu/pred/btb/mbtb.hh
index d736d0f55c..b4f587a141 100644
--- a/src/cpu/pred/btb/mbtb.hh
+++ b/src/cpu/pred/btb/mbtb.hh
@@ -147,7 +147,7 @@ class MBTB : public TimedBaseBTBPredictor
     /** Get prediction BTBMeta
      *  @return Returns the prediction meta
      */
-    std::shared_ptr<void> getPredictionMeta() override;
+    std::shared_ptr<void> getPredictionMeta(ThreadID tid = 0) override;
 
     // not used
     void specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred) override;
diff --git a/src/cpu/pred/btb/microtage.cc b/src/cpu/pred/btb/microtage.cc
index fb8dd2f139..7fd88b0845 100644
--- a/src/cpu/pred/btb/microtage.cc
+++ b/src/cpu/pred/btb/microtage.cc
@@ -95,6 +95,9 @@ tageStats(this, p.numPredictors, p.numBanks)
     }
 
     // Initialize base table for fallback predictions
+    threadHistory.resize(MaxThreads);
+    threadMeta.resize(MaxThreads);
+
     for (unsigned int i = 0; i < numPredictors; ++i) {
         //initialize ittage predictor
         assert(tableSizes.size() >= numPredictors);
@@ -111,9 +114,15 @@ tageStats(this, p.numPredictors, p.numBanks)
 
         assert(tablePcShifts.size() >= numPredictors);
 
-        tagFoldedHist.push_back(PathFoldedHist((int)histLengths[i], (int)tableTagBits[i], 16));
-        altTagFoldedHist.push_back(PathFoldedHist((int)histLengths[i], (int)tableTagBits[i]-1, 16));
-        indexFoldedHist.push_back(PathFoldedHist((int)histLengths[i], (int)tableIndexBits[i], 16));
+        for (ThreadID tid = 0; tid < MaxThreads; ++tid) {
+            auto &state = threadHistory[tid];
+            state.tagFoldedHist.emplace_back(
+                (int)histLengths[i], (int)tableTagBits[i], 16);
+            state.altTagFoldedHist.emplace_back(
+                (int)histLengths[i], (int)tableTagBits[i] - 1, 16);
+            state.indexFoldedHist.emplace_back(
+                (int)histLengths[i], (int)tableIndexBits[i], 16);
+        }
     }
     usefulResetCnt = 0;
 
@@ -127,6 +136,27 @@ MicroTAGE::~MicroTAGE()
 {
 }
 
+ThreadID
+MicroTAGE::predictorTid(const std::vector<FullBTBPrediction> &stagePreds) const
+{
+    assert(!stagePreds.empty());
+    return stagePreds.front().tid;
+}
+
+MicroTAGE::ThreadHistoryState &
+MicroTAGE::historyState(ThreadID tid)
+{
+    assert(tid < threadHistory.size());
+    return threadHistory[tid];
+}
+
+const MicroTAGE::ThreadHistoryState &
+MicroTAGE::historyState(ThreadID tid) const
+{
+    assert(tid < threadHistory.size());
+    return threadHistory[tid];
+}
+
 // Set up tracing for debugging
 void
 MicroTAGE::setTrace()
@@ -181,8 +211,10 @@ MicroTAGE::tickStart() {}
 MicroTAGE::TagePrediction
 MicroTAGE::generateSinglePrediction(const BTBEntry &btb_entry,
                                  const Addr &startPC,
-                                 std::shared_ptr<TageMeta> predMeta) {
+                                 std::shared_ptr<TageMeta> predMeta,
+                                 ThreadID tid) {
     DPRINTF(UTAGE, "generateSinglePrediction for btbEntry: %#lx\n", btb_entry.pc);
+    const auto &state = historyState(tid);
 
     bool provided = false;
     TageTableInfo main_info;
@@ -196,10 +228,11 @@ MicroTAGE::generateSinglePrediction(const BTBEntry &btb_entry,
         // Tag includes position XOR (like RTL: tag = tempTag ^ cfiPosition)
         Addr index = predMeta ? getTageIndex(startPC, i,
                             predMeta->indexFoldedHist[i].get())
-                          : getTageIndex(startPC, i);
+                          : getTageIndex(startPC, i, state.indexFoldedHist[i].get());
         Addr tag = predMeta ? getTageTag(startPC, i,
                             predMeta->tagFoldedHist[i].get(),predMeta->altTagFoldedHist[i].get(), position)
-                        : getTageTag(startPC, i, tagFoldedHist[i].get(),altTagFoldedHist[i].get(), position);
+                        : getTageTag(startPC, i, state.tagFoldedHist[i].get(),
+                                     state.altTagFoldedHist[i].get(), position);
 
         bool match = false; // for each table, only one way can be matched
         TageEntry matching_entry;
@@ -255,7 +288,8 @@ MicroTAGE::generateSinglePrediction(const BTBEntry &btb_entry,
  * @return Map of branch PC addresses to their predicted outcomes
  */
 void
-MicroTAGE::lookupHelper(const Addr &startPC, const std::vector<BTBEntry> &btbEntries, CondTakens& results)
+MicroTAGE::lookupHelper(const Addr &startPC, const std::vector<BTBEntry> &btbEntries,
+                        CondTakens& results, ThreadID tid)
 {
     DPRINTF(UTAGE, "lookupHelper startAddr: %#lx\n", startPC);
 
@@ -263,8 +297,9 @@ MicroTAGE::lookupHelper(const Addr &startPC, const std::vector<BTBEntry> &btbEnt
     for (auto &btb_entry : btbEntries) {
         // Only predict for valid conditional branches
         if (btb_entry.isCond && btb_entry.valid) {
-            auto pred = generateSinglePrediction(btb_entry, startPC);
-            meta->preds[btb_entry.pc] = pred;
+            auto pred = generateSinglePrediction(btb_entry, startPC, nullptr,
+                                                 tid);
+            threadMeta[tid]->preds[btb_entry.pc] = pred;
             tageStats.updateStatsWithTagePrediction(pred, true);
             results.push_back({btb_entry.pc, pred.taken || btb_entry.alwaysTaken});
         }
@@ -295,6 +330,8 @@ MicroTAGE::dryRunCycle(Addr startPC) {
  */
 void
 MicroTAGE::putPCHistory(Addr startPC, const bitset &history, std::vector<FullBTBPrediction> &stagePreds) {
+    const ThreadID tid = predictorTid(stagePreds);
+    const auto &state = historyState(tid);
     // Record prediction bank for next tick's conflict detection
     lastPredBankId = getBankId(startPC);
     predBankValid = true;
@@ -312,30 +349,36 @@ MicroTAGE::putPCHistory(Addr startPC, const bitset &history, std::vector<FullBTB
     // get prediction and save it
 
     // Clear old prediction metadata and save current history state
-    meta = std::make_shared<TageMeta>();
-    meta->tagFoldedHist = tagFoldedHist;
-    meta->altTagFoldedHist = altTagFoldedHist;
-    meta->indexFoldedHist = indexFoldedHist;
-    meta->aheadIndexFoldedHistValid = !aheadindexFoldedHist.empty();
-    if (meta->aheadIndexFoldedHistValid) {
-        meta->aheadIndexFoldedHist = aheadindexFoldedHist.front();
+    threadMeta[tid] = std::make_shared<TageMeta>();
+    threadMeta[tid]->tagFoldedHist = state.tagFoldedHist;
+    threadMeta[tid]->altTagFoldedHist = state.altTagFoldedHist;
+    threadMeta[tid]->indexFoldedHist = state.indexFoldedHist;
+    threadMeta[tid]->aheadIndexFoldedHistValid =
+        !state.aheadIndexFoldedHist.empty();
+    if (threadMeta[tid]->aheadIndexFoldedHistValid) {
+        threadMeta[tid]->aheadIndexFoldedHist =
+            state.aheadIndexFoldedHist.front();
     } else {
-        meta->aheadIndexFoldedHist.clear();
+        threadMeta[tid]->aheadIndexFoldedHist.clear();
     }
-    meta->history = history;
+    threadMeta[tid]->history = history;
 
     for (int s = getDelay(); s < stagePreds.size(); s++) {
         // TODO: only lookup once for one btb entry in different stages
         auto &stage_pred = stagePreds[s];
         stage_pred.condTakens.clear();
-        lookupHelper(startPC, stage_pred.btbEntries, stage_pred.condTakens);
+        lookupHelper(startPC, stage_pred.btbEntries, stage_pred.condTakens,
+                     tid);
     }
 
 }
 
 std::shared_ptr<void>
-MicroTAGE::getPredictionMeta() {
-    return meta;
+MicroTAGE::getPredictionMeta(ThreadID tid) {
+    if (tid >= threadMeta.size()) {
+        return nullptr;
+    }
+    return threadMeta[tid];
 }
 
 /**
@@ -783,7 +826,7 @@ MicroTAGE::getTageIndex(Addr pc, int t, uint64_t foldedHist)
 Addr
 MicroTAGE::getTageIndex(Addr pc, int t)
 {
-    return getTageIndex(pc, t, indexFoldedHist[t].get());
+    return getTageIndex(pc, t, historyState(0).indexFoldedHist[t].get());
 }
 
 bool
@@ -849,23 +892,26 @@ MicroTAGE::getBankId(Addr pc) const
  * @param taken Whether the branch was taken
  */
 void
-MicroTAGE::doUpdateHist(const boost::dynamic_bitset<> &history, bool taken, Addr pc, Addr target)
+MicroTAGE::doUpdateHist(const boost::dynamic_bitset<> &history, bool taken,
+                        Addr pc, Addr target, ThreadID tid)
 {
+    auto &state = historyState(tid);
     if (debug::TAGEHistory) {   // if debug flag is off, do not use to_string since it's too slow
         std::string buf;
         boost::to_string(history, buf);
         DPRINTF(TAGEHistory, "in doUpdateHist, taken %d, pc %#lx, history %s\n", taken, pc, buf.c_str());
     }
 
-    if (!aheadindexFoldedHist.empty()) {
-        indexFoldedHist = aheadindexFoldedHist.front();
+    if (!state.aheadIndexFoldedHist.empty()) {
+        state.indexFoldedHist = state.aheadIndexFoldedHist.front();
     }
 
     if (!taken) {
-        if (debug::TAGEHistory && !aheadindexFoldedHist.empty()) {
+        if (debug::TAGEHistory && !state.aheadIndexFoldedHist.empty()) {
             bool mismatch = false;
             for (int t = 0; t < numPredictors; t++) {
-                if (indexFoldedHist[t].get() != aheadindexFoldedHist.front()[t].get()) {
+                if (state.indexFoldedHist[t].get() !=
+                    state.aheadIndexFoldedHist.front()[t].get()) {
                     mismatch = true;
                     break;
                 }
@@ -881,22 +927,23 @@ MicroTAGE::doUpdateHist(const boost::dynamic_bitset<> &history, bool taken, Addr
 
     for (int t = 0; t < numPredictors; t++) {
         // Update tag folded history immediately so tag calculation always sees current history.
-        tagFoldedHist[t].update(history, 2, taken, pc, target);
-        altTagFoldedHist[t].update(history, 2, taken, pc, target);
+        state.tagFoldedHist[t].update(history, 2, taken, pc, target);
+        state.altTagFoldedHist[t].update(history, 2, taken, pc, target);
         DPRINTF(TAGEHistory, "t: %d, tag 0x%lx, altTag 0x%lx\n",
-                t, tagFoldedHist[t].get(), altTagFoldedHist[t].get());
+                t, state.tagFoldedHist[t].get(),
+                state.altTagFoldedHist[t].get());
     }
 
     // Prepare next-cycle index folded history and delay its visibility by one cycle.
-    auto nextIndexFoldedHist = indexFoldedHist;
+    auto nextIndexFoldedHist = state.indexFoldedHist;
     for (int t = 0; t < numPredictors; t++) {
         nextIndexFoldedHist[t].update(history, 2, taken, pc, target);
         DPRINTF(TAGEHistory, "t: %d, index foldedHist(next) _folded 0x%lx\n",
                 t, nextIndexFoldedHist[t].get());
     }
-    aheadindexFoldedHist.push(nextIndexFoldedHist);
-    if (aheadindexFoldedHist.size() > 1) {
-        aheadindexFoldedHist.pop();
+    state.aheadIndexFoldedHist.push(nextIndexFoldedHist);
+    if (state.aheadIndexFoldedHist.size() > 1) {
+        state.aheadIndexFoldedHist.pop();
     }
 }
 
@@ -916,7 +963,7 @@ void
 MicroTAGE::specUpdatePHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred)
 {
     auto [pc, target, taken] = pred.getPHistInfo();
-    doUpdateHist(history, taken, pc, target);
+    doUpdateHist(history, taken, pc, target, pred.tid);
 }
 
 /**
@@ -936,6 +983,7 @@ void
 MicroTAGE::recoverPHist(const boost::dynamic_bitset<> &history,
     const FetchTarget &entry, int shamt, bool cond_taken)
 {
+    auto &state = historyState(entry.tid);
     std::shared_ptr<TageMeta> predMeta = std::static_pointer_cast<TageMeta>(entry.predMetas[getComponentIdx()]);
     if (!predMeta) {
         DPRINTF(UTAGE, "recoverPHist: no prediction metadata, cannot recover\n");
@@ -943,21 +991,22 @@ MicroTAGE::recoverPHist(const boost::dynamic_bitset<> &history,
     }
     // Restore current folded index history exactly to prediction-time state.
     for (int i = 0; i < numPredictors; i++) {
-        indexFoldedHist[i].recover(predMeta->indexFoldedHist[i]);
+        state.indexFoldedHist[i].recover(predMeta->indexFoldedHist[i]);
     }
 
     // Restore delayed index folded history slot exactly to prediction-time state.
-    while (!aheadindexFoldedHist.empty()) {
-        aheadindexFoldedHist.pop();
+    while (!state.aheadIndexFoldedHist.empty()) {
+        state.aheadIndexFoldedHist.pop();
     }
     if (predMeta->aheadIndexFoldedHistValid) {
         assert(predMeta->aheadIndexFoldedHist.size() == numPredictors);
-        aheadindexFoldedHist.push(predMeta->aheadIndexFoldedHist);
+        state.aheadIndexFoldedHist.push(predMeta->aheadIndexFoldedHist);
     }
 
     if (debug::TAGEHistory) {
         bool queue_valid_mismatch =
-            (predMeta->aheadIndexFoldedHistValid != !aheadindexFoldedHist.empty());
+            (predMeta->aheadIndexFoldedHistValid !=
+             !state.aheadIndexFoldedHist.empty());
         if (queue_valid_mismatch) {
             DPRINTF(TAGEHistory,
                     "recoverPHist: ahead queue valid mismatch after restore, cond_taken %d\n",
@@ -966,16 +1015,25 @@ MicroTAGE::recoverPHist(const boost::dynamic_bitset<> &history,
     }
 
     for (int i = 0; i < numPredictors; i++) {
-        altTagFoldedHist[i].recover(predMeta->altTagFoldedHist[i]);
-        tagFoldedHist[i].recover(predMeta->tagFoldedHist[i]);
+        state.altTagFoldedHist[i].recover(predMeta->altTagFoldedHist[i]);
+        state.tagFoldedHist[i].recover(predMeta->tagFoldedHist[i]);
     }
-    doUpdateHist(history, cond_taken, entry.getControlPC(), entry.getTakenTarget());
+    doUpdateHist(history, cond_taken, entry.getControlPC(),
+                 entry.getTakenTarget(), entry.tid);
 }
 
 // Check folded history after speculative update and recovery
 void
 MicroTAGE::checkFoldedHist(const boost::dynamic_bitset<> &hist, const char * when)
 {
+    checkFoldedHist(hist, 0, when);
+}
+
+void
+MicroTAGE::checkFoldedHist(const boost::dynamic_bitset<> &hist, ThreadID tid,
+                           const char * when)
+{
+    auto &state = historyState(tid);
     DPRINTF(UTAGE, "checking folded history when %s\n", when);
     if (debug::TAGEHistory) {
         std::string hist_str;
@@ -987,13 +1045,13 @@ MicroTAGE::checkFoldedHist(const boost::dynamic_bitset<> &hist, const char * whe
         // aheadindexFoldedHist in doUpdateHist(). During consistency checks
         // right after speculative/recovery updates, compare against the staged
         // next-cycle value when available.
-        if (!aheadindexFoldedHist.empty()) {
-            aheadindexFoldedHist.front()[t].check(hist);
+        if (!state.aheadIndexFoldedHist.empty()) {
+            state.aheadIndexFoldedHist.front()[t].check(hist);
         } else {
-            indexFoldedHist[t].check(hist);
+            state.indexFoldedHist[t].check(hist);
         }
-        tagFoldedHist[t].check(hist);
-        altTagFoldedHist[t].check(hist);
+        state.tagFoldedHist[t].check(hist);
+        state.altTagFoldedHist[t].check(hist);
     }
 }
 
diff --git a/src/cpu/pred/btb/microtage.hh b/src/cpu/pred/btb/microtage.hh
index b99258face..3a5fcc518c 100644
--- a/src/cpu/pred/btb/microtage.hh
+++ b/src/cpu/pred/btb/microtage.hh
@@ -4,6 +4,7 @@
 #include <cstdint>
 #include <deque>
 #include <map>
+#include <memory>
 #include <queue>
 #include <utility>
 #include <vector>
@@ -11,6 +12,7 @@
 #include "base/sat_counter.hh"
 #include "base/types.hh"
 #include "cpu/inst_seq.hh"
+#include "cpu/o3/limits.hh"
 #include "cpu/pred/btb/folded_hist.hh"
 #include "cpu/pred/btb/timed_base_pred.hh"
 
@@ -42,6 +44,7 @@ namespace test {
 class MicroTAGE : public TimedBaseBTBPredictor
 {
     using bitset = boost::dynamic_bitset<>;
+    static constexpr unsigned MaxThreads = o3::MaxThreads;
   public:
 #ifdef UNIT_TEST
     // Test constructor
@@ -121,7 +124,7 @@ class MicroTAGE : public TimedBaseBTBPredictor
                       const boost::dynamic_bitset<> &history,
                       std::vector<FullBTBPrediction> &stagePreds) override;
 
-    std::shared_ptr<void> getPredictionMeta() override;
+    std::shared_ptr<void> getPredictionMeta(ThreadID tid = 0) override;
 
     // speculative update 3 folded history, according history and pred.taken
     // the other specUpdateHist methods are left blank
@@ -157,13 +160,15 @@ class MicroTAGE : public TimedBaseBTBPredictor
 
     // check folded hists after speculative update and recover
     void checkFoldedHist(const bitset &history, const char *when);
+    void checkFoldedHist(const bitset &history, ThreadID tid, const char *when);
 
 #ifndef UNIT_TEST
   private:
 #endif
 
     // Look up predictions in TAGE tables for a stream of instructions
-    void lookupHelper(const Addr &startPC, const std::vector<BTBEntry> &btbEntries, CondTakens& results);
+    void lookupHelper(const Addr &startPC, const std::vector<BTBEntry> &btbEntries,
+                      CondTakens& results, ThreadID tid);
 
     // Calculate TAGE index for a given PC and table
     Addr getTageIndex(Addr pc, int table);
@@ -183,7 +188,8 @@ class MicroTAGE : public TimedBaseBTBPredictor
     unsigned getBankId(Addr pc) const;
 
     // Update branch history
-    void doUpdateHist(const bitset &history, bool taken, Addr pc, Addr target);
+    void doUpdateHist(const bitset &history, bool taken, Addr pc, Addr target,
+                      ThreadID tid);
 
     // Number of TAGE predictor tables
     const unsigned numPredictors;
@@ -203,14 +209,15 @@ class MicroTAGE : public TimedBaseBTBPredictor
     // History lengths for each table
     std::vector<unsigned> histLengths;
 
-    // Folded history for tag calculation
-    std::vector<PathFoldedHist> tagFoldedHist;
-
-    // Folded history for alternative tag calculation
-    std::vector<PathFoldedHist> altTagFoldedHist;
+    struct ThreadHistoryState
+    {
+        std::vector<PathFoldedHist> tagFoldedHist;
+        std::vector<PathFoldedHist> altTagFoldedHist;
+        std::vector<PathFoldedHist> indexFoldedHist;
+        std::queue<std::vector<PathFoldedHist>> aheadIndexFoldedHist;
+    };
 
-    // Folded history for index calculation
-    std::vector<PathFoldedHist> indexFoldedHist;
+    std::vector<ThreadHistoryState> threadHistory;
 
     // Maximum history length, not used
     unsigned maxHistLen;
@@ -257,8 +264,6 @@ class MicroTAGE : public TimedBaseBTBPredictor
     unsigned lastPredBankId;         // Bank ID of last prediction
     bool predBankValid;              // Whether lastPredBankId is valid
 
-    std::queue<std::vector<PathFoldedHist>> aheadindexFoldedHist;
-
 #ifdef UNIT_TEST
     typedef uint64_t Scalar;
 #else
@@ -349,7 +354,8 @@ private:
     // If predMeta is nullptr, use current folded history (prediction path)
     TagePrediction generateSinglePrediction(const BTBEntry &btb_entry,
                                            const Addr &startPC,
-                                           const std::shared_ptr<TageMeta> predMeta = nullptr);
+                                           const std::shared_ptr<TageMeta> predMeta = nullptr,
+                                           ThreadID tid = 0);
 
     // Helper method to prepare BTB entries for update
     std::vector<BTBEntry> prepareUpdateEntries(const FetchTarget &stream);
@@ -370,7 +376,10 @@ private:
                                  uint64_t &allocated_index,
                                  uint64_t &allocated_way);
 
-    std::shared_ptr<TageMeta> meta;
+    std::vector<std::shared_ptr<TageMeta>> threadMeta;
+    ThreadID predictorTid(const std::vector<FullBTBPrediction> &stagePreds) const;
+    ThreadHistoryState &historyState(ThreadID tid);
+    const ThreadHistoryState &historyState(ThreadID tid) const;
 };
 
 // Close conditional namespace wrapper for testing
diff --git a/src/cpu/pred/btb/ras.cc b/src/cpu/pred/btb/ras.cc
index 4dabf6dabf..7f279bdb8e 100644
--- a/src/cpu/pred/btb/ras.cc
+++ b/src/cpu/pred/btb/ras.cc
@@ -116,8 +116,9 @@ BTBRAS::putPCHistory(Addr startAddr, const boost::dynamic_bitset<> &history,
 }
 
 std::shared_ptr<void>
-BTBRAS::getPredictionMeta()
+BTBRAS::getPredictionMeta(ThreadID tid)
 {
+    (void)tid;
     return meta;
 }
 
diff --git a/src/cpu/pred/btb/ras.hh b/src/cpu/pred/btb/ras.hh
index 0055446013..b0b31c6d94 100644
--- a/src/cpu/pred/btb/ras.hh
+++ b/src/cpu/pred/btb/ras.hh
@@ -94,7 +94,7 @@ namespace btb_pred {
         void putPCHistory(Addr startAddr, const boost::dynamic_bitset<> &history,
                           std::vector<FullBTBPrediction> &stagePreds) override;
         
-        std::shared_ptr<void> getPredictionMeta() override;
+        std::shared_ptr<void> getPredictionMeta(ThreadID tid = 0) override;
 
         void specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred) override;
 
diff --git a/src/cpu/pred/btb/timed_base_pred.hh b/src/cpu/pred/btb/timed_base_pred.hh
index fce1a6aef1..db611fef25 100644
--- a/src/cpu/pred/btb/timed_base_pred.hh
+++ b/src/cpu/pred/btb/timed_base_pred.hh
@@ -61,7 +61,10 @@ class TimedBaseBTBPredictor: public SimObject
                               const boost::dynamic_bitset<> &history,
                               std::vector<FullBTBPrediction> &stagePreds) {}
 
-    virtual std::shared_ptr<void> getPredictionMeta() { return nullptr; }
+    virtual std::shared_ptr<void> getPredictionMeta(ThreadID tid = 0)
+    {
+        return nullptr;
+    }
 
     virtual void specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred) {}
     virtual void specUpdatePHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred) {}
diff --git a/src/cpu/pred/btb/uras.cc b/src/cpu/pred/btb/uras.cc
index c507956d0e..53825d818a 100644
--- a/src/cpu/pred/btb/uras.cc
+++ b/src/cpu/pred/btb/uras.cc
@@ -85,8 +85,9 @@ BTBuRAS::putPCHistory(Addr startAddr, const boost::dynamic_bitset<> &history,
 }
 
 std::shared_ptr<void>
-BTBuRAS::getPredictionMeta()
+BTBuRAS::getPredictionMeta(ThreadID tid)
 {
+    (void)tid;
     std::shared_ptr<void> meta_void_ptr = std::make_shared<uRASMeta>(meta);
     return meta_void_ptr;
 }
diff --git a/src/cpu/pred/btb/uras.hh b/src/cpu/pred/btb/uras.hh
index cdcde96b54..4ba12b3099 100644
--- a/src/cpu/pred/btb/uras.hh
+++ b/src/cpu/pred/btb/uras.hh
@@ -43,7 +43,7 @@ class BTBuRAS : public TimedBaseBTBPredictor
         void putPCHistory(Addr startAddr, const boost::dynamic_bitset<> &history,
                           std::vector<FullBTBPrediction> &stagePreds) override;
         
-        std::shared_ptr<void> getPredictionMeta() override;
+        std::shared_ptr<void> getPredictionMeta(ThreadID tid = 0) override;
 
         void specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred) override;
 
@@ -161,4 +161,4 @@ struct NonSpecRASTrace : public Record {
 }  // namespace branch_prediction
 
 }  // namespace gem5
-#endif  // __CPU_PRED_BTB_URAS_HH__
\ No newline at end of file
+#endif  // __CPU_PRED_BTB_URAS_HH__
diff --git a/src/cpu/simple/base.cc b/src/cpu/simple/base.cc
index fc91c8d2f3..27adf7f598 100644
--- a/src/cpu/simple/base.cc
+++ b/src/cpu/simple/base.cc
@@ -519,13 +519,14 @@ BaseSimpleCPU::readMiscReg(int misc_reg, ThreadID tid)
 }
 
 void
-BaseSimpleCPU::readGem5Regs()
+BaseSimpleCPU::readGem5Regs(ThreadID tid)
 {
+    auto diffAllStates = this->diffAllStates[tid];
     for (int i = 0; i < 32; i++) {
         diffAllStates->gem5RegFile[i] =
-            threadContexts[curThread]->getReg(RegId(IntRegClass, i));
+            threadContexts[tid]->getReg(RegId(IntRegClass, i));
         diffAllStates->gem5RegFile[i + 32] =
-            threadContexts[curThread]->getReg(RegId(FloatRegClass, i));
+            threadContexts[tid]->getReg(RegId(FloatRegClass, i));
     }
 }
 
diff --git a/src/cpu/simple/base.hh b/src/cpu/simple/base.hh
index b289ac778f..bcdd7c9066 100644
--- a/src/cpu/simple/base.hh
+++ b/src/cpu/simple/base.hh
@@ -207,7 +207,7 @@ class BaseSimpleCPU : public BaseCPU
 
     RegVal readMiscReg(int misc_reg, ThreadID tid) override;
 
-    void readGem5Regs() override;
+    void readGem5Regs(ThreadID tid) override;
 };
 
 } // namespace gem5
diff --git a/src/sim/system.cc b/src/sim/system.cc
index 7bc4ec37ce..c640334f4d 100644
--- a/src/sim/system.cc
+++ b/src/sim/system.cc
@@ -562,8 +562,8 @@ void System::initState()
     }
 
     // have to initiate golden memory after checkpoint restored
-    if (numCPUs > 1 && enableDifftest) {
-        warn("Creating golden memory for multi-core difftest\n");
+    if (multiContextDifftest()) {
+        warn("Creating golden memory for multi-context difftest\n");
         assert(enableMemDedup);
         goldenMem = dedupMemManager.createCopyOnWriteBranch();
         goldenMemManager.initGoldenMem(physmem.getStartaddr(), memSize(), goldenMem);
diff --git a/src/sim/system.hh b/src/sim/system.hh
index db49b66926..1dca935d6e 100644
--- a/src/sim/system.hh
+++ b/src/sim/system.hh
@@ -416,6 +416,11 @@ class System : public SimObject, public PCEventScope
 
     bool multiCore() const { return numCPUs > 1; }
 
+    bool multiContextDifftest() const
+    {
+        return enableDifftest && (multiCore() || multiThread);
+    }
+
     uint8_t *getGoldenMemPtr() const { return goldenMem; }
 
     GoldenGloablMem *getGoldenMemManager() { return &goldenMemManager; }

From 099bf7c52b656428ba12efce12c94af34eed2efa Mon Sep 17 00:00:00 2001
From: tastynoob <934348725@qq.com>
Date: Wed, 18 Mar 2026 16:03:01 +0800
Subject: [PATCH 04/38] cpu-pred: fix unit test compile

---
 src/cpu/pred/btb/common.hh             |  4 +++-
 src/cpu/pred/btb/test/btb_tage.test.cc | 10 +++++-----
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/cpu/pred/btb/common.hh b/src/cpu/pred/btb/common.hh
index dc327c8589..b61e459ff6 100644
--- a/src/cpu/pred/btb/common.hh
+++ b/src/cpu/pred/btb/common.hh
@@ -323,7 +323,8 @@ struct FetchTarget
     int s3Source; // which stage the prediction comes from
 
    FetchTarget()
-       : startPC(0),
+       : tid(0),
+         startPC(0),
          predTaken(false),
          predEndPC(0),
          predBranchInfo(BranchInfo()),
@@ -472,6 +473,7 @@ struct FullBTBPrediction
     int s3Source;
 
     FullBTBPrediction() :
+        tid(0),
         bbStart(0),
         btbEntries(),
         condTakens(),
diff --git a/src/cpu/pred/btb/test/btb_tage.test.cc b/src/cpu/pred/btb/test/btb_tage.test.cc
index 75514e2d3a..e945065e9f 100644
--- a/src/cpu/pred/btb/test/btb_tage.test.cc
+++ b/src/cpu/pred/btb/test/btb_tage.test.cc
@@ -374,7 +374,7 @@ TEST_F(BTBTAGETest, HistoryUpdate) {
 
     // Test case 1: Update with taken branch (PHR shifts in 2 bits from PC hash)
     // Correct order: first update folded histories with pre-update PHR, then mutate PHR
-    tage->doUpdateHist(history, 2, true, pc, target);
+    tage->doUpdateHist(history, 2, true, pc, target, 0);
     applyPathHistoryTaken(history, pc, target);
 
     // Verify folded history matches the ideal fold of the updated PHR
@@ -382,7 +382,7 @@ TEST_F(BTBTAGETest, HistoryUpdate) {
 
     // Test case 2: Update with not-taken branch (PHR unchanged, folded update is no-op)
     boost::dynamic_bitset<> before_not_taken = history;
-    tage->doUpdateHist(history, 2, false, pc, target);
+    tage->doUpdateHist(history, 2, false, pc, target, 0);
 
     // Verify folded history remains consistent
     tage->checkFoldedHist(history, "not-taken update");
@@ -615,9 +615,9 @@ TEST_F(BTBTAGETest, HistoryRecoveryCorrectness) {
 
     // Verify recovery produced the expected history
     for (int i = 0; i < tage->numPredictors; i++) {
-        tage->tagFoldedHist[i].check(expectedHistory);
-        tage->altTagFoldedHist[i].check(expectedHistory);
-        tage->indexFoldedHist[i].check(expectedHistory);
+        tage->threadHistory[0].tagFoldedHist[i].check(expectedHistory);
+        tage->threadHistory[0].altTagFoldedHist[i].check(expectedHistory);
+        tage->threadHistory[0].indexFoldedHist[i].check(expectedHistory);
     }
 }
 

From 9def4665a400eab1a8fa1504b9e2f9b0a09cd7d1 Mon Sep 17 00:00:00 2001
From: tastynoob <934348725@qq.com>
Date: Mon, 23 Mar 2026 11:32:56 +0800
Subject: [PATCH 05/38] cpu-o3: integrate FS-SMT support changes

Change-Id: I7690e69545b01ca4a8ba3e751f6cab7665f8767e
---
 configs/common/FSConfig.py          |  18 +-
 configs/common/xiangshan.py         |   5 +-
 src/cpu/base.cc                     |  79 +++-
 src/cpu/base.hh                     |  13 +
 src/cpu/o3/commit.cc                | 681 +++++++++++++++-------------
 src/cpu/o3/commit.hh                |   4 +-
 src/cpu/o3/fetch.cc                 |  46 +-
 src/cpu/o3/lsq.cc                   |  28 +-
 src/cpu/o3/lsq.hh                   |  20 +-
 src/cpu/o3/lsq_unit.cc              |  56 ++-
 src/cpu/o3/rob.cc                   |  16 +-
 src/cpu/o3/rob.hh                   |   1 +
 src/cpu/pred/BranchPredictor.py     |   1 +
 src/cpu/pred/btb/decoupled_bpred.cc |   5 +-
 src/cpu/pred/btb/decoupled_bpred.hh |   7 +-
 src/cpu/pred/btb/ras.cc             | 326 +++++++------
 src/cpu/pred/btb/ras.hh             |  75 +--
 src/dev/riscv/HartCtrl.py           |  13 +
 src/dev/riscv/SConscript            |   2 +
 src/dev/riscv/hart_ctrl.cc          |  98 ++++
 src/dev/riscv/hart_ctrl.hh          |  33 ++
 21 files changed, 978 insertions(+), 549 deletions(-)
 create mode 100644 src/dev/riscv/HartCtrl.py
 create mode 100644 src/dev/riscv/hart_ctrl.cc
 create mode 100644 src/dev/riscv/hart_ctrl.hh

diff --git a/configs/common/FSConfig.py b/configs/common/FSConfig.py
index dc66ed7833..d650b82f70 100644
--- a/configs/common/FSConfig.py
+++ b/configs/common/FSConfig.py
@@ -657,18 +657,23 @@ def makeBareMetalRiscvSystem(mem_mode, mdesc=None, cmdline=None):
     self.system_port = self.membus.cpu_side_ports
     return self
 
-def makeBareMetalXiangshanSystem(mem_mode, mdesc=None, cmdline=None, np=1, ruby=False):
-    self = makeXiangshanPlatformSystem(mem_mode, mdesc, np=np, ruby=ruby)
+def makeBareMetalXiangshanSystem(mem_mode, mdesc=None, cmdline=None, np=1,
+                                 ruby=False, num_threads=None):
+    self = makeXiangshanPlatformSystem(mem_mode, mdesc, np=np, ruby=ruby,
+                                       num_threads=num_threads)
     self.workload = RiscvBareMetal()
     self.workload.reset_vect = 0x80000000
     return self
 
 
-def makeXiangshanPlatformSystem(mem_mode, mdesc=None, np=1, ruby=False):
+def makeXiangshanPlatformSystem(mem_mode, mdesc=None, np=1, ruby=False,
+                                num_threads=None):
     self = System()
     if not mdesc:
         # generic system
         mdesc = SysConfig()
+    if num_threads is None:
+        num_threads = np
     self.mem_mode = mem_mode
     self.mem_ranges = [AddrRange(start=0x80000000, size=mdesc.mem())]
     print(self.mem_ranges)
@@ -687,7 +692,11 @@ def makeXiangshanPlatformSystem(mem_mode, mdesc=None, np=1, ruby=False):
     self.lint = Clint()
     self.lint.pio = self.iobus.mem_side_ports
     self.lint.pio_addr = 0x38000000
-    self.lint.num_threads = np
+    self.lint.num_threads = num_threads
+
+    self.hartctrl = HartCtrl()
+    self.hartctrl.pio = self.iobus.mem_side_ports
+    self.hartctrl.num_threads = num_threads
 
     self.mmcs = NemuMMC()
     self.mmcs.pio = self.iobus.mem_side_ports
@@ -700,6 +709,7 @@ def makeXiangshanPlatformSystem(mem_mode, mdesc=None, np=1, ruby=False):
             AddrRange(self.uartlite.pio_addr, self.uartlite.pio_addr +
             self.uartlite.pio_size),
             AddrRange(self.lint.pio_addr, self.lint.pio_addr + self.lint.pio_size),
+            AddrRange(self.hartctrl.pio_addr, self.hartctrl.pio_addr + self.hartctrl.pio_size),
             AddrRange(self.mmcs.pio_addr, self.mmcs.pio_addr + self.mmcs.pio_size),
             AddrRange(self.plic.pio_addr, self.plic.pio_addr + self.plic.pio_size),
             ]
diff --git a/configs/common/xiangshan.py b/configs/common/xiangshan.py
index 368f6cd884..6abcd5ea39 100644
--- a/configs/common/xiangshan.py
+++ b/configs/common/xiangshan.py
@@ -827,8 +827,11 @@ def build_xiangshan_system(args):
 
     TestCPUClass = get_xiangshan_cpu_class(args)
     ruby = bool(hasattr(args, 'ruby') and args.ruby)
+    num_threads = np * (2 if getattr(args, 'smt', False) else 1)
 
-    test_sys = makeBareMetalXiangshanSystem('timing', SysConfig(mem=args.mem_size), None, np=np, ruby=ruby)
+    test_sys = makeBareMetalXiangshanSystem(
+        'timing', SysConfig(mem=args.mem_size), None, np=np, ruby=ruby,
+        num_threads=num_threads)
 
     if hasattr(args, 'enable_trace_mode') and args.enable_trace_mode:
         if bool(getattr(args, 'trace_timing_ptw', False)):
diff --git a/src/cpu/base.cc b/src/cpu/base.cc
index 83a2a27686..68808f3b3a 100644
--- a/src/cpu/base.cc
+++ b/src/cpu/base.cc
@@ -210,6 +210,7 @@ BaseCPU::BaseCPU(const Params &p, bool is_checker)
     }
 
     diffAllStates.resize(numThreads);
+    recentCommittedStores.resize(numThreads);
     if (enableDifftest) {
         assert(params().difftest_ref_so.length() > 2);
         for (ThreadID tid = 0; tid < numThreads; ++tid) {
@@ -431,6 +432,33 @@ BaseCPU::startup()
 
 }
 
+void
+BaseCPU::recordCommittedStore(ThreadID tid, const o3::DynInstPtr &inst)
+{
+    RecentCommittedStore recent;
+    // Ignore atomics, failed SCs and stores that do not fit in recent.data.
+    if (!system->multiContextDifftest() || !_goldenMemManager ||
+        !inst->isStore() || inst->isAtomic() ||
+        (inst->isStoreConditional() && !inst->lockedWriteSuccess()) ||
+        !inst->memData || inst->effSize == 0 ||
+        inst->effSize > sizeof(recent.data) ||
+        !_goldenMemManager->inPmem(inst->physEffAddr)) {
+        return;
+    }
+    // Snapshot the store, then trim this thread's history to 16 entries.
+    auto &recent_history = recentCommittedStores.at(tid);
+    recent.valid = true;
+    recent.addr = inst->physEffAddr;
+    recent.size = inst->effSize;
+    recent.seq = inst->seqNum;
+    std::memcpy(recent.data, inst->memData, recent.size);
+    recent_history.push_back(recent);
+    constexpr size_t max_store_history = 16;
+    if (recent_history.size() > max_store_history) {
+        recent_history.pop_front();
+    }
+}
+
 probing::PMUUPtr
 BaseCPU::pmuProbePoint(const char *name)
 {
@@ -1459,10 +1487,31 @@ BaseCPU::diffWithNEMU(ThreadID tid, InstSeqNum seq)
                     warn("Difference on %s instr found in multicore mode, check in golden memory\n",
                          diffInfo.inst->isLoad() ? "load" : "amo");
                     uint8_t *golden_ptr = diffInfo.goldenValue;
+                    const RecentCommittedStore *matched_recent_store = nullptr;
+                    if (diffInfo.inst->isLoad()) {
+                        const auto &recent_history = recentCommittedStores.at(tid);
+                        for (auto it = recent_history.rbegin();
+                             it != recent_history.rend(); ++it) {
+                            if (!it->valid ||
+                                it->addr != diffInfo.physEffAddr ||
+                                it->size != diffInfo.effSize ||
+                                it->seq >= seq ||
+                                (seq - it->seq) > 256) {
+                                continue;
+                            }
+                            if (memcmp(it->data, &gem5_val,
+                                       diffInfo.effSize) == 0) {
+                                matched_recent_store = &(*it);
+                                break;
+                            }
+                        }
+                    }
 
                     // a lambda function to sync memory and register from golden results to ref
-                    auto sync_mem_reg = [&]() {
-                        diffAllStates->proxy->memcpy(diffInfo.physEffAddr, golden_ptr, diffInfo.effSize,
+                    auto sync_mem_reg = [&](const uint8_t *mem_src) {
+                        diffAllStates->proxy->memcpy(diffInfo.physEffAddr,
+                                                     const_cast<uint8_t *>(mem_src),
+                                                     diffInfo.effSize,
                                                      DIFFTEST_TO_REF);
                         diffAllStates->referenceRegFile[dest_tag] = gem5_val;
                         diffAllStates->proxy->regcpy(&(diffAllStates->referenceRegFile), DUT_TO_REF);
@@ -1470,7 +1519,16 @@ BaseCPU::diffWithNEMU(ThreadID tid, InstSeqNum seq)
 
                     if (diffInfo.inst->isLoad() && memcmp(golden_ptr, &gem5_val, diffInfo.effSize) == 0) {
                         DPRINTF(Diff, "Load content matched in golden memory. Sync from golden to ref\n");
-                        sync_mem_reg();
+                        sync_mem_reg(golden_ptr);
+                        continue;
+                    } else if (matched_recent_store) {
+                        DPRINTF(Diff,
+                                "Load content matched recent committed store "
+                                "[sn:%llu] at addr %#lx. Syncing ref from the "
+                                "store snapshot for this hart.\n",
+                                matched_recent_store->seq,
+                                diffInfo.physEffAddr);
+                        sync_mem_reg(matched_recent_store->data);
                         continue;
                     } else if (diffInfo.inst->isAtomic()) {
                         DPRINTF(Diff, "Golden mem old value: %#lx, GEM5 old value: %#lx\n", diffInfo.amoOldGoldenValue,
@@ -1478,7 +1536,7 @@ BaseCPU::diffWithNEMU(ThreadID tid, InstSeqNum seq)
                         DPRINTF(Diff, "New golden value: %#lx\n", *(uint64_t *)golden_ptr);
                         if (memcmp(&diffInfo.amoOldGoldenValue, &gem5_val, diffInfo.effSize) == 0) {
                             DPRINTF(Diff, "Atomic encountered, old value matched. Sync from golden to ref\n");
-                            sync_mem_reg();
+                            sync_mem_reg(golden_ptr);
                             continue;
                         } else {
                             warn("Atomic old value not matched!\n");
@@ -1583,9 +1641,16 @@ BaseCPU::difftestStep(ThreadID tid, InstSeqNum seq)
                 warn("Start memcpy to NEMU from %#lx, size=%lu\n", (uint64_t)start, pmemSize);
                 diffAllStates->proxy->memcpy(0x80000000u, start, pmemSize, DUT_TO_REF);
             } else if (enableMemDedup) {
-                warn("Let ref share a COW mirror of root memory\n");
-                assert(diffAllStates->proxy->ref_get_backed_memory);
-                diffAllStates->proxy->ref_get_backed_memory(system->createCopyOnWriteBranch(), pmemSize);
+                if (system->multiContextDifftest()) {
+                    warn("Let ref share the multi-context golden memory\n");
+                    assert(goldenMemPtr);
+                    assert(diffAllStates->proxy->ref_get_backed_memory);
+                    diffAllStates->proxy->ref_get_backed_memory(goldenMemPtr, pmemSize);
+                } else {
+                    warn("Let ref share a COW mirror of root memory\n");
+                    assert(diffAllStates->proxy->ref_get_backed_memory);
+                    diffAllStates->proxy->ref_get_backed_memory(system->createCopyOnWriteBranch(), pmemSize);
+                }
             } else {
                 warn("MemDedup disabled, copying pmem to NEMU\n");
                 warn("Start memcpy to NEMU from %#lx, size=%lu\n", (uint64_t)pmemStart, pmemSize);
diff --git a/src/cpu/base.hh b/src/cpu/base.hh
index 3d3e8e5a85..feaf6e13cd 100644
--- a/src/cpu/base.hh
+++ b/src/cpu/base.hh
@@ -42,6 +42,7 @@
 #ifndef __CPU_BASE_HH__
 #define __CPU_BASE_HH__
 
+#include <deque>
 #include <queue>
 #include <vector>
 
@@ -138,6 +139,16 @@ struct DiffAllStates
 class BaseCPU : public ClockedObject
 {
   protected:
+    struct RecentCommittedStore  // snapshot of one committed store
+    {
+        bool valid = false;  // true once the entry has been filled in
+        Addr addr = 0;  // physical effective address of the store
+        size_t size = 0;  // number of bytes of data[] in use
+        InstSeqNum seq = 0;  // sequence number of the store instruction
+        uint8_t data[16] = {};  // copy of the store's memData bytes
+    };
+    // Indexed by ThreadID; newest store at the back of each deque.
+    std::vector<std::deque<RecentCommittedStore>> recentCommittedStores;
 
     const unsigned IntRegIndexBase = 0;
     const unsigned FPRegIndexBase = 32;
@@ -778,6 +789,8 @@ class BaseCPU : public ClockedObject
 
     void difftestStep(ThreadID tid, InstSeqNum seq);
 
+    void recordCommittedStore(ThreadID tid, const o3::DynInstPtr &inst);
+
     inline bool difftestEnabled() const { return enableDifftest; }
 
     void displayGem5Regs(ThreadID tid);
diff --git a/src/cpu/o3/commit.cc b/src/cpu/o3/commit.cc
index ad42b0c7fe..e1b20025ce 100644
--- a/src/cpu/o3/commit.cc
+++ b/src/cpu/o3/commit.cc
@@ -42,6 +42,7 @@
 #include "cpu/o3/commit.hh"
 
 #include <algorithm>
+#include <array>
 #include <cstring>
 #include <set>
 #include <string>
@@ -104,32 +105,35 @@ Commit::Commit(CPU *_cpu, branch_prediction::BPredUnit *_bp, const BaseO3CPUPara
     : commitPolicy(params.smtCommitPolicy),
       stuckCheckEvent([this]() {
         static std::vector<DynInstPtr> debug_insts;
-        if (cpu->curCycle() - this->lastCommitCycle > 40000) {
-            if (traceMaybeExitOnPipelineDrainFromStuckCheck()) {
-                return;
-            }
 
-            if (auto inst = rob->readHeadInst(0)) {
-                warn("can't commit inst %s\n", inst->genDisassembly());
-                debug_insts.insert(
-                    debug_insts.begin(), rob->getInstList(0).begin(),
-                    rob->getInstList(0).end());
-                warn("dump rob front 10 insts\n");
-                int i = 0;
-                for (auto inst = debug_insts.begin();
-                     inst != debug_insts.end() && i < 10; inst++, i++) {
-                    warn("%s\n", (*inst)->genDisassembly());
+        for (ThreadID tid = 0; tid < numThreads; tid++) {
+            if (cpu->curCycle() - this->lastCommitCycle[tid] > 40000) {
+                if (traceMaybeExitOnPipelineDrainFromStuckCheck()) {
+                    return;
                 }
-            } else {
-                warn("rob was empty, may be fetch or rename stuck\n");
+                // Report the head of the thread that stopped committing.
+                if (auto inst = rob->readHeadInst(tid)) {
+                    warn("can't commit inst %s\n", inst->genDisassembly());
+                    debug_insts.insert(
+                        debug_insts.begin(), rob->getInstList(tid).begin(),
+                        rob->getInstList(tid).end());
+                    warn("dump rob front 10 insts\n");
+                    int i = 0;
+                    for (auto inst = debug_insts.begin();
+                        inst != debug_insts.end() && i < 10; inst++, i++) {
+                        warn("%s\n", (*inst)->genDisassembly());
+                    }
+                } else {
+                    warn("rob was empty, may be fetch or rename stuck\n");
+                }
+                panic(
+                    "Commit stage is stucked for more than 40,000 cycles!\n"
+                    "Thread: %d Last commit cycle: %lu, current cycle: %lu, suggested "
+                    "--debug-start=%llu --debug-end=%llu\n", tid,
+                    lastCommitCycle[tid], cpu->curCycle(),
+                    cpu->cyclesToTicks(Cycles(lastCommitCycle[tid] - 200)),
+                    cpu->cyclesToTicks(Cycles(lastCommitCycle[tid] + 200)));
             }
-            panic(
-                "Commit stage is stucked for more than 40,000 cycles!\n"
-                "Last commit cycle: %lu, current cycle: %lu, suggested "
-                "--debug-start=%llu --debug-end=%llu\n",
-                lastCommitCycle, cpu->curCycle(),
-                cpu->cyclesToTicks(Cycles(lastCommitCycle - 200)),
-                cpu->cyclesToTicks(Cycles(lastCommitCycle + 200)));
         }
         cpu->schedule(this->stuckCheckEvent, cpu->clockEdge(Cycles(40010)));
       }, "CommitStuckCheckEvent"),
@@ -1204,349 +1208,395 @@ Commit::commitInsts()
     DPRINTF(Commit, "Trying to commit instructions in the ROB.\n");
 
     unsigned num_committed = 0;
+    std::array<unsigned, MaxThreads> num_committed_per_thread = {};
+    std::array<unsigned, MaxThreads> commit_width_per_thread = {};
 
     DynInstPtr head_inst;
 
-    int commit_width = rob->countInstsOfGroups(commitWidth);
+    int commit_width = 0;
+    for (ThreadID tid : *activeThreads) {
+        commit_width_per_thread[tid] =
+            rob->countInstsOfGroups(tid, commitWidth);
+        commit_width += commit_width_per_thread[tid];
+    }
 
     if (commit_width >= 0) {
         cpu->activityThisCycle();
     }
 
-    // Commit as many instructions as possible until the commit bandwidth
-    // limit is reached, or it becomes impossible to commit any more.
-    while (num_committed < commit_width) {
-        // hardware transactionally memory
-        // If executing within a transaction,
-        // need to handle interrupts specially
-
-        ThreadID commit_thread = getCommittingThread();
-
-        // Check for any interrupt that we've already squashed for
-        // and start processing it.
-        if (interrupt != NoFault) {
-            // If inside a transaction, postpone interrupts
-            if (executingHtmTransaction(commit_thread)) {
-                cpu->clearInterrupts(0);
-                toIEW->commitInfo[0].clearInterrupt = true;
-                interrupt = NoFault;
-                avoidQuiesceLiveLock = true;
-            } else {
-                handleInterrupt();
-            }
+    // Commit each thread independently for up to its local commit window.
+    for (ThreadID commit_thread : *activeThreads) {
+        if (commitStatus[commit_thread] != Running &&
+            commitStatus[commit_thread] != Idle &&
+            commitStatus[commit_thread] != FetchTrapPending) {
+            continue;
         }
 
-        // ThreadID commit_thread = getCommittingThread();
-
-        if (commit_thread == -1)
-            break;
-
-        head_inst = rob->readHeadInst(commit_thread);
-
-        if (!rob->isHeadGroupReady(commit_thread)) {
-            if (debug::Commit && head_inst->readyToCommit()) {
-                InstSeqNum seqnum = rob->getHeadGroupLastDoneSeq(commit_thread);
-                DPRINTF(
-                    Commit,
-                    "[sn:%llu] Head is ready to commit, but the group is not all ready, last done inst [sn:%llu]\n",
-                    head_inst->seqNum, seqnum);
+            while (num_committed < commit_width &&
+                num_committed_per_thread[commit_thread] <
+                    commit_width_per_thread[commit_thread]) {
+            // hardware transactional memory
+            // If executing within a transaction,
+            // need to handle interrupts specially
+
+            // Check for any interrupt that we've already squashed for
+            // and start processing it.
+            if (interrupt != NoFault) {
+                // If inside a transaction, postpone interrupts
+                if (executingHtmTransaction(commit_thread)) {
+                    cpu->clearInterrupts(0);
+                    toIEW->commitInfo[0].clearInterrupt = true;
+                    interrupt = NoFault;
+                    avoidQuiesceLiveLock = true;
+                } else {
+                    handleInterrupt();
+                }
             }
-            break;
-        }
 
-        ThreadID tid = head_inst->threadNumber;
-
-        assert(tid == commit_thread);
-
-        DPRINTF(Commit,
-                "Trying to commit head instruction, [tid:%i] [sn:%llu]\n",
-                tid, head_inst->seqNum);
+            head_inst = rob->readHeadInst(commit_thread);
+
+            if (!rob->isHeadGroupReady(commit_thread)) {
+                if (debug::Commit && head_inst->readyToCommit()) {
+                    InstSeqNum seqnum =
+                        rob->getHeadGroupLastDoneSeq(commit_thread);
+                    DPRINTF(
+                        Commit,
+                        "[sn:%llu] Head is ready to commit, but the group "
+                        "is not all ready, last done inst [sn:%llu]\n",
+                        head_inst->seqNum, seqnum);
+                }
+                break;
+            }
 
-        // If the head instruction is squashed, it is ready to retire
-        // (be removed from the ROB) at any time.
-        if (head_inst->isSquashed()) {
+            ThreadID tid = head_inst->threadNumber;
 
-            DPRINTF(Commit, "Retiring squashed instruction from "
-                    "ROB.\n");
+            assert(tid == commit_thread);
 
-            rob->retireHead(commit_thread);
+            DPRINTF(Commit,
+                    "Trying to commit head instruction, [tid:%i] [sn:%llu]\n",
+                    tid, head_inst->seqNum);
 
-            ++stats.commitSquashedInsts;
-            // Notify potential listeners that this instruction is squashed
-            ppSquash->notify(head_inst);
+            // If the head instruction is squashed, it is ready to retire
+            // (be removed from the ROB) at any time.
+            if (head_inst->isSquashed()) {
 
-            // Record that the number of ROB entries has changed.
-            changedROBNumEntries[tid] = true;
-        } else {
-            set(pc[tid], head_inst->pcState());
-            traceMaybeInjectCtrlFlowChangeFault(tid, head_inst);
+                DPRINTF(Commit, "Retiring squashed instruction from "
+                        "ROB.\n");
 
-            // Try to commit the head instruction.
-            bool commit_success = commitHead(head_inst, num_committed);
+                rob->retireHead(commit_thread);
 
-            if (commit_success) {
-                cpu->perfCCT->updateInstPos(head_inst->seqNum, PerfRecord::AtCommit);
-                auto res = head_inst->getResult();
-                if (res.is<RegVal>()) {
-                    cpu->perfCCT->updateInstMeta(head_inst->seqNum, InstDetail::Result, res.as<RegVal>());
-                }
-                cpu->perfCCT->commitMeta(head_inst->seqNum);
+                ++stats.commitSquashedInsts;
+                // Notify potential listeners that this instruction is squashed
+                ppSquash->notify(head_inst);
 
-                DPRINTF(CommitTrace, "CT: %s\n", head_inst->genDisassembly());
+                // Record that the number of ROB entries has changed.
+                changedROBNumEntries[tid] = true;
+            } else {
+                set(pc[tid], head_inst->pcState());
+                traceMaybeInjectCtrlFlowChangeFault(tid, head_inst);
+
+                // Try to commit the head instruction.
+                bool commit_success = commitHead(head_inst,
+                                                num_committed_per_thread[tid]);
+
+                if (commit_success) {
+                    cpu->perfCCT->updateInstPos(head_inst->seqNum,
+                                                PerfRecord::AtCommit);
+                    auto res = head_inst->getResult();
+                    if (res.is<RegVal>()) {
+                        cpu->perfCCT->updateInstMeta(
+                            head_inst->seqNum, InstDetail::Result,
+                            res.as<RegVal>());
+                    }
+                    cpu->perfCCT->commitMeta(head_inst->seqNum);
 
-                if (ismispred) {
-                    ismispred = false;
-                    stats.recovery_bubble += (cpu->curCycle() - lastCommitCycle) * renameWidth;
-                }
-                if (head_inst->mispredicted()) {
-                    ismispred = true;
-                }
+                    DPRINTF(CommitTrace, "CT [tid:%d]: %s\n",
+                            head_inst->threadNumber,
+                            head_inst->genDisassembly());
 
-                lastCommitCycle = cpu->curCycle();
-                const auto &head_rv_pc = head_inst->pcState().as<RiscvISA::PCState>();
-                if (bp->isBTB()) {
-                    auto dbbtb = dynamic_cast<branch_prediction::btb_pred::DecoupledBPUWithBTB*>(bp);
-                    bool miss = head_inst->mispredicted();
-                    if (head_inst->isReturn()) {
-                        DPRINTF(RAS, "commit inst PC %x miss %d real target %x pred target %x\n",
-                                head_inst->pcState().instAddr(), miss,
-                                head_rv_pc.npc(), *(head_inst->predPC));
+                    if (ismispred) {
+                        ismispred = false;
+                        stats.recovery_bubble +=
+                            (cpu->curCycle() - lastCommitCycle[tid]) *
+                            renameWidth;
+                    }
+                    if (head_inst->mispredicted()) {
+                        ismispred = true;
                     }
 
-                    // FIXME: ignore mret/sret/uret in correspond with RTL
-                    if (!head_inst->isNonSpeculative() && head_inst->isControl()) {
-                        dbbtb->commitBranch(head_inst, miss);
-                        if (!head_inst->isReturn() && head_inst->isIndirectCtrl() && miss) {
-                            misPredIndirect[head_inst->pcState().instAddr()]++;
+                    lastCommitCycle[tid] = cpu->curCycle();
+                    const auto &head_rv_pc =
+                        head_inst->pcState().as<RiscvISA::PCState>();
+                    if (bp->isBTB()) {
+                        auto dbbtb = dynamic_cast<
+                            branch_prediction::btb_pred::
+                                DecoupledBPUWithBTB *>(bp);
+                        bool miss = head_inst->mispredicted();
+                        if (head_inst->isReturn()) {
+                            DPRINTF(RAS, "commit inst PC %x miss %d real target %x pred target %x\n",
+                                    head_inst->pcState().instAddr(), miss,
+                                    head_rv_pc.npc(), *(head_inst->predPC));
                         }
-                    }
-                    dbbtb->notifyInstCommit(head_inst);
-                }
-                    if (traceMaybeExitOnLastTraceInst(head_inst)) {
-                        return;
-                    }
 
-                if (head_inst->isUpdateVsstatusSd()) {
-                    auto v = cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VIRMODE, tid);
-                    RiscvISA::HSTATUS hstatus = cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_HSTATUS, tid);
-                    RiscvISA::VSSTATUS vsstatus =
-                        cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, tid);
-                    RiscvISA::VSSTATUS32 vsstatus32 =
-                        cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, tid);
-
-                    if (v) {
-                        if (hstatus.vsxl ==1) {
-                            vsstatus32.sd = (vsstatus32.fs == 3) || (vsstatus32.vs == 3);
-                            cpu->setMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, (RegVal)vsstatus32, tid);
-                        } else {
-                            vsstatus.sd = (vsstatus.fs == 3) || (vsstatus.vs == 3);
-                            cpu->setMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, (RegVal)vsstatus, tid);
+                        // FIXME: ignore mret/sret/uret in correspond with RTL
+                        if (!head_inst->isNonSpeculative() && head_inst->isControl()) {
+                            dbbtb->commitBranch(head_inst, miss);
+                            if (!head_inst->isReturn() &&
+                                head_inst->isIndirectCtrl() && miss) {
+                                misPredIndirect[head_inst->pcState().instAddr()]++;
+                            }
                         }
+                        dbbtb->notifyInstCommit(head_inst);
                     }
+                        if (traceMaybeExitOnLastTraceInst(head_inst)) {
+                            return;
+                        }
 
-                }
-                if (head_inst->isUpdateMstatusSd()) {
-                    updateMstatusSd(tid);
-                }
-
-                ++num_committed;
-                stats.committedInstType[tid][head_inst->opClass()]++;
-                ppCommit->notify(head_inst);
-
-                // hardware transactional memory
-
-                // update nesting depth
-                if (head_inst->isHtmStart())
-                    htmStarts[tid]++;
+                    if (head_inst->isUpdateVsstatusSd()) {
+                        auto v = cpu->readMiscRegNoEffect(
+                            RiscvISA::MiscRegIndex::MISCREG_VIRMODE, tid);
+                        RiscvISA::HSTATUS hstatus =
+                            cpu->readMiscRegNoEffect(
+                                RiscvISA::MiscRegIndex::MISCREG_HSTATUS, tid);
+                        RiscvISA::VSSTATUS vsstatus =
+                            cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, tid);
+                        RiscvISA::VSSTATUS32 vsstatus32 =
+                            cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, tid);
+
+                        if (v) {
+                            if (hstatus.vsxl ==1) {
+                                vsstatus32.sd = (vsstatus32.fs == 3) || (vsstatus32.vs == 3);
+                                cpu->setMiscRegNoEffect(
+                                    RiscvISA::MiscRegIndex::MISCREG_VSSTATUS,
+                                    (RegVal)vsstatus32, tid);
+                            } else {
+                                vsstatus.sd = (vsstatus.fs == 3) || (vsstatus.vs == 3);
+                                cpu->setMiscRegNoEffect(
+                                    RiscvISA::MiscRegIndex::MISCREG_VSSTATUS,
+                                    (RegVal)vsstatus, tid);
+                            }
+                        }
 
-                // sanity check
-                if (head_inst->inHtmTransactionalState()) {
-                    assert(executingHtmTransaction(tid));
-                } else {
-                    assert(!executingHtmTransaction(tid));
-                }
+                    }
+                    if (head_inst->isUpdateMstatusSd()) {
+                        updateMstatusSd(tid);
+                    }
 
-                // update nesting depth
-                if (head_inst->isHtmStop())
-                    htmStops[tid]++;
+                    ++num_committed;
+                    ++num_committed_per_thread[tid];
+                    stats.committedInstType[tid][head_inst->opClass()]++;
+                    ppCommit->notify(head_inst);
 
-                changedROBNumEntries[tid] = true;
+                    // hardware transactional memory
 
-                // Set the doneSeqNum to the youngest committed instruction.
-                toIEW->commitInfo[tid].doneSeqNum = head_inst->seqNum;
+                    // update nesting depth
+                    if (head_inst->isHtmStart())
+                        htmStarts[tid]++;
 
-                if (head_inst->getFtqId() > 1) {
-                    toIEW->commitInfo[tid].doneFtqId = head_inst->getFtqId() - 1;
-                }
-                committedTargetId = head_inst->getFtqId();
-                committedLoopIter = head_inst->getLoopIteration();
-
-                if (tid == 0)
-                    canHandleInterrupts = !head_inst->isDelayedCommit();
-
-                // at this point store conditionals should either have
-                // been completed or predicated false
-                assert(!head_inst->isStoreConditional() ||
-                       head_inst->isCompleted() ||
-                       !head_inst->readPredicate());
-
-                // Updates misc. registers.
-                head_inst->updateMiscRegs();
-                if (head_inst->staticInst->isVectorConfig()) {
-                    auto vset = static_cast<RiscvISA::VConfOp*>(head_inst->staticInst.get());
-                    if (!(vset->vtypeIsImm)) {
-                        auto tc = head_inst->tcBase();
-                        RiscvISA::VTYPE new_vtype = head_inst->readMiscReg(RiscvISA::MISCREG_VTYPE);
-                        tc->getDecoderPtr()->as<RiscvISA::Decoder>().setVtype(new_vtype);
-                    }
-                    if (hasExecutedYoungerInst(tid, head_inst->seqNum)) {
-                        DPRINTF(Commit,
-                                "[tid:%i] [sn:%llu] Vector config committed with executed younger instructions in "
-                                "ROB, squash younger instructions.\n",
-                                tid, head_inst->seqNum);
-                        squashAfter(tid, head_inst);
-                    }
-                }
-                if (head_inst->isFloating() && head_inst->isLoad()){
-                    RiscvISA::STATUS status = cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_STATUS, tid);
-                    status.sd = 1;
-                    status.fs = 3;
-                    cpu->setMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_STATUS, (RegVal)status, tid);
-                }
-                if (head_inst->isUpdateVsstatusSd()) {
-                    auto v = cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VIRMODE, tid);
-                    RiscvISA::HSTATUS hstatus = cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_HSTATUS, tid);
-                    RiscvISA::VSSTATUS vsstatus =
-                        cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, tid);
-                    RiscvISA::VSSTATUS32 vsstatus32 =
-                        cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, tid);
-
-                    if (v) {
-                        if (hstatus.vsxl ==1) {
-                            vsstatus32.sd = (vsstatus32.fs == 3) || (vsstatus.vs == 3);
-                            cpu->setMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, (RegVal)vsstatus32, tid);
-                        } else {
-                            vsstatus.sd = (vsstatus.fs == 3) || (vsstatus.vs == 3);
-                            cpu->setMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, (RegVal)vsstatus, tid);
-                        }
+                    // sanity check
+                    if (head_inst->inHtmTransactionalState()) {
+                        assert(executingHtmTransaction(tid));
+                    } else {
+                        assert(!executingHtmTransaction(tid));
                     }
 
-                }
+                    // update nesting depth
+                    if (head_inst->isHtmStop())
+                        htmStops[tid]++;
 
-                if (cpu->difftestEnabled()) {
-                    diffInst(tid, head_inst);
-                }
+                    changedROBNumEntries[tid] = true;
+
+                    // Set the doneSeqNum to the youngest committed instruction.
+                    toIEW->commitInfo[tid].doneSeqNum = head_inst->seqNum;
 
-                if (head_inst->isLoad()) {
-                    Addr load_pc = head_inst->pcState().instAddr();
-                    Addr load_addr = head_inst->physEffAddr;
-                    char buffer[8] = {0};
-                    if (head_inst->memData) {
-                        std::memcpy(buffer, head_inst->memData,
-                                    std::min<size_t>(head_inst->effSize,
-                                                     sizeof(buffer)));
+                    if (head_inst->getFtqId() > 1) {
+                        toIEW->commitInfo[tid].doneFtqId = head_inst->getFtqId() - 1;
                     }
-                    Addr load_value = *((uint64_t *)buffer);
-                    bool hit = loadTripleCounter.update(load_pc, load_addr, load_value);
-                    if (hit) {
-                        // same PC && same addr && same value
-                        stats.loadTriple++;
+                    committedTargetId = head_inst->getFtqId();
+                    committedLoopIter = head_inst->getLoopIteration();
+
+                    if (tid == 0)
+                        canHandleInterrupts = !head_inst->isDelayedCommit();
+
+                    // at this point store conditionals should either have
+                    // been completed or predicated false
+                    assert(!head_inst->isStoreConditional() ||
+                        head_inst->isCompleted() ||
+                        !head_inst->readPredicate());
+
+                    // Updates misc. registers.
+                    head_inst->updateMiscRegs();
+                    if (head_inst->staticInst->isVectorConfig()) {
+                        auto vset = static_cast<RiscvISA::VConfOp *>(
+                            head_inst->staticInst.get());
+                        if (!(vset->vtypeIsImm)) {
+                            auto tc = head_inst->tcBase();
+                            RiscvISA::VTYPE new_vtype =
+                                head_inst->readMiscReg(
+                                    RiscvISA::MISCREG_VTYPE);
+                            tc->getDecoderPtr()->as<RiscvISA::Decoder>().setVtype(new_vtype);
+                        }
+                        if (hasExecutedYoungerInst(tid, head_inst->seqNum)) {
+                            DPRINTF(Commit,
+                                    "[tid:%i] [sn:%llu] Vector config "
+                                    "committed with executed younger "
+                                    "instructions in ROB, squash younger "
+                                    "instructions.\n",
+                                    tid, head_inst->seqNum);
+                            squashAfter(tid, head_inst);
+                        }
                     }
-                    // EA reuse: compare to last committed EA of same static load
-                    auto itEA = lastLoadEA.find(load_pc);
-                    if (itEA != lastLoadEA.end() && itEA->second == load_addr) {
-                        stats.loadEAReused++;
+                    if (head_inst->isFloating() && head_inst->isLoad()) {
+                        RiscvISA::STATUS status = cpu->readMiscRegNoEffect(
+                            RiscvISA::MiscRegIndex::MISCREG_STATUS, tid);
+                        status.sd = 1;
+                        status.fs = 3;
+                        cpu->setMiscRegNoEffect(
+                            RiscvISA::MiscRegIndex::MISCREG_STATUS,
+                            (RegVal)status, tid);
                     }
-                    lastLoadEA[load_pc] = load_addr;
-                    // Producer stability: only if this load had a forwarding producer
-                    if (head_inst->hasProducerStorePC()) {
-                        stats.loadsWithProducer++;
-                        const Addr prodPC = head_inst->producerStorePC();
-                        auto itP = lastLoadProducerStorePC.find(load_pc);
-                        if (itP != lastLoadProducerStorePC.end() && itP->second == prodPC) {
-                            stats.producerStable++;
+                    if (head_inst->isUpdateVsstatusSd()) {
+                        auto v = cpu->readMiscRegNoEffect(
+                            RiscvISA::MiscRegIndex::MISCREG_VIRMODE, tid);
+                        RiscvISA::HSTATUS hstatus =
+                            cpu->readMiscRegNoEffect(
+                                RiscvISA::MiscRegIndex::MISCREG_HSTATUS, tid);
+                        RiscvISA::VSSTATUS vsstatus =
+                            cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, tid);
+                        RiscvISA::VSSTATUS32 vsstatus32 =
+                            cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, tid);
+
+                        if (v) {
+                            if (hstatus.vsxl ==1) {
+                                vsstatus32.sd = (vsstatus32.fs == 3) || (vsstatus.vs == 3);
+                                cpu->setMiscRegNoEffect(
+                                    RiscvISA::MiscRegIndex::MISCREG_VSSTATUS,
+                                    (RegVal)vsstatus32, tid);
+                            } else {
+                                vsstatus.sd = (vsstatus.fs == 3) || (vsstatus.vs == 3);
+                                cpu->setMiscRegNoEffect(
+                                    RiscvISA::MiscRegIndex::MISCREG_VSSTATUS,
+                                    (RegVal)vsstatus, tid);
+                            }
                         }
-                        lastLoadProducerStorePC[load_pc] = prodPC;
 
-                    // optional: clear after use to avoid confusing later stages
-                    head_inst->clearProducerStorePC();
                     }
-                }
 
+                    if (cpu->difftestEnabled()) {
+                        diffInst(tid, head_inst);
+                    }
+
+                    if (head_inst->isLoad()) {
+                        Addr load_pc = head_inst->pcState().instAddr();
+                        Addr load_addr = head_inst->physEffAddr;
+                        char buffer[8] = {0};
+                        if (head_inst->memData) {
+                            std::memcpy(buffer, head_inst->memData,
+                                        std::min<size_t>(head_inst->effSize,
+                                                        sizeof(buffer)));
+                        }
+                        Addr load_value = *((uint64_t *)buffer);
+                        bool hit = loadTripleCounter.update(load_pc, load_addr, load_value);
+                        if (hit) {
+                            // same PC && same addr && same value
+                            stats.loadTriple++;
+                        }
+                        // EA reuse: compare to last committed EA of same static load
+                        auto itEA = lastLoadEA.find(load_pc);
+                        if (itEA != lastLoadEA.end() && itEA->second == load_addr) {
+                            stats.loadEAReused++;
+                        }
+                        lastLoadEA[load_pc] = load_addr;
+                        // Producer stability: only if this load had a forwarding producer
+                        if (head_inst->hasProducerStorePC()) {
+                            stats.loadsWithProducer++;
+                            const Addr prodPC = head_inst->producerStorePC();
+                            auto itP = lastLoadProducerStorePC.find(load_pc);
+                            if (itP != lastLoadProducerStorePC.end() && itP->second == prodPC) {
+                                stats.producerStable++;
+                            }
+                            lastLoadProducerStorePC[load_pc] = prodPC;
+
+                        // optional: clear after use to avoid confusing later stages
+                        head_inst->clearProducerStorePC();
+                        }
+                    }
 
-                // Check instruction execution if it successfully commits and
-                // is not carrying a fault.
-                if (cpu->checker) {
-                    cpu->checker->verify(head_inst);
-                }
 
-                cpu->traceFunctions(pc[tid]->instAddr());
-                traceOnCommit(tid, head_inst);
+                    // Check instruction execution if it successfully commits and
+                    // is not carrying a fault.
+                    if (cpu->checker) {
+                        cpu->checker->verify(head_inst);
+                    }
 
-                head_inst->staticInst->advancePC(*pc[tid]);
+                    cpu->traceFunctions(pc[tid]->instAddr());
+                    traceOnCommit(tid, head_inst);
 
-                // Keep track of the last sequence number commited
-                lastCommitedSeqNum[tid] = head_inst->seqNum;
+                    head_inst->staticInst->advancePC(*pc[tid]);
 
-                // If this is an instruction that doesn't play nicely with
-                // others squash everything and restart fetch
-                if (head_inst->isSquashAfter())
-                    squashAfter(tid, head_inst);
+                    // Keep track of the last sequence number committed
+                    lastCommitedSeqNum[tid] = head_inst->seqNum;
 
-                if (drainPending) {
-                    if (pc[tid]->microPC() == 0 && interrupt == NoFault &&
-                        !thread[tid]->trapPending) {
-                        // Last architectually committed instruction.
-                        // Squash the pipeline, stall fetch, and use
-                        // drainImminent to disable interrupts
-                        DPRINTF(Drain, "Draining: %i:%s\n", tid, *pc[tid]);
+                    // If this is an instruction that doesn't play nicely with
+                    // others squash everything and restart fetch
+                    if (head_inst->isSquashAfter())
                         squashAfter(tid, head_inst);
-                        cpu->commitDrained(tid);
-                        drainImminent = true;
-                    }
-                }
 
-                bool onInstBoundary = !head_inst->isMicroop() ||
-                                      head_inst->isLastMicroop() ||
-                                      !head_inst->isDelayedCommit();
-
-                if (onInstBoundary) {
-                    int count = 0;
-                    Addr oldpc;
-                    // Make sure we're not currently updating state while
-                    // handling PC events.
-                    assert(!thread[tid]->noSquashFromTC &&
-                           !thread[tid]->trapPending);
-                    do {
-                        oldpc = pc[tid]->instAddr();
-                        thread[tid]->pcEventQueue.service(
-                                oldpc, thread[tid]->getTC());
-                        count++;
-                    } while (oldpc != pc[tid]->instAddr());
-                    if (count > 1) {
-                        DPRINTF(Commit,
-                                "PC skip function event, stopping commit\n");
-                        break;
-                    }
-                        traceOnMacroCommit(tid);
+                    if (drainPending) {
+                        if (pc[tid]->microPC() == 0 && interrupt == NoFault &&
+                            !thread[tid]->trapPending) {
+                            // Last architecturally committed instruction.
+                            // Squash the pipeline, stall fetch, and use
+                            // drainImminent to disable interrupts
+                            DPRINTF(Drain, "Draining: %i:%s\n", tid, *pc[tid]);
+                            squashAfter(tid, head_inst);
+                            cpu->commitDrained(tid);
+                            drainImminent = true;
+                        }
                     }
 
-                // Check if an instruction just enabled interrupts and we've
-                // previously had an interrupt pending that was not handled
-                // because interrupts were subsequently disabled before the
-                // pipeline reached a place to handle the interrupt. In that
-                // case squash now to make sure the interrupt is handled.
-                //
-                // If we don't do this, we might end up in a live lock
-                // situation.
-                if (!interrupt && avoidQuiesceLiveLock &&
-                    onInstBoundary && cpu->checkInterrupts(0))
-                    squashAfter(tid, head_inst);
-            } else {
-                DPRINTF(Commit, "Unable to commit head instruction PC:%s "
-                        "[tid:%i] [sn:%llu].\n",
-                        head_inst->pcState(), tid ,head_inst->seqNum);
-                break;
+                    bool onInstBoundary = !head_inst->isMicroop() ||
+                                        head_inst->isLastMicroop() ||
+                                        !head_inst->isDelayedCommit();
+
+                    if (onInstBoundary) {
+                        int count = 0;
+                        Addr oldpc;
+                        // Make sure we're not currently updating state while
+                        // handling PC events.
+                        assert(!thread[tid]->noSquashFromTC &&
+                            !thread[tid]->trapPending);
+                        do {
+                            oldpc = pc[tid]->instAddr();
+                            thread[tid]->pcEventQueue.service(
+                                    oldpc, thread[tid]->getTC());
+                            count++;
+                        } while (oldpc != pc[tid]->instAddr());
+                        if (count > 1) {
+                            DPRINTF(Commit,
+                                    "PC skip function event, stopping commit\n");
+                            break;
+                        }
+                            traceOnMacroCommit(tid);
+                        }
+
+                    // Check if an instruction just enabled interrupts and we've
+                    // previously had an interrupt pending that was not handled
+                    // because interrupts were subsequently disabled before the
+                    // pipeline reached a place to handle the interrupt. In that
+                    // case squash now to make sure the interrupt is handled.
+                    //
+                    // If we don't do this, we might end up in a live lock
+                    // situation.
+                    if (!interrupt && avoidQuiesceLiveLock &&
+                        onInstBoundary && cpu->checkInterrupts(0))
+                        squashAfter(tid, head_inst);
+                } else {
+                    DPRINTF(Commit, "Unable to commit head instruction PC:%s "
+                            "[tid:%i] [sn:%llu].\n",
+                            head_inst->pcState(), tid ,head_inst->seqNum);
+                    break;
+                }
             }
         }
     }
@@ -1596,6 +1646,8 @@ Commit::diffInst(ThreadID tid, const DynInstPtr &inst) {
     cpu->diffInfo.physEffAddr = inst->physEffAddr;
     cpu->diffInfo.effSize = inst->effSize;
     cpu->diffInfo.goldenValue = inst->getGolden();
+    cpu->diffInfo.amoOldGoldenValue = inst->getAmoOldGoldenValue();  // per-instruction AMO old value
+    cpu->recordCommittedStore(tid, inst);  // NOTE(review): runs for every inst; confirm callee filters non-stores
     cpu->difftestStep(tid, inst->seqNum);
 }
 
@@ -1990,6 +2042,13 @@ Commit::squashInflightAndUpdateVersion(ThreadID tid)
     DPRINTF(Commit, "Squashing in-flight renamed instructions\n");
     for (unsigned i_idx = 0; i_idx < fromRename->size; i_idx++) {
         const DynInstPtr &inst = fromRename->insts[i_idx];
+        if (inst->threadNumber != tid) {  // inst belongs to another thread
+            DPRINTF(Commit,
+                    "[tid:%i] [sn:%llu] Preserving other-thread in-flight "
+                    "instruction during squash for tid %i\n",
+                    inst->threadNumber, inst->seqNum, tid);
+            continue;  // skip the squash handling below; only tid is squashed
+        }
         DPRINTF(Commit, "[tid:%i] [sn:%llu] Squashing in-flight "
                 "instruction PC %s\n",
                 inst->threadNumber, inst->seqNum, inst->pcState());
diff --git a/src/cpu/o3/commit.hh b/src/cpu/o3/commit.hh
index 465732ea0e..3c83b610e5 100644
--- a/src/cpu/o3/commit.hh
+++ b/src/cpu/o3/commit.hh
@@ -196,7 +196,7 @@ class Commit
     };
     std::list<BranchInfo> branchLog;
 
-    uint64_t lastCommitCycle = 0;
+    uint64_t lastCommitCycle[MaxThreads] = {0};  // one entry per thread slot (MaxThreads)
 
     EventFunctionWrapper stuckCheckEvent;
 
@@ -215,8 +215,6 @@ class Commit
     /** Returns the name of the Commit. */
     std::string name() const;
 
-    uint64_t getLastCommitCycle() const { return lastCommitCycle; }
-
     /** Registers probes. */
     void regProbePoints();
 
diff --git a/src/cpu/o3/fetch.cc b/src/cpu/o3/fetch.cc
index d2381123ab..f95738bd2c 100644
--- a/src/cpu/o3/fetch.cc
+++ b/src/cpu/o3/fetch.cc
@@ -592,10 +592,34 @@ Fetch::processMultiCacheLineCompletion(ThreadID tid, PacketPtr pkt)
         DPRINTF(Fetch, "[tid:%i] Waiting for remaining packets. Completed: %d, Total: %d\n",
                 tid, threads[tid].cacheReq.completedPackets, threads[tid].cacheReq.packets.size());
 
-        if (cacheBlocked && !retryPkt.empty()) {
-            DPRINTF(Fetch, "[tid:%i] Cache response arrived with queued retries pending; "
-                    "trying one response-driven retry pass\n", tid);
-            retryPendingIcacheRequests();
+        bool waitingOnRetry = false;
+        for (const auto status : threads[tid].cacheReq.requestStatus) {
+            if (status == CacheWaitRetry) {
+                waitingOnRetry = true;
+                break;
+            }
+        }
+
+        if (waitingOnRetry && cacheBlocked && !retryPkt.empty()) {
+            PacketPtr queuedPkt = retryPkt.front();
+            // Copy the request handle: a sent packet belongs to the receiver.
+            const auto retryReq = queuedPkt->req;
+            const ThreadID queuedTid = cpu->contextToThread(retryReq->contextId());
+            const bool sameThreadRetry = queuedTid == tid &&
+                threads[tid].cacheReq.findRequestIndex(retryReq) != SIZE_MAX;
+
+            if (sameThreadRetry && icachePort.sendTimingReq(queuedPkt)) {
+                DPRINTF(Fetch,
+                        "[tid:%i] Retrying matching queued I-cache packet %#lx "
+                        "after sibling response\n", tid, retryReq->getVaddr());
+                updateCacheRequestStatusByRequest(tid, retryReq,
+                                                  CacheWaitResponse);
+                ppFetchRequestSent->notify(retryReq);
+                retryPkt.erase(retryPkt.begin());
+                if (retryPkt.empty()) {
+                    cacheBlocked = false;
+                }
+            }
         }
 
         return false;  // Return false to indicate we're still waiting
@@ -2094,8 +2118,22 @@ Fetch::sendNextCacheRequest(ThreadID tid, const PCStateBase &pc_state) {
     assert(dbpbtb);
     const auto &stream = dbpbtb->ftqFetchingTarget(tid);
     const Addr start_pc = stream.startPC;
+    const Addr current_pc = pc_state.instAddr();
     threads[tid].startPC = start_pc;
 
+    if (current_pc < stream.startPC ||
+        current_pc >= stream.predEndPC) {  // fetch PC lies outside [startPC, predEndPC)
+        auto &reset_pc = threads[tid].fetchpc->as<RiscvISA::PCState>();
+        reset_pc.pc(stream.startPC);
+        reset_pc.npc(stream.startPC + 4);  // NOTE(review): assumes a 4-byte first instruction -- confirm
+        reset_pc.uReset();
+        DPRINTF(Fetch,
+                "[tid:%i] Resetting fetch PC to new FTQ stream start %s "
+                "(previous PC %#lx outside [%#lx, %#lx))\n",
+                tid, *threads[tid].fetchpc, current_pc,
+                stream.startPC, stream.predEndPC);
+    }
+
     DPRINTF(Fetch, "[tid:%i] Issuing a pipelined I-cache access for new FSQ entry, "
                   "starting at PC %#x (endPC %#x; original PC %s)\n",
             tid, start_pc, stream.predEndPC, pc_state);
diff --git a/src/cpu/o3/lsq.cc b/src/cpu/o3/lsq.cc
index 0f3b005a8f..e070f076d0 100644
--- a/src/cpu/o3/lsq.cc
+++ b/src/cpu/o3/lsq.cc
@@ -387,6 +387,7 @@ LSQ::LSQ(CPU *cpu_ptr, IEW *iew_ptr, const BaseO3CPUParams &params)
             params.StoreCompletionWidth);
         thread[tid].init(cpu, iew_ptr, params, this, tid);
         thread[tid].setDcachePort(&dcachePort);
+        _storeBufferFlushing[tid] = false;
     }
 
     std::vector<StoreBufferEntry *> store_buffer_entries;
@@ -757,18 +758,26 @@ LSQ::processWriteback()
 
 
     if (storeBufferBlocked()) {
-        // dont offload store to sbuffer when sbuffer is flushing
         DPRINTF(StoreBuffer, "Store buffer is blocking, skip SQ offload\n");
         return;
     }
+
     std::vector<uint32_t> offload_quota(numThreads, 0);
     std::vector<uint32_t> offload_demand(numThreads, 0);
     std::vector<ThreadID> requester_tids;
     requester_tids.reserve(activeThreads->size());
+    uint32_t sbuffer_flush_bitset = 0;  // bit <tid> set => active thread tid is flushing
+    for (ThreadID tid : *activeThreads) {
+        bool sbuffer_flushing = storeBufferFlushing(tid);
+        sbuffer_flush_bitset |= (sbuffer_flushing << tid);
+    }
+
     for (ThreadID tid : *activeThreads) {
         offload_demand[tid] = thread[tid].countStoreBufferOffloadableEntries(
             maxStoreBufferEntriesAcceptedFromSQPerCycle);
-        if (offload_demand[tid] != 0) {
+        // Hold back SQ offload while any *other* thread is flushing the sbuffer.
+        bool conti = (sbuffer_flush_bitset & ~(1 << tid)) == 0;  // mask out this thread's own bit
+        if (conti && offload_demand[tid] != 0) {
             requester_tids.push_back(tid);
         }
     }
@@ -812,17 +821,20 @@ LSQ::processWriteback()
         ThreadID tid = *threads++;
         thread[tid].offloadToStoreBuffer(offload_quota[tid]);
     }
-}
 
-void
-LSQ::storeBufferWriteback()
-{
-    bool can_evict = true;
+    // Once a requested flush has fully drained the store buffer, drop the
+    // flushing state; a stale flag would keep blocking SQ offload (deadlock).
+    if (storeBufferFlushing() && storeBuffer.size() == 0) [[unlikely]] {
+        assert(storeBuffer.unsentSize() == 0);
+        clearStoreBufferFlushing();
+        cpu->activityThisCycle();
+    }
+}
+
+void
+LSQ::storeBufferWriteback()
+{
+    bool can_evict = true;
 
     // write request will stall one cycle
     // so 2 cycle send one write request
@@ -1536,7 +1548,7 @@ LSQ::hasStoresToWB(ThreadID tid)
 
 bool LSQ::flushStores(ThreadID tid)
 {
-    _storeBufferFlushing = true;
+    _storeBufferFlushing[tid] = true;  // flag only the requesting thread as flushing
     // TODO：high performance shared SMT storebuffer flushing
     bool t = !hasStoresToWB(tid) && storeBufferEmpty();
     return t;
diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh
index 604df7c0f1..fc2c73a80c 100644
--- a/src/cpu/o3/lsq.hh
+++ b/src/cpu/o3/lsq.hh
@@ -64,6 +64,7 @@
 #include "cpu/inst_seq.hh"
 #include "cpu/o3/dyn_inst_ptr.hh"
 #include "cpu/o3/dyn_inst_xsmeta.hh"
+#include "cpu/o3/limits.hh"
 #include "cpu/utils.hh"
 #include "enums/SMTQueuePolicy.hh"
 #include "mem/packet.hh"
@@ -1081,8 +1082,21 @@ class LSQ
     bool getDcacheWriteStall() { return dcacheWriteStall; }
     StoreBuffer &getStoreBuffer() { return storeBuffer; }
     bool storeBufferEmpty() const { return storeBuffer.size() == 0; }
-    bool storeBufferFlushing() const { return _storeBufferFlushing; }
-    void clearStoreBufferFlushing() { _storeBufferFlushing = false; }
+    /** True while thread @p tid has a store-buffer flush outstanding. */
+    bool storeBufferFlushing(ThreadID tid) const { return _storeBufferFlushing[tid]; }
+    /** True if any currently active thread has a flush outstanding. */
+    bool storeBufferFlushing() const
+    {
+        for (auto tid : *activeThreads) {
+            if (_storeBufferFlushing[tid]) return true;
+        }
+        return false;
+    }
+    void clearStoreBufferFlushing(ThreadID tid) { _storeBufferFlushing[tid] = false; }
+    /** Clear every thread slot, active or not, so no stale flag survives. */
+    void clearStoreBufferFlushing() {
+        for (ThreadID tid = 0; tid < MaxThreads; ++tid) _storeBufferFlushing[tid] = false;
+    }
     uint32_t getSbufferEvictThreshold() const { return sbufferEvictThreshold; }
     uint32_t getSbufferEntries() const { return sbufferEntries; }
     uint64_t getStoreBufferInactiveCycles() const
@@ -1171,7 +1185,7 @@ class LSQ
     const uint64_t storeBufferInactiveThreshold;
     const uint32_t maxStoreBufferEntriesAcceptedFromSQPerCycle = 2;
     StoreBuffer storeBuffer;
-    bool _storeBufferFlushing = false;
+    bool _storeBufferFlushing[MaxThreads] = {false};
     uint64_t storeBufferWritebackInactive = 0;
     StoreBufferEntry *blockedSbufferEntry = nullptr;
     ThreadID nextStoreBufferOffloadTid = InvalidThreadID;
diff --git a/src/cpu/o3/lsq_unit.cc b/src/cpu/o3/lsq_unit.cc
index 9cfc4d791f..6be535e5df 100644
--- a/src/cpu/o3/lsq_unit.cc
+++ b/src/cpu/o3/lsq_unit.cc
@@ -349,20 +349,47 @@ LSQUnit::completeDataAccess(PacketPtr pkt)
         if (inst->isLoad() || inst->isAtomic()) {
             Addr addr = pkt->getAddr();
             auto [enable_diff, diff_all_states] = cpu->getDiffAllStates();
+            if (system->multiContextDifftest() && enable_diff &&
+                request->_sbufferBypass &&
+                inst->isLoad() &&
+                cpu->goldenMemManager()->inPmem(addr)) {
+                // A store-forwarded load may legitimately observe a value that
+                // is newer than the current shared golden memory snapshot.
+                // Keep the observed value on the instruction so difftest can
+                // repair the reference state for this hart if needed.
+                inst->setGolden(pkt->getPtr<uint8_t>());  // golden := bytes this load received
+            }
             if (system->multiContextDifftest() && enable_diff &&
                 !request->_sbufferBypass &&
                 cpu->goldenMemManager()->inPmem(addr)) {
-                // check data with golden mem
-                uint8_t *golden_data = (uint8_t *)cpu->goldenMemManager()->guestToHost(addr);
                 uint8_t *loaded_data = pkt->getPtr<uint8_t>();
                 size_t size = pkt->getSize();
-                if (memcmp(golden_data, loaded_data, size) == 0) {
-                    assert(size == inst->effSize);
-                    inst->setGolden(golden_data);
+                assert(size == inst->effSize);
+
+                if (inst->isAtomic()) {  // AMO: snapshot golden memory into the inst as its "old" value
+                    uint8_t *golden_old =
+                        reinterpret_cast<uint8_t *>(inst->getAmoOldGoldenValuePtr());
+                    cpu->goldenMemManager()->readGoldenMem(addr, golden_old, size);
+                    if (memcmp(golden_old, loaded_data, size) != 0) {  // response must equal that snapshot
+                        panic("[tid:%d] [sn:%llu] Atomic old value error at addr %#lx, "
+                              "size %d. %s\n",
+                              inst->threadNumber, inst->seqNum, addr, size,
+                              goldenDiffStr(loaded_data, golden_old, size).c_str());
+                    }
                 } else {
-                    panic("Data error at addr %#lx, size %d. %s\n",
-                        addr, size,
-                        goldenDiffStr(loaded_data, golden_data, size).c_str());
+                    // Non-atomic access: compare the response with golden memory.
+                    uint8_t *golden_data =
+                        (uint8_t *)cpu->goldenMemManager()->guestToHost(addr);
+                    if (memcmp(golden_data, loaded_data, size) == 0) {
+                        inst->setGolden(golden_data);
+                    } else {  // mismatch is only logged; no setGolden() on this path
+                        DPRINTF(Diff,
+                                "[tid:%d] [sn:%llu] Load sees value different from "
+                                "current golden memory at addr %#lx, size %d. "
+                                "Treating as concurrent update window. %s\n",
+                                inst->threadNumber, inst->seqNum, addr, size,
+                                goldenDiffStr(loaded_data, golden_data, size).c_str());
+                    }
                 }
             }
         }
@@ -2016,6 +2043,7 @@ LSQUnit::offloadToStoreBuffer(uint32_t max_entries)
 {
     assert(!lsq->storeBufferBlocked());
     if (isStoreBlocked) return;
+    if (max_entries == 0) return;  // zero quota: nothing may be offloaded on this call
 
     uint32_t accepted_entries = 0;
     while (storesToWB > 0 &&
@@ -2527,23 +2555,21 @@ LSQUnit::completeStore(typename StoreQueue::iterator store_idx, bool from_sbuffe
                                                      request->_size);
         } else {
             uint8_t tmp_data[8];
-            memset(tmp_data, 0, 8);
-            memcpy(tmp_data, store_inst->memData, request->_size);
+            memset(tmp_data, 0, sizeof(tmp_data));
             assert(request->req()->getAtomicOpFunctor());
 
-            // read golden memory to get the global latest value before this AMO is executed for further compare
-            cpu->goldenMemManager()->readGoldenMem(paddr,
-                                                   store_inst->getAmoOldGoldenValuePtr(), request->_size);
+            // The AMO response returns the old memory value. Capture it on the
+            // instruction so commit/difftest can use a per-inst copy under SMT.
             cpu->diffInfo.amoOldGoldenValue = store_inst->getAmoOldGoldenValue();
+            memcpy(tmp_data, store_inst->getAmoOldGoldenValuePtr(), request->_size);
 
-            // before amo operate on golden memory
             (*(request->req()->getAtomicOpFunctor()))(tmp_data);
-            // after amo operate on golden memory
 
             DPRINTF(LSQUnit, "AMO writing to golden memory at addr %#x, data %#lx, mask %#x, size %d\n",
                     paddr, *((uint64_t *)(tmp_data)), 0xff, request->_size);
             cpu->goldenMemManager()->updateGoldenMem(paddr, tmp_data, 0xff,
                                                      request->_size);
+            store_inst->setGolden(tmp_data);  // golden := value produced by applying the AMO functor
         }
     }
 
diff --git a/src/cpu/o3/rob.cc b/src/cpu/o3/rob.cc
index 4e007804c2..410d7dcfac 100644
--- a/src/cpu/o3/rob.cc
+++ b/src/cpu/o3/rob.cc
@@ -297,15 +297,23 @@ ROB::countInsts(ThreadID tid)
     return instList[tid].size();
 }
 
+uint32_t
+ROB::countInstsOfGroups(ThreadID tid, int groups)
+{
+    // Sum threadGroups[tid] from the front, visiting at most `groups` entries.
+    int total = 0;
+    auto &grp = threadGroups[tid];
+    for (auto g = grp.begin(); groups > 0 && g != grp.end(); ++g, --groups)
+        total += *g;
+    return total;
+}
+
 uint32_t
 ROB::countInstsOfGroups(int groups)
 {
     int sum = 0;
     for (ThreadID tid = 0; tid < numThreads; tid++) {
-        auto it = threadGroups[tid].begin();
-        for (int i = 0; i < groups && it != threadGroups[tid].end(); i++, it++) {
-            sum += *it;
-        }
+        sum += countInstsOfGroups(tid, groups);
     }
     return sum;
 }
diff --git a/src/cpu/o3/rob.hh b/src/cpu/o3/rob.hh
index d9b3e9999b..1fdcbf0857 100644
--- a/src/cpu/o3/rob.hh
+++ b/src/cpu/o3/rob.hh
@@ -256,6 +256,7 @@ class ROB
         return sum;
     }
 
+    uint32_t countInstsOfGroups(ThreadID tid, int groups);
     uint32_t countInstsOfGroups(int groups);
 
     bool (ROB::*allocateNewGroup)(const DynInstPtr inst, ThreadID tid);
diff --git a/src/cpu/pred/BranchPredictor.py b/src/cpu/pred/BranchPredictor.py
index f25b77be68..3171928e1b 100644
--- a/src/cpu/pred/BranchPredictor.py
+++ b/src/cpu/pred/BranchPredictor.py
@@ -1021,6 +1021,7 @@ class BTBRAS(TimedBaseBTBPredictor):
     cxx_class = 'gem5::branch_prediction::btb_pred::BTBRAS'
     cxx_header = 'cpu/pred/btb/ras.hh'
 
+    numThreads = Param.Unsigned(Parent.numThreads, "Number of threads")
     numEntries = Param.Unsigned(32, "Number of entries in the RAS")
     ctrWidth = Param.Unsigned(8, "Width of the counter")
     numInflightEntries = Param.Unsigned(384, "Number of inflight entries")
diff --git a/src/cpu/pred/btb/decoupled_bpred.cc b/src/cpu/pred/btb/decoupled_bpred.cc
index 2e272047eb..bb87772263 100644
--- a/src/cpu/pred/btb/decoupled_bpred.cc
+++ b/src/cpu/pred/btb/decoupled_bpred.cc
@@ -212,11 +212,12 @@ DecoupledBPUWithBTB::requestNewPrediction(ThreadID tid)
 
     DPRINTF(Override, "Requesting new prediction for PC %#lx\n", thread.s0PC);
 
-
-    // Initialize prediction state for each stage
+    // Reset all stage-local prediction fields before components fill them.
+    clearPreds(tid);
     for (int i = 0; i < numStages; i++) {
         predsOfEachStage[i].tid = tid;
         predsOfEachStage[i].bbStart = thread.s0PC;
+        predsOfEachStage[i].predSource = i;  // NOTE(review): clearPreds(tid) above also sets predSource -- confirm if redundant
     }
 
     // Query each predictor component with current PC and history
diff --git a/src/cpu/pred/btb/decoupled_bpred.hh b/src/cpu/pred/btb/decoupled_bpred.hh
index 38adad4115..2552ce9e44 100644
--- a/src/cpu/pred/btb/decoupled_bpred.hh
+++ b/src/cpu/pred/btb/decoupled_bpred.hh
@@ -187,10 +187,9 @@ class DecoupledBPUWithBTB : public BPredUnit
     void generateFinalPredAndCreateBubbles(ThreadID tid);
 
     void clearPreds(ThreadID tid) {
-        for (auto &stagePred : threads[tid].predsOfEachStage) {
-            stagePred.condTakens.clear();
-            stagePred.indirectTargets.clear();
-            stagePred.btbEntries.clear();
+        for (int i = 0; i < threads[tid].predsOfEachStage.size(); ++i) {
+            threads[tid].predsOfEachStage[i] = FullBTBPrediction();  // reset every field to its default
+            threads[tid].predsOfEachStage[i].predSource = i;  // tag the slot with its stage index
         }
     }
 
diff --git a/src/cpu/pred/btb/ras.cc b/src/cpu/pred/btb/ras.cc
index 7f279bdb8e..8dd5b80aea 100644
--- a/src/cpu/pred/btb/ras.cc
+++ b/src/cpu/pred/btb/ras.cc
@@ -21,28 +21,13 @@ namespace btb_pred {
             : TimedBaseBTBPredictor(),
               numEntries(numEntries),
               ctrWidth(ctrWidth),
-              numInflightEntries(numInflightEntries)
+              numInflightEntries(numInflightEntries),
+              maxCtr((1 << ctrWidth) - 1),
+              numThreads(1),
+              threadStates(numThreads)
         {
-            // Initialize RAS state
-            ssp = 0;
-            nsp = 0;
-            sctr = 0;
-            stack.resize(numEntries);
-            maxCtr = (1 << ctrWidth) - 1;
-            TOSW = 0;
-            TOSR = 0;
-            inflightPtrDec(TOSR);
-            BOS = 0;
-            inflightStack.resize(numInflightEntries);
-
-            // Initialize stack entries
-            for (auto &entry : stack) {
-                entry.data.ctr = 0;
-                entry.data.retAddr = 0x80000000L;
-            }
-            for (auto &entry : inflightStack) {
-                entry.data.ctr = 0;
-                entry.data.retAddr = 0x80000000L;
+            for (auto &state : threadStates) {
+                initThreadState(state);
             }
         }
 #else
@@ -51,49 +36,61 @@ namespace btb_pred {
         : TimedBaseBTBPredictor(p),
           numEntries(p.numEntries),
           ctrWidth(p.ctrWidth),
-        numInflightEntries(p.numInflightEntries),
-        rasStats(this)
+          numInflightEntries(p.numInflightEntries),
+          maxCtr((1 << ctrWidth) - 1),
+          numThreads(p.numThreads),
+          threadStates(numThreads),
+          rasStats(this)
     {
-        // Initialize RAS state
-        ssp = 0;
-        nsp = 0;
-        sctr = 0;
-        stack.resize(numEntries);
-        maxCtr = (1 << ctrWidth) - 1;
-        TOSW = 0;
-        TOSR = 0;
-        inflightPtrDec(TOSR);
-        BOS = 0;
-        inflightStack.resize(numInflightEntries);
-
-        // Initialize stack entries
-        for (auto &entry : stack) {
-            entry.data.ctr = 0;
-            entry.data.retAddr = 0x80000000L;
-        }
-        for (auto &entry : inflightStack) {
-            entry.data.ctr = 0;
-            entry.data.retAddr = 0x80000000L;
+        for (auto &state : threadStates) {
+            initThreadState(state);
         }
     }
 #endif
 
 void
-BTBRAS::checkCorrectness() {
+BTBRAS::initThreadState(ThreadRASState &state)
+{
+    state.TOSW = 0;
+    state.TOSR = 0;
+    inflightPtrDec(state.TOSR);  // step TOSR back once from slot 0
+    state.BOS = 0;
+    state.ssp = 0;
+    state.nsp = 0;
+    state.sctr = 0;
+    state.meta.reset();  // no prediction metadata until putPCHistory()
+    // Size both structures from the predictor-wide configuration.
+    state.stack.resize(numEntries);
+    state.inflightStack.resize(numInflightEntries);
+    // Every entry starts with counter 0 and return address 0x80000000.
+    for (auto &entry : state.stack) {
+        entry.data.ctr = 0;
+        entry.data.retAddr = 0x80000000L;
+    }
+    for (auto &entry : state.inflightStack) {
+        entry.data.ctr = 0;
+        entry.data.retAddr = 0x80000000L;
+        entry.nos = 0;
+    }
+}
+
+void
+BTBRAS::checkCorrectness(ThreadID tid) {
+    auto &state = threadStates[tid];
     /*
-    auto tosr = TOSR;
-    int checkssp = ssp;
-    while (inflightInRange(tosr)) {
-        if (!inflightStack[tosr].data.ctr) {
+    auto tosr = state.TOSR;
+    int checkssp = state.ssp;
+    while (inflightInRange(state, tosr)) {
+        if (!state.inflightStack[tosr].data.ctr) {
             checkssp = (checkssp - 1 + numEntries) % numEntries;
         } else {
             // just dec sctr, fixme here
         }
-        tosr = inflightStack[tosr].nos;
+        tosr = state.inflightStack[tosr].nos;
     }
-    if (checkssp != (nsp + numEntries - 1) % numEntries) {
+    if (checkssp != (state.nsp + numEntries - 1) % numEntries) {
         DPRINTF(RAS, "NSP and SSP check failed\n");
-        printStack("checkCorrectness");
+        printStack("checkCorrectness", tid);
     }*/
 }
 
@@ -102,15 +99,19 @@ BTBRAS::putPCHistory(Addr startAddr, const boost::dynamic_bitset<> &history,
                   std::vector<FullBTBPrediction> &stagePreds)
 {
     assert(getDelay() < stagePreds.size());
-    meta = std::make_shared<RASMeta>();
+    const ThreadID tid = stagePreds.back().tid;
+    assert(tid < numThreads);
+    auto &state = threadStates[tid];
+    state.meta = std::make_shared<RASMeta>();
     DPRINTFR(RAS, "putPC startAddr %lx", startAddr);
-    // checkCorrectness();
+    // checkCorrectness(tid);
+    auto top = getTop_meta(tid);
     for (int i = getDelay(); i < stagePreds.size(); i++) {
-        stagePreds[i].returnTarget = getTop_meta().retAddr; // stack[sp].retAddr;
+        stagePreds[i].returnTarget = top.retAddr;
     }
     /*
     if (stagePreds.back().btbEntry.slots[0].isCall || stagePreds.back().btbEntry.slots[0].isReturn || stagePreds.back().btbEntry.slots[1].isCall || stagePreds.back().btbEntry.slots[1].isReturn) {
-        printStack("putPCHistory");
+        printStack("putPCHistory", tid);
     }
     */
 }
@@ -118,13 +119,19 @@ BTBRAS::putPCHistory(Addr startAddr, const boost::dynamic_bitset<> &history,
 std::shared_ptr<void>
 BTBRAS::getPredictionMeta(ThreadID tid)
 {
-    (void)tid;
-    return meta;
+    if (tid >= threadStates.size()) {  // no RAS state for this thread id
+        return nullptr;
+    }
+    return threadStates[tid].meta;  // this thread's RASMeta; may be null
 }
 
 void
 BTBRAS::specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred)
 {
+    const ThreadID tid = pred.tid;
+    assert(tid < numThreads);
+    auto &state = threadStates[tid];
+    assert(state.meta);
     // do push & pops on prediction
     // pred.returnTarget = stack[sp].retAddr;
     auto takenEntry = pred.getTakenEntry();
@@ -132,11 +139,11 @@ BTBRAS::specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPrediction
 
     if (takenEntry.isCall) {
         Addr retAddr = takenEntry.pc + takenEntry.size;
-        push(retAddr);
+        push(tid, retAddr);
     }
     if (takenEntry.isReturn) {
         // do pop
-        pop();
+        pop(tid);
     }
     if (takenEntry.isCall) {
         DPRINTFR(RAS, "IsCall spec PC %lx\n", takenEntry.pc);
@@ -146,36 +153,39 @@ BTBRAS::specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPrediction
     }
     
     if (takenEntry.isCall || takenEntry.isReturn)
-        printStack("after specUpdateHist");
-    DPRINTFR(RAS, "meta TOSR %d TOSW %d\n", meta->TOSR, meta->TOSW);
+        printStack("after specUpdateHist", tid);
+    DPRINTFR(RAS, "meta TOSR %d TOSW %d\n", state.meta->TOSR, state.meta->TOSW);
 }
 
 void
 BTBRAS::recoverHist(const boost::dynamic_bitset<> &history, const FetchTarget &entry, int shamt, bool cond_taken)
 {
+    const ThreadID tid = entry.tid;
+    assert(tid < numThreads);
+    auto &state = threadStates[tid];
     auto takenEntry = entry.exeBranchInfo;
     /*
     if (takenEntry.isCall || takenEntry.isReturn) {
-        printStack("before recoverHist");
+        printStack("before recoverHist", tid);
     }*/
     // recover sp and tos first
     auto meta_ptr = std::static_pointer_cast<RASMeta>(entry.predMetas[getComponentIdx()]);
     DPRINTF(RAS, "recover called, meta TOSR %d TOSW %d ssp %d sctr %u entry PC %lx end PC %lx\n",
         meta_ptr->TOSR, meta_ptr->TOSW, meta_ptr->ssp, meta_ptr->sctr, entry.startPC, entry.predEndPC);
 
-    TOSR = meta_ptr->TOSR;
-    TOSW = meta_ptr->TOSW;
-    ssp = meta_ptr->ssp;
-    sctr = meta_ptr->sctr;
+    state.TOSR = meta_ptr->TOSR;
+    state.TOSW = meta_ptr->TOSW;
+    state.ssp = meta_ptr->ssp;
+    state.sctr = meta_ptr->sctr;
     Addr retAddr = takenEntry.pc + takenEntry.size;
 
     // do push & pops on control squash
     if (entry.exeTaken) {
         if (takenEntry.isCall) {
-            push(retAddr);
+            push(tid, retAddr);
         }
         if (takenEntry.isReturn) {
-            pop();
+            pop(tid);
             //TOSW = (TOSR + 1) % numInflightEntries;
         }
     }
@@ -187,7 +197,7 @@ BTBRAS::recoverHist(const boost::dynamic_bitset<> &history, const FetchTarget &e
             DPRINTF(RAS, "IsRet expect target %lx, preded %lx, pred taken %d pred target %lx\n",
                 takenEntry.target, meta_ptr->target, entry.predTaken, entry.predBranchInfo.target);
         }
-        printStack("after recoverHist");
+        printStack("after recoverHist", tid);
     }
     
 }
@@ -195,83 +205,89 @@ BTBRAS::recoverHist(const boost::dynamic_bitset<> &history, const FetchTarget &e
 void
 BTBRAS::update(const FetchTarget &entry)
 {
+    const ThreadID tid = entry.tid;
+    assert(tid < numThreads);
+    auto &state = threadStates[tid];
     auto meta_ptr = std::static_pointer_cast<RASMeta>(entry.predMetas[getComponentIdx()]);
     auto takenEntry = entry.exeBranchInfo;
     if (entry.exeTaken) {
-        if (meta_ptr->ssp != nsp || meta_ptr->sctr != stack[nsp].data.ctr) {
+        if (meta_ptr->ssp != state.nsp || meta_ptr->sctr != state.stack[state.nsp].data.ctr) {
             DPRINTF(RAS, "ssp and nsp mismatch, recovering, ssp = %d, sctr = %d, nsp = %d, nctr = %d\n",
-                meta_ptr->ssp, meta_ptr->sctr, nsp, stack[nsp].data.ctr);
-            nsp = meta_ptr->ssp;
+                meta_ptr->ssp, meta_ptr->sctr, state.nsp, state.stack[state.nsp].data.ctr);
+            state.nsp = meta_ptr->ssp;
         } else
             DPRINTF(RAS, "ssp and nsp match, ssp = %d, sctr = %d, nsp = %d, nctr = %d\n",
-                meta_ptr->ssp, meta_ptr->sctr, nsp, stack[nsp].data.ctr);
+                meta_ptr->ssp, meta_ptr->sctr, state.nsp, state.stack[state.nsp].data.ctr);
         if (takenEntry.isCall) {
             DPRINTF(RAS, "real update call BTB hit %d meta TOSR %d TOSW %d\n entry PC %lx",
                 entry.isHit, meta_ptr->TOSR, meta_ptr->TOSW, entry.startPC);
             Addr retAddr = takenEntry.pc + takenEntry.size;
-            push_stack(retAddr);
-            BOS = inflightPtrPlus1(meta_ptr->TOSW);
+            push_stack(tid, retAddr);
+            state.BOS = inflightPtrPlus1(meta_ptr->TOSW);
         }
         if (takenEntry.isReturn) {
             DPRINTF(RAS, "update ret entry PC %lx\n", entry.startPC);
-            pop_stack();
+            pop_stack(tid);
         }
     }
     if (takenEntry.isCall || takenEntry.isReturn) {
-        printStack("after update(commit)");
+        printStack("after update(commit)", tid);
     }
 }
 
 void
-BTBRAS::push_stack(Addr retAddr)
+BTBRAS::push_stack(ThreadID tid, Addr retAddr)
 {
-    auto tos = stack[nsp];
+    auto &state = threadStates[tid];  // only this thread's stack is touched
+    auto tos = state.stack[state.nsp];  // copy of the entry at nsp
     if (tos.data.retAddr == retAddr && tos.data.ctr < maxCtr) {
-        stack[nsp].data.ctr++;
+        state.stack[state.nsp].data.ctr++;  // same address: bump its counter
     } else {
         // push new entry
-        ptrInc(nsp);
-        stack[nsp].data.retAddr = retAddr;
-        stack[nsp].data.ctr = 0;
+        ptrInc(state.nsp);
+        state.stack[state.nsp].data.retAddr = retAddr;
+        state.stack[state.nsp].data.ctr = 0;
     }
     // ++ndepth;
 }
 
 void
-BTBRAS::push(Addr retAddr)
+BTBRAS::push(ThreadID tid, Addr retAddr)
 {
+    auto &state = threadStates[tid];
     rasStats.Pushes++;
     DPRINTF(RAS, "doing push ");
     // update ssp and sctr first
     // meta has recorded their old value
-    auto topAddr = getTop();
-    if (retAddr == topAddr.retAddr && sctr < maxCtr) {
-        sctr++;
+    auto topAddr = getTop(tid);
+    if (retAddr == topAddr.retAddr && state.sctr < maxCtr) {
+        state.sctr++;
     } else {
-        ptrInc(ssp);
-        sctr = 0;
+        ptrInc(state.ssp);
+        state.sctr = 0;
         // do not update non-spec stack here
     }
 
     // push will always enter inflight queue
     RASInflightEntry t;
     t.data.retAddr = retAddr;
-    t.data.ctr = sctr;
-    t.nos = TOSR;
-    inflightStack[TOSW] = t;
-    TOSR = TOSW;
-    inflightPtrInc(TOSW);
+    t.data.ctr = state.sctr;
+    t.nos = state.TOSR;
+    state.inflightStack[state.TOSW] = t;
+    state.TOSR = state.TOSW;
+    inflightPtrInc(state.TOSW);
 }
 
 void
-BTBRAS::pop_stack()
+BTBRAS::pop_stack(ThreadID tid)
 {
+    auto &state = threadStates[tid];
     //if (ndepth) {
-    auto tos = stack[nsp];
+    auto tos = state.stack[state.nsp];
     if (tos.data.ctr > 0) {
-        stack[nsp].data.ctr--;
+        state.stack[state.nsp].data.ctr--;
     } else {
-        ptrDec(nsp);
+        ptrDec(state.nsp);
     }
     //--ndepth;
     //} else {
@@ -281,30 +297,31 @@ BTBRAS::pop_stack()
 }
 
 void
-BTBRAS::pop()
+BTBRAS::pop(ThreadID tid)
 {
+    auto &state = threadStates[tid];
     // DPRINTFR(RAS, "doing pop ndepth = %d", ndepth);
     rasStats.Pops++;
     // pop may need to deal with committed stack
-    if (inflightInRange(TOSR)) {
-        DPRINTF(RAS, "Select from inflight, addr %lx\n", inflightStack[TOSR].data.retAddr);
-        TOSR = inflightStack[TOSR].nos;
-        if (sctr > 0) {
-            sctr--; 
+    if (inflightInRange(state, state.TOSR)) {
+        DPRINTF(RAS, "Select from inflight, addr %lx\n", state.inflightStack[state.TOSR].data.retAddr);
+        state.TOSR = state.inflightStack[state.TOSR].nos;
+        if (state.sctr > 0) {
+            state.sctr--;
         } else {
-            ptrDec(ssp);
-            auto newTop = getTop();
-            sctr = newTop.ctr;
+            ptrDec(state.ssp);
+            auto newTop = getTop(tid);
+            state.sctr = newTop.ctr;
         }
     } else /*if (ndepth)*/ {
         // TOSR not valid, operate on committed stack
         DPRINTF(RAS, "in committed range\n");
-        if (sctr > 0) {
-            sctr--;
+        if (state.sctr > 0) {
+            state.sctr--;
         } else {
-            ptrDec(ssp);
-            auto newTop = getTop();
-            sctr = newTop.ctr;
+            ptrDec(state.ssp);
+            auto newTop = getTop(tid);
+            state.sctr = newTop.ctr;
         }
     }
     //else {
@@ -352,12 +369,12 @@ BTBRAS::inflightPtrPlus1(int ptr) {
 }
 
 bool
-BTBRAS::inflightInRange(int &ptr)
+BTBRAS::inflightInRange(const ThreadRASState &state, int ptr)
 {
-    if (TOSW > BOS) {
-        return ptr >= BOS && ptr < TOSW;
-    } else if (TOSW < BOS) {
-        return ptr < TOSW || ptr >= BOS;
+    if (state.TOSW > state.BOS) {
+        return ptr >= state.BOS && ptr < state.TOSW;
+    } else if (state.TOSW < state.BOS) {
+        return ptr < state.TOSW || ptr >= state.BOS;
     } else {
         // empty inflight queue
         return false;
@@ -365,64 +382,79 @@ BTBRAS::inflightInRange(int &ptr)
 }
 
 BTBRAS::RASEssential
-BTBRAS::getTop()
+BTBRAS::getTop(ThreadID tid)
 {
+    auto &state = threadStates[tid];
     // results may come from two sources: inflight queue and committed stack
-    if (inflightInRange(TOSR)) {
+    if (inflightInRange(state, state.TOSR)) {
         // result come from inflight queue
-        DPRINTF(RAS, "Select from inflight, addr %lx\n", inflightStack[TOSR].data.retAddr);
+        DPRINTF(RAS, "Select from inflight, addr %lx\n",
+                state.inflightStack[state.TOSR].data.retAddr);
         // additional check: if nos is out of bound, check if commit stack top == inflight[nos]
         /*
-        if (!inflightInRange(inflightStack[TOSR].nos)) {
-            auto top = stack[nsp];
-            if (top.data.retAddr != inflightStack[inflightStack[TOSR].nos].data.retAddr || top.data.ctr != inflightStack[inflightStack[TOSR].nos].data.ctr) {
+        if (!inflightInRange(state, state.inflightStack[state.TOSR].nos)) {
+            auto top = state.stack[state.nsp];
+            if (top.data.retAddr !=
+                    state.inflightStack[
+                        state.inflightStack[state.TOSR].nos].data.retAddr ||
+                top.data.ctr !=
+                    state.inflightStack[
+                        state.inflightStack[state.TOSR].nos].data.ctr) {
                 // inflight[nos] is not the same as stack[nsp]
                 DPRINTF(RAS, "Error: inflight[nos] is not the same as stack[nsp]\n");
-                printStack("Error case stack dump");
+                printStack("Error case stack dump", tid);
             }
         }*/
 
-        return inflightStack[TOSR].data;
+        return state.inflightStack[state.TOSR].data;
     } else {
         // result come from commit queue
-        DPRINTF(RAS, "Select from stack, addr %lx\n", stack[ssp].data.retAddr);
-        return stack[ssp].data;
+        DPRINTF(RAS, "Select from stack, addr %lx\n", state.stack[state.ssp].data.retAddr);
+        return state.stack[state.ssp].data;
     }
 }
 
 BTBRAS::RASEssential
-BTBRAS::getTop_meta() {
+BTBRAS::getTop_meta(ThreadID tid) {
+    auto &state = threadStates[tid];
+    assert(state.meta);
     // results may come from two sources: inflight queue and committed stack
-    if (inflightInRange(TOSR)) {
+    if (inflightInRange(state, state.TOSR)) {
         // result come from inflight queue
-        DPRINTF(RAS, "Select from inflight, addr %lx\n", inflightStack[TOSR].data.retAddr);
-        meta->ssp = ssp;
-        meta->sctr = sctr;
-        meta->TOSR = TOSR;
-        meta->TOSW = TOSW;
-        meta->target = inflightStack[TOSR].data.retAddr;
+        DPRINTF(RAS, "Select from inflight, addr %lx\n",
+                state.inflightStack[state.TOSR].data.retAddr);
+        state.meta->ssp = state.ssp;
+        state.meta->sctr = state.sctr;
+        state.meta->TOSR = state.TOSR;
+        state.meta->TOSW = state.TOSW;
+        state.meta->target = state.inflightStack[state.TOSR].data.retAddr;
 
         // additional check: if nos is out of bound, check if commit stack top == inflight[nos]
         /*
-        if (!inflightInRange(inflightStack[TOSR].nos)) {
-            auto top = stack[nsp];
-            if (top.data.retAddr != inflightStack[inflightStack[TOSR].nos].data.retAddr || top.data.ctr != inflightStack[inflightStack[TOSR].nos].data.ctr) {
+        if (!inflightInRange(state, state.inflightStack[state.TOSR].nos)) {
+            auto top = state.stack[state.nsp];
+            if (top.data.retAddr !=
+                    state.inflightStack[
+                        state.inflightStack[state.TOSR].nos].data.retAddr ||
+                top.data.ctr !=
+                    state.inflightStack[
+                        state.inflightStack[state.TOSR].nos].data.ctr) {
                 // inflight[nos] is not the same as stack[nsp]
                 DPRINTF(RAS, "Error: inflight[nos] is not the same as stack[nsp]\n");
-                printStack("Error case stack dump");
+                printStack("Error case stack dump", tid);
             }
         }*/
 
-        return inflightStack[TOSR].data;
+        return state.inflightStack[state.TOSR].data;
     } else {
         // result come from commit queue
-        meta->ssp = ssp;
-        meta->sctr = sctr;
-        meta->TOSR = TOSR;
-        meta->TOSW = TOSW;
-        meta->target = stack[ssp].data.retAddr;
-        DPRINTF(RAS, "Select from stack, addr %lx\n", stack[ssp].data.retAddr);
-        return stack[ssp].data;
+        state.meta->ssp = state.ssp;
+        state.meta->sctr = state.sctr;
+        state.meta->TOSR = state.TOSR;
+        state.meta->TOSW = state.TOSW;
+        state.meta->target = state.stack[state.ssp].data.retAddr;
+        DPRINTF(RAS, "Select from stack, addr %lx\n", state.stack[state.ssp].data.retAddr);
+        return state.stack[state.ssp].data;
     }
 }
 
diff --git a/src/cpu/pred/btb/ras.hh b/src/cpu/pred/btb/ras.hh
index b0b31c6d94..19bb1f0e15 100644
--- a/src/cpu/pred/btb/ras.hh
+++ b/src/cpu/pred/btb/ras.hh
@@ -112,14 +112,28 @@ namespace btb_pred {
         Addr getTopAddrFromMetas(const FetchTarget &stream);
 
     private:
+        struct ThreadRASState // RAS state kept separately for each thread
+        {
+            int TOSW = 0; // inflight pointer to the write top of stack
+            int TOSR = 0; // inflight pointer to the read top of stack
+            int BOS = 0;  // inflight pointer to the bottom of stack
+            int ssp = 0;  // speculative stack pointer
+            int nsp = 0;  // committed stack pointer
+            int sctr = 0; // speculative repeat counter, capped at maxCtr
+            std::vector<RASEntry> stack; // committed stack (numEntries)
+            std::vector<RASInflightEntry> inflightStack; // speculative pushes
+            std::shared_ptr<RASMeta> meta; // filled by putPCHistory()
+        };
 
-        void push(Addr retAddr);
+        void initThreadState(ThreadRASState &state);
 
-        void pop();
+        void push(ThreadID tid, Addr retAddr);
 
-        void push_stack(Addr retAddr);
-        
-        void pop_stack();
+        void pop(ThreadID tid);
+
+        void push_stack(ThreadID tid, Addr retAddr);
+
+        void pop_stack(ThreadID tid);
 
         void ptrInc(int &ptr);
 
@@ -129,38 +143,43 @@ namespace btb_pred {
         
         void inflightPtrDec(int &ptr);
 
-        bool inflightInRange(int &ptr);
+        bool inflightInRange(const ThreadRASState &state, int ptr);
 
         int inflightPtrPlus1(int ptr);
 
-        void checkCorrectness();
+        void checkCorrectness(ThreadID tid);
 
-        RASEssential getTop();
+        RASEssential getTop(ThreadID tid);
 
-        RASEssential getTop_meta();
+        RASEssential getTop_meta(ThreadID tid);
 
-        void printStack(const char *when) {
-            DPRINTF(RAS, "printStack when %s: \n", when);
+        void printStack(const char *when, ThreadID tid) {
+            auto &state = threadStates[tid];
+            DPRINTF(RAS, "[tid:%u] printStack when %s: \n", tid, when);
             for (int i = 0; i < numEntries; i++) {
-                DPRINTFR(RAS, "entry [%d], retAddr %#lx, ctr %d", i, stack[i].data.retAddr, stack[i].data.ctr);
-                if (ssp == i) {
+                DPRINTFR(RAS, "entry [%d], retAddr %#lx, ctr %d", i,
+                         state.stack[i].data.retAddr, state.stack[i].data.ctr);
+                if (state.ssp == i) {
                     DPRINTFR(RAS, " <-- SSP");
                 }
-                if (nsp == i) {
+                if (state.nsp == i) {
                     DPRINTFR(RAS, " <-- NSP");
                 }
                 DPRINTFR(RAS, "\n");
             }
             DPRINTFR(RAS, "non-volatile stack:\n");
             for (int i = 0; i < numInflightEntries; i++) {
-                DPRINTFR(RAS, "entry [%d] retAddr %#lx, ctr %u nos %d", i, inflightStack[i].data.retAddr, inflightStack[i].data.ctr, inflightStack[i].nos);
-                if (TOSW == i) {
+                DPRINTFR(RAS, "entry [%d] retAddr %#lx, ctr %u nos %d", i,
+                         state.inflightStack[i].data.retAddr,
+                         state.inflightStack[i].data.ctr,
+                         state.inflightStack[i].nos);
+                if (state.TOSW == i) {
                     DPRINTFR(RAS, " <-- TOSW");
                 }
-                if (TOSR == i) {
+                if (state.TOSR == i) {
                     DPRINTFR(RAS, " <-- TOSR");
                 }
-                if (BOS == i) {
+                if (state.BOS == i) {
                     DPRINTFR(RAS, " <-- BOS");
                 }
                 DPRINTFR(RAS, "\n");
@@ -190,27 +209,11 @@ namespace btb_pred {
 
         unsigned numInflightEntries;
 
-        int TOSW; // inflight pointer to the write top of stack
-
-        int TOSR; // inflight pointer to the read top of stack
-
-        int BOS; // inflight pointer to the bottom of stack
-
         int maxCtr;
 
-        int ssp; // spec sp
-        
-        int nsp; // non-spec sp
-
-        int sctr;
-
-        //int ndepth;
-
-        std::vector<RASEntry> stack;
-        
-        std::vector<RASInflightEntry> inflightStack;
+        unsigned numThreads;
 
-        std::shared_ptr<RASMeta> meta;
+        std::vector<ThreadRASState> threadStates;
 
 #ifdef UNIT_TEST
     typedef uint64_t Scalar;
diff --git a/src/dev/riscv/HartCtrl.py b/src/dev/riscv/HartCtrl.py
new file mode 100644
index 0000000000..242c10cccd
--- /dev/null
+++ b/src/dev/riscv/HartCtrl.py
@@ -0,0 +1,13 @@
+from m5.params import *
+from m5.proxy import *
+
+from m5.objects.Device import BasicPioDevice
+
+
+class HartCtrl(BasicPioDevice):  # MMIO block: one 64-bit reset word per hart
+    type = 'HartCtrl'
+    cxx_header = "dev/riscv/hart_ctrl.hh"
+    cxx_class = 'gem5::HartCtrl'
+    pio_addr = 0x39001000  # default base address of the register block
+    pio_size = Param.Addr(0x1000, "Hart control register space size")
+    num_threads = Param.Unsigned("Number of threads in the system.")
diff --git a/src/dev/riscv/SConscript b/src/dev/riscv/SConscript
index 15bf707400..267399e9c0 100755
--- a/src/dev/riscv/SConscript
+++ b/src/dev/riscv/SConscript
@@ -34,6 +34,7 @@ SimObject('HiFive.py', sim_objects=['HiFive', 'GenericRiscvPciHost'],
 SimObject('LupV.py', sim_objects=['LupV'], tags='riscv isa')
 SimObject('Clint.py', sim_objects=['Clint'], tags='riscv isa')
 SimObject('Lint.py', sim_objects=['Lint'], tags='riscv isa')
+SimObject('HartCtrl.py', sim_objects=['HartCtrl'], tags='riscv isa')
 SimObject('PlicDevice.py', sim_objects=['PlicIntDevice'], tags='riscv isa')
 SimObject('Plic.py', sim_objects=['Plic'], tags='riscv isa')
 SimObject('RTC.py', sim_objects=['RiscvRTC'], tags='riscv isa')
@@ -55,6 +56,7 @@ Source('hifive.cc', tags='riscv isa')
 Source('lupv.cc', tags='riscv isa')
 Source('clint.cc', tags='riscv isa')
 Source('lint.cc', tags='riscv isa')
+Source('hart_ctrl.cc', tags='riscv isa')
 Source('plic_device.cc', tags='riscv isa')
 Source('plic.cc', tags='riscv isa')
 Source('rtc.cc', tags='riscv isa')
diff --git a/src/dev/riscv/hart_ctrl.cc b/src/dev/riscv/hart_ctrl.cc
new file mode 100644
index 0000000000..b0afe6c8a9
--- /dev/null
+++ b/src/dev/riscv/hart_ctrl.cc
@@ -0,0 +1,98 @@
+#include "dev/riscv/hart_ctrl.hh"
+
+#include "cpu/thread_context.hh"
+#include "mem/packet_access.hh"
+#include "sim/system.hh"
+
+namespace gem5
+{
+// Build the device with one reset word per hart; every word starts at 1.
+HartCtrl::HartCtrl(const Params &p)
+    : BasicPioDevice(p, p.pio_size),
+      hartResetState(p.num_threads, 1)
+{
+    if (!hartResetState.empty()) {
+        // Hart 0 is the boot hart and is considered released by default.
+        hartResetState[0] = 0;
+    }
+}
+// Answer a read with the reset word of the hart at this 8-byte slot.
+Tick
+HartCtrl::read(PacketPtr pkt)
+{
+    assert(pkt->getAddr() >= pioAddr && pkt->getAddr() < pioAddr + pioSize);
+    assert(pkt->getSize() > 0 && pkt->getSize() <= sizeof(uint64_t));
+
+    const Addr offset = pkt->getAddr() - pioAddr;
+    panic_if(offset % sizeof(uint64_t) != 0,
+             "HartCtrl only supports 64-bit aligned accesses: addr=%#lx",
+             pkt->getAddr());
+
+    const ThreadID tid = offset / sizeof(uint64_t);
+    panic_if(tid >= hartResetState.size(),
+             "HartCtrl access out of range: tid=%u addr=%#lx",
+             tid, pkt->getAddr());
+    // Sub-64-bit reads pass the assert above, so size the store to the packet.
+    pkt->setUintX(hartResetState[tid], ByteOrder::little);
+    pkt->makeAtomicResponse();
+    return pioDelay;
+}
+// Store a new reset word for one hart; a stored 0 also wakes that hart.
+Tick
+HartCtrl::write(PacketPtr pkt)
+{
+    assert(pkt->getAddr() >= pioAddr && pkt->getAddr() < pioAddr + pioSize);
+    assert(pkt->getSize() > 0 && pkt->getSize() <= sizeof(uint64_t));
+
+    const Addr offset = pkt->getAddr() - pioAddr;
+    panic_if(offset % sizeof(uint64_t) != 0,
+             "HartCtrl only supports 64-bit aligned accesses: addr=%#lx",
+             pkt->getAddr());
+
+    const ThreadID tid = offset / sizeof(uint64_t);
+    panic_if(tid >= hartResetState.size(),
+             "HartCtrl access out of range: tid=%u addr=%#lx",
+             tid, pkt->getAddr());
+    // Read the payload at its own width and zero-extend it to 64 bits.
+    uint64_t value = 0;
+    switch (pkt->getSize()) {
+      case sizeof(uint8_t):
+        value = pkt->getLE<uint8_t>();
+        break;
+      case sizeof(uint16_t):
+        value = pkt->getLE<uint16_t>();
+        break;
+      case sizeof(uint32_t):
+        value = pkt->getLE<uint32_t>();
+        break;
+      case sizeof(uint64_t):
+        value = pkt->getLE<uint64_t>();
+        break;
+      default:
+        panic("Unsupported HartCtrl write size %u\n", pkt->getSize());
+    }
+
+    hartResetState[tid] = value;
+    // Zero means "released": ask the target hart's context to activate.
+    if (value == 0) {
+        tryWakeHart(tid);
+    }
+
+    pkt->makeAtomicResponse();
+    return pioDelay;
+}
+// Activate hart tid's thread context; panics when no such context exists.
+void
+HartCtrl::tryWakeHart(ThreadID tid)
+{
+    panic_if(tid >= sys->threads.size(),
+             "HartCtrl wake target %u out of system thread range %zu",
+             tid, sys->threads.size());
+
+    auto *tc = sys->threads[tid];
+    panic_if(!tc, "HartCtrl target %u has no thread context", tid);
+
+    tc->activate();
+}
+
+} // namespace gem5
diff --git a/src/dev/riscv/hart_ctrl.hh b/src/dev/riscv/hart_ctrl.hh
new file mode 100644
index 0000000000..5fe47306f6
--- /dev/null
+++ b/src/dev/riscv/hart_ctrl.hh
@@ -0,0 +1,33 @@
+//
+// Created for Xiangshan bare-metal hart control MMIO.
+//
+
+#ifndef GEM5_HART_CTRL_HH
+#define GEM5_HART_CTRL_HH
+
+#include <vector>
+
+#include "dev/io_device.hh"
+#include "params/HartCtrl.hh"
+
+namespace gem5
+{
+// PIO device with one 64-bit reset word per hart, at pioAddr + 8 * tid.
+class HartCtrl : public BasicPioDevice
+{
+  public:
+    typedef HartCtrlParams Params;
+    explicit HartCtrl(const Params &p);
+    // Accesses must be 8-byte aligned; writing 0 also activates the hart.
+    Tick read(PacketPtr pkt) override;
+    Tick write(PacketPtr pkt) override;
+
+  private:
+    void tryWakeHart(ThreadID tid);  // activate() the hart's thread context
+    // One word per hart: hart 0 starts at 0, every other hart at 1.
+    std::vector<uint64_t> hartResetState;
+};
+
+} // namespace gem5
+
+#endif // GEM5_HART_CTRL_HH

From 76939fc592731101b6bd34df00989f64db7e0e09 Mon Sep 17 00:00:00 2001
From: tastynoob <934348725@qq.com>
Date: Fri, 27 Mar 2026 18:31:54 +0800
Subject: [PATCH 06/38] cpu-o3: fix smt shared sbuffer

Change-Id: Ifbeb947f5f27ebdc9dc39dcfe0172eaa308f8e6f
---
 src/cpu/base.cc        |  74 +++++--
 src/cpu/base.hh        |  11 +
 src/cpu/o3/commit.cc   |  16 +-
 src/cpu/o3/iew.hh      |   4 +
 src/cpu/o3/lsq.cc      | 477 ++++++++++++++++++++++++++++++++++++++---
 src/cpu/o3/lsq.hh      |  67 +++++-
 src/cpu/o3/lsq_unit.cc | 383 ++++++++++++++++++++++++++++-----
 src/cpu/o3/lsq_unit.hh |  14 +-
 8 files changed, 929 insertions(+), 117 deletions(-)

diff --git a/src/cpu/base.cc b/src/cpu/base.cc
index 68808f3b3a..264e17bf4d 100644
--- a/src/cpu/base.cc
+++ b/src/cpu/base.cc
@@ -211,6 +211,7 @@ BaseCPU::BaseCPU(const Params &p, bool is_checker)
 
     diffAllStates.resize(numThreads);
     recentCommittedStores.resize(numThreads);
+    syncVisibleStoreReplayArmed.resize(numThreads, false);
     if (enableDifftest) {
         assert(params().difftest_ref_so.length() > 2);
         for (ThreadID tid = 0; tid < numThreads; ++tid) {
@@ -1484,12 +1485,23 @@ BaseCPU::diffWithNEMU(ThreadID tid, InstSeqNum seq)
                 if (system->multiContextDifftest() &&
                     (diffInfo.inst->isLoad() || diffInfo.inst->isAtomic()) &&
                     _goldenMemManager->inPmem(diffInfo.physEffAddr)) {
-                    warn("Difference on %s instr found in multicore mode, check in golden memory\n",
-                         diffInfo.inst->isLoad() ? "load" : "amo");
-                    uint8_t *golden_ptr = diffInfo.goldenValue;
+                    DPRINTF(Diff,
+                            "Difference on %s instr found in multicore mode, "
+                            "check in golden memory\n",
+                            diffInfo.inst->isLoad() ? "load" : "amo");
+                    uint8_t current_golden_data[16] = {};
+                    panic_if(diffInfo.effSize > sizeof(current_golden_data),
+                             "Unexpected large mem diff size: %u\n",
+                             diffInfo.effSize);
+                    _goldenMemManager->readGoldenMem(diffInfo.physEffAddr,
+                                                     current_golden_data,
+                                                     diffInfo.effSize);
+                    uint8_t *golden_ptr = current_golden_data;
+                    uint8_t *exec_golden_ptr = diffInfo.goldenValue;
                     const RecentCommittedStore *matched_recent_store = nullptr;
                     if (diffInfo.inst->isLoad()) {
-                        const auto &recent_history = recentCommittedStores.at(tid);
+                        const auto &recent_history =
+                            recentCommittedStores.at(tid);
                         for (auto it = recent_history.rbegin();
                              it != recent_history.rend(); ++it) {
                             if (!it->valid ||
@@ -1506,21 +1518,39 @@ BaseCPU::diffWithNEMU(ThreadID tid, InstSeqNum seq)
                             }
                         }
                     }
+                    auto sync_reg = [&]() {
+                        diffAllStates->referenceRegFile[dest_tag] = gem5_val;
+                        diffAllStates->proxy->regcpy(
+                            &(diffAllStates->referenceRegFile), DUT_TO_REF);
+                    };
 
-                    // a lambda function to sync memory and register from golden results to ref
+                    // Sync both memory and register when the value is already
+                    // globally visible in golden memory.
                     auto sync_mem_reg = [&](const uint8_t *mem_src) {
                         diffAllStates->proxy->memcpy(diffInfo.physEffAddr,
                                                      const_cast<uint8_t *>(mem_src),
                                                      diffInfo.effSize,
                                                      DIFFTEST_TO_REF);
-                        diffAllStates->referenceRegFile[dest_tag] = gem5_val;
-                        diffAllStates->proxy->regcpy(&(diffAllStates->referenceRegFile), DUT_TO_REF);
+                        sync_reg();
                     };
 
-                    if (diffInfo.inst->isLoad() && memcmp(golden_ptr, &gem5_val, diffInfo.effSize) == 0) {
-                        DPRINTF(Diff, "Load content matched in golden memory. Sync from golden to ref\n");
+                    if (diffInfo.inst->isLoad() &&
+                               memcmp(golden_ptr, &gem5_val,
+                                      diffInfo.effSize) == 0) {
+                        DPRINTF(Diff,
+                                "Load content matched in golden memory. "
+                                "Sync from golden to ref\n");
                         sync_mem_reg(golden_ptr);
                         continue;
+                    } else if (diffInfo.inst->isLoad() && exec_golden_ptr &&
+                               memcmp(exec_golden_ptr, &gem5_val,
+                                      diffInfo.effSize) == 0) {
+                        DPRINTF(Diff,
+                                "Load content matched the execution-time "
+                                "golden snapshot. Sync from the recorded "
+                                "snapshot to ref\n");
+                        sync_mem_reg(exec_golden_ptr);
+                        continue;
                     } else if (matched_recent_store) {
                         DPRINTF(Diff,
                                 "Load content matched recent committed store "
@@ -1534,13 +1564,22 @@ BaseCPU::diffWithNEMU(ThreadID tid, InstSeqNum seq)
                         DPRINTF(Diff, "Golden mem old value: %#lx, GEM5 old value: %#lx\n", diffInfo.amoOldGoldenValue,
                                 gem5_val);
                         DPRINTF(Diff, "New golden value: %#lx\n", *(uint64_t *)golden_ptr);
-                        if (memcmp(&diffInfo.amoOldGoldenValue, &gem5_val, diffInfo.effSize) == 0) {
+                        if (memcmp(&diffInfo.amoOldGoldenValue, &gem5_val,
+                                   diffInfo.effSize) == 0) {
                             DPRINTF(Diff, "Atomic encountered, old value matched. Sync from golden to ref\n");
                             sync_mem_reg(golden_ptr);
                             continue;
-                        } else {
-                            warn("Atomic old value not matched!\n");
                         }
+                    } else if (diffInfo.inst->isLoad()) {
+                        DPRINTF(Diff,
+                                "Unresolved shared-memory load mismatch at "
+                                "addr=%#lx gem5=%#lx current_golden=%#lx "
+                                "exec_snapshot=%#lx; falling back to normal "
+                                "difftest reporting.\n",
+                                diffInfo.physEffAddr, gem5_val,
+                                *(uint64_t *)golden_ptr,
+                                exec_golden_ptr ?
+                                    *(uint64_t *)exec_golden_ptr : 0);
                     }
                 }
 
@@ -1638,25 +1677,22 @@ BaseCPU::difftestStep(ThreadID tid, InstSeqNum seq)
             diffAllStates->gem5RegFile.pc = diffInfo.pc->instAddr();
             if (noHypeMode) {
                 auto start = pmemStart + pmemSize * difftestHartId(tid);
-                warn("Start memcpy to NEMU from %#lx, size=%lu\n", (uint64_t)start, pmemSize);
                 diffAllStates->proxy->memcpy(0x80000000u, start, pmemSize, DUT_TO_REF);
             } else if (enableMemDedup) {
                 if (system->multiContextDifftest()) {
-                    warn("Let ref share the multi-context golden memory\n");
                     assert(goldenMemPtr);
                     assert(diffAllStates->proxy->ref_get_backed_memory);
-                    diffAllStates->proxy->ref_get_backed_memory(goldenMemPtr, pmemSize);
+                    diffAllStates->proxy->ref_get_backed_memory(
+                        system->createCopyOnWriteBranch(), pmemSize);
+                    diffAllStates->proxy->memcpy_init(
+                        0x80000000u, goldenMemPtr, pmemSize, DUT_TO_REF);
                 } else {
-                    warn("Let ref share a COW mirror of root memory\n");
                     assert(diffAllStates->proxy->ref_get_backed_memory);
                     diffAllStates->proxy->ref_get_backed_memory(system->createCopyOnWriteBranch(), pmemSize);
                 }
             } else {
-                warn("MemDedup disabled, copying pmem to NEMU\n");
-                warn("Start memcpy to NEMU from %#lx, size=%lu\n", (uint64_t)pmemStart, pmemSize);
                 diffAllStates->proxy->memcpy_init(0x80000000u, pmemStart, pmemSize, DUT_TO_REF);
             }
-            warn("Start regcpy to NEMU\n");
             diffAllStates->proxy->regcpy(&(diffAllStates->gem5RegFile), DUT_TO_REF);
         }
     }
diff --git a/src/cpu/base.hh b/src/cpu/base.hh
index feaf6e13cd..21c13388db 100644
--- a/src/cpu/base.hh
+++ b/src/cpu/base.hh
@@ -149,6 +149,7 @@ class BaseCPU : public ClockedObject
     };
 
     std::vector<std::deque<RecentCommittedStore>> recentCommittedStores;
+    std::vector<bool> syncVisibleStoreReplayArmed;
 
     const unsigned IntRegIndexBase = 0;
     const unsigned FPRegIndexBase = 32;
@@ -790,6 +791,16 @@ class BaseCPU : public ClockedObject
     void difftestStep(ThreadID tid, InstSeqNum seq);
 
     void recordCommittedStore(ThreadID tid, const o3::DynInstPtr &inst);
+    void armSyncVisibleStoreReplay(ThreadID tid)
+    {
+        syncVisibleStoreReplayArmed.at(tid) = true;
+    }
+    bool consumeSyncVisibleStoreReplay(ThreadID tid)
+    {
+        bool armed = syncVisibleStoreReplayArmed.at(tid);
+        syncVisibleStoreReplayArmed.at(tid) = false;
+        return armed;
+    }
 
     inline bool difftestEnabled() const { return enableDifftest; }
 
diff --git a/src/cpu/o3/commit.cc b/src/cpu/o3/commit.cc
index e1b20025ce..e289754896 100644
--- a/src/cpu/o3/commit.cc
+++ b/src/cpu/o3/commit.cc
@@ -1482,6 +1482,11 @@ Commit::commitInsts()
 
                     }
 
+                    if (head_inst->isReadBarrier() ||
+                        head_inst->isWriteBarrier()) {
+                        cpu->armSyncVisibleStoreReplay(tid);
+                    }
+
                     if (cpu->difftestEnabled()) {
                         diffInst(tid, head_inst);
                     }
@@ -1678,9 +1683,12 @@ Commit::commitHead(const DynInstPtr &head_inst, unsigned inst_num)
         // Memory-ordering instructions such as sfence.vma must not execute
         // until older stores are visible; otherwise page-table updates may
         // race with the TLB invalidation.
-        if ((head_inst->isMemRef() || head_inst->isReturn() ||
-             head_inst->isReadBarrier() || head_inst->isWriteBarrier()) &&
-            (inst_num > 0 || !iewStage->flushStores(tid))) {
+        const bool needs_store_drain =
+            head_inst->isMemRef() || head_inst->isReturn() ||
+            head_inst->isReadBarrier() || head_inst->isWriteBarrier();
+        const bool stores_drained =
+            !needs_store_drain || iewStage->flushStores(tid, head_inst->seqNum);
+        if (needs_store_drain && (inst_num > 0 || !stores_drained)) {
             DPRINTF(Commit,
                     "[tid:%i] [sn:%llu] "
                     "Waiting for all stores to writeback.\n",
@@ -1734,7 +1742,7 @@ Commit::commitHead(const DynInstPtr &head_inst, unsigned inst_num)
 
     if (inst_fault != NoFault) {
         traceLogInstFault(head_inst, inst_fault);
-        if (!iewStage->flushStores(tid) || inst_num > 0) {
+        if (!iewStage->flushStores(tid, head_inst->seqNum) || inst_num > 0) {
             DPRINTF(Commit,
                     "[tid:%i] [sn:%llu] "
                     "Stores outstanding, fault must wait.\n",
diff --git a/src/cpu/o3/iew.hh b/src/cpu/o3/iew.hh
index e23d0fb490..94cfbcb8cc 100644
--- a/src/cpu/o3/iew.hh
+++ b/src/cpu/o3/iew.hh
@@ -260,6 +260,10 @@ class IEW
      * the store queue or the store buffer to write back to.
      */
     bool flushStores(ThreadID tid) { return ldstQueue.flushStores(tid); }
+    bool flushStores(ThreadID tid, InstSeqNum seq_num)
+    {
+        return ldstQueue.flushStores(tid, seq_num);
+    }
 
     /** Check if we need to squash after a load/store/branch is executed. */
     void SquashCheckAfterExe(DynInstPtr inst);
diff --git a/src/cpu/o3/lsq.cc b/src/cpu/o3/lsq.cc
index e070f076d0..9cc59f560d 100644
--- a/src/cpu/o3/lsq.cc
+++ b/src/cpu/o3/lsq.cc
@@ -89,23 +89,96 @@ LSQ::DcachePort::DcachePort(LSQ *_lsq, CPU *_cpu) :
 
 std::list<LSQ::SingleDataRequest*> LSQ::SingleDataRequest::singleList;
 
+namespace
+{
+
+bool
+storeBufferEntryEligibleForLoad(const LSQ::StoreBufferEntry *entry,
+                                ThreadID load_tid, InstSeqNum load_seq,
+                                uint64_t visible_generation)
+{
+    if (!entry) {
+        return false;
+    }
+
+    if (entry->tid == load_tid) {
+        return entry->seqNum < load_seq;
+    }
+
+    return entry->generation != 0 && entry->generation <= visible_generation;
+}
+
+bool
+storeBufferByteEligibleForLoad(const LSQ::StoreBufferEntry *entry,
+                               size_t byte_idx, ThreadID load_tid,
+                               InstSeqNum load_seq,
+                               uint64_t visible_generation)
+{
+    if (!entry) {
+        return false;
+    }
+
+    if (entry->tid == load_tid) {
+        return entry->seqNum < load_seq;
+    }
+
+    if (!entry->sending) {
+        return false;
+    }
+
+    return byte_idx < entry->byteGenerations.size() &&
+           entry->byteGenerations[byte_idx] != 0 &&
+           entry->byteGenerations[byte_idx] <= visible_generation;
+}
+
+uint64_t
+storeBufferEligibleGeneration(const LSQ::StoreBufferEntry *entry,
+                              ThreadID load_tid, InstSeqNum load_seq,
+                              uint64_t visible_generation)
+{
+    if (!entry) {
+        return 0;
+    }
+
+    uint64_t best_generation = 0;
+    if (storeBufferEntryEligibleForLoad(entry, load_tid, load_seq,
+                                        visible_generation)) {
+        best_generation = entry->generation;
+    }
+    if (storeBufferEntryEligibleForLoad(entry->vice, load_tid, load_seq,
+                                        visible_generation)) {
+        best_generation = std::max(best_generation, entry->vice->generation);
+    }
+    return best_generation;
+}
+
+} // anonymous namespace
+
 void
-LSQ::StoreBufferEntry::reset(ThreadID tid, uint64_t block_vaddr, uint64_t block_paddr,
+LSQ::StoreBufferEntry::reset(ThreadID tid, InstSeqNum seq_num,
+                             uint64_t block_vaddr, uint64_t block_paddr,
                              uint64_t offset, uint8_t *datas, uint64_t size,
-                             const std::vector<bool> &mask)
+                             const std::vector<bool> &mask,
+                             uint64_t generation)
 {
     std::fill(validMask.begin(), validMask.begin() + offset, false);
+    std::fill(byteGenerations.begin(), byteGenerations.end(), 0);
 
     for (int i = 0; i < size; i++) {
         validMask[offset + i] = mask[i];
+        if (mask[i]) {
+            byteGenerations[offset + i] = generation;
+        }
     }
 
     std::fill(validMask.begin() + offset + size, validMask.end(), false);
     memcpy(blockDatas.data() + offset, datas, size);
 
     this->tid = tid;
+    this->seqNum = seq_num;
     this->blockVaddr = block_vaddr;
     this->blockPaddr = block_paddr;
+    this->generation = generation;
     this->sending = false;
     this->request = nullptr;
     this->vice = nullptr;
@@ -113,19 +186,23 @@ LSQ::StoreBufferEntry::reset(ThreadID tid, uint64_t block_vaddr, uint64_t block_
 
 void
 LSQ::StoreBufferEntry::merge(uint64_t offset, uint8_t *datas, uint64_t size,
-                             const std::vector<bool> &mask)
+                             const std::vector<bool> &mask,
+                             uint64_t generation)
 {
     assert(offset + size <= validMask.size());
     for (uint64_t i = 0; i < size; ++i) {
         if (mask[i]) {
             blockDatas[offset + i] = datas[i];
             validMask[offset + i] = true;
+            byteGenerations[offset + i] = generation;
         }
     }
 }
 
 bool
-LSQ::StoreBufferEntry::recordForward(RequestPtr req, LSQRequest *lsqreq)
+LSQ::StoreBufferEntry::recordForward(RequestPtr req, LSQRequest *lsqreq,
+                                     ThreadID load_tid, InstSeqNum load_seq,
+                                     uint64_t visible_generation)
 {
     int offset = req->getPaddr() & (validMask.size() - 1);
     // the offset in the split request
@@ -136,13 +213,21 @@ LSQ::StoreBufferEntry::recordForward(RequestPtr req, LSQRequest *lsqreq)
     bool full_forward = true;
     for (int i = 0; i < req->getSize(); i++) {
         assert(goffset + i < lsqreq->_size);
-        if (vice && vice->validMask[offset + i]) {
+        const bool vice_eligible =
+            vice && vice->validMask[offset + i] &&
+            storeBufferByteEligibleForLoad(vice, offset + i, load_tid,
+                                           load_seq, visible_generation);
+        const bool self_eligible =
+            validMask[offset + i] &&
+            storeBufferByteEligibleForLoad(this, offset + i, load_tid,
+                                           load_seq, visible_generation);
+        if (vice_eligible) {
             // vice is newer
             assert(vice->blockVaddr == blockVaddr);
             lsqreq->SBforwardPackets.push_back(
                 LSQRequest::FWDPacket{
                     .idx = goffset + i, .byte = vice->blockDatas[offset + i]});
-        } else if (validMask[offset + i]) {
+        } else if (self_eligible) {
             lsqreq->SBforwardPackets.push_back(
                 LSQRequest::FWDPacket{
                     .idx = goffset + i, .byte = blockDatas[offset + i]});
@@ -182,6 +267,40 @@ LSQ::StoreBuffer::size() const
     return _size;
 }
 
+uint64_t
+LSQ::StoreBuffer::size(ThreadID tid) const
+{
+    uint64_t count = 0;
+    for (size_t index = 0; index < data_vec.size(); ++index) {
+        if (!data_vld[index]) {
+            continue;
+        }
+
+        auto *entry = data_vec[index];
+        if (entry && entry->tid == tid) {
+            ++count;
+        }
+    }
+    return count;
+}
+
+uint64_t
+LSQ::StoreBuffer::size(ThreadID tid, InstSeqNum seq_num) const
+{
+    uint64_t count = 0;
+    for (size_t index = 0; index < data_vec.size(); ++index) {
+        if (!data_vld[index]) {
+            continue;
+        }
+
+        auto *entry = data_vec[index];
+        if (entry && entry->tid == tid && entry->seqNum < seq_num) {
+            ++count;
+        }
+    }
+    return count;
+}
+
 uint64_t
 LSQ::StoreBuffer::unsentSize() const
 {
@@ -243,6 +362,47 @@ LSQ::StoreBuffer::getEvict()
     return data_vec[index];
 }
 
+LSQ::StoreBufferEntry *
+LSQ::StoreBuffer::getEvict(const bool *eligible_tids, size_t num_threads)
+{
+    return getEvict(eligible_tids, nullptr, num_threads);
+}
+
+LSQ::StoreBufferEntry *
+LSQ::StoreBuffer::getEvict(const bool *eligible_tids,
+                           const InstSeqNum *eligible_seq,
+                           size_t num_threads)
+{
+    if (eligible_tids == nullptr && eligible_seq == nullptr) {
+        return getEvict();
+    }
+
+    for (auto it = lru_index.rbegin(); it != lru_index.rend(); ++it) {
+        auto *entry = data_vec[*it];
+        if (!entry) {
+            continue;
+        }
+
+        const ThreadID tid = entry->tid;
+        if (tid >= num_threads) {
+            continue;
+        }
+        if (eligible_tids && !eligible_tids[tid]) {
+            continue;
+        }
+        if (eligible_seq &&
+            eligible_seq[tid] != static_cast<InstSeqNum>(-1) &&
+            entry->seqNum >= eligible_seq[tid]) {
+            continue;
+        }
+
+        lru_index.erase(std::find(lru_index.begin(), lru_index.end(), *it));
+        return entry;
+    }
+
+    return nullptr;
+}
+
 LSQ::StoreBufferEntry *
 LSQ::StoreBuffer::createVice(StoreBufferEntry *entry)
 {
@@ -766,17 +926,17 @@ LSQ::processWriteback()
     std::vector<uint32_t> offload_demand(numThreads, 0);
     std::vector<ThreadID> requester_tids;
     requester_tids.reserve(activeThreads->size());
-    uint32_t sbuffer_flush_bitset = 0;
-    for (ThreadID tid : *activeThreads) {
-        bool sbuffer_flushing = storeBufferFlushing(tid);
-        sbuffer_flush_bitset |= (sbuffer_flushing << tid);
-    }
 
     for (ThreadID tid : *activeThreads) {
         offload_demand[tid] = thread[tid].countStoreBufferOffloadableEntries(
             maxStoreBufferEntriesAcceptedFromSQPerCycle);
-        // when other thread is flushing sbuffer, stop current thread sq offloading
-        bool conti = (sbuffer_flush_bitset & ~(1 << tid)) == 0;
+        // During a global sbuffer flush, only threads that requested the
+        // flush may keep draining older committed stores from their SQ.
+        // If both SMT threads are flushing simultaneously, both must still be
+        // allowed to make forward progress, otherwise they can deadlock while
+        // waiting on each other's flush bit.
+        const bool conti =
+            !storeBufferFlushing() || storeBufferFlushing(tid);
         if (conti && offload_demand[tid] != 0) {
             requester_tids.push_back(tid);
         }
@@ -822,11 +982,14 @@ LSQ::processWriteback()
         thread[tid].offloadToStoreBuffer(offload_quota[tid]);
     }
 
-    // If the store buffer is flushing and no entries remain to be sent,
-    // clear the flushing state to avoid deadlock.
-    if (storeBufferFlushing() && storeBuffer.size() == 0) [[unlikely]] {
-        assert(storeBuffer.unsentSize() == 0);
-        clearStoreBufferFlushing();
+    // A fence/flush only waits for the requesting thread's sbuffer domain.
+    for (ThreadID tid = 0; tid < numThreads; ++tid) {
+        if (!storeBufferFlushing(tid) ||
+            !storeBufferEmpty(tid, _storeBufferFlushBeforeSeq[tid])) {
+            continue;
+        }
+
+        clearStoreBufferFlushing(tid);
         cpu->activityThisCycle();
     }
 }
@@ -874,12 +1037,23 @@ LSQ::storeBufferWriteback()
         }
 
         if (cause) {
-            StoreBufferEntry *entry = storeBuffer.getEvict();
+            StoreBufferEntry *entry = nullptr;
+            if (*cause == StoreBufferEvictCause::Flush) {
+                entry = storeBuffer.getEvict(
+                    _storeBufferFlushing, _storeBufferFlushBeforeSeq,
+                    numThreads);
+            } else {
+                entry = storeBuffer.getEvict();
+            }
+            if (!entry) {
+                /* Disabled with the broad sbuffer watchdog above. */
+                return;
+            }
+            /* Disabled with the broad sbuffer watchdog above. */
             auto &owner_unit = thread[entry->tid];
             recordStoreBufferEviction(*cause);
             DPRINTF(StoreBuffer, "Evicting sbuffer entry[%#x]\n",
                     entry->blockPaddr);
-
             if (debug::StoreBuffer) {
                 DPRINTFR(StoreBuffer, "Dumping sbuffer entry data\n");
                 for (int i = 0; i < owner_unit.cacheLineSize(); i++) {
@@ -969,6 +1143,20 @@ void
 LSQ::completeSbufferEvict(PacketPtr pkt)
 {
     auto request = dynamic_cast<SbufferRequest *>(pkt->senderState);
+    const Addr block_paddr = request->sbuffer_entry->blockPaddr;
+    invalidateOtherThreadStoreBufferBytes(request->sbuffer_entry->tid,
+                                          request->mainReq()->getPaddr(),
+                                          request->mainReq()->getByteEnable(),
+                                          request->sbuffer_entry->generation);
+    markStoreBufferBlockVisible(block_paddr,
+                                request->sbuffer_entry->generation);
+    const bool replay_executed_loads =
+        cpu->consumeSyncVisibleStoreReplay(request->sbuffer_entry->tid);
+    notifyOtherThreadsStoreVisible(request->sbuffer_entry->tid,
+                                   request->mainReq()->getPaddr(),
+                                   request->mainReq()->getByteEnable(),
+                                   request->sbuffer_entry->seqNum,
+                                   replay_executed_loads);
     if (cpu->goldenMemManager() &&
         cpu->goldenMemManager()->inPmem(request->mainReq()->getPaddr())) {
         Addr paddr = request->mainReq()->getPaddr();
@@ -980,6 +1168,7 @@ LSQ::completeSbufferEvict(PacketPtr pkt)
     }
 
     storeBuffer.release(request->sbuffer_entry);
+    reclaimStoreBufferBlockMetadata(block_paddr);
     DPRINTF(StoreBuffer,
             "finish entry[%#x] evict to cache, sbuffer size: %d, "
             "unsentsize: %d\n",
@@ -1142,7 +1331,6 @@ LSQ::recvTimingResp(PacketPtr pkt)
     LSQRequest *request = dynamic_cast<LSQRequest*>(pkt->senderState);
     panic_if(!request, "Got packet back with unknown sender state\n");
 
-
     thread[request->_port.lsqID].recvTimingResp(pkt);
 
     if (pkt->isInvalidate()) {
@@ -1546,12 +1734,245 @@ LSQ::hasStoresToWB(ThreadID tid)
     return thread.at(tid).hasStoresToWB();
 }
 
-bool LSQ::flushStores(ThreadID tid)
+bool
+LSQ::hasStoresToWBBefore(ThreadID tid, InstSeqNum seq_num)
+{
+    return thread.at(tid).hasStoresToWBBefore(seq_num);
+}
+
+bool
+LSQ::flushStores(ThreadID tid)
+{
+    _storeBufferFlushing[tid] = true;
+    _storeBufferFlushBeforeSeq[tid] = static_cast<InstSeqNum>(-1);
+    const bool has_stores = hasStoresToWB(tid);
+    const bool sbuffer_empty =
+        storeBufferEmpty(tid, _storeBufferFlushBeforeSeq[tid]);
+    if (!has_stores && sbuffer_empty) {
+        clearStoreBufferFlushing(tid);
+        return true;
+    }
+
+    return false;
+}
+
+bool
+LSQ::flushStores(ThreadID tid, InstSeqNum seq_num)
 {
     _storeBufferFlushing[tid] = true;
-    // TODO：high performance shared SMT storebuffer flushing
-    bool t = !hasStoresToWB(tid) && storeBufferEmpty();
-    return t;
+    _storeBufferFlushBeforeSeq[tid] = seq_num;
+    const bool has_older_stores = hasStoresToWBBefore(tid, seq_num);
+    const bool sbuffer_empty = storeBufferEmpty(tid, seq_num);
+    if (!has_older_stores && sbuffer_empty) {
+        clearStoreBufferFlushing(tid);
+        return true;
+    }
+
+    return false;
+}
+
+void
+LSQ::requestGlobalStoreBufferFlush()
+{
+    for (ThreadID tid = 0; tid < numThreads; ++tid) {
+        _storeBufferFlushing[tid] = true;
+        _storeBufferFlushBeforeSeq[tid] = static_cast<InstSeqNum>(-1);
+    }
+}
+
+bool
+LSQ::storeBufferHasConflict(ThreadID tid, Addr block_paddr) const
+{
+    for (ThreadID other_tid = 0; other_tid < numThreads; ++other_tid) {
+        if (other_tid == tid) {
+            continue;
+        }
+
+        if (storeBuffer.get(other_tid, block_paddr)) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+uint64_t
+LSQ::bumpStoreBufferBlockVersion(Addr block_paddr)
+{
+    auto &version = storeBufferBlockVersion[block_paddr];
+    ++version;
+    if (version == 0) {
+        version = 1;
+    }
+    return version;
+}
+
+uint64_t
+LSQ::currentStoreBufferBlockVersion(Addr block_paddr) const
+{
+    auto it = storeBufferBlockVersion.find(block_paddr);
+    return it == storeBufferBlockVersion.end() ? 0 : it->second;
+}
+
+void
+LSQ::markStoreBufferBlockVisible(Addr block_paddr, uint64_t generation)
+{
+    auto &visible = storeBufferVisibleVersion[block_paddr];
+    visible = std::max(visible, generation);
+    reclaimStoreBufferBlockMetadata(block_paddr);
+}
+
+uint64_t
+LSQ::currentStoreBufferVisibleVersion(Addr block_paddr) const
+{
+    auto it = storeBufferVisibleVersion.find(block_paddr);
+    return it == storeBufferVisibleVersion.end() ? 0 : it->second;
+}
+
+LSQ::StoreBufferEntry *
+LSQ::findForwardingStoreBufferEntry(Addr block_paddr, ThreadID load_tid,
+                                    InstSeqNum load_seq) const
+{
+    StoreBufferEntry *best_entry = nullptr;
+    uint64_t best_generation = 0;
+    const auto visible_generation =
+        currentStoreBufferVisibleVersion(block_paddr);
+
+    for (ThreadID tid = 0; tid < numThreads; ++tid) {
+        auto entry = storeBuffer.get(tid, block_paddr);
+        if (!entry) {
+            continue;
+        }
+
+        const uint64_t entry_generation =
+            storeBufferEligibleGeneration(entry, load_tid, load_seq,
+                                          visible_generation);
+        if (entry_generation == 0) {
+            continue;
+        }
+
+        if (!best_entry || entry_generation > best_generation) {
+            best_entry = entry;
+            best_generation = entry_generation;
+        }
+    }
+
+    return best_entry;
+}
+
+bool
+LSQ::hasLiveStoreBufferBlock(Addr block_paddr) const
+{
+    for (ThreadID tid = 0; tid < numThreads; ++tid) {
+        if (storeBuffer.get(tid, block_paddr)) {
+            return true;
+        }
+    }
+    return false;
+}
+
+void
+LSQ::reclaimStoreBufferBlockMetadata(Addr block_paddr)
+{
+    if (hasLiveStoreBufferBlock(block_paddr)) {
+        return;
+    }
+
+    auto version_it = storeBufferBlockVersion.find(block_paddr);
+    if (version_it == storeBufferBlockVersion.end()) {
+        storeBufferVisibleVersion.erase(block_paddr);
+        return;
+    }
+
+    auto visible_it = storeBufferVisibleVersion.find(block_paddr);
+    const uint64_t visible_generation =
+        visible_it == storeBufferVisibleVersion.end() ? 0 : visible_it->second;
+    if (visible_generation < version_it->second) {
+        return;
+    }
+
+    storeBufferBlockVersion.erase(version_it);
+    if (visible_it != storeBufferVisibleVersion.end()) {
+        storeBufferVisibleVersion.erase(visible_it);
+    }
+}
+
+void
+LSQ::invalidateOtherThreadStoreBufferBytes(
+    ThreadID tid, Addr paddr, const std::vector<bool> &mask,
+    uint64_t generation)
+{
+    const Addr cache_block_mask =
+        ~((static_cast<Addr>(cpu->cacheLineSize())) - 1);
+    const Addr block_paddr = paddr & cache_block_mask;
+    const Addr offset = paddr & ~cache_block_mask;
+    auto invalidate_entry = [&](StoreBufferEntry *entry) {
+        if (!entry || offset + mask.size() > entry->validMask.size()) {
+            return;
+        }
+
+        if (!entry->sending) {
+            return;
+        }
+
+        for (size_t i = 0; i < mask.size(); ++i) {
+            if (mask[i] &&
+                entry->byteGenerations[offset + i] != 0 &&
+                entry->byteGenerations[offset + i] <= generation) {
+                entry->validMask[offset + i] = false;
+            }
+        }
+    };
+
+    for (ThreadID other_tid = 0; other_tid < numThreads; ++other_tid) {
+        if (other_tid == tid) {
+            continue;
+        }
+
+        auto entry = storeBuffer.get(other_tid, block_paddr);
+        if (!entry) {
+            continue;
+        }
+
+        invalidate_entry(entry);
+        invalidate_entry(entry->vice);
+    }
+}
+
+void
+LSQ::notifyOtherThreadsStoreVisible(ThreadID tid, Addr store_paddr,
+                                    const std::vector<bool> &byte_enable,
+                                    InstSeqNum store_seq,
+                                    bool replay_executed_loads)
+{
+    if (numThreads <= 1) {
+        return;
+    }
+
+    Request::Flags flags;
+    const Addr cache_block_mask =
+        ~((static_cast<Addr>(cpu->cacheLineSize())) - 1);
+    RequestPtr req = std::make_shared<Request>(
+        store_paddr & cache_block_mask, cpu->cacheLineSize(), flags,
+        cpu->dataRequestorId());
+    Packet pkt(req, MemCmd::InvalidateReq);
+
+    for (ThreadID context_id = 0; context_id < numThreads; ++context_id) {
+        gem5::ThreadContext *tc = cpu->getContext(context_id);
+        bool no_squash = cpu->thread[context_id]->noSquashFromTC;
+        cpu->thread[context_id]->noSquashFromTC = true;
+        tc->getIsaPtr()->handleLockedSnoop(&pkt, cache_block_mask);
+        cpu->thread[context_id]->noSquashFromTC = no_squash;
+    }
+
+    for (ThreadID other_tid = 0; other_tid < numThreads; ++other_tid) {
+        if (other_tid == tid) {
+            continue;
+        }
+        thread[other_tid].checkLocalStoreVisible(store_paddr, byte_enable,
+                                                 store_seq,
+                                                 replay_executed_loads);
+    }
 }
 
 int
@@ -2110,11 +2531,6 @@ LSQ::LSQRequest::forward()
 
 LSQ::LSQRequest::~LSQRequest()
 {
-    if (isAnyOutstandingRequest()) {
-        warn("numInTranslationFragments = %u, _numOutstandingPackets = %u\n",
-             numInTranslationFragments, _numOutstandingPackets);
-        std::raise(SIGINT);
-    }
     assert(!isAnyOutstandingRequest());
     if (_inst && _inst->savedRequest == this) {
         DPRINTF(LSQ, "inst [sn:%llu] Deleting LSQRequest, savedRequest\n", _inst->seqNum);
@@ -2205,7 +2621,6 @@ LSQ::SingleDataRequest::recvTimingResp(PacketPtr pkt)
                     mainReq()->isUncacheable(), cacheHit, *((uint64_t*)buffer));
     }
 
-
     if (isLoad()) {
         auto it = std::find(lsqUnit()->inflightLoads.begin(), lsqUnit()->inflightLoads.end(), this);
         if (it != lsqUnit()->inflightLoads.end()) {
diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh
index fc2c73a80c..83f47b5b91 100644
--- a/src/cpu/o3/lsq.hh
+++ b/src/cpu/o3/lsq.hh
@@ -147,10 +147,13 @@ class LSQ
       public:
         const int index;
         ThreadID tid;
+        InstSeqNum seqNum = 0;
         Addr blockVaddr;
         Addr blockPaddr;
         std::vector<uint8_t> blockDatas;
         std::vector<bool> validMask;
+        std::vector<uint64_t> byteGenerations;
+        uint64_t generation = 0;
         bool sending;
         // the another same addr entry when sending
         // another cannot sending until self sending finished
@@ -162,16 +165,20 @@ class LSQ
         {
             blockDatas.resize(size, 0);
             validMask.resize(size, false);
+            byteGenerations.resize(size, 0);
         }
 
-        void reset(ThreadID tid, uint64_t block_vaddr, uint64_t block_paddr,
-                   uint64_t offset, uint8_t *datas, uint64_t size,
-                   const std::vector<bool> &mask);
+        void reset(ThreadID tid, InstSeqNum seq_num, uint64_t block_vaddr,
+                   uint64_t block_paddr, uint64_t offset, uint8_t *datas,
+                   uint64_t size, const std::vector<bool> &mask,
+                   uint64_t generation);
 
         void merge(uint64_t offset, uint8_t *datas, uint64_t size,
-                   const std::vector<bool> &mask);
+                   const std::vector<bool> &mask, uint64_t generation);
 
-        bool recordForward(RequestPtr req, LSQRequest *lsqreq);
+        bool recordForward(RequestPtr req, LSQRequest *lsqreq,
+                           ThreadID load_tid, InstSeqNum load_seq,
+                           uint64_t visible_generation);
     };
 
     class StoreBuffer
@@ -198,12 +205,19 @@ class LSQ
         void setData(std::vector<StoreBufferEntry *> &data_vec);
         bool full() const;
         uint64_t size() const;
+        uint64_t size(ThreadID tid) const;
+        uint64_t size(ThreadID tid, InstSeqNum seq_num) const;
         uint64_t unsentSize() const;
         StoreBufferEntry *getEmpty();
         void insert(StoreBufferEntry *entry);
         StoreBufferEntry *get(ThreadID tid, uint64_t addr) const;
         void update(int index);
         StoreBufferEntry *getEvict();
+        StoreBufferEntry *getEvict(const bool *eligible_tids,
+                                   size_t num_threads);
+        StoreBufferEntry *getEvict(const bool *eligible_tids,
+                                   const InstSeqNum *eligible_seq,
+                                   size_t num_threads);
         StoreBufferEntry *createVice(StoreBufferEntry *entry);
         void release(StoreBufferEntry *entry);
     };
@@ -351,6 +365,8 @@ class LSQ
         AtomicOpFunctorPtr _amo_op;
         bool _hasStaleTranslation;
         bool _sbufferBypass;
+        bool _goldenSnapshotCaptured = false;
+        uint64_t _storeBufferGeneration = 0;
 
         struct FWDPacket
         {
@@ -477,6 +493,7 @@ class LSQ
 
         RequestPtr req(int idx = 0) { return _reqs.at(idx); }
         const RequestPtr req(int idx = 0) const { return _reqs.at(idx); }
+        size_t numReqs() const { return _reqs.size(); }  // entries in _reqs (see req(idx))
 
         Addr getVaddr(int idx = 0) const { return req(idx)->getVaddr(); }
         virtual void initiateTranslation() = 0;
@@ -977,9 +994,29 @@ class LSQ
      * to memory.
      */
     bool hasStoresToWB(ThreadID tid);
+    bool hasStoresToWBBefore(ThreadID tid, InstSeqNum seq_num);
 
     // true if all stores are flushed
     bool flushStores(ThreadID tid);
+    bool flushStores(ThreadID tid, InstSeqNum seq_num);
+    void requestGlobalStoreBufferFlush();
+    bool storeBufferHasConflict(ThreadID tid, Addr block_paddr) const;
+    uint64_t bumpStoreBufferBlockVersion(Addr block_paddr);
+    uint64_t currentStoreBufferBlockVersion(Addr block_paddr) const;
+    void markStoreBufferBlockVisible(Addr block_paddr, uint64_t generation);
+    uint64_t currentStoreBufferVisibleVersion(Addr block_paddr) const;
+    StoreBufferEntry *findForwardingStoreBufferEntry(Addr block_paddr,
+                                                     ThreadID load_tid,
+                                                     InstSeqNum load_seq) const;
+    bool hasLiveStoreBufferBlock(Addr block_paddr) const;
+    void reclaimStoreBufferBlockMetadata(Addr block_paddr);
+    void invalidateOtherThreadStoreBufferBytes(
+        ThreadID tid, Addr paddr, const std::vector<bool> &mask,
+        uint64_t generation);
+    void notifyOtherThreadsStoreVisible(ThreadID tid, Addr store_paddr,
+                                        const std::vector<bool> &byte_enable,
+                                        InstSeqNum store_seq,
+                                        bool replay_executed_loads);
 
     /** Returns the number of stores a specific thread has to write back. */
     int numStoresToSbuffer(ThreadID tid);
@@ -1082,6 +1119,14 @@ class LSQ
     bool getDcacheWriteStall() { return dcacheWriteStall; }
     StoreBuffer &getStoreBuffer() { return storeBuffer; }
     bool storeBufferEmpty() const { return storeBuffer.size() == 0; }
+    /** True when StoreBuffer::size(tid) reports zero entries. */
+    bool storeBufferEmpty(ThreadID tid) const {
+        return !storeBuffer.size(tid);
+    }
+    /** True when StoreBuffer::size(tid, seq_num) reports zero entries. */
+    bool storeBufferEmpty(ThreadID tid, InstSeqNum seq_num) const {
+        return !storeBuffer.size(tid, seq_num);
+    }
     bool storeBufferFlushing(ThreadID tid) const { return _storeBufferFlushing[tid]; }
     bool storeBufferFlushing() const
     {
@@ -1091,10 +1136,15 @@ class LSQ
         }
         return false;
     }
-    void clearStoreBufferFlushing(ThreadID tid) { _storeBufferFlushing[tid] = false; }
+    /** Clear @p tid's flushing flag and reset its flush-before seq to -1. */
+    void clearStoreBufferFlushing(ThreadID tid) {
+        _storeBufferFlushBeforeSeq[tid] = static_cast<InstSeqNum>(-1);
+        _storeBufferFlushing[tid] = false;
+    }
     void clearStoreBufferFlushing() {
         for (auto tid : *activeThreads) {
             _storeBufferFlushing[tid] = false;
+            _storeBufferFlushBeforeSeq[tid] = static_cast<InstSeqNum>(-1);
         }
     }
     uint32_t getSbufferEvictThreshold() const { return sbufferEvictThreshold; }
@@ -1185,7 +1235,12 @@ class LSQ
     const uint64_t storeBufferInactiveThreshold;
     const uint32_t maxStoreBufferEntriesAcceptedFromSQPerCycle = 2;
     StoreBuffer storeBuffer;
+    std::unordered_map<Addr, uint64_t> storeBufferBlockVersion;
+    std::unordered_map<Addr, uint64_t> storeBufferVisibleVersion;
     bool _storeBufferFlushing[MaxThreads] = {false};
+    InstSeqNum _storeBufferFlushBeforeSeq[MaxThreads] = {
+        static_cast<InstSeqNum>(-1)  // NOTE(review): initializes element 0 only;
+    };  // if MaxThreads > 1 the rest presumably start at 0, not all-ones -- confirm
     uint64_t storeBufferWritebackInactive = 0;
     StoreBufferEntry *blockedSbufferEntry = nullptr;
     ThreadID nextStoreBufferOffloadTid = InvalidThreadID;
diff --git a/src/cpu/o3/lsq_unit.cc b/src/cpu/o3/lsq_unit.cc
index 6be535e5df..9d170af470 100644
--- a/src/cpu/o3/lsq_unit.cc
+++ b/src/cpu/o3/lsq_unit.cc
@@ -367,22 +367,24 @@ LSQUnit::completeDataAccess(PacketPtr pkt)
                 assert(size == inst->effSize);
 
                 if (inst->isAtomic()) {
-                    uint8_t *golden_old =
-                        reinterpret_cast<uint8_t *>(inst->getAmoOldGoldenValuePtr());
-                    cpu->goldenMemManager()->readGoldenMem(addr, golden_old, size);
-                    if (memcmp(golden_old, loaded_data, size) != 0) {
-                        panic("[tid:%d] [sn:%llu] Atomic old value error at addr %#lx, "
-                              "size %d. %s\n",
-                              inst->threadNumber, inst->seqNum, addr, size,
-                              goldenDiffStr(loaded_data, golden_old, size).c_str());
-                    }
+                    uint8_t current_golden[8] = {};
+                    panic_if(size > sizeof(current_golden),
+                             "Unexpected AMO size %u at addr %#lx\n",
+                             size, addr);
+                    cpu->goldenMemManager()->readGoldenMem(addr, current_golden,
+                                                           size);
+
+                    // Preserve the DUT-observed old value until completeStore()
+                    // derives the post-AMO memory image. The golden old-value
+                    // snapshot used by difftest is captured when the request
+                    // is first sent, before later concurrent updates can
+                    // advance shared memory.
+                    inst->setGolden(loaded_data);
                 } else {
                     // check data with golden mem
                     uint8_t *golden_data =
                         (uint8_t *)cpu->goldenMemManager()->guestToHost(addr);
-                    if (memcmp(golden_data, loaded_data, size) == 0) {
-                        inst->setGolden(golden_data);
-                    } else {
+                    if (memcmp(golden_data, loaded_data, size) != 0) {
                         DPRINTF(Diff,
                                 "[tid:%d] [sn:%llu] Load sees value different from "
                                 "current golden memory at addr %#lx, size %d. "
@@ -980,6 +982,103 @@ LSQUnit::checkSnoop(PacketPtr pkt)
     return;
 }
 
+namespace
+{
+
+/**
+ * Test whether any byte written by a store lands inside a load request.
+ *
+ * @param load_req Load request to inspect; nullptr yields false.
+ * @param store_paddr Physical address matching index 0 of the mask.
+ * @param store_byte_enable Per-byte mask; only set entries are considered.
+ * @return true if a set byte lies in [paddr, paddr + size) of a
+ *         sub-request that has a physical address, false otherwise.
+ */
+bool
+overlapsVisibleStore(const o3::LSQ::LSQRequest *load_req, Addr store_paddr,
+                     const std::vector<bool> &store_byte_enable)
+{
+    const size_t req_count = load_req ? load_req->numReqs() : 0;
+    for (size_t r = 0; r < req_count; ++r) {
+        const auto sub_req = load_req->req(r);
+        if (sub_req->hasPaddr()) {
+            const Addr lo = sub_req->getPaddr();
+            const Addr hi = lo + sub_req->getSize();
+            // Scan the mask for a written byte inside [lo, hi).
+            for (size_t b = 0; b < store_byte_enable.size(); ++b) {
+                const Addr written = store_paddr + b;
+                if (store_byte_enable[b] && written >= lo && written < hi) {
+                    return true;
+                }
+            }
+        }
+    }
+
+    return false;
+}
+
+} // anonymous namespace
+
+/**
+ * Scan the load queue for loads whose saved request overlaps the enabled
+ * bytes of a store at @p store_paddr. LLSC loads get handleLockedSnoopHit();
+ * loads not yet executed are then set up for nuke replay, executed ones are
+ * only logged. NOTE(review): the last two parameters are never consulted.
+ */
+void
+LSQUnit::checkLocalStoreVisible(
+    Addr store_paddr, const std::vector<bool> &store_byte_enable,
+    [[maybe_unused]] InstSeqNum store_seq,
+    [[maybe_unused]] bool replay_executed_loads)
+{
+    if (loadQueue.empty()) {
+        return;
+    }
+
+    const Addr block_addr = store_paddr & cacheBlockMask;
+    DynInstPtr oldest_violator = memDepViolator;
+
+    for (auto it = loadQueue.begin(); it != loadQueue.end(); ++it) {
+        DynInstPtr ld_inst = it->instruction();
+        if (!ld_inst || ld_inst->isSquashed() || ld_inst->needReplay() ||
+            !ld_inst->effAddrValid() || ld_inst->strictlyOrdered()) {
+            continue;
+        }
+
+        LSQRequest *request = ld_inst->savedRequest;
+        if (!request ||
+            !request->isCacheBlockHit(block_addr, cacheBlockMask) ||
+            !overlapsVisibleStore(request, store_paddr, store_byte_enable)) {
+            continue;
+        }
+        if (ld_inst->memReqFlags & Request::LLSC) {
+            ld_inst->tcBase()->getIsaPtr()->handleLockedSnoopHit(ld_inst.get());
+        }
+
+        if (ld_inst->isExecuted()) {
+            DPRINTF(LSQUnit, "Local visible store ignores already executed load "
+                    "[sn:%lli] on addr %#x\n", ld_inst->seqNum, store_paddr);
+            continue;
+        }
+
+        ld_inst->hitExternalSnoop(true);
+        ld_inst->possibleLoadViolation(true);
+        DPRINTF(LSQUnit,
+                "Local visible store replays not-yet-executed load [sn:%lli] "
+                "on addr %#x\n", ld_inst->seqNum, store_paddr);
+        ld_inst->setNukeReplay();
+        loadSetReplay(ld_inst, request, true);
+    }
+
+    // NOTE(review): fires only if memDepViolator changed during the loop above.
+    if (oldest_violator &&
+        (!memDepViolator || oldest_violator->seqNum < memDepViolator->seqNum)) {
+        memDepViolator = oldest_violator;
+        cpu->activityThisCycle();
+        iewStage->SquashCheckAfterExe(oldest_violator);
+    }
+}
+
 Fault
 LSQUnit::checkViolations(typename LoadQueue::iterator& loadIt,
         const DynInstPtr& inst)
@@ -1102,10 +1201,7 @@ LSQUnit::loadSetReplay(DynInstPtr inst, LSQRequest* request, bool dropReqNow)
     // clear request in loadQueue
     loadQueue[inst->lqIdx].setRequest(nullptr);
     if (dropReqNow) {
-        // discard this request
         request->discard();
-        // TODO: is this essential?
-        inst->savedRequest = nullptr;
     }
 
     DPRINTF(LoadPipeline, "Load [sn:%ld] set replay, dropReqNow: %d\n", inst->seqNum, dropReqNow);
@@ -1523,9 +1619,9 @@ LSQUnit::executeLoadPipeSx()
                 else if (inst->needCacheMissReplay()) iewStage->cacheMissLdReplay(inst);
                 else if (inst->needMdpAddrReplay()) iewStage->mdpAddrReplayPipeDone(inst);
                 else if (inst->needNukeReplay()) {
-                    if (inst->cacheHit()) {
+                    if (inst->savedRequest && inst->cacheHit()) {
                         loadSetReplay(inst, inst->savedRequest, true);
-                    } else if (inst->hasPendingCacheReq()) {
+                    } else if (inst->savedRequest && inst->hasPendingCacheReq()) {
                         loadSetReplay(inst, inst->savedRequest, false);
                     }
                     inst->issueQue->retryMem(inst);
@@ -1902,7 +1998,31 @@ LSQUnit::commitStores(InstSeqNum &youngest_inst)
             if (x.instruction()->seqNum > youngest_inst) {
                 break;
             }
-            assert(x.instruction()->isSplitStoreAddr() ? x.splitStoreFinish() : true);
+            // Commit can publish a new squash to IEW one cycle after IEW has
+            // already received an older doneMemSeqNum. If that stale
+            // doneMemSeqNum reaches here in the same cycle that ROB marks this
+            // store squashed, do not advance SQ writeback state past the
+            // squashed entry; IEW's next-cycle squash will remove it.
+            if (x.instruction()->isSquashed()) {
+                break;
+            }
+            if (x.instruction()->isSplitStoreAddr() && !x.splitStoreFinish()) {
+                panic("Split store reached commitStores unfinished: tid=%d "
+                      "seq=%llu pc=%#lx youngest=%llu canCommit=%d "
+                      "executed=%d squashed=%d addrReady=%d dataReady=%d "
+                      "staFinish=%d stdFinish=%d canWB=%d completed=%d\n",
+                      x.instruction()->threadNumber,
+                      static_cast<unsigned long long>(
+                          x.instruction()->seqNum),
+                      x.instruction()->pcState().instAddr(),
+                      static_cast<unsigned long long>(youngest_inst),
+                      x.instruction()->readyToCommit(),
+                      x.instruction()->isExecuted(),
+                      x.instruction()->isSquashed(),
+                      x.addrReady(), x.dataReady(),
+                      x.staFinish(), x.stdFinish(),
+                      x.canWB(), x.completed());
+            }
             DPRINTF(LSQUnit, "Marking store as able to write back, PC "
                     "%s [sn:%lli]\n",
                     x.instruction()->pcState(),
@@ -1915,6 +2035,31 @@ LSQUnit::commitStores(InstSeqNum &youngest_inst)
     }
 }
 
+/**
+ * Report whether a store older than @p seq_num is still waiting to be
+ * written back: its entry reports canWB() but not completed().
+ */
+bool
+LSQUnit::hasStoresToWBBefore(InstSeqNum seq_num) const
+{
+    // storesToWB == 0 answers the question without walking the queue.
+    if (storesToWB == 0) {
+        return false;
+    }
+
+    // The walk stops at the first valid entry that is not older than seq_num.
+    for (auto it = storeQueue.begin(); it != storeQueue.end(); ++it) {
+        const bool occupied = it->valid() && it->instruction();
+        if (occupied && it->instruction()->seqNum >= seq_num) {
+            return false;
+        }
+        if (occupied && it->canWB() && !it->completed()) {
+            return true;
+        }
+    }
+    return false;
+}
+
 bool
 LSQUnit::writebackBlockedStore()
 {
@@ -1922,8 +2067,25 @@ LSQUnit::writebackBlockedStore()
         return false;
     }
 
-    storeWBIt->request()->sendPacketToCache();
-    if (storeWBIt->request()->isSent()) {
+    auto *request = storeWBIt->request();
+    const auto &inst = storeWBIt->instruction();
+
+    if (request->mainReq()->hasPaddr() &&
+        system->multiContextDifftest() && inst->isAtomic() &&
+        cpu->goldenMemManager() &&
+        cpu->goldenMemManager()->inPmem(request->mainReq()->getPaddr())) {
+        uint8_t issue_golden[8] = {};
+        panic_if(request->_size > sizeof(issue_golden),
+                 "Unexpected AMO size %u at addr %#lx\n",
+                 request->_size, request->mainReq()->getPaddr());
+        cpu->goldenMemManager()->readGoldenMem(
+            request->mainReq()->getPaddr(), issue_golden, request->_size);
+        std::memcpy(inst->getAmoOldGoldenValuePtr(), issue_golden,
+                    request->_size);
+    }
+
+    request->sendPacketToCache();
+    if (request->isSent()) {
         storePostSend();
     }
     return isStoreBlocked;
@@ -1934,6 +2096,7 @@ LSQUnit::directStoreToCache()
 {
     DynInstPtr inst = storeWBIt->instruction();
     LSQRequest* request = storeWBIt->request();
+
     if ((request->mainReq()->isLLSC() || request->mainReq()->isRelease()) && (storeWBIt.idx() != storeQueue.head())) {
         DPRINTF(LSQUnit,
                 "Store idx:%i PC:%s to Addr:%#x "
@@ -1982,6 +2145,28 @@ LSQUnit::directStoreToCache()
         }
     }
 
+    if (request->mainReq()->hasPaddr()) {
+        if (request->_storeBufferGeneration == 0) {
+            const Addr block_paddr =
+                request->mainReq()->getPaddr() & cacheBlockMask;
+            request->_storeBufferGeneration =
+                lsq->bumpStoreBufferBlockVersion(block_paddr);
+        }
+
+        if (system->multiContextDifftest() && inst->isAtomic() &&
+            cpu->goldenMemManager() &&
+            cpu->goldenMemManager()->inPmem(request->mainReq()->getPaddr())) {
+            uint8_t issue_golden[8] = {};
+            panic_if(request->_size > sizeof(issue_golden),
+                     "Unexpected AMO size %u at addr %#lx\n",
+                     request->_size, request->mainReq()->getPaddr());
+            cpu->goldenMemManager()->readGoldenMem(
+                request->mainReq()->getPaddr(), issue_golden, request->_size);
+            std::memcpy(inst->getAmoOldGoldenValuePtr(), issue_golden,
+                        request->_size);
+        }
+    }
+
     if (request->mainReq()->isLocalAccess()) {
         assert(!inst->isStoreConditional());
         assert(!inst->inHtmTransactionalState());
@@ -2074,17 +2259,20 @@ LSQUnit::offloadToStoreBuffer(uint32_t max_entries)
             request->mainReq()->isRelease() ||
             request->mainReq()->isStrictlyOrdered() ||
             inst->isStoreConditional()) {
-            DPRINTF(StoreBuffer, "Find atomic/SC store [sn:%llu]\n", storeWBIt->instruction()->seqNum);
             if (!(storeWBIt.idx() == storeQueue.head())) {
-                DPRINTF(StoreBuffer, "atomic/SC store waiting\n");
                 break;
             }
-            if (!storeBufferEmpty()) {
-                DPRINTF(StoreBuffer, "sbuffer need flush\n");
+            if (request->mainReq()->hasPaddr()) {
+                const Addr block_paddr =
+                    request->mainReq()->getPaddr() & cacheBlockMask;
+                if (lsq->storeBufferHasConflict(lsqID, block_paddr)) {
+                    lsq->requestGlobalStoreBufferFlush();
+                    break;
+                }
+            }
+            if (!storeBufferEmpty(lsqID)) {
                 lsq->flushStores(lsqID);
                 break;
-            } else {
-                DPRINTF(StoreBuffer, "sbuffer finishing flushed\n");
             }
             bool contin = directStoreToCache();
             if (isStoreBlocked) {
@@ -2107,8 +2295,9 @@ LSQUnit::offloadToStoreBuffer(uint32_t max_entries)
                 uint64_t offset = vaddr - vbase;
                 DPRINTF(LSQUnit, "Spilt store idx %d [sn:%lli] insert into sbuffer\n", i, inst->seqNum);
                 assert(offset + req->getSize() <= storeWBIt->size());
-                bool success = insertStoreBuffer(vaddr, paddr, (uint8_t *)storeWBIt->data() + offset, req->getSize(),
-                                                 req->getByteEnable());
+                bool success = insertStoreBuffer(
+                    vaddr, paddr, (uint8_t *)storeWBIt->data() + offset,
+                    req->getSize(), req->getByteEnable(), inst->seqNum);
                 if (success) {
                     request->_numOutstandingPackets++;
                 } else {
@@ -2128,8 +2317,9 @@ LSQUnit::offloadToStoreBuffer(uint32_t max_entries)
             Addr vaddr = request->getVaddr();
             Addr paddr = request->mainReq()->getPaddr();
             DPRINTF(LSQUnit, "Store [sn:%lli] insert into sbuffer\n", inst->seqNum);
-            bool success = insertStoreBuffer(vaddr, paddr, (uint8_t *)storeWBIt->data(), request->_size,
-                                             request->mainReq()->getByteEnable());
+            bool success = insertStoreBuffer(
+                vaddr, paddr, (uint8_t *)storeWBIt->data(), request->_size,
+                request->mainReq()->getByteEnable(), inst->seqNum);
             if (!success) {
                 break;
             }
@@ -2141,7 +2331,10 @@ LSQUnit::offloadToStoreBuffer(uint32_t max_entries)
     }
 }
 
-bool LSQUnit::insertStoreBuffer(Addr vaddr, Addr paddr, uint8_t* datas, uint64_t size, const std::vector<bool>& mask)
+bool
+LSQUnit::insertStoreBuffer(Addr vaddr, Addr paddr, uint8_t* datas,
+                           uint64_t size, const std::vector<bool>& mask,
+                           InstSeqNum store_seq)
 {
     auto &storeBuffer = lsq->getStoreBuffer();
     // access range must in a cache block
@@ -2149,15 +2342,19 @@ bool LSQUnit::insertStoreBuffer(Addr vaddr, Addr paddr, uint8_t* datas, uint64_t
     Addr blockVaddr = vaddr & cacheBlockMask;
     Addr blockPaddr = paddr & cacheBlockMask;
     Addr offset = paddr & ~cacheBlockMask;
+
     // check request is not already in the storebuffer
     auto entry = storeBuffer.get(lsqID, blockPaddr);
+    const auto generation = lsq->bumpStoreBufferBlockVersion(blockPaddr);
+
     if (entry) {
         if (entry->sending) {
             if (entry->vice) {
                 // merge into vice
                 stats.sbufferMerge++;
                 entry = entry->vice;
-                entry->merge(offset, datas, size, mask);
+                entry->merge(offset, datas, size, mask, generation);
+                entry->generation = generation;
                 DPRINTF(StoreBuffer, "Merging vice entry[%#x] for addr %#x\n",
                         blockPaddr, paddr);
             } else {
@@ -2170,7 +2367,9 @@ bool LSQUnit::insertStoreBuffer(Addr vaddr, Addr paddr, uint8_t* datas, uint64_t
                 stats.sbufferNewline++;
                 stats.sbufferCreateVice++;
                 auto vice = storeBuffer.createVice(entry);
-                vice->reset(lsqID, blockVaddr, blockPaddr, offset, datas, size, mask);
+                vice->reset(lsqID, store_seq, blockVaddr, blockPaddr, offset,
+                            datas, size, mask, generation);
+                vice->generation = generation;
                 DPRINTF(StoreBuffer, "Create new vice entry[%#x] for addr %#x\n",
                         blockPaddr, paddr);
             }
@@ -2178,7 +2377,9 @@ bool LSQUnit::insertStoreBuffer(Addr vaddr, Addr paddr, uint8_t* datas, uint64_t
             // merge into unsent
             stats.sbufferMerge++;
             storeBuffer.update(entry->index);
-            entry->merge(offset, datas, size, mask);
+            entry->merge(offset, datas, size, mask, generation);
+            entry->seqNum = std::max(entry->seqNum, store_seq);
+            entry->generation = generation;
             DPRINTF(StoreBuffer, "Merging entry[%#x] for addr %#x\n",
                     blockPaddr, paddr);
         }
@@ -2192,7 +2393,9 @@ bool LSQUnit::insertStoreBuffer(Addr vaddr, Addr paddr, uint8_t* datas, uint64_t
         // insert
         stats.sbufferNewline++;
         auto entry = storeBuffer.getEmpty();
-        entry->reset(lsqID, blockVaddr, blockPaddr, offset, datas, size, mask);
+        entry->reset(lsqID, store_seq, blockVaddr, blockPaddr, offset, datas,
+                     size, mask, generation);
+        entry->generation = generation;
         storeBuffer.insert(entry);
         DPRINTF(StoreBuffer, "Create new entry[%#x] for addr %#x\n",
                 blockPaddr, paddr);
@@ -2411,6 +2614,7 @@ LSQUnit::squash(const InstSeqNum &squashed_num)
             break;
         }
     }
+
 }
 
 uint64_t
@@ -2538,11 +2742,41 @@ LSQUnit::completeStore(typename StoreQueue::iterator store_idx, bool from_sbuffe
      * store queue. */
     DynInstPtr store_inst = store_idx->instruction();
     auto request = store_idx->request();
-
     DPRINTF(LSQUnit, "Completing store [sn:%lli], idx:%i, store head "
             "idx:%i\n",
             store_inst->seqNum, store_idx.idx() - 1, storeQueue.head() - 1);
 
+    if (!from_sbuffer &&
+        (!store_inst->isStoreConditional() || store_inst->lockedWriteSuccess()) &&
+        request->mainReq()->hasPaddr()) {
+        const Addr block_paddr = request->mainReq()->getPaddr() & cacheBlockMask;
+        auto generation = request->_storeBufferGeneration;
+        const bool replay_executed_loads =
+            store_inst->isAtomic() || cpu->consumeSyncVisibleStoreReplay(lsqID);
+        if (generation == 0) {
+            generation = lsq->bumpStoreBufferBlockVersion(block_paddr);
+        }
+        lsq->invalidateOtherThreadStoreBufferBytes(
+            lsqID, request->mainReq()->getPaddr(),
+            request->mainReq()->getByteEnable(), generation);
+        lsq->markStoreBufferBlockVisible(block_paddr, generation);
+        lsq->notifyOtherThreadsStoreVisible(lsqID,
+            request->mainReq()->getPaddr(),
+            request->mainReq()->getByteEnable(), store_inst->seqNum,
+            replay_executed_loads);
+    }
+
+    if (from_sbuffer &&
+        (!store_inst->isStoreConditional() || store_inst->lockedWriteSuccess()) &&
+        request->mainReq()->hasPaddr()) {
+        auto generation = request->_storeBufferGeneration;
+        if (generation == 0) {
+            generation = lsq->bumpStoreBufferBlockVersion(
+                request->mainReq()->getPaddr() & cacheBlockMask);
+            request->_storeBufferGeneration = generation;
+        }
+    }
+
     if (!from_sbuffer &&
         (!store_inst->isStoreConditional() || store_inst->lockedWriteSuccess()) &&
         cpu->goldenMemManager() &&
@@ -2559,9 +2793,10 @@ LSQUnit::completeStore(typename StoreQueue::iterator store_idx, bool from_sbuffe
             assert(request->req()->getAtomicOpFunctor());
 
             // The AMO response returns the old memory value. Capture it on the
-            // instruction so commit/difftest can use a per-inst copy under SMT.
-            cpu->diffInfo.amoOldGoldenValue = store_inst->getAmoOldGoldenValue();
-            memcpy(tmp_data, store_inst->getAmoOldGoldenValuePtr(), request->_size);
+            // instruction so commit/difftest can use a per-inst golden copy
+            // under SMT, but derive the new memory image from the DUT-observed
+            // old value captured in goldenData.
+            memcpy(tmp_data, store_inst->getGolden(), request->_size);
 
             (*(request->req()->getAtomicOpFunctor()))(tmp_data);
 
@@ -2675,11 +2910,15 @@ LSQUnit::trySendPacket(bool isLoad, PacketPtr data_pkt, bool &bank_conflict, boo
         request->packetSent();
 
         if (isLoad) {
-            auto &storeBuffer = lsq->getStoreBuffer();
-            auto entry = storeBuffer.get(lsqID, pkt->getAddr() & cacheBlockMask);
+            const Addr block_addr = pkt->getAddr() & cacheBlockMask;
+            auto entry = lsq->findForwardingStoreBufferEntry(
+                block_addr, lsqID, request->instruction()->seqNum);
             if (entry) {
                 DPRINTF(StoreBuffer, "sbuffer entry[%#x] coverage %s\n", entry->blockPaddr, pkt->print());
-                if (entry->recordForward(pkt->req, request)) {
+                if (entry->recordForward(
+                        pkt->req, request, lsqID,
+                        request->instruction()->seqNum,
+                        lsq->currentStoreBufferVisibleVersion(block_addr))) {
                     assert(request->isSplit()); // here must be split request
                     stats.sbufferFullForward++;
                 } else if (!request->SBforwardPackets.empty()) {
@@ -2864,8 +3103,12 @@ LSQUnit::dumpInsts() const
     for (auto it = storeQueue.begin(); it != storeQueue.end(); ++it) {
         if (it->valid()) {
             const DynInstPtr &inst(it->instruction());
-            cprintf("idx:%d %s.[sn:%llu] %s\n", it.idx(), inst->pcState(), inst->seqNum,
-                    it->addrReady() ? "AddrReady" : "Not AddrReady");
+            cprintf("idx:%d %s.[sn:%llu] %s squashed=%d canWB=%d completed=%d "
+                    "dataReady=%d staFinish=%d stdFinish=%d\n",
+                    it.idx(), inst->pcState(), inst->seqNum,
+                    it->addrReady() ? "AddrReady" : "Not AddrReady",
+                    inst->isSquashed(), it->canWB(), it->completed(),
+                    it->dataReady(), it->staFinish(), it->stdFinish());
         }
     }
 
@@ -3097,19 +3340,37 @@ LSQUnit::read(LSQRequest *request, ssize_t load_idx)
     }
 
     if (request) {
+        request->SBforwardPackets.clear();
         request->SQforwardPackets.clear();
+        request->_sbufferBypass = false;
+        if (!load_inst->hasPendingCacheReq()) {
+            request->_goldenSnapshotCaptured = false;
+        }
     }
 
     // Check the SQ for any previous stores that might lead to forwarding
     auto store_it = load_inst->sqIt;
-    panic_if(store_it < storeWBIt, "[sn:%llu] Load instruction's store index is younger than store writeback index",
-             load_inst->seqNum);
-    // End once we've reached the top of the LSQ
-    while (store_it != storeWBIt && !load_inst->isDataPrefetch()) {
+    if (storeWBIt.dereferenceable()) {
+        panic_if(store_it < storeWBIt,
+                 "[sn:%llu] Load instruction's store index is younger than "
+                 "store writeback index",
+                 load_inst->seqNum);
+    }
+    // End once we've reached the top of the LSQ. If storeWBIt is end(), there
+    // is no outstanding SQ forwarding window to scan.
+    while (storeWBIt.dereferenceable() &&
+           store_it != storeWBIt &&
+           !load_inst->isDataPrefetch()) {
         // Move the index to one younger
         store_it--;
         assert(store_it->valid());
         assert(store_it->instruction()->seqNum < load_inst->seqNum);
+        auto store_req = store_it->request();
+
+        if (store_it->completed()) {
+            continue;
+        }
+
         int store_size = store_it->size();
 
         // Cache maintenance instructions go down via the store
@@ -3244,9 +3505,6 @@ LSQUnit::read(LSQRequest *request, ssize_t load_idx)
                             "addr %#x, data: %#lx\n", store_it->instruction()->seqNum, load_inst->seqNum,
                             request->mainReq()->getPaddr(), *((uint64_t*)buffer));
                 }
-
-
-
                 load_inst->setFullForward();
 
                 // Don't need to do anything special for split loads.
@@ -3298,11 +3556,13 @@ LSQUnit::read(LSQRequest *request, ssize_t load_idx)
     // sbuffer forward
     if (!load_inst->isDataPrefetch() && !request->isSplit()) {
         Addr blk_addr = request->mainReq()->getPaddr() & cacheBlockMask;
-        int offset = request->mainReq()->getPaddr() & ~cacheBlockMask;
-        auto &storeBuffer = lsq->getStoreBuffer();
-        auto entry = storeBuffer.get(lsqID, blk_addr);
+        auto entry = lsq->findForwardingStoreBufferEntry(
+            blk_addr, lsqID, load_inst->seqNum);
         if (entry) {
-            if (entry->recordForward(request->mainReq(), request)) {
+            if (entry->recordForward(request->mainReq(), request, lsqID,
+                                     load_inst->seqNum,
+                                     lsq->currentStoreBufferVisibleVersion(
+                                         blk_addr))) {
                 // full forward
                 // no need to send to cache
                 stats.sbufferFullForward++;
@@ -3317,7 +3577,6 @@ LSQUnit::read(LSQRequest *request, ssize_t load_idx)
                     DPRINTF(LoadPipeline, "Load [sn:%llu] forward from sbuffer, data: %lx\n",
                             load_inst->seqNum, *((uint64_t*)buffer));
                 }
-
                 return NoFault;
             }
         }
@@ -3363,9 +3622,21 @@ LSQUnit::read(LSQRequest *request, ssize_t load_idx)
     } else {
         DPRINTF(LoadPipeline, "Load [sn:%llu] sendPacketToCache\n", load_inst->seqNum);
         // if cannot forward from bus, do real cache access
+        bool should_capture_golden =
+            system->multiContextDifftest() &&
+            cpu->goldenMemManager() &&
+            cpu->goldenMemManager()->inPmem(request->mainReq()->getPaddr()) &&
+            !request->_goldenSnapshotCaptured;
         request->buildPackets();
         // if the cache is not blocked, do cache access
         request->sendPacketToCache();
+        if (request->isSent() && should_capture_golden) {
+            uint8_t *issue_golden =
+                (uint8_t *)cpu->goldenMemManager()->guestToHost(
+                    request->mainReq()->getPaddr());
+            load_inst->setGolden(issue_golden);
+            request->_goldenSnapshotCaptured = true;
+        }
         if (!request->isSent() && !load_inst->needBankConflictReplay() && !load_inst->needMshrArbFailReplay() &&
             !load_inst->needMshrAliasFailReplay() &&!load_inst->needHitInWriteBufferReplay()) {
             iewStage->blockMemInst(load_inst);
diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh
index 4dace5eb99..76496f94d1 100644
--- a/src/cpu/o3/lsq_unit.hh
+++ b/src/cpu/o3/lsq_unit.hh
@@ -212,6 +212,8 @@ class LSQUnit
 
         bool addrReady() const { return _addrReady; }
         bool dataReady() const { return _dataReady; }
+        bool staFinish() const { return _staFinish; }
+        bool stdFinish() const { return _stdFinish; }
         bool canForwardToLoad() const { return _addrReady && _dataReady; }
         bool splitStoreFinish() const { return _staFinish && _stdFinish; }
 
@@ -326,6 +328,10 @@ class LSQUnit
      * of the intermediate invalidate.
      */
     void checkSnoop(PacketPtr pkt);
+    void checkLocalStoreVisible(Addr store_paddr,
+                                const std::vector<bool> &store_byte_enable,
+                                InstSeqNum store_seq,
+                                bool replay_executed_loads);
 
     /** Iq issues a load to load pipeline. */
     void issueToLoadPipe(const DynInstPtr &inst);
@@ -353,9 +359,12 @@ class LSQUnit
     /** Writes back stores. */
     void offloadToStoreBuffer(uint32_t max_entries);
 
-    bool insertStoreBuffer(Addr vaddr, Addr paddr, uint8_t* datas, uint64_t size, const std::vector<bool>& mask);
+    bool insertStoreBuffer(Addr vaddr, Addr paddr, uint8_t* datas,
+                           uint64_t size, const std::vector<bool>& mask,
+                           InstSeqNum store_seq);
 
     bool storeBufferEmpty() { return lsq->storeBufferEmpty(); }
+    bool storeBufferEmpty(ThreadID tid) { return lsq->storeBufferEmpty(tid); }
     bool storeBufferSQWillFull() const
     {
         return storeQueue.size() > sqFullUpperLimit;
@@ -438,6 +447,9 @@ class LSQUnit
     /** Returns if there are any stores to writeback. */
     bool hasStoresToWB() { return storesToWB > 0; }
 
+    /** Returns if there are older stores/atomics still pending writeback. */
+    bool hasStoresToWBBefore(InstSeqNum seq_num) const;
+
     /** Returns the number of stores to writeback. */
     int numStoresToSbuffer() { return storesToWB; }
 

From 8485ee612541f8c96b985d8644df548c0236a52e Mon Sep 17 00:00:00 2001
From: tastynoob <934348725@qq.com>
Date: Mon, 30 Mar 2026 17:37:59 +0800
Subject: [PATCH 07/38] cpu-o3: fix storeData uop squash

Change-Id: I146d1ac20d06015e98713f30bae71fef3f5d7bcf
---
 src/cpu/o3/iew.cc      |  5 +++++
 src/cpu/o3/lsq.cc      |  8 ++++++++
 src/cpu/o3/lsq.hh      |  1 +
 src/cpu/o3/lsq_unit.cc | 28 ++++++++++++++++++++++++++++
 src/cpu/o3/lsq_unit.hh |  1 +
 5 files changed, 43 insertions(+)

diff --git a/src/cpu/o3/iew.cc b/src/cpu/o3/iew.cc
index 7ea9c872ba..412fc9ab57 100644
--- a/src/cpu/o3/iew.cc
+++ b/src/cpu/o3/iew.cc
@@ -1576,6 +1576,11 @@ IEW::executeInsts()
         // executing
         ppExecute->notify(inst);
 
+        if (inst->isSplitStoreData() &&
+            ldstQueue.splitStoreAddrSquashed(inst)) {
+            inst->setSquashed();
+        }
+
         // Check if the instruction is squashed; if so then skip it
         if (inst->isSquashed()) {
             DPRINTF(IEW, "Execute: Instruction was squashed. PC: %s, [tid:%i]"
diff --git a/src/cpu/o3/lsq.cc b/src/cpu/o3/lsq.cc
index 9cc59f560d..c7d0c60a9d 100644
--- a/src/cpu/o3/lsq.cc
+++ b/src/cpu/o3/lsq.cc
@@ -850,6 +850,14 @@ LSQ::insertStore(const DynInstPtr &store_inst)
     thread[tid].insertStore(store_inst);
 }
 
+bool
+LSQ::splitStoreAddrSquashed(const DynInstPtr &inst)
+{
+    ThreadID tid = inst->threadNumber;
+
+    return thread[tid].splitStoreAddrSquashed(inst);
+}
+
 void
 LSQ::issueToLoadPipe(const DynInstPtr &inst)
 {
diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh
index 83f47b5b91..257cf48354 100644
--- a/src/cpu/o3/lsq.hh
+++ b/src/cpu/o3/lsq.hh
@@ -841,6 +841,7 @@ class LSQ
     void insertLoad(const DynInstPtr &load_inst);
     /** Inserts a store into the LSQ. */
     void insertStore(const DynInstPtr &store_inst);
+    bool splitStoreAddrSquashed(const DynInstPtr &inst);
 
     /** Executes an amo inst. */
     Fault executeAmo(const DynInstPtr &inst);
diff --git a/src/cpu/o3/lsq_unit.cc b/src/cpu/o3/lsq_unit.cc
index 9d170af470..76978531c1 100644
--- a/src/cpu/o3/lsq_unit.cc
+++ b/src/cpu/o3/lsq_unit.cc
@@ -833,6 +833,30 @@ LSQUnit::insertStore(const DynInstPtr& store_inst)
     storeQueue.back().set(store_inst);
 }
 
+bool
+LSQUnit::splitStoreAddrSquashed(const DynInstPtr &inst)
+{
+    if (!inst->isSplitStoreData()) {
+        return false;
+    }
+
+    if (!storeQueue.isValidIdx(inst->sqIdx)) {
+        return true;
+    }
+
+    auto sq_it = storeQueue.getIterator(inst->sqIdx);
+    if (!sq_it->valid()) {
+        return true;
+    }
+
+    const auto &sta_inst = sq_it->instruction();
+    if (!sta_inst || sta_inst->seqNum != inst->seqNum) {
+        return true;
+    }
+
+    return sta_inst->isSquashed();
+}
+
 bool
 LSQUnit::pipeLineNukeCheck(const DynInstPtr &load_inst, const DynInstPtr &store_inst)
 {
@@ -1780,6 +1804,10 @@ LSQUnit::executeStorePipeSx()
                 continue;
             }
 
+            if (splitStoreAddrSquashed(inst)) {
+                inst->setSquashed();
+            }
+
             if (inst->isSquashed()) {
                 DPRINTF(StorePipeline, "Execute: Instruction was squashed. PC: %s, [tid:%i]"
                     " [sn:%llu]\n", inst->pcState(), inst->threadNumber,
diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh
index 76496f94d1..633c952d8f 100644
--- a/src/cpu/o3/lsq_unit.hh
+++ b/src/cpu/o3/lsq_unit.hh
@@ -304,6 +304,7 @@ class LSQUnit
     void insertLoad(const DynInstPtr &load_inst);
     /** Inserts a store instruction. */
     void insertStore(const DynInstPtr &store_inst);
+    bool splitStoreAddrSquashed(const DynInstPtr &inst);
 
     /** Check for ordering violations in the LSQ. For a store squash if we
      * ever find a conflicting load. For a load, only squash if we

From 1fcfb256aabf7f5307229df1120935b16cd61098 Mon Sep 17 00:00:00 2001
From: tastynoob <934348725@qq.com>
Date: Tue, 31 Mar 2026 11:26:19 +0800
Subject: [PATCH 08/38] cpu-o3: fix squash drain and wakeup recovery

Change-Id: Icc05a7320ee5bf1495ef98694c3e92847613d79e
---
 src/cpu/o3/commit.cc      |  2 +-
 src/cpu/o3/issue_queue.cc | 26 +++++++++++++++++++-------
 src/cpu/o3/issue_queue.hh |  3 ++-
 src/cpu/o3/rename.cc      |  7 +++++--
 src/cpu/o3/rob.cc         | 30 ++++++++++++++++++++++++++++++
 src/cpu/o3/rob.hh         |  5 +++++
 6 files changed, 62 insertions(+), 11 deletions(-)

diff --git a/src/cpu/o3/commit.cc b/src/cpu/o3/commit.cc
index e289754896..746a39872b 100644
--- a/src/cpu/o3/commit.cc
+++ b/src/cpu/o3/commit.cc
@@ -1283,7 +1283,7 @@ Commit::commitInsts()
                 DPRINTF(Commit, "Retiring squashed instruction from "
                         "ROB.\n");
 
-                rob->retireHead(commit_thread);
+                rob->drainSquashedHead(commit_thread);
 
                 ++stats.commitSquashedInsts;
                 // Notify potential listeners that this instruction is squashed
diff --git a/src/cpu/o3/issue_queue.cc b/src/cpu/o3/issue_queue.cc
index f2d09e17de..d8eaaae2cb 100644
--- a/src/cpu/o3/issue_queue.cc
+++ b/src/cpu/o3/issue_queue.cc
@@ -393,7 +393,9 @@ IssueQue::checkScoreboard(const DynInstPtr& inst)
         }
         // check bypass data ready or not
         if (!scheduler->bypassScoreboard[src->flatIndex()]) [[unlikely]] {
-            auto dst_inst = scheduler->getInstByDstReg(src->flatIndex());
+            auto dst_inst = scheduler->getInstByDstReg(src->flatIndex(),
+                                                       inst->threadNumber,
+                                                       inst->seqNum);
             assert(dst_inst);
             if (!dst_inst->isLoad()) panic("dst[sn:%llu] is not load, src[sn:%llu]", dst_inst->seqNum, inst->seqNum);
             warn_once(
@@ -1270,18 +1272,28 @@ Scheduler::ready(OpClass op, int disp_seq)
 }
 
 DynInstPtr
-Scheduler::getInstByDstReg(RegIndex flatIdx)
+Scheduler::getInstByDstReg(RegIndex flatIdx, ThreadID tid,
+                           InstSeqNum consumerSeqNum)
 {
+    DynInstPtr candidate = nullptr;
+
     for (auto iq : issueQues) {
-        for (auto& inst : iq->instList) {
-            for (auto i = 0; i < inst->numDestRegs(); i++) {
-                if (inst->renamedDestIdx(i)->flatIndex() == flatIdx) {
-                    return inst;
+        for (auto &inst : iq->instList) {
+            if (inst->threadNumber != tid || inst->seqNum >= consumerSeqNum) {
+                continue;
+            }
+            for (int i = 0; i < inst->numDestRegs(); i++) {
+                if (inst->renamedDestIdx(i)->flatIndex() != flatIdx) {
+                    continue;
+                }
+                if (!candidate || inst->seqNum > candidate->seqNum) {
+                    candidate = inst;
                 }
             }
         }
     }
-    return nullptr;
+
+    return candidate;
 }
 
 void
diff --git a/src/cpu/o3/issue_queue.hh b/src/cpu/o3/issue_queue.hh
index a91da979db..a4416663a0 100644
--- a/src/cpu/o3/issue_queue.hh
+++ b/src/cpu/o3/issue_queue.hh
@@ -372,7 +372,8 @@ class Scheduler : public SimObject
     void issueAndSelect();
     void lookahead(std::deque<DynInstPtr>& insts);
     bool ready(const DynInstPtr& inst, int disp_seq);
-    DynInstPtr getInstByDstReg(RegIndex flatIdx);
+    DynInstPtr getInstByDstReg(RegIndex flatIdx, ThreadID tid,
+                               InstSeqNum consumerSeqNum);
 
     void addProducer(const DynInstPtr& inst);
     // return true if insert successful
diff --git a/src/cpu/o3/rename.cc b/src/cpu/o3/rename.cc
index 84e3e0e031..02c4f40144 100644
--- a/src/cpu/o3/rename.cc
+++ b/src/cpu/o3/rename.cc
@@ -449,9 +449,12 @@ Rename::releasePhysRegs()
         }
 
         removeFromHistory(releaseSeq[tid], tid);
-        // If we committed this cycle then doneSeqNum will be > 0
+        // doneSeqNum is also reused as a squash-progress marker while the
+        // ROB is walking younger entries. Only real commit progress should
+        // release physical registers.
         if (fromCommit->commitInfo[tid].doneSeqNum != 0 &&
-            !fromCommit->commitInfo[tid].squash) {
+            !fromCommit->commitInfo[tid].squash &&
+            !fromCommit->commitInfo[tid].robSquashing) {
 
             finalCommitSeq[tid] = fromCommit->commitInfo[tid].doneSeqNum;
             releaseSeq[tid] =
diff --git a/src/cpu/o3/rob.cc b/src/cpu/o3/rob.cc
index 410d7dcfac..d57ea8b0df 100644
--- a/src/cpu/o3/rob.cc
+++ b/src/cpu/o3/rob.cc
@@ -428,6 +428,36 @@ ROB::retireHead(ThreadID tid)
     cpu->removeFrontInst(head_inst);
 }
 
+void
+ROB::drainSquashedHead(ThreadID tid)
+{
+    stats.writes++;
+
+    assert(numInstsInROB > 0);
+
+    InstIt head_it = instList[tid].begin();
+
+    DynInstPtr head_inst = std::move(*head_it);
+    instList[tid].erase(head_it);
+
+    assert(head_inst->readyToCommit());
+    assert(head_inst->isSquashed());
+
+    DPRINTF(ROB, "[tid:%i] Draining squashed head instruction, "
+            "instruction PC %s, [sn:%llu]\n", tid, head_inst->pcState(),
+            head_inst->seqNum);
+
+    --numInstsInROB;
+
+    commitGroup(head_inst, tid);
+
+    head_inst->clearInROB();
+
+    updateHead();
+
+    cpu->removeFrontInst(head_inst);
+}
+
 bool
 ROB::isHeadGroupReady(ThreadID tid)
 {
diff --git a/src/cpu/o3/rob.hh b/src/cpu/o3/rob.hh
index 1fdcbf0857..94b93d2593 100644
--- a/src/cpu/o3/rob.hh
+++ b/src/cpu/o3/rob.hh
@@ -164,6 +164,11 @@ class ROB
      */
     void retireHead(ThreadID tid);
 
+    /** Drains a squashed head instruction from a specific thread without
+     *  marking it committed.
+     */
+    void drainSquashedHead(ThreadID tid);
+
     /** Is the oldest instruction across all threads ready. */
 //    bool isHeadReady();
 

From a516e81504b6c7a4984c70e0ed13e5e8090ce736 Mon Sep 17 00:00:00 2001
From: mhnGitHubz <66786667+mhnGitHubz@users.noreply.github.com>
Date: Tue, 31 Mar 2026 19:51:53 +0800
Subject: [PATCH 09/38] cpu-o3: fix iew smt squash (#809)

Co-authored-by: mo haonan <mohaonan@node024.bosccluster.com>
---
 src/cpu/o3/iew.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/cpu/o3/iew.cc b/src/cpu/o3/iew.cc
index 412fc9ab57..a98f92f1ff 100644
--- a/src/cpu/o3/iew.cc
+++ b/src/cpu/o3/iew.cc
@@ -830,7 +830,6 @@ IEW::checkSquash()
             fetchRedirect[i] = false;
             iewStats.stallEvents[ROBWalk]++;
             setAllStalls(StallReason::CommitSquash);
-            return;
         }
 
         if (fromCommit->commitInfo[i].robSquashing) {

From 0b4960f828010acb6a9a00836bcc6f118f33561c Mon Sep 17 00:00:00 2001
From: tastynoob <934348725@qq.com>
Date: Thu, 2 Apr 2026 10:50:17 +0800
Subject: [PATCH 10/38] cpu-o3: fix smt fetch squash & load wakeup & iq init

Change-Id: Ic416b537a4f2c87059c92c7b5be81618b1898e22
---
 src/arch/riscv/tlb.cc               |  1 -
 src/cpu/o3/issue_queue.cc           |  1 +
 src/cpu/o3/lsq.cc                   | 42 +++++++++++++++++++++++++++++
 src/cpu/o3/lsq.hh                   |  6 +++++
 src/cpu/o3/lsq_unit.cc              | 16 ++++++++---
 src/cpu/o3/lsq_unit.hh              |  2 +-
 src/cpu/o3/smt_sched.hh             |  8 +++---
 src/cpu/pred/btb/decoupled_bpred.cc |  3 ++-
 src/cpu/pred/btb/ftq.cc             | 15 +++++++++++
 src/cpu/pred/btb/ftq.hh             |  1 +
 10 files changed, 85 insertions(+), 10 deletions(-)

diff --git a/src/arch/riscv/tlb.cc b/src/arch/riscv/tlb.cc
index 96077f8273..050e0735b8 100644
--- a/src/arch/riscv/tlb.cc
+++ b/src/arch/riscv/tlb.cc
@@ -2146,7 +2146,6 @@ TLB::doTranslate(const RequestPtr &req, ThreadContext *tc,
 
     return NoFault;
 }
-
 PrivilegeMode
 TLB::getMemPriv(ThreadContext *tc, BaseMMU::Mode mode)
 {
diff --git a/src/cpu/o3/issue_queue.cc b/src/cpu/o3/issue_queue.cc
index d8eaaae2cb..c3739031aa 100644
--- a/src/cpu/o3/issue_queue.cc
+++ b/src/cpu/o3/issue_queue.cc
@@ -1750,6 +1750,7 @@ Scheduler::initIQICountSmtScheduler(int numThreads)
         InstsCounter* counter = iq->getInstsCounter();
         assert(counter);
         iq->initIndependentIQICountScheduler(numThreads);
+        iq->selector->setparent(this, iq);
     }
 }
 
diff --git a/src/cpu/o3/lsq.cc b/src/cpu/o3/lsq.cc
index c7d0c60a9d..3d72ae1930 100644
--- a/src/cpu/o3/lsq.cc
+++ b/src/cpu/o3/lsq.cc
@@ -2038,6 +2038,48 @@ LSQ::dumpInsts(ThreadID tid) const
     thread.at(tid).dumpInsts();
 }
 
+void
+LSQ::dumpStoreBufferState(ThreadID tid, InstSeqNum seq_num) const
+{
+    cprintf("Store buffer state for tid %i:\n", tid);
+    cprintf("  flushing=%d flushBeforeSeq=%llu\n",
+            _storeBufferFlushing[tid],
+            static_cast<unsigned long long>(_storeBufferFlushBeforeSeq[tid]));
+    cprintf("  storesToWB=%d hasStoresToWBBefore=%d\n",
+            thread.at(tid).numStoresToSbuffer(),
+            thread.at(tid).hasStoresToWBBefore(seq_num));
+    cprintf("  sbufferSize(tid)=%llu sbufferSizeBeforeSeq=%llu\n",
+            static_cast<unsigned long long>(storeBuffer.size(tid)),
+            static_cast<unsigned long long>(storeBuffer.size(tid, seq_num)));
+}
+
+void
+LSQ::dumpStoreBuffer(ThreadID tid) const
+{
+    cprintf("Store buffer entries for tid %i:\n", tid);
+    const auto &entries = storeBuffer.entries();
+    for (size_t index = 0; index < entries.size(); ++index) {
+        if (!storeBuffer.valid(index)) {
+            continue;
+        }
+
+        auto *entry = entries[index];
+        if (!entry || entry->tid != tid) {
+            continue;
+        }
+
+        cprintf("  idx:%d seq:%llu paddr:%#lx vaddr:%#lx sending=%d vice=%d generation=%llu request=%p\n",
+                entry->index,
+                static_cast<unsigned long long>(entry->seqNum),
+                entry->blockPaddr,
+                entry->blockVaddr,
+                entry->sending,
+                entry->vice != nullptr,
+                static_cast<unsigned long long>(entry->generation),
+                entry->request);
+    }
+}
+
 bool
 LSQ::isMisaligned(const DynInstPtr& inst, Addr vaddr, int size)
 {
diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh
index 257cf48354..66038bf154 100644
--- a/src/cpu/o3/lsq.hh
+++ b/src/cpu/o3/lsq.hh
@@ -208,6 +208,8 @@ class LSQ
         uint64_t size(ThreadID tid) const;
         uint64_t size(ThreadID tid, InstSeqNum seq_num) const;
         uint64_t unsentSize() const;
+        const std::vector<StoreBufferEntry *> &entries() const { return data_vec; }
+        bool valid(size_t index) const { return data_vld.at(index); }
         StoreBufferEntry *getEmpty();
         void insert(StoreBufferEntry *entry);
         StoreBufferEntry *get(ThreadID tid, uint64_t addr) const;
@@ -1033,6 +1035,10 @@ class LSQ
     void dumpInsts() const;
     /** Debugging function to print out instructions from a specific thread. */
     void dumpInsts(ThreadID tid) const;
+    /** Debugging function to print store-buffer flush state for a thread. */
+    void dumpStoreBufferState(ThreadID tid, InstSeqNum seq_num) const;
+    /** Debugging function to print store-buffer entries for a thread. */
+    void dumpStoreBuffer(ThreadID tid) const;
 
     bool isMisaligned(const DynInstPtr& inst, Addr vaddr, int size);
 
diff --git a/src/cpu/o3/lsq_unit.cc b/src/cpu/o3/lsq_unit.cc
index 76978531c1..389931080d 100644
--- a/src/cpu/o3/lsq_unit.cc
+++ b/src/cpu/o3/lsq_unit.cc
@@ -1571,10 +1571,20 @@ LSQUnit::executeLoadPipeSx()
                         fault = loadDoTranslate(inst);
                         break;
                     case 1:
-                        iewStage->getScheduler()->specWakeUpFromLoadPipe(inst);
-                        // Loads will mark themselves as executed, and their writeback
-                        // event adds the instruction to the queue to commit
                         fault = loadDoSendRequest(inst);
+                        if (fault == NoFault &&
+                            !inst->replayOrSkipFollowingPipe() &&
+                            inst->readPredicate() &&
+                            inst->readMemAccPredicate() &&
+                            inst->savedRequest &&
+                            inst->savedRequest->isTranslationComplete() &&
+                            inst->savedRequest->isMemAccessRequired()) {
+                            iewStage->getScheduler()->specWakeUpFromLoadPipe(
+                                inst);
+                        }
+                        // Loads will mark themselves as executed, and their
+                        // writeback event adds the instruction to the queue
+                        // to commit.
                         iewStage->SquashCheckAfterExe(inst);
                         break;
                     case 2:
diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh
index 633c952d8f..2bfa53db38 100644
--- a/src/cpu/o3/lsq_unit.hh
+++ b/src/cpu/o3/lsq_unit.hh
@@ -452,7 +452,7 @@ class LSQUnit
     bool hasStoresToWBBefore(InstSeqNum seq_num) const;
 
     /** Returns the number of stores to writeback. */
-    int numStoresToSbuffer() { return storesToWB; }
+    int numStoresToSbuffer() const { return storesToWB; }
 
     /** Update loadCompletedIdx and storeCompletedIdx */
     void updateCompletedIdx();
diff --git a/src/cpu/o3/smt_sched.hh b/src/cpu/o3/smt_sched.hh
index e6b00ab4d8..74198c44fd 100644
--- a/src/cpu/o3/smt_sched.hh
+++ b/src/cpu/o3/smt_sched.hh
@@ -137,12 +137,12 @@ public:
 
     ThreadID getThread() override {
         ThreadID selectedTid = 0;
-        uint64_t maxCount = counter->getCounter(0);
-        
+        uint64_t minCount = counter->getCounter(0);
+
         for (ThreadID tid = 1; tid < numThreads; ++tid) {
             uint64_t count = counter->getCounter(tid);
-            if (count > maxCount) {
-                maxCount = count;
+            if (count < minCount) {
+                minCount = count;
                 selectedTid = tid;
             }
         }
diff --git a/src/cpu/pred/btb/decoupled_bpred.cc b/src/cpu/pred/btb/decoupled_bpred.cc
index bb87772263..01e9b78ac3 100644
--- a/src/cpu/pred/btb/decoupled_bpred.cc
+++ b/src/cpu/pred/btb/decoupled_bpred.cc
@@ -330,7 +330,7 @@ DecoupledBPUWithBTB::generateFinalPredAndCreateBubbles(ThreadID tid)
         if (ubtb->isEnabled()) {
             ubtb->updateUsingS3Pred(predsOfEachStage[numStages - 1]);
         }
-        if (abtb->isEnabled() && ftq.backId(tid)) {
+        if (abtb->isEnabled() && !ftq.empty(tid)) {
             auto previous_block_startpc = ftq.back(tid).startPC;
             abtb->updateUsingS3Pred(predsOfEachStage[numStages - 1], previous_block_startpc);
         } else if (abtb->isEnabled()) {
@@ -462,6 +462,7 @@ DecoupledBPUWithBTB::handleSquash(ThreadID tid, unsigned target_id,
                 "Ignore squash for tid %u on missing FTQ target %u; "
                 "recovering predictor state from redirect PC %#lx\n",
                 tid, target_id, redirect_pc);
+        ftq.clear(tid);
         clearPreds(tid);
         threads[tid].validprediction = false;
         threads[tid].s0PC = redirect_pc;
diff --git a/src/cpu/pred/btb/ftq.cc b/src/cpu/pred/btb/ftq.cc
index 3642ef7162..b8abfe7996 100644
--- a/src/cpu/pred/btb/ftq.cc
+++ b/src/cpu/pred/btb/ftq.cc
@@ -1,3 +1,5 @@
+#include <algorithm>
+
 #include "ftq.hh"
 
 namespace gem5
@@ -53,6 +55,19 @@ FetchTargetQueue::squashAfter(FetchTargetId squashId, ThreadID tid)
     queue[tid].fetchptr = squashId + 1;
 }
 
+void
+FetchTargetQueue::clear(ThreadID tid)
+{
+    const FetchTargetId nextTargetId = std::max(
+        queue[tid].fetchptr,
+        queue[tid].baseTargetId +
+            static_cast<FetchTargetId>(queue[tid].cap.size()));
+
+    queue[tid].cap.clear();
+    queue[tid].baseTargetId = nextTargetId;
+    queue[tid].fetchptr = nextTargetId;
+}
+
 
 }
 }
diff --git a/src/cpu/pred/btb/ftq.hh b/src/cpu/pred/btb/ftq.hh
index c43d071447..c762cd0b83 100644
--- a/src/cpu/pred/btb/ftq.hh
+++ b/src/cpu/pred/btb/ftq.hh
@@ -80,6 +80,7 @@ public:
     void finishTarget(ThreadID tid);
     void commitTarget(ThreadID tid);
     void squashAfter(FetchTargetId targetId, ThreadID tid);
+    void clear(ThreadID tid);
 };
 
 }

From b11b00ed446a86b72aac2c423573e0701e93a864 Mon Sep 17 00:00:00 2001
From: tastynoob <934348725@qq.com>
Date: Thu, 2 Apr 2026 20:45:22 +0800
Subject: [PATCH 11/38] cpu-o3: fix lsq request lifetime and store completion

Change-Id: Ieb232b296c7c99ea216c14c23f135e6e081870a6
---
 src/cpu/o3/iew.cc      |   4 +-
 src/cpu/o3/lsq.cc      |  67 ++++++++++++++++++++++
 src/cpu/o3/lsq.hh      |  20 +++++++
 src/cpu/o3/lsq_unit.cc | 124 ++++++++++++++++++++++++++++++-----------
 src/cpu/o3/lsq_unit.hh |  10 ++++
 5 files changed, 190 insertions(+), 35 deletions(-)

diff --git a/src/cpu/o3/iew.cc b/src/cpu/o3/iew.cc
index a98f92f1ff..04c2b893ca 100644
--- a/src/cpu/o3/iew.cc
+++ b/src/cpu/o3/iew.cc
@@ -1714,8 +1714,8 @@ IEW::writebackInsts()
         DynInstPtr inst = toCommit->insts[inst_num];
         ThreadID tid = inst->threadNumber;
 
-        if (inst->savedRequest && inst->isLoad()) {
-            inst->pf_source = inst->savedRequest->mainReq()->getPFSource();
+        if (inst->isLoad()) {
+            inst->pf_source = ldstQueue.getLoadPFSource(inst);
         }
 
         DPRINTF(IEW, "Sending instructions to commit, [sn:%lli] PC %s.\n",
diff --git a/src/cpu/o3/lsq.cc b/src/cpu/o3/lsq.cc
index 3d72ae1930..4fe227f6ac 100644
--- a/src/cpu/o3/lsq.cc
+++ b/src/cpu/o3/lsq.cc
@@ -1695,6 +1695,29 @@ LSQ::getLSQHeadInst(ThreadID tid, bool isLoad)
     }
 }
 
+int
+LSQ::getLoadPFSource(const DynInstPtr &inst) const
+{
+    if (!inst || !inst->isLoad() || inst->lqIdx < 0) {
+        return -1;
+    }
+
+    const auto &entry = thread[inst->threadNumber].loadQueue[inst->lqIdx];
+    auto *request = entry.request();
+    if (!request) {
+        return -1;
+    }
+
+    // A load can retire through a split request or after replay/discard has
+    // detached some request state. Prefetch source is best-effort metadata, so
+    // only query a live sub-request when one still exists.
+    if (request->numReqs() == 0) {
+        return -1;
+    }
+
+    return request->req()->getPFSource();
+}
+
 bool
 LSQ::isStalled()
 {
@@ -2371,6 +2394,12 @@ LSQ::SplitDataRequest::mainReq()
     return _mainReq;
 }
 
+RequestPtr
+LSQ::SplitDataRequest::mainReq() const
+{
+    return _mainReq;
+}
+
 void
 LSQ::SplitDataRequest::initiateTranslation()
 {
@@ -2579,9 +2608,47 @@ LSQ::LSQRequest::forward()
     }
 }
 
+void
+LSQ::LSQRequest::detachLSQEntry()
+{
+    if (!_inst) {
+        return;
+    }
+
+    if (isLoad() && _inst->lqIdx >= 0 &&
+        _port.loadQueue[_inst->lqIdx].request() == this) {
+        DPRINTF(LSQ, "inst [sn:%llu] Detach LSQRequest from LQ entry\n",
+                _inst->seqNum);
+        _port.loadQueue[_inst->lqIdx].setRequest(nullptr);
+    } else if ((isAtomic() || _inst->isStore()) && _inst->sqIdx >= 0 &&
+               _port.storeQueue[_inst->sqIdx].request() == this) {
+        DPRINTF(LSQ, "inst [sn:%llu] Detach LSQRequest from SQ entry\n",
+                _inst->seqNum);
+        _port.storeQueue[_inst->sqIdx].setRequest(nullptr);
+    }
+}
+
+void
+LSQ::LSQRequest::detachInflightLoad()
+{
+    if (!isLoad()) {
+        return;
+    }
+
+    auto &inflight = _port.inflightLoads;
+    auto it = std::find(inflight.begin(), inflight.end(), this);
+    if (it != inflight.end()) {
+        DPRINTF(LSQ, "inst [sn:%llu] Detach LSQRequest from inflightLoads\n",
+                _inst ? _inst->seqNum : 0);
+        inflight.erase(it);
+    }
+}
+
 LSQ::LSQRequest::~LSQRequest()
 {
     assert(!isAnyOutstandingRequest());
+    detachLSQEntry();
+    detachInflightLoad();
     if (_inst && _inst->savedRequest == this) {
         DPRINTF(LSQ, "inst [sn:%llu] Deleting LSQRequest, savedRequest\n", _inst->seqNum);
          _inst->savedRequest = nullptr;
diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh
index 66038bf154..159eaa0ab5 100644
--- a/src/cpu/o3/lsq.hh
+++ b/src/cpu/o3/lsq.hh
@@ -389,6 +389,14 @@ class LSQ
         /** Install the request in the LQ/SQ. */
         void install();
 
+        /** If the request is still installed in the current LQ/SQ slot,
+         * detach that slot so later scans do not observe a discarded or
+         * deleted request through the queue entry. */
+        void detachLSQEntry();
+
+        /** Remove the request from the in-flight load tracker if present. */
+        void detachInflightLoad();
+
         bool squashed() const override;
 
 
@@ -516,6 +524,13 @@ class LSQ
             return req();
         }
 
+        virtual RequestPtr
+        mainReq() const
+        {
+            assert (_reqs.size() == 1);
+            return req();
+        }
+
         /**
          * Test if there is any in-flight translation or mem access request
          */
@@ -655,6 +670,8 @@ class LSQ
         void
         discard()
         {
+            detachLSQEntry();
+            detachInflightLoad();
             release(Flag::Discarded);
         }
 
@@ -786,6 +803,7 @@ class LSQ
         virtual bool isCacheBlockHit(Addr blockAddr, Addr cacheBlockMask);
 
         virtual RequestPtr mainReq();
+        virtual RequestPtr mainReq() const;
         virtual PacketPtr mainPacket();
         virtual std::string name() const { return "SplitDataRequest"; }
     };
@@ -979,6 +997,8 @@ class LSQ
     /** Returns whether the head instruction of sq has completed*/
     const DynInstPtr& getLSQHeadInst(ThreadID tid, bool isLoad);
 
+    int getLoadPFSource(const DynInstPtr &inst) const;
+
     /**
      * Returns if the LSQ is stalled due to a memory operation that must be
      * replayed.
diff --git a/src/cpu/o3/lsq_unit.cc b/src/cpu/o3/lsq_unit.cc
index 389931080d..467fd73160 100644
--- a/src/cpu/o3/lsq_unit.cc
+++ b/src/cpu/o3/lsq_unit.cc
@@ -182,10 +182,10 @@ LSQUnit::SQEntry::setStatus(SplitStoreStatus status)
 LSQUnit::WritebackRegEvent::WritebackRegEvent(const DynInstPtr &_inst,
         PacketPtr _pkt, LSQUnit *lsq_ptr)
     : Event(Default_Pri, AutoDelete),
-      inst(_inst), pkt(_pkt), lsqPtr(lsq_ptr)
+      inst(_inst), request(_inst->savedRequest), pkt(_pkt), lsqPtr(lsq_ptr)
 {
-    assert(_inst->savedRequest);
-    _inst->savedRequest->writebackScheduled();
+    assert(request);
+    request->writebackScheduled();
 }
 
 void
@@ -195,8 +195,8 @@ LSQUnit::WritebackRegEvent::process()
 
     lsqPtr->writebackReg(inst, pkt);
 
-    assert(inst->savedRequest);
-    inst->savedRequest->writebackDone();
+    assert(request);
+    request->writebackDone();
     delete pkt;
 }
 
@@ -833,6 +833,20 @@ LSQUnit::insertStore(const DynInstPtr& store_inst)
     storeQueue.back().set(store_inst);
 }
 
+LSQUnit::LSQRequest *
+LSQUnit::currentLoadRequest(const DynInstPtr &inst)
+{
+    return (inst && inst->lqIdx >= 0) ? loadQueue[inst->lqIdx].request()
+                                      : nullptr;
+}
+
+LSQUnit::LSQRequest *
+LSQUnit::currentStoreRequest(const DynInstPtr &inst)
+{
+    return (inst && inst->sqIdx >= 0) ? storeQueue[inst->sqIdx].request()
+                                      : nullptr;
+}
+
 bool
 LSQUnit::splitStoreAddrSquashed(const DynInstPtr &inst)
 {
@@ -866,9 +880,10 @@ LSQUnit::pipeLineNukeCheck(const DynInstPtr &load_inst, const DynInstPtr &store_
     Addr store_eff_addr1 = store_inst->physEffAddr >> depCheckShift;
     Addr store_eff_addr2 = (store_inst->physEffAddr + store_inst->effSize - 1) >> depCheckShift;
 
-    LSQRequest* store_req = store_inst->savedRequest;
+    LSQRequest* store_req = currentStoreRequest(store_inst);
+    LSQRequest* load_req = currentLoadRequest(load_inst);
     // Dont perform pipe line nuke check for split load
-    bool load_is_splited = load_inst->savedRequest && load_inst->savedRequest->isSplit();
+    bool load_is_splited = load_req && load_req->isSplit();
     bool load_need_check = !load_is_splited && load_inst->effAddrValid() &&
                             (load_inst->lqIt >= store_inst->lqIt);
     bool store_need_check = store_req && store_req->isTranslationComplete() &&
@@ -948,7 +963,7 @@ LSQUnit::checkSnoop(PacketPtr pkt)
 
     DynInstPtr ld_inst = iter->instruction();
     assert(ld_inst);
-    LSQRequest *request = ld_inst->savedRequest;
+    LSQRequest *request = iter->request();
 
     // Check that this snoop didn't just invalidate our lock flag
     if (ld_inst->effAddrValid() && request &&
@@ -962,7 +977,7 @@ LSQUnit::checkSnoop(PacketPtr pkt)
     while (++iter != loadQueue.end()) {
         ld_inst = iter->instruction();
         assert(ld_inst);
-        request = ld_inst->savedRequest;// iter->request();
+        request = iter->request();
         if (!ld_inst->effAddrValid() || ld_inst->strictlyOrdered())
             continue;
 
@@ -1066,7 +1081,10 @@ LSQUnit::checkLocalStoreVisible(Addr store_paddr,
             continue;
         }
 
-        LSQRequest *request = ld_inst->savedRequest;
+        LSQRequest *request = it->request();
+        // Replay/cancel paths can leave the dyninst carrying a stale
+        // savedRequest pointer after the active LQ request has been replaced
+        // or dropped. Only the current queue entry request is safe here.
         if (!request || !request->isCacheBlockHit(block_addr, cacheBlockMask)) {
             continue;
         }
@@ -1107,8 +1125,27 @@ Fault
 LSQUnit::checkViolations(typename LoadQueue::iterator& loadIt,
         const DynInstPtr& inst)
 {
+    LSQRequest *request = nullptr;
+    if (inst->isLoad()) {
+        if (inst->lqIdx >= 0) {
+            request = loadQueue[inst->lqIdx].request();
+        }
+    } else if (inst->isStore() || inst->isAtomic()) {
+        if (inst->sqIdx >= 0) {
+            request = storeQueue[inst->sqIdx].request();
+        }
+    }
+
+    // Replay/cancel paths can drop the active LSQ request before the
+    // instruction is retried. In that window the dyninst may still carry a
+    // stale savedRequest pointer, so only the current LSQ entry request is
+    // safe to inspect here.
+    if (!request) {
+        return NoFault;
+    }
+
     auto saved_it = loadIt;
-    for (auto req0 : inst->savedRequest->_reqs) {
+    for (auto req0 : request->_reqs) {
         Addr inst_eff_addr1 = req0->getPaddr() >> depCheckShift;
         Addr inst_eff_addr2 = (req0->getPaddr() + req0->getSize() - 1) >> depCheckShift;
 
@@ -1222,6 +1259,7 @@ LSQUnit::loadSetReplay(DynInstPtr inst, LSQRequest* request, bool dropReqNow)
     // Reset DTB translation state
     inst->translationStarted(false);
     inst->translationCompleted(false);
+    inst->savedRequest = nullptr;
     // clear request in loadQueue
     loadQueue[inst->lqIdx].setRequest(nullptr);
     if (dropReqNow) {
@@ -1291,8 +1329,9 @@ LSQUnit::loadDoTranslate(const DynInstPtr &inst)
         DPRINTF(LoadPipeline, "Load [sn:%llu] setTLBMissReplay\n", inst->seqNum);
     }
 
-    if (inst->savedRequest && inst->savedRequest->isTranslationComplete()) {
-        inst->setNormalLd(inst->savedRequest->isNormalLd());
+    if (auto *request = currentLoadRequest(inst);
+        request && request->isTranslationComplete()) {
+        inst->setNormalLd(request->isNormalLd());
 
         cpu->perfCCT->updateInstMeta(inst->seqNum, InstDetail::VAddress, inst->effAddr);
         cpu->perfCCT->updateInstMeta(inst->seqNum, InstDetail::PAddress, inst->physEffAddr);
@@ -1307,7 +1346,7 @@ LSQUnit::loadDoSendRequest(const DynInstPtr &inst)
     DPRINTF(LoadPipeline, "loadDoSendRequest: load [sn:%lli]\n", inst->seqNum);
     assert(!inst->isSquashed());
     Fault load_fault = inst->getFault();
-    LSQRequest* request = inst->savedRequest;
+    LSQRequest* request = currentLoadRequest(inst);
 
     if (inst->effAddrValid()) {
         for (int i = 0; i < storePipeSx[1]->size; i++) {
@@ -1353,9 +1392,9 @@ LSQUnit::loadDoSendRequest(const DynInstPtr &inst)
     }
 
     if (load_fault != NoFault && inst->translationCompleted() &&
-            inst->savedRequest->isPartialFault()
-            && !inst->savedRequest->isComplete()) {
-        assert(inst->savedRequest->isSplit());
+            request && request->isPartialFault()
+            && !request->isComplete()) {
+        assert(request->isSplit());
         // If we have a partial fault where the mem access is not complete yet
         // then the cache must have been blocked. This load will be re-executed
         // when the cache gets unblocked. We will handle the fault when the
@@ -1398,7 +1437,7 @@ LSQUnit::loadDoRecvData(const DynInstPtr &inst)
     DPRINTF(LoadPipeline, "loadDoRecvData: load [sn:%lli]\n", inst->seqNum);
 
     assert(!inst->isSquashed());
-    LSQRequest* request = inst->savedRequest;
+    LSQRequest* request = currentLoadRequest(inst);
     bool earlyWakeupCacheMissReplay = false;
 
     if (inst->wakeUpEarly()) {
@@ -1513,7 +1552,7 @@ LSQUnit::loadDoRecvData(const DynInstPtr &inst)
 
     // No nuke happens, prepare the inst data
     // assert(request->isNormalLd() ? !request->isAnyOutstandingRequest() : true);
-    request = inst->savedRequest;
+    request = currentLoadRequest(inst);
     if (inst->fullForward()) {
         DPRINTF(LoadPipeline, "Load [sn:%llu] fullForward\n", inst->seqNum);
         assert(request);
@@ -1570,15 +1609,16 @@ LSQUnit::executeLoadPipeSx()
                     case 0:
                         fault = loadDoTranslate(inst);
                         break;
-                    case 1:
+                    case 1: {
                         fault = loadDoSendRequest(inst);
+                        auto *request = currentLoadRequest(inst);
                         if (fault == NoFault &&
                             !inst->replayOrSkipFollowingPipe() &&
                             inst->readPredicate() &&
                             inst->readMemAccPredicate() &&
-                            inst->savedRequest &&
-                            inst->savedRequest->isTranslationComplete() &&
-                            inst->savedRequest->isMemAccessRequired()) {
+                            request &&
+                            request->isTranslationComplete() &&
+                            request->isMemAccessRequired()) {
                             iewStage->getScheduler()->specWakeUpFromLoadPipe(
                                 inst);
                         }
@@ -1587,6 +1627,7 @@ LSQUnit::executeLoadPipeSx()
                         // to commit.
                         iewStage->SquashCheckAfterExe(inst);
                         break;
+                    }
                     case 2:
                         fault = loadDoRecvData(inst);
 
@@ -1653,10 +1694,12 @@ LSQUnit::executeLoadPipeSx()
                 else if (inst->needCacheMissReplay()) iewStage->cacheMissLdReplay(inst);
                 else if (inst->needMdpAddrReplay()) iewStage->mdpAddrReplayPipeDone(inst);
                 else if (inst->needNukeReplay()) {
-                    if (inst->savedRequest && inst->cacheHit()) {
-                        loadSetReplay(inst, inst->savedRequest, true);
-                    } else if (inst->savedRequest && inst->hasPendingCacheReq()) {
-                        loadSetReplay(inst, inst->savedRequest, false);
+                    if (auto *request = currentLoadRequest(inst); request) {
+                        if (inst->cacheHit()) {
+                            loadSetReplay(inst, request, true);
+                        } else if (inst->hasPendingCacheReq()) {
+                            loadSetReplay(inst, request, false);
+                        }
                     }
                     inst->issueQue->retryMem(inst);
                 }
@@ -1686,7 +1729,10 @@ LSQUnit::executeLoadPipeSx()
             }
 
             if (i == loadPipeStages - 1 && !inst->needReplay()) {
-                if (inst->isNormalLd() || !inst->readMemAccPredicate()) iewStage->readyToFinish(inst);
+                if (inst->isExecuted() &&
+                    (inst->isNormalLd() || !inst->readMemAccPredicate())) {
+                    iewStage->readyToFinish(inst);
+                }
                 iewStage->activityThisCycle();
                 inst->endPipelining();
                 DPRINTF(LoadPipeline, "Load [sn:%llu] ready to finish\n",
@@ -2734,7 +2780,8 @@ LSQUnit::writebackReg(const DynInstPtr &inst, PacketPtr pkt)
 
             if (!htm_fault) {
                 assert(dynamic_cast<ReExec*>(inst->fault.get()) != nullptr ||
-                       inst->savedRequest->isPartialFault());
+                       (currentLoadRequest(inst) &&
+                        currentLoadRequest(inst)->isPartialFault()));
 
             } else if (!pkt->htmTransactionFailedInCache()) {
                 // Situation in which the instruction has a hardware
@@ -2755,8 +2802,12 @@ LSQUnit::writebackReg(const DynInstPtr &inst, PacketPtr pkt)
         }
     }
 
-    if (!inst->savedRequest->isNormalLd()) {
-        // Need to insert instruction into queue to commit
+    const bool finish_after_writeback =
+        !inst->isNormalLd() || !inst->inPipe();
+    if (finish_after_writeback) {
+        // Normal loads usually wait for the last pipe stage to enqueue commit.
+        // If the response arrives after the load has already drained from the
+        // pipe, writeback must finish the instruction here.
         iewStage->readyToFinish(inst);
         iewStage->activityThisCycle();
     }
@@ -2780,13 +2831,19 @@ LSQUnit::completeStore(typename StoreQueue::iterator store_idx, bool from_sbuffe
      * store queue. */
     DynInstPtr store_inst = store_idx->instruction();
     auto request = store_idx->request();
+    // Predicated-off or zero-sized stores can legitimately reach completion
+    // without ever materializing a backing memory request.
+    const bool has_main_request =
+        request && request->numReqs() > 0;
+    const bool has_paddr =
+        has_main_request && request->mainReq()->hasPaddr();
     DPRINTF(LSQUnit, "Completing store [sn:%lli], idx:%i, store head "
             "idx:%i\n",
             store_inst->seqNum, store_idx.idx() - 1, storeQueue.head() - 1);
 
     if (!from_sbuffer &&
         (!store_inst->isStoreConditional() || store_inst->lockedWriteSuccess()) &&
-        request->mainReq()->hasPaddr()) {
+        has_paddr) {
         const Addr block_paddr = request->mainReq()->getPaddr() & cacheBlockMask;
         auto generation = request->_storeBufferGeneration;
         const bool replay_executed_loads =
@@ -2806,7 +2863,7 @@ LSQUnit::completeStore(typename StoreQueue::iterator store_idx, bool from_sbuffe
 
     if (from_sbuffer &&
         (!store_inst->isStoreConditional() || store_inst->lockedWriteSuccess()) &&
-        request->mainReq()->hasPaddr()) {
+        has_paddr) {
         auto generation = request->_storeBufferGeneration;
         if (generation == 0) {
             generation = lsq->bumpStoreBufferBlockVersion(
@@ -2818,6 +2875,7 @@ LSQUnit::completeStore(typename StoreQueue::iterator store_idx, bool from_sbuffe
     if (!from_sbuffer &&
         (!store_inst->isStoreConditional() || store_inst->lockedWriteSuccess()) &&
         cpu->goldenMemManager() &&
+        has_paddr &&
         cpu->goldenMemManager()->inPmem(request->mainReq()->getPaddr())) {
         Addr paddr = request->mainReq()->getPaddr();
         if (!store_inst->isAtomic()) {
diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh
index 2bfa53db38..837cc65506 100644
--- a/src/cpu/o3/lsq_unit.hh
+++ b/src/cpu/o3/lsq_unit.hh
@@ -154,6 +154,7 @@ class LSQUnit
         }
 
         LSQRequest* request() { return _request; }
+        const LSQRequest* request() const { return _request; }
         void setRequest(LSQRequest* r) { _request = r; }
         bool hasRequest() { return _request != nullptr; }
         /** Member accessors. */
@@ -390,6 +391,12 @@ class LSQUnit
     /** Check if there exists raw nuke between load and store. */
     bool pipeLineNukeCheck(const DynInstPtr &load_inst, const DynInstPtr &store_inst);
 
+    /** Returns the current request attached to an active LQ entry. */
+    LSQRequest *currentLoadRequest(const DynInstPtr &inst);
+
+    /** Returns the current request attached to an active SQ entry. */
+    LSQRequest *currentStoreRequest(const DynInstPtr &inst);
+
     /** Returns the number of free LQ entries. */
     unsigned numFreeLoadEntries();
 
@@ -583,6 +590,9 @@ class LSQUnit
         /** Instruction whose results are being written back. */
         DynInstPtr inst;
 
+        /** Request that owns the delayed writeback lifecycle. */
+        LSQRequest *request;
+
         /** The packet that would have been sent to memory. */
         PacketPtr pkt;
 

From 40bf365d1d262123eb1328740099996f3ff4ebd2 Mon Sep 17 00:00:00 2001
From: tastynoob <934348725@qq.com>
Date: Thu, 2 Apr 2026 20:45:46 +0800
Subject: [PATCH 12/38] arch-riscv: fix agnostic vector load fill

Change-Id: I11b460b6a6554998d052a020a02d84eb2b0664ad
---
 .../riscv/isa/vector/base/vector_mem.temp.isa | 28 +++++++++++++++++++
 .../isa/vector/simple/vector_mem.temp.isa     | 28 +++++++++++++++++++
 2 files changed, 56 insertions(+)

diff --git a/src/arch/riscv/isa/vector/base/vector_mem.temp.isa b/src/arch/riscv/isa/vector/base/vector_mem.temp.isa
index e97eef0940..2448a9ad95 100644
--- a/src/arch/riscv/isa/vector/base/vector_mem.temp.isa
+++ b/src/arch/riscv/isa/vector/base/vector_mem.temp.isa
@@ -1,5 +1,24 @@
 output header {{
 
+#define FILL_AGNOSTIC_ELEM(Vd_bytes, elem_idx, eewb_) \
+    std::fill_n((Vd_bytes) + (elem_idx) * (eewb_), (eewb_), 0xff)  // all-ones
+// APPLY_VLOAD_AGNOSTIC: 0xff-fill tail (vta) and masked-off (vma) elements.
+#define APPLY_VLOAD_AGNOSTIC(Vd_bytes, elem_num_per_vreg_, eewb_)            \
+    do { /* uses in-scope vmi, rVl, v0, machInst, this->vm, elem_mask() */   \
+        for (size_t _i = 0; _i < vmi.re - vmi.rs; ++_i) {                    \
+            const uint32_t _vdElemIdx =                                      \
+                (vmi.rs % (elem_num_per_vreg_)) + _i;                        \
+            const size_t _ei = _i + vmi.rs;                                  \
+            const bool _is_tail = _ei >= rVl;                                \
+            const bool _is_masked = !this->vm && !_is_tail &&                \
+                !elem_mask(v0, _ei);                                         \
+            if ((_is_tail && machInst.vtype8.vta) || /* tail-agnostic */     \
+                (_is_masked && machInst.vtype8.vma)) { /* mask-agnostic */   \
+                FILL_AGNOSTIC_ELEM((Vd_bytes), _vdElemIdx, (eewb_));         \
+            }                                                                \
+        }                                                                    \
+    } while (0)
+
 inline uint32_t
 calc_memsize(uint32_t rs, uint32_t re, uint32_t sew, uint32_t rVl) {
     uint32_t vend = std::min(rVl, re);
@@ -147,6 +166,7 @@ Fault
 {
     %(op_decl)s;
     %(op_rd)s;
+    auto VdBytes = tmp_d0.as<uint8_t>();
 
     Addr EA;
     // EA = Rs1 + vmi.offset;
@@ -172,6 +192,8 @@ Fault
         %(memacc_code)s;
     }
 
+    APPLY_VLOAD_AGNOSTIC(VdBytes, elem_num_per_vreg, eew / 8);
+
     %(op_wb)s;
     return fault;
 }
@@ -261,6 +283,7 @@ Fault
 
     %(op_decl)s;
     %(op_rd)s;
+    auto VdBytes = tmp_d0.as<uint8_t>();
 
 #if %(is_vecWhole)s
     // VM_REQUIRED();
@@ -299,6 +322,11 @@ Fault
         }
     }
 
+#if %(is_vecWhole)s
+#else
+    APPLY_VLOAD_AGNOSTIC(VdBytes, elem_num_per_vreg, eewb);
+#endif
+
     %(vfof_get_code)s;
     %(op_wb)s;
     return NoFault;
diff --git a/src/arch/riscv/isa/vector/simple/vector_mem.temp.isa b/src/arch/riscv/isa/vector/simple/vector_mem.temp.isa
index a8e5b71f99..4b64f5dac0 100644
--- a/src/arch/riscv/isa/vector/simple/vector_mem.temp.isa
+++ b/src/arch/riscv/isa/vector/simple/vector_mem.temp.isa
@@ -1,5 +1,24 @@
 output header {{
 
+#define FILL_AGNOSTIC_ELEM(Vd_bytes, elem_idx, eewb_) \
+    std::fill_n((Vd_bytes) + (elem_idx) * (eewb_), (eewb_), 0xff)  // all-ones
+// APPLY_VLOAD_AGNOSTIC: 0xff-fill tail (vta) and masked-off (vma) elements.
+#define APPLY_VLOAD_AGNOSTIC(Vd_bytes, elem_num_per_vreg_, eewb_)            \
+    do { /* uses in-scope vmi, rVl, v0, machInst, this->vm, elem_mask() */   \
+        for (size_t _i = 0; _i < vmi.re - vmi.rs; ++_i) {                    \
+            const uint32_t _vdElemIdx =                                      \
+                (vmi.rs % (elem_num_per_vreg_)) + _i;                        \
+            const size_t _ei = _i + vmi.rs;                                  \
+            const bool _is_tail = _ei >= rVl;                                \
+            const bool _is_masked = !this->vm && !_is_tail &&                \
+                !elem_mask(v0, _ei);                                         \
+            if ((_is_tail && machInst.vtype8.vta) || /* tail-agnostic */     \
+                (_is_masked && machInst.vtype8.vma)) { /* mask-agnostic */   \
+                FILL_AGNOSTIC_ELEM((Vd_bytes), _vdElemIdx, (eewb_));         \
+            }                                                                \
+        }                                                                    \
+    } while (0)
+
 inline uint32_t
 calc_memsize(uint32_t rs, uint32_t re, uint32_t sew, uint32_t rVl) {
     uint32_t vend = std::min(rVl, re);
@@ -147,6 +166,7 @@ Fault
 {
     %(op_decl)s;
     %(op_rd)s;
+    auto VdBytes = tmp_d0.as<uint8_t>();
 
     Addr EA;
     // EA = Rs1 + vmi.offset;
@@ -172,6 +192,8 @@ Fault
         %(memacc_code)s;
     }
 
+    APPLY_VLOAD_AGNOSTIC(VdBytes, elem_num_per_vreg, eew / 8);
+
     %(op_wb)s;
     return fault;
 }
@@ -261,6 +283,7 @@ Fault
 
     %(op_decl)s;
     %(op_rd)s;
+    auto VdBytes = tmp_d0.as<uint8_t>();
 
 #if %(is_vecWhole)s
     // VM_REQUIRED();
@@ -299,6 +322,11 @@ Fault
         }
     }
 
+#if %(is_vecWhole)s
+#else
+    APPLY_VLOAD_AGNOSTIC(VdBytes, elem_num_per_vreg, eewb);
+#endif
+
     %(vfof_get_code)s;
     %(op_wb)s;
     return NoFault;

From a125904b9f30ee07218925db89d72eacea2bca46 Mon Sep 17 00:00:00 2001
From: tastynoob <934348725@qq.com>
Date: Wed, 8 Apr 2026 20:06:40 +0800
Subject: [PATCH 13/38] cpu: add asid hash to decoupled btb

Change-Id: Ice8e66d841a40e8c8420bd4756237eb9399d1642
---
 src/cpu/pred/btb/abtb.cc            | 61 ++++++++++++++++++-----------
 src/cpu/pred/btb/abtb.hh            | 14 ++++---
 src/cpu/pred/btb/btb_ittage.cc      | 33 +++++++++-------
 src/cpu/pred/btb/btb_ittage.hh      | 11 +++---
 src/cpu/pred/btb/btb_tage.cc        | 57 +++++++++++++++------------
 src/cpu/pred/btb/btb_tage.hh        | 15 ++++---
 src/cpu/pred/btb/btb_ubtb.cc        | 43 ++++++++++++--------
 src/cpu/pred/btb/btb_ubtb.hh        | 13 +++---
 src/cpu/pred/btb/common.hh          | 48 +++++++++++++++++++++++
 src/cpu/pred/btb/decoupled_bpred.cc | 17 ++++++++
 src/cpu/pred/btb/decoupled_bpred.hh |  1 +
 src/cpu/pred/btb/mbtb.cc            | 27 +++++++------
 src/cpu/pred/btb/mbtb.hh            | 16 ++++----
 src/cpu/pred/btb/microtage.cc       | 48 ++++++++++++++---------
 src/cpu/pred/btb/microtage.hh       | 13 +++---
 15 files changed, 274 insertions(+), 143 deletions(-)

diff --git a/src/cpu/pred/btb/abtb.cc b/src/cpu/pred/btb/abtb.cc
index aeafc9bb38..8013900e83 100644
--- a/src/cpu/pred/btb/abtb.cc
+++ b/src/cpu/pred/btb/abtb.cc
@@ -166,28 +166,42 @@ AheadBTB::setTrace()
 std::vector<AheadBTB::TickedBTBEntry>
 AheadBTB::processEntries(const std::vector<TickedBTBEntry>& entries, Addr startAddr)
 {
-    int hitNum = entries.size();
-    bool hit = hitNum > 0;
+    // Returns a copy of the looked-up entries, sorted by pc and trimmed to
+    // [startAddr, abtb_end); predHit/predMiss are counted on what is left.
+    auto processed_entries = entries;
     
+    // Sort by instruction order
+    std::sort(processed_entries.begin(), processed_entries.end(),
+              [](const BTBEntry &a, const BTBEntry &b) {
+                  return a.pc < b.pc;
+              });
+
+    // Drop entries outside the window. A single remove_if pass checks both
+    // bounds, so the vector is traversed and compacted only once.
+    const Addr abtb_end = (startAddr + predictWidth) &
+                          ~mask(floorLog2(predictWidth) - 1);
+    processed_entries.erase(
+        std::remove_if(processed_entries.begin(), processed_entries.end(),
+                       [startAddr, abtb_end](const BTBEntry &e) {
+                           return e.pc < startAddr || e.pc >= abtb_end;
+                       }),
+        processed_entries.end());
+
+    // Hit statistics reflect the filtered list, not the raw lookup result.
+    const int hitNum = static_cast<int>(processed_entries.size());
+    const bool hit = hitNum > 0;
+
     // Update prediction statistics
     if (hit) {
         DPRINTF(ABTB, "BTB: lookup hit, dumping hit entry\n");
         btbStats.predHit += hitNum;
-        for (auto &entry: entries) {
+        for (const auto &entry : processed_entries) {
             printTickedBTBEntry(entry);
         }
     } else {
         btbStats.predMiss++;
         DPRINTF(ABTB, "BTB: lookup miss\n");
     }
-
-    auto processed_entries = entries;
-    
-    // Sort by instruction order
-    std::sort(processed_entries.begin(), processed_entries.end(), 
-             [](const BTBEntry &a, const BTBEntry &b) {
-                 return a.pc < b.pc;
-             });
     return processed_entries;
 }
 
@@ -299,12 +313,13 @@ AheadBTB::putPCHistory(Addr startAddr,
                          std::vector<FullBTBPrediction> &stagePreds)
 {
     meta = std::make_shared<BTBMeta>();
+    const uint8_t asidHash = stagePreds.empty() ? 0 : stagePreds.front().asidHash;
     // Lookup all matching entries in BTB
-    auto find_entries = lookup(startAddr);
-    
+    auto find_entries = lookup(startAddr, asidHash);
+
     // Process BTB entries
     auto processed_entries = processEntries(find_entries, startAddr);
-    
+
     // Fill predictions for each pipeline stage
     fillStagePredictions(processed_entries, stagePreds);
     
@@ -343,13 +358,13 @@ AheadBTB::recoverHist(const boost::dynamic_bitset<> &history, const FetchTarget
  * @return Vector of matching BTB entries
  */
 std::vector<AheadBTB::TickedBTBEntry>
-AheadBTB::lookupSingleBlock(Addr block_pc)
+AheadBTB::lookupSingleBlock(Addr block_pc, uint8_t asidHash)
 {
     std::vector<TickedBTBEntry> res;
     if (block_pc & 0x1) {
         return res; // ignore false hit when lowest bit is 1
     }
-    Addr btb_idx = getIndex(block_pc);
+    Addr btb_idx = getIndex(block_pc, asidHash);
     auto btb_set = btb[btb_idx];
     assert(btb_idx < numSets);
     // AheadBTB always uses ahead-pipelined implementation:
@@ -357,7 +372,7 @@ AheadBTB::lookupSingleBlock(Addr block_pc)
     DPRINTF(AheadPipeline, "AheadBTB: pushing set for ahead-pipelined stages, idx %ld\n", btb_idx);
     aheadReadBtbEntries.push(std::make_tuple(block_pc, btb_idx, btb_set));
 
-    Addr tag_curStartpc = getTag(block_pc);// abtb uses current FB pc to get tag
+    Addr tag_curStartpc = getTag(block_pc, asidHash);// abtb uses current FB pc to get tag
     Addr pc = 0;
     Addr idx_prvStartpc = 0;// abtb uses previous FB pc to get index
     BTBSet set;
@@ -392,7 +407,7 @@ AheadBTB::lookupSingleBlock(Addr block_pc)
 }
 
 std::vector<AheadBTB::TickedBTBEntry>
-AheadBTB::lookup(Addr block_pc)
+AheadBTB::lookup(Addr block_pc, uint8_t asidHash)
 {
     std::vector<TickedBTBEntry> res;
     if (block_pc & 0x1) {
@@ -400,7 +415,7 @@ AheadBTB::lookup(Addr block_pc)
     }
 
     // AheadBTB always uses single block lookup
-    res = lookupSingleBlock(block_pc);
+    res = lookupSingleBlock(block_pc, asidHash);
     return res;
 }
 
@@ -594,12 +609,12 @@ AheadBTB::updateUsingS3Pred(FullBTBPrediction &s3Pred, const Addr previousPC)
 
     for (auto &entry : entries_to_update) {
         Addr startPC = s3Pred.bbStart;
-        Addr btb_tag = getTag(startPC);  // use last pc to get tag
+        Addr btb_tag = getTag(startPC, s3Pred.asidHash);  // use last pc to get tag
         if (previousPC == 0) {
             DPRINTF(ABTB, "AheadBTB: no previous PC, skipping update\n");
             return;
         }
-        Addr btb_idx = getIndex(previousPC);  // use last pc to get idx
+        Addr btb_idx = getIndex(previousPC, s3Pred.asidHash);  // use last pc to get idx
         BranchInfo takenbranchinfo;
         takenbranchinfo.pc = s3Pred.getTakenEntry().pc;
         takenbranchinfo.target = s3Pred.getTakenEntry().target;
@@ -670,7 +685,7 @@ AheadBTB::update(const FetchTarget &stream)
     // 4. Update BTB entries - each entry uses its own PC to calculate index and tag
     for (auto &entry : entries_to_update) {
         Addr startPC = stream.getRealStartPC();
-        Addr btb_tag = getTag(startPC);  // use current pc to get tag
+        Addr btb_tag = getTag(startPC, stream.asidHash);  // use current pc to get tag
 
         // AheadBTB always uses ahead-pipelined update logic
         Addr previousPC = getPreviousPC(stream);
@@ -678,7 +693,7 @@ AheadBTB::update(const FetchTarget &stream)
             DPRINTF(ABTB, "AheadBTB: no previous PC, skipping update\n");
             return;
         }
-        Addr btb_idx = getIndex(previousPC);  // use last pc to get idx
+        Addr btb_idx = getIndex(previousPC, stream.asidHash);  // use last pc to get idx
         entry.source = getComponentIdx(); // mark the entry source as AheadBTB
         updateBTBEntry(btb_idx, btb_tag, entry, stream.exeBranchInfo, stream.exeTaken);
     }
diff --git a/src/cpu/pred/btb/abtb.hh b/src/cpu/pred/btb/abtb.hh
index 677f5f7f32..e5e29f7ffd 100644
--- a/src/cpu/pred/btb/abtb.hh
+++ b/src/cpu/pred/btb/abtb.hh
@@ -224,8 +224,9 @@ class AheadBTB : public TimedBaseBTBPredictor
      *  @param inst_PC The branch to look up.
      *  @return Returns the index into the BTB.
      */
-    inline Addr getIndex(Addr instPC) {
-        return (instPC >> idxShiftAmt) & idxMask;
+    inline Addr getIndex(Addr instPC, uint8_t asidHash) {
+        const Addr pcIndex = (instPC >> idxShiftAmt) & idxMask;  // PC-only bits
+        return xorAsidHashIntoIndex(pcIndex, floorLog2(numSets), asidHash);
     }
 
     /** Returns the tag bits of a given address.
@@ -234,8 +235,9 @@ class AheadBTB : public TimedBaseBTBPredictor
      *  @param inst_PC The branch's address.
      *  @return Returns the tag bits.
      */
-    inline Addr getTag(Addr instPC) {
-        return (instPC >> tagShiftAmt) & tagMask;
+    inline Addr getTag(Addr instPC, uint8_t asidHash) {
+        const Addr pcTag = (instPC >> tagShiftAmt) & tagMask;  // PC-only bits
+        return injectAsidHashIntoTag(pcTag, tagBits, asidHash);
     }
 
 
@@ -365,13 +367,13 @@ class AheadBTB : public TimedBaseBTBPredictor
      *  @param inst_PC The address of the block to look up.
      *  @return Returns all hit BTB entries.
      */
-    std::vector<TickedBTBEntry> lookup(Addr block_pc);
+    std::vector<TickedBTBEntry> lookup(Addr block_pc, uint8_t asidHash);
 
     /** Helper function to lookup entries in a single block
      * @param block_pc The aligned PC to lookup
      * @return Vector of matching BTB entries
      */
-    std::vector<TickedBTBEntry> lookupSingleBlock(Addr block_pc);
+    std::vector<TickedBTBEntry> lookupSingleBlock(Addr block_pc, uint8_t asidHash);
 
     /** The BTB structure:
      *  - Organized as numSets sets
diff --git a/src/cpu/pred/btb/btb_ittage.cc b/src/cpu/pred/btb/btb_ittage.cc
index e625650d10..dd5bc40008 100644
--- a/src/cpu/pred/btb/btb_ittage.cc
+++ b/src/cpu/pred/btb/btb_ittage.cc
@@ -102,8 +102,9 @@ BTBITTAGE::tick() {}
 
 void
 BTBITTAGE::lookupHelper(Addr startAddr, const std::vector<BTBEntry> &btbEntries,
-                        IndirectTargets& results, ThreadID tid)
+                        IndirectTargets& results, ThreadID tid, uint8_t asidHash)
 {
+    (void)asidHash;
     DPRINTF(ITTAGE, "lookupHelper startAddr: %#lx\n", startAddr);
     std::vector<TagePrediction> preds;
     for (auto &btb_entry : btbEntries) {
@@ -192,6 +193,7 @@ BTBITTAGE::dryRunCycle(Addr startPC) {
 void
 BTBITTAGE::putPCHistory(Addr stream_start, const bitset &history, std::vector<FullBTBPrediction> &stagePreds) {
     const ThreadID tid = predictorTid(stagePreds);
+    const uint8_t asidHash = stagePreds.empty() ? 0 : stagePreds.front().asidHash;
     const auto &state = historyState(tid);
     if (debugPC == stream_start) {
         debugFlag = true;
@@ -212,9 +214,9 @@ BTBITTAGE::putPCHistory(Addr stream_start, const bitset &history, std::vector<Fu
     // all btb entries should use the same lookup result
     // but each btb entry can use prediction from different tables
     for (int i = 0; i < numPredictors; ++i) {
-        Addr index = getTageIndex(stream_start, i, state.indexFoldedHist[i].get());
+        Addr index = getTageIndex(stream_start, i, state.indexFoldedHist[i].get(), asidHash);
         Addr tag = getTageTag(stream_start, i, state.tagFoldedHist[i].get(),
-                              state.altTagFoldedHist[i].get());
+                              state.altTagFoldedHist[i].get(), asidHash);
         auto &entry = tageTable[i][index];
         lookupEntries.push_back(entry);
         lookupIndices.push_back(index);
@@ -229,7 +231,7 @@ BTBITTAGE::putPCHistory(Addr stream_start, const bitset &history, std::vector<Fu
         auto &stage_pred = stagePreds[s];
         stage_pred.indirectTargets.clear();
         lookupHelper(stream_start, stage_pred.btbEntries,
-                     stage_pred.indirectTargets, tid);
+                     stage_pred.indirectTargets, tid, asidHash);
     }
     DPRINTF(ITTAGE, "putPCHistory end\n");
     debugFlag = false;
@@ -403,8 +405,9 @@ BTBITTAGE::update(const FetchTarget &stream)
                 unsigned startTable = main_found ? main_info.table + 1 : 0;
 
                 for (int ti = startTable; ti < numPredictors; ti++) {
-                    Addr newIndex = getTageIndex(startAddr, ti, updateIndexFoldedHist[ti].get());
-                    Addr newTag = getTageTag(startAddr, ti, updateTagFoldedHist[ti].get(), updateAltTagFoldedHist[ti].get());
+                    Addr newIndex = getTageIndex(startAddr, ti, updateIndexFoldedHist[ti].get(), stream.asidHash);
+                    Addr newTag = getTageTag(startAddr, ti, updateTagFoldedHist[ti].get(),
+                                             updateAltTagFoldedHist[ti].get(), stream.asidHash);
                     assert(newIndex < tageTable[ti].size());
                     auto &newEntry = tageTable[ti][newIndex];
 
@@ -438,7 +441,8 @@ BTBITTAGE::updateCounter(bool taken, unsigned width, short &counter) {
 }
 
 Addr
-BTBITTAGE::getTageTag(Addr pc, int t, uint64_t foldedHist, uint64_t altFoldedHist)
+BTBITTAGE::getTageTag(Addr pc, int t, uint64_t foldedHist, uint64_t altFoldedHist,
+                      uint8_t asidHash)
 {
     // Create mask for tableTagBits[t]
     uint64_t mask = ((1ULL << tableTagBits[t]) - 1);
@@ -450,32 +454,33 @@ BTBITTAGE::getTageTag(Addr pc, int t, uint64_t foldedHist, uint64_t altFoldedHis
     uint64_t altTagBits = (altFoldedHist << 1);
 
     // XOR all components
-    return (pcBits ^ foldedHist ^ altTagBits) & mask;
+    return injectAsidHashIntoTag((pcBits ^ foldedHist ^ altTagBits) & mask,
+                                 tableTagBits[t], asidHash);
 }
 
 Addr
-BTBITTAGE::getTageTag(Addr pc, int t)
+BTBITTAGE::getTageTag(Addr pc, int t, uint8_t asidHash)
 {
     const auto &state = historyState(0);
     return getTageTag(pc, t, state.tagFoldedHist[t].get(),
-                      state.altTagFoldedHist[t].get());
+                      state.altTagFoldedHist[t].get(), asidHash);
 }
 
 Addr
-BTBITTAGE::getTageIndex(Addr pc, int t, uint64_t foldedHist)
+BTBITTAGE::getTageIndex(Addr pc, int t, uint64_t foldedHist, uint8_t asidHash)
 {
     // Create mask for tableIndexBits[t]
     uint64_t mask = ((1ULL << tableIndexBits[t]) - 1);
 
     // Extract lower bits of PC and XOR with folded history
     uint64_t pcBits = (pc >> floorLog2(blockSize));
-    return (pcBits ^ foldedHist) & mask;
+    return xorAsidHashIntoIndex((pcBits ^ foldedHist) & mask, tableIndexBits[t], asidHash);
 }
 
 Addr
-BTBITTAGE::getTageIndex(Addr pc, int t)
+BTBITTAGE::getTageIndex(Addr pc, int t, uint8_t asidHash)
 {
-    return getTageIndex(pc, t, historyState(0).indexFoldedHist[t].get());
+    return getTageIndex(pc, t, historyState(0).indexFoldedHist[t].get(), asidHash);
 }
 
 bool
diff --git a/src/cpu/pred/btb/btb_ittage.hh b/src/cpu/pred/btb/btb_ittage.hh
index 8269fdaeb6..7db7e39350 100644
--- a/src/cpu/pred/btb/btb_ittage.hh
+++ b/src/cpu/pred/btb/btb_ittage.hh
@@ -125,19 +125,20 @@ class BTBITTAGE : public TimedBaseBTBPredictor
 
     // return provided
     void lookupHelper(Addr stream_start, const std::vector<BTBEntry> &btbEntries,
-                      IndirectTargets& results, ThreadID tid);
+                      IndirectTargets& results, ThreadID tid, uint8_t asidHash);
 
     // use blockPC
-    Addr getTageIndex(Addr pc, int table);
+    Addr getTageIndex(Addr pc, int table, uint8_t asidHash = 0);
 
     // use blockPC (uint64_t version for performance)
-    Addr getTageIndex(Addr pc, int table, uint64_t foldedHist);
+    Addr getTageIndex(Addr pc, int table, uint64_t foldedHist, uint8_t asidHash = 0);
 
     // use blockPC
-    Addr getTageTag(Addr pc, int table);
+    Addr getTageTag(Addr pc, int table, uint8_t asidHash = 0);
 
     // use blockPC (uint64_t version for performance)
-    Addr getTageTag(Addr pc, int table, uint64_t foldedHist, uint64_t altFoldedHist);
+    Addr getTageTag(Addr pc, int table, uint64_t foldedHist, uint64_t altFoldedHist,
+                    uint8_t asidHash = 0);
 
     Addr getOffset(Addr pc) {
         return (pc & (blockSize - 1)) >> 1;
diff --git a/src/cpu/pred/btb/btb_tage.cc b/src/cpu/pred/btb/btb_tage.cc
index c81bfb1a1d..7623e591c3 100644
--- a/src/cpu/pred/btb/btb_tage.cc
+++ b/src/cpu/pred/btb/btb_tage.cc
@@ -297,7 +297,8 @@ BTBTAGE::TagePrediction
 BTBTAGE::generateSinglePrediction(const BTBEntry &btb_entry,
                                  const Addr &startPC,
                                  std::shared_ptr<TageMeta> predMeta,
-                                 ThreadID tid) {
+                                 ThreadID tid,
+                                 uint8_t asidHash) {
     DPRINTF(TAGE, "generateSinglePrediction for btbEntry: %#lx\n", btb_entry.pc);
     const auto &state = historyState(tid);
 
@@ -314,12 +315,13 @@ BTBTAGE::generateSinglePrediction(const BTBEntry &btb_entry,
     for (int i = numPredictors - 1; i >= 0; --i) {
         // Calculate index and tag: use snapshot if provided, otherwise use current folded history
         // Tag includes position XOR (like RTL: tag = tempTag ^ cfiPosition)
-        Addr index = predMeta ? getTageIndex(startPC, i, predMeta->indexFoldedHist[i].get())
-                          : getTageIndex(startPC, i, state.indexFoldedHist[i].get());
+        Addr index = predMeta ? getTageIndex(startPC, i, predMeta->indexFoldedHist[i].get(), asidHash)
+                          : getTageIndex(startPC, i, state.indexFoldedHist[i].get(), asidHash);
         Addr tag = predMeta ? getTageTag(startPC, i,
-                            predMeta->tagFoldedHist[i].get(), predMeta->altTagFoldedHist[i].get(), position)
+                            predMeta->tagFoldedHist[i].get(), predMeta->altTagFoldedHist[i].get(),
+                            position, asidHash)
                         : getTageTag(startPC, i, state.tagFoldedHist[i].get(),
-                                     state.altTagFoldedHist[i].get(), position);
+                                     state.altTagFoldedHist[i].get(), position, asidHash);
 
         bool match = false; // for each table, only one way can be matched
         TageEntry matching_entry;
@@ -416,7 +418,7 @@ BTBTAGE::generateSinglePrediction(const BTBEntry &btb_entry,
 void
 BTBTAGE::lookupHelper(const Addr &startPC, const std::vector<BTBEntry> &btbEntries,
                       std::unordered_map<Addr, TageInfoForMGSC> &tageInfoForMgscs,
-                      CondTakens& results, ThreadID tid)
+                      CondTakens& results, ThreadID tid, uint8_t asidHash)
 {
     DPRINTF(TAGE, "lookupHelper startAddr: %#lx\n", startPC);
 
@@ -424,7 +426,7 @@ BTBTAGE::lookupHelper(const Addr &startPC, const std::vector<BTBEntry> &btbEntri
     for (auto &btb_entry : btbEntries) {
         // Only predict for valid conditional branches
         if (btb_entry.isCond && btb_entry.valid) {
-            auto pred = generateSinglePrediction(btb_entry, startPC, nullptr, tid);
+            auto pred = generateSinglePrediction(btb_entry, startPC, nullptr, tid, asidHash);
             threadMeta[tid]->preds[btb_entry.pc] = pred;
             tageStats.updateStatsWithTagePrediction(pred, true);
             results.push_back({btb_entry.pc, pred.taken || btb_entry.alwaysTaken});
@@ -468,6 +470,7 @@ BTBTAGE::dryRunCycle(Addr startPC) {
 void
 BTBTAGE::putPCHistory(Addr startPC, const bitset &history, std::vector<FullBTBPrediction> &stagePreds) {
     const ThreadID tid = predictorTid(stagePreds);
+    const uint8_t asidHash = stagePreds.empty() ? 0 : stagePreds.front().asidHash;
     const auto &state = historyState(tid);
     // Record prediction bank for next tick's conflict detection
     lastPredBankId = getBankId(startPC);
@@ -497,7 +500,7 @@ BTBTAGE::putPCHistory(Addr startPC, const bitset &history, std::vector<FullBTBPr
         auto &stage_pred = stagePreds[s];
         stage_pred.condTakens.clear();
         lookupHelper(startPC, stage_pred.btbEntries, stage_pred.tageInfoForMgscs,
-                     stage_pred.condTakens, tid);
+                     stage_pred.condTakens, tid, asidHash);
     }
 
 }
@@ -720,6 +723,7 @@ BTBTAGE::updatePredictorStateAndCheckAllocation(const BTBEntry &entry,
  * @param actual_taken The actual outcome of the branch
  * @param start_table The starting table for allocation
  * @param meta The metadata of the predictor
+ * @param asidHash The ASID hash used in TAGE index/tag calculation
  * @return true if allocation is successful
  */
 bool
@@ -728,6 +732,7 @@ BTBTAGE::handleNewEntryAllocation(const Addr &startPC,
                                  bool actual_taken,
                                  unsigned start_table,
                                  std::shared_ptr<TageMeta> meta,
+                                 uint8_t asidHash,
                                  AllocationTraceInfo &allocInfo) {
     // Match RTL victim priority:
     // 1) invalid way
@@ -738,9 +743,9 @@ BTBTAGE::handleNewEntryAllocation(const Addr &startPC,
     unsigned position = getBranchIndexInBlock(entry.pc, startPC);
 
     for (unsigned ti = start_table; ti < numPredictors; ++ti) {
-        Addr newIndex = getTageIndex(startPC, ti, meta->indexFoldedHist[ti].get());
+        Addr newIndex = getTageIndex(startPC, ti, meta->indexFoldedHist[ti].get(), asidHash);
         Addr newTag = getTageTag(startPC, ti,
-            meta->tagFoldedHist[ti].get(), meta->altTagFoldedHist[ti].get(), position);
+            meta->tagFoldedHist[ti].get(), meta->altTagFoldedHist[ti].get(), position, asidHash);
 
         auto &set = tageTable[ti][newIndex];
 
@@ -917,10 +922,12 @@ BTBTAGE::update(const FetchTarget &stream) {
 
         TagePrediction recomputed;
         if (updateOnRead || !has_original_pred) {
-            // Reconstruct providers when update-on-read is enabled or when a new
-            // BTB entry lacks prediction-time metadata.
-            recomputed = generateSinglePrediction(btb_entry, startAddr, predMeta);
-            if (has_original_pred && recomputed.taken != original_pred.taken) {
+            // Re-read providers using snapshot (do not rely on prediction-time main/alt)
+            recomputed = generateSinglePrediction(btb_entry, startAddr, predMeta,
+                                                 stream.tid, stream.asidHash);
+            // Track differences for statistics
+            auto it = predMeta->preds.find(btb_entry.pc);
+            if (has_original_pred && it != predMeta->preds.end() && recomputed.taken != original_pred.taken) {
                 hasRecomputedVsOriginalDiff = true;
             }
         } else { // otherwise, use the prediction from the prediction-time main/alt
@@ -944,7 +951,8 @@ BTBTAGE::update(const FetchTarget &stream) {
                 start_table = main_info.table + 1; // start from the table after the main prediction table
             }
             handleNewEntryAllocation(startAddr, btb_entry, actual_taken,
-                                     start_table, predMeta, allocInfo);
+                                     start_table, predMeta, stream.asidHash,
+                                     allocInfo);
         }
 
 #ifndef UNIT_TEST
@@ -1051,7 +1059,8 @@ BTBTAGE::updateCounter(bool taken, unsigned width, short &counter) {
 
 // Calculate TAGE tag with folded history - optimized version using bitwise operations
 Addr
-BTBTAGE::getTageTag(Addr pc, int t, uint64_t foldedHist, uint64_t altFoldedHist, Addr position)
+BTBTAGE::getTageTag(Addr pc, int t, uint64_t foldedHist, uint64_t altFoldedHist,
+                    Addr position, uint8_t asidHash)
 {
     // Create mask for tableTagBits[t] to limit result size
     Addr mask = (1ULL << tableTagBits[t]) - 1;
@@ -1067,19 +1076,20 @@ BTBTAGE::getTageTag(Addr pc, int t, uint64_t foldedHist, uint64_t altFoldedHist,
     Addr altTagBits = (altFoldedHist << 1) & mask;
 
     // XOR all components together, including position (like RTL)
-    return pcBits ^ foldedBits ^ altTagBits ^ position;
+    return injectAsidHashIntoTag(pcBits ^ foldedBits ^ altTagBits ^ position,
+                                 tableTagBits[t], asidHash);
 }
 
 Addr
-BTBTAGE::getTageTag(Addr pc, int t, Addr position)
+BTBTAGE::getTageTag(Addr pc, int t, Addr position, uint8_t asidHash)
 {
     const auto &state = historyState(0);
     return getTageTag(pc, t, state.tagFoldedHist[t].get(),
-                      state.altTagFoldedHist[t].get(), position);
+                      state.altTagFoldedHist[t].get(), position, asidHash);
 }
 
 Addr
-BTBTAGE::getTageIndex(Addr pc, int t, uint64_t foldedHist)
+BTBTAGE::getTageIndex(Addr pc, int t, uint64_t foldedHist, uint8_t asidHash)
 {
     // Create mask for tableIndexBits[t] to limit result size
     Addr mask = (1ULL << tableIndexBits[t]) - 1;
@@ -1088,14 +1098,13 @@ BTBTAGE::getTageIndex(Addr pc, int t, uint64_t foldedHist)
     Addr pcBits = (pc >> pcShift) & mask;
     Addr foldedBits = foldedHist & mask;
 
-    // Support non-power-of-two table sizes when tuning capacities.
-    return (pcBits ^ foldedBits) % tableSizes[t];
+    return xorAsidHashIntoIndex(pcBits ^ foldedBits, tableIndexBits[t], asidHash) % tableSizes[t];
 }
 
 Addr
-BTBTAGE::getTageIndex(Addr pc, int t)
+BTBTAGE::getTageIndex(Addr pc, int t, uint8_t asidHash)
 {
-    return getTageIndex(pc, t, historyState(0).indexFoldedHist[t].get());
+    return getTageIndex(pc, t, historyState(0).indexFoldedHist[t].get(), asidHash);
 }
 
 bool
diff --git a/src/cpu/pred/btb/btb_tage.hh b/src/cpu/pred/btb/btb_tage.hh
index 33bd6826ae..42650a6ea1 100644
--- a/src/cpu/pred/btb/btb_tage.hh
+++ b/src/cpu/pred/btb/btb_tage.hh
@@ -179,21 +179,22 @@ class BTBTAGE : public TimedBaseBTBPredictor
     // Look up predictions in TAGE tables for a stream of instructions
     void lookupHelper(const Addr &startPC, const std::vector<BTBEntry> &btbEntries,
                     std::unordered_map<Addr, TageInfoForMGSC> &tageInfoForMgscs,
-                    CondTakens& results, ThreadID tid);
+                    CondTakens& results, ThreadID tid, uint8_t asidHash);
 
     // Calculate TAGE index for a given PC and table
-    Addr getTageIndex(Addr pc, int table);
+    Addr getTageIndex(Addr pc, int table, uint8_t asidHash = 0);
 
     // Calculate TAGE index with folded history (uint64_t version for performance)
-    Addr getTageIndex(Addr pc, int table, uint64_t foldedHist);
+    Addr getTageIndex(Addr pc, int table, uint64_t foldedHist, uint8_t asidHash = 0);
 
     // Calculate TAGE tag for a given PC and table
     // position: branch position within the block (xored into tag like RTL)
-    Addr getTageTag(Addr pc, int table, Addr position = 0);
+    Addr getTageTag(Addr pc, int table, Addr position = 0, uint8_t asidHash = 0);
 
     // Calculate TAGE tag with folded history (uint64_t version for performance)
     // position: branch position within the block (xored into tag like RTL)
-    Addr getTageTag(Addr pc, int table, uint64_t foldedHist, uint64_t altFoldedHist, Addr position = 0);
+    Addr getTageTag(Addr pc, int table, uint64_t foldedHist, uint64_t altFoldedHist,
+                    Addr position = 0, uint8_t asidHash = 0);
 
     // Get offset within a block for a given PC
     Addr getOffset(Addr pc) {
@@ -466,7 +467,8 @@ private:
     TagePrediction generateSinglePrediction(const BTBEntry &btb_entry,
                                            const Addr &startPC,
                                            const std::shared_ptr<TageMeta> predMeta = nullptr,
-                                           ThreadID tid = 0);
+                                           ThreadID tid = 0,
+                                           uint8_t asidHash = 0);
 
     // Helper method to prepare BTB entries for update
     std::vector<BTBEntry> prepareUpdateEntries(const FetchTarget &stream);
@@ -483,6 +485,7 @@ private:
                                  bool actual_taken,
                                  unsigned main_table,
                                  std::shared_ptr<TageMeta> meta,
+                                 uint8_t asidHash,
                                  AllocationTraceInfo &allocInfo);
 
 
diff --git a/src/cpu/pred/btb/btb_ubtb.cc b/src/cpu/pred/btb/btb_ubtb.cc
index 755d8d8460..5f809713c3 100644
--- a/src/cpu/pred/btb/btb_ubtb.cc
+++ b/src/cpu/pred/btb/btb_ubtb.cc
@@ -137,7 +137,8 @@ void
 UBTB::putPCHistory(Addr startAddr, const boost::dynamic_bitset<> &history, std::vector<FullBTBPrediction> &stagePreds)
 {
     meta = std::make_shared<UBTBMeta>();
-    auto it = lookup(startAddr);
+    const uint8_t asidHash = stagePreds.empty() ? 0 : stagePreds.front().asidHash;
+    auto it = lookup(startAddr, asidHash);
     auto& entry = meta->hit_entry;
     entry = (it != ubtb.end()) ? *it : TickedUBTBEntry();
 
@@ -151,23 +152,29 @@ UBTB::putPCHistory(Addr startAddr, const boost::dynamic_bitset<> &history, std::
 }
 
 UBTB::UBTBIter
-UBTB::lookup(Addr startAddr)
+UBTB::lookup(Addr startAddr, uint8_t asidHash)
 {
     if (startAddr & 0x1) {
         return ubtb.end();  // ignore false hit when lowest bit is 1
     }
 
-    Addr current_tag = getTag(startAddr);
+    Addr current_tag = getTag(startAddr, asidHash);
+    Addr block_end = (startAddr + predictWidth) & ~mask(floorLog2(predictWidth) - 1);
 
     DPRINTF(UBTB, "UBTB: Doing tag comparison for tag %#lx\n", current_tag);
 
     auto it = std::find_if(ubtb.begin(), ubtb.end(),
-                           [current_tag](const TickedUBTBEntry &way) { return way.valid && way.tag == current_tag; });
+                           [current_tag, startAddr, block_end](const TickedUBTBEntry &way) {
+                               return way.valid && way.tag == current_tag &&
+                                      way.pc >= startAddr && way.pc < block_end;
+                           });
 
     if (it != ubtb.end()) {
         // Found a hit - verify no duplicates
-        auto duplicate = std::find_if(std::next(it), ubtb.end(), [current_tag](const TickedUBTBEntry &way) {
-            return way.valid && way.tag == current_tag;
+        auto duplicate = std::find_if(std::next(it), ubtb.end(),
+                                      [current_tag, startAddr, block_end](const TickedUBTBEntry &way) {
+            return way.valid && way.tag == current_tag &&
+                   way.pc >= startAddr && way.pc < block_end;
         });
         if (duplicate != ubtb.end()) {
             DPRINTF(UBTB, "UBTB: Multiple hits found in uBTB for the same tag %#lx\n", current_tag);
@@ -184,7 +191,8 @@ UBTB::lookup(Addr startAddr)
 
 
 void
-UBTB::replaceOldEntry(UBTBIter oldEntryIter, const BTBEntry &newTakenEntry, Addr startAddr)
+UBTB::replaceOldEntry(UBTBIter oldEntryIter, const BTBEntry &newTakenEntry,
+                      Addr startAddr, uint8_t asidHash)
 {
     assert(newTakenEntry.valid);
     TickedUBTBEntry newEntry = TickedUBTBEntry(newTakenEntry, curTick());
@@ -192,7 +200,7 @@ UBTB::replaceOldEntry(UBTBIter oldEntryIter, const BTBEntry &newTakenEntry, Addr
     newEntry.target = newTakenEntry.target;
     newEntry.ctr = 0; // have a bug here:ubtb will accept ctr from mbtb, reset it to 0 at here
     // important: update tag (mbtb and ubtb have different tags, even diffferent tag length)
-    newEntry.tag = getTag(startAddr);
+    newEntry.tag = getTag(startAddr, asidHash);
     *oldEntryIter = newEntry;
 }
 
@@ -213,13 +221,14 @@ UBTB::updateUsingS3Pred(FullBTBPrediction &s3Pred)
     auto startAddr = s3Pred.bbStart;
     UBTBIter oldEntryIter = lastPred.hit_entry;
     takenEntry.source = getComponentIdx();
-    updateNewEntry(oldEntryIter, takenEntry, startAddr);
+    updateNewEntry(oldEntryIter, takenEntry, startAddr, s3Pred.asidHash);
 
 }
 
 
 
-void UBTB::updateNewEntry(UBTBIter oldEntryIter, const BTBEntry &takenEntry, const Addr startAddr)
+void UBTB::updateNewEntry(UBTBIter oldEntryIter, const BTBEntry &takenEntry,
+                          const Addr startAddr, uint8_t asidHash)
 {
     //using the FB final taken branch to update uBTB
     if (oldEntryIter != ubtb.end()) {
@@ -259,7 +268,7 @@ void UBTB::updateNewEntry(UBTBIter oldEntryIter, const BTBEntry &takenEntry, con
             }
 
             // Replace the entry with the new prediction
-            replaceOldEntry(toBeReplacedIter, takenEntry, startAddr);
+            replaceOldEntry(toBeReplacedIter, takenEntry, startAddr, asidHash);
 
         } else if (oldEntryIter != ubtb.end() && takenEntry.valid) {
             ubtbStats.s1Hits3Taken++;
@@ -269,7 +278,7 @@ void UBTB::updateNewEntry(UBTBIter oldEntryIter, const BTBEntry &takenEntry, con
                 updateUCtr(oldEntryIter->uctr, false);
                 if (oldEntryIter->uctr == 0) {
                     // replace the old entry with the new one
-                    replaceOldEntry(oldEntryIter, takenEntry, startAddr);
+                    replaceOldEntry(oldEntryIter, takenEntry, startAddr, asidHash);
                 }
             } else {
                 // S0 and S3 predict the same (brpc and target)
@@ -294,13 +303,15 @@ UBTB::update(const FetchTarget &stream)
      // Use BTBEntry instead of BranchInfo; make it invalid when not taken
     BTBEntry takenEntry = stream.exeTaken ? BTBEntry(stream.exeBranchInfo) : BTBEntry();
     auto startAddr = stream.getRealStartPC();
-    Addr oldtag = getTag(startAddr);
+    Addr oldtag = getTag(startAddr, stream.asidHash);
+    Addr block_end = (startAddr + predictWidth) & ~mask(floorLog2(predictWidth) - 1);
 
     UBTBIter oldEntryIter = ubtb.end();
 
     oldEntryIter = meta->hit_entry.valid ?
-                    std::find_if(ubtb.begin(), ubtb.end(), [oldtag](const TickedUBTBEntry &e) {
-                        return e.valid && e.tag == oldtag;
+                    std::find_if(ubtb.begin(), ubtb.end(), [oldtag, startAddr, block_end](const TickedUBTBEntry &e) {
+                        return e.valid && e.tag == oldtag &&
+                               e.pc >= startAddr && e.pc < block_end;
                     }) : ubtb.end();
 
     if (stream.exeTaken) {
@@ -315,7 +326,7 @@ UBTB::update(const FetchTarget &stream)
     // Verify uBTB state
     assert(ubtb.size() <= numEntries);
     if (!usingS3Pred) {
-        updateNewEntry(oldEntryIter, takenEntry, startAddr);
+        updateNewEntry(oldEntryIter, takenEntry, startAddr, stream.asidHash);
     }
 }
 
diff --git a/src/cpu/pred/btb/btb_ubtb.hh b/src/cpu/pred/btb/btb_ubtb.hh
index 649641b420..4898cec009 100644
--- a/src/cpu/pred/btb/btb_ubtb.hh
+++ b/src/cpu/pred/btb/btb_ubtb.hh
@@ -218,8 +218,11 @@ class UBTB : public TimedBaseBTBPredictor
      *  @param startPC The start address of the fetch block
+     *  @param asidHash Folded ASID hash; it replaces the low
+     *         min(4, tagBits) bits of the PC-derived tag.
      *  @return Returns the tag bits.
      */
-    inline Addr getTag(Addr startPC) {
-        return (startPC >> 1) & tagMask;
+    inline Addr getTag(Addr startPC, uint8_t asidHash) {
+        Addr baseTag = (startPC >> 1) & tagMask;
+        return injectAsidHashIntoTag(baseTag, tagBits, asidHash);
     }
 
     void updateUCtr(unsigned &ctr, bool inc) {
@@ -231,7 +232,7 @@ class UBTB : public TimedBaseBTBPredictor
      * @param startAddr The FB start address to look up
      * @return Iterator to the matching entry if found, or ubtb.end() if not found
      */
-    UBTBIter lookup(Addr startAddr);
+    UBTBIter lookup(Addr startAddr, uint8_t asidHash);
 
     /** helper method called by putPCHistory: Check uBTB entry pc range and update statistics
      * @param entry The uBTB entry to check
@@ -251,10 +252,12 @@ class UBTB : public TimedBaseBTBPredictor
      * @param oldEntry Iterator to the entry to replace
      * @param newPrediction The new prediction to store
      */
-    void replaceOldEntry(UBTBIter oldEntryIter, const BTBEntry &newTakenEntry, Addr startAddr);
+    void replaceOldEntry(UBTBIter oldEntryIter, const BTBEntry &newTakenEntry,
+                         Addr startAddr, uint8_t asidHash);
 
     //using the FB final taken branch to update uBTB
-    void updateNewEntry(UBTBIter oldEntryIter, const BTBEntry &takenEntry, const Addr startAddr);
+    void updateNewEntry(UBTBIter oldEntryIter, const BTBEntry &takenEntry,
+                        const Addr startAddr, uint8_t asidHash);
 
 
     /** The uBTB structure:
diff --git a/src/cpu/pred/btb/common.hh b/src/cpu/pred/btb/common.hh
index b61e459ff6..e40dee3cf2 100644
--- a/src/cpu/pred/btb/common.hh
+++ b/src/cpu/pred/btb/common.hh
@@ -1,6 +1,7 @@
 #ifndef __CPU_PRED_BTB_STREAM_STRUCT_HH__
 #define __CPU_PRED_BTB_STREAM_STRUCT_HH__
 
+#include <algorithm>
 #include <queue>
 #include <string>
 
@@ -18,6 +19,95 @@ namespace branch_prediction {
 
 namespace btb_pred {
 
+/** Width in bits of the folded ASID hash mixed into indices and tags. */
+constexpr unsigned asidHashBits = 4;
+
+/**
+ * Fold a 16-bit ASID into a 4-bit hash by XOR-ing its four nibbles.
+ *
+ * @param asid The 16-bit address-space identifier.
+ * @return The folded hash, in the range [0, 15].
+ */
+inline uint8_t
+foldAsidHash16To4(uint16_t asid)
+{
+    return (asid & 0xf) ^ ((asid >> 4) & 0xf) ^
+           ((asid >> 8) & 0xf) ^ ((asid >> 12) & 0xf);
+}
+
+/**
+ * Replicate the ASID hash nibble across a field of the given width.
+ *
+ * Only the low asidHashBits bits of asid_hash are used, so a wider value
+ * cannot leak stray bits into the neighbouring copies.
+ *
+ * @param asid_hash The folded ASID hash.
+ * @param bits Width of the field to fill; 0 yields 0.
+ * @return The nibble repeated every asidHashBits bits, truncated to bits.
+ */
+inline Addr
+expandAsidHash(uint8_t asid_hash, unsigned bits)
+{
+    if (bits == 0) {
+        return 0;
+    }
+
+    // Never shift by the full width of Addr or more.
+    const unsigned width = std::min<unsigned>(bits, 8 * sizeof(Addr));
+    const Addr nibble = asid_hash & mask(asidHashBits);
+    Addr expanded = 0;
+    for (unsigned shift = 0; shift < width; shift += asidHashBits) {
+        expanded |= nibble << shift;
+    }
+    return expanded & mask(bits);
+}
+
+/**
+ * Overwrite the low bits of a tag with the ASID hash.
+ *
+ * The low min(asidHashBits, tag_bits) bits of base_tag are replaced by the
+ * same bits of asid_hash; all higher bits pass through unchanged.  Whatever
+ * the caller encoded in those low tag bits is therefore lost.
+ * NOTE(review): BTBTAGE::getTageTag XORs the branch position into the tag
+ * before calling this, so its low bits are overwritten -- confirm intended.
+ *
+ * @param base_tag Tag computed without ASID information.
+ * @param tag_bits Width of the tag; 0 yields 0.
+ * @param asid_hash The folded ASID hash.
+ * @return base_tag with its low bits replaced by the hash.
+ */
+inline Addr
+injectAsidHashIntoTag(Addr base_tag, unsigned tag_bits, uint8_t asid_hash)
+{
+    if (tag_bits == 0) {
+        return 0;
+    }
+
+    const unsigned hash_bits = std::min<unsigned>(asidHashBits, tag_bits);
+    const Addr hash_mask = mask(hash_bits);
+    const Addr hash = static_cast<Addr>(asid_hash) & hash_mask;
+    return (base_tag & ~hash_mask) | hash;
+}
+
+/**
+ * XOR the replicated ASID hash into a table index.
+ *
+ * @param base_index Index computed without ASID information.
+ * @param index_bits Width of the index; 0 yields 0.
+ * @param asid_hash The folded ASID hash.
+ * @return The mixed index, truncated to index_bits.
+ */
+inline Addr
+xorAsidHashIntoIndex(Addr base_index, unsigned index_bits, uint8_t asid_hash)
+{
+    if (index_bits == 0) {
+        return 0;
+    }
+
+    const Addr spread = expandAsidHash(asid_hash, index_bits);
+    return (base_index ^ spread) & mask(index_bits);
+}
+
 enum EndType
 {
     END_CALL=0,
@@ -276,6 +320,7 @@ using IndirectTargets = std::vector<std::pair<Addr, Addr>>;
 struct FetchTarget
 {
     ThreadID tid;
+    uint8_t asidHash;
     Addr startPC;       // start pc of the stream
     bool predTaken;     // whether the FetchTarget has taken branch
     Addr predEndPC;     // predicted stream end pc (fall through pc)
@@ -324,6 +369,7 @@ struct FetchTarget
 
    FetchTarget()
        : tid(0),
+         asidHash(0),
          startPC(0),
          predTaken(false),
          predEndPC(0),
@@ -453,6 +499,7 @@ struct FetchTarget
 struct FullBTBPrediction
 {
     ThreadID tid;
+    uint8_t asidHash;
     Addr bbStart;
     std::vector<BTBEntry> btbEntries; // for BTB, only assigned when hit, sorted by inst order
     // for conditional branch predictors, mapped with lowest bits of branches
@@ -474,6 +521,7 @@ struct FullBTBPrediction
 
     FullBTBPrediction() :
         tid(0),
+        asidHash(0),
         bbStart(0),
         btbEntries(),
         condTakens(),
diff --git a/src/cpu/pred/btb/decoupled_bpred.cc b/src/cpu/pred/btb/decoupled_bpred.cc
index 01e9b78ac3..aec2222806 100644
--- a/src/cpu/pred/btb/decoupled_bpred.cc
+++ b/src/cpu/pred/btb/decoupled_bpred.cc
@@ -2,6 +2,7 @@
 
 #include <array>
 
+#include "arch/riscv/regs/misc.hh"
 #include "base/debug_helper.hh"
 #include "base/output.hh"
 #include "cpu/o3/cpu.hh"
@@ -22,6 +23,34 @@ namespace branch_prediction
 namespace btb_pred
 {
 
+/**
+ * Compute the folded ASID hash for a hardware thread.
+ *
+ * Reads satp for the given thread through readMiscRegNoEffect(), extracts
+ * the 16-bit field at bits [59:44] and folds it with foldAsidHash16To4().
+ * NOTE(review): bits [59:44] are the ASID field of the RV64 satp layout;
+ * an RV32 satp places ASID elsewhere -- confirm RV64 is the only target.
+ *
+ * @param tid The thread whose satp is sampled.
+ * @return The folded hash, or 0 while no CPU has been set via setCpu().
+ */
+uint8_t
+DecoupledBPUWithBTB::getThreadAsidHash(ThreadID tid) const
+{
+    // Position and width of the ASID field inside satp.
+    constexpr unsigned satpAsidShift = 44;
+    constexpr unsigned satpAsidBits = 16;
+
+    if (!cpu) {
+        return 0;
+    }
+
+    const RegVal satp =
+        cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_SATP, tid);
+    const uint16_t asid = (satp >> satpAsidShift) & mask(satpAsidBits);
+    return foldAsidHash16To4(asid);
+}
+
 void
 DecoupledBPUWithBTB::consumeFetchTarget(unsigned fetched_inst_num, ThreadID tid)
 {
@@ -209,6 +223,7 @@ DecoupledBPUWithBTB::requestNewPrediction(ThreadID tid)
 {
     auto& thread = threads[tid];
     auto& predsOfEachStage = threads[tid].predsOfEachStage;
+    const uint8_t asid_hash = getThreadAsidHash(tid);
 
     DPRINTF(Override, "Requesting new prediction for PC %#lx\n", thread.s0PC);
 
@@ -216,6 +231,7 @@ DecoupledBPUWithBTB::requestNewPrediction(ThreadID tid)
     clearPreds(tid);
     for (int i = 0; i < numStages; i++) {
         predsOfEachStage[i].tid = tid;
+        predsOfEachStage[i].asidHash = asid_hash;
         predsOfEachStage[i].bbStart = thread.s0PC;
         predsOfEachStage[i].predSource = i;
     }
@@ -781,6 +797,7 @@ DecoupledBPUWithBTB::createFetchTargetEntry(ThreadID tid)
     // Create a new fetch target entry
     FetchTarget entry;
     entry.tid = tid;
+    entry.asidHash = finalPred.asidHash;
     entry.startPC = s0PC;
 
     // Extract branch prediction information
diff --git a/src/cpu/pred/btb/decoupled_bpred.hh b/src/cpu/pred/btb/decoupled_bpred.hh
index 2552ce9e44..0a46c1a4e5 100644
--- a/src/cpu/pred/btb/decoupled_bpred.hh
+++ b/src/cpu/pred/btb/decoupled_bpred.hh
@@ -330,6 +330,7 @@ class DecoupledBPUWithBTB : public BPredUnit
     }
 
     void setCpu(CPU *_cpu) { cpu = _cpu; }
+    uint8_t getThreadAsidHash(ThreadID tid) const;
 
     void consumeFetchTarget(unsigned fetched_inst_num, ThreadID tid);
 
diff --git a/src/cpu/pred/btb/mbtb.cc b/src/cpu/pred/btb/mbtb.cc
index abd2923739..de1e764fce 100644
--- a/src/cpu/pred/btb/mbtb.cc
+++ b/src/cpu/pred/btb/mbtb.cc
@@ -299,8 +299,9 @@ MBTB::putPCHistory(Addr startAddr,
                          std::vector<FullBTBPrediction> &stagePreds)
 {
     meta = std::make_shared<BTBMeta>();
+    const uint8_t asidHash = stagePreds.empty() ? 0 : stagePreds.front().asidHash;
     // Lookup all matching entries in BTB
-    auto find_entries = lookup(startAddr, meta);
+    auto find_entries = lookup(startAddr, asidHash, meta);
 
     // Process BTB entries
     auto processed_entries = processEntries(find_entries, startAddr);
@@ -335,7 +336,7 @@ MBTB::recoverHist(const boost::dynamic_bitset<> &history, const FetchTarget &ent
  * @return Vector of matching BTB entries
  */
 std::vector<MBTB::TickedBTBEntry>
-MBTB::lookupSingleBlock(Addr block_pc)
+MBTB::lookupSingleBlock(Addr block_pc, uint8_t asidHash)
 {
     std::vector<TickedBTBEntry> res;
     if (block_pc & 0x1) {
@@ -346,11 +347,11 @@ MBTB::lookupSingleBlock(Addr block_pc)
     auto& target_sram = (sram_id == 0) ? sram0 : sram1;
     auto& target_mru = (sram_id == 0) ? mru0 : mru1;
     
-    Addr btb_idx = getIndex(block_pc);
+    Addr btb_idx = getIndex(block_pc, asidHash);
     auto& btb_set = target_sram[btb_idx];
     assert(btb_idx < numSets);
 
-    Addr current_tag = getTag(block_pc);
+    Addr current_tag = getTag(block_pc, asidHash);
     DPRINTF(BTB, "BTB: Doing tag comparison for SRAM%d index 0x%lx tag %#lx\n",
         sram_id, btb_idx, current_tag);
         
@@ -365,7 +366,7 @@ MBTB::lookupSingleBlock(Addr block_pc)
 }
 
 std::vector<MBTB::TickedBTBEntry>
-MBTB::lookup(Addr block_pc, std::shared_ptr<BTBMeta> meta)
+MBTB::lookup(Addr block_pc, uint8_t asidHash, std::shared_ptr<BTBMeta> meta)
 {
     std::vector<TickedBTBEntry> res;
     if (block_pc & 0x1) {
@@ -376,15 +377,15 @@ MBTB::lookup(Addr block_pc, std::shared_ptr<BTBMeta> meta)
     // Calculate 32B aligned address
     Addr alignedPC = block_pc & ~(blockSize - 1);
     // Lookup first 32B block
-    res = lookupSingleBlock(alignedPC);
+    res = lookupSingleBlock(alignedPC, asidHash);
     // Lookup next 32B block
-    auto nextBlockRes = lookupSingleBlock(alignedPC + blockSize);
+    auto nextBlockRes = lookupSingleBlock(alignedPC + blockSize, asidHash);
     // Merge results
     res.insert(res.end(), nextBlockRes.begin(), nextBlockRes.end());
 
     // lookup victim cache if victim cache is enabled
     if (victimCacheSize > 0) {
-        auto victimResults = lookupVictimCache(block_pc);
+        auto victimResults = lookupVictimCache(block_pc, asidHash);
         if (!victimResults.empty()) {
             DPRINTF(BTB, "Victim cache hit for lookup at %#lx\n", block_pc);
             btbStats.victimCacheHit++;
@@ -460,7 +461,7 @@ MBTB::getAndSetNewBTBEntry(FetchTarget &stream)
     }
 
     // Set tag and update stream metadata for use in update()
-    entry_to_write.tag = getTag(entry_to_write.pc);
+    entry_to_write.tag = getTag(entry_to_write.pc, stream.asidHash);
     stream.updateNewBTBEntry = entry_to_write;
     stream.updateIsOldEntry = is_old_entry;
 }
@@ -508,7 +509,7 @@ MBTB::updateBTBEntry(const BTBEntry& entry, const FetchTarget &stream)
     auto& target_mru = (sram_id == 0) ? mru0 : mru1;
     
     // Calculate index and tag for this entry
-    Addr btb_idx = getIndex(entry.pc);
+    Addr btb_idx = getIndex(entry.pc, stream.asidHash);
 
     // Look for matching entry in the target SRAM
     bool found = false;
@@ -564,7 +565,7 @@ MBTB::buildUpdatedEntry(const BTBEntry& req_entry,
                               ? BTBEntry(*existing_entry)
                               : req_entry;
     // Always recalculate tag based on the actual PC being written
-    entry_to_write.tag = getTag(entry_to_write.pc);
+    entry_to_write.tag = getTag(entry_to_write.pc, stream.asidHash);
     entry_to_write.resolved = false; // reset resolved status
 
     // Update saturating counter and alwaysTaken
@@ -723,7 +724,7 @@ MBTB::prepareUpdateEntries(const FetchTarget &stream) {
  * Victim cache operations implementation
  */
 std::vector<MBTB::TickedBTBEntry>
-MBTB::lookupVictimCache(Addr block_pc)
+MBTB::lookupVictimCache(Addr block_pc, uint8_t asidHash)
 {
     std::vector<TickedBTBEntry> results;
     Addr alignedPC = block_pc & ~(blockSize - 1);
@@ -735,7 +736,7 @@ MBTB::lookupVictimCache(Addr block_pc)
         Addr entryAlignedPC = entry.pc & ~(blockSize - 1);
         // Check if this entry is in either of the two 32B blocks we're looking for
         if (entryAlignedPC == alignedPC || entryAlignedPC == (alignedPC + blockSize)) {
-            Addr current_tag = getTag(entry.pc);
+            Addr current_tag = getTag(entry.pc, asidHash);
             if (entry.tag == current_tag) {
                 results.push_back(entry);
                 DPRINTF(BTB, "Victim cache hit for pc %#lx\n", entry.pc);
diff --git a/src/cpu/pred/btb/mbtb.hh b/src/cpu/pred/btb/mbtb.hh
index b4f587a141..3b2ec76fe4 100644
--- a/src/cpu/pred/btb/mbtb.hh
+++ b/src/cpu/pred/btb/mbtb.hh
@@ -215,8 +215,11 @@ class MBTB : public TimedBaseBTBPredictor
      *  @param inst_PC The branch to look up.
+     *  @param asidHash Folded ASID hash, replicated and XORed into the
+     *         floorLog2(numSets)-bit set index.
      *  @return Returns the index into the BTB.
      */
-    inline Addr getIndex(Addr instPC) {
-        return (instPC >> idxShiftAmt) & idxMask;
+    inline Addr getIndex(Addr instPC, uint8_t asidHash) {
+        Addr baseIndex = (instPC >> idxShiftAmt) & idxMask;
+        return xorAsidHashIntoIndex(baseIndex, floorLog2(numSets), asidHash);
     }
 
     /** Returns the tag bits of a given address.
@@ -225,8 +226,11 @@ class MBTB : public TimedBaseBTBPredictor
      *  @param inst_PC The branch's address.
+     *  @param asidHash Folded ASID hash; it replaces the low
+     *         min(4, tagBits) bits of the PC-derived tag.
      *  @return Returns the tag bits.
      */
-    inline Addr getTag(Addr instPC) {
-        return (instPC >> tagShiftAmt) & tagMask;
+    inline Addr getTag(Addr instPC, uint8_t asidHash) {
+        Addr baseTag = (instPC >> tagShiftAmt) & tagMask;
+        return injectAsidHashIntoTag(baseTag, tagBits, asidHash);
     }
 
     /** Update the 2-bit saturating counter for conditional branches
@@ -340,16 +342,16 @@ class MBTB : public TimedBaseBTBPredictor
      *  @param inst_PC The address of the block to look up.
      *  @return Returns all hit BTB entries.
      */
-    std::vector<TickedBTBEntry> lookup(Addr block_pc, std::shared_ptr<BTBMeta> meta);
+    std::vector<TickedBTBEntry> lookup(Addr block_pc, uint8_t asidHash, std::shared_ptr<BTBMeta> meta);
 
     /** Helper function to lookup entries in a single block
      * @param block_pc The aligned PC to lookup
      * @return Vector of matching BTB entries
      */
-    std::vector<TickedBTBEntry> lookupSingleBlock(Addr block_pc);
+    std::vector<TickedBTBEntry> lookupSingleBlock(Addr block_pc, uint8_t asidHash);
 
     /** Victim cache operations */
-    std::vector<TickedBTBEntry> lookupVictimCache(Addr block_pc);
+    std::vector<TickedBTBEntry> lookupVictimCache(Addr block_pc, uint8_t asidHash);
     void insertVictimCache(const TickedBTBEntry& evicted_entry);
     bool eraseFromVictimCacheByPC(Addr pc);
 
diff --git a/src/cpu/pred/btb/microtage.cc b/src/cpu/pred/btb/microtage.cc
index 7fd88b0845..d4cb7b4533 100644
--- a/src/cpu/pred/btb/microtage.cc
+++ b/src/cpu/pred/btb/microtage.cc
@@ -212,7 +212,8 @@ MicroTAGE::TagePrediction
 MicroTAGE::generateSinglePrediction(const BTBEntry &btb_entry,
                                  const Addr &startPC,
                                  std::shared_ptr<TageMeta> predMeta,
-                                 ThreadID tid) {
+                                 ThreadID tid,
+                                 uint8_t asidHash) {
     DPRINTF(UTAGE, "generateSinglePrediction for btbEntry: %#lx\n", btb_entry.pc);
     const auto &state = historyState(tid);
 
@@ -227,12 +228,13 @@ MicroTAGE::generateSinglePrediction(const BTBEntry &btb_entry,
         // Calculate index and tag: use snapshot if provided, otherwise use current folded history
         // Tag includes position XOR (like RTL: tag = tempTag ^ cfiPosition)
         Addr index = predMeta ? getTageIndex(startPC, i,
-                            predMeta->indexFoldedHist[i].get())
-                          : getTageIndex(startPC, i, state.indexFoldedHist[i].get());
+                            predMeta->indexFoldedHist[i].get(), asidHash)
+                          : getTageIndex(startPC, i, state.indexFoldedHist[i].get(), asidHash);
         Addr tag = predMeta ? getTageTag(startPC, i,
-                            predMeta->tagFoldedHist[i].get(),predMeta->altTagFoldedHist[i].get(), position)
+                            predMeta->tagFoldedHist[i].get(),predMeta->altTagFoldedHist[i].get(),
+                            position, asidHash)
                         : getTageTag(startPC, i, state.tagFoldedHist[i].get(),
-                                     state.altTagFoldedHist[i].get(), position);
+                                     state.altTagFoldedHist[i].get(), position, asidHash);
 
         bool match = false; // for each table, only one way can be matched
         TageEntry matching_entry;
@@ -289,7 +291,7 @@ MicroTAGE::generateSinglePrediction(const BTBEntry &btb_entry,
  */
 void
 MicroTAGE::lookupHelper(const Addr &startPC, const std::vector<BTBEntry> &btbEntries,
-                        CondTakens& results, ThreadID tid)
+                        CondTakens& results, ThreadID tid, uint8_t asidHash)
 {
     DPRINTF(UTAGE, "lookupHelper startAddr: %#lx\n", startPC);
 
@@ -298,7 +300,7 @@ MicroTAGE::lookupHelper(const Addr &startPC, const std::vector<BTBEntry> &btbEnt
         // Only predict for valid conditional branches
         if (btb_entry.isCond && btb_entry.valid) {
             auto pred = generateSinglePrediction(btb_entry, startPC, nullptr,
-                                                 tid);
+                                                 tid, asidHash);
             threadMeta[tid]->preds[btb_entry.pc] = pred;
             tageStats.updateStatsWithTagePrediction(pred, true);
             results.push_back({btb_entry.pc, pred.taken || btb_entry.alwaysTaken});
@@ -331,6 +333,7 @@ MicroTAGE::dryRunCycle(Addr startPC) {
 void
 MicroTAGE::putPCHistory(Addr startPC, const bitset &history, std::vector<FullBTBPrediction> &stagePreds) {
     const ThreadID tid = predictorTid(stagePreds);
+    const uint8_t asidHash = stagePreds.empty() ? 0 : stagePreds.front().asidHash;
     const auto &state = historyState(tid);
     // Record prediction bank for next tick's conflict detection
     lastPredBankId = getBankId(startPC);
@@ -368,7 +371,7 @@ MicroTAGE::putPCHistory(Addr startPC, const bitset &history, std::vector<FullBTB
         auto &stage_pred = stagePreds[s];
         stage_pred.condTakens.clear();
         lookupHelper(startPC, stage_pred.btbEntries, stage_pred.condTakens,
-                     tid);
+                     tid, asidHash);
     }
 
 }
@@ -536,6 +539,7 @@ MicroTAGE::handleNewEntryAllocation(const Addr &startPC,
                                  bool actual_taken,
                                  unsigned start_table,
                                  std::shared_ptr<TageMeta> meta,
+                                 uint8_t asidHash,
                                  uint64_t &allocated_table,
                                  uint64_t &allocated_index,
                                  uint64_t &allocated_way) {
@@ -549,9 +553,10 @@ MicroTAGE::handleNewEntryAllocation(const Addr &startPC,
 
     for (unsigned ti = start_table; ti < numPredictors; ++ti) {
         Addr newIndex = getTageIndex(startPC, ti,
-            meta->indexFoldedHist[ti].get());
+            meta->indexFoldedHist[ti].get(), asidHash);
         Addr newTag = getTageTag(startPC, ti,
-            meta->tagFoldedHist[ti].get(), meta->altTagFoldedHist[ti].get(), position);
+            meta->tagFoldedHist[ti].get(), meta->altTagFoldedHist[ti].get(),
+            position, asidHash);
 
         auto &set = tageTable[ti][newIndex];
 
@@ -679,7 +684,8 @@ MicroTAGE::update(const FetchTarget &stream) {
         TagePrediction recomputed;
         if (updateOnRead) { // if update on read is enabled, re-read providers using snapshot
             // Re-read providers using snapshot (do not rely on prediction-time main/alt)
-            recomputed = generateSinglePrediction(btb_entry, startAddr, predMeta);
+            recomputed = generateSinglePrediction(btb_entry, startAddr, predMeta,
+                                                 stream.tid, stream.asidHash);
         } else { // otherwise, use the prediction from the prediction-time main/alt
             auto pred_it = predMeta->preds.find(btb_entry.pc);
             if (pred_it != predMeta->preds.end()) {
@@ -687,7 +693,8 @@ MicroTAGE::update(const FetchTarget &stream) {
             } else {
                 DPRINTF(UTAGE, "update: missing predMeta entry for pc %#lx, recompute with snapshot\n",
                         btb_entry.pc);
-                recomputed = generateSinglePrediction(btb_entry, startAddr, predMeta);
+                recomputed = generateSinglePrediction(btb_entry, startAddr, predMeta,
+                                                     stream.tid, stream.asidHash);
             }
         }
         if (recomputed.mainprovided) {
@@ -710,7 +717,8 @@ MicroTAGE::update(const FetchTarget &stream) {
                 start_table = main_info.table + 1; // start from the table after the main prediction table
             }
             alloc_success = handleNewEntryAllocation(startAddr, btb_entry, actual_taken,
-                                   start_table, predMeta, allocated_table, allocated_index, allocated_way);
+                                   start_table, predMeta, stream.asidHash,
+                                   allocated_table, allocated_index, allocated_way);
         }
 
 #ifndef UNIT_TEST
@@ -792,7 +800,8 @@ MicroTAGE::updateCounter(bool taken, unsigned width, short &counter) {
 
 // Calculate TAGE tag with folded history - optimized version using bitwise operations
 Addr
-MicroTAGE::getTageTag(Addr pc, int t, uint64_t foldedHist, uint64_t altFoldedHist, Addr position)
+MicroTAGE::getTageTag(Addr pc, int t, uint64_t foldedHist, uint64_t altFoldedHist,
+                      Addr position, uint8_t asidHash)
 {
     // Create mask for tableTagBits[t] to limit result size
     Addr mask = (1ULL << tableTagBits[t]) - 1;
@@ -807,11 +816,12 @@ MicroTAGE::getTageTag(Addr pc, int t, uint64_t foldedHist, uint64_t altFoldedHis
     Addr altTagBits = (altFoldedHist << 1) & mask;
 
     // XOR all components together, including position (like RTL)
-    return pcBits ^ foldedBits ^ position ^ altTagBits;
+    return injectAsidHashIntoTag(pcBits ^ foldedBits ^ position ^ altTagBits,
+                                 tableTagBits[t], asidHash);
 }
 
 Addr
-MicroTAGE::getTageIndex(Addr pc, int t, uint64_t foldedHist)
+MicroTAGE::getTageIndex(Addr pc, int t, uint64_t foldedHist, uint8_t asidHash)
 {
     // Create mask for tableIndexBits[t] to limit result size
     Addr mask = (1ULL << tableIndexBits[t]) - 1;
@@ -820,13 +830,13 @@ MicroTAGE::getTageIndex(Addr pc, int t, uint64_t foldedHist)
     Addr pcBits = (pc >> pcShift) & mask;
     Addr foldedBits = foldedHist & mask;
 
-    return pcBits ^ foldedBits;
+    return xorAsidHashIntoIndex(pcBits ^ foldedBits, tableIndexBits[t], asidHash);
 }
 
 Addr
-MicroTAGE::getTageIndex(Addr pc, int t)
+MicroTAGE::getTageIndex(Addr pc, int t, uint8_t asidHash)
 {
-    return getTageIndex(pc, t, historyState(0).indexFoldedHist[t].get());
+    return getTageIndex(pc, t, historyState(0).indexFoldedHist[t].get(), asidHash);
 }
 
 bool
diff --git a/src/cpu/pred/btb/microtage.hh b/src/cpu/pred/btb/microtage.hh
index 3a5fcc518c..51dc756746 100644
--- a/src/cpu/pred/btb/microtage.hh
+++ b/src/cpu/pred/btb/microtage.hh
@@ -168,17 +168,18 @@ class MicroTAGE : public TimedBaseBTBPredictor
 
     // Look up predictions in TAGE tables for a stream of instructions
     void lookupHelper(const Addr &startPC, const std::vector<BTBEntry> &btbEntries,
-                      CondTakens& results, ThreadID tid);
+                      CondTakens& results, ThreadID tid, uint8_t asidHash);
 
     // Calculate TAGE index for a given PC and table
-    Addr getTageIndex(Addr pc, int table);
+    Addr getTageIndex(Addr pc, int table, uint8_t asidHash = 0);
 
     // Calculate TAGE index with folded history (uint64_t version for performance)
-    Addr getTageIndex(Addr pc, int table, uint64_t foldedHist);
+    Addr getTageIndex(Addr pc, int table, uint64_t foldedHist, uint8_t asidHash = 0);
 
     // Calculate TAGE tag with folded history (uint64_t version for performance)
     // position: branch position within the block (xored into tag like RTL)
-    Addr getTageTag(Addr pc, int table, uint64_t foldedHist, uint64_t altFoldedHist, Addr position = 0);
+    Addr getTageTag(Addr pc, int table, uint64_t foldedHist, uint64_t altFoldedHist,
+                    Addr position = 0, uint8_t asidHash = 0);
 
     // Get branch index within a prediction block
     unsigned getBranchIndexInBlock(Addr branchPC, Addr startPC);
@@ -355,7 +356,8 @@ private:
     TagePrediction generateSinglePrediction(const BTBEntry &btb_entry,
                                            const Addr &startPC,
                                            const std::shared_ptr<TageMeta> predMeta = nullptr,
-                                           ThreadID tid = 0);
+                                           ThreadID tid = 0,
+                                           uint8_t asidHash = 0);
 
     // Helper method to prepare BTB entries for update
     std::vector<BTBEntry> prepareUpdateEntries(const FetchTarget &stream);
@@ -372,6 +374,7 @@ private:
                                  bool actual_taken,
                                  unsigned main_table,
                                  std::shared_ptr<TageMeta> meta,
+                                 uint8_t asidHash,
                                  uint64_t &allocated_table,
                                  uint64_t &allocated_index,
                                  uint64_t &allocated_way);

From c79fec16c5483d6a59d7a3c23c4f33f72a80f449 Mon Sep 17 00:00:00 2001
From: tastynoob <934348725@qq.com>
Date: Thu, 9 Apr 2026 11:50:38 +0800
Subject: [PATCH 14/38] cpu-o3: fix smt thread-local inst stop threshold

Use per-thread committed instruction counts for O3 warmup and stat-dump stop checks in SMT mode, instead of summing instructions across threads.

Change-Id: I6ecd5f96a18ce9aa96d0712a9e05f3d8dedcbac4
---
 src/cpu/o3/cpu.cc | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/cpu/o3/cpu.cc b/src/cpu/o3/cpu.cc
index 5961aed7b1..9433193c06 100644
--- a/src/cpu/o3/cpu.cc
+++ b/src/cpu/o3/cpu.cc
@@ -1377,10 +1377,10 @@ CPU::instDone(ThreadID tid, const DynInstPtr &inst)
             cpi_r.roll(1);
         }
 
-        uint64_t committedInsts = totalInsts();
+        const uint64_t committedThreadInsts = thread[tid]->numInst;
 
         if (this->nextDumpInstCount && !dump_done
-                && committedInsts >= this->nextDumpInstCount) {
+                && committedThreadInsts >= this->nextDumpInstCount) {
             fprintf(stderr, "Will trigger stat dump and reset\n");
             statistics::schedStatEvent(true, true, curTick(), 0);
             scheduleInstStop(tid,0,"Will trigger stat dump and reset");
@@ -1394,7 +1394,8 @@ CPU::instDone(ThreadID tid, const DynInstPtr &inst)
         // Check for instruction-count-based events.
         thread[tid]->comInstEventQueue.serviceEvents(thread[tid]->numInst);
 
-        if (this->warmupInstCount && !warmup_done && committedInsts >= this->warmupInstCount) {
+        if (this->warmupInstCount && !warmup_done &&
+                committedThreadInsts >= this->warmupInstCount) {
             fprintf(stderr, "Will trigger stat dump and reset\n");
             statistics::schedStatEvent(true, true, curTick(), 0);
             scheduleInstStop(tid,0,"Will trigger stat dump and reset");

From 1105968e1019cae1bcfe54d6b9d3dffc9f095b8a Mon Sep 17 00:00:00 2001
From: mhnGitHubz <66786667+mhnGitHubz@users.noreply.github.com>
Date: Thu, 9 Apr 2026 11:51:18 +0800
Subject: [PATCH 15/38] cpu-o3: fix Decoder scheduler, thread 1 count is
 incorrect (#816)

Co-authored-by: mo haonan <mohaonan@node023.bosccluster.com>
---
 src/cpu/o3/decode.cc | 26 ++++++++++++++++++++++++++
 src/cpu/o3/decode.hh |  4 ++++
 src/cpu/o3/fetch.cc  | 30 +++++++++++++++++++++++++++++-
 src/cpu/o3/fetch.hh  |  8 ++++++++
 src/cpu/o3/iew.cc    |  7 ++++---
 5 files changed, 71 insertions(+), 4 deletions(-)

diff --git a/src/cpu/o3/decode.cc b/src/cpu/o3/decode.cc
index 93ede3d673..0d36e05a85 100644
--- a/src/cpu/o3/decode.cc
+++ b/src/cpu/o3/decode.cc
@@ -138,8 +138,14 @@ Decode::DecodeStats::DecodeStats(CPU *cpu)
     : statistics::Group(cpu, "decode"),
       ADD_STAT(idleCycles, statistics::units::Cycle::get(),
                "Number of cycles decode is idle"),
+      ADD_STAT(smtidleCycles, statistics::units::Cycle::get(),
+             "Number of cycles fetch was idle per tid"),           
       ADD_STAT(blockedCycles, statistics::units::Cycle::get(),
                "Number of cycles decode is blocked"),
+      ADD_STAT(smtblockedCycles, statistics::units::Cycle::get(),
+             "Number of cycles fetch has spent blocked per tid"),  
+      ADD_STAT(smtnotactiveCycles, statistics::units::Cycle::get(),
+             "Number of cycles fetch no active per tid"),                
       ADD_STAT(runCycles, statistics::units::Cycle::get(),
                "Number of cycles decode is running"),
       ADD_STAT(unblockCycles, statistics::units::Cycle::get(),
@@ -179,6 +185,16 @@ Decode::DecodeStats::DecodeStats(CPU *cpu)
     mispredictedByPC.flags(statistics::total);
     mispredictedByNPC.flags(statistics::total);
     fusedInsts.init(128).flags(statistics::nozero);
+
+    smtidleCycles
+            .init(4)
+            .flags(statistics::total);
+    smtblockedCycles
+            .init(4)
+            .flags(statistics::total);    
+    smtnotactiveCycles
+            .init(4)
+            .flags(statistics::total);          
 }
 
 void
@@ -488,6 +504,15 @@ Decode::tick()
         bool block = stallSig->blockDecode[i];
         bool active = !block && !fixedbuffer[i].empty();
 
+        if(block){
+            ++stats.smtblockedCycles[i];
+        }
+
+        if(!active)
+        {
+            ++stats.smtnotactiveCycles[i];
+        }
+
         stallSig->blockFetch[i] = block || fifoBackpressured;
         stallSig->fetchBlockReason[i] =
             stallSig->blockFetch[i] ?
@@ -583,6 +608,7 @@ Decode::decodeInsts(ThreadID tid)
                 " early.\n",tid);
         // Should I change the status to idle?
         ++stats.idleCycles;
+        ++stats.smtidleCycles[tid];
 
         StallReason stall = StallReason::NoStall;
         for (auto iter : fromFetch->fetchStallReason) {
diff --git a/src/cpu/o3/decode.hh b/src/cpu/o3/decode.hh
index c548fad3c7..f2e39b56a6 100644
--- a/src/cpu/o3/decode.hh
+++ b/src/cpu/o3/decode.hh
@@ -259,8 +259,12 @@ class Decode
 
         /** Stat for total number of idle cycles. */
         statistics::Scalar idleCycles;
+
+        statistics::Vector smtidleCycles;
         /** Stat for total number of blocked cycles. */
         statistics::Scalar blockedCycles;
+        statistics::Vector smtblockedCycles;
+        statistics::Vector smtnotactiveCycles;
         /** Stat for total number of normal running cycles. */
         statistics::Scalar runCycles;
         /** Stat for total number of unblocking cycles. */
diff --git a/src/cpu/o3/fetch.cc b/src/cpu/o3/fetch.cc
index f95738bd2c..ff31aa9bb9 100644
--- a/src/cpu/o3/fetch.cc
+++ b/src/cpu/o3/fetch.cc
@@ -204,8 +204,12 @@ Fetch::FetchStatGroup::FetchStatGroup(CPU *cpu, Fetch *fetch)
              "Number of cycles fetch has spent waiting for tlb"),
     ADD_STAT(idleCycles, statistics::units::Cycle::get(),
              "Number of cycles fetch was idle"),
+    ADD_STAT(smtidleCycles, statistics::units::Cycle::get(),
+             "Number of cycles fetch was idle per tid"),         
     ADD_STAT(blockedCycles, statistics::units::Cycle::get(),
              "Number of cycles fetch has spent blocked"),
+    ADD_STAT(smtblockedCycles, statistics::units::Cycle::get(),
+             "Number of cycles fetch has spent blocked per tid"),         
     ADD_STAT(miscStallCycles, statistics::units::Cycle::get(),
              "Number of cycles fetch has spent waiting on interrupts, or bad "
              "addresses, or out of MSHRs"),
@@ -241,6 +245,10 @@ Fetch::FetchStatGroup::FetchStatGroup(CPU *cpu, Fetch *fetch)
              "Distribution of fetch status"),
     ADD_STAT(decodeStalls, statistics::units::Count::get(),
              "Number of decode stalls"),
+    ADD_STAT(smtdecodeStalls, statistics::units::Count::get(),
+             "Number of decode stalls per tid"),  
+    ADD_STAT(smtftqempty, statistics::units::Count::get(),
+             "Number of ftq empty per tid"),                  
     ADD_STAT(decodeStallRate, statistics::units::Rate<
                     statistics::units::Count, statistics::units::Cycle>::get(),
              "Number of decode stalls per cycle",
@@ -336,6 +344,18 @@ Fetch::FetchStatGroup::FetchStatGroup(CPU *cpu, Fetch *fetch)
         }
         decodeStalls
             .prereq(decodeStalls);
+        smtdecodeStalls
+            .init(fetch->numThreads)
+            .flags(statistics::total);  
+        smtftqempty
+            .init(fetch->numThreads)
+            .flags(statistics::total);
+        smtidleCycles
+            .init(fetch->numThreads)
+            .flags(statistics::total);
+        smtblockedCycles
+            .init(fetch->numThreads)
+            .flags(statistics::total);     
         decodeStallRate
             .flags(statistics::total);
         fetchBubbles
@@ -1401,9 +1421,12 @@ Fetch::sendInstructionsToDecode()
     for (int i = 0; i < numThreads; i++) {
         if (!stallSig->blockFetch[i]) {
             any_thread_active = true;
-            break;
+            //break;
+        }else{
+            fetchStats.smtdecodeStalls[i]++; 
         }
     }
+
     if (!any_thread_active) {
         // All threads are blocked, no instructions to send
         ThreadID blocked_tid = InvalidThreadID;
@@ -1427,6 +1450,7 @@ Fetch::sendInstructionsToDecode()
     }
 
     ThreadID tid =selectUnstalledThread();
+    DPRINTF(Fetch, "select Unstalled [tid:%i]\n",tid);
 
     // fetch totally stalled
     if (stallSig->blockFetch[tid]) {
@@ -1512,6 +1536,7 @@ Fetch::measureFrontendBubbles(unsigned insts_to_decode, ThreadID tid)
 
     if (stallSig->blockFetch[tid]) {
         fetchStats.decodeStalls++;
+        //fetchStats.smtdecodeStalls[tid]++;
     }
 }
 
@@ -1849,6 +1874,7 @@ Fetch::prepareFetchAddress(ThreadID tid, bool &status_change)
     } else {
         if (fetchStatus[tid] == Idle) {
             ++fetchStats.idleCycles;
+            ++fetchStats.smtidleCycles[tid];
             DPRINTF(Fetch, "[tid:%i] Fetch is idle!\n", tid);
         }
         // Status is Idle, so fetch should do nothing.
@@ -2111,6 +2137,7 @@ Fetch::sendNextCacheRequest(ThreadID tid, const PCStateBase &pc_state) {
     }
 
     if (ftqEmpty(tid)) {
+        ++fetchStats.smtftqempty[tid];
         DPRINTF(Fetch, "[tid:%i] No FSQ entry available for next fetch\n", tid);
         return;
     }
@@ -2183,6 +2210,7 @@ Fetch::profileStall(ThreadID tid)
         DPRINTF(Fetch, "Fetch has no active thread!\n");
     } else if (fetchStatus[tid] == Blocked) {
         ++fetchStats.blockedCycles;
+        ++fetchStats.smtblockedCycles[tid];
         DPRINTF(Fetch, "[tid:%i] Fetch is blocked!\n", tid);
     } else if (fetchStatus[tid] == Squashing) {
         ++fetchStats.squashCycles;
diff --git a/src/cpu/o3/fetch.hh b/src/cpu/o3/fetch.hh
index 0061b87912..18e6159022 100644
--- a/src/cpu/o3/fetch.hh
+++ b/src/cpu/o3/fetch.hh
@@ -1054,8 +1054,12 @@ class Fetch
          * the pipeline.
          */
         statistics::Scalar idleCycles;
+
+        statistics::Vector smtidleCycles;
         /** Total number of cycles spent blocked. */
         statistics::Scalar blockedCycles;
+
+        statistics::Vector smtblockedCycles;
         /** Total number of cycles spent in any other state. */
         statistics::Scalar miscStallCycles;
         /** Total number of cycles spent in waiting for drains. */
@@ -1091,6 +1095,10 @@ class Fetch
         statistics::Vector fetchStatusDist;
         /** Number of decode stalls */
         statistics::Scalar decodeStalls;
+
+        statistics::Vector smtdecodeStalls;
+
+        statistics::Vector smtftqempty;
         /** Number of decode stalls per cycle */
         statistics::Formula decodeStallRate;
         /** Unutilized issue-pipeline slots while there is no backend-stall */
diff --git a/src/cpu/o3/iew.cc b/src/cpu/o3/iew.cc
index 04c2b893ca..a9a1a14565 100644
--- a/src/cpu/o3/iew.cc
+++ b/src/cpu/o3/iew.cc
@@ -1547,6 +1547,9 @@ IEW::executeInsts()
     while (threads != end) {
         ThreadID tid = *threads++;
         fetchRedirect[tid] = false;
+        toFetch->iewInfo[tid].ldstqCount=ldstQueue.getCount(tid);
+        toFetch->iewInfo[tid].robCount= rob->getThreadEntries(tid);
+        toFetch->iewInfo[tid].iqCount= scheduler->getIQInsts(tid);
     }
 
     // Uncomment this if you want to see all available instructions.
@@ -1557,9 +1560,7 @@ IEW::executeInsts()
     ThreadID tid = *activeThreads->begin();
     toFetch->iewInfo[tid].resolvedCFIs.clear();
 
-    toFetch->iewInfo[tid].ldstqCount=ldstQueue.getCount(tid);
-    toFetch->iewInfo[tid].robCount= rob->getThreadEntries(tid);
-    toFetch->iewInfo[tid].iqCount= scheduler->getIQInsts(tid);
+    
     // Execute/writeback any instructions that are available.
     int insts_to_execute = fromIssue->size;
     fromIssue->size = 0;

From 325d970614beb9434d6648fdf776c39cb412e33c Mon Sep 17 00:00:00 2001
From: tastynoob <934348725@qq.com>
Date: Thu, 9 Apr 2026 19:18:03 +0800
Subject: [PATCH 16/38] cpu: add shared lsq and ftq modes for smt

Change-Id: Idea57378fbe47dedd654f168141d0284faee716f
---
 configs/example/smt_idealkmhv3.py   |  47 +++++
 src/cpu/o3/BaseO3CPU.py             |   7 +-
 src/cpu/o3/SConscript               |   3 +-
 src/cpu/o3/lsq.cc                   | 276 +++++++++++++++++++++++-----
 src/cpu/o3/lsq.hh                   |  67 +++----
 src/cpu/o3/lsq_unit.cc              |  10 +-
 src/cpu/o3/lsq_unit.hh              |  10 +-
 src/cpu/pred/BranchPredictor.py     |  11 ++
 src/cpu/pred/SConscript             |   2 +-
 src/cpu/pred/btb/decoupled_bpred.cc |  94 +++++++++-
 src/cpu/pred/btb/decoupled_bpred.hh |  22 ++-
 11 files changed, 454 insertions(+), 95 deletions(-)
 create mode 100644 configs/example/smt_idealkmhv3.py

diff --git a/configs/example/smt_idealkmhv3.py b/configs/example/smt_idealkmhv3.py
new file mode 100644
index 0000000000..a83681506f
--- /dev/null
+++ b/configs/example/smt_idealkmhv3.py
@@ -0,0 +1,47 @@
+from m5.objects import Root
+
+from m5.util import addToPath
+
+addToPath('../')
+
+from common import Simulation
+from common.xiangshan import build_xiangshan_system, xiangshan_system_init
+from idealkmhv3 import setKmhV3IdealParams
+
+
+def setSharedLSQParams(args, system):
+    setKmhV3IdealParams(args, system)
+
+    for cpu in system.cpu:
+        # Reuse the ideal KMHV3 LSQ-related sizes, but interpret them as a
+        # shared SMT-wide pool. For example, LQEntries=128 means both threads
+        # compete for a total of 128 load entries instead of 128 each. The
+        # same shared-mode accounting applies to SQ/RARQ/RAWQ. Likewise,
+        # branchPred.ftq_size is interpreted as a shared SMT-wide FTQ pool.
+        # Keep the FTQ policy Partitioned so one thread cannot monopolize the
+        # shared target queue and starve the other thread's frontend.
+        cpu.smtLSQMode = 'Shared'
+        cpu.smtLSQPolicy = 'Dynamic'
+        cpu.branchPred.smtFTQMode = 'Shared'
+        cpu.branchPred.smtFTQPolicy = 'Partitioned'
+
+
+if __name__ == '__m5_main__':
+    FutureClass = None
+
+    args = xiangshan_system_init()
+
+    assert not args.external_memory_system
+
+    args.smt = True
+    args.bp_type = 'DecoupledBPUWithBTB'
+    args.l2_size = '2MB'
+
+    Simulation.setMemClass(args)
+
+    test_sys = build_xiangshan_system(args)
+    setSharedLSQParams(args, test_sys)
+
+    root = Root(full_system=True, system=test_sys)
+
+    Simulation.run_vanilla(args, root, test_sys, FutureClass)
diff --git a/src/cpu/o3/BaseO3CPU.py b/src/cpu/o3/BaseO3CPU.py
index f6f46d85b8..b1f6979368 100644
--- a/src/cpu/o3/BaseO3CPU.py
+++ b/src/cpu/o3/BaseO3CPU.py
@@ -53,6 +53,9 @@ class SMTFetchPolicy(ScopedEnum):
 class SMTQueuePolicy(ScopedEnum):
     vals = [ 'Dynamic', 'Partitioned', 'Threshold' ]
 
+class SMTLSQMode(ScopedEnum):
+    vals = [ 'Independent', 'Shared' ]
+
 class CommitPolicy(ScopedEnum):
     vals = [ 'RoundRobin', 'OldestReady' ]
 
@@ -233,8 +236,10 @@ def support_take_over(cls):
 
     smtNumFetchingThreads = Param.Unsigned(1, "SMT Number of Fetching Threads")
     smtFetchPolicy = Param.SMTFetchPolicy('RoundRobin', "SMT Fetch policy")
+    smtLSQMode = Param.SMTLSQMode('Independent',
+                                  "SMT LSQ mode: per-thread independent or shared quota")
     smtLSQPolicy    = Param.SMTQueuePolicy('Partitioned',
-                                           "SMT LSQ Sharing Policy")
+                                           "SMT shared LSQ allocation policy")
     smtLSQThreshold = Param.Int(100, "SMT LSQ Threshold Sharing Parameter")
     smtIQPolicy    = Param.SMTQueuePolicy('Partitioned',
                                           "SMT IQ Sharing Policy")
diff --git a/src/cpu/o3/SConscript b/src/cpu/o3/SConscript
index 463a8cdfc0..3c2902a6b4 100755
--- a/src/cpu/o3/SConscript
+++ b/src/cpu/o3/SConscript
@@ -35,7 +35,8 @@ if env['CONF']['TARGET_ISA'] != 'null':
               'IssuePort', 'IssueQue', 'BaseSelector', 'PAgeSelector', 'SMTBasedSelector', 'Scheduler'])
     SimObject('FuncUnitConfig.py', sim_objects=[])
     SimObject('BaseO3CPU.py', sim_objects=['BaseO3CPU'], enums=[
-        'SMTFetchPolicy', 'SMTQueuePolicy', 'CommitPolicy', 'ROBWalkPolicy', 'ROBCompressPolicy', 'PerfRecord'])
+        'SMTFetchPolicy', 'SMTQueuePolicy', 'SMTLSQMode', 'CommitPolicy',
+        'ROBWalkPolicy', 'ROBCompressPolicy', 'PerfRecord'])
 
     Source('commit.cc')
     Source('cpu.cc')
diff --git a/src/cpu/o3/lsq.cc b/src/cpu/o3/lsq.cc
index 4fe227f6ac..a341c1eaa0 100644
--- a/src/cpu/o3/lsq.cc
+++ b/src/cpu/o3/lsq.cc
@@ -484,15 +484,15 @@ LSQ::LSQ(CPU *cpu_ptr, IEW *iew_ptr, const BaseO3CPUParams &params)
       _storeWbStage(params.StoreWbStage),
       waitingForStaleTranslation(false),
       staleTranslationWaitTxnId(0),
+      lsqMode(params.smtLSQMode),
       lsqPolicy(params.smtLSQPolicy),
+      smtLSQThreshold(params.smtLSQThreshold),
       stats(nullptr),
       LQEntries(params.LQEntries),
       SQEntries(params.SQEntries),
       enqueueWidth(params.renameWidth),
-      maxLQEntries(maxLSQAllocation(lsqPolicy, LQEntries, params.numThreads,
-                  params.smtLSQThreshold)),
-      maxSQEntries(maxLSQAllocation(lsqPolicy, SQEntries, params.numThreads,
-                  params.smtLSQThreshold)),
+      RARQEntries(params.RARQEntries),
+      RAWQEntries(params.RAWQEntries),
       dcachePort(this, cpu_ptr),
       numThreads(params.numThreads)
 {
@@ -518,30 +518,37 @@ LSQ::LSQ(CPU *cpu_ptr, IEW *iew_ptr, const BaseO3CPUParams &params)
     //************ Handle SMT Parameters ***********
     //**********************************************
 
-    /* Run SMT olicy checks. */
+    if (lsqMode == SMTLSQMode::Independent) {
+        DPRINTF(LSQ, "LSQ mode set to Independent: each thread gets up to "
+                "%u LQ, %u SQ, %u RARQ and %u RAWQ entries\n",
+                LQEntries, SQEntries, RARQEntries, RAWQEntries);
+    } else if (lsqMode == SMTLSQMode::Shared) {
+        panic_if(lsqPolicy == SMTQueuePolicy::Threshold &&
+                 smtLSQThreshold == 0,
+                 "SMT LSQ threshold must be non-zero in shared threshold mode");
+
         if (lsqPolicy == SMTQueuePolicy::Dynamic) {
-        DPRINTF(LSQ, "LSQ sharing policy set to Dynamic\n");
-    } else if (lsqPolicy == SMTQueuePolicy::Partitioned) {
-        DPRINTF(Fetch, "LSQ sharing policy set to Partitioned: "
-                "%i entries per LQ | %i entries per SQ\n",
-                maxLQEntries,maxSQEntries);
-    } else if (lsqPolicy == SMTQueuePolicy::Threshold) {
-
-        assert(params.smtLSQThreshold > params.LQEntries);
-        assert(params.smtLSQThreshold > params.SQEntries);
-
-        DPRINTF(LSQ, "LSQ sharing policy set to Threshold: "
-                "%i entries per LQ | %i entries per SQ\n",
-                maxLQEntries,maxSQEntries);
+            DPRINTF(LSQ, "LSQ mode set to Shared/Dynamic: %u LQ and %u SQ "
+                    "entries are shared across active SMT threads, along "
+                    "with %u RARQ and %u RAWQ entries\n",
+                    LQEntries, SQEntries, RARQEntries, RAWQEntries);
+        } else if (lsqPolicy == SMTQueuePolicy::Partitioned) {
+            DPRINTF(LSQ, "LSQ mode set to Shared/Partitioned\n");
+        } else if (lsqPolicy == SMTQueuePolicy::Threshold) {
+            DPRINTF(LSQ, "LSQ mode set to Shared/Threshold: threshold=%u\n",
+                    smtLSQThreshold);
+        } else {
+            panic("Invalid LSQ sharing policy. Options are: Dynamic, "
+                        "Partitioned, Threshold");
+        }
     } else {
-        panic("Invalid LSQ sharing policy. Options are: Dynamic, "
-                    "Partitioned, Threshold");
+        panic("Invalid SMT LSQ mode. Options are: Independent, Shared");
     }
 
     thread.reserve(numThreads);
     // TODO: Parameterize the load/store pipeline stages
     for (ThreadID tid = 0; tid < numThreads; tid++) {
-        thread.emplace_back(maxLQEntries, maxSQEntries,
+        thread.emplace_back(LQEntries, SQEntries,
             params.LdPipeStages, params.StPipeStages, params.RARQEntries, params.RAWQEntries,
             params.RARDequeuePerCycle, params.RAWDequeuePerCycle, params.LoadCompletionWidth,
             params.StoreCompletionWidth);
@@ -778,13 +785,13 @@ LSQ::notifyDcacheRefill(Addr addr)
 unsigned
 LSQ::getFreeLQEntries(ThreadID tid)
 {
-    return thread[tid].numFreeLoadEntries();
+    return logicalFreeLoadEntries(tid);
 }
 
 unsigned
 LSQ::getFreeSQEntries(ThreadID tid)
 {
-    return thread[tid].numFreeStoreEntries();
+    return logicalFreeStoreEntries(tid);
 }
 
 unsigned
@@ -1240,7 +1247,9 @@ LSQ::getStoreHeadSeqNum(ThreadID tid)
 
 int LSQ::getCount(ThreadID tid) { return thread.at(tid).getCount(); }
 
-int LSQ::numLoads(ThreadID tid) { return thread.at(tid).numLoads(); }
+int LSQ::numLoads(ThreadID tid) const { return thread.at(tid).numLoads(); }
+int LSQ::numRAREntries(ThreadID tid) const { return thread.at(tid).numRAREntries(); }
+int LSQ::numRAWEntries(ThreadID tid) const { return thread.at(tid).numRAWEntries(); }
 
 int LSQ::anyInflightLoadsNotComplete()
 {
@@ -1273,7 +1282,7 @@ LSQ::anyStoreNotExecute()
     return false;
 }
 
-int LSQ::numStores(ThreadID tid) { return thread.at(tid).numStores(); }
+int LSQ::numStores(ThreadID tid) const { return thread.at(tid).numStores(); }
 
 int
 LSQ::numHtmStarts(ThreadID tid) const
@@ -1471,9 +1480,8 @@ LSQ::getCount()
 
     return total;
 }
-
 int
-LSQ::numLoads()
+LSQ::numLoads() const
 {
     unsigned total = 0;
 
@@ -1490,7 +1498,24 @@ LSQ::numLoads()
 }
 
 int
-LSQ::numStores()
+LSQ::numRAREntries() const
+{
+    unsigned total = 0;
+
+    std::list<ThreadID>::iterator threads = activeThreads->begin();
+    std::list<ThreadID>::iterator end = activeThreads->end();
+
+    while (threads != end) {
+        ThreadID tid = *threads++;
+
+        total += numRAREntries(tid);
+    }
+
+    return total;
+}
+
+int
+LSQ::numStores() const
 {
     unsigned total = 0;
 
@@ -1506,9 +1531,149 @@ LSQ::numStores()
     return total;
 }
 
+/**
+ * Total RAW queue occupancy summed over every thread in activeThreads.
+ *
+ * @return Sum of numRAWEntries(tid) for each listed thread id; the sum is
+ *         accumulated as unsigned and converted to int on return.
+ */
+int
+LSQ::numRAWEntries() const
+{
+    unsigned occupancy = 0;
+
+    for (const ThreadID tid : *activeThreads) {
+        occupancy += numRAWEntries(tid);
+    }
+    return occupancy;
+}
+
+/** Reports whether the configured SMT LSQ mode is SMTLSQMode::Shared. */
+bool LSQ::sharedLSQMode() const
+{
+    return SMTLSQMode::Shared == lsqMode;
+}
+
+/** Size of the active-thread list, or numThreads if it is unset/empty. */
+unsigned LSQ::activeLSQThreads() const
+{
+    const bool usable = activeThreads && !activeThreads->empty();
+    if (usable)
+        return activeThreads->size();
+    return numThreads;
+}
+
+/**
+ * Per-thread cap on a shared queue holding @p entries, chosen by lsqPolicy.
+ * Panics if lsqPolicy is not Dynamic, Partitioned or Threshold.
+ */
+unsigned
+LSQ::sharedLSQAllocation(unsigned entries) const
+{
+    const unsigned sharers = std::max(1U, activeLSQThreads());
+    if (lsqPolicy == SMTQueuePolicy::Dynamic) {
+        return entries;
+    } else if (lsqPolicy == SMTQueuePolicy::Partitioned) {
+        return entries / sharers;
+    } else if (lsqPolicy == SMTQueuePolicy::Threshold) {
+        return sharers == 1 ? entries : std::min(entries, smtLSQThreshold);
+    }
+    panic("Invalid LSQ sharing policy. Options are: Dynamic, "
+          "Partitioned, Threshold");
+}
+
+/** Logical per-thread LQ capacity; the tid argument is not consulted. */
+unsigned LSQ::logicalMaxLoadEntries(ThreadID tid) const
+{
+    return !sharedLSQMode() ? LQEntries : sharedLSQAllocation(LQEntries);
+}
+
+/** Logical per-thread SQ capacity; the tid argument is not consulted. */
+unsigned LSQ::logicalMaxStoreEntries(ThreadID tid) const
+{
+    return !sharedLSQMode() ? SQEntries : sharedLSQAllocation(SQEntries);
+}
+
+/** Logical per-thread RARQ capacity; the tid argument is not consulted. */
+unsigned LSQ::logicalMaxRAREntries(ThreadID tid) const
+{
+    return !sharedLSQMode() ? RARQEntries : sharedLSQAllocation(RARQEntries);
+}
+
+/** Logical per-thread RAWQ capacity; the tid argument is not consulted. */
+unsigned LSQ::logicalMaxRAWEntries(ThreadID tid) const
+{
+    return !sharedLSQMode() ? RAWQEntries : sharedLSQAllocation(RAWQEntries);
+}
+
+/**
+ * Load-queue entries thread @p tid may still allocate: its logical cap
+ * minus its own loads and, in shared mode, no more than LQEntries minus
+ * the LSQ-wide numLoads() total. Negative differences clamp to zero.
+ */
+unsigned
+LSQ::logicalFreeLoadEntries(ThreadID tid) const
+{
+    const int own_cap = static_cast<int>(logicalMaxLoadEntries(tid));
+    const unsigned own_free = std::max(0, own_cap - thread[tid].numLoads());
+    if (!sharedLSQMode()) return own_free;
+    const int pool_left = static_cast<int>(LQEntries) - numLoads();
+    return std::min(own_free, static_cast<unsigned>(std::max(0, pool_left)));
+}
+
+/**
+ * Store-queue entries thread @p tid may still allocate: its logical cap
+ * minus its own stores and, in shared mode, no more than SQEntries minus
+ * the LSQ-wide numStores() total. Negative differences clamp to zero.
+ */
+unsigned
+LSQ::logicalFreeStoreEntries(ThreadID tid) const
+{
+    const int own_cap = static_cast<int>(logicalMaxStoreEntries(tid));
+    const unsigned own_free = std::max(0, own_cap - thread[tid].numStores());
+    if (!sharedLSQMode()) return own_free;
+    const int pool_left = static_cast<int>(SQEntries) - numStores();
+    return std::min(own_free, static_cast<unsigned>(std::max(0, pool_left)));
+}
+
+/**
+ * RAR-queue entries thread @p tid may still allocate: its logical cap
+ * minus its own RAR entries and, in shared mode, no more than RARQEntries
+ * minus the numRAREntries() total. Negative differences clamp to zero.
+ */
+unsigned
+LSQ::logicalFreeRAREntries(ThreadID tid) const
+{
+    const int own_cap = static_cast<int>(logicalMaxRAREntries(tid));
+    const unsigned own_free = std::max(0, own_cap - numRAREntries(tid));
+    if (!sharedLSQMode()) return own_free;
+    const int pool_left = static_cast<int>(RARQEntries) - numRAREntries();
+    return std::min(own_free, static_cast<unsigned>(std::max(0, pool_left)));
+}
+
+/**
+ * RAW-queue entries thread @p tid may still allocate: its logical cap
+ * minus its own RAW entries and, in shared mode, no more than RAWQEntries
+ * minus the numRAWEntries() total. Negative differences clamp to zero.
+ */
+unsigned
+LSQ::logicalFreeRAWEntries(ThreadID tid) const
+{
+    const int own_cap = static_cast<int>(logicalMaxRAWEntries(tid));
+    const unsigned own_free = std::max(0, own_cap - numRAWEntries(tid));
+    if (!sharedLSQMode()) return own_free;
+    const int pool_left = static_cast<int>(RAWQEntries) - numRAWEntries();
+    return std::min(own_free, static_cast<unsigned>(std::max(0, pool_left)));
+}
+
 unsigned
 LSQ::numFreeLoadEntries()
 {
+    if (sharedLSQMode()) {
+        const unsigned used = numLoads();
+        return used < LQEntries ? LQEntries - used : 0;
+    }
+
     unsigned total = 0;
 
     std::list<ThreadID>::iterator threads = activeThreads->begin();
@@ -1526,6 +1691,11 @@ LSQ::numFreeLoadEntries()
 unsigned
 LSQ::numFreeStoreEntries()
 {
+    if (sharedLSQMode()) {
+        const unsigned used = numStores();
+        return used < SQEntries ? SQEntries - used : 0;
+    }
+
     unsigned total = 0;
 
     std::list<ThreadID>::iterator threads = activeThreads->begin();
@@ -1543,18 +1713,22 @@ LSQ::numFreeStoreEntries()
 unsigned
 LSQ::numFreeLoadEntries(ThreadID tid)
 {
-        return thread[tid].numFreeLoadEntries();
+    return logicalFreeLoadEntries(tid);
 }
 
 unsigned
 LSQ::numFreeStoreEntries(ThreadID tid)
 {
-        return thread[tid].numFreeStoreEntries();
+    return logicalFreeStoreEntries(tid);
 }
 
 bool
 LSQ::isFull()
 {
+    if (sharedLSQMode()) {
+        return lqFull() || sqFull();
+    }
+
     std::list<ThreadID>::iterator threads = activeThreads->begin();
     std::list<ThreadID>::iterator end = activeThreads->end();
 
@@ -1571,12 +1745,12 @@ LSQ::isFull()
 bool
 LSQ::isFull(ThreadID tid)
 {
-    //@todo: Change to Calculate All Entries for
-    //Dynamic Policy
-    if (lsqPolicy == SMTQueuePolicy::Dynamic)
-        return isFull();
-    else
-        return thread[tid].lqFull() || thread[tid].sqFull();
+    if (sharedLSQMode()) {
+        return logicalFreeLoadEntries(tid) == 0 ||
+               logicalFreeStoreEntries(tid) == 0;
+    }
+
+    return thread[tid].lqFull() || thread[tid].sqFull();
 }
 
 bool
@@ -1632,6 +1806,10 @@ LSQ::sqEmpty(ThreadID tid) const
 bool
 LSQ::lqFull()
 {
+    if (sharedLSQMode()) {
+        return numFreeLoadEntries() == 0;
+    }
+
     std::list<ThreadID>::iterator threads = activeThreads->begin();
     std::list<ThreadID>::iterator end = activeThreads->end();
 
@@ -1648,17 +1826,20 @@ LSQ::lqFull()
 bool
 LSQ::lqFull(ThreadID tid)
 {
-    //@todo: Change to Calculate All Entries for
-    //Dynamic Policy
-    if (lsqPolicy == SMTQueuePolicy::Dynamic)
-        return lqFull();
-    else
-        return thread[tid].lqFull();
+    if (sharedLSQMode()) {
+        return logicalFreeLoadEntries(tid) == 0;
+    }
+
+    return thread[tid].lqFull();
 }
 
 bool
 LSQ::sqFull()
 {
+    if (sharedLSQMode()) {
+        return numFreeStoreEntries() == 0;
+    }
+
     std::list<ThreadID>::iterator threads = activeThreads->begin();
     std::list<ThreadID>::iterator end = activeThreads->end();
 
@@ -1675,12 +1856,11 @@ LSQ::sqFull()
 bool
 LSQ::sqFull(ThreadID tid)
 {
-     //@todo: Change to Calculate All Entries for
-    //Dynamic Policy
-    if (lsqPolicy == SMTQueuePolicy::Dynamic)
-        return sqFull();
-    else
-        return thread[tid].sqFull();
+    if (sharedLSQMode()) {
+        return logicalFreeStoreEntries(tid) == 0;
+    }
+
+    return thread[tid].sqFull();
 }
 
 const DynInstPtr&
diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh
index 159eaa0ab5..28cb6e0146 100644
--- a/src/cpu/o3/lsq.hh
+++ b/src/cpu/o3/lsq.hh
@@ -66,6 +66,7 @@
 #include "cpu/o3/dyn_inst_xsmeta.hh"
 #include "cpu/o3/limits.hh"
 #include "cpu/utils.hh"
+#include "enums/SMTLSQMode.hh"
 #include "enums/SMTQueuePolicy.hh"
 #include "mem/packet.hh"
 #include "mem/port.hh"
@@ -928,18 +929,28 @@ class LSQ
     int getCount(ThreadID tid);
 
     /** Returns the total number of loads in the load queue. */
-    int numLoads();
+    int numLoads() const;
     /** Returns the total number of loads for a single thread. */
-    int numLoads(ThreadID tid);
+    int numLoads(ThreadID tid) const;
 
     int anyInflightLoadsNotComplete();
 
     bool anyStoreNotExecute();
 
     /** Returns the total number of stores in the store queue. */
-    int numStores();
+    int numStores() const;
     /** Returns the total number of stores for a single thread. */
-    int numStores(ThreadID tid);
+    int numStores(ThreadID tid) const;
+
+    /** Returns the total number of entries in the RAR queue. */
+    int numRAREntries() const;
+    /** Returns the total number of RAR queue entries for a single thread. */
+    int numRAREntries(ThreadID tid) const;
+
+    /** Returns the total number of entries in the RAW queue. */
+    int numRAWEntries() const;
+    /** Returns the total number of RAW queue entries for a single thread. */
+    int numRAWEntries(ThreadID tid) const;
 
 
     // hardware transactional memory
@@ -1207,6 +1218,18 @@ class LSQ
     unsigned getFreeSQEntries(ThreadID tid);
     unsigned getAndResetLastSQPopEntries(ThreadID tid);
 
+    bool sharedLSQMode() const;
+    unsigned activeLSQThreads() const;
+    unsigned sharedLSQAllocation(unsigned entries) const;
+    unsigned logicalMaxLoadEntries(ThreadID tid) const;
+    unsigned logicalMaxStoreEntries(ThreadID tid) const;
+    unsigned logicalFreeLoadEntries(ThreadID tid) const;
+    unsigned logicalFreeStoreEntries(ThreadID tid) const;
+    unsigned logicalMaxRAREntries(ThreadID tid) const;
+    unsigned logicalMaxRAWEntries(ThreadID tid) const;
+    unsigned logicalFreeRAREntries(ThreadID tid) const;
+    unsigned logicalFreeRAWEntries(ThreadID tid) const;
+
     /** Is D-cache blocked? */
     bool cacheBlocked() const;
     /** Set D-cache blocked status */
@@ -1292,30 +1315,13 @@ class LSQ
     Addr staleTranslationWaitTxnId;
 
     /** The LSQ policy for SMT mode. */
+    SMTLSQMode lsqMode;
+
+    /** The LSQ allocation policy used in shared mode. */
     SMTQueuePolicy lsqPolicy;
 
-    /** Auxiliary function to calculate per-thread max LSQ allocation limit.
-     * Depending on a policy, number of entries and possibly number of threads
-     * and threshold, this function calculates how many resources each thread
-     * can occupy at most.
-     */
-    static uint32_t
-    maxLSQAllocation(SMTQueuePolicy pol, uint32_t entries,
-            uint32_t numThreads, uint32_t SMTThreshold)
-    {
-        if (pol == SMTQueuePolicy::Dynamic) {
-            return entries;
-        } else if (pol == SMTQueuePolicy::Partitioned) {
-            //@todo:make work if part_amt doesnt divide evenly.
-            return entries / numThreads;
-        } else if (pol == SMTQueuePolicy::Threshold) {
-            //Divide up by threshold amount
-            //@todo: Should threads check the max and the total
-            //amount of the LSQ
-            return SMTThreshold;
-        }
-        return 0;
-    }
+    /** The per-thread threshold used in shared threshold mode. */
+    unsigned smtLSQThreshold;
 
     struct LSQStats : public statistics::Group
     {
@@ -1352,11 +1358,10 @@ class LSQ
     /** Max number of memory instructions that may enter LSQ in one cycle. */
     const unsigned enqueueWidth;
 
-    /** Max LQ Size - Used to Enforce Sharing Policies. */
-    unsigned maxLQEntries;
-
-    /** Max SQ Size - Used to Enforce Sharing Policies. */
-    unsigned maxSQEntries;
+    /** Total Size of RARQ Entries. */
+    unsigned RARQEntries;
+    /** Total Size of RAWQ Entries. */
+    unsigned RAWQEntries;
 
     /** Data port. */
     DcachePort dcachePort;
diff --git a/src/cpu/o3/lsq_unit.cc b/src/cpu/o3/lsq_unit.cc
index 467fd73160..5112ee8c40 100644
--- a/src/cpu/o3/lsq_unit.cc
+++ b/src/cpu/o3/lsq_unit.cc
@@ -1478,11 +1478,13 @@ LSQUnit::loadDoRecvData(const DynInstPtr &inst)
     const bool trackRAR =
         loadCompletedIdx != loadQueue.tail() && inst->isNormalLd() &&
         inst->lqIt.idx() > loadCompletedIdx + 1;
-    const bool rarReplay = trackRAR && RARQueue.size() >= maxRARQEntries;
+    const bool rarReplay =
+        trackRAR && lsq->logicalFreeRAREntries(lsqID) == 0;
     const bool trackRAW =
         storeCompletedIdx != storeQueue.tail() && inst->isNormalLd() &&
         inst->sqIt.idx() > storeCompletedIdx + 1;
-    const bool rawReplay = trackRAW && RAWQueue.size() >= maxRAWQEntries;
+    const bool rawReplay =
+        trackRAW && lsq->logicalFreeRAWEntries(lsqID) == 0;
 
     if (cacheMissReplay) {
         inst->markReplayFlag(LdStReplayType::CacheMissReplay);
@@ -3853,7 +3855,7 @@ LSQUnit::processReplayQueues()
 
     // Collect instructions from RAR replay queue when space available
     assert(RARQueue.size() <= maxRARQEntries);
-    const int freeRARSize = maxRARQEntries - RARQueue.size();
+    const int freeRARSize = lsq->logicalFreeRAREntries(lsqID);
     const int maxRARCollect = std::min(freeRARSize, (int)rarDequeuePerCycle - RARReplayCount);
     for (int i = 0; i < maxRARCollect && !RARReplayQueue.empty(); ++i) {
         DynInstPtr inst = RARReplayQueue.front();
@@ -3863,7 +3865,7 @@ LSQUnit::processReplayQueues()
 
     // Collect instructions from RAW replay queue when space available
     assert(RAWQueue.size() <= maxRAWQEntries);
-    const int freeRAWSize = maxRAWQEntries - RAWQueue.size();
+    const int freeRAWSize = lsq->logicalFreeRAWEntries(lsqID);
     const int maxRAWCollect = std::min(freeRAWSize, (int)rawDequeuePerCycle - RAWReplayCount);
     for (int i = 0; i < maxRAWCollect && !RAWReplayQueue.empty(); ++i) {
         DynInstPtr inst = RAWReplayQueue.front();
diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh
index 837cc65506..fd2ff7d172 100644
--- a/src/cpu/o3/lsq_unit.hh
+++ b/src/cpu/o3/lsq_unit.hh
@@ -410,10 +410,16 @@ class LSQUnit
     unsigned getAndResetLastClockSQPopEntries();
 
     /** Returns the number of loads in the LQ. */
-    int numLoads() { return loadQueue.size(); }
+    int numLoads() const { return loadQueue.size(); }
 
     /** Returns the number of stores in the SQ. */
-    int numStores() { return storeQueue.size(); }
+    int numStores() const { return storeQueue.size(); }
+
+    /** Returns the number of entries in the per-thread RAR queue. */
+    int numRAREntries() const { return RARQueue.size(); }
+
+    /** Returns the number of entries in the per-thread RAW queue. */
+    int numRAWEntries() const { return RAWQueue.size(); }
 
     // hardware transactional memory
     int numHtmStarts() const { return htmStarts; }
diff --git a/src/cpu/pred/BranchPredictor.py b/src/cpu/pred/BranchPredictor.py
index 3171928e1b..4b1b48c097 100644
--- a/src/cpu/pred/BranchPredictor.py
+++ b/src/cpu/pred/BranchPredictor.py
@@ -959,6 +959,12 @@ class DecoupledBPUWithFTB(BranchPredictor):
     enableJumpAheadPredictor = Param.Bool(False, "Use jump ahead predictor to skip no-need-to-predict blocks")
     enableTwoTaken = Param.Bool(False, "Enable predicting two taken blocks per cycle")
 
+class SMTFTQMode(ScopedEnum):
+    vals = [ 'Independent', 'Shared' ]  # values of the smtFTQMode param
+
+class SMTFTQPolicy(ScopedEnum):
+    vals = [ 'Dynamic', 'Partitioned', 'Threshold' ]  # smtFTQPolicy values
+
 class TimedBaseBTBPredictor(SimObject):
     type = 'TimedBaseBTBPredictor'
     cxx_class = 'gem5::branch_prediction::btb_pred::TimedBaseBTBPredictor'
@@ -1188,6 +1194,11 @@ class DecoupledBPUWithBTB(BranchPredictor):
 
     # n = 2
     ftq_size = Param.Unsigned(128, "Fetch target queue size")
+    smtFTQMode = Param.SMTFTQMode('Independent',
+                                  "SMT FTQ mode: per-thread independent or shared quota")
+    smtFTQPolicy = Param.SMTFTQPolicy('Partitioned',
+                                      "SMT shared FTQ allocation policy")
+    smtFTQThreshold = Param.Unsigned(100, "SMT FTQ Threshold Sharing Parameter")
     fsq_size = Param.Unsigned(64, "Fetch stream queue size")
     maxHistLen = Param.Unsigned(970, "The length of history")
 
diff --git a/src/cpu/pred/SConscript b/src/cpu/pred/SConscript
index a7b3ff30f1..ae3ab2da7f 100644
--- a/src/cpu/pred/SConscript
+++ b/src/cpu/pred/SConscript
@@ -51,7 +51,7 @@ SimObject('BranchPredictor.py', sim_objects=[
     'AheadBTB', 'MBTB', 'UBTB', 'DecoupledBPUWithBTB',
     'TimedBaseBTBPredictor', 'BTBRAS', 'BTBTAGE', 'BTBTAGEUpperBound',
     'MicroTAGE',
-    'BTBITTAGE', 'BTBMGSC'], enums=["BpType"])
+    'BTBITTAGE', 'BTBMGSC'], enums=["BpType", "SMTFTQMode", "SMTFTQPolicy"])
 
 DebugFlag('Indirect')
 Source('bpred_unit.cc')
diff --git a/src/cpu/pred/btb/decoupled_bpred.cc b/src/cpu/pred/btb/decoupled_bpred.cc
index aec2222806..4ba14a04ed 100644
--- a/src/cpu/pred/btb/decoupled_bpred.cc
+++ b/src/cpu/pred/btb/decoupled_bpred.cc
@@ -1,5 +1,6 @@
 #include "cpu/pred/btb/decoupled_bpred.hh"
 
+#include <algorithm>
 #include <array>
 
 #include "arch/riscv/regs/misc.hh"
@@ -60,10 +61,20 @@ DecoupledBPUWithBTB::DecoupledBPUWithBTB(const DecoupledBPUWithBTBParams &p)
       // uras(p.uras),
       bpDBSwitches(p.bpDBSwitches),
       numStages(p.numStages),
+      ftqEntries(p.ftq_size),
+      ftqMode(p.smtFTQMode),
+      ftqPolicy(p.smtFTQPolicy),
+      smtFTQThreshold(p.smtFTQThreshold),
       ftq(p.numThreads, p.ftq_size),
       resolveBlockThreshold(p.resolveBlockThreshold),
       dbpBtbStats(this, p.numStages, p.fsq_size, maxInstsNum)
 {
+    panic_if(ftqMode == SMTFTQMode::Shared &&
+             ftqPolicy == SMTFTQPolicy::Threshold &&
+             smtFTQThreshold > ftqEntries,
+             "SMT FTQ threshold (%u) exceeds total FTQ entries (%u)",
+             smtFTQThreshold, ftqEntries);
+
     if (bpDBSwitches.size() > 0) {
         initDB();
     }
@@ -135,6 +146,85 @@ DecoupledBPUWithBTB::DecoupledBPUWithBTB(const DecoupledBPUWithBTBParams &p)
     });
 }
 
+/** Reports whether the configured SMT FTQ mode is SMTFTQMode::Shared. */
+bool DecoupledBPUWithBTB::sharedFTQMode() const
+{
+    return SMTFTQMode::Shared == ftqMode;
+}
+
+/**
+ * Number of threads assumed to share the FTQ; always at least one.
+ *
+ * Non-shared mode reports 1. Shared mode reports cpu->numActiveThreads(),
+ * or numThreads while no CPU pointer is set.
+ */
+unsigned
+DecoupledBPUWithBTB::activeFTQThreads() const
+{
+    if (!sharedFTQMode()) return 1;
+    if (!cpu) return std::max(1u, numThreads);
+    return std::max(1, cpu->numActiveThreads());
+}
+
+/** Sum of ftq.size(tid) over thread ids 0 .. numThreads - 1. */
+unsigned DecoupledBPUWithBTB::totalFTQEntries() const
+{
+    unsigned occupied = 0;
+    ThreadID tid = 0;
+    while (tid < numThreads)
+        occupied += ftq.size(tid++);
+    return occupied;
+}
+
+// Per-thread cap on a shared FTQ of `entries`, selected by ftqPolicy.
+// Dynamic: all entries; Partitioned: entries / sharers; Threshold: at most
+// smtFTQThreshold unless one thread is active. Panics on other values.
+unsigned
+DecoupledBPUWithBTB::sharedFTQAllocation(unsigned entries) const
+{
+    const unsigned sharers = activeFTQThreads();
+    if (ftqPolicy == SMTFTQPolicy::Dynamic) {
+        return entries;
+    } else if (ftqPolicy == SMTFTQPolicy::Partitioned) {
+        return entries / sharers;
+    } else if (ftqPolicy == SMTFTQPolicy::Threshold) {
+        return sharers == 1 ? entries : std::min(entries, smtFTQThreshold);
+    }
+    panic("Invalid SMT FTQ sharing policy");
+}
+
+/**
+ * Logical FTQ capacity for one thread: ftqEntries outside shared mode,
+ * otherwise sharedFTQAllocation(ftqEntries). @p tid is not consulted.
+ */
+unsigned
+DecoupledBPUWithBTB::logicalMaxFTQEntries(ThreadID tid) const
+{
+    return sharedFTQMode() ? sharedFTQAllocation(ftqEntries) : ftqEntries;
+}
+
+/**
+ * FTQ entries thread @p tid may still enqueue: its logical cap minus its
+ * own occupancy and, in shared mode, no more than ftqEntries minus the
+ * occupancy summed over all threads. Saturates at zero.
+ */
+unsigned
+DecoupledBPUWithBTB::logicalFreeFTQEntries(ThreadID tid) const
+{
+    const unsigned cap = logicalMaxFTQEntries(tid);
+    const unsigned held = ftq.size(tid);
+    const unsigned own_free = cap > held ? cap - held : 0;
+    if (!sharedFTQMode()) return own_free;
+    const unsigned in_use = totalFTQEntries();
+    return std::min(own_free, in_use < ftqEntries ? ftqEntries - in_use : 0);
+}
+
+/** True when logicalFreeFTQEntries(tid) leaves thread @p tid no room. */
+bool DecoupledBPUWithBTB::ftqFull(ThreadID tid) const
+{
+    return !(logicalFreeFTQEntries(tid) > 0);
+}
+
 ThreadID
 DecoupledBPUWithBTB::scheduleThread()
 {
@@ -187,7 +277,7 @@ DecoupledBPUWithBTB::tick()
     }
 
     // 1. Request new prediction if FSQ not full and we are idle
-    if (!threads[curTid].validprediction && !ftq.full(curTid)) {
+    if (!threads[curTid].validprediction && !ftqFull(curTid)) {
         if (threads[curTid].blockPredictionPending) {
             DPRINTF(Override, "Prediction blocked to prioritize resolve update\n");
             dbpBtbStats.predictionBlockedForUpdate++;
@@ -394,7 +484,7 @@ DecoupledBPUWithBTB::processNewPrediction(ThreadID tid)
 
     // Monitor FSQ size for statistics
     dbpBtbStats.fsqEntryDist.sample(ftq.size(tid), 1);
-    if (ftq.full(tid)) {
+    if (ftqFull(tid)) {
         dbpBtbStats.fsqFullCannotEnq++;
         DPRINTF(Override, "FSQ is full (%lu entries)\n", ftq.size(tid));
         return;
diff --git a/src/cpu/pred/btb/decoupled_bpred.hh b/src/cpu/pred/btb/decoupled_bpred.hh
index 0a46c1a4e5..380e2e6eb5 100644
--- a/src/cpu/pred/btb/decoupled_bpred.hh
+++ b/src/cpu/pred/btb/decoupled_bpred.hh
@@ -19,20 +19,20 @@
 #include "cpu/pred/btb/btb_mgsc.hh"
 #include "cpu/pred/btb/btb_tage.hh"
 #include "cpu/pred/btb/btb_ubtb.hh"
+#include "cpu/pred/btb/common.hh"
 #include "cpu/pred/btb/ftq.hh"
+#include "cpu/pred/btb/history_manager.hh"
 #include "cpu/pred/btb/mbtb.hh"
 #include "cpu/pred/btb/microtage.hh"
 #include "cpu/pred/btb/ras.hh"
-#include "cpu/pred/general_arch_db.hh"
-
-// #include "cpu/pred/btb/uras.hh"
-#include "cpu/pred/btb/common.hh"
-#include "cpu/pred/btb/history_manager.hh"
 #include "cpu/pred/btb/timed_base_pred.hh"
+#include "cpu/pred/general_arch_db.hh"
 #include "cpu/timebuf.hh"
 #include "debug/DBPBTBStats.hh"
 #include "debug/DecoupleBP.hh"
 #include "debug/DecoupleBPProbe.hh"
+#include "enums/SMTFTQMode.hh"
+#include "enums/SMTFTQPolicy.hh"
 #include "params/DecoupledBPUWithBTB.hh"
 
 namespace gem5
@@ -121,6 +121,10 @@ class DecoupledBPUWithBTB : public BPredUnit
     // std::vector<FullBTBPrediction> predsOfEachStage{};
     unsigned numComponents{};
     unsigned numStages{};
+    unsigned ftqEntries;
+    SMTFTQMode ftqMode;
+    SMTFTQPolicy ftqPolicy;
+    unsigned smtFTQThreshold;
 
     FetchTargetQueue ftq;
 
@@ -144,6 +148,14 @@ class DecoupledBPUWithBTB : public BPredUnit
     std::vector<unsigned> resolveDequeueFailCounters;
     const unsigned resolveBlockThreshold;
 
+    bool sharedFTQMode() const;
+    unsigned activeFTQThreads() const;
+    unsigned totalFTQEntries() const;
+    unsigned sharedFTQAllocation(unsigned entries) const;
+    unsigned logicalMaxFTQEntries(ThreadID tid) const;
+    unsigned logicalFreeFTQEntries(ThreadID tid) const;
+    bool ftqFull(ThreadID tid) const;
+
     ThreadID scheduleThread();
 
     void processNewPrediction(ThreadID tid);

From cc766e3127d5fa1487a1ef2943666a20ba1d7daa Mon Sep 17 00:00:00 2001
From: tastynoob <934348725@qq.com>
Date: Mon, 13 Apr 2026 12:26:39 +0800
Subject: [PATCH 17/38] cpu: avoid full memcpy_init for dedup difftest

Change-Id: Ic49b3e7ab82a32b81427026ce8d185e7ebeaba76
---
 src/cpu/base.cc | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/cpu/base.cc b/src/cpu/base.cc
index 264e17bf4d..2c64a00014 100644
--- a/src/cpu/base.cc
+++ b/src/cpu/base.cc
@@ -1684,8 +1684,6 @@ BaseCPU::difftestStep(ThreadID tid, InstSeqNum seq)
                     assert(diffAllStates->proxy->ref_get_backed_memory);
                     diffAllStates->proxy->ref_get_backed_memory(
                         system->createCopyOnWriteBranch(), pmemSize);
-                    diffAllStates->proxy->memcpy_init(
-                        0x80000000u, goldenMemPtr, pmemSize, DUT_TO_REF);
                 } else {
                     assert(diffAllStates->proxy->ref_get_backed_memory);
                     diffAllStates->proxy->ref_get_backed_memory(system->createCopyOnWriteBranch(), pmemSize);

From ea80c0a5e88ddecb0a76ed1f51bb7baf266cd707 Mon Sep 17 00:00:00 2001
From: tastynoob <934348725@qq.com>
Date: Mon, 13 Apr 2026 17:01:59 +0800
Subject: [PATCH 18/38] cpu: apply asid hash to mgsc lookups

Change-Id: I4ee24d52f8ffe8fdc0a2086d97bcbc9be10860cb
---
 src/cpu/pred/btb/btb_mgsc.cc        | 89 ++++++++++++++++++-----------
 src/cpu/pred/btb/btb_mgsc.hh        | 18 +++---
 src/cpu/pred/btb/decoupled_bpred.cc | 12 +++-
 3 files changed, 75 insertions(+), 44 deletions(-)

diff --git a/src/cpu/pred/btb/btb_mgsc.cc b/src/cpu/pred/btb/btb_mgsc.cc
index f0a3837191..fa51bbdb33 100755
--- a/src/cpu/pred/btb/btb_mgsc.cc
+++ b/src/cpu/pred/btb/btb_mgsc.cc
@@ -345,10 +345,12 @@ BTBMGSC::calculatePercsum(const std::vector<std::vector<std::vector<int16_t>>> &
  * @return Found weight or 0 if not found
  */
 int
-BTBMGSC::findWeight(const std::vector<int16_t> &weightTable, Addr pc)
+BTBMGSC::findWeight(const std::vector<int16_t> &weightTable, Addr pc,
+                    uint8_t asidHash)
 {
     auto mask = (1 << weightTableIdxWidth) - 1;
     auto pcHash = ((pc >> instShiftAmt) ^ ((pc >> instShiftAmt) >> 2)) & mask;
+    pcHash = xorAsidHashIntoIndex(pcHash, weightTableIdxWidth, asidHash);
     auto &entry = weightTable[pcHash];
     return entry;
 }
@@ -369,10 +371,12 @@ BTBMGSC::calculateScaledPercsum(int weight, int percsum)
  * @return Found threshold or default value if not found
  */
 int
-BTBMGSC::findThreshold(const std::vector<int16_t> &thresholdTable, Addr pc)
+BTBMGSC::findThreshold(const std::vector<int16_t> &thresholdTable, Addr pc,
+                       uint8_t asidHash)
 {
     auto mask = (1 << thresholdTablelogSize) - 1;
     auto pcHash = ((pc >> instShiftAmt) ^ ((pc >> instShiftAmt) >> 2)) & mask;
+    pcHash = xorAsidHashIntoIndex(pcHash, thresholdTablelogSize, asidHash);
     auto &entry = thresholdTable[pcHash];
     return entry;
 }
@@ -403,7 +407,7 @@ BTBMGSC::calculateWeightScaleDiff(int total_sum, int scale_percsum, int percsum)
 BTBMGSC::MgscPrediction
 BTBMGSC::generateSinglePrediction(const BTBEntry &btb_entry, const Addr &startPC,
                                   const TageInfoForMGSC &tage_info,
-                                  ThreadID tid)
+                                  ThreadID tid, uint8_t asidHash)
 {
     DPRINTF(MGSC, "generateSinglePrediction for btbEntry: %#lx, always taken %d\n", btb_entry.pc,
             btb_entry.alwaysTaken);
@@ -412,12 +416,15 @@ BTBMGSC::generateSinglePrediction(const BTBEntry &btb_entry, const Addr &startPC
     // Calculate indices for all tables
     for (unsigned int i = 0; i < bwTableNum; ++i) {
         bwIndex[i] = getHistIndex(startPC, bwTableIdxWidth - numCtrsPerLineBits,
-                                  state.indexBwFoldedHist[i].get());
+                                  state.indexBwFoldedHist[i].get(), asidHash);
     }
 
+    const Addr localHistoryIndex =
+        getPcIndex(startPC, log2(numEntriesFirstLocalHistories), asidHash);
     for (unsigned int i = 0; i < lTableNum; ++i) {
         lIndex[i] = getHistIndex(startPC, lTableIdxWidth - numCtrsPerLineBits,
-                                 state.indexLFoldedHist[getPcIndex(startPC, log2(numEntriesFirstLocalHistories))][i].get());
+                                 state.indexLFoldedHist[localHistoryIndex][i].get(),
+                                 asidHash);
     }
     // std::string buf;
     // boost::to_string(indexLFoldedHist[getPcIndex(startPC, log2(numEntriesFirstLocalHistories))][0].getAsBitset(), buf);
@@ -425,46 +432,46 @@ BTBMGSC::generateSinglePrediction(const BTBEntry &btb_entry, const Addr &startPC
 
     for (unsigned int i = 0; i < iTableNum; ++i) {
         iIndex[i] = getHistIndex(startPC, iTableIdxWidth - numCtrsPerLineBits,
-                                 state.indexIFoldedHist[i].get());
+                                 state.indexIFoldedHist[i].get(), asidHash);
     }
 
     for (unsigned int i = 0; i < gTableNum; ++i) {
         gIndex[i] = getHistIndex(startPC, gTableIdxWidth - numCtrsPerLineBits,
-                                 state.indexGFoldedHist[i].get());
+                                 state.indexGFoldedHist[i].get(), asidHash);
     }
 
     for (unsigned int i = 0; i < pTableNum; ++i) {
         pIndex[i] = getHistIndex(startPC, pTableIdxWidth - numCtrsPerLineBits,
-                                 state.indexPFoldedHist[i].get());
+                                 state.indexPFoldedHist[i].get(), asidHash);
     }
 
     for (unsigned int i = 0; i < biasTableNum; ++i) {
         biasIndex[i] = getBiasIndex(startPC, biasTableIdxWidth - numCtrsPerLineBits, tage_info.tage_main_taken,
-                                    tage_info.tage_pred_conf_low);
+                                    tage_info.tage_pred_conf_low, asidHash);
     }
 
     int bw_percsum = enableBwTable ? calculatePercsum(bwTable, bwIndex, bwTableNum, btb_entry.pc) : 0;
-    int bw_weight = findWeight(bwWeightTable, btb_entry.pc);
+    int bw_weight = findWeight(bwWeightTable, btb_entry.pc, asidHash);
     int bw_scaled_percsum = calculateScaledPercsum(bw_weight, bw_percsum);
 
     int l_percsum = enableLTable ? calculatePercsum(lTable, lIndex, lTableNum, btb_entry.pc) : 0;
-    int l_weight = findWeight(lWeightTable, btb_entry.pc);
+    int l_weight = findWeight(lWeightTable, btb_entry.pc, asidHash);
     int l_scaled_percsum = calculateScaledPercsum(l_weight, l_percsum);
 
     int i_percsum = enableITable ? calculatePercsum(iTable, iIndex, iTableNum, btb_entry.pc) : 0;
-    int i_weight = findWeight(iWeightTable, btb_entry.pc);
+    int i_weight = findWeight(iWeightTable, btb_entry.pc, asidHash);
     int i_scaled_percsum = calculateScaledPercsum(i_weight, i_percsum);
 
     int g_percsum = enableGTable ? calculatePercsum(gTable, gIndex, gTableNum, btb_entry.pc) : 0;
-    int g_weight = findWeight(gWeightTable, btb_entry.pc);
+    int g_weight = findWeight(gWeightTable, btb_entry.pc, asidHash);
     int g_scaled_percsum = calculateScaledPercsum(g_weight, g_percsum);
 
     int p_percsum = enablePTable ? calculatePercsum(pTable, pIndex, pTableNum, btb_entry.pc) : 0;
-    int p_weight = findWeight(pWeightTable, btb_entry.pc);
+    int p_weight = findWeight(pWeightTable, btb_entry.pc, asidHash);
     int p_scaled_percsum = calculateScaledPercsum(p_weight, p_percsum);
 
     int bias_percsum = enableBiasTable ? calculatePercsum(biasTable, biasIndex, biasTableNum, btb_entry.pc) : 0;
-    int bias_weight = findWeight(biasWeightTable, btb_entry.pc);
+    int bias_weight = findWeight(biasWeightTable, btb_entry.pc, asidHash);
     int bias_scaled_percsum = calculateScaledPercsum(bias_weight, bias_percsum);
 
     // Calculate total sum of all weighted percsums
@@ -473,7 +480,8 @@ BTBMGSC::generateSinglePrediction(const BTBEntry &btb_entry, const Addr &startPC
 
     // Find thresholds
     // pc-indexed threshold table (only if enabled)
-    int p_update_thres = enablePCThreshold ? findThreshold(pUpdateThreshold, btb_entry.pc) : 0;
+    int p_update_thres =
+        enablePCThreshold ? findThreshold(pUpdateThreshold, btb_entry.pc, asidHash) : 0;
 
     int total_thres = (updateThreshold / 8) + p_update_thres;
     // Threshold is used as a confidence gate; avoid negative values which
@@ -530,7 +538,7 @@ BTBMGSC::generateSinglePrediction(const BTBEntry &btb_entry, const Addr &startPC
 void
 BTBMGSC::lookupHelper(const Addr &startPC, const std::vector<BTBEntry> &btbEntries,
                       const std::unordered_map<Addr, TageInfoForMGSC> &tageInfoForMgscs,
-                      CondTakens &results, ThreadID tid)
+                      CondTakens &results, ThreadID tid, uint8_t asidHash)
 {
     DPRINTF(MGSC, "lookupHelper startAddr: %#lx\n", startPC);
 
@@ -541,7 +549,8 @@ BTBMGSC::lookupHelper(const Addr &startPC, const std::vector<BTBEntry> &btbEntri
             auto tage_info = tageInfoForMgscs.find(btb_entry.pc);
             if (tage_info != tageInfoForMgscs.end()) {
                 auto pred = generateSinglePrediction(btb_entry, startPC,
-                                                     tage_info->second, tid);
+                                                     tage_info->second, tid,
+                                                     asidHash);
                 threadMeta[tid]->preds[btb_entry.pc] = pred;
                 results.push_back({btb_entry.pc, pred.taken || btb_entry.alwaysTaken});
             } else {
@@ -569,6 +578,7 @@ BTBMGSC::putPCHistory(Addr stream_start, const boost::dynamic_bitset<> &history,
 {
     const ThreadID tid = predictorTid(stagePreds);
     const auto &state = historyState(tid);
+    const uint8_t asidHash = stagePreds.empty() ? 0 : stagePreds.front().asidHash;
     DPRINTF(MGSC, "putPCHistory startAddr: %#lx\n", stream_start);
 
     // IMPORTANT: when this function is called,
@@ -592,7 +602,8 @@ BTBMGSC::putPCHistory(Addr stream_start, const boost::dynamic_bitset<> &history,
         auto &stage_pred = stagePreds[s];
         stage_pred.condTakens.clear();
         lookupHelper(stream_start, stage_pred.btbEntries,
-                     stage_pred.tageInfoForMgscs, stage_pred.condTakens, tid);
+                     stage_pred.tageInfoForMgscs, stage_pred.condTakens, tid,
+                     asidHash);
     }
 }
 
@@ -705,10 +716,11 @@ BTBMGSC::updateWeightTable(std::vector<int16_t> &weightTable, Addr tableIndex, A
  * @param update_direction Direction to update (true=increment, false=decrement)
  */
 void
-BTBMGSC::updatePCThresholdTable(Addr pc, bool update_direction)
+BTBMGSC::updatePCThresholdTable(Addr pc, uint8_t asidHash, bool update_direction)
 {
     auto mask = (1 << thresholdTablelogSize) - 1;
     auto pcHash = ((pc >> instShiftAmt) ^ ((pc >> instShiftAmt) >> 2)) & mask;
+    pcHash = xorAsidHashIntoIndex(pcHash, thresholdTablelogSize, asidHash);
     auto &entry = pUpdateThreshold[pcHash];
     updateCounter(update_direction, pUpdateThresholdWidth, entry);
 }
@@ -876,10 +888,11 @@ BTBMGSC::updateSinglePredictor(const BTBEntry &entry, bool actual_taken, const M
     }
 #endif
 
     // Only update tables if prediction was wrong or confidence was low
     if (sc_pred_taken != actual_taken || abs(total_sum) < (total_thres / 2)) {
         // get weight table index from startPC
-        Addr weightTableIdx = getPcIndex(stream.startPC, weightTableIdxWidth);
+        Addr weightTableIdx = getPcIndex(stream.startPC, weightTableIdxWidth,
+                                         stream.asidHash);
         bool threshold_inc = (sc_pred_taken != actual_taken);
         if (threshold_inc) {
             mgscStats.pcThresholdInc++;
@@ -921,7 +934,8 @@ BTBMGSC::updateSinglePredictor(const BTBEntry &entry, bool actual_taken, const M
 
         // Update PC-indexed threshold table (only if enabled)
         if (enablePCThreshold) {
-            updatePCThresholdTable(entry.pc, sc_pred_taken != actual_taken);
+            updatePCThresholdTable(entry.pc, stream.asidHash,
+                                   sc_pred_taken != actual_taken);
         }
 
         // Update global threshold table
@@ -1007,7 +1021,8 @@ BTBMGSC::updateCounter<uint64_t>(bool taken, unsigned width, uint64_t &counter);
 
 
 Addr
-BTBMGSC::getHistIndex(Addr pc, unsigned tableIndexBits, uint64_t foldedHist)
+BTBMGSC::getHistIndex(Addr pc, unsigned tableIndexBits, uint64_t foldedHist,
+                      uint8_t asidHash)
 {
     // Create mask to limit result size to tableIndexBits
     Addr mask = (1ULL << tableIndexBits) - 1;
@@ -1016,11 +1031,12 @@ BTBMGSC::getHistIndex(Addr pc, unsigned tableIndexBits, uint64_t foldedHist)
     Addr pcBits = (pc >> floorLog2(blockSize)) & mask;
     Addr foldedBits = foldedHist & mask;
 
-    return pcBits ^ foldedBits;
+    return xorAsidHashIntoIndex(pcBits ^ foldedBits, tableIndexBits, asidHash);
 }
 
 Addr
-BTBMGSC::getBiasIndex(Addr pc, unsigned tableIndexBits, bool lowbit0, bool lowbit1)
+BTBMGSC::getBiasIndex(Addr pc, unsigned tableIndexBits, bool lowbit0,
+                      bool lowbit1, uint8_t asidHash)
 {
     // Create mask for tableIndexBits-2 to extract PC bits
     Addr mask = (1ULL << (tableIndexBits - 2)) - 1;
@@ -1028,17 +1044,18 @@ BTBMGSC::getBiasIndex(Addr pc, unsigned tableIndexBits, bool lowbit0, bool lowbi
     // Extract lower bits of PC directly and combine with low bits
     Addr pcBits = (pc >> floorLog2(blockSize)) & mask;
     unsigned index = (pcBits << 2) + (lowbit1 << 1) + lowbit0;
-    return index;
+    return xorAsidHashIntoIndex(index, tableIndexBits, asidHash);
 }
 
 Addr
-BTBMGSC::getPcIndex(Addr pc, unsigned tableIndexBits)
+BTBMGSC::getPcIndex(Addr pc, unsigned tableIndexBits, uint8_t asidHash)
 {
     // Create mask to extract tableIndexBits from PC
     Addr mask = (1ULL << tableIndexBits) - 1;
 
     // Extract lower bits of PC directly without bitset
-    return (pc >> floorLog2(blockSize)) & mask;
+    Addr baseIndex = (pc >> floorLog2(blockSize)) & mask;
+    return xorAsidHashIntoIndex(baseIndex, tableIndexBits, asidHash);
 }
 
 template<typename T>
@@ -1244,8 +1261,10 @@ BTBMGSC::specUpdateLHist(const std::vector<boost::dynamic_bitset<>> &history, Fu
     int shamt;
     bool cond_taken;
     std::tie(shamt, cond_taken) = pred.getHistInfo();
-    doUpdateHist(history[getPcIndex(pred.bbStart, log2(numEntriesFirstLocalHistories))], shamt, cond_taken,
-                 state.indexLFoldedHist[getPcIndex(pred.bbStart, log2(numEntriesFirstLocalHistories))]);
+    const Addr localHistoryIndex =
+        getPcIndex(pred.bbStart, log2(numEntriesFirstLocalHistories), pred.asidHash);
+    doUpdateHist(history[localHistoryIndex], shamt, cond_taken,
+                 state.indexLFoldedHist[localHistoryIndex]);
 }
 
 /**
@@ -1386,9 +1405,11 @@ BTBMGSC::recoverLHist(const std::vector<boost::dynamic_bitset<>> &history, const
             state.indexLFoldedHist[k][i].recover(predMeta->indexLFoldedHist[k][i]);
         }
     }
-            doUpdateHist(history[getPcIndex(entry.startPC, log2(numEntriesFirstLocalHistories))], shamt, cond_taken,
-                         state.indexLFoldedHist[getPcIndex(entry.startPC, log2(numEntriesFirstLocalHistories))]);
-        }
+    const Addr localHistoryIndex =
+        getPcIndex(entry.startPC, log2(numEntriesFirstLocalHistories), entry.asidHash);
+    doUpdateHist(history[localHistoryIndex], shamt, cond_taken,
+                 state.indexLFoldedHist[localHistoryIndex]);
+}
 
 #ifndef UNIT_TEST
 // Constructor for TAGE statistics
diff --git a/src/cpu/pred/btb/btb_mgsc.hh b/src/cpu/pred/btb/btb_mgsc.hh
index 6ff29b13c8..99c4c3be98 100755
--- a/src/cpu/pred/btb/btb_mgsc.hh
+++ b/src/cpu/pred/btb/btb_mgsc.hh
@@ -198,7 +198,7 @@ class BTBMGSC : public TimedBaseBTBPredictor
                          ThreadID tid, const char *when);  // Check GHR folded
 
     // Calculate MGSC weight index
-    Addr getPcIndex(Addr pc, unsigned tableIndexBits);
+    Addr getPcIndex(Addr pc, unsigned tableIndexBits, uint8_t asidHash = 0);
 
   private:
     // Utility functions for reducing code duplication
@@ -211,7 +211,7 @@ class BTBMGSC : public TimedBaseBTBPredictor
     /**
      * Find weight in a weight table for a given PC
      */
-    int findWeight(const std::vector<int16_t> &weightTable, Addr pc);
+    int findWeight(const std::vector<int16_t> &weightTable, Addr pc, uint8_t asidHash);
 
     /**
      * Calculate scaled percsum using weight
@@ -221,7 +221,7 @@ class BTBMGSC : public TimedBaseBTBPredictor
     /**
      * Find threshold in a threshold table for a given PC
      */
-    int findThreshold(const std::vector<int16_t> &thresholdTable, Addr pc);
+    int findThreshold(const std::vector<int16_t> &thresholdTable, Addr pc, uint8_t asidHash);
 
     /**
      * Calculate if weight scale causes prediction difference
@@ -243,7 +243,7 @@ class BTBMGSC : public TimedBaseBTBPredictor
     /**
      * Update a threshold table and allocate new entry if needed
      */
-    void updatePCThresholdTable(Addr pc, bool update_direction);
+    void updatePCThresholdTable(Addr pc, uint8_t asidHash, bool update_direction);
 
     /**
      * Update the global threshold table and allocate new entry if needed
@@ -253,13 +253,15 @@ class BTBMGSC : public TimedBaseBTBPredictor
     // Look up predictions in MGSC tables for a stream of instructions
     void lookupHelper(const Addr &stream_start, const std::vector<BTBEntry> &btbEntries,
                       const std::unordered_map<Addr, TageInfoForMGSC> &tageInfoForMgscs,
-                      CondTakens &results, ThreadID tid);
+                      CondTakens &results, ThreadID tid, uint8_t asidHash);
 
     // Calculate MGSC history index with folded history
-    Addr getHistIndex(Addr pc, unsigned tableIndexBits, uint64_t foldedHist);
+    Addr getHistIndex(Addr pc, unsigned tableIndexBits, uint64_t foldedHist,
+                      uint8_t asidHash = 0);
 
     // Calculate MGSC bias index
-    Addr getBiasIndex(Addr pc, unsigned tableIndexBits, bool lowbit0, bool lowbit1);
+    Addr getBiasIndex(Addr pc, unsigned tableIndexBits, bool lowbit0, bool lowbit1,
+                      uint8_t asidHash = 0);
 
     // Get offset within a block for a given PC
     Addr getOffset(Addr pc) { return (pc & (blockSize - 1)) >> 1; }
@@ -284,7 +286,7 @@ class BTBMGSC : public TimedBaseBTBPredictor
     // Helper method to generate prediction for a single BTB entry
     MgscPrediction generateSinglePrediction(const BTBEntry &btb_entry, const Addr &startPC,
                                             const TageInfoForMGSC &tage_info,
-                                            ThreadID tid);
+                                            ThreadID tid, uint8_t asidHash);
 
     // Helper method to prepare BTB entries for update
     std::vector<BTBEntry> prepareUpdateEntries(const FetchTarget &stream);
diff --git a/src/cpu/pred/btb/decoupled_bpred.cc b/src/cpu/pred/btb/decoupled_bpred.cc
index 4ba14a04ed..07e3b138e1 100644
--- a/src/cpu/pred/btb/decoupled_bpred.cc
+++ b/src/cpu/pred/btb/decoupled_bpred.cc
@@ -1083,8 +1083,12 @@ DecoupledBPUWithBTB::updateHistoryForPrediction(FetchTarget &entry)
     pHistShiftIn(2, p_taken, s0PHistory, p_pc, p_target);
 
     // Update local history
+    const Addr localHistoryIndex =
+        mgsc->getPcIndex(finalPred.bbStart,
+                         log2(mgsc->getNumEntriesFirstLocalHistories()),
+                         finalPred.asidHash);
     histShiftIn(shamt, taken,
-        s0LHistory[mgsc->getPcIndex(finalPred.bbStart, log2(mgsc->getNumEntriesFirstLocalHistories()))]);
+        s0LHistory[localHistoryIndex]);
 
 #ifndef NDEBUG
     if (tage->isEnabled()) {
@@ -1171,8 +1175,12 @@ DecoupledBPUWithBTB::recoverHistoryForSquash(
     histShiftIn(real_bw_shamt, real_bw_taken, s0BwHistory);
 
     // Update local history with actual outcome
+    const Addr localHistoryIndex =
+        mgsc->getPcIndex(target.startPC,
+                         log2(mgsc->getNumEntriesFirstLocalHistories()),
+                         target.asidHash);
     histShiftIn(real_shamt, real_taken,
-                s0LHistory[mgsc->getPcIndex(target.startPC, log2(mgsc->getNumEntriesFirstLocalHistories()))]);
+                s0LHistory[localHistoryIndex]);
 
     // Update history manager with appropriate branch info
     if (squash_type == SQUASH_CTRL) {

From 491baad33712990d27f83e73ee7fa35e238475db Mon Sep 17 00:00:00 2001
From: mhnGitHubz <66786667+mhnGitHubz@users.noreply.github.com>
Date: Tue, 14 Apr 2026 14:53:14 +0800
Subject: [PATCH 19/38] cpu-o3: add backend SMT PMU (#827)

Co-authored-by: mo haonan <mohaonan@node023.bosccluster.com>
---
 src/cpu/o3/iew.cc         | 19 ++++++++++++-
 src/cpu/o3/iew.hh         |  2 ++
 src/cpu/o3/issue_queue.cc | 10 +++++--
 src/cpu/o3/issue_queue.hh |  1 +
 src/cpu/o3/rename.cc      | 57 +++++++++++++++++++++++++++++++--------
 src/cpu/o3/rename.hh      | 14 +++++++---
 6 files changed, 85 insertions(+), 18 deletions(-)

diff --git a/src/cpu/o3/iew.cc b/src/cpu/o3/iew.cc
index a9a1a14565..96fded9794 100644
--- a/src/cpu/o3/iew.cc
+++ b/src/cpu/o3/iew.cc
@@ -207,6 +207,8 @@ IEW::IEWStats::IEWStats(CPU *cpu)
              "Average fanout of values written-back"),
     ADD_STAT(stallEvents, statistics::units::Count::get(),
              "Number of events the IEW has stalled"),
+    ADD_STAT(smtStallEvents, statistics::units::Count::get(),
+             "Number of events the IEW has stalled per thread"),
     ADD_STAT(fetchStallReason, statistics::units::Count::get(),
              "Number of fetch stall reasons each tick (Total)"),
     ADD_STAT(decodeStallReason, statistics::units::Count::get(),
@@ -243,6 +245,11 @@ IEW::IEWStats::IEWStats(CPU *cpu)
     stallEvents
         .init(StallEventCount)
         .flags(statistics::total);
+        
+    smtStallEvents
+        .init(StallEventCount,0,cpu->numThreads-1,1)
+        .flags(statistics::total);
+
 
     dispDist.init(0,10,1).flags(statistics::nozero);
 
@@ -257,6 +264,7 @@ IEW::IEWStats::IEWStats(CPU *cpu)
 
     for (int i = 0; i < StallEventCount; i++) {
         stallEvents.subname(i, stall_event_str[static_cast<StallEvent>(i)]);
+        smtStallEvents.subname(i, stall_event_str[static_cast<StallEvent>(i)]);
     }
 
     fetchStallReason
@@ -829,6 +837,7 @@ IEW::checkSquash()
 
             fetchRedirect[i] = false;
             iewStats.stallEvents[ROBWalk]++;
+            iewStats.smtStallEvents[ROBWalk].sample(i);
             setAllStalls(StallReason::CommitSquash);
         }
 
@@ -837,6 +846,7 @@ IEW::checkSquash()
 
             wroteToTimeBuffer = true;
             iewStats.stallEvents[ROBWalk]++;
+            iewStats.smtStallEvents[ROBWalk].sample(i);
             setAllStalls(StallReason::CommitSquash);
         }
     }
@@ -1014,6 +1024,7 @@ IEW::dispatchInstFromRename(ThreadID tid)
             DPRINTF(IEW, "[tid:%i] Dispatch: %s has become full.\n", tid, inst->isLoad() ? "LQ" : "SQ");
 
             iewStats.stallEvents[LSQFull]++;
+            iewStats.smtStallEvents[LSQFull].sample(tid);
 
             ++iewStats.lsqFullEvents;
             dispatch_stalls.push(checkDispatchStall(tid, NumDQ, inst, disp_seq));
@@ -1025,6 +1036,8 @@ IEW::dispatchInstFromRename(ThreadID tid)
         if (!scheduler->ready(inst, disp_seq)) {
             DPRINTF(IEW, "[tid:%i] Dispatch: IQ is full or bwFull.\n", tid);
             iewStats.stallEvents[IQFull]++;
+            iewStats.smtStallEvents[IQFull].sample(tid);
+
             ++iewStats.iqFullEvents;
 
             dispatch_stalls.push(checkDispatchStall(tid, NumDQ, inst, disp_seq));
@@ -1158,6 +1171,8 @@ IEW::dispatchInstFromRename(ThreadID tid)
         DPRINTF(IEW,"[tid:%i] Dispatch: Bandwidth Full. Blocking.\n", tid);
 
         iewStats.stallEvents[DispBWFull]++;
+        iewStats.smtStallEvents[DispBWFull].sample(tid);
+        
     }
 
 }
@@ -1270,6 +1285,7 @@ IEW::classifyInstToDispQue(ThreadID tid)
     if (!insts_to_dispatch.empty()) {
         DPRINTF(IEW,"[tid:%i] Dispatch: Bandwidth Full. Blocking.\n", tid);
         iewStats.stallEvents[DispBWFull]++;
+        iewStats.smtStallEvents[DispBWFull].sample(tid);
     }
 }
 
@@ -1302,6 +1318,7 @@ IEW::dispatchInstFromDispQue()
                 DPRINTF(IEW, "[tid:%i] Dispatch: IQ is full or bwFull.\n", tid);
 
                 iewStats.stallEvents[IQFull]++;
+                iewStats.smtStallEvents[IQFull].sample(tid);
                 ++iewStats.iqFullEvents;
                 break;
             }
@@ -1314,7 +1331,7 @@ IEW::dispatchInstFromDispQue()
                         inst->isLoad() ? "LQ" : "SQ");
 
                 iewStats.stallEvents[LSQFull]++;
-
+                iewStats.smtStallEvents[LSQFull].sample(tid);
                 ++iewStats.lsqFullEvents;
                 break;
             }
diff --git a/src/cpu/o3/iew.hh b/src/cpu/o3/iew.hh
index 94cfbcb8cc..c621e62ebc 100644
--- a/src/cpu/o3/iew.hh
+++ b/src/cpu/o3/iew.hh
@@ -571,6 +571,8 @@ class IEW
 
         statistics::Vector stallEvents;
 
+        statistics::VectorDistribution smtStallEvents;
+
         /** Distribution of number of fetch stall reasons each tick. */
         statistics::Vector fetchStallReason;
         /** Distribution of number of decode stall reasons each tick. */
diff --git a/src/cpu/o3/issue_queue.cc b/src/cpu/o3/issue_queue.cc
index c3739031aa..d83083a45f 100644
--- a/src/cpu/o3/issue_queue.cc
+++ b/src/cpu/o3/issue_queue.cc
@@ -224,7 +224,8 @@ IssueQue::IssueQueStats::IssueQueStats(statistics::Group* parent, IssueQue* que,
       ADD_STAT(issueDist, statistics::units::Count::get(), "distruibution of issue"),
       ADD_STAT(portissued, statistics::units::Count::get(), "count each port issues"),
       ADD_STAT(portBusy, statistics::units::Count::get(), "count each port busy cycles"),
-      ADD_STAT(avgInsts, statistics::units::Count::get(), "average insts")
+      ADD_STAT(avgInsts, statistics::units::Count::get(), "average insts"),
+      ADD_STAT(instsNum, statistics::units::Count::get(), "insts per thread")
 {
     insertDist.init(que->inports + 1).flags(statistics::nozero);
     issueDist.init(que->outports + 1).flags(statistics::nozero);
@@ -235,6 +236,7 @@ IssueQue::IssueQueStats::IssueQueStats(statistics::Group* parent, IssueQue* que,
     loadmiss.flags(statistics::nozero);
     arbFailed.flags(statistics::nozero);
     issueOccupy.flags(statistics::nozero);
+    instsNum.flags(statistics::nozero);
 }
 
 IssueQue::IssueQue(const IssueQueParams& params)
@@ -375,6 +377,7 @@ IssueQue::setCPU(CPU* cpu)
     this->cpu = cpu;
     _name = cpu->name() + ".scheduler." + getName();
     iqstats = new IssueQueStats(cpu, this, "scheduler." + this->getName());
+    iqstats->instsNum.init(cpu->numThreads);
 }
 
 void
@@ -903,7 +906,10 @@ IssueQue::incInIQInstsCounter(ThreadID tid)
 {
     if (instsCounter) {
         instsCounter->incCounter(tid);
-    } 
+    }
+    if (iqstats) {
+        iqstats->instsNum[tid]++;
+    }
 }
     
 void
diff --git a/src/cpu/o3/issue_queue.hh b/src/cpu/o3/issue_queue.hh
index a4416663a0..6c6d9f8fbf 100644
--- a/src/cpu/o3/issue_queue.hh
+++ b/src/cpu/o3/issue_queue.hh
@@ -206,6 +206,7 @@ class IssueQue : public SimObject
         statistics::Vector portissued;
         statistics::Vector portBusy;
         statistics::Average avgInsts;
+        statistics::Vector instsNum; 
     }* iqstats = nullptr;
 
     void replay(const DynInstPtr& inst);
diff --git a/src/cpu/o3/rename.cc b/src/cpu/o3/rename.cc
index 02c4f40144..33d7852a87 100644
--- a/src/cpu/o3/rename.cc
+++ b/src/cpu/o3/rename.cc
@@ -66,9 +66,9 @@ Rename::Rename(CPU *_cpu, const BaseO3CPUParams &params)
       renameWidth(params.renameWidth),
       releaseWidth(params.phyregReleaseWidth),
       numThreads(params.numThreads),
-      stats(_cpu),
+      stats(_cpu, this),
       valuePred(params.valuePred),
-        enableSelectiveVPFlush(params.enableSelectiveVPFlush)
+      enableSelectiveVPFlush(params.enableSelectiveVPFlush)
 {
     if (renameWidth > MaxWidth)
         fatal("renameWidth (%d) is larger than compiled limit (%d),\n"
@@ -94,8 +94,8 @@ Rename::name() const
     return cpu->name() + ".rename";
 }
 
-Rename::RenameStats::RenameStats(statistics::Group *parent)
-    : statistics::Group(parent, "rename"),
+Rename::RenameStats::RenameStats(CPU *cpu, Rename *rename)
+    : statistics::Group(cpu, "rename"),
       ADD_STAT(squashCycles, statistics::units::Cycle::get(),
                "Number of cycles rename is squashing"),
       ADD_STAT(idleCycles, statistics::units::Cycle::get(),
@@ -109,7 +109,7 @@ Rename::RenameStats::RenameStats(statistics::Group *parent)
       ADD_STAT(unblockCycles, statistics::units::Cycle::get(),
                "Number of cycles rename is unblocking"),
       ADD_STAT(renamedInsts, statistics::units::Count::get(),
-               "Number of instructions processed by rename"),
+               "Number of instructions processed by rename per thread"),
       ADD_STAT(squashedInsts, statistics::units::Count::get(),
                "Number of squashed instructions processed by rename"),
       ADD_STAT(ROBFullEvents, statistics::units::Count::get(),
@@ -149,7 +149,9 @@ Rename::RenameStats::RenameStats(statistics::Group *parent)
       ADD_STAT(constantFolded, statistics::units::Count::get(),
                "count of insts eliminated by constant folding"),
       ADD_STAT(stallEvents, statistics::units::Count::get(),
-               "count of stall events")
+               "count of stall events"),
+      ADD_STAT(smtStallEvents, statistics::units::Count::get(),
+               "Number of events the Rename has stalled per thread")
 {
     squashCycles.prereq(squashCycles);
     idleCycles.prereq(idleCycles);
@@ -158,14 +160,12 @@ Rename::RenameStats::RenameStats(statistics::Group *parent)
     runCycles.prereq(idleCycles);
     unblockCycles.prereq(unblockCycles);
 
-    renamedInsts.prereq(renamedInsts);
     squashedInsts.prereq(squashedInsts);
 
     ROBFullEvents.prereq(ROBFullEvents);
     IQFullEvents.prereq(IQFullEvents);
     LQFullEvents.prereq(LQFullEvents);
     SQFullEvents.prereq(SQFullEvents);
-    fullRegistersEvents.prereq(fullRegistersEvents);
 
     renamedOperands.prereq(renamedOperands);
     lookups.prereq(lookups);
@@ -182,7 +182,13 @@ Rename::RenameStats::RenameStats(statistics::Group *parent)
     moveEliminated.flags(statistics::total);
     constantFolded.flags(statistics::total);
 
+    renamedInsts.init(cpu->numThreads).flags(statistics::total);
+    fullRegistersEvents.init(cpu->numThreads).flags(statistics::total);
+    
     stallEvents.init(StallEventCount).flags(statistics::total);
+    smtStallEvents
+        .init(StallEventCount,0,cpu->numThreads-1,1)
+        .flags(statistics::total);
     std::map < StallEvent, const char* > stall_event_str = {
         { ROBWalk, "ROBWalk"},
         { IEWStall, "IEWStall"},
@@ -196,6 +202,7 @@ Rename::RenameStats::RenameStats(statistics::Group *parent)
 
     for (int i = 0; i < StallEventCount; i++) {
         stallEvents.subname(i, stall_event_str[static_cast<StallEvent>(i)]);
+        smtStallEvents.subname(i, stall_event_str[static_cast<StallEvent>(i)]);
     }
 }
 
@@ -361,10 +368,23 @@ Rename::tick()
             block_reason = checkRenameStallFromIEW(i);
             if (block_reason == StallReason::NoStall) {
                 block_reason = StallReason::RegFull;
-                ++stats.fullRegistersEvents;
+                ++stats.fullRegistersEvents[i];
                 stats.stallEvents[RegFull]++;
             }
         }
+
+        if (block_reason == StallReason::ROBFull) {
+            stats.smtStallEvents[ROBFull].sample(i);
+        } else if (block_reason == StallReason::RegFull) {
+            stats.smtStallEvents[RegFull].sample(i);
+        } else if (block_reason == StallReason::SerializeStall) {
+            stats.smtStallEvents[SerializeInst].sample(i);
+        } else if ( block_reason == StallReason::MemDQBandwidth ||
+                    block_reason == StallReason::IntDQBandwidth ||
+                    block_reason == StallReason::FVDQBandwidth) {
+            stats.smtStallEvents[BWFull].sample(i);
+        }
+
         DPRINTF(Rename, "[tid:%i] blockRename: %i, canRename: %i, block: %i, active: %i\n",
                 i, stallSig->blockRename[i], can_rename, block, active);
 
@@ -402,6 +422,7 @@ Rename::tick()
     renameInsts(tid);
     if (stallSig->blockRename[tid]) {
         setAllStalls(stallSig->renameBlockReason[tid]);
+        stats.smtStallEvents[IEWStall].sample(tid);  // StallEvent index, not a StallReason
     } else if (toIEW->size > 0 && renameStalls[0] == StallReason::NoStall) {
         for (int i = 0; i < renameStalls.size(); i++) {
             if (i < toIEW->size) {
@@ -584,8 +605,9 @@ Rename::renameInsts(ThreadID tid)
             breakRename = checkRenameStallFromIEW(tid);
             if (breakRename == StallReason::NoStall) {
                 breakRename = StallReason::RegFull;
-                ++stats.fullRegistersEvents;
+                ++stats.fullRegistersEvents[tid];
                 stats.stallEvents[RegFull]++;
+                // smtStallEvents[RegFull] is sampled later in this function from breakRename.
             }
         }
         blockReason = breakRename;
@@ -599,7 +621,20 @@ Rename::renameInsts(ThreadID tid)
     } else if (breakRename != StallReason::NoStall) {
         setAllStalls(breakRename);
     }
-    stats.renamedInsts += renamed_insts;
+
+    stats.renamedInsts[tid] += renamed_insts;
+
+    if (breakRename == StallReason::ROBFull) {
+        stats.smtStallEvents[ROBFull].sample(tid);
+    } else if (breakRename == StallReason::RegFull) {
+        stats.smtStallEvents[RegFull].sample(tid);
+    } else if (breakRename == StallReason::SerializeStall) {
+        stats.smtStallEvents[SerializeInst].sample(tid);
+    } else if ( breakRename == StallReason::MemDQBandwidth ||
+                breakRename == StallReason::IntDQBandwidth ||
+                breakRename == StallReason::FVDQBandwidth) {
+        stats.smtStallEvents[BWFull].sample(tid);
+    }
 
     // If we wrote to the time buffer, record this.
     if (toIEWIndex) {
diff --git a/src/cpu/o3/rename.hh b/src/cpu/o3/rename.hh
index 861b0f82c2..a8d555a019 100644
--- a/src/cpu/o3/rename.hh
+++ b/src/cpu/o3/rename.hh
@@ -382,7 +382,8 @@ class Rename
 
     struct RenameStats : public statistics::Group
     {
-        RenameStats(statistics::Group *parent);
+        /** Builds the "rename" stat group; per-thread stats use cpu->numThreads. */
+        RenameStats(CPU *cpu, Rename *rename);
 
         /** Stat for total number of cycles spent squashing. */
         statistics::Scalar squashCycles;
@@ -397,8 +398,10 @@ class Rename
         statistics::Scalar runCycles;
         /** Stat for total number of cycles spent unblocking. */
         statistics::Scalar unblockCycles;
-        /** Stat for total number of renamed instructions. */
-        statistics::Scalar renamedInsts;
+        /**
+         * Stat for total number of renamed instructions per thread.
+         */
+        statistics::Vector renamedInsts;
         /** Stat for total number of squashed instructions that rename
          * discards. */
         statistics::Scalar squashedInsts;
@@ -416,7 +419,7 @@ class Rename
         statistics::Scalar SQFullEvents;
         /** Stat for total number of times that rename runs out of free
          *  registers to use to rename. */
-        statistics::Scalar fullRegistersEvents;
+        statistics::Vector fullRegistersEvents;
         /** Stat for total number of renamed destination registers. */
         statistics::Scalar renamedOperands;
         /** Stat for total number of source register rename lookups. */
@@ -441,6 +444,9 @@ class Rename
         statistics::Scalar constantFolded;
 
         statistics::Vector stallEvents;
+
+        statistics::VectorDistribution smtStallEvents;
+        
     } stats;
 
     std::vector<StallReason> renameStalls;

From 97922239963c2be889709a1f7333f1aacabfd49f Mon Sep 17 00:00:00 2001
From: tastynoob <934348725@qq.com>
Date: Wed, 15 Apr 2026 10:59:49 +0800
Subject: [PATCH 20/38] cpu-o3: enlarge smt l3 to 32MB

Change-Id: I653f16b0adefdfcc978f54791e21adbf74ecd84e
---
 configs/example/smt_idealkmhv3.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/configs/example/smt_idealkmhv3.py b/configs/example/smt_idealkmhv3.py
index a83681506f..dbbe66f814 100644
--- a/configs/example/smt_idealkmhv3.py
+++ b/configs/example/smt_idealkmhv3.py
@@ -36,6 +36,7 @@ def setSharedLSQParams(args, system):
     args.smt = True
     args.bp_type = 'DecoupledBPUWithBTB'
     args.l2_size = '2MB'
+    args.l3_size = '32MB'
 
     Simulation.setMemClass(args)
 

From d7beabef943ef74ca02289d6d7ae213e44b9884f Mon Sep 17 00:00:00 2001
From: tastynoob <934348725@qq.com>
Date: Mon, 27 Apr 2026 14:34:42 +0800
Subject: [PATCH 21/38] cpu: Isolate ABTB pipeline for SMT

Change-Id: I8060ec785799940c4190fdba32b349226f44795d
---
 src/cpu/pred/BranchPredictor.py    |  2 +-
 src/cpu/pred/btb/abtb.cc           | 98 ++++++++++++++++++++++--------
 src/cpu/pred/btb/abtb.hh           | 48 +++++++++++----
 src/cpu/pred/btb/test/abtb.test.cc | 53 +++++++++++++++-
 4 files changed, 158 insertions(+), 43 deletions(-)

diff --git a/src/cpu/pred/BranchPredictor.py b/src/cpu/pred/BranchPredictor.py
index 4b1b48c097..3359188b81 100644
--- a/src/cpu/pred/BranchPredictor.py
+++ b/src/cpu/pred/BranchPredictor.py
@@ -1000,7 +1000,7 @@ class AheadBTB(TimedBaseBTBPredictor):
     numEntries = Param.Unsigned(1024, "Number of entries in the BTB")
     tagBits = Param.Unsigned(38, "Number of bits in the tag")
     instShiftAmt = Param.Unsigned(1, "Amount to shift PC to get inst bits")
-    numThreads = Param.Unsigned(1, "Number of threads")
+    numThreads = Param.Unsigned(Parent.numThreads, "Number of threads")
     numWays = Param.Unsigned(8, "Number of ways per set")
     aheadPipelinedStages = Param.Unsigned(1, "Number of stages ahead pipelined")
     entryHalfAligned = Param.Bool(False, "Whether the entries are half-aligned")
diff --git a/src/cpu/pred/btb/abtb.cc b/src/cpu/pred/btb/abtb.cc
index 8013900e83..e0bf942724 100644
--- a/src/cpu/pred/btb/abtb.cc
+++ b/src/cpu/pred/btb/abtb.cc
@@ -66,12 +66,16 @@ namespace test {
  */
 #ifdef UNIT_TEST
 // Test constructor for unit testing mode - fixed ahead-pipelined configuration
-AheadBTB::AheadBTB(unsigned numEntries, unsigned tagBits, unsigned numWays, unsigned numDelay)
+AheadBTB::AheadBTB(unsigned numEntries, unsigned tagBits, unsigned numWays,
+                   unsigned numDelay, unsigned numThreads)
     : TimedBaseBTBPredictor(),
       numEntries(numEntries),
       numWays(numWays),
+      numThreads(numThreads),
+      threadStates(numThreads),
       tagBits(tagBits)
 {
+    usingS3Pred = false;
     setNumDelay(numDelay);
     this->aheadPipelinedStages = 1; // fixed ahead-pipelined stages = 1
 #else
@@ -80,6 +84,8 @@ AheadBTB::AheadBTB(const Params &p)
     : TimedBaseBTBPredictor(p),
     numEntries(p.numEntries),
     numWays(p.numWays),
+    numThreads(p.numThreads),
+    threadStates(p.numThreads),
     tagBits(p.tagBits),
     usingS3Pred(p.usingS3Pred),
     btbStats(this)
@@ -91,6 +97,7 @@ AheadBTB::AheadBTB(const Params &p)
     // AheadBTB always uses single instruction alignment: | tag | idx | instShiftAmt
     idxShiftAmt = 1;
 
+    assert(numThreads > 0);
     assert(numEntries % numWays == 0);
     numSets = numEntries / numWays;
     // AheadBTB always uses ahead-pipelined stages = 1
@@ -130,6 +137,27 @@ AheadBTB::AheadBTB(const Params &p)
 #endif
 }
 
+ThreadID
+AheadBTB::predictorTid(const std::vector<FullBTBPrediction> &stagePreds) const
+{
+    assert(!stagePreds.empty());
+    return stagePreds.front().tid;
+}
+
+AheadBTB::ThreadState &
+AheadBTB::threadState(ThreadID tid)
+{
+    assert(tid < threadStates.size());
+    return threadStates[tid];
+}
+
+const AheadBTB::ThreadState &
+AheadBTB::threadState(ThreadID tid) const
+{
+    assert(tid < threadStates.size());
+    return threadStates[tid];
+}
+
 #ifndef UNIT_TEST
 void
 AheadBTB::tickStart()
@@ -297,14 +325,17 @@ AheadBTB::fillStagePredictions(const std::vector<TickedBTBEntry>& entries,
  */
 void
 AheadBTB::updatePredictionMeta(const std::vector<TickedBTBEntry>& entries,
-                                   std::vector<FullBTBPrediction>& stagePreds)
+                                   std::vector<FullBTBPrediction>& stagePreds,
+                                   ThreadID tid)
 {
+    auto &state = threadState(tid);
+
     // Save current BTB entries
     for (auto e: entries) {
-        meta->hit_entries.push_back(BTBEntry(e));
+        state.meta->hit_entries.push_back(BTBEntry(e));
     }
 
-    lastPredEntries = meta->hit_entries;
+    state.lastPredEntries = state.meta->hit_entries;
 }
 
 void
@@ -312,10 +343,12 @@ AheadBTB::putPCHistory(Addr startAddr,
                          const boost::dynamic_bitset<> &history,
                          std::vector<FullBTBPrediction> &stagePreds)
 {
-    meta = std::make_shared<BTBMeta>();
+    const ThreadID tid = predictorTid(stagePreds);
+    auto &state = threadState(tid);
+    state.meta = std::make_shared<BTBMeta>();
     const uint8_t asidHash = stagePreds.empty() ? 0 : stagePreds.front().asidHash;
     // Lookup all matching entries in BTB
-    auto find_entries = lookup(startAddr, asidHash);
+    auto find_entries = lookup(startAddr, tid, asidHash);
 
     // Process BTB entries
     auto processed_entries = processEntries(find_entries, startAddr);
@@ -324,19 +357,22 @@ AheadBTB::putPCHistory(Addr startAddr,
     fillStagePredictions(processed_entries, stagePreds);
     
     // Update metadata for later stages
-    updatePredictionMeta(processed_entries, stagePreds);
+    updatePredictionMeta(processed_entries, stagePreds, tid);
 }
 
 std::shared_ptr<void>
 AheadBTB::getPredictionMeta(ThreadID tid)
 {
-    (void)tid;
+    if (tid >= threadStates.size()) {
+        return nullptr;
+    }
+    auto &state = threadStates[tid];
     // Lazy-initialize meta so callers never observe a null pointer
     // This avoids early-cycle crashes when prediction hasn't populated meta yet
-    if (!meta) {
-        meta = std::make_shared<BTBMeta>();
+    if (!state.meta) {
+        state.meta = std::make_shared<BTBMeta>();
     }
-    return meta;
+    return state.meta;
 }
 
 void
@@ -345,9 +381,10 @@ AheadBTB::specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPredicti
 void
 AheadBTB::recoverHist(const boost::dynamic_bitset<> &history, const FetchTarget &entry, int shamt, bool cond_taken)
 {
+    auto &state = threadState(entry.tid);
     // clear ahead pipeline first
-    while (!aheadReadBtbEntries.empty()) {
-        aheadReadBtbEntries.pop();
+    while (!state.aheadReadBtbEntries.empty()) {
+        state.aheadReadBtbEntries.pop();
     }
 }
 
@@ -358,19 +395,22 @@ AheadBTB::recoverHist(const boost::dynamic_bitset<> &history, const FetchTarget
  * @return Vector of matching BTB entries
  */
 std::vector<AheadBTB::TickedBTBEntry>
-AheadBTB::lookupSingleBlock(Addr block_pc, uint8_t asidHash)
+AheadBTB::lookupSingleBlock(Addr block_pc, ThreadID tid, uint8_t asidHash)
 {
     std::vector<TickedBTBEntry> res;
     if (block_pc & 0x1) {
         return res; // ignore false hit when lowest bit is 1
     }
+    auto &state = threadState(tid);
     Addr btb_idx = getIndex(block_pc, asidHash);
     auto btb_set = btb[btb_idx];
     assert(btb_idx < numSets);
     // AheadBTB always uses ahead-pipelined implementation:
     // memory access with previous block PC, tag compare with current PC
-    DPRINTF(AheadPipeline, "AheadBTB: pushing set for ahead-pipelined stages, idx %ld\n", btb_idx);
-    aheadReadBtbEntries.push(std::make_tuple(block_pc, btb_idx, btb_set));
+    DPRINTF(AheadPipeline,
+            "AheadBTB: [tid:%u] pushing set for ahead-pipelined stages, idx %ld\n",
+            tid, btb_idx);
+    state.aheadReadBtbEntries.push(std::make_tuple(block_pc, btb_idx, btb_set));
 
     Addr tag_curStartpc = getTag(block_pc, asidHash);// abtb uses current FB pc to get tag
     Addr pc = 0;
@@ -378,21 +418,24 @@ AheadBTB::lookupSingleBlock(Addr block_pc, uint8_t asidHash)
     BTBSet set;
     // AheadBTB always uses ahead-pipelined logic (aheadPipelinedStages > 0)
     // only if the ahead-pipeline is filled can we use the entry
-    if (aheadReadBtbEntries.size() >= aheadPipelinedStages+1) {
+    if (state.aheadReadBtbEntries.size() >= aheadPipelinedStages+1) {
         // +1 because we pushed a new set in this cycle before
         // in case there are push without corresponding pop
-        assert(aheadReadBtbEntries.size() == aheadPipelinedStages+1);
-        std::tie(pc, idx_prvStartpc, set) = aheadReadBtbEntries.front();
-        DPRINTF(AheadPipeline, "AheadBTB: ahead-pipeline filled, using set %ld from pc %#lx\n",
-            idx_prvStartpc, pc);
+        assert(state.aheadReadBtbEntries.size() == aheadPipelinedStages+1);
+        std::tie(pc, idx_prvStartpc, set) = state.aheadReadBtbEntries.front();
+        DPRINTF(AheadPipeline,
+            "AheadBTB: [tid:%u] ahead-pipeline filled, using set %ld from pc %#lx\n",
+            tid, idx_prvStartpc, pc);
         DPRINTF(AheadPipeline, "AheadBTB: dumping btb set\n");
         for (auto &entry : set) {
             printTickedBTBEntry(entry);
         }
-        aheadReadBtbEntries.pop();
+        state.aheadReadBtbEntries.pop();
     } else {
-        DPRINTF(AheadPipeline, "AheadBTB: ahead-pipeline not filled, only have %ld sets read,"
-            " skipping tag compare, assigning miss\n", aheadReadBtbEntries.size());
+        DPRINTF(AheadPipeline,
+            "AheadBTB: [tid:%u] ahead-pipeline not filled, only have %ld sets read,"
+            " skipping tag compare, assigning miss\n",
+            tid, state.aheadReadBtbEntries.size());
     }
     DPRINTF(ABTB, "BTB: Doing tag comparison for index 0x%lx tag %#lx\n",
         idx_prvStartpc, tag_curStartpc);
@@ -407,7 +450,7 @@ AheadBTB::lookupSingleBlock(Addr block_pc, uint8_t asidHash)
 }
 
 std::vector<AheadBTB::TickedBTBEntry>
-AheadBTB::lookup(Addr block_pc, uint8_t asidHash)
+AheadBTB::lookup(Addr block_pc, ThreadID tid, uint8_t asidHash)
 {
     std::vector<TickedBTBEntry> res;
     if (block_pc & 0x1) {
@@ -415,7 +458,7 @@ AheadBTB::lookup(Addr block_pc, uint8_t asidHash)
     }
 
     // AheadBTB always uses single block lookup
-    res = lookupSingleBlock(block_pc, asidHash);
+    res = lookupSingleBlock(block_pc, tid, asidHash);
     return res;
 }
 
@@ -603,7 +646,8 @@ AheadBTB::updateUsingS3Pred(FullBTBPrediction &s3Pred, const Addr previousPC)
                             (s3Pred.bbStart + predictWidth) & ~mask(floorLog2(predictWidth)-1);
 
     // AheadBTB use S3 prediction for update
-    auto old_entries= processOldEntries(lastPredEntries, end_inst_pc);
+    auto &state = threadState(s3Pred.tid);
+    auto old_entries= processOldEntries(state.lastPredEntries, end_inst_pc);
 
     auto entries_to_update = collectEntriesToUpdateFromS3Pred(old_entries,s3Pred);
 
diff --git a/src/cpu/pred/btb/abtb.hh b/src/cpu/pred/btb/abtb.hh
index e5e29f7ffd..233489dab1 100644
--- a/src/cpu/pred/btb/abtb.hh
+++ b/src/cpu/pred/btb/abtb.hh
@@ -40,7 +40,10 @@
 #ifndef __CPU_PRED_BTB_BTB_HH__
 #define __CPU_PRED_BTB_BTB_HH__
 
+#include <memory>
 #include <queue>
+#include <tuple>
+#include <vector>
 
 #include "base/types.hh"
 #include "cpu/pred/btb/common.hh"
@@ -83,7 +86,8 @@ class AheadBTB : public TimedBaseBTBPredictor
 
 #ifdef UNIT_TEST
     // Test constructor - fixed ahead-pipelined configuration
-    AheadBTB(unsigned numEntries, unsigned tagBits, unsigned numWays, unsigned numDelay);
+    AheadBTB(unsigned numEntries, unsigned tagBits, unsigned numWays,
+             unsigned numDelay, unsigned numThreads = 1);
 #else
     // Production constructor
     typedef AheadBTBParams Params;
@@ -260,15 +264,30 @@ class AheadBTB : public TimedBaseBTBPredictor
         }
     }BTBMeta;
 
-    std::shared_ptr<BTBMeta> meta; // metadata for BTB, set in putPCHistory, used in update
-
     /**
-    * lastPredEntries is using in updateusingS3pred() to store the hit entries during prediction
-    * it is using to hold the hit entries for later use in S3 update
-    * because in gem5 generat pred and updateusingS3pred finish in the same cycle
-    * so we can use this instead of using BTBMeta
+    * Per-thread ABTB prediction-time state. The BTB storage itself remains
+    * shared, but the ahead-read pipeline must not be shared across SMT
+    * threads because index read and tag compare occur in different cycles.
     */
-    std::vector<BTBEntry> lastPredEntries; // cached hit entries for the latest prediction
+    struct ThreadState
+    {
+        // Metadata for the BTB: set in putPCHistory, used in update.
+        std::shared_ptr<BTBMeta> meta;
+
+        /**
+        * Hit entries of the latest prediction, cached for updateUsingS3Pred().
+        * gem5 generates the prediction and runs updateUsingS3Pred() in the
+        * same cycle, so this cache can be used instead of BTBMeta.
+        */
+        std::vector<BTBEntry> lastPredEntries;
+
+        // Ahead-read FIFO of (block PC, set index, copy of the indexed set).
+        std::queue<std::tuple<Addr, Addr, BTBSet>> aheadReadBtbEntries;
+    };
+
+    ThreadID predictorTid(const std::vector<FullBTBPrediction> &stagePreds) const;
+    ThreadState &threadState(ThreadID tid);
+    const ThreadState &threadState(ThreadID tid) const;
 
     /** Process BTB entries for prediction
      *  @param entries Vector of BTB entries to process
@@ -289,7 +308,8 @@ class AheadBTB : public TimedBaseBTBPredictor
      *  @param entries Processed BTB entries
      */
     void updatePredictionMeta(const std::vector<TickedBTBEntry>& entries,
-                               std::vector<FullBTBPrediction>& stagePreds);
+                               std::vector<FullBTBPrediction>& stagePreds,
+                               ThreadID tid);
 
     /** Process prediction metadata and old entries
      *  @param meta BTB metadata from prediction
@@ -367,13 +387,15 @@ class AheadBTB : public TimedBaseBTBPredictor
      *  @param inst_PC The address of the block to look up.
      *  @return Returns all hit BTB entries.
      */
-    std::vector<TickedBTBEntry> lookup(Addr block_pc, uint8_t asidHash);
+    std::vector<TickedBTBEntry> lookup(Addr block_pc, ThreadID tid,
+                                       uint8_t asidHash);
 
     /** Helper function to lookup entries in a single block
      * @param block_pc The aligned PC to lookup
      * @return Vector of matching BTB entries
      */
-    std::vector<TickedBTBEntry> lookupSingleBlock(Addr block_pc, uint8_t asidHash);
+    std::vector<TickedBTBEntry> lookupSingleBlock(Addr block_pc, ThreadID tid,
+                                                  uint8_t asidHash);
 
     /** The BTB structure:
      *  - Organized as numSets sets
@@ -389,12 +411,12 @@ class AheadBTB : public TimedBaseBTBPredictor
      */
     std::vector<BTBHeap> mruList;
 
-    std::queue<std::tuple<Addr, Addr, BTBSet>> aheadReadBtbEntries;
-
     /** BTB configuration parameters */
     unsigned numEntries;    // Total number of entries
     unsigned numWays;       // Number of ways per set
     unsigned numSets;       // Number of sets (numEntries/numWays)
+    unsigned numThreads;    // Number of SMT threads with isolated pipeline state
+    std::vector<ThreadState> threadStates;
 
 #ifdef UNIT_TEST
     uint64_t blockSize{32};  // max size in byte of a Fetch Block
diff --git a/src/cpu/pred/btb/test/abtb.test.cc b/src/cpu/pred/btb/test/abtb.test.cc
index 36c7cc1bd4..654d70cdfb 100644
--- a/src/cpu/pred/btb/test/abtb.test.cc
+++ b/src/cpu/pred/btb/test/abtb.test.cc
@@ -19,6 +19,8 @@ namespace test
 
 FetchTarget createStream(Addr startPC, FullBTBPrediction &pred, AheadBTB *abtb) {
     FetchTarget stream;
+    stream.tid = pred.tid;
+    stream.asidHash = pred.asidHash;
     stream.startPC = startPC;
     Addr fallThroughAddr = pred.getFallThrough(abtb->predictWidth);
     stream.isHit = pred.btbEntries.size() > 0; // TODO: fix isHit and falseHit
@@ -26,7 +28,7 @@ FetchTarget createStream(Addr startPC, FullBTBPrediction &pred, AheadBTB *abtb)
     stream.predBTBEntries = pred.btbEntries;
     stream.predTaken = pred.isTaken();
     stream.predEndPC = fallThroughAddr;
-    stream.predMetas[0] = abtb->getPredictionMeta();
+    stream.predMetas[0] = abtb->getPredictionMeta(stream.tid);
     return stream;
 }
 
@@ -39,13 +41,27 @@ void resolveStream(FetchTarget &stream, bool taken, Addr brPc, Addr target, bool
     stream.exeTaken = taken;
 }
 
-FullBTBPrediction makePrediction(Addr startPC, AheadBTB *abtb) {
+FullBTBPrediction makePrediction(Addr startPC, AheadBTB *abtb,
+                                 ThreadID tid = 0, uint8_t asidHash = 0) {
     std::vector<FullBTBPrediction> stagePreds(2);  // 2 stages
+    for (int i = 0; i < stagePreds.size(); i++) {
+        stagePreds[i].tid = tid;
+        stagePreds[i].asidHash = asidHash;
+        stagePreds[i].bbStart = startPC;
+        stagePreds[i].predSource = i;
+    }
     boost::dynamic_bitset<> history(8, 0); // history does not matter for BTB
     abtb->putPCHistory(startPC, history, stagePreds);
     return stagePreds[1];
 }
 
+void clearAheadPipeline(AheadBTB *abtb, ThreadID tid) {
+    FetchTarget stream;
+    stream.tid = tid;
+    boost::dynamic_bitset<> history(8, 0);
+    abtb->recoverHist(history, stream, 0, false);
+}
+
 void updateBTB(FetchTarget &stream, AheadBTB *abtb, MBTB *mbtb) {
     mbtb->getAndSetNewBTBEntry(stream); // usually called by mbtb, here for testing purpose
     abtb->update(stream);
@@ -151,6 +167,39 @@ TEST_F(ABTBTest, AliasAvoidance){
     EXPECT_EQ(pred_C_test.btbEntries.size(), 0);
 }
 
+TEST_F(ABTBTest, AheadPipelineIsThreadIsolated){
+    AheadBTB twoThreadAbtb(1024, 20, 1, 0, 2);
+
+    Addr t0PrevPC = 0x1000;
+    Addr t0StartPC = 0x2000;
+    Addr t0BrPC = 0x2004;
+    Addr t0Target = 0x3000;
+    Addr t1PrevPC = 0x1040;
+
+    // Train a thread-0 ABTB entry indexed by t0PrevPC and tagged by t0StartPC.
+    auto pred_t0 = makePrediction(t0StartPC, &twoThreadAbtb, 0);
+    auto stream_t0 = createStream(t0StartPC, pred_t0, &twoThreadAbtb);
+    stream_t0.previousPCs.push(t0PrevPC);
+    resolveStream(stream_t0, true, t0BrPC, t0Target, true);
+    updateBTB(stream_t0, &twoThreadAbtb, mbtb);
+
+    clearAheadPipeline(&twoThreadAbtb, 0);
+    clearAheadPipeline(&twoThreadAbtb, 1);
+
+    // Interleave another thread between thread 0's previous/current blocks.
+    // With a shared ahead FIFO, thread 0's current lookup would consume the
+    // set read by thread 1 and miss the trained entry.
+    makePrediction(t0PrevPC, &twoThreadAbtb, 0);
+    makePrediction(t1PrevPC, &twoThreadAbtb, 1);
+    auto pred_t0_test = makePrediction(t0StartPC, &twoThreadAbtb, 0);
+
+    EXPECT_EQ(pred_t0_test.btbEntries.size(), 1);
+    if (!pred_t0_test.btbEntries.empty()) {
+        EXPECT_EQ(pred_t0_test.btbEntries[0].pc, t0BrPC);
+        EXPECT_EQ(pred_t0_test.btbEntries[0].target, t0Target);
+    }
+}
+
 } // namespace test
 } // namespace btb_pred
 } // namespace branch_prediction

From 9845df977d53861866b8611b21b34c38534a120e Mon Sep 17 00:00:00 2001
From: tastynoob <934348725@qq.com>
Date: Mon, 27 Apr 2026 16:32:44 +0800
Subject: [PATCH 22/38] cpu-o3: isolate MDP replay tracking per thread

Change-Id: Ia8509f4290de8b2c7e337892506c69a8d399d493
---
 src/cpu/o3/inst_queue.cc | 64 +++++++++++++++++++++++++---------------
 src/cpu/o3/inst_queue.hh |  5 +++-
 src/cpu/o3/lsq_unit.cc   | 28 ++++++++++++++++++
 src/cpu/o3/lsq_unit.hh   | 18 +++++++++++
 4 files changed, 91 insertions(+), 24 deletions(-)

diff --git a/src/cpu/o3/inst_queue.cc b/src/cpu/o3/inst_queue.cc
index db8ec407f4..dda79556dc 100644
--- a/src/cpu/o3/inst_queue.cc
+++ b/src/cpu/o3/inst_queue.cc
@@ -119,6 +119,17 @@ InstructionQueue::MdpAddrReplayLdInst::MdpAddrReplayLdInst(
 {
 }
 
+bool
+InstructionQueue::hasMdpAddrReplayInsts() const
+{
+    for (const auto &replay_ld_insts : mdpAddrReplayLdInsts) {
+        if (!replay_ld_insts.empty()) {
+            return true;
+        }
+    }
+    return false;
+}
+
 InstructionQueue::InstructionQueue(CPU *cpu_ptr, IEW *iew_ptr,
         const BaseO3CPUParams &params)
     : cpu(cpu_ptr),
@@ -384,7 +395,9 @@ InstructionQueue::resetState()
     deferredMemInsts.clear();
     cacheMissLdInsts.clear();
     stlfFailLdInsts.clear();
-    mdpAddrReplayLdInsts.clear();
+    for (auto &replay_ld_insts : mdpAddrReplayLdInsts) {
+        replay_ld_insts.clear();
+    }
     blockedMemInsts.clear();
     retryMemInsts.clear();
     wbOutstanding = 0;
@@ -683,7 +696,7 @@ InstructionQueue::scheduleReadyInsts()
     // removed from the code below.
     if (total_issued || !retryMemInsts.empty() || !deferredMemInsts.empty() ||
        !cacheMissLdInsts.empty() || !stlfFailLdInsts.empty() ||
-       !mdpAddrReplayLdInsts.empty()) {
+       hasMdpAddrReplayInsts()) {
         cpu->activityThisCycle();
     } else {
         DPRINTF(IQ, "Not able to schedule any instructions.\n");
@@ -708,14 +721,15 @@ InstructionQueue::resolveMdpAddrReplayStoreAddr(const DynInstPtr &store_inst)
 
     const ThreadID tid = store_inst->threadNumber;
     const InstSeqNum store_sn = store_inst->seqNum;
+    auto &replay_ld_insts = mdpAddrReplayLdInsts[tid];
 
-    for (auto it = mdpAddrReplayLdInsts.begin(); it != mdpAddrReplayLdInsts.end();) {
+    for (auto it = replay_ld_insts.begin(); it != replay_ld_insts.end();) {
         if (!it->inst || it->inst->isSquashed()) {
-            it = mdpAddrReplayLdInsts.erase(it);
+            it = replay_ld_insts.erase(it);
             continue;
         }
 
-        if (it->inst->threadNumber != tid || it->strict) {
+        if (it->strict) {
             ++it;
             continue;
         }
@@ -725,7 +739,7 @@ InstructionQueue::resolveMdpAddrReplayStoreAddr(const DynInstPtr &store_inst)
             DPRINTF(IQ, "Load[sn:%llu] MDP addr replay ready (store[sn:%llu] addr ready)\n",
                     it->inst->seqNum, store_sn);
             it->inst->issueQue->retryMem(it->inst);
-            it = mdpAddrReplayLdInsts.erase(it);
+            it = replay_ld_insts.erase(it);
             continue;
         }
         ++it;
@@ -982,8 +996,10 @@ InstructionQueue::mdpAddrReplayRegister(
         return;
     }
 
+    auto &replay_ld_insts = mdpAddrReplayLdInsts[load_inst->threadNumber];
+
     // Avoid duplicate registration for the same dynamic inst.
-    for (const auto &entry : mdpAddrReplayLdInsts) {
+    for (const auto &entry : replay_ld_insts) {
         if (entry.inst && entry.inst->seqNum == load_inst->seqNum) {
             return;
         }
@@ -991,7 +1007,7 @@ InstructionQueue::mdpAddrReplayRegister(
 
     DPRINTF(IQ, "Load[sn:%llu] MDP addr replay register, wait %lu stores\n",
             load_inst->seqNum, store_seq_nums.size());
-    mdpAddrReplayLdInsts.emplace_back(load_inst, store_seq_nums);
+    replay_ld_insts.emplace_back(load_inst, store_seq_nums);
 }
 
 void
@@ -1002,7 +1018,9 @@ InstructionQueue::mdpAddrReplayRegisterStrict(const DynInstPtr &load_inst,
         return;
     }
 
-    for (const auto &entry : mdpAddrReplayLdInsts) {
+    auto &replay_ld_insts = mdpAddrReplayLdInsts[load_inst->threadNumber];
+
+    for (const auto &entry : replay_ld_insts) {
         if (entry.inst && entry.inst->seqNum == load_inst->seqNum) {
             return;
         }
@@ -1010,7 +1028,7 @@ InstructionQueue::mdpAddrReplayRegisterStrict(const DynInstPtr &load_inst,
 
     DPRINTF(IQ, "Load[sn:%llu] MDP strict addr replay register, wait storeCompletedIdx >= %lu\n",
             load_inst->seqNum, required_store_completed_idx);
-    mdpAddrReplayLdInsts.emplace_back(load_inst, required_store_completed_idx);
+    replay_ld_insts.emplace_back(load_inst, required_store_completed_idx);
 }
 
 void
@@ -1021,8 +1039,8 @@ InstructionQueue::mdpAddrReplayPipeDone(const DynInstPtr &load_inst)
     }
 
     const ThreadID tid = load_inst->threadNumber;
-    for (auto it = mdpAddrReplayLdInsts.begin(); it != mdpAddrReplayLdInsts.end();
-         ++it) {
+    auto &replay_ld_insts = mdpAddrReplayLdInsts[tid];
+    for (auto it = replay_ld_insts.begin(); it != replay_ld_insts.end(); ++it) {
         if (!it->inst || it->inst->seqNum != load_inst->seqNum) {
             continue;
         }
@@ -1033,13 +1051,13 @@ InstructionQueue::mdpAddrReplayPipeDone(const DynInstPtr &load_inst)
                 DPRINTF(IQ, "Load[sn:%llu] MDP strict addr replay ready (pipeDone)\n",
                         load_inst->seqNum);
                 load_inst->issueQue->retryMem(load_inst);
-                mdpAddrReplayLdInsts.erase(it);
+                replay_ld_insts.erase(it);
             }
         } else if (it->storeSeqNums.empty()) {
             DPRINTF(IQ, "Load[sn:%llu] MDP addr replay ready (pipeDone)\n",
                     load_inst->seqNum);
             load_inst->issueQue->retryMem(load_inst);
-            mdpAddrReplayLdInsts.erase(it);
+            replay_ld_insts.erase(it);
         }
         return;
     }
@@ -1059,19 +1077,20 @@ InstructionQueue::mdpAddrReplayUpdateStoreCompletedIdx(
     }
 
     mdpStoreCompletedIdx[tid] = store_completed_idx;
+    auto &replay_ld_insts = mdpAddrReplayLdInsts[tid];
 
-    for (auto it = mdpAddrReplayLdInsts.begin(); it != mdpAddrReplayLdInsts.end();) {
+    for (auto it = replay_ld_insts.begin(); it != replay_ld_insts.end();) {
         if (!it->inst || it->inst->isSquashed()) {
-            it = mdpAddrReplayLdInsts.erase(it);
+            it = replay_ld_insts.erase(it);
             continue;
         }
 
-        if (it->inst->threadNumber == tid && it->strict && it->pipeDone &&
+        if (it->strict && it->pipeDone &&
             store_completed_idx >= it->requiredStoreCompletedIdx) {
             DPRINTF(IQ, "Load[sn:%llu] MDP strict addr replay ready (storeCompletedIdx=%lu)\n",
                     it->inst->seqNum, store_completed_idx);
             it->inst->issueQue->retryMem(it->inst);
-            it = mdpAddrReplayLdInsts.erase(it);
+            it = replay_ld_insts.erase(it);
             continue;
         }
         ++it;
@@ -1127,11 +1146,10 @@ InstructionQueue::doSquash(ThreadID tid)
     squashInfo.squashSn  = squashedSeqNum[tid];
     scheduler->doSquash(squashInfo);
 
-    for (auto it = mdpAddrReplayLdInsts.begin(); it != mdpAddrReplayLdInsts.end();) {
-        if (!it->inst ||
-            (it->inst->threadNumber == tid &&
-             it->inst->seqNum > squashedSeqNum[tid])) {
-            it = mdpAddrReplayLdInsts.erase(it);
+    auto &replay_ld_insts = mdpAddrReplayLdInsts[tid];
+    for (auto it = replay_ld_insts.begin(); it != replay_ld_insts.end();) {
+        if (!it->inst || it->inst->seqNum > squashedSeqNum[tid]) {
+            it = replay_ld_insts.erase(it);
         } else {
             ++it;
         }
diff --git a/src/cpu/o3/inst_queue.hh b/src/cpu/o3/inst_queue.hh
index f163ebb28e..1f3790e286 100644
--- a/src/cpu/o3/inst_queue.hh
+++ b/src/cpu/o3/inst_queue.hh
@@ -42,6 +42,7 @@
 #ifndef __CPU_O3_INST_QUEUE_HH__
 #define __CPU_O3_INST_QUEUE_HH__
 
+#include <array>
 #include <list>
 #include <map>
 #include <queue>
@@ -373,10 +374,12 @@ class InstructionQueue
       MdpAddrReplayLdInst(const DynInstPtr &inst,
                           size_t required_store_completed_idx);
     };
-    std::list<MdpAddrReplayLdInst> mdpAddrReplayLdInsts;
+    std::array<std::list<MdpAddrReplayLdInst>, MaxThreads> mdpAddrReplayLdInsts;
 
     size_t mdpStoreCompletedIdx[MaxThreads] = {};
 
+    bool hasMdpAddrReplayInsts() const;
+
     /** List of instructions that have been cache blocked. */
     std::list<DynInstPtr> blockedMemInsts;
 
diff --git a/src/cpu/o3/lsq_unit.cc b/src/cpu/o3/lsq_unit.cc
index 5112ee8c40..127212ecb4 100644
--- a/src/cpu/o3/lsq_unit.cc
+++ b/src/cpu/o3/lsq_unit.cc
@@ -593,6 +593,18 @@ LSQUnit::LSQUnitStats::LSQUnitStats(statistics::Group *parent)
                "Number of load-load violation events"),
       ADD_STAT(stLdViolation, statistics::units::Count::get(),
                "Number of store-load violation events"),
+      ADD_STAT(rawMemOrderViolation, statistics::units::Count::get(),
+               "Number of RAW memory ordering violations"),
+      ADD_STAT(rawViolationMdpNoPred, statistics::units::Count::get(),
+               "Number of RAW violations where replay-based MDP had no producer prediction"),
+      ADD_STAT(rawViolationMdpHit, statistics::units::Count::get(),
+               "Number of RAW violations where replay-based MDP predicted the violating store"),
+      ADD_STAT(rawViolationMdpMiss, statistics::units::Count::get(),
+               "Number of RAW violations where replay-based MDP predicted other stores only"),
+      ADD_STAT(rawViolationMdpStrict, statistics::units::Count::get(),
+               "Number of RAW violations where replay-based MDP used strict wait"),
+      ADD_STAT(loadOrderViolation, statistics::units::Count::get(),
+               "Number of load-load or snoop ordering violations"),
       ADD_STAT(busForwardSuccess, statistics::units::Count::get(),
                "Number of successfully forwarding from bus"),
       ADD_STAT(cacheMissReplayEarly, statistics::units::Count::get(),
@@ -1185,6 +1197,7 @@ LSQUnit::checkViolations(typename LoadQueue::iterator& loadIt,
 
                                 ++stats.memOrderViolation;
                                 ++stats.ldLdViolation;
+                                ++stats.loadOrderViolation;
 
                                 return std::make_shared<GenericISA::M5PanicFault>(
                                     "Detected fault with inst [sn:%lli] and "
@@ -1237,6 +1250,19 @@ LSQUnit::checkViolations(typename LoadQueue::iterator& loadIt,
                             ++stats.stLdViolation;
                             countedStLdViolationThisCycle = true;
                         }
+                        ++stats.rawMemOrderViolation;
+                        if (ld_inst->mdpPredStrictWait) {
+                            ++stats.rawViolationMdpStrict;
+                        } else if (ld_inst->mdpProducingStores.empty()) {
+                            ++stats.rawViolationMdpNoPred;
+                        } else if (std::find(ld_inst->mdpProducingStores.begin(),
+                                             ld_inst->mdpProducingStores.end(),
+                                             inst->seqNum) !=
+                                   ld_inst->mdpProducingStores.end()) {
+                            ++stats.rawViolationMdpHit;
+                        } else {
+                            ++stats.rawViolationMdpMiss;
+                        }
 
                         return std::make_shared<GenericISA::M5PanicFault>(
                             "Detected fault with "
@@ -1353,6 +1379,7 @@ LSQUnit::loadDoSendRequest(const DynInstPtr &inst)
             auto& store_inst = storePipeSx[1]->insts[i];
             if (pipeLineNukeCheck(inst, store_inst)) {
                 DPRINTF(LoadPipeline, "Load [sn:%llu] Nuke need replay\n", inst->seqNum);
+                ++stats.pipeRawNukeReplay;
                 inst->setProducerStorePC(store_inst->pcState().instAddr());
                 inst->setNukeReplay();
                 return NoFault;
@@ -1532,6 +1559,7 @@ LSQUnit::loadDoRecvData(const DynInstPtr &inst)
             return fault;
           case LdStReplayType::NukeReplay:
             DPRINTF(LoadPipeline, "Load [sn:%llu] Nuke need replay\n", inst->seqNum);
+            ++stats.pipeRawNukeReplay;
             return fault;
           default:
             panic("Unsupported load replay type selected in s2");
diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh
index fd2ff7d172..7752b5003e 100644
--- a/src/cpu/o3/lsq_unit.hh
+++ b/src/cpu/o3/lsq_unit.hh
@@ -817,6 +817,24 @@ class LSQUnit
         /** Total number of store-load violation events. */
         statistics::Scalar stLdViolation;
 
+        /** RAW memory ordering violations caused by a younger load. */
+        statistics::Scalar rawMemOrderViolation;
+
+        /** RAW violations where replay-based MDP had no producer prediction. */
+        statistics::Scalar rawViolationMdpNoPred;
+
+        /** RAW violations where replay-based MDP predicted the violating store. */
+        statistics::Scalar rawViolationMdpHit;
+
+        /** RAW violations where replay-based MDP predicted other stores only. */
+        statistics::Scalar rawViolationMdpMiss;
+
+        /** RAW violations where replay-based MDP used strict wait. */
+        statistics::Scalar rawViolationMdpStrict;
+
+        /** Load-load/snoop ordering violations. */
+        statistics::Scalar loadOrderViolation;
+
         /** Tota number of successfully forwarding from bus. */
         statistics::Scalar busForwardSuccess;
 

From e16b2a27dffb8cd7a73e5d4eaeb5882723a4ef6d Mon Sep 17 00:00:00 2001
From: mhnGitHubz <66786667+mhnGitHubz@users.noreply.github.com>
Date: Mon, 27 Apr 2026 16:34:28 +0800
Subject: [PATCH 23/38] cpu-o3: 1. Add scheduler starvation prevention
 mechanism; 2. Modify sbuffer resource allocation mechanism; 3. Vectorize
 waitForVsetvl (#844)

Co-authored-by: mo haonan <mohaonan@node023.bosccluster.com>
---
 src/cpu/o3/fetch.cc       | 39 +++++++++++++++++++++++++++------------
 src/cpu/o3/fetch.hh       |  2 +-
 src/cpu/o3/issue_queue.cc | 21 +++++++++++++++++++--
 src/cpu/o3/lsq.cc         | 12 +++++++++---
 src/cpu/o3/lsq.hh         |  1 +
 src/cpu/o3/lsq_unit.cc    |  5 ++++-
 src/cpu/o3/lsq_unit.hh    |  2 +-
 src/cpu/o3/smt_sched.hh   | 18 ++++++++++++++----
 8 files changed, 76 insertions(+), 24 deletions(-)

diff --git a/src/cpu/o3/fetch.cc b/src/cpu/o3/fetch.cc
index ff31aa9bb9..2fdf1076f5 100644
--- a/src/cpu/o3/fetch.cc
+++ b/src/cpu/o3/fetch.cc
@@ -502,6 +502,7 @@ Fetch::resetStage()
         fetchQueue[tid].clear();
 
         priorityList.push_back(tid);
+        waitForVsetvl[tid] = false;
     }
 
     wroteToTimeBuffer = false;
@@ -1329,14 +1330,12 @@ Fetch::initializeTickState()
         // for each thread.
         bool updated_status = checkSignalsAndUpdate(tid);
         status_change =  status_change || updated_status;
+        if (fromCommit->commitInfo[tid].emptyROB) {
+            waitForVsetvl[tid] = false;
+        }
     }
 
     DPRINTF(Fetch, "Running stage.\n");
-
-    if (fromCommit->commitInfo[0].emptyROB) {
-        waitForVsetvl = false;
-    }
-
     return status_change;
 }
 
@@ -1391,22 +1390,32 @@ Fetch::selectUnstalledThread()
     // if (numThreads == 1) {
     //     return 0;
     // }
+    ThreadID selected = -1;
+    bool all_stalled = true;
     for (ThreadID tid = 0; tid < numThreads; ++tid) {
-        if (!stallSig->blockFetch[tid]) {
+        if (!stallSig->blockFetch[tid] &&fetchQueue[tid].size() > 0) {
             lsqCounter->setCounter(tid, fromIEW->iewInfo[tid].ldstqCount);
             iqCounter->setCounter(tid, fromIEW->iewInfo[tid].iqCount);
             robCounter->setCounter(tid, fromIEW->iewInfo[tid].robCount);
+            all_stalled = false;
            
-        } else {
+        }else {
             lsqCounter->setCounter(tid, UINT64_MAX);
             iqCounter->setCounter(tid, UINT64_MAX);
             robCounter->setCounter(tid, UINT64_MAX);
             
         }
+
+        if(all_stalled)
+        {
+            selected = -1;
+        }else{
+            selected = decodeScheduler->getThread();
+        }
         DPRINTF(Fetch, "lsqCounter->setCounter: %d iqCounter->setCounter: %d robCounter->setCounter: %d\n",fromIEW->iewInfo[tid].ldstqCount,fromIEW->iewInfo[tid].iqCount,fromIEW->iewInfo[tid].robCount);
     }
 
-    ThreadID selected = decodeScheduler->getThread();
+     
     return selected;
 }
 
@@ -1450,6 +1459,12 @@ Fetch::sendInstructionsToDecode()
     }
 
     ThreadID tid =selectUnstalledThread();
+
+    if(tid == -1)
+    {
+        DPRINTF(Fetch, "All threads are stalled, no thread selected.\n");
+        return;
+    }
     DPRINTF(Fetch, "select Unstalled [tid:%i]\n",tid);
 
     // fetch totally stalled
@@ -1997,9 +2012,9 @@ Fetch::processSingleInstruction(ThreadID tid, PCStateBase &pc,
 
     // Special handling for RISC-V vector configuration instructions.
     if (staticInst->isVectorConfig()) {
-        waitForVsetvl = dec_ptr->stall();
-        DPRINTF(Fetch, "[tid:%i] Vector config instruction, waitForVsetvl=%d\n",
-                tid, waitForVsetvl);
+        waitForVsetvl[tid] = dec_ptr->stall();
+        DPRINTF(Fetch, "[tid:%i] Vector config instruction, waitForVsetvl[tid]=%d\n",
+                tid, waitForVsetvl[tid]);
     }
 
     instruction->setVersion(localSquashVer[tid]);
@@ -2076,7 +2091,7 @@ Fetch::performInstructionFetch(ThreadID tid)
     // For decoupled frontend (including trace mode), check FTQ availability
     StallReason stall = StallReason::NoStall;
     while (numInst < fetchWidth && fetchQueue[tid].size() < fetchQueueSize &&
-           !predictedBranch && !ftqEmpty(tid) && !waitForVsetvl) {
+           !predictedBranch && !ftqEmpty(tid) && !waitForVsetvl[tid]) {
 
         // Check memory needs and supply bytes to decoder if required
         stall = checkMemoryNeeds(tid, pc_state, curMacroop);
diff --git a/src/cpu/o3/fetch.hh b/src/cpu/o3/fetch.hh
index 18e6159022..c76bb8d77f 100644
--- a/src/cpu/o3/fetch.hh
+++ b/src/cpu/o3/fetch.hh
@@ -1141,7 +1141,7 @@ public:
 
   private:
 
-    bool waitForVsetvl = false;
+    bool waitForVsetvl[MaxThreads] = {};  // one flag per thread, all initially false
 
     /** Value predictor */
     valuepred::VPUnit *valuePred;
diff --git a/src/cpu/o3/issue_queue.cc b/src/cpu/o3/issue_queue.cc
index d83083a45f..bf5bb1ea47 100644
--- a/src/cpu/o3/issue_queue.cc
+++ b/src/cpu/o3/issue_queue.cc
@@ -646,6 +646,13 @@ IssueQue::selectInst()
     selectQ.clear();
     for (int pi = 0; pi < outports; pi++) {
         auto readyQ = readyQs[pi];
+        // Debug trace only: list every instruction currently ready on this
+        // port before the selector chooses among them.
+        for (auto it = readyQ->begin(); it != readyQ->end(); ++it) {
+            DPRINTF(Schedule, "readyQ for port %d has [sn:%llu] %s [tid:%u]\n", pi, (*it)->seqNum,
+                    (*it)->genDisassembly(), (*it)->threadNumber);
+        }
+
         selector->begin(readyQ);
         for (auto it = selector->select(readyQ->begin(), pi); it != readyQ->end(); it = selector->select(it, pi)) {
             auto& inst = *it;
@@ -659,7 +666,15 @@ IssueQue::selectInst()
             uint64_t busy_bit = (lat > 63 ? -1 : (1llu << lat));
             if (!(portBusy[pi] & busy_bit)) {
                 DPRINTF(Schedule, "[sn %ld] was selected\n", inst->seqNum);
-
+                for (ThreadID tid = 0; tid < MaxThreads; tid++) {
+                    if (inst->threadNumber == tid) {
+                        independentIQICountScheduler->scheduleNum[tid]++;
+                    } else {
+                        independentIQICountScheduler->scheduleNum[tid] = 0;
+                    }
+                }
+                DPRINTF(Schedule, "smtScheduler->scheduleNum[0]=%d, smtScheduler->scheduleNum[1]=%d\n",
+                        independentIQICountScheduler->scheduleNum[0], independentIQICountScheduler->scheduleNum[1]);
                 // get regfile write port
                 for (int i = 0; i < inst->numDestRegs(); i++) {
                     auto pdst = inst->renamedDestIdx(i);
@@ -786,7 +801,7 @@ IssueQue::insert(const DynInstPtr& inst)
 
     cpu->perfCCT->updateInstPos(inst->seqNum, PerfRecord::AtIssueQue);
 
-    DPRINTF(Schedule, "[sn:%llu] %s insert into %s\n", inst->seqNum, enums::OpClassStrings[inst->opClass()], iqname);
+    DPRINTF(Schedule, "[tid:%u] [sn:%llu] %s insert into %s\n", inst->threadNumber, inst->seqNum, enums::OpClassStrings[inst->opClass()], iqname);
     selector->allocate(inst);
     inst->issueQue = this;
     instList.emplace_back(inst);
@@ -906,6 +921,7 @@ IssueQue::incInIQInstsCounter(ThreadID tid)
 {
     if (instsCounter) {
         instsCounter->incCounter(tid);
+        DPRINTF(Schedule, "Thread %d: incInIQInstsCounter to %d\n", tid, instsCounter->getCounter(tid));
     }
     if (iqstats) {
         iqstats->instsNum[tid]++;
@@ -917,6 +933,7 @@ IssueQue::decInIQInstsCounter(ThreadID tid)
 {
     if (instsCounter) {
         instsCounter->decCounter(tid);
+        DPRINTF(Schedule, "Thread %d: decInIQInstsCounter to %d\n", tid, instsCounter->getCounter(tid));
     }
 }
 
diff --git a/src/cpu/o3/lsq.cc b/src/cpu/o3/lsq.cc
index a341c1eaa0..448530ab69 100644
--- a/src/cpu/o3/lsq.cc
+++ b/src/cpu/o3/lsq.cc
@@ -940,6 +940,7 @@ LSQ::processWriteback()
     std::vector<uint32_t> offload_quota(numThreads, 0);
     std::vector<uint32_t> offload_demand(numThreads, 0);
     std::vector<ThreadID> requester_tids;
+    std::vector<bool> offload_fail(numThreads, false);
     requester_tids.reserve(activeThreads->size());
 
     for (ThreadID tid : *activeThreads) {
@@ -992,9 +993,14 @@ LSQ::processWriteback()
         }
     }
     threads = activeThreads->begin();
-    while (threads != end) {
-        ThreadID tid = *threads++;
-        thread[tid].offloadToStoreBuffer(offload_quota[tid]);
+    for (ThreadID tid = 0; tid < numThreads; ++tid) {
+        thread[(nextStoreBufferInsertTid + tid) % numThreads].offloadToStoreBuffer(offload_quota[(nextStoreBufferInsertTid + tid) % numThreads], offload_fail);
+    }
+
+    for (ThreadID tid = 0; tid < numThreads; ++tid) {
+        if (offload_fail[tid]) {
+            nextStoreBufferInsertTid = tid;
+        }
     }
 
     // A fence/flush only waits for the requesting thread's sbuffer domain.
diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh
index 28cb6e0146..397f372c26 100644
--- a/src/cpu/o3/lsq.hh
+++ b/src/cpu/o3/lsq.hh
@@ -1294,6 +1294,7 @@ class LSQ
     uint64_t storeBufferWritebackInactive = 0;
     StoreBufferEntry *blockedSbufferEntry = nullptr;
     ThreadID nextStoreBufferOffloadTid = InvalidThreadID;
+    ThreadID nextStoreBufferInsertTid  = 0;
 
     bool enableBankConflictCheck;
     bool sbufferBankWriteAccurately;
diff --git a/src/cpu/o3/lsq_unit.cc b/src/cpu/o3/lsq_unit.cc
index 127212ecb4..c0e9283c55 100644
--- a/src/cpu/o3/lsq_unit.cc
+++ b/src/cpu/o3/lsq_unit.cc
@@ -2338,7 +2338,7 @@ LSQUnit::countStoreBufferOffloadableEntries(uint32_t max_entries) const
 }
 
 void
-LSQUnit::offloadToStoreBuffer(uint32_t max_entries)
+LSQUnit::offloadToStoreBuffer(uint32_t max_entries, std::vector<bool>& offload_fail)
 {
     assert(!lsq->storeBufferBlocked());
     if (isStoreBlocked) return;
@@ -2415,6 +2415,7 @@ LSQUnit::offloadToStoreBuffer(uint32_t max_entries)
                 if (success) {
                     request->_numOutstandingPackets++;
                 } else {
+                    offload_fail[lsqID] = true;
                     break;
                 }
             }
@@ -2435,6 +2436,7 @@ LSQUnit::offloadToStoreBuffer(uint32_t max_entries)
                 vaddr, paddr, (uint8_t *)storeWBIt->data(), request->_size,
                 request->mainReq()->getByteEnable(), inst->seqNum);
             if (!success) {
+                offload_fail[lsqID] = true;
                 break;
             }
             ++accepted_entries;
@@ -2501,6 +2503,7 @@ LSQUnit::insertStoreBuffer(Addr vaddr, Addr paddr, uint8_t* datas,
         // create new entry
         if (storeBuffer.full()) {
             stats.sbufferFull++;
+            // lsq->nextStoreBufferInsertTid = lsqID;
             DPRINTF(StoreBuffer, "Insert %#x failed due to sbuffer full\n", paddr);
             return false;
         }
diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh
index 7752b5003e..e199d089fc 100644
--- a/src/cpu/o3/lsq_unit.hh
+++ b/src/cpu/o3/lsq_unit.hh
@@ -359,7 +359,7 @@ class LSQUnit
     uint32_t countStoreBufferOffloadableEntries(uint32_t max_entries) const;
 
     /** Writes back stores. */
-    void offloadToStoreBuffer(uint32_t max_entries);
+    void offloadToStoreBuffer(uint32_t max_entries, std::vector<bool>& offload_fail);
 
     bool insertStoreBuffer(Addr vaddr, Addr paddr, uint8_t* datas,
                            uint64_t size, const std::vector<bool>& mask,
diff --git a/src/cpu/o3/smt_sched.hh b/src/cpu/o3/smt_sched.hh
index 74198c44fd..7fcbf5733f 100644
--- a/src/cpu/o3/smt_sched.hh
+++ b/src/cpu/o3/smt_sched.hh
@@ -131,23 +131,33 @@ class IndependentIQICountScheduler : public SMTScheduler {
 private:
      InstsCounter* counter;  // Counter for this IQ only
 
+
 public:
     IndependentIQICountScheduler(int numThreads, InstsCounter* counter)
         : SMTScheduler(numThreads), counter(counter){}
 
     ThreadID getThread() override {
         ThreadID selectedTid = 0;
-        uint64_t minCount = counter->getCounter(0);
-
+        uint64_t maxCount = counter->getCounter(0);
+        // Picks the thread with the largest counter value, except that a
+        // thread whose scheduleNum has reached 100 yields to the other one.
+        // The thread-0 guard needs a second thread to yield to.
+        if (numThreads > 1 && scheduleNum[0] >= 100)
+            return 1;
         for (ThreadID tid = 1; tid < numThreads; ++tid) {
+            if (scheduleNum[tid] >= 100) {
+                return 0;
+            }
             uint64_t count = counter->getCounter(tid);
-            if (count < minCount) {
-                minCount = count;
+            if (count > maxCount) {
+                maxCount = count;
                 selectedTid = tid;
             }
         }
         return selectedTid;
     }
+    // Per-thread selection counters read by getThread(); zero-initialised.
+    int scheduleNum[MaxThreads] = {};
 };
 
 }}

From 032f5757fbdc09ca9c1a6daf564f49015058d6a2 Mon Sep 17 00:00:00 2001
From: tastynoob <934348725@qq.com>
Date: Mon, 27 Apr 2026 16:59:09 +0800
Subject: [PATCH 24/38] cpu: preserve BTB tag bits when hashing ASID

Change-Id: I30d0a510565c278349b4d4b915e84ead21f8c6ae
---
 src/cpu/pred/btb/common.hh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/cpu/pred/btb/common.hh b/src/cpu/pred/btb/common.hh
index e40dee3cf2..0e603a6f04 100644
--- a/src/cpu/pred/btb/common.hh
+++ b/src/cpu/pred/btb/common.hh
@@ -49,7 +49,8 @@ injectAsidHashIntoTag(Addr base_tag, unsigned tag_bits, uint8_t asid_hash)
 
     const unsigned hash_bits = std::min<unsigned>(4, tag_bits);
     const Addr hash_mask = mask(hash_bits);
-    return (base_tag & ~hash_mask) | (static_cast<Addr>(asid_hash) & hash_mask);
+    return (base_tag ^ (static_cast<Addr>(asid_hash) & hash_mask)) &
+           mask(tag_bits);
 }
 
 inline Addr

From 036db121e7a7fc26efb0610d8003afb4dccc8893 Mon Sep 17 00:00:00 2001
From: tastynoob <934348725@qq.com>
Date: Tue, 28 Apr 2026 10:40:24 +0800
Subject: [PATCH 25/38] cpu-o3: isolate committed stream state per thread

Change-Id: Id6528ad1ec2b2ad7a26cd0ef18030b8f671c3302
---
 src/cpu/o3/commit.cc | 10 ++++++----
 src/cpu/o3/commit.hh |  4 ++--
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/cpu/o3/commit.cc b/src/cpu/o3/commit.cc
index 746a39872b..668f222553 100644
--- a/src/cpu/o3/commit.cc
+++ b/src/cpu/o3/commit.cc
@@ -188,6 +188,8 @@ Commit::Commit(CPU *_cpu, branch_prediction::BPredUnit *_bp, const BaseO3CPUPara
         htmStarts[tid] = 0;
         htmStops[tid] = 0;
         traceCommitIndex[tid] = 0;
+        committedTargetId[tid] = 1;
+        committedLoopIter[tid] = 0;
         fixedbuffer[tid] = boost::circular_buffer<DynInstPtr>(renameWidth);
     }
     interrupt = NoFault;
@@ -725,8 +727,8 @@ Commit::squashAll(ThreadID tid)
 
     set(toIEW->commitInfo[tid].pc, pc[tid]);
 
-    toIEW->commitInfo[tid].squashedTargetId = committedTargetId;
-    toIEW->commitInfo[tid].squashedLoopIter = committedLoopIter;
+    toIEW->commitInfo[tid].squashedTargetId = committedTargetId[tid];
+    toIEW->commitInfo[tid].squashedLoopIter = committedLoopIter[tid];
 
     cpu->mmu->useNewPriv(cpu->getContext(tid));
 
@@ -1412,8 +1414,8 @@ Commit::commitInsts()
                     if (head_inst->getFtqId() > 1) {
                         toIEW->commitInfo[tid].doneFtqId = head_inst->getFtqId() - 1;
                     }
-                    committedTargetId = head_inst->getFtqId();
-                    committedLoopIter = head_inst->getLoopIteration();
+                    committedTargetId[tid] = head_inst->getFtqId();
+                    committedLoopIter[tid] = head_inst->getLoopIteration();
 
                     if (tid == 0)
                         canHandleInterrupts = !head_inst->isDelayedCommit();
diff --git a/src/cpu/o3/commit.hh b/src/cpu/o3/commit.hh
index 3c83b610e5..27ac59157e 100644
--- a/src/cpu/o3/commit.hh
+++ b/src/cpu/o3/commit.hh
@@ -567,8 +567,8 @@ class Commit
 
     // committed Stream and Target
 
-    uint64_t committedTargetId{1};
-    uint64_t committedLoopIter{};
+    uint64_t committedTargetId[MaxThreads];
+    uint64_t committedLoopIter[MaxThreads];
 
     struct CommitStats : public statistics::Group
     {

From ada41dc4e5a04217fc793e7cfd5d7eab473bff37 Mon Sep 17 00:00:00 2001
From: tastynoob <934348725@qq.com>
Date: Tue, 28 Apr 2026 16:08:07 +0800
Subject: [PATCH 26/38] cpu: Isolate VPU state per thread

Change-Id: Ica716ed754083ab470c1eace992837b454547b55
---
 src/cpu/o3/commit.cc                    |  5 +--
 src/cpu/o3/fetch.cc                     |  1 +
 src/cpu/valuepred/ValuePredictor.py     |  1 +
 src/cpu/valuepred/enhanced_stride.cc    | 41 ++++++++++++++++---------
 src/cpu/valuepred/enhanced_stride.hh    |  7 +++--
 src/cpu/valuepred/ideal_constant_lvp.cc |  9 +++++-
 src/cpu/valuepred/ideal_constant_lvp.hh |  5 +--
 src/cpu/valuepred/valuepred_metadata.hh |  3 ++
 src/cpu/valuepred/valuepred_unit.cc     | 16 +++++++++-
 src/cpu/valuepred/valuepred_unit.hh     |  8 ++++-
 10 files changed, 72 insertions(+), 24 deletions(-)

diff --git a/src/cpu/o3/commit.cc b/src/cpu/o3/commit.cc
index 668f222553..2257e1bd46 100644
--- a/src/cpu/o3/commit.cc
+++ b/src/cpu/o3/commit.cc
@@ -707,7 +707,7 @@ Commit::squashAll(ThreadID tid)
     changedROBNumEntries[tid] = true;
 
     if (valuePred)
-        valuePred->squash(squashed_inst);
+        valuePred->squash(tid, squashed_inst);
 
     // Send back the sequence number of the squashed instruction.
     toIEW->commitInfo[tid].doneSeqNum = squashed_inst;
@@ -1099,7 +1099,7 @@ Commit::commit()
             changedROBNumEntries[tid] = true;
 
             if (valuePred)
-                valuePred->squash(squashed_inst);
+                valuePred->squash(tid, squashed_inst);
 
             toIEW->commitInfo[tid].doneSeqNum = squashed_inst;
             toIEW->commitInfo[tid].doneMemSeqNum = squashed_inst;
@@ -1928,6 +1928,7 @@ Commit::commitHead(const DynInstPtr &head_inst, unsigned inst_num)
                     VPDataStructFactory::buildUpdateMetaData(valuePred->getValuePredictorType());
         updateMetaData->pc = head_inst->getPC();
         updateMetaData->seq_no = head_inst->seqNum;
+        updateMetaData->tid = tid;
         updateMetaData->actualValue = head_inst->actualValue;
         updateMetaData->isMisprediction = head_inst->vpMisprediction;
         valuePred->updateValuePredictor(updateMetaData);
diff --git a/src/cpu/o3/fetch.cc b/src/cpu/o3/fetch.cc
index 2fdf1076f5..0642e5a587 100644
--- a/src/cpu/o3/fetch.cc
+++ b/src/cpu/o3/fetch.cc
@@ -2061,6 +2061,7 @@ Fetch::processSingleInstruction(ThreadID tid, PCStateBase &pc,
 
         vpPredMetaData->pc = instruction->getPC();
         vpPredMetaData->seq_no = instruction->seqNum;
+        vpPredMetaData->tid = tid;
         instruction->vpResult = valuePred->valuePredict(vpPredMetaData);
         delete vpPredMetaData;
     }
diff --git a/src/cpu/valuepred/ValuePredictor.py b/src/cpu/valuepred/ValuePredictor.py
index 1b586060ad..65bc7b606c 100644
--- a/src/cpu/valuepred/ValuePredictor.py
+++ b/src/cpu/valuepred/ValuePredictor.py
@@ -11,6 +11,7 @@ class ValuePredictor(SimObject):
     cxx_class = "gem5::valuepred::VPUnit"
     cxx_header = "cpu/valuepred/valuepred_unit.hh"
     abstract = True
+    numThreads = Param.Unsigned(Parent.numThreads, "Number of threads")
 
 class EStride(ValuePredictor):
     type = "EStride"
diff --git a/src/cpu/valuepred/enhanced_stride.cc b/src/cpu/valuepred/enhanced_stride.cc
index ffbce18571..5707e2b40c 100644
--- a/src/cpu/valuepred/enhanced_stride.cc
+++ b/src/cpu/valuepred/enhanced_stride.cc
@@ -108,7 +108,7 @@ EStride::EStride(const Params &params)
       logMaxConfidence(params.logMaxConfidence),
       MAXCONFIDENCE(1 << logMaxConfidence),
       confidenceThreshold(static_cast<int>(params.thresholdPercent * MAXCONFIDENCE)),
-      inflightWindow(params.inflightWindowTagLength, params.idealWindow),
+      inflightWindows(),
       enableTimeMsgInUpdate(params.enableTimeMsgInUpdate),
       esstats(this)
 {
@@ -122,9 +122,17 @@ EStride::EStride(const Params &params)
     gem5_assert(params.inflightWindowTagLength, "EStride inflightWindowTagLength must > 0 \n");
 
     // init stats
-    ESTables.resize(ways);
-    for (auto &table : ESTables) {
-        table.resize(entryCounts);
+    inflightWindows.reserve(numThreads);
+    for (ThreadID tid = 0; tid < numThreads; ++tid) {
+        inflightWindows.emplace_back(params.inflightWindowTagLength, params.idealWindow);
+    }
+
+    ESTables.resize(numThreads);
+    for (auto &threadTables : ESTables) {
+        threadTables.resize(ways);
+        for (auto &table : threadTables) {
+            table.resize(entryCounts);
+        }
     }
 
     esstats.allocate.init(ways, entryCounts);
@@ -289,8 +297,9 @@ EStride::doPredict(ESPredMetaData *esPredMetaData, int inflights)
     int way;
     uint32_t index;
     ESEntry entryCopy;
+    const ThreadID tid = esPredMetaData->tid;
     for (int i = 0; i < ways; ++i) {
-        const ESEntry &entry = ESTables[i][indexEachWays[i]];
+        const ESEntry &entry = ESTables[tid][i][indexEachWays[i]];
         if (!compareTags(entry.tag, tagEachWays[i])) {
             found = true;
             way = i;
@@ -329,10 +338,11 @@ EStride::valuePredict(VPPredMetaData *predMetaData)
 {
     gem5_assert(predMetaData, "can't pass nullptr to vpunit\n");
     ESPredMetaData *esPredMetaData = dynamic_cast<ESPredMetaData *>(predMetaData);
+    assertValidTid(esPredMetaData->tid);
 
 
     // value prediction
-    int inflights = inflightWindow.addToInflightWindow(esPredMetaData->pc);
+    int inflights = inflightWindows[esPredMetaData->tid].addToInflightWindow(esPredMetaData->pc);
     esstats.inflightSH.sample(inflights, 1);
 
     return doPredict(esPredMetaData, inflights);
@@ -343,10 +353,12 @@ EStride::updateValuePredictor(VPUpdateMetaData *updateMetaData)
 {
     gem5_assert(updateMetaData, "can't pass nullptr to vpunit\n");
     ESUpdateMetaData *esUpdateMetaData = dynamic_cast<ESUpdateMetaData *>(updateMetaData);
+    assertValidTid(esUpdateMetaData->tid);
+    const ThreadID tid = esUpdateMetaData->tid;
 
 
     // the first step update inflights window
-    inflightWindow.removeFromWindow(esUpdateMetaData->pc, esUpdateMetaData->seq_no);
+    inflightWindows[tid].removeFromWindow(esUpdateMetaData->pc, esUpdateMetaData->seq_no);
 
     // Given the nature of the current hash method, the same PC gets the
     // same hash value every time it is computed. So instead of storing
@@ -387,7 +399,7 @@ EStride::updateValuePredictor(VPUpdateMetaData *updateMetaData)
     int way;
     uint32_t index;
     for (size_t i = 0; i < ways; ++i) {
-        const ESEntry &entry = ESTables[i][indexEachWays[i]];
+        const ESEntry &entry = ESTables[tid][i][indexEachWays[i]];
         // todo maybe change the occupied
         if (!compareTags(entry.tag, tagEachWays[i])) {
             found = true;
@@ -400,7 +412,7 @@ EStride::updateValuePredictor(VPUpdateMetaData *updateMetaData)
 
     if (found) {
         // update
-        ESEntry &entry = ESTables[way][index];
+        ESEntry &entry = ESTables[tid][way][index];
         DPRINTF(EStride, "[way: %d index: %u][confidence: %d  useful: %d lastValue: %lu]\n", way, index,
                 entry.confidence, entry.useful, entry.lastValue);
 
@@ -468,7 +480,7 @@ EStride::updateValuePredictor(VPUpdateMetaData *updateMetaData)
 
         // first find no confidence
         for (size_t i = 0; i < ways; ++i) {
-            ESEntry &entry = ESTables[wayBegin][indexEachWays[wayBegin]];
+            ESEntry &entry = ESTables[tid][wayBegin][indexEachWays[wayBegin]];
             if (entry.confidence == 0) {
                 DPRINTF(EStride, "allocate by confidence: [way: %d index: %u] \n", wayBegin, indexEachWays[wayBegin]);
                 esstats.allocate[wayBegin][indexEachWays[wayBegin]]++;
@@ -485,7 +497,7 @@ EStride::updateValuePredictor(VPUpdateMetaData *updateMetaData)
 
         // second find not useful
         for (size_t i = 0; i < ways; ++i) {
-            ESEntry &entry = ESTables[wayBegin][indexEachWays[wayBegin]];
+            ESEntry &entry = ESTables[tid][wayBegin][indexEachWays[wayBegin]];
             if (entry.useful == 0) {
                 DPRINTF(EStride, "allocate by useful: [way: %d index: %u] \n", wayBegin, indexEachWays[wayBegin]);
                 esstats.allocate[wayBegin][indexEachWays[wayBegin]]++;
@@ -501,7 +513,7 @@ EStride::updateValuePredictor(VPUpdateMetaData *updateMetaData)
         }
 
         // can't allocate, just random dec some useful count
-        ESEntry &entry = ESTables[wayBegin][indexEachWays[wayBegin]];
+        ESEntry &entry = ESTables[tid][wayBegin][indexEachWays[wayBegin]];
         DPRINTF(EStride, "try dec useful \n");
         if (tryDecUseful(entry) == 0) {
             DPRINTF(EStride, "[dec useful count]=> way: %d index: %d", wayBegin, indexEachWays[wayBegin]);
@@ -518,9 +530,10 @@ EStride::specUpdateValuePredictor(VPSpecUpdateMetaData *specUpdateMetaData)
 }
 
 void
-EStride::squash(const uint64_t seq_no)
+EStride::squash(ThreadID tid, const uint64_t seq_no)
 {
-    inflightWindow.squash(seq_no);
+    assertValidTid(tid);
+    inflightWindows[tid].squash(seq_no);
 }
 
 }
diff --git a/src/cpu/valuepred/enhanced_stride.hh b/src/cpu/valuepred/enhanced_stride.hh
index 5fc89c1730..d67509122d 100644
--- a/src/cpu/valuepred/enhanced_stride.hh
+++ b/src/cpu/valuepred/enhanced_stride.hh
@@ -110,12 +110,13 @@ class EStride : public VPUnit
     const int logMaxConfidence;
     const int MAXCONFIDENCE;
     const int confidenceThreshold;
-    InflightWindow inflightWindow;
+    std::vector<InflightWindow> inflightWindows;
     const bool enableTimeMsgInUpdate;
 
   private:
 
-    std::vector<std::vector<ESEntry>> ESTables;
+    // [tid][way][index]
+    std::vector<std::vector<std::vector<ESEntry>>> ESTables;
 
   private:
     // This function really implements the prediction function.
@@ -155,7 +156,7 @@ class EStride : public VPUnit
     // speculative updates may no longer be needed.
     virtual void specUpdateValuePredictor(VPSpecUpdateMetaData *specUpdateMetaData) override;
 
-    virtual void squash(const uint64_t seq_no) override;
+    virtual void squash(ThreadID tid, const uint64_t seq_no) override;
 
     virtual ValuePredType getValuePredictorType() override { return ValuePredType::EStride; }
 
diff --git a/src/cpu/valuepred/ideal_constant_lvp.cc b/src/cpu/valuepred/ideal_constant_lvp.cc
index 1d70397d19..823ad202b4 100644
--- a/src/cpu/valuepred/ideal_constant_lvp.cc
+++ b/src/cpu/valuepred/ideal_constant_lvp.cc
@@ -12,6 +12,7 @@ namespace valuepred
 
 IdealConstantLVP::IdealConstantLVP(const Params &params)
     : VPUnit(params),
+      idealConstTables(params.numThreads),
       satCounterBits(params.satCounterBits),
       resetConfidence(params.resetConfidence)
 {
@@ -20,6 +21,8 @@ IdealConstantLVP::IdealConstantLVP(const Params &params)
 VPResult
 IdealConstantLVP::valuePredict(VPPredMetaData *predMetaData)
 {
+    assertValidTid(predMetaData->tid);
+    auto &idealConstTable = idealConstTables[predMetaData->tid];
     auto it = idealConstTable.find(predMetaData->pc);
     if (it != idealConstTable.end()) {
         if (it->second.confidence.isSaturated()) {
@@ -32,6 +35,8 @@ IdealConstantLVP::valuePredict(VPPredMetaData *predMetaData)
 void
 IdealConstantLVP::updateValuePredictor(VPUpdateMetaData *updateMetaData)
 {
+    assertValidTid(updateMetaData->tid);
+    auto &idealConstTable = idealConstTables[updateMetaData->tid];
     auto it = idealConstTable.find(updateMetaData->pc);
     if (it == idealConstTable.end()) {
         // Not found, allocate a new entry
@@ -63,8 +68,10 @@ IdealConstantLVP::specUpdateValuePredictor(VPSpecUpdateMetaData *specUpdateMetaD
 }
 
 void
-IdealConstantLVP::squash(const uint64_t seq_no)
+IdealConstantLVP::squash(ThreadID tid, const uint64_t seq_no)
 {
+    (void)tid;
+    (void)seq_no;
     // Do nothing
 }
 
diff --git a/src/cpu/valuepred/ideal_constant_lvp.hh b/src/cpu/valuepred/ideal_constant_lvp.hh
index de8eaba0fd..5d380670e4 100644
--- a/src/cpu/valuepred/ideal_constant_lvp.hh
+++ b/src/cpu/valuepred/ideal_constant_lvp.hh
@@ -2,6 +2,7 @@
 #define __IDEAL_CONSTANT_LVP_HH__
 
 #include <unordered_map>
+#include <vector>
 
 #include "base/sat_counter.hh"
 #include "base/types.hh"
@@ -30,7 +31,7 @@ class IdealConstantLVP : public VPUnit
         }
     };
 
-    std::unordered_map<Addr, ICEntry> idealConstTable;
+    std::vector<std::unordered_map<Addr, ICEntry>> idealConstTables;
 
     const unsigned satCounterBits;
     const bool resetConfidence;
@@ -46,7 +47,7 @@ class IdealConstantLVP : public VPUnit
 
     void specUpdateValuePredictor(VPSpecUpdateMetaData *specUpdateMetaData) override;
 
-    void squash(const uint64_t seq_no) override;
+    void squash(ThreadID tid, const uint64_t seq_no) override;
 
     virtual ValuePredType getValuePredictorType() override { return ValuePredType::IdealConstantLVP; }
 };
diff --git a/src/cpu/valuepred/valuepred_metadata.hh b/src/cpu/valuepred/valuepred_metadata.hh
index d5cfc2a975..75454d34aa 100644
--- a/src/cpu/valuepred/valuepred_metadata.hh
+++ b/src/cpu/valuepred/valuepred_metadata.hh
@@ -15,6 +15,7 @@ class VPPredMetaData
   public:
     Addr pc;
     uint64_t seq_no;
+    ThreadID tid = 0;
     virtual ~VPPredMetaData() {};
 };
 
@@ -23,6 +24,7 @@ class VPUpdateMetaData
   public:
     Addr pc;
     uint64_t seq_no;
+    ThreadID tid = 0;
     RegVal actualValue;
     bool isMisprediction;
     virtual ~VPUpdateMetaData() {};
@@ -31,6 +33,7 @@ class VPUpdateMetaData
 class VPSpecUpdateMetaData
 {
   public:
+    ThreadID tid = 0;
     virtual ~VPSpecUpdateMetaData() {};
 };
 
diff --git a/src/cpu/valuepred/valuepred_unit.cc b/src/cpu/valuepred/valuepred_unit.cc
index 97c7c7ebda..cd7ece5812 100644
--- a/src/cpu/valuepred/valuepred_unit.cc
+++ b/src/cpu/valuepred/valuepred_unit.cc
@@ -1,5 +1,6 @@
 #include "cpu/valuepred/valuepred_unit.hh"
 
+#include "base/logging.hh"
 #include "base/stats/group.hh"
 #include "base/stats/units.hh"
 
@@ -9,7 +10,20 @@ namespace gem5
 namespace valuepred
 {
 
-VPUnit::VPUnit(const Params &params) : SimObject(params), stats(this) {}
+VPUnit::VPUnit(const Params &params)
+    : SimObject(params),
+      numThreads(params.numThreads),
+      stats(this)
+{
+    gem5_assert(numThreads > 0, "Value predictor needs at least one thread\n");
+}
+
+void
+VPUnit::assertValidTid(ThreadID tid) const
+{
+    gem5_assert(tid < numThreads, "%s got invalid tid %u, numThreads=%u\n",
+                name().c_str(), static_cast<unsigned>(tid), numThreads);
+}
 
 VPUnit::ValuePredUnitStats::ValuePredUnitStats(VPUnit *vp)
     : statistics::Group(vp),
diff --git a/src/cpu/valuepred/valuepred_unit.hh b/src/cpu/valuepred/valuepred_unit.hh
index 4380cdeaed..52d36b0121 100644
--- a/src/cpu/valuepred/valuepred_unit.hh
+++ b/src/cpu/valuepred/valuepred_unit.hh
@@ -4,6 +4,7 @@
 #include <string>
 
 #include "base/statistics.hh"
+#include "base/types.hh"
 #include "cpu/valuepred/valuepred_metadata.hh"
 #include "enums/ValuePredType.hh"
 #include "params/ValuePredictor.hh"
@@ -23,6 +24,11 @@ class VPUnit : public SimObject
   private:
     using Params = ValuePredictorParams;
 
+  protected:
+    const unsigned numThreads;
+
+    void assertValidTid(ThreadID tid) const;
+
   public:
     VPUnit(const Params &params);
 
@@ -38,7 +44,7 @@ class VPUnit : public SimObject
     virtual void specUpdateValuePredictor(VPSpecUpdateMetaData *specupdateMetadata) = 0;
 
     // If predict error, squash the inflight instructions in value predictor.
-    virtual void squash(const uint64_t seq_no) = 0;
+    virtual void squash(ThreadID tid, const uint64_t seq_no) = 0;
 
     // Get the value predictor type
     virtual ValuePredType getValuePredictorType() = 0;

From 004a04603639c09dc1894be5617d6d495f0ee3e6 Mon Sep 17 00:00:00 2001
From: tastynoob <934348725@qq.com>
Date: Wed, 29 Apr 2026 10:14:53 +0800
Subject: [PATCH 27/38] mem-cache: Preserve prefetch context

Carry the triggering request context through prefetch metadata and final HardPFReq creation. Use context-aware prefetch queue matching so SMT threads with the same virtual address do not squash or deduplicate each other's prefetches.

Validation:

- scons build/RISCV/gem5.opt -j16

- git diff --check

Change-Id: I6a40826b47bff2e7ee8c7748eecd8622b44ca3c3
---
 src/mem/cache/prefetch/base.cc   | 29 +++++++++++++++---
 src/mem/cache/prefetch/base.hh   | 52 ++++++++++++++++++++++++++++++--
 src/mem/cache/prefetch/queued.cc | 22 ++++++++++----
 src/mem/cache/prefetch/worker.cc |  6 ++--
 4 files changed, 94 insertions(+), 15 deletions(-)

diff --git a/src/mem/cache/prefetch/base.cc b/src/mem/cache/prefetch/base.cc
index 0fd233b369..109c75c6b0 100644
--- a/src/mem/cache/prefetch/base.cc
+++ b/src/mem/cache/prefetch/base.cc
@@ -63,7 +63,11 @@ namespace prefetch
 
 Base::PrefetchInfo::PrefetchInfo(PacketPtr pkt, Addr addr, bool miss)
   : address(addr), pc(pkt->req->hasPC() ? pkt->req->getPC() : 0),
-    requestorId(pkt->req->requestorId()), validPC(pkt->req->hasPC()),
+    requestorId(pkt->req->requestorId()),
+    _contextId(pkt->req->hasContextId() ?
+        pkt->req->contextId() : InvalidContextID),
+    validContextId(pkt->req->hasContextId()),
+    validPC(pkt->req->hasPC()),
     secure(pkt->isSecure()), size(pkt->req->getSize()), write(pkt->isWrite()),
     paddress(pkt->req->getPaddr()), cacheMiss(miss)
 {
@@ -86,7 +90,11 @@ Base::PrefetchInfo::PrefetchInfo(
     PacketPtr pkt, Addr addr, bool miss,
     Request::XsMetadata xsMeta
 ) : address(addr), pc(pkt->req->hasPC() ? pkt->req->getPC() : 0),
-    requestorId(pkt->req->requestorId()), validPC(pkt->req->hasPC()),
+    requestorId(pkt->req->requestorId()),
+    _contextId(pkt->req->hasContextId() ?
+        pkt->req->contextId() : InvalidContextID),
+    validContextId(pkt->req->hasContextId()),
+    validPC(pkt->req->hasPC()),
     secure(pkt->isSecure()), size(pkt->req->getSize()), write(pkt->isWrite()),
     paddress(pkt->req->getPaddr()), cacheMiss(miss), xsMetadata(xsMeta)
 {
@@ -107,6 +115,7 @@ Base::PrefetchInfo::PrefetchInfo(
 
 Base::PrefetchInfo::PrefetchInfo(PrefetchInfo const &pfi, Addr addr)
   : address(addr), pc(pfi.pc), requestorId(pfi.requestorId),
+    _contextId(pfi._contextId), validContextId(pfi.validContextId),
     validPC(pfi.validPC), secure(pfi.secure), size(pfi.size),
     write(pfi.write), paddress(pfi.paddress), cacheMiss(pfi.cacheMiss),
     data(nullptr),data_ptr(nullptr)
@@ -114,6 +123,7 @@ Base::PrefetchInfo::PrefetchInfo(PrefetchInfo const &pfi, Addr addr)
 }
 Base::PrefetchInfo::PrefetchInfo(PrefetchInfo_old const &pfi)
   : address(pfi.address), pc(pfi.pc), requestorId(pfi.requestorId),
+    _contextId(pfi._contextId), validContextId(pfi.validContextId),
     validPC(pfi.validPC), secure(pfi.secure), size(pfi.size),
     write(pfi.write), paddress(pfi.paddress), cacheMiss(pfi.cacheMiss),
     data(nullptr),data_ptr(nullptr)
@@ -121,7 +131,11 @@ Base::PrefetchInfo::PrefetchInfo(PrefetchInfo_old const &pfi)
 }
 Base::PrefetchInfo_old::PrefetchInfo_old(PacketPtr pkt, Addr addr, bool miss)
   : address(addr), pc(pkt->req->hasPC() ? pkt->req->getPC() : 0),
-    requestorId(pkt->req->requestorId()), validPC(pkt->req->hasPC()),
+    requestorId(pkt->req->requestorId()),
+    _contextId(pkt->req->hasContextId() ?
+        pkt->req->contextId() : InvalidContextID),
+    validContextId(pkt->req->hasContextId()),
+    validPC(pkt->req->hasPC()),
     secure(pkt->isSecure()), size(pkt->req->getSize()), write(pkt->isWrite()),
     paddress(pkt->req->getPaddr()), cacheMiss(miss)
 {
@@ -144,7 +158,11 @@ Base::PrefetchInfo_old::PrefetchInfo_old(
     PacketPtr pkt, Addr addr, bool miss,
     Request::XsMetadata xsMeta
 ) : address(addr), pc(pkt->req->hasPC() ? pkt->req->getPC() : 0),
-    requestorId(pkt->req->requestorId()), validPC(pkt->req->hasPC()),
+    requestorId(pkt->req->requestorId()),
+    _contextId(pkt->req->hasContextId() ?
+        pkt->req->contextId() : InvalidContextID),
+    validContextId(pkt->req->hasContextId()),
+    validPC(pkt->req->hasPC()),
     secure(pkt->isSecure()), size(pkt->req->getSize()), write(pkt->isWrite()),
     paddress(pkt->req->getPaddr()), cacheMiss(miss), xsMetadata(xsMeta)
 {
@@ -164,6 +182,7 @@ Base::PrefetchInfo_old::PrefetchInfo_old(
 }
 Base::PrefetchInfo_old::PrefetchInfo_old(PrefetchInfo_old const &other)
   : address(other.address), pc(other.pc), requestorId(other.requestorId),
+    _contextId(other._contextId), validContextId(other.validContextId),
     validPC(other.validPC), secure(other.secure), size(other.size),
     write(other.write), paddress(other.paddress), cacheMiss(other.cacheMiss),
     data(nullptr),data_ptr(nullptr)
@@ -172,6 +191,7 @@ Base::PrefetchInfo_old::PrefetchInfo_old(PrefetchInfo_old const &other)
 }
 Base::PrefetchInfo_old::PrefetchInfo_old(PrefetchInfo_old const &pfi, Addr addr)
   : address(addr), pc(pfi.pc), requestorId(pfi.requestorId),
+    _contextId(pfi._contextId), validContextId(pfi.validContextId),
     validPC(pfi.validPC), secure(pfi.secure), size(pfi.size),
     write(pfi.write), paddress(pfi.paddress), cacheMiss(pfi.cacheMiss),
     data(nullptr),data_ptr(nullptr)
@@ -179,6 +199,7 @@ Base::PrefetchInfo_old::PrefetchInfo_old(PrefetchInfo_old const &pfi, Addr addr)
 }
 Base::PrefetchInfo_old::PrefetchInfo_old(PrefetchInfo const &pfi)
   : address(pfi.address), pc(pfi.pc), requestorId(pfi.requestorId),
+    _contextId(pfi._contextId), validContextId(pfi.validContextId),
     validPC(pfi.validPC), secure(pfi.secure), size(pfi.size),
     write(pfi.write), paddress(pfi.paddress), cacheMiss(pfi.cacheMiss),
     data(nullptr),data_ptr(nullptr)
diff --git a/src/mem/cache/prefetch/base.hh b/src/mem/cache/prefetch/base.hh
index 2ba250fa48..f1a6fde397 100644
--- a/src/mem/cache/prefetch/base.hh
+++ b/src/mem/cache/prefetch/base.hh
@@ -163,6 +163,10 @@ class Base : public ClockedObject
         Addr pc;
         /** The requestor ID that generated this address. */
         RequestorID requestorId;
+        /** The thread context that generated this address. */
+        ContextID _contextId;
+        /** Whether the thread context is valid. */
+        bool validContextId;
         /** Validity bit for the PC of this address. */
         bool validPC;
         /** Whether this address targets the secure memory space. */
@@ -242,6 +246,17 @@ class Base : public ClockedObject
             return requestorId;
         }
 
+        bool hasContextId() const
+        {
+            return validContextId;
+        }
+
+        ContextID contextId() const
+        {
+            assert(hasContextId());
+            return _contextId;
+        }
+
         /**
          * Gets the size of the request triggering this event
          * @return the size in bytes of the request triggering this event
@@ -317,7 +332,16 @@ class Base : public ClockedObject
         bool sameAddr(PrefetchInfo const &pfi) const
         {
             return this->getAddr() == pfi.getAddr() &&
-                this->isSecure() == pfi.isSecure();
+                this->isSecure() == pfi.isSecure() &&
+                this->sameContext(pfi);
+        }
+
+        bool sameContext(PrefetchInfo const &pfi) const
+        {
+            if (hasContextId() != pfi.hasContextId()) {
+                return false;
+            }
+            return !hasContextId() || _contextId == pfi.contextId();
         }
 
         bool sameAddr(Addr addr, bool isSecure) const
@@ -407,6 +431,10 @@ class Base : public ClockedObject
         Addr pc;
         /** The requestor ID that generated this address. */
         RequestorID requestorId;
+        /** The thread context that generated this address. */
+        ContextID _contextId;
+        /** Whether the thread context is valid. */
+        bool validContextId;
         /** Validity bit for the PC of this address. */
         bool validPC;
         /** Whether this address targets the secure memory space. */
@@ -486,6 +514,17 @@ class Base : public ClockedObject
             return requestorId;
         }
 
+        bool hasContextId() const
+        {
+            return validContextId;
+        }
+
+        ContextID contextId() const
+        {
+            assert(hasContextId());
+            return _contextId;
+        }
+
         /**
          * Gets the size of the request triggering this event
          * @return the size in bytes of the request triggering this event
@@ -561,7 +600,16 @@ class Base : public ClockedObject
         bool sameAddr(PrefetchInfo_old const &pfi) const
         {
             return this->getAddr() == pfi.getAddr() &&
-                this->isSecure() == pfi.isSecure();
+                this->isSecure() == pfi.isSecure() &&
+                this->sameContext(pfi);
+        }
+
+        bool sameContext(PrefetchInfo_old const &pfi) const
+        {
+            if (hasContextId() != pfi.hasContextId()) {
+                return false;
+            }
+            return !hasContextId() || _contextId == pfi.contextId();
         }
 
         bool sameAddr(Addr addr, bool isSecure) const
diff --git a/src/mem/cache/prefetch/queued.cc b/src/mem/cache/prefetch/queued.cc
index 5fbeb76b68..2ded29bfcf 100644
--- a/src/mem/cache/prefetch/queued.cc
+++ b/src/mem/cache/prefetch/queued.cc
@@ -67,11 +67,21 @@ Queued::DeferredPacket::createPkt(Addr paddr, unsigned blk_size, RequestorID req
     /* Create a prefetch memory request */
     RequestPtr req;
     if (owner->useVirtualAddresses && pfInfo.hasPC()) {
-        req = std::make_shared<Request>(pfInfo.getAddr(), blk_size, 0,
-                                        requestor_id, pfInfo.getPC(), 0);
+        if (pfInfo.hasContextId()) {
+            req = std::make_shared<Request>(pfInfo.getAddr(), blk_size, 0,
+                                            requestor_id, pfInfo.getPC(),
+                                            pfInfo.contextId());
+        } else {
+            req = std::make_shared<Request>();
+            req->setVirt(pfInfo.getAddr(), blk_size, 0, requestor_id,
+                         pfInfo.getPC());
+        }
         req->setPaddr(paddr);
     } else {
         req = std::make_shared<Request>(paddr, blk_size, 0, requestor_id);
+        if (pfInfo.hasContextId()) {
+            req->setContext(pfInfo.contextId());
+        }
     }
 
     req->setFlags(Request::PREFETCH);
@@ -213,7 +223,6 @@ void
 Queued::notify(const PacketPtr &pkt, const PrefetchInfo &pfi)
 {
     Addr blk_addr = blockAddress(pfi.getAddr());
-    bool is_secure = pfi.isSecure();
 
     bool late_in_mshr = pkt->missOnLatePf;  // hit in pf mshr
 
@@ -222,10 +231,10 @@ Queued::notify(const PacketPtr &pkt, const PrefetchInfo &pfi)
 
     // Squash queued prefetches if demand miss to same line
     if (queueSquash) {
+        PrefetchInfo blk_pfi(pfi, blk_addr);
         auto itr = pfq.begin();
         while (itr != pfq.end()) {
-            if (itr->pfInfo.getAddr() == blk_addr &&
-                itr->pfInfo.isSecure() == is_secure) {
+            if (itr->pfInfo.sameAddr(blk_pfi)) {
                 DPRINTF(HWPrefetch, "Removing pf candidate addr: %#x "
                         "(cl: %#x), demand request going to the same addr\n",
                         itr->pfInfo.getAddr(),
@@ -545,9 +554,10 @@ Queued::alreadyInQueue(std::list<DeferredPacket> &queue,
 RequestPtr
 Queued::createPrefetchRequest(Addr addr, PrefetchInfo const &pfi, PacketPtr pkt, PrefetchSourceType pf_src, int pf_depth)
 {
+    assert(pfi.hasContextId());
     RequestPtr translation_req = std::make_shared<Request>(
             addr, blkSize, pkt->req->getFlags(), requestorId, pfi.getPC(),
-            pkt->req->contextId());
+            pfi.contextId());
     translation_req->setFlags(Request::PF_EXCLUSIVE);
     translation_req->setPFSource(pf_src);
     translation_req->setPFDepth(pf_depth);
diff --git a/src/mem/cache/prefetch/worker.cc b/src/mem/cache/prefetch/worker.cc
index 6602ebe7a0..a87b6810fa 100644
--- a/src/mem/cache/prefetch/worker.cc
+++ b/src/mem/cache/prefetch/worker.cc
@@ -60,10 +60,10 @@ WorkerPrefetcher::transfer()
     auto dpp_it = localBuffer.begin();
     while (count < depth && !localBuffer.empty()) {
         if (queueFilter) {
-            if (alreadyInQueue(pfq, dpp_it->pfInfo.getAddr(), dpp_it->pfInfo.isSecure(), dpp_it->priority)) {
+            if (alreadyInQueue(pfq, dpp_it->pfInfo, dpp_it->priority)) {
                 DPRINTF(WorkerPref, "Worker: [%lx, %d] was already in pfq\n", dpp_it->pfInfo.getAddr(),
                         dpp_it->pfahead_host);
-            } else if (alreadyInQueue(pfqMissingTranslation, dpp_it->pfInfo.getAddr(), dpp_it->pfInfo.isSecure(),
+            } else if (alreadyInQueue(pfqMissingTranslation, dpp_it->pfInfo,
                                       dpp_it->priority)) {
                 DPRINTF(WorkerPref, "Worker: [%lx, %d] was already in pfq\n", dpp_it->pfInfo.getAddr(),
                         dpp_it->pfahead_host);
@@ -85,4 +85,4 @@ WorkerPrefetcher::transfer()
 }
 
 }  // namespace prefetch
-}  // namespace gem5
\ No newline at end of file
+}  // namespace gem5

From 32eed52ed1e8c23c6392372962a1e759fac2c58e Mon Sep 17 00:00:00 2001
From: tastynoob <934348725@qq.com>
Date: Mon, 11 May 2026 13:07:18 +0800
Subject: [PATCH 28/38] cpu-o3: Select IQ entries by dispatch age

Use per-instruction dispatch age for IQ ordering and remove the old SMT-specific selector plumbing.

Change-Id: I06f4cec1fcbe910652272cf1caf9366bdbbea171
---
 src/cpu/o3/FuncScheduler.py |  7 +--
 src/cpu/o3/SConscript       |  2 +-
 src/cpu/o3/dyn_inst.hh      |  3 ++
 src/cpu/o3/iew.cc           | 18 ++++++++
 src/cpu/o3/iew.hh           |  1 +
 src/cpu/o3/inst_queue.cc    |  3 +-
 src/cpu/o3/issue_queue.cc   | 92 ++-----------------------------------
 src/cpu/o3/issue_queue.hh   | 28 +----------
 src/cpu/o3/smt_sched.hh     | 33 -------------
 9 files changed, 30 insertions(+), 157 deletions(-)

diff --git a/src/cpu/o3/FuncScheduler.py b/src/cpu/o3/FuncScheduler.py
index 7676f6d643..2e118a6734 100644
--- a/src/cpu/o3/FuncScheduler.py
+++ b/src/cpu/o3/FuncScheduler.py
@@ -75,11 +75,6 @@ class PAgeSelector(BaseSelector):
 
     piece = Param.Int(2, "number of instructions in a group")
 
-class SMTBasedSelector(BaseSelector):
-    type = 'SMTBasedSelector'
-    cxx_class = 'gem5::o3::SMTBasedSelector'
-    cxx_header = "cpu/o3/issue_queue.hh"
-
 class IssueQue(SimObject):
     type = 'IssueQue'
     cxx_class = 'gem5::o3::IssueQue'
@@ -90,7 +85,7 @@ class IssueQue(SimObject):
     inports = Param.Int(2, "")
     scheduleToExecDelay = Param.Cycles(2, "")
     oports = VectorParam.IssuePort("")
-    sel = Param.BaseSelector(SMTBasedSelector(), "Selector for this IQ (default: age first)")
+    sel = Param.BaseSelector(BaseSelector(), "Selector for this IQ")
 
 class Scheduler(SimObject):
     type = 'Scheduler'
diff --git a/src/cpu/o3/SConscript b/src/cpu/o3/SConscript
index 3c2902a6b4..93d6c0b1da 100755
--- a/src/cpu/o3/SConscript
+++ b/src/cpu/o3/SConscript
@@ -32,7 +32,7 @@ Import('*')
 
 if env['CONF']['TARGET_ISA'] != 'null':
     SimObject('FuncScheduler.py', sim_objects=['FUPool', 'SpecWakeupChannel',
-              'IssuePort', 'IssueQue', 'BaseSelector', 'PAgeSelector', 'SMTBasedSelector', 'Scheduler'])
+              'IssuePort', 'IssueQue', 'BaseSelector', 'PAgeSelector', 'Scheduler'])
     SimObject('FuncUnitConfig.py', sim_objects=[])
     SimObject('BaseO3CPU.py', sim_objects=['BaseO3CPU'], enums=[
         'SMTFetchPolicy', 'SMTQueuePolicy', 'SMTLSQMode', 'CommitPolicy',
diff --git a/src/cpu/o3/dyn_inst.hh b/src/cpu/o3/dyn_inst.hh
index f79a8784b4..60459642f9 100644
--- a/src/cpu/o3/dyn_inst.hh
+++ b/src/cpu/o3/dyn_inst.hh
@@ -1455,6 +1455,9 @@ class DynInst : public ExecContext, public RefCounted
     Tick lastWakeDependents = -1;
     Tick translatedTick = -1;
 
+    /** Dispatch age = dispatch cycle * 8 + dispatch position. */
+    uint64_t ageCtr = static_cast<uint64_t>(-1);
+
     Tick readyTick = -1;
     Tick completionTick = -1;
 
diff --git a/src/cpu/o3/iew.cc b/src/cpu/o3/iew.cc
index 96fded9794..03076739e3 100644
--- a/src/cpu/o3/iew.cc
+++ b/src/cpu/o3/iew.cc
@@ -915,6 +915,20 @@ IEW::canInsertLDSTQue(ThreadID tid)
     return false;
 }
 
+void
+IEW::setDispatchAgeCtr(const DynInstPtr& inst, int dispatch_pos)
+{
+    constexpr uint64_t dispatchAgeScale = 8;
+
+    assert(dispatch_pos >= 0);
+    assert(dispatch_pos < static_cast<int>(dispatchAgeScale));
+    inst->ageCtr = static_cast<uint64_t>(cpu->curCycle()) * dispatchAgeScale +
+                   static_cast<uint64_t>(dispatch_pos);
+    DPRINTF(IEW, "[tid:%i] [sn:%llu] ageCtr=%llu at dispatch pos %d.\n",
+            inst->threadNumber, inst->seqNum,
+            static_cast<unsigned long long>(inst->ageCtr), dispatch_pos);
+}
+
 void
 IEW::dispatchInsts()
 {
@@ -1055,6 +1069,8 @@ IEW::dispatchInstFromRename(ThreadID tid)
             inst->clearHtmTransactionalState();
         }
 
+        setDispatchAgeCtr(inst, dispatched);
+
         if (!inst->isNop() && !inst->isEliminated()) {
             scheduler->addProducer(inst);
         }
@@ -1221,6 +1237,8 @@ IEW::classifyInstToDispQue(ThreadID tid)
                 inst->clearHtmTransactionalState();
             }
 
+            setDispatchAgeCtr(inst, dispatched);
+
             if (inst->isAtomic()) {
                 ++iewStats.dispStoreInsts;
                 ++iewStats.dispNonSpecInsts;
diff --git a/src/cpu/o3/iew.hh b/src/cpu/o3/iew.hh
index c621e62ebc..a050d3d9d7 100644
--- a/src/cpu/o3/iew.hh
+++ b/src/cpu/o3/iew.hh
@@ -327,6 +327,7 @@ class IEW
 
     /** Dispatches instructions to IQ and LSQ. */
     void dispatchInsts();
+    void setDispatchAgeCtr(const DynInstPtr& inst, int dispatch_pos);
 
     void dispatchInstFromRename(ThreadID tid);
 
diff --git a/src/cpu/o3/inst_queue.cc b/src/cpu/o3/inst_queue.cc
index dda79556dc..2b76fdaf7e 100644
--- a/src/cpu/o3/inst_queue.cc
+++ b/src/cpu/o3/inst_queue.cc
@@ -163,8 +163,7 @@ InstructionQueue::InstructionQueue(CPU *cpu_ptr, IEW *iew_ptr,
     scheduler->setCPU(cpu_ptr, &iew_ptr->ldstQueue);
     scheduler->resetDepGraph(numPhysRegs);
     scheduler->setMemDepUnit(memDepUnit);
-    scheduler->initIQICountSmtScheduler(numThreads);
-    
+
     resetState();
 }
 
diff --git a/src/cpu/o3/issue_queue.cc b/src/cpu/o3/issue_queue.cc
index bf5bb1ea47..094c0a2fa3 100644
--- a/src/cpu/o3/issue_queue.cc
+++ b/src/cpu/o3/issue_queue.cc
@@ -140,61 +140,12 @@ PAgeSelector::select(ReadyQue::iterator begin, int portid)
     }
 }
 
-void
-SMTBasedSelector::setparent(Scheduler* scheduler, IssueQue* iq)
-{
-    BaseSelector::setparent(scheduler, iq);
-
-    smtScheduler = iq->getIndependentIQICountScheduler();
-}
-
-ReadyQue::iterator
-SMTBasedSelector::select(ReadyQue::iterator begin, int portid)
-{
-    if (begin == end) {
-        return end;
-    }
-    
-    ThreadID priorityThread = 0;
-    
-    if (smtScheduler) {
-        priorityThread = smtScheduler->getThread();
-        
-        DPRINTF(Schedule, 
-            "SMTBasedSelector: priority thread = %d\n", 
-            priorityThread);
-    }
-    
-    for (auto it = begin; it != end; it++) {
-        auto& inst = *it;
-        
-        if (inst->threadNumber == priorityThread) {
-            DPRINTF(Schedule, 
-                "[sn:%llu] selected by SMT policy (tid=%d)\n",
-                inst->seqNum, priorityThread);
-            return it;
-        }
-    }
-    
-    
-    for (auto it = begin; it != end; it++) {
-        auto& inst = *it;
-        
-        if (inst->threadNumber != priorityThread) {
-            DPRINTF(Schedule, 
-                "[sn:%llu] selected by default (tid=%d, priority=%d)\n",
-                inst->seqNum, inst->threadNumber, priorityThread);
-            return it;
-        }
-    }
-    
-    DPRINTF(Schedule, "SMTBasedSelector: no available instruction\n");
-    return begin;
-}
-
 bool
 IssueQue::select_policy::operator()(const DynInstPtr& a, const DynInstPtr& b) const
 {
+    if (a->ageCtr != b->ageCtr) {
+        return a->ageCtr < b->ageCtr;
+    }
     return a->seqNum < b->seqNum;
 }
 
@@ -646,8 +597,6 @@ IssueQue::selectInst()
     selectQ.clear();
     for (int pi = 0; pi < outports; pi++) {
         auto readyQ = readyQs[pi];
-        // iq->getInstsCounter()->getCounter(tid)
-        int iqcount = 0;
         for (auto it = readyQ->begin(); it != readyQ->end(); ++it) {
             DPRINTF(Schedule, "readyQ for port %d has [sn:%llu] %s [tid:%u]\n", pi, (*it)->seqNum,
                     (*it)->genDisassembly(), (*it)->threadNumber);
@@ -666,15 +615,6 @@ IssueQue::selectInst()
             uint64_t busy_bit = (lat > 63 ? -1 : (1llu << lat));
             if (!(portBusy[pi] & busy_bit)) {
                 DPRINTF(Schedule, "[sn %ld] was selected\n", inst->seqNum);
-                for (ThreadID tid = 0; tid < MaxThreads; tid++) {
-                    if (inst->threadNumber == tid) {
-                        independentIQICountScheduler->scheduleNum[tid]++;
-                    } else {
-                        independentIQICountScheduler->scheduleNum[tid] = 0;
-                    }
-                }
-                DPRINTF(Schedule, "smtScheduler->scheduleNum[0]=%d, smtScheduler->scheduleNum[1]=%d\n",
-                        independentIQICountScheduler->scheduleNum[0], independentIQICountScheduler->scheduleNum[1]);
                 // get regfile write port
                 for (int i = 0; i < inst->numDestRegs(); i++) {
                     auto pdst = inst->renamedDestIdx(i);
@@ -937,17 +877,6 @@ IssueQue::decInIQInstsCounter(ThreadID tid)
     }
 }
 
-void
-IssueQue::initIndependentIQICountScheduler(int numThreads)
-{
-       assert(instsCounter != nullptr && "InstsCounter must be set first");
-        
-        independentIQICountScheduler = new IndependentIQICountScheduler(
-            numThreads, instsCounter);
-        
-        DPRINTF(Schedule, "[%s] IndependentIQICountScheduler created.\n",iqname);    
-}
-
 Scheduler::SpecWakeupCompletion::SpecWakeupCompletion(const DynInstPtr& inst, IssueQue* to,
                                                       PendingWakeEventsType* owner)
     : Event(Stat_Event_Pri, AutoDelete), inst(inst), owner(owner), to_issue_queue(to)
@@ -1146,6 +1075,7 @@ Scheduler::setCPU(CPU* cpu, LSQ* lsq)
     this->lsq = lsq;
     for (auto it : issueQues) {
         it->setCPU(cpu);
+        it->selector->setparent(this, it);
     }
 }
 
@@ -1763,19 +1693,5 @@ Scheduler::setMainRdpOpt(bool enable)
     }
 }
 
-void
-Scheduler::initIQICountSmtScheduler(int numThreads)
-{
-    DPRINTF(Schedule, "Initializing IQ SMT schedulers for %d thread.\n", numThreads);
-        
-    // to do: add switch;add SMTSchedulingPolicy
-    for (auto iq : issueQues) {
-        InstsCounter* counter = iq->getInstsCounter();
-        assert(counter);
-        iq->initIndependentIQICountScheduler(numThreads);
-        iq->selector->setparent(this, iq);
-    }
-}
-
 }
 }
diff --git a/src/cpu/o3/issue_queue.hh b/src/cpu/o3/issue_queue.hh
index 6c6d9f8fbf..15c99eeb4c 100644
--- a/src/cpu/o3/issue_queue.hh
+++ b/src/cpu/o3/issue_queue.hh
@@ -23,7 +23,6 @@
 #include "params/IssuePort.hh"
 #include "params/IssueQue.hh"
 #include "params/PAgeSelector.hh"
-#include "params/SMTBasedSelector.hh"
 #include "params/Scheduler.hh"
 #include "params/SpecWakeupChannel.hh"
 #include "sim/sim_object.hh"
@@ -101,25 +100,11 @@ class PAgeSelector : public BaseSelector
     ReadyQue::iterator select(ReadyQue::iterator begin, int portid) override;
 };
 
-class SMTBasedSelector : public BaseSelector
-{
-  private:
-      IndependentIQICountScheduler* smtScheduler = nullptr;
-  public:
-    SMTBasedSelector(const SMTBasedSelectorParams& params) : BaseSelector(params) {}
-    void setparent(Scheduler* scheduler, IssueQue* iq) override;
-    void allocate(const DynInstPtr& inst) override { BaseSelector::allocate(inst);}
-    void deallocate(const DynInstPtr& inst) override { BaseSelector::deallocate(inst);}
-    ReadyQue::iterator select(ReadyQue::iterator begin, int portid) override;
-};
-
 class IssueQue : public SimObject
 {
     friend class Scheduler;
     friend class BaseSelector;
     friend class PAgeSelector;
-    friend class InstsCounter;
-    friend class IndependentIQICountScheduler;
 
     std::string _name;
     const int inports;
@@ -188,9 +173,8 @@ class IssueQue : public SimObject
     Scheduler* scheduler = nullptr;
     BaseSelector* selector = nullptr;
 
-    //iq smt scheduler
+    // iq per-thread occupancy counter, used for fetch-side feedback stats
     InstsCounter* instsCounter = nullptr;
-    IndependentIQICountScheduler* independentIQICountScheduler = nullptr;
 
     struct IssueQueStats : public statistics::Group
     {
@@ -228,21 +212,12 @@ class IssueQue : public SimObject
     void setMainRdpOpt(bool enable) { enableMainRdpOpt = enable; }
     void resetDepGraph(int numPhysRegs);
 
-    void setInstsCounter(InstsCounter* counter) { instsCounter = counter;}
-
     InstsCounter* getInstsCounter() const {return instsCounter; }
 
     void incInIQInstsCounter(ThreadID tid);
     void decInIQInstsCounter(ThreadID tid);
     bool hasInstsCounter() const { return instsCounter != nullptr; }
 
-    void initIndependentIQICountScheduler(int numThreads);
-
-    void setIndependentIQICountScheduler( IndependentIQICountScheduler* _independentIQICountScheduler ) {
-      independentIQICountScheduler = _independentIQICountScheduler;
-    }
-    IndependentIQICountScheduler* getIndependentIQICountScheduler() { return independentIQICountScheduler; }
-
     void tick();
     bool ready();
     int emptyEntries() const { return iqsize - instNum; }
@@ -367,7 +342,6 @@ class Scheduler : public SimObject
     void setAllScoreBoard(PhysRegIdPtr reg);
     void setMemDepUnit(MemDepUnit* memDepUnit) { this->memDepUnit = memDepUnit; }
     void setMainRdpOpt(bool enable);
-    void initIQICountSmtScheduler(int numThreads);
 
     void tick();
     void issueAndSelect();
diff --git a/src/cpu/o3/smt_sched.hh b/src/cpu/o3/smt_sched.hh
index 7fcbf5733f..a10e404b15 100644
--- a/src/cpu/o3/smt_sched.hh
+++ b/src/cpu/o3/smt_sched.hh
@@ -127,38 +127,5 @@ class MultiPrioritySched : public SMTScheduler
     }
 };
 
-class IndependentIQICountScheduler : public SMTScheduler {
-private:
-     InstsCounter* counter;  // Counter for this IQ only
-
-
-public:
-    IndependentIQICountScheduler(int numThreads, InstsCounter* counter)
-        : SMTScheduler(numThreads), counter(counter){}
-
-    ThreadID getThread() override {
-        ThreadID selectedTid = 0;
-        uint64_t maxCount = counter->getCounter(0);
-        if(scheduleNum[0] >= 100){
-            selectedTid = 1;
-            return selectedTid;
-        }
-        for (ThreadID tid = 1; tid < numThreads; ++tid) {
-            if(scheduleNum[tid] >= 100){
-                selectedTid = 0;
-                return selectedTid;
-            }
-            uint64_t count = counter->getCounter(tid);
-            if (count > maxCount) {
-                maxCount = count;
-                selectedTid = tid;
-            }
-        }
-        return selectedTid;
-    }
-    int scheduleNum[MaxThreads];
-
-};
-
 }}
 #endif

From 70a364f8eb4bcf49933a4543c5f5caa11f6564fd Mon Sep 17 00:00:00 2001
From: Mo Haonan <66786667+mhnGitHubz@users.noreply.github.com>
Date: Mon, 11 May 2026 19:44:36 +0800
Subject: [PATCH 29/38] cpu-o3: keep store offload priority unchanged when
 all threads have a store to offload and every offload fails (#847)

Co-authored-by: mo haonan <mohaonan@node023.bosccluster.com>
---
 src/cpu/o3/lsq.cc | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/cpu/o3/lsq.cc b/src/cpu/o3/lsq.cc
index 448530ab69..33ffdeb593 100644
--- a/src/cpu/o3/lsq.cc
+++ b/src/cpu/o3/lsq.cc
@@ -993,12 +993,16 @@ LSQ::processWriteback()
         }
     }
     threads = activeThreads->begin();
+    bool has_thread_offloaded = false;
     for (ThreadID tid = 0; tid < numThreads; ++tid) {
         thread[(nextStoreBufferInsertTid + tid) % numThreads].offloadToStoreBuffer(offload_quota[(nextStoreBufferInsertTid + tid) % numThreads], offload_fail);
+        has_thread_offloaded |= ((offload_quota[(nextStoreBufferInsertTid + tid) % numThreads] != 0) 
+                                && !(offload_fail[(nextStoreBufferInsertTid + tid) % numThreads]));
+        
     }
 
     for (ThreadID tid = 0; tid < numThreads; ++tid) {
-        if (offload_fail[tid]) {
+        if (offload_fail[tid] && has_thread_offloaded) {
             nextStoreBufferInsertTid = tid;
         }
     }

From f4334ca0301a02ba1ec0369617c7e6fa9e178f7d Mon Sep 17 00:00:00 2001
From: tastynoob <934348725@qq.com>
Date: Tue, 19 May 2026 17:02:05 +0800
Subject: [PATCH 30/38] cpu-o3: Expose SMT borrowing tunables

Change-Id: I1078cf7423248c619faeb4fde7f0a210a7d02b77
---
 configs/example/smt_idealkmhv3.py |   1 +
 src/cpu/o3/BaseO3CPU.py           |  12 ++-
 src/cpu/o3/comm.hh                | 127 ++++++++++++++++++++++++++++++
 src/cpu/o3/commit.cc              |  52 +++++++++---
 src/cpu/o3/commit.hh              |   6 ++
 src/cpu/o3/decode.cc              |  24 +++---
 src/cpu/o3/fetch.cc               |  80 ++++++++++++++-----
 src/cpu/o3/fetch.hh               |   4 +
 src/cpu/o3/iew.cc                 |  44 ++++++++---
 src/cpu/o3/lsq.cc                 |  11 ++-
 src/cpu/o3/rename.cc              |  23 ++++--
 src/cpu/o3/rob.cc                 | 111 ++++++++++++++++++++++++++
 src/cpu/o3/rob.hh                 |  35 +++++++-
 13 files changed, 465 insertions(+), 65 deletions(-)

diff --git a/configs/example/smt_idealkmhv3.py b/configs/example/smt_idealkmhv3.py
index dbbe66f814..28abc2a3e9 100644
--- a/configs/example/smt_idealkmhv3.py
+++ b/configs/example/smt_idealkmhv3.py
@@ -22,6 +22,7 @@ def setSharedLSQParams(args, system):
         # shared target queue and starve the other thread's frontend.
         cpu.smtLSQMode = 'Shared'
         cpu.smtLSQPolicy = 'Dynamic'
+        cpu.smtROBPolicy = 'DynamicBorrowing'
         cpu.branchPred.smtFTQMode = 'Shared'
         cpu.branchPred.smtFTQPolicy = 'Partitioned'
 
diff --git a/src/cpu/o3/BaseO3CPU.py b/src/cpu/o3/BaseO3CPU.py
index b1f6979368..88c03f1dd6 100644
--- a/src/cpu/o3/BaseO3CPU.py
+++ b/src/cpu/o3/BaseO3CPU.py
@@ -51,7 +51,7 @@ class SMTFetchPolicy(ScopedEnum):
     vals = [ 'RoundRobin', 'Branch', 'IQCount', 'LSQCount' ]
 
 class SMTQueuePolicy(ScopedEnum):
-    vals = [ 'Dynamic', 'Partitioned', 'Threshold' ]
+    vals = [ 'Dynamic', 'Partitioned', 'Threshold', 'DynamicBorrowing' ]
 
 class SMTLSQMode(ScopedEnum):
     vals = [ 'Independent', 'Shared' ]
@@ -248,6 +248,16 @@ def support_take_over(cls):
                                           "SMT ROB Sharing Policy")
     smtROBThreshold = Param.Int(100, "SMT ROB Threshold Sharing Parameter")
     smtCommitPolicy = Param.CommitPolicy('RoundRobin', "SMT Commit Policy")
+    smtBorrowThrottleCycles = Param.Unsigned(
+        8, "Cycles to keep a backend-stalled SMT thread throttled at fetch")
+    smtBorrowLdstqHighWater = Param.Unsigned(
+        0, "Explicit SMT borrowing LSQ high-water threshold; 0 uses percentage")
+    smtBorrowLdstqHighWaterPercent = Param.Percent(
+        75, "SMT borrowing LSQ high-water threshold as a percentage of LQ+SQ")
+    smtBorrowDonorHoldCycles = Param.Unsigned(
+        8, "Cycles to keep an SMT thread marked as a ROB borrowing donor")
+    smtBorrowDonorReserveEntries = Param.Unsigned(
+        8, "Minimum ROB entries reserved for a borrowing donor to resume")
 
     branchPred = Param.BranchPredictor(DecoupledBPUWithBTB(),
                                        "Branch Predictor")
diff --git a/src/cpu/o3/comm.hh b/src/cpu/o3/comm.hh
index ade70ed5e3..8d8c8cdd7d 100644
--- a/src/cpu/o3/comm.hh
+++ b/src/cpu/o3/comm.hh
@@ -350,6 +350,133 @@ struct TimeStruct
     CommitComm commitInfo[MaxThreads];// commit to iew, rename, fetch
 };
 
+inline bool
+smtCanDonateRobHeadroom(StallReason reason)
+{
+    switch (reason) {
+      case NoStall:
+      case ROBFull:
+      case RegFull:
+      case MemDQBandwidth:
+      case IntDQBandwidth:
+      case FVDQBandwidth:
+      case VectorReadyButNotIssued:
+      case ScalarReadyButNotIssued:
+      case CommitSquash:
+        return false;
+      default:
+        return true;
+    }
+}
+
+inline bool
+smtIsMemoryPressureReason(StallReason reason)
+{
+    switch (reason) {
+      case DTlbStall:
+      case LoadL2Bound:
+      case LoadL3Bound:
+      case LoadMemBound:
+      case StoreL2Bound:
+      case StoreL3Bound:
+      case StoreMemBound:
+      case MemSquashed:
+      case MemNotReady:
+      case MemCommitRateLimit:
+      case Atomic:
+      case OtherMemStall:
+        return true;
+      default:
+        return false;
+    }
+}
+
+inline bool
+smtHasBorrowThrottleStall(const TimeStruct::IewComm &info)
+{
+    return smtCanDonateRobHeadroom(info.robHeadStallReason) ||
+           smtCanDonateRobHeadroom(info.lqHeadStallReason) ||
+           smtCanDonateRobHeadroom(info.sqHeadStallReason);
+}
+
+inline bool
+smtHasMemoryPressure(const TimeStruct::IewComm &info,
+                     unsigned ldstqHighWater = 0)
+{
+    if (ldstqHighWater != 0 && info.ldstqCount >= ldstqHighWater) {
+        return true;
+    }
+
+    return smtIsMemoryPressureReason(info.robHeadStallReason) ||
+           smtIsMemoryPressureReason(info.lqHeadStallReason) ||
+           smtIsMemoryPressureReason(info.sqHeadStallReason);
+}
+
+inline uint64_t
+smtBorrowPriority(const TimeStruct::IewComm &info)
+{
+    constexpr uint64_t backend_stall_penalty = 1ULL << 48;
+    constexpr uint64_t memory_pressure_penalty = 1ULL << 49;
+
+    uint64_t score = static_cast<uint64_t>(info.robCount) +
+                     static_cast<uint64_t>(info.iqCount) * 2 +
+                     static_cast<uint64_t>(info.ldstqCount) * 4;
+
+    if (smtHasBorrowThrottleStall(info)) {
+        score += backend_stall_penalty;
+    }
+    if (smtHasMemoryPressure(info)) {
+        score += memory_pressure_penalty;
+    }
+
+    return score;
+}
+
+struct SmtActiveThreadFreeze
+{
+    ThreadID previousActive = InvalidThreadID;
+    bool freezeCurrent = false;
+};
+
+class SmtActiveThreadArbiter
+{
+  public:
+    static constexpr uint64_t InvalidScore = static_cast<uint64_t>(-1);
+
+    SmtActiveThreadFreeze observe(ThreadID tid, uint64_t score)
+    {
+        if (score < bestScore) {
+            selectedTid = tid;
+            bestScore = score;
+        }
+
+        if (freezeActive) {
+            SmtActiveThreadFreeze freeze;
+            freeze.freezeCurrent = true;
+            return freeze;
+        }
+
+        if (firstActiveTid == InvalidThreadID) {
+            firstActiveTid = tid;
+            return {};
+        }
+
+        freezeActive = true;
+        SmtActiveThreadFreeze freeze;
+        freeze.previousActive = firstActiveTid;
+        freeze.freezeCurrent = true;
+        return freeze;
+    }
+
+    ThreadID selected() const { return selectedTid; }
+
+  private:
+    ThreadID selectedTid = InvalidThreadID;
+    ThreadID firstActiveTid = InvalidThreadID;
+    bool freezeActive = false;
+    uint64_t bestScore = InvalidScore;
+};
+
 
 struct StallSignals
 {
diff --git a/src/cpu/o3/commit.cc b/src/cpu/o3/commit.cc
index 2257e1bd46..bfbe23550f 100644
--- a/src/cpu/o3/commit.cc
+++ b/src/cpu/o3/commit.cc
@@ -147,6 +147,7 @@ Commit::Commit(CPU *_cpu, branch_prediction::BPredUnit *_bp, const BaseO3CPUPara
       renameWidth(params.renameWidth),
       commitWidth(params.commitWidth),
       numThreads(params.numThreads),
+      smtBorrowDonorHoldCycles(params.smtBorrowDonorHoldCycles),
       drainPending(false),
       drainImminent(false),
       trapLatency(params.trapLatency),
@@ -175,6 +176,7 @@ Commit::Commit(CPU *_cpu, branch_prediction::BPredUnit *_bp, const BaseO3CPUPara
     for (ThreadID tid = 0; tid < numThreads; tid++) {
         commitStatus[tid] = Idle;
         changedROBNumEntries[tid] = false;
+        borrowingDonorCycles[tid] = 0;
         trapSquash[tid] = false;
         tcSquash[tid] = false;
         squashAfterInst[tid] = nullptr;
@@ -563,6 +565,7 @@ Commit::takeOverFrom()
     for (ThreadID tid = 0; tid < numThreads; tid++) {
         commitStatus[tid] = Idle;
         changedROBNumEntries[tid] = false;
+        borrowingDonorCycles[tid] = 0;
         trapSquash[tid] = false;
         tcSquash[tid] = false;
         squashAfterInst[tid] = NULL;
@@ -1974,16 +1977,40 @@ Commit::moveInstsToBuffer()
         for (int i = 0; i < insts_from_rename; ++i) {
             const DynInstPtr &inst = fromRename->insts[i];
             assert(inst->threadNumber == tid);
-            if (!inst->isSquashed())
-            fixedbuffer[tid].push_back(inst);
+            if (!inst->isSquashed()) {
+                fixedbuffer[tid].push_back(inst);
+            }
+        }
+    }
+
+    for (int i = 0; i < numThreads; ++i) {
+        bool has_buffered_rename = !fixedbuffer[i].empty();
+        bool donor = false;
+
+        if (has_buffered_rename) {
+            borrowingDonorCycles[i] = 0;
+        } else {
+            donor = smtHasBorrowThrottleStall(robInfoFromIEW->iewInfo[i]);
+            if (donor) {
+                borrowingDonorCycles[i] = smtBorrowDonorHoldCycles;
+            } else if (borrowingDonorCycles[i] > 0) {
+                --borrowingDonorCycles[i];
+            }
+            donor = borrowingDonorCycles[i] > 0;
         }
+
+        rob->setBorrowingDonor(i, donor);
     }
 
     // check threads stall & status
-    ThreadID tid = InvalidThreadID;
+    SmtActiveThreadArbiter active_arbiter;
+    auto freezeActiveThread = [this](ThreadID tid) {
+        stallSig->blockIEW[tid] = true;
+        stallSig->iewBlockReason[tid] = StallReason::OtherFragStall;
+    };
     for (int i = 0; i < numThreads; i++) {
         bool robblock = commitStatus[i] == ROBSquashing || commitStatus[i] == TrapPending;
-        bool block = (rob->getMaxEntries(i) - rob->getThreadEntries(i) < fixedbuffer[i].size()) || robblock;
+        bool block = !rob->canAllocate(i, fixedbuffer[i].size()) || robblock;
         bool active = !block && !fixedbuffer[i].empty();
         StallReason block_reason = StallReason::NoStall;
         if (robblock) {
@@ -1999,16 +2026,17 @@ Commit::moveInstsToBuffer()
         stallSig->blockIEW[i] = block;
         stallSig->iewBlockReason[i] = block ? block_reason : StallReason::NoStall;
         if (active) {
-            if (tid == InvalidThreadID) tid = i;
-            else {
-                // if there are multiple active threads, must exhaust all threads first
-                // to avoid starvation of other threads and also avoid resource conflict
-                stallSig->blockIEW[tid] = true;
-                stallSig->blockIEW[i] = true;
-                DPRINTF(IEW, "Multiple active threads detected, blocking all threads\n");
+            const auto freeze = active_arbiter.observe(
+                i, smtBorrowPriority(robInfoFromIEW->iewInfo[i]));
+            if (freeze.previousActive != InvalidThreadID) {
+                freezeActiveThread(freeze.previousActive);
+            }
+            if (freeze.freezeCurrent) {
+                freezeActiveThread(i);
             }
         }
     }
+    const ThreadID tid = active_arbiter.selected();
     if (tid == InvalidThreadID) {
         DPRINTF(Commit, "No instructions from Rename stage.\n");
         return;
@@ -2028,7 +2056,7 @@ Commit::moveInstsToBuffer()
 
             rob->insertInst(inst);
 
-            assert(rob->getThreadEntries(tid) <= rob->getMaxEntries(tid));
+            assert(rob->canAllocate(tid, 0));
 
             youngestSeqNum[tid] = inst->seqNum;
         } else {
diff --git a/src/cpu/o3/commit.hh b/src/cpu/o3/commit.hh
index 27ac59157e..510b88a3b7 100644
--- a/src/cpu/o3/commit.hh
+++ b/src/cpu/o3/commit.hh
@@ -457,6 +457,9 @@ class Commit
      */
     bool changedROBNumEntries[MaxThreads];
 
+    /** Donor hysteresis for dynamic ROB borrowing. */
+    unsigned borrowingDonorCycles[MaxThreads];
+
     /** Records if a thread has to squash this cycle due to a trap. */
     bool trapSquash[MaxThreads];
 
@@ -497,6 +500,9 @@ class Commit
     /** Number of Active Threads */
     const ThreadID numThreads;
 
+    /** Cycles to keep a stalled thread marked as a ROB borrowing donor. */
+    const unsigned smtBorrowDonorHoldCycles;
+
     /** Is a drain pending? Commit is looking for an instruction boundary while
      * there are no pending interrupts
      */
diff --git a/src/cpu/o3/decode.cc b/src/cpu/o3/decode.cc
index 0d36e05a85..d24e1d1efc 100644
--- a/src/cpu/o3/decode.cc
+++ b/src/cpu/o3/decode.cc
@@ -486,8 +486,14 @@ Decode::tick()
     checkSquash();
 
     // check threads stall & status
-    ThreadID tid = InvalidThreadID;
     ThreadID blocked_tid = InvalidThreadID;
+    SmtActiveThreadArbiter active_arbiter;
+    auto freezeActiveThread = [this](ThreadID tid) {
+        stallSig->blockFetch[tid] = true;
+        stallSig->fetchBlockReason[tid] = StallReason::OtherFragStall;
+        toFetch->decodeInfo[tid].blockReason =
+            stallSig->fetchBlockReason[tid];
+    };
     const bool fifoBackpressured =
         !stallBuffer.empty() &&
         eachstallSize.size() + decodeToFetchDelay + 1 >=
@@ -520,19 +526,19 @@ Decode::tick()
                 StallReason::NoStall;
         toFetch->decodeInfo[i].blockReason = stallSig->fetchBlockReason[i];
         if (active) {
-            if (tid == InvalidThreadID)
-                tid = i;
-            else {
-                // if there are multiple active threads, must exhaust all threads first
-                // to avoid starvation of other threads and also avoid resource conflict
-                stallSig->blockFetch[tid] = true;
-                stallSig->blockFetch[i] = true;
-                DPRINTF(Decode, "Multiple active threads detected, blocking all threads\n");
+            const auto freeze = active_arbiter.observe(
+                i, smtBorrowPriority(fromIEW->iewInfo[i]));
+            if (freeze.previousActive != InvalidThreadID) {
+                freezeActiveThread(freeze.previousActive);
+            }
+            if (freeze.freezeCurrent) {
+                freezeActiveThread(i);
             }
         } else if (block && blocked_tid == InvalidThreadID) {
             blocked_tid = i;
         }
     }
+    const ThreadID tid = active_arbiter.selected();
     if (tid == InvalidThreadID) {
         // all threads are stalled, no need to process
         if (blocked_tid != InvalidThreadID) {
diff --git a/src/cpu/o3/fetch.cc b/src/cpu/o3/fetch.cc
index 0642e5a587..5dc44ac398 100644
--- a/src/cpu/o3/fetch.cc
+++ b/src/cpu/o3/fetch.cc
@@ -116,6 +116,7 @@ Fetch::Fetch(CPU *_cpu, const BaseO3CPUParams &params)
              "\tincrease MaxWidth in src/cpu/o3/limits.hh\n",
              fetchWidth, static_cast<int>(MaxWidth));
 
+    smtBorrowThrottleHoldCycles = params.smtBorrowThrottleCycles;
     for (int i = 0; i < MaxThreads; i++) {
         setThreadStatus(i, Idle);
         decoder[i] = nullptr;
@@ -123,6 +124,13 @@ Fetch::Fetch(CPU *_cpu, const BaseO3CPUParams &params)
         macroop[i] = nullptr;
         delayedCommit[i] = false;
         lastIcacheStall[i] = 0;
+        smtBorrowThrottleCycles[i] = 0;
+    }
+    smtLdstqHighWater = params.smtBorrowLdstqHighWater;
+    if (smtLdstqHighWater == 0) {
+        smtLdstqHighWater =
+            (params.LQEntries + params.SQEntries) *
+            params.smtBorrowLdstqHighWaterPercent / 100;
     }
 
     branchPred = params.branchPred;
@@ -503,6 +511,7 @@ Fetch::resetStage()
 
         priorityList.push_back(tid);
         waitForVsetvl[tid] = false;
+        smtBorrowThrottleCycles[tid] = 0;
     }
 
     wroteToTimeBuffer = false;
@@ -1386,36 +1395,67 @@ Fetch::handleInterrupts()
 ThreadID
 Fetch::selectUnstalledThread()
 {
+    ThreadID selected = InvalidThreadID;
+    bool has_candidate = false;
+    bool has_unthrottled_candidate = false;
 
-    // if (numThreads == 1) {
-    //     return 0;
-    // }
-    ThreadID selected = -1;
-    bool all_stalled = true;
     for (ThreadID tid = 0; tid < numThreads; ++tid) {
-        if (!stallSig->blockFetch[tid] &&fetchQueue[tid].size() > 0) {
-            lsqCounter->setCounter(tid, fromIEW->iewInfo[tid].ldstqCount);
-            iqCounter->setCounter(tid, fromIEW->iewInfo[tid].iqCount);
-            robCounter->setCounter(tid, fromIEW->iewInfo[tid].robCount);
-            all_stalled = false;
-           
-        }else {
+        const bool candidate = !stallSig->blockFetch[tid] &&
+                               !fetchQueue[tid].empty();
+        if (!candidate) {
+            smtBorrowThrottleCycles[tid] = 0;
             lsqCounter->setCounter(tid, UINT64_MAX);
             iqCounter->setCounter(tid, UINT64_MAX);
             robCounter->setCounter(tid, UINT64_MAX);
-            
+            continue;
+        }
+        has_candidate = true;
+
+        const bool throttle_now =
+            smtHasBorrowThrottleStall(fromIEW->iewInfo[tid]) ||
+            smtHasMemoryPressure(fromIEW->iewInfo[tid], smtLdstqHighWater);
+        if (throttle_now) {
+            smtBorrowThrottleCycles[tid] = smtBorrowThrottleHoldCycles;
+        } else if (smtBorrowThrottleCycles[tid] > 0) {
+            --smtBorrowThrottleCycles[tid];
         }
 
-        if(all_stalled)
-        {
-            selected = -1;
-        }else{
-            selected = decodeScheduler->getThread();
+        const bool throttled = smtBorrowThrottleCycles[tid] > 0;
+        if (!throttled) {
+            has_unthrottled_candidate = true;
         }
-        DPRINTF(Fetch, "lsqCounter->setCounter: %d iqCounter->setCounter: %d robCounter->setCounter: %d\n",fromIEW->iewInfo[tid].ldstqCount,fromIEW->iewInfo[tid].iqCount,fromIEW->iewInfo[tid].robCount);
+
+        lsqCounter->setCounter(
+            tid, throttled ? UINT64_MAX : fromIEW->iewInfo[tid].ldstqCount);
+        iqCounter->setCounter(
+            tid, throttled ? UINT64_MAX : fromIEW->iewInfo[tid].iqCount);
+        robCounter->setCounter(
+            tid, throttled ? UINT64_MAX : fromIEW->iewInfo[tid].robCount);
+
+        DPRINTF(Fetch,
+                "[tid:%i] lsq=%u iq=%u rob=%u throttled=%u mem_pressure=%u hold=%u\n",
+                tid, fromIEW->iewInfo[tid].ldstqCount,
+                fromIEW->iewInfo[tid].iqCount, fromIEW->iewInfo[tid].robCount,
+                throttled,
+                smtHasMemoryPressure(fromIEW->iewInfo[tid], smtLdstqHighWater),
+                smtBorrowThrottleCycles[tid]);
+    }
+
+    if (has_candidate && !has_unthrottled_candidate) {
+        for (ThreadID tid = 0; tid < numThreads; ++tid) {
+            if (stallSig->blockFetch[tid] || fetchQueue[tid].empty()) {
+                continue;
+            }
+            lsqCounter->setCounter(tid, fromIEW->iewInfo[tid].ldstqCount);
+            iqCounter->setCounter(tid, fromIEW->iewInfo[tid].iqCount);
+            robCounter->setCounter(tid, fromIEW->iewInfo[tid].robCount);
+        }
+    }
+
+    if (has_candidate) {
+        selected = decodeScheduler->getThread();
     }
 
-     
     return selected;
 }
 
diff --git a/src/cpu/o3/fetch.hh b/src/cpu/o3/fetch.hh
index c76bb8d77f..3c874749af 100644
--- a/src/cpu/o3/fetch.hh
+++ b/src/cpu/o3/fetch.hh
@@ -242,6 +242,10 @@ class Fetch
     InstsCounter* iqCounter;
     InstsCounter* robCounter;
 
+    unsigned smtBorrowThrottleCycles[MaxThreads];
+    unsigned smtBorrowThrottleHoldCycles;
+    unsigned smtLdstqHighWater;
+
     // Configuration parameters
     std::string smtDecodePolicy ="multi_priority";
     int delayedSchedulerDelay;
diff --git a/src/cpu/o3/iew.cc b/src/cpu/o3/iew.cc
index 03076739e3..491ceda48a 100644
--- a/src/cpu/o3/iew.cc
+++ b/src/cpu/o3/iew.cc
@@ -937,8 +937,26 @@ IEW::dispatchInsts()
     }
 
     // check threads stall & status
-    ThreadID tid = InvalidThreadID;
+    SmtActiveThreadArbiter active_arbiter;
+    auto freezeActiveThread = [this](ThreadID tid) {
+        stallSig->blockRename[tid] = true;
+        stallSig->renameBlockReason[tid] = StallReason::OtherFragStall;
+        toRename->iewInfo[tid].blockReason = StallReason::OtherFragStall;
+    };
     for (int i = 0; i < numThreads; i++) {
+        auto &iew_info = toRename->iewInfo[i];
+        iew_info.robHeadStallReason =
+            checkDispatchStall(i, NumDQ, nullptr, -1);
+        iew_info.lqHeadStallReason =
+            ldstQueue.lqEmpty(i) ? StallReason::NoStall :
+                                   checkLSQStall(i, true);
+        iew_info.sqHeadStallReason =
+            ldstQueue.sqEmpty(i) ? StallReason::NoStall :
+                                   checkLSQStall(i, false);
+        iew_info.ldstqCount = ldstQueue.getCount(i);
+        iew_info.robCount = rob->getThreadEntries(i);
+        iew_info.iqCount = scheduler->getIQInsts(i);
+
         bool ldst_block = !canInsertLDSTQue(i);
         bool block = stallSig->blockIEW[i] || ldst_block;
         bool active = !block && !fixedbuffer[i].empty();
@@ -946,25 +964,27 @@ IEW::dispatchInsts()
         if (stallSig->blockIEW[i]) {
             block_reason = stallSig->iewBlockReason[i];
         } else if (ldst_block) {
-            block_reason = checkDispatchStall(i, NumDQ, nullptr, -1);
+            block_reason = iew_info.robHeadStallReason;
             if (block_reason == StallReason::NoStall) {
                 block_reason = StallReason::OtherStall;
             }
         }
+        iew_info.blockReason = block ? block_reason : StallReason::NoStall;
 
         stallSig->blockRename[i] = block;
         stallSig->renameBlockReason[i] = block ? block_reason : StallReason::NoStall;
         if (active) {
-            if (tid == InvalidThreadID) tid = i;
-            else {
-                // if there are multiple active threads, must exhaust all threads first
-                // to avoid starvation of other threads and also avoid resource conflict
-                stallSig->blockRename[tid] = true;
-                stallSig->blockRename[i] = true;
-                DPRINTF(IEW, "Multiple active threads detected, blocking all threads\n");
+            const auto freeze =
+                active_arbiter.observe(i, smtBorrowPriority(iew_info));
+            if (freeze.previousActive != InvalidThreadID) {
+                freezeActiveThread(freeze.previousActive);
+            }
+            if (freeze.freezeCurrent) {
+                freezeActiveThread(i);
             }
         }
     }
+    const ThreadID tid = active_arbiter.selected();
 
     if (tid != InvalidThreadID) {
         DPRINTF(IEW,"Processing [tid:%i]\n",tid);
@@ -978,6 +998,9 @@ IEW::dispatchInsts()
         // check stall again
         if (!fixedbuffer[tid].empty()) {
             stallSig->blockRename[tid] = true;
+            stallSig->renameBlockReason[tid] =
+                blockReason == StallReason::NoStall ?
+                    StallReason::OtherFragStall : blockReason;
             DPRINTF(IEW, "Dispatch bandwidth full, blocking thread %i\n", tid);
         }
 
@@ -987,6 +1010,9 @@ IEW::dispatchInsts()
         toRename->iewInfo[tid].sqHeadStallReason =
             ldstQueue.sqEmpty(tid) ? StallReason::NoStall : checkLSQStall(tid, false);
         toRename->iewInfo[tid].blockReason = blockReason;
+        toRename->iewInfo[tid].ldstqCount = ldstQueue.getCount(tid);
+        toRename->iewInfo[tid].robCount = rob->getThreadEntries(tid);
+        toRename->iewInfo[tid].iqCount = scheduler->getIQInsts(tid);
     }
 }
 
diff --git a/src/cpu/o3/lsq.cc b/src/cpu/o3/lsq.cc
index 33ffdeb593..eb6bdb4ce6 100644
--- a/src/cpu/o3/lsq.cc
+++ b/src/cpu/o3/lsq.cc
@@ -527,7 +527,8 @@ LSQ::LSQ(CPU *cpu_ptr, IEW *iew_ptr, const BaseO3CPUParams &params)
                  smtLSQThreshold == 0,
                  "SMT LSQ threshold must be non-zero in shared threshold mode");
 
-        if (lsqPolicy == SMTQueuePolicy::Dynamic) {
+        if (lsqPolicy == SMTQueuePolicy::Dynamic ||
+            lsqPolicy == SMTQueuePolicy::DynamicBorrowing) {
             DPRINTF(LSQ, "LSQ mode set to Shared/Dynamic: %u LQ and %u SQ "
                     "entries are shared across active SMT threads, along "
                     "with %u RARQ and %u RAWQ entries\n",
@@ -539,7 +540,7 @@ LSQ::LSQ(CPU *cpu_ptr, IEW *iew_ptr, const BaseO3CPUParams &params)
                     smtLSQThreshold);
         } else {
             panic("Invalid LSQ sharing policy. Options are: Dynamic, "
-                        "Partitioned, Threshold");
+                        "Partitioned, Threshold, DynamicBorrowing");
         }
     } else {
         panic("Invalid SMT LSQ mode. Options are: Independent, Shared");
@@ -1580,6 +1581,7 @@ LSQ::sharedLSQAllocation(unsigned entries) const
 
     switch (lsqPolicy) {
       case SMTQueuePolicy::Dynamic:
+      case SMTQueuePolicy::DynamicBorrowing:
         return entries;
       case SMTQueuePolicy::Partitioned:
         return entries / active_threads;
@@ -1588,7 +1590,7 @@ LSQ::sharedLSQAllocation(unsigned entries) const
             std::min(entries, smtLSQThreshold);
       default:
         panic("Invalid LSQ sharing policy. Options are: Dynamic, "
-              "Partitioned, Threshold");
+              "Partitioned, Threshold, DynamicBorrowing");
     }
 }
 
@@ -1927,7 +1929,8 @@ LSQ::isStalled()
 bool
 LSQ::isStalled(ThreadID tid)
 {
-    if (lsqPolicy == SMTQueuePolicy::Dynamic)
+    if (lsqPolicy == SMTQueuePolicy::Dynamic ||
+        lsqPolicy == SMTQueuePolicy::DynamicBorrowing)
         return isStalled();
     else
         return thread[tid].isStalled();
diff --git a/src/cpu/o3/rename.cc b/src/cpu/o3/rename.cc
index 33d7852a87..9e00c2bbef 100644
--- a/src/cpu/o3/rename.cc
+++ b/src/cpu/o3/rename.cc
@@ -355,8 +355,14 @@ Rename::tick()
     releasePhysRegs();
 
     // check threads stall & status
-    ThreadID tid = InvalidThreadID;
     ThreadID blocked_tid = InvalidThreadID;
+    SmtActiveThreadArbiter active_arbiter;
+    auto freezeActiveThread = [this](ThreadID tid) {
+        stallSig->blockDecode[tid] = true;
+        stallSig->decodeBlockReason[tid] = StallReason::OtherFragStall;
+        toDecode->renameInfo[tid].blockReason =
+            stallSig->decodeBlockReason[tid];
+    };
     for (int i = 0; i < numThreads; i++) {
         bool can_rename = canRename(i);
         bool block = stallSig->blockRename[i] || !can_rename;
@@ -394,18 +400,19 @@ Rename::tick()
             stallSig->blockDecode[i] ? block_reason : StallReason::NoStall;
         toDecode->renameInfo[i].blockReason = stallSig->decodeBlockReason[i];
         if (active) {
-            if (tid == InvalidThreadID) tid = i;
-            else {
-                // if there are multiple active threads, must exhaust all threads first
-                // to avoid starvation of other threads and also avoid resource conflict
-                stallSig->blockDecode[tid] = true;
-                stallSig->blockDecode[i] = true;
-                DPRINTF(Rename, "Multiple active threads detected, blocking all threads\n");
+            const auto freeze = active_arbiter.observe(
+                i, smtBorrowPriority(fromIEW->iewInfo[i]));
+            if (freeze.previousActive != InvalidThreadID) {
+                freezeActiveThread(freeze.previousActive);
+            }
+            if (freeze.freezeCurrent) {
+                freezeActiveThread(i);
             }
         } else if (stallSig->blockDecode[i] && blocked_tid == InvalidThreadID) {
             blocked_tid = i;
         }
     }
+    const ThreadID tid = active_arbiter.selected();
 
     if (tid == InvalidThreadID) {
         // all threads are stalled, no need to process
diff --git a/src/cpu/o3/rob.cc b/src/cpu/o3/rob.cc
index d57ea8b0df..6fe0c6467a 100644
--- a/src/cpu/o3/rob.cc
+++ b/src/cpu/o3/rob.cc
@@ -40,6 +40,7 @@
 
 #include "cpu/o3/rob.hh"
 
+#include <algorithm>
 #include <list>
 
 #include "base/logging.hh"
@@ -131,6 +132,7 @@ ROB::allocateGroup_kmhv3(const DynInstPtr inst, ThreadID tid)
 
 ROB::ROB(CPU *_cpu, const BaseO3CPUParams &params)
     : robPolicy(params.smtROBPolicy),
+      borrowingDonorReserveEntries(params.smtBorrowDonorReserveEntries),
       robWalkPolicy(params.robWalkPolicy),
       cpu(_cpu),
       numEntries(params.numROBEntries),
@@ -142,6 +144,10 @@ ROB::ROB(CPU *_cpu, const BaseO3CPUParams &params)
       numThreads(params.numThreads),
       stats(_cpu)
 {
+    for (ThreadID tid = 0; tid < MaxThreads; ++tid) {
+        borrowingDonor[tid] = false;
+    }
+
     //Figure out rob policy
     if (robPolicy == SMTQueuePolicy::Dynamic) {
         //Set Max Entries to Total ROB Capacity
@@ -149,6 +155,14 @@ ROB::ROB(CPU *_cpu, const BaseO3CPUParams &params)
             maxEntries[tid] = numEntries;
         }
 
+    } else if (robPolicy == SMTQueuePolicy::DynamicBorrowing) {
+        DPRINTF(Fetch, "ROB sharing policy set to DynamicBorrowing\n");
+
+        int part_amt = numEntries / numThreads;
+        for (ThreadID tid = 0; tid < numThreads; tid++) {
+            maxEntries[tid] = part_amt;
+        }
+
     } else if (robPolicy == SMTQueuePolicy::Partitioned) {
         DPRINTF(Fetch, "ROB sharing policy set to Partitioned\n");
 
@@ -212,6 +226,7 @@ ROB::resetState()
         squashIt[tid] = instList[tid].end();
         squashedSeqNum[tid] = 0;
         doneSquashing[tid] = true;
+        borrowingDonor[tid] = false;
     }
     numInstsInROB = 0;
 
@@ -262,6 +277,8 @@ ROB::resetEntries()
 
             if (robPolicy == SMTQueuePolicy::Partitioned) {
                 maxEntries[tid] = numEntries / active_threads;
+            } else if (robPolicy == SMTQueuePolicy::DynamicBorrowing) {
+                maxEntries[tid] = numEntries / active_threads;
             } else if (robPolicy == SMTQueuePolicy::Threshold &&
                        active_threads == 1) {
                 maxEntries[tid] = numEntries;
@@ -275,11 +292,95 @@ ROB::entryAmount(ThreadID num_threads)
 {
     if (robPolicy == SMTQueuePolicy::Partitioned) {
         return numEntries / num_threads;
+    } else if (robPolicy == SMTQueuePolicy::DynamicBorrowing) {
+        return numEntries / num_threads;
     } else {
         return 0;
     }
 }
 
+unsigned
+ROB::activeThreadCount() const
+{
+    if (!activeThreads || activeThreads->empty()) {
+        return numThreads == 0 ? 1 : numThreads;  // fallback, never 0
+    }
+    return activeThreads->size();  // size of the activeThreads container
+}
+
+unsigned
+ROB::totalEntries() const
+{
+    unsigned total = 0;  // sum of threadGroups[tid].size() over all tids
+    for (ThreadID tid = 0; tid < numThreads; ++tid) {
+        total += threadGroups[tid].size();
+    }
+    return total;
+}
+
+bool
+ROB::canBorrow(ThreadID tid) const
+{
+    return robPolicy == SMTQueuePolicy::DynamicBorrowing &&
+           tid < numThreads;  // policy gate plus upper-bound check on tid
+}
+
+unsigned
+ROB::borrowingLimit(ThreadID tid) const
+{
+    if (tid >= numThreads) {
+        return 0;
+    }
+    // Policies other than DynamicBorrowing keep the static per-thread cap.
+    if (!canBorrow(tid)) {
+        return maxEntries[tid];
+    }
+    // Equal share per active thread; donors keep only the smaller quota.
+    const unsigned active_threads = std::max(1U, activeThreadCount());
+    const unsigned base = std::max(1U, numEntries / active_threads);
+    const unsigned donor_resume_quota =
+        std::min(base, borrowingDonorReserveEntries);
+    // Sum the headroom still owed to other threads (reserve minus in-use).
+    unsigned reserved = 0;
+    for (ThreadID other = 0; other < numThreads; ++other) {
+        if (other == tid) {
+            continue;
+        }
+        // NOTE(review): inactive threads are counted here too -- confirm.
+        const unsigned reserve =
+            borrowingDonor[other] ? donor_resume_quota : base;
+        const unsigned used = threadGroups[other].size();
+        if (used < reserve) {
+            reserved += reserve - used;
+        }
+    }
+    // Whole ROB is owed to other threads: this thread's limit is zero.
+    if (reserved >= numEntries) {
+        return 0;
+    }
+
+    return numEntries - reserved;
+}
+
+bool
+ROB::canAllocate(ThreadID tid, unsigned entries) const
+{
+    if (tid >= numThreads) {
+        return false;
+    }
+    // NOTE(review): entries is added to a threadGroups size -- confirm units.
+    const unsigned used = threadGroups[tid].size();
+    // Borrowing mode: respect both total ROB capacity and the thread limit.
+    if (robPolicy == SMTQueuePolicy::DynamicBorrowing) {
+        if (totalEntries() + entries > numEntries) {
+            return false;
+        }
+        return used + entries <= borrowingLimit(tid);
+    }
+    // Other policies: compare against the static per-thread maximum.
+    return used + entries <= maxEntries[tid];
+}
+
 int
 ROB::countInsts()
 {
@@ -354,6 +455,7 @@ ROB::insertInst(const DynInstPtr &inst)
     assert(numInstsInROB <= numEntries * instsPerGroup);
 
     ThreadID tid = inst->threadNumber;
+    assert(canAllocate(tid, 1));
 
     // allocate group
     bool alloc = (this->*allocateNewGroup)(inst, tid);
@@ -508,6 +610,15 @@ ROB::getHeadGroupLastDoneSeq(ThreadID tid)
 unsigned
 ROB::numFreeEntries(ThreadID tid)
 {
+    if (robPolicy == SMTQueuePolicy::DynamicBorrowing) {
+        const unsigned limit = borrowingLimit(tid);
+        const unsigned used = threadGroups[tid].size();
+        if (limit <= used) {
+            return 0;
+        }
+        return limit - used;
+    }
+
     return maxEntries[tid] - threadGroups[tid].size();
 }
 
diff --git a/src/cpu/o3/rob.hh b/src/cpu/o3/rob.hh
index 94b93d2593..e5726d0c02 100644
--- a/src/cpu/o3/rob.hh
+++ b/src/cpu/o3/rob.hh
@@ -90,9 +90,15 @@ class ROB
     /** Per-thread ROB status. */
     Status robStatus[MaxThreads];
 
+    /** Whether a thread may donate unused ROB headroom this cycle. */
+    bool borrowingDonor[MaxThreads];
+
     /** ROB resource sharing policy for SMT mode. */
     SMTQueuePolicy robPolicy;
 
+    /** Minimum entries a donor thread keeps for restarting after a stall. */
+    const unsigned borrowingDonorReserveEntries;
+
     ROBWalkPolicy robWalkPolicy;
 
     bool allocateGroup_none(const DynInstPtr inst, ThreadID tid);
@@ -100,6 +106,11 @@ class ROB
     bool allocateGroup_MohBoE(const DynInstPtr inst, ThreadID tid);
     bool allocateGroup_kmhv3(const DynInstPtr inst, ThreadID tid);
 
+    unsigned activeThreadCount() const;
+    unsigned borrowingLimit(ThreadID tid) const;
+    unsigned totalEntries() const;
+    bool canBorrow(ThreadID tid) const;
+
   public:
     /** ROB constructor.
      *  @param _cpu   The cpu object pointer.
@@ -188,7 +199,19 @@ class ROB
 
     /** Returns the maximum number of entries for a specific thread. */
     unsigned getMaxEntries(ThreadID tid)
-    { return maxEntries[tid]; }
+    {
+        if (tid >= numThreads) {
+            return 0;
+        }
+        return canBorrow(tid) ? borrowingLimit(tid) : maxEntries[tid];
+    }
+
+    /** Marks whether the thread may donate unused ROB headroom. */
+    void setBorrowingDonor(ThreadID tid, bool donor)
+    { borrowingDonor[tid] = donor; }
+
+    /** Returns whether the thread can reserve the requested ROB entries. */
+    bool canAllocate(ThreadID tid, unsigned entries) const;
 
     /** Returns the number of entries being used by a specific thread. */
     unsigned getThreadEntries(ThreadID tid)
@@ -197,6 +220,9 @@ class ROB
     /** Returns if the ROB is full. */
     bool isFull()
     {
+      if (robPolicy == SMTQueuePolicy::DynamicBorrowing) {
+        return totalEntries() >= numEntries;
+      }
       for (int i =0;i<MaxThreads;i++) {
         if (isFull(i)) return true;
       }
@@ -205,7 +231,12 @@ class ROB
 
     /** Returns if a specific thread's partition is full. */
     bool isFull(ThreadID tid)
-    { return threadGroups[tid].size() == numEntries; }
+    {
+      if (robPolicy == SMTQueuePolicy::DynamicBorrowing) {
+          return numFreeEntries(tid) == 0;
+      }
+      return threadGroups[tid].size() == numEntries;
+    }
 
     /** Returns if the ROB is empty. */
     bool isEmpty() const

From 11d0ee49f9cf605d7c66a95bd950bca5f97a08dd Mon Sep 17 00:00:00 2001
From: tastynoob <934348725@qq.com>
Date: Tue, 19 May 2026 17:19:54 +0800
Subject: [PATCH 31/38] cpu-o3: Guard empty LSQ head stall checks

Change-Id: I735b9557f8f8d69f094121bb6229bd3ab7682f49
---
 src/cpu/o3/iew.cc | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/src/cpu/o3/iew.cc b/src/cpu/o3/iew.cc
index 491ceda48a..d0ac1cbdb8 100644
--- a/src/cpu/o3/iew.cc
+++ b/src/cpu/o3/iew.cc
@@ -2190,8 +2190,15 @@ IEW::checkDispatchStall(ThreadID tid, int dq_stall, const DynInstPtr &dispatch_i
         if (head_inst->isNonSpeculative()) {
             return StallReason::SerializeStall;
         } else if (head_inst->isLoad() && ldstQueue.lqFull(tid)) {
+            if (ldstQueue.lqEmpty(tid)) {
+                return StallReason::InstNotReady;
+            }
             return checkLSQStall(tid, true);
-        } else if ((head_inst->isStore() || head_inst->isAtomic()) && ldstQueue.sqFull(tid)) {
+        } else if ((head_inst->isStore() || head_inst->isAtomic()) &&
+                   ldstQueue.sqFull(tid)) {
+            if (ldstQueue.sqEmpty(tid)) {
+                return StallReason::InstNotReady;
+            }
             return checkLSQStall(tid, false);
         } else {
             return StallReason::InstNotReady;
@@ -2222,6 +2229,11 @@ IEW::checkDispatchStall(ThreadID tid, int dq_stall, const DynInstPtr &dispatch_i
 StallReason
 IEW::checkLSQStall(ThreadID tid, bool isLoad)
 {
+    if ((isLoad && ldstQueue.lqEmpty(tid)) ||
+        (!isLoad && ldstQueue.sqEmpty(tid))) {
+        return StallReason::InstNotReady;  // no head instruction to inspect
+    }
+
     DynInstPtr head_inst = ldstQueue.getLSQHeadInst(tid, isLoad);
     return checkLoadStoreInst(head_inst);
 }

From 1d2a555f52d0c43a465afcc63185ce5a0210a5c1 Mon Sep 17 00:00:00 2001
From: tastynoob <934348725@qq.com>
Date: Wed, 20 May 2026 10:35:52 +0800
Subject: [PATCH 32/38] arch-riscv: Isolate old TLB privilege by thread

Change-Id: Ie6f7cfc4bd2d2d8f7aad6c9e7bace19534b77c00
---
 src/arch/riscv/tlb.cc | 52 ++++++++++++++++++++++++++++++++++++-------
 src/arch/riscv/tlb.hh | 23 ++++++++++---------
 src/cpu/o3/commit.cc  |  6 ++---
 3 files changed, 58 insertions(+), 23 deletions(-)

diff --git a/src/arch/riscv/tlb.cc b/src/arch/riscv/tlb.cc
index 050e0735b8..0004d18a66 100644
--- a/src/arch/riscv/tlb.cc
+++ b/src/arch/riscv/tlb.cc
@@ -2147,21 +2147,57 @@ TLB::doTranslate(const RequestPtr &req, ThreadContext *tc,
     return NoFault;
 }
 PrivilegeMode
-TLB::getMemPriv(ThreadContext *tc, BaseMMU::Mode mode)
+TLB::currentMemPriv(ThreadContext *tc, BaseMMU::Mode mode)
 {
-    if (use_old_priv && mode != BaseMMU::Execute) {
-        if (mode == BaseMMU::Execute) {
-            return old_priv_ex;
-        } else {
-            return old_priv_ldst;
-        }
-    }
     STATUS status = (STATUS)tc->readMiscReg(MISCREG_STATUS);
     PrivilegeMode pmode = (PrivilegeMode)tc->readMiscReg(MISCREG_PRV);
     if (mode != BaseMMU::Execute && status.mprv == 1)
         pmode = (PrivilegeMode)(RegVal)status.mpp;
     return pmode;
 }
+
+PrivilegeMode
+TLB::getMemPriv(ThreadContext *tc, BaseMMU::Mode mode)
+{
+    if (mode != BaseMMU::Execute) {  // execute accesses skip the snapshot
+        const int tid = tc->threadId();
+        if (tid >= 0) {
+            const auto thread_idx = static_cast<size_t>(tid);
+            if (thread_idx < oldPrivByThread.size() &&
+                oldPrivByThread[thread_idx].valid) {
+                return oldPrivByThread[thread_idx].ldst;
+            }
+        }
+    }
+    return currentMemPriv(tc, mode);  // else use live register state
+}
+
+void
+TLB::setOldPriv(ThreadContext *tc)
+{
+    const int tid = tc->threadId();
+    assert(tid >= 0);
+    const auto thread_idx = static_cast<size_t>(tid);
+    if (oldPrivByThread.size() <= thread_idx) {  // grow table on demand
+        oldPrivByThread.resize(thread_idx + 1);
+    }
+    oldPrivByThread[thread_idx].valid = true;  // snapshot now in effect
+    oldPrivByThread[thread_idx].ldst = currentMemPriv(tc, BaseMMU::Read);
+}
+
+void
+TLB::useNewPriv(ThreadContext *tc)
+{
+    const int tid = tc->threadId();
+    if (tid < 0) {  // unlike setOldPriv, a negative id is tolerated
+        return;
+    }
+    const auto thread_idx = static_cast<size_t>(tid);
+    if (thread_idx < oldPrivByThread.size()) {
+        oldPrivByThread[thread_idx].valid = false;  // drop the snapshot
+    }
+}
+
 bool
 TLB::hasTwoStageTranslation(ThreadContext *tc, const RequestPtr &req, BaseMMU::Mode mode)
 {
diff --git a/src/arch/riscv/tlb.hh b/src/arch/riscv/tlb.hh
index 5b94852e2f..99d57e0c3d 100644
--- a/src/arch/riscv/tlb.hh
+++ b/src/arch/riscv/tlb.hh
@@ -34,6 +34,7 @@
 
 #include <cstdint>
 #include <list>
+#include <vector>
 
 #include "arch/generic/tlb.hh"
 #include "arch/riscv/isa.hh"
@@ -42,6 +43,7 @@
 #include "arch/riscv/regs/misc.hh"
 #include "arch/riscv/utility.hh"
 #include "base/statistics.hh"
+#include "base/types.hh"
 #include "mem/request.hh"
 #include "params/RiscvTLB.hh"
 #include "sim/sim_object.hh"
@@ -107,9 +109,13 @@ class TLB : public BaseTLB
     uint64_t lastPc;
     uint64_t traceFlag;
 
-    bool use_old_priv;
-    PrivilegeMode old_priv_ldst;
-    PrivilegeMode old_priv_ex;
+    struct OldPrivState  // one thread's saved load/store privilege
+    {
+        bool valid = false;  // true between setOldPriv() and useNewPriv()
+        PrivilegeMode ldst = PrivilegeMode::PRV_M;  // meaningful while valid
+    };
+
+    std::vector<OldPrivState> oldPrivByThread;  // indexed by tc->threadId()
 
     Walker *walker;
 
@@ -253,6 +259,7 @@ class TLB : public BaseTLB
                               BaseMMU::Translation *translation, BaseMMU::Mode mode) override;
     Fault finalizePhysical(const RequestPtr &req, ThreadContext *tc,
                            BaseMMU::Mode mode) const override;
+    PrivilegeMode currentMemPriv(ThreadContext *tc, BaseMMU::Mode mode);
     TlbEntry *lookup(Addr vpn, uint16_t asid, BaseMMU::Mode mode, bool hidden,
                      bool sign_used, uint8_t translateMode,
                      bool is_prefetch = false);
@@ -262,14 +269,8 @@ class TLB : public BaseTLB
     TlbEntry *lookupL2TLB(Addr vpn, uint16_t asid, BaseMMU::Mode mode, bool hidden, int f_level, bool sign_used,
                           uint8_t translateMode);
 
-    void setOldPriv(ThreadContext *tc) {
-      use_old_priv = true;
-      old_priv_ex = getMemPriv(tc, BaseMMU::Execute);
-      old_priv_ldst = getMemPriv(tc, BaseMMU::Read);
-    }
-    void useNewPriv(ThreadContext *tc) {
-      use_old_priv = false;
-    }
+    void setOldPriv(ThreadContext *tc);
+    void useNewPriv(ThreadContext *tc);
 
 
     std::vector<TlbEntry> tlbL2L3;  // our TLB
diff --git a/src/cpu/o3/commit.cc b/src/cpu/o3/commit.cc
index bfbe23550f..82884044de 100644
--- a/src/cpu/o3/commit.cc
+++ b/src/cpu/o3/commit.cc
@@ -964,9 +964,8 @@ Commit::handleInterrupt()
             cpu->difftestRaiseIntr(cpu->getInterruptsNO() | (1ULL << 63));
         }
         traceLogHandleInterrupt();
-        cpu->processInterrupts(cpu->getInterrupts());
-
         cpu->mmu->setOldPriv(cpu->getContext(0));
+        cpu->processInterrupts(cpu->getInterrupts());
 
         thread[0]->noSquashFromTC = false;
 
@@ -1789,12 +1788,11 @@ Commit::commitHead(const DynInstPtr &head_inst, unsigned inst_num)
         // needed to update the state as soon as possible.  This
         // prevents external agents from changing any specific state
         // that the trap need.
+        cpu->mmu->setOldPriv(cpu->getContext(tid));
         cpu->trap(inst_fault, tid,
                   head_inst->notAnInst() ? nullStaticInstPtr :
                       head_inst->staticInst);
 
-        cpu->mmu->setOldPriv(cpu->getContext(tid));
-
         // Exit state update mode to avoid accidental updating.
         thread[tid]->noSquashFromTC = false;
 

From 43e1cc7338d35f7f7fc95f125734b00bf3ecf965 Mon Sep 17 00:00:00 2001
From: tastynoob <934348725@qq.com>
Date: Thu, 21 May 2026 11:16:15 +0800
Subject: [PATCH 33/38] cpu-o3: Fix SMT AMO difftest snapshot

Change-Id: Iccef736835125a5ef35efb06c3eeced41f189b6c
---
 src/cpu/o3/lsq_unit.cc | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/src/cpu/o3/lsq_unit.cc b/src/cpu/o3/lsq_unit.cc
index c0e9283c55..be065323b9 100644
--- a/src/cpu/o3/lsq_unit.cc
+++ b/src/cpu/o3/lsq_unit.cc
@@ -367,19 +367,16 @@ LSQUnit::completeDataAccess(PacketPtr pkt)
                 assert(size == inst->effSize);
 
                 if (inst->isAtomic()) {
-                    uint8_t current_golden[8] = {};
-                    panic_if(size > sizeof(current_golden),
-                             "Unexpected AMO size %u at addr %#lx\n",
+                    panic_if(size > sizeof(uint64_t),
+                             "Unexpected AMO size %zu at addr %#lx\n",
                              size, addr);
-                    cpu->goldenMemManager()->readGoldenMem(addr, current_golden,
-                                                           size);
-
                     // Preserve the DUT-observed old value until completeStore()
-                    // derives the post-AMO memory image. The golden old-value
-                    // snapshot used by difftest is captured when the request
-                    // is first sent, before later concurrent updates can
-                    // advance shared memory.
+                    // derives the post-AMO memory image. Keep the actual
+                    // response value for difftest, since the request may have
+                    // been serialized behind another hart's AMO by the cache.
                     inst->setGolden(loaded_data);
+                    std::memcpy(inst->getAmoOldGoldenValuePtr(), loaded_data,
+                                size);
                 } else {
                     // check data with golden mem
                     uint8_t *golden_data =
@@ -2933,7 +2930,6 @@ LSQUnit::completeStore(typename StoreQueue::iterator store_idx, bool from_sbuffe
                     paddr, *((uint64_t *)(tmp_data)), 0xff, request->_size);
             cpu->goldenMemManager()->updateGoldenMem(paddr, tmp_data, 0xff,
                                                      request->_size);
-            store_inst->setGolden(tmp_data);
         }
     }
 

From e3751dfa79aae4bf9038363facd4387f1fc4055e Mon Sep 17 00:00:00 2001
From: Mo Haonan <66786667+mhnGitHubz@users.noreply.github.com>
Date: Thu, 21 May 2026 11:27:23 +0800
Subject: [PATCH 34/38] cpu-o3: reserve a store buffer resource for each thread
 to prevent deadlock (#858)

Co-authored-by: mo haonan <mohaonan@node023.bosccluster.com>
---
 src/cpu/o3/lsq.cc      | 24 +++++++++++++++++++++++-
 src/cpu/o3/lsq.hh      |  9 +++++++--
 src/cpu/o3/lsq_unit.cc |  8 ++++----
 3 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/src/cpu/o3/lsq.cc b/src/cpu/o3/lsq.cc
index eb6bdb4ce6..1877412495 100644
--- a/src/cpu/o3/lsq.cc
+++ b/src/cpu/o3/lsq.cc
@@ -245,6 +245,7 @@ LSQ::StoreBuffer::setData(std::vector<StoreBufferEntry *> &data_vec)
     this->data_vec = data_vec;
     int way = data_vec.size();
     _size = 0;
+    max_size = way;
     lru_index.set_capacity(way);
     free_list.set_capacity(way);
     crossRef.resize(way);
@@ -255,12 +256,26 @@ LSQ::StoreBuffer::setData(std::vector<StoreBufferEntry *> &data_vec)
     }
 }
 
+void
+LSQ::StoreBuffer::setMaxThread(ThreadID _max_thread)
+{   // Record the thread count and size the per-thread entry counters.
+    max_thread = _max_thread;
+    vld_cnt_vec.resize(max_thread, 0);  // newly added slots start at 0
+}
+
 bool
 LSQ::StoreBuffer::full() const
 {
     return free_list.size() == 0;
 }
 
+bool
+LSQ::StoreBuffer::full(ThreadID tid) const
+{   // Cap per thread at max_size - (max_thread - 1); >= tolerates overshoot.
+    assert(tid >= 0 && tid < max_thread && vld_cnt_vec[tid] <= max_size);
+    return vld_cnt_vec[tid] >= (max_size - max_thread + 1);
+}
+
 uint64_t
 LSQ::StoreBuffer::size() const
 {
@@ -326,6 +341,8 @@ LSQ::StoreBuffer::insert(StoreBufferEntry *entry)
     assert(!data_vld[index]);
     assert(!lru_index.full());
     _size++;
+    vld_cnt_vec[tid]++;
+    assert(vld_cnt_vec[tid] <= max_size);
     auto [it, _] = data_map.insert({hashKey(tid, addr), data_vec[index]});
     crossRef[index] = it;
     data_vld[index] = true;
@@ -411,6 +428,9 @@ LSQ::StoreBuffer::createVice(StoreBufferEntry *entry)
     assert(!entry->vice);
     entry->vice = vice;
     data_vld[vice->index] = true;
+    assert(entry->tid < max_thread);
+    vld_cnt_vec[entry->tid]++;
+    assert(vld_cnt_vec[entry->tid] <= max_size);
     // do not insert map and lru_index
     return vice;
 }
@@ -420,6 +440,8 @@ LSQ::StoreBuffer::release(StoreBufferEntry *entry)
 {
     assert(_size > 0);
     _size--;
+    vld_cnt_vec[entry->tid]--;
+    assert(vld_cnt_vec[entry->tid] >= 0);
     int index = entry->index;
     data_vld[index] = false;
     data_map.erase(crossRef[index]);
@@ -563,7 +585,7 @@ LSQ::LSQ(CPU *cpu_ptr, IEW *iew_ptr, const BaseO3CPUParams &params)
         store_buffer_entries.push_back(new StoreBufferEntry(cpu->cacheLineSize(), i));
     }
     storeBuffer.setData(store_buffer_entries);
-
+    storeBuffer.setMaxThread(numThreads);
     bankOccupied.resize(dcacheSetDivNum, std::vector<bool>(numBank, false));
     pendingDcacheRefill.resize(dcacheSetDivNum, false);
     dcacheRefillDataRead.resize(dcacheSetDivNum, 0);
diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh
index 397f372c26..9371d49987 100644
--- a/src/cpu/o3/lsq.hh
+++ b/src/cpu/o3/lsq.hh
@@ -189,12 +189,15 @@ class LSQ
 
         // key = (paddr & cacheblockmask)
         uint64_t _size = 0;
+        int max_size = 0;
+        int max_thread = 0;
         std::unordered_map<uint64_t, StoreBufferEntry *> data_map;
         std::vector<mapIter> crossRef;
         boost::circular_buffer<int> lru_index;
         boost::circular_buffer<int> free_list;
         std::vector<StoreBufferEntry *> data_vec;
         std::vector<bool> data_vld;
+        std::vector<int> vld_cnt_vec;
 
         uint64_t hashKey(ThreadID tid, Addr block_paddr) const
         {
@@ -204,8 +207,10 @@ class LSQ
 
       public:
         void setData(std::vector<StoreBufferEntry *> &data_vec);
-        bool full() const;
-        uint64_t size() const;
+        void setMaxThread(ThreadID max_thread);  // sizes per-thread counters
+        bool full() const;  // free list is empty
+        bool full(ThreadID tid) const;  // tid reached its per-thread cap
+        uint64_t size() const;
         uint64_t size(ThreadID tid) const;
         uint64_t size(ThreadID tid, InstSeqNum seq_num) const;
         uint64_t unsentSize() const;
diff --git a/src/cpu/o3/lsq_unit.cc b/src/cpu/o3/lsq_unit.cc
index be065323b9..7c7074090f 100644
--- a/src/cpu/o3/lsq_unit.cc
+++ b/src/cpu/o3/lsq_unit.cc
@@ -2472,8 +2472,8 @@ LSQUnit::insertStoreBuffer(Addr vaddr, Addr paddr, uint8_t* datas,
                         blockPaddr, paddr);
             } else {
                 // create vice for sending entry
-                if (storeBuffer.full()) {
-                    DPRINTF(StoreBuffer, "Insert %#x failed due to sbuffer full\n", paddr);
+                if (storeBuffer.full(lsqID) || storeBuffer.full()) {
+                    DPRINTF(StoreBuffer, "[tid:%u] Insert %#x failed due to sbuffer full\n", lsqID, paddr);
                     stats.sbufferFull++;
                     return false;
                 }
@@ -2498,10 +2498,10 @@ LSQUnit::insertStoreBuffer(Addr vaddr, Addr paddr, uint8_t* datas,
         }
     } else {
         // create new entry
-        if (storeBuffer.full()) {
+        if (storeBuffer.full(lsqID) || storeBuffer.full()) {
             stats.sbufferFull++;
             // lsq->nextStoreBufferInsertTid = lsqID;
-            DPRINTF(StoreBuffer, "Insert %#x failed due to sbuffer full\n", paddr);
+            DPRINTF(StoreBuffer, "[tid:%u] Insert %#x failed due to sbuffer full\n", lsqID, paddr);
             return false;
         }
         // insert

From cba429499431bcc837242eea8912af029b5e7a7f Mon Sep 17 00:00:00 2001
From: tastynoob <934348725@qq.com>
Date: Thu, 21 May 2026 14:08:38 +0800
Subject: [PATCH 35/38] mem: Avoid materializing zero pages on zstd restore

Change-Id: I615bad73ac28ceea5e51bd1f594f94acda1f7705
---
 src/mem/physical.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mem/physical.cc b/src/mem/physical.cc
index 8b4df199bf..02d4bae07c 100644
--- a/src/mem/physical.cc
+++ b/src/mem/physical.cc
@@ -851,9 +851,9 @@ PhysicalMemory::unserializeFromZstd(std::string filepath, unsigned store_id, lon
             }
 
             for (uint64_t x = 0; x < output.pos; x += sizeof(long)) {
-                pmem_current = (uint64_t*)(pmem + total_write_size + x);
                 uint64_t read_data = *(decompress_file_buffer + x / sizeof(long));
-                if (read_data != 0 || *pmem_current != 0) {
+                if (read_data != 0) {
+                    pmem_current = (uint64_t*)(pmem + total_write_size + x);
                     *pmem_current = read_data;
                     non_zero_dword++;
                 }

From 8d75755582a8a08b6567d42ce018795f3bfd0d1c Mon Sep 17 00:00:00 2001
From: tastynoob <934348725@qq.com>
Date: Mon, 25 May 2026 15:13:26 +0800
Subject: [PATCH 36/38] cpu-o3: Fix SMT decode stallbuffer backpressure

Change-Id: I50ab39ce30eebfb6d129c2c0fafd8f855b536730
---
 src/cpu/o3/decode.cc | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/cpu/o3/decode.cc b/src/cpu/o3/decode.cc
index d24e1d1efc..b53e6917a4 100644
--- a/src/cpu/o3/decode.cc
+++ b/src/cpu/o3/decode.cc
@@ -90,8 +90,7 @@ Decode::Decode(CPU *_cpu, const BaseO3CPUParams &params)
     // This buffer preserves the fetch->decode pipeline contents when decode
     // stalls while TimeBuffer keeps advancing. Its depth matches the original
     // forward pipeline window; fetch is backpressured before full to absorb
-    // both the decode->fetch feedback delay and the request already issued in
-    // the current cycle before decode computes backpressure.
+    // the fetch groups already in that window.
     const auto stallGroupDepth = fetchToDecodeDelay + 1;
     stallBuffer = boost::circular_buffer<DynInstPtr>(
         decodeWidth * stallGroupDepth);
@@ -496,7 +495,7 @@ Decode::tick()
     };
     const bool fifoBackpressured =
         !stallBuffer.empty() &&
-        eachstallSize.size() + decodeToFetchDelay + 1 >=
+        eachstallSize.size() + fetchToDecodeDelay >=
             eachstallSize.capacity();
     const ThreadID fifoHeadTid =
         !stallBuffer.empty() ? stallBuffer.front()->threadNumber : InvalidThreadID;

From 446710fedce26caa140f092bd0007dc43a9208ff Mon Sep 17 00:00:00 2001
From: tastynoob <934348725@qq.com>
Date: Mon, 25 May 2026 17:56:42 +0800
Subject: [PATCH 37/38] cpu: Fix BTBTAGE unit test history update call

Change-Id: If3999d5c26bf325cfb052b100456e759388c2330
---
 src/cpu/pred/btb/test/btb_tage.test.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/cpu/pred/btb/test/btb_tage.test.cc b/src/cpu/pred/btb/test/btb_tage.test.cc
index e945065e9f..56c67c28ad 100644
--- a/src/cpu/pred/btb/test/btb_tage.test.cc
+++ b/src/cpu/pred/btb/test/btb_tage.test.cc
@@ -393,12 +393,12 @@ TEST_F(BTBTAGETest, GlobalHistoryModeUpdate) {
     BTBTAGE ghrTage(4, 2, 1024, 4, false);
     boost::dynamic_bitset<> ghr(64, false);
 
-    ghrTage.doUpdateHist(ghr, 1, true, 0, 0);
+    ghrTage.doUpdateHist(ghr, 1, true, 0, 0, 0);
     applyOutcomeHistory(ghr, 1, true);
     ghrTage.checkFoldedHist(ghr, "ghr taken update");
 
     boost::dynamic_bitset<> before_not_taken = ghr;
-    ghrTage.doUpdateHist(ghr, 1, false, 0, 0);
+    ghrTage.doUpdateHist(ghr, 1, false, 0, 0, 0);
     applyOutcomeHistory(ghr, 1, false);
     ghrTage.checkFoldedHist(ghr, "ghr not-taken update");
 

From 6ac3abca4ec515877254bf9940a5e1d84eede225 Mon Sep 17 00:00:00 2001
From: tastynoob <934348725@qq.com>
Date: Tue, 26 May 2026 14:43:33 +0800
Subject: [PATCH 38/38] Revert "arch-riscv: fix agnostic vector load fill"

This reverts commit 40bf365d1d262123eb1328740099996f3ff4ebd2.
---
 .../riscv/isa/vector/base/vector_mem.temp.isa | 28 -------------------
 .../isa/vector/simple/vector_mem.temp.isa     | 28 -------------------
 2 files changed, 56 deletions(-)

diff --git a/src/arch/riscv/isa/vector/base/vector_mem.temp.isa b/src/arch/riscv/isa/vector/base/vector_mem.temp.isa
index 2448a9ad95..e97eef0940 100644
--- a/src/arch/riscv/isa/vector/base/vector_mem.temp.isa
+++ b/src/arch/riscv/isa/vector/base/vector_mem.temp.isa
@@ -1,24 +1,5 @@
 output header {{
 
-#define FILL_AGNOSTIC_ELEM(Vd_bytes, elem_idx, eewb_) \
-    std::fill_n((Vd_bytes) + (elem_idx) * (eewb_), (eewb_), 0xff)
-
-#define APPLY_VLOAD_AGNOSTIC(Vd_bytes, elem_num_per_vreg_, eewb_)            \
-    do {                                                                     \
-        for (size_t _i = 0; _i < vmi.re - vmi.rs; ++_i) {                    \
-            const uint32_t _vdElemIdx =                                      \
-                (vmi.rs % (elem_num_per_vreg_)) + _i;                        \
-            const size_t _ei = _i + vmi.rs;                                  \
-            const bool _is_tail = _ei >= rVl;                                \
-            const bool _is_masked = !this->vm && !_is_tail &&                \
-                !elem_mask(v0, _ei);                                         \
-            if ((_is_tail && machInst.vtype8.vta) ||                         \
-                (_is_masked && machInst.vtype8.vma)) {                       \
-                FILL_AGNOSTIC_ELEM((Vd_bytes), _vdElemIdx, (eewb_));         \
-            }                                                                \
-        }                                                                    \
-    } while (0)
-
 inline uint32_t
 calc_memsize(uint32_t rs, uint32_t re, uint32_t sew, uint32_t rVl) {
     uint32_t vend = std::min(rVl, re);
@@ -166,7 +147,6 @@ Fault
 {
     %(op_decl)s;
     %(op_rd)s;
-    auto VdBytes = tmp_d0.as<uint8_t>();
 
     Addr EA;
     // EA = Rs1 + vmi.offset;
@@ -192,8 +172,6 @@ Fault
         %(memacc_code)s;
     }
 
-    APPLY_VLOAD_AGNOSTIC(VdBytes, elem_num_per_vreg, eew / 8);
-
     %(op_wb)s;
     return fault;
 }
@@ -283,7 +261,6 @@ Fault
 
     %(op_decl)s;
     %(op_rd)s;
-    auto VdBytes = tmp_d0.as<uint8_t>();
 
 #if %(is_vecWhole)s
     // VM_REQUIRED();
@@ -322,11 +299,6 @@ Fault
         }
     }
 
-#if %(is_vecWhole)s
-#else
-    APPLY_VLOAD_AGNOSTIC(VdBytes, elem_num_per_vreg, eewb);
-#endif
-
     %(vfof_get_code)s;
     %(op_wb)s;
     return NoFault;
diff --git a/src/arch/riscv/isa/vector/simple/vector_mem.temp.isa b/src/arch/riscv/isa/vector/simple/vector_mem.temp.isa
index 4b64f5dac0..a8e5b71f99 100644
--- a/src/arch/riscv/isa/vector/simple/vector_mem.temp.isa
+++ b/src/arch/riscv/isa/vector/simple/vector_mem.temp.isa
@@ -1,24 +1,5 @@
 output header {{
 
-#define FILL_AGNOSTIC_ELEM(Vd_bytes, elem_idx, eewb_) \
-    std::fill_n((Vd_bytes) + (elem_idx) * (eewb_), (eewb_), 0xff)
-
-#define APPLY_VLOAD_AGNOSTIC(Vd_bytes, elem_num_per_vreg_, eewb_)            \
-    do {                                                                     \
-        for (size_t _i = 0; _i < vmi.re - vmi.rs; ++_i) {                    \
-            const uint32_t _vdElemIdx =                                      \
-                (vmi.rs % (elem_num_per_vreg_)) + _i;                        \
-            const size_t _ei = _i + vmi.rs;                                  \
-            const bool _is_tail = _ei >= rVl;                                \
-            const bool _is_masked = !this->vm && !_is_tail &&                \
-                !elem_mask(v0, _ei);                                         \
-            if ((_is_tail && machInst.vtype8.vta) ||                         \
-                (_is_masked && machInst.vtype8.vma)) {                       \
-                FILL_AGNOSTIC_ELEM((Vd_bytes), _vdElemIdx, (eewb_));         \
-            }                                                                \
-        }                                                                    \
-    } while (0)
-
 inline uint32_t
 calc_memsize(uint32_t rs, uint32_t re, uint32_t sew, uint32_t rVl) {
     uint32_t vend = std::min(rVl, re);
@@ -166,7 +147,6 @@ Fault
 {
     %(op_decl)s;
     %(op_rd)s;
-    auto VdBytes = tmp_d0.as<uint8_t>();
 
     Addr EA;
     // EA = Rs1 + vmi.offset;
@@ -192,8 +172,6 @@ Fault
         %(memacc_code)s;
     }
 
-    APPLY_VLOAD_AGNOSTIC(VdBytes, elem_num_per_vreg, eew / 8);
-
     %(op_wb)s;
     return fault;
 }
@@ -283,7 +261,6 @@ Fault
 
     %(op_decl)s;
     %(op_rd)s;
-    auto VdBytes = tmp_d0.as<uint8_t>();
 
 #if %(is_vecWhole)s
     // VM_REQUIRED();
@@ -322,11 +299,6 @@ Fault
         }
     }
 
-#if %(is_vecWhole)s
-#else
-    APPLY_VLOAD_AGNOSTIC(VdBytes, elem_num_per_vreg, eewb);
-#endif
-
     %(vfof_get_code)s;
     %(op_wb)s;
     return NoFault;