From 1ab1a92584f6797a8286f910ca797a66df4f30dd Mon Sep 17 00:00:00 2001 From: mhnGitHubz <66786667+mhnGitHubz@users.noreply.github.com> Date: Mon, 16 Mar 2026 10:38:31 +0800 Subject: [PATCH 01/38] add frontend and backend smt (#791) Co-authored-by: mohaonan --- src/cpu/o3/FuncScheduler.py | 7 +- src/cpu/o3/SConscript | 2 +- src/cpu/o3/comm.hh | 10 ++ src/cpu/o3/fetch.cc | 65 ++++++++++++- src/cpu/o3/fetch.hh | 19 ++++ src/cpu/o3/iew.cc | 5 +- src/cpu/o3/inst_queue.cc | 9 +- src/cpu/o3/inst_queue.hh | 1 + src/cpu/o3/issue_queue.cc | 177 ++++++++++++++++++++++++++++++------ src/cpu/o3/issue_queue.hh | 41 ++++++++- src/cpu/o3/smt_sched.hh | 26 +++++- 11 files changed, 323 insertions(+), 39 deletions(-) diff --git a/src/cpu/o3/FuncScheduler.py b/src/cpu/o3/FuncScheduler.py index 2d088a6032..7676f6d643 100644 --- a/src/cpu/o3/FuncScheduler.py +++ b/src/cpu/o3/FuncScheduler.py @@ -75,6 +75,11 @@ class PAgeSelector(BaseSelector): piece = Param.Int(2, "number of instructions in a group") +class SMTBasedSelector(BaseSelector): + type = 'SMTBasedSelector' + cxx_class = 'gem5::o3::SMTBasedSelector' + cxx_header = "cpu/o3/issue_queue.hh" + class IssueQue(SimObject): type = 'IssueQue' cxx_class = 'gem5::o3::IssueQue' @@ -85,7 +90,7 @@ class IssueQue(SimObject): inports = Param.Int(2, "") scheduleToExecDelay = Param.Cycles(2, "") oports = VectorParam.IssuePort("") - sel = Param.BaseSelector(BaseSelector(), "Selector for this IQ (default: age first)") + sel = Param.BaseSelector(SMTBasedSelector(), "Selector for this IQ (default: age first)") class Scheduler(SimObject): type = 'Scheduler' diff --git a/src/cpu/o3/SConscript b/src/cpu/o3/SConscript index 1ee4cf9448..463a8cdfc0 100755 --- a/src/cpu/o3/SConscript +++ b/src/cpu/o3/SConscript @@ -32,7 +32,7 @@ Import('*') if env['CONF']['TARGET_ISA'] != 'null': SimObject('FuncScheduler.py', sim_objects=['FUPool', 'SpecWakeupChannel', - 'IssuePort', 'IssueQue', 'BaseSelector', 'PAgeSelector', 'Scheduler']) + 'IssuePort', 'IssueQue', 
'BaseSelector', 'PAgeSelector', 'SMTBasedSelector', 'Scheduler']) SimObject('FuncUnitConfig.py', sim_objects=[]) SimObject('BaseO3CPU.py', sim_objects=['BaseO3CPU'], enums=[ 'SMTFetchPolicy', 'SMTQueuePolicy', 'CommitPolicy', 'ROBWalkPolicy', 'ROBCompressPolicy', 'PerfRecord']) diff --git a/src/cpu/o3/comm.hh b/src/cpu/o3/comm.hh index cb88ad769f..f15257426f 100644 --- a/src/cpu/o3/comm.hh +++ b/src/cpu/o3/comm.hh @@ -168,6 +168,12 @@ struct IssueStruct DynInstPtr insts[MaxWidth]; }; +struct SquashInfo +{ + InstSeqNum squashSn; + ThreadID squashTid; +}; + struct SquashVersion { uint8_t version; @@ -246,6 +252,10 @@ struct TimeStruct }; /** Resolved control-flow PCs produced this cycle (fetch buffers/merges). */ std::vector resolvedCFIs; // *F + + unsigned iqCount; + unsigned ldstqCount; + unsigned robCount; }; IewComm iewInfo[MaxThreads]; // iew to rename, fetch diff --git a/src/cpu/o3/fetch.cc b/src/cpu/o3/fetch.cc index 21c9cec4e6..3c00c5937d 100644 --- a/src/cpu/o3/fetch.cc +++ b/src/cpu/o3/fetch.cc @@ -148,6 +148,8 @@ Fetch::Fetch(CPU *_cpu, const BaseO3CPUParams ¶ms) threads[tid].data = new uint8_t[fetchBufferSize]; } + initDecodeScheduler(); + // Get the size of an instruction. 
// stallReason size should be the same as decodeWidth,renameWidth,dispWidth stallReason.resize(decodeWidth, StallReason::NoStall); @@ -372,6 +374,41 @@ Fetch::setTimeBuffer(TimeBuffer *time_buffer) fromCommit = timeBuffer->getWire(-commitToFetchDelay); } +void +Fetch::initDecodeScheduler() +{ + // Initialize counters (same as before) + lsqCounter = new InstsCounter(); + iqCounter = new InstsCounter(); + robCounter = new InstsCounter(); + DPRINTF(Fetch, "Initialized SMT Decode Scheduler: 0\n"); + + for (ThreadID tid = 0; tid < numThreads; tid++) + { + lsqCounter->setCounter(tid, 0); + iqCounter->setCounter(tid, 0); + robCounter->setCounter(tid, 0); + } + DPRINTF(Fetch, "Initialized SMT Decode Scheduler: 1\n"); + + if (smtDecodePolicy == "icount") { + // Use ROB as default counter for icount + decodeScheduler = new ICountScheduler(numThreads, robCounter); + } + else if (smtDecodePolicy == "delayed") { + decodeScheduler = new DelayedICountScheduler(numThreads, robCounter, delayedSchedulerDelay); + } + else if (smtDecodePolicy == "multi_priority") { + decodeScheduler = new MultiPrioritySched(numThreads, {lsqCounter, iqCounter, robCounter}); + } + else { + // Default: round-robin like (use delayed with thread cycling) + decodeScheduler = new DelayedICountScheduler(numThreads, robCounter, numThreads); + } + + DPRINTF(Fetch, "Initialized SMT Decode Scheduler: %s\n", smtDecodePolicy.c_str()); +} + void Fetch::setActiveThreads(std::list *at_ptr) { @@ -1285,6 +1322,32 @@ Fetch::handleInterrupts() } } +ThreadID +Fetch::selectUnstalledThread() +{ + + // if (numThreads == 1) { + // return 0; + // } + for (ThreadID tid = 0; tid < numThreads; ++tid) { + if (!stallSig->blockFetch[tid]) { + lsqCounter->setCounter(tid, fromIEW->iewInfo[tid].ldstqCount); + iqCounter->setCounter(tid, fromIEW->iewInfo[tid].iqCount); + robCounter->setCounter(tid, fromIEW->iewInfo[tid].robCount); + + } else { + lsqCounter->setCounter(tid, UINT64_MAX); + iqCounter->setCounter(tid, UINT64_MAX); + 
robCounter->setCounter(tid, UINT64_MAX); + + } + DPRINTF(Fetch, "lsqCounter->setCounter: %d iqCounter->setCounter: %d robCounter->setCounter: %d\n",fromIEW->iewInfo[tid].ldstqCount,fromIEW->iewInfo[tid].iqCount,fromIEW->iewInfo[tid].robCount); + } + + ThreadID selected = decodeScheduler->getThread(); + return selected; +} + void Fetch::sendInstructionsToDecode() { @@ -1321,7 +1384,7 @@ Fetch::sendInstructionsToDecode() return; } - ThreadID tid = 0; // TODO: smt support + ThreadID tid =selectUnstalledThread(); // fetch totally stalled if (stallSig->blockFetch[tid]) { diff --git a/src/cpu/o3/fetch.hh b/src/cpu/o3/fetch.hh index 19091ef30e..6e114487cf 100644 --- a/src/cpu/o3/fetch.hh +++ b/src/cpu/o3/fetch.hh @@ -65,6 +65,7 @@ #include "mem/port.hh" #include "sim/eventq.hh" #include "sim/probe/probe.hh" +#include "cpu/o3/smt_sched.hh" namespace gem5 { @@ -233,6 +234,18 @@ class Fetch /** To probe when a fetch request is successfully sent. */ ProbePointArg *ppFetchRequestSent; + // SMT Decode Scheduler + SMTScheduler* decodeScheduler; + + // Counters from backend structures (to be passed in) + InstsCounter* lsqCounter; + InstsCounter* iqCounter; + InstsCounter* robCounter; + + // Configuration parameters + std::string smtDecodePolicy ="multi_priority"; + int delayedSchedulerDelay; + public: /** Fetch constructor. 
*/ Fetch(CPU *_cpu, const BaseO3CPUParams ¶ms); @@ -299,6 +312,12 @@ class Fetch /** For priority-based fetch policies, need to keep update priorityList */ void deactivateThread(ThreadID tid); + + // Function to initialize scheduler + void initDecodeScheduler(); + + // Select a thread that is not fetch-blocked, using scheduler + ThreadID selectUnstalledThread(); private: /** Reset this pipeline stage */ void resetStage(); diff --git a/src/cpu/o3/iew.cc b/src/cpu/o3/iew.cc index 81c261bc40..d9c815b86c 100644 --- a/src/cpu/o3/iew.cc +++ b/src/cpu/o3/iew.cc @@ -528,7 +528,7 @@ IEW::squash(ThreadID tid) for (auto& dp : dispQue) { for (auto& it : dp) { - if (it->seqNum > fromCommit->commitInfo[tid].doneSeqNum) { + if (it->seqNum > fromCommit->commitInfo[tid].doneSeqNum && (it->threadNumber == tid)) { it->setSquashed(); } } @@ -1556,6 +1556,9 @@ IEW::executeInsts() ThreadID tid = *activeThreads->begin(); toFetch->iewInfo[tid].resolvedCFIs.clear(); + toFetch->iewInfo[tid].ldstqCount=ldstQueue.getCount(tid); + toFetch->iewInfo[tid].robCount= rob->getThreadEntries(tid); + toFetch->iewInfo[tid].iqCount= scheduler->getIQInsts(tid); // Execute/writeback any instructions that are available. 
int insts_to_execute = fromIssue->size; fromIssue->size = 0; diff --git a/src/cpu/o3/inst_queue.cc b/src/cpu/o3/inst_queue.cc index 89a027c3b1..29573959cf 100644 --- a/src/cpu/o3/inst_queue.cc +++ b/src/cpu/o3/inst_queue.cc @@ -151,7 +151,8 @@ InstructionQueue::InstructionQueue(CPU *cpu_ptr, IEW *iew_ptr, scheduler->setCPU(cpu_ptr, &iew_ptr->ldstQueue); scheduler->resetDepGraph(numPhysRegs); scheduler->setMemDepUnit(memDepUnit); - + scheduler->initIQICountSmtScheduler(numThreads); + resetState(); } @@ -1121,7 +1122,9 @@ InstructionQueue::doSquash(ThreadID tid) DPRINTF(IQ, "[tid:%i] Squashing until sequence number %i!\n", tid, squashedSeqNum[tid]); - scheduler->doSquash(squashedSeqNum[tid]); + squashInfo.squashTid = tid; + squashInfo.squashSn = squashedSeqNum[tid]; + scheduler->doSquash(squashInfo); for (auto it = mdpAddrReplayLdInsts.begin(); it != mdpAddrReplayLdInsts.end();) { if (!it->inst || @@ -1134,7 +1137,7 @@ InstructionQueue::doSquash(ThreadID tid) } for (auto it = nonSpecInsts.begin(); it != nonSpecInsts.end();) { - if (it->first > squashedSeqNum[tid]) { + if (it->first > squashedSeqNum[tid] && (it->second->threadNumber == tid)) { auto& squashed_inst = it->second; if (!squashed_inst->isIssued() || (squashed_inst->isMemRef() && diff --git a/src/cpu/o3/inst_queue.hh b/src/cpu/o3/inst_queue.hh index db01710da9..f163ebb28e 100644 --- a/src/cpu/o3/inst_queue.hh +++ b/src/cpu/o3/inst_queue.hh @@ -427,6 +427,7 @@ class InstructionQueue /** The sequence number of the squashed instruction. 
*/ InstSeqNum squashedSeqNum[MaxThreads]; + SquashInfo squashInfo; struct IQStats : public statistics::Group { diff --git a/src/cpu/o3/issue_queue.cc b/src/cpu/o3/issue_queue.cc index e1ba93a79a..50fa7a5eb5 100644 --- a/src/cpu/o3/issue_queue.cc +++ b/src/cpu/o3/issue_queue.cc @@ -140,6 +140,58 @@ PAgeSelector::select(ReadyQue::iterator begin, int portid) } } +void +SMTBasedSelector::setparent(Scheduler* scheduler, IssueQue* iq) +{ + BaseSelector::setparent(scheduler, iq); + + smtScheduler = iq->getIndependentIQICountScheduler(); +} + +ReadyQue::iterator +SMTBasedSelector::select(ReadyQue::iterator begin, int portid) +{ + if (begin == end) { + return end; + } + + ThreadID priorityThread = 0; + + if (smtScheduler) { + priorityThread = smtScheduler->getThread(); + + DPRINTF(Schedule, + "SMTBasedSelector: priority thread = %d\n", + priorityThread); + } + + for (auto it = begin; it != end; it++) { + auto& inst = *it; + + if (inst->threadNumber == priorityThread) { + DPRINTF(Schedule, + "[sn:%llu] selected by SMT policy (tid=%d)\n", + inst->seqNum, priorityThread); + return it; + } + } + + + for (auto it = begin; it != end; it++) { + auto& inst = *it; + + if (inst->threadNumber != priorityThread) { + DPRINTF(Schedule, + "[sn:%llu] selected by default (tid=%d, priority=%d)\n", + inst->seqNum, inst->threadNumber, priorityThread); + return it; + } + } + + DPRINTF(Schedule, "SMTBasedSelector: no available instruction\n"); + return begin; +} + bool IssueQue::select_policy::operator()(const DynInstPtr& a, const DynInstPtr& b) const { @@ -312,6 +364,9 @@ IssueQue::IssueQue(const IssueQueParams& params) if (storePipeAcc) numStorePipe++; } + + //Init InstsCounter + instsCounter = new InstsCounter(); } void @@ -361,6 +416,9 @@ IssueQue::addToFu(const DynInstPtr& inst) } inst->setIssued(); POPINST(inst); + if (hasInstsCounter()) { + decInIQInstsCounter(inst->threadNumber); + } scheduler->addToFU(inst); } @@ -513,14 +571,16 @@ IssueQue::wakeUpDependents(const DynInstPtr& inst, bool 
speculative) for (auto& it : depgraph) { int srcIdx = it.first; auto& consumer = it.second; - if (consumer->readySrcIdx(srcIdx)) { - continue; - } - consumer->markSrcRegReady(srcIdx); + if(consumer->threadNumber == inst->threadNumber){ + if (consumer->readySrcIdx(srcIdx)) { + continue; + } + consumer->markSrcRegReady(srcIdx); - DPRINTF(Schedule, "[sn:%llu] src%d was woken\n", consumer->seqNum, srcIdx); - addIfReady(consumer); + DPRINTF(Schedule, "[sn:%llu] src%d was woken\n", consumer->seqNum, srcIdx); + addIfReady(consumer); + } } if (!speculative) { @@ -725,6 +785,9 @@ IssueQue::insert(const DynInstPtr& inst) selector->allocate(inst); inst->issueQue = this; instList.emplace_back(inst); + if (hasInstsCounter()) { + incInIQInstsCounter(inst->threadNumber); + } bool addToDepGraph = false; for (int i = 0; i < inst->numSrcRegs(); i++) { auto src = inst->renamedSrcIdx(i); @@ -780,11 +843,14 @@ IssueQue::doCommit(const InstSeqNum seqNum) } void -IssueQue::doSquash(const InstSeqNum seqNum) +IssueQue::doSquash(SquashInfo squashInfo) { for (auto it = instList.begin(); it != instList.end();) { - if ((*it)->seqNum > seqNum) { + if (((*it)->seqNum > squashInfo.squashSn) && ((*it)->threadNumber == squashInfo.squashTid)) { if (!(*it)->isIssued()) { + if (hasInstsCounter()) { + decInIQInstsCounter((*it)->threadNumber); + } POPINST((*it)); (*it)->setIssued(); } @@ -807,7 +873,7 @@ IssueQue::doSquash(const InstSeqNum seqNum) int size = inflightIssues[-i].size; for (int j = 0; j < size; j++) { auto& inst = inflightIssues[-i].insts[j]; - if (inst && inst->isSquashed()) { + if (inst && inst->isSquashed() && (inst->threadNumber == squashInfo.squashTid)) { inst = nullptr; } } @@ -816,7 +882,7 @@ IssueQue::doSquash(const InstSeqNum seqNum) // clear in depGraph for (auto& entrys : subDepGraph) { for (auto it = entrys.begin(); it != entrys.end();) { - if ((*it).second->isSquashed()) { + if ((*it).second->isSquashed() && ((*it).second->threadNumber == squashInfo.squashTid)) { it = 
entrys.erase(it); } else { it++; @@ -825,6 +891,33 @@ IssueQue::doSquash(const InstSeqNum seqNum) } } +void +IssueQue::incInIQInstsCounter(ThreadID tid) +{ + if (instsCounter) { + instsCounter->incCounter(tid); + } +} + +void +IssueQue::decInIQInstsCounter(ThreadID tid) +{ + if (instsCounter) { + instsCounter->decCounter(tid); + } +} + +void +IssueQue::initIndependentIQICountScheduler(int numThreads) +{ + assert(instsCounter != nullptr && "InstsCounter must be set first"); + + independentIQICountScheduler = new IndependentIQICountScheduler( + numThreads, instsCounter); + + DPRINTF(Schedule, "[%s] IndependentIQICountScheduler created.\n",iqname); +} + Scheduler::SpecWakeupCompletion::SpecWakeupCompletion(const DynInstPtr& inst, IssueQue* to, PendingWakeEventsType* owner) : Event(Stat_Event_Pri, AutoDelete), inst(inst), owner(owner), to_issue_queue(to) @@ -1451,24 +1544,26 @@ Scheduler::loadCancel(const DynInstPtr& inst) for (auto& it : iq->subDepGraph[dst->flatIndex()]) { int srcIdx = it.first; auto& depInst = it.second; - if (depInst->readySrcIdx(srcIdx)) { - DPRINTF(Schedule, "cancel [sn:%llu], clear src p%d ready\n", depInst->seqNum, - depInst->renamedSrcIdx(srcIdx)->flatIndex()); - if (depInst->isIssued()) { - if (inst->vpMisprediction) { - // VP misprediction: consumer may already be in-flight. - // Mark canceled and propagate to its dependents. - depInst->setCancel(); - depInst->clearSrcRegReady(srcIdx); - dfs.push(depInst); - needSquashFallback = true; + if (depInst->threadNumber == inst->threadNumber) { + if (depInst->readySrcIdx(srcIdx)) { + DPRINTF(Schedule, "cancel [sn:%llu], clear src p%d ready\n", depInst->seqNum, + depInst->renamedSrcIdx(srcIdx)->flatIndex()); + if (depInst->isIssued()) { + if (inst->vpMisprediction) { + // VP misprediction: consumer may already be in-flight. + // Mark canceled and propagate to its dependents. 
+ depInst->setCancel(); + depInst->clearSrcRegReady(srcIdx); + dfs.push(depInst); + needSquashFallback = true; + } + continue; } - continue; - } - depInst->issueQue->cancel(depInst); - depInst->clearSrcRegReady(srcIdx); - dfs.push(depInst); + depInst->issueQue->cancel(depInst); + depInst->clearSrcRegReady(srcIdx); + dfs.push(depInst); + } } } } @@ -1591,11 +1686,11 @@ Scheduler::doCommit(const InstSeqNum seqNum) } void -Scheduler::doSquash(const InstSeqNum seqNum) +Scheduler::doSquash(SquashInfo squashInfo) { - DPRINTF(Schedule, "doSquash until seqNum %lu\n", seqNum); + DPRINTF(Schedule, "doSquash until seqNum %lu\n", squashInfo.squashSn); for (auto it : issueQues) { - it->doSquash(seqNum); + it->doSquash(squashInfo); } } @@ -1609,6 +1704,17 @@ Scheduler::getIQInsts() return total; } +uint32_t +Scheduler::getIQInsts(ThreadID tid) +{ + uint32_t total = 0; + for (auto iq : issueQues) { + total += iq->getInstsCounter()->getCounter(tid);; + } + return total; +} + + void Scheduler::setMainRdpOpt(bool enable) { @@ -1617,5 +1723,18 @@ Scheduler::setMainRdpOpt(bool enable) } } +void +Scheduler::initIQICountSmtScheduler(int numThreads) +{ + DPRINTF(Schedule, "Initializing IQ SMT schedulers for %d thread.\n", numThreads); + + // to do: add switch;add SMTSchedulingPolicy + for (auto iq : issueQues) { + InstsCounter* counter = iq->getInstsCounter(); + assert(counter); + iq->initIndependentIQICountScheduler(numThreads); + } +} + } } diff --git a/src/cpu/o3/issue_queue.hh b/src/cpu/o3/issue_queue.hh index 0058bbb8df..b1ab4f361a 100644 --- a/src/cpu/o3/issue_queue.hh +++ b/src/cpu/o3/issue_queue.hh @@ -16,12 +16,14 @@ #include "cpu/inst_seq.hh" #include "cpu/o3/dyn_inst.hh" #include "cpu/o3/dyn_inst_ptr.hh" +#include "cpu/o3/smt_sched.hh" #include "cpu/reg_class.hh" #include "cpu/timebuf.hh" #include "params/BaseSelector.hh" #include "params/IssuePort.hh" #include "params/IssueQue.hh" #include "params/PAgeSelector.hh" +#include "params/SMTBasedSelector.hh" #include 
"params/Scheduler.hh" #include "params/SpecWakeupChannel.hh" #include "sim/sim_object.hh" @@ -99,11 +101,25 @@ class PAgeSelector : public BaseSelector ReadyQue::iterator select(ReadyQue::iterator begin, int portid) override; }; +class SMTBasedSelector : public BaseSelector +{ + private: + IndependentIQICountScheduler* smtScheduler = nullptr; + public: + SMTBasedSelector(const SMTBasedSelectorParams& params) : BaseSelector(params) {} + void setparent(Scheduler* scheduler, IssueQue* iq) override; + void allocate(const DynInstPtr& inst) override { BaseSelector::allocate(inst);} + void deallocate(const DynInstPtr& inst) override { BaseSelector::deallocate(inst);} + ReadyQue::iterator select(ReadyQue::iterator begin, int portid) override; +}; + class IssueQue : public SimObject { friend class Scheduler; friend class BaseSelector; friend class PAgeSelector; + friend class InstsCounter; + friend class IndependentIQICountScheduler; std::string _name; const int inports; @@ -172,6 +188,10 @@ class IssueQue : public SimObject Scheduler* scheduler = nullptr; BaseSelector* selector = nullptr; + //iq smt scheduler + InstsCounter* instsCounter = nullptr; + IndependentIQICountScheduler* independentIQICountScheduler = nullptr; + struct IssueQueStats : public statistics::Group { IssueQueStats(statistics::Group* parent, IssueQue* que, std::string name); @@ -207,6 +227,21 @@ class IssueQue : public SimObject void setMainRdpOpt(bool enable) { enableMainRdpOpt = enable; } void resetDepGraph(int numPhysRegs); + void setInstsCounter(InstsCounter* counter) { instsCounter = counter;} + + InstsCounter* getInstsCounter() const {return instsCounter; } + + void incInIQInstsCounter(ThreadID tid); + void decInIQInstsCounter(ThreadID tid); + bool hasInstsCounter() const { return instsCounter != nullptr; } + + void initIndependentIQICountScheduler(int numThreads); + + void setIndependentIQICountScheduler( IndependentIQICountScheduler* _independentIQICountScheduler ) { + 
independentIQICountScheduler = _independentIQICountScheduler; + } + IndependentIQICountScheduler* getIndependentIQICountScheduler() { return independentIQICountScheduler; } + void tick(); bool ready(); int emptyEntries() const { return iqsize - instNum; } @@ -219,7 +254,7 @@ class IssueQue : public SimObject bool idle(); void doCommit(const InstSeqNum inst); - void doSquash(const InstSeqNum seqNum); + void doSquash(SquashInfo squashInfo); int getIssueStages() { return scheduleToExecDelay; } int getId() { return IQID; } @@ -331,6 +366,7 @@ class Scheduler : public SimObject void setAllScoreBoard(PhysRegIdPtr reg); void setMemDepUnit(MemDepUnit* memDepUnit) { this->memDepUnit = memDepUnit; } void setMainRdpOpt(bool enable); + void initIQICountSmtScheduler(int numThreads); void tick(); void issueAndSelect(); @@ -360,8 +396,9 @@ class Scheduler : public SimObject bool hasReadyInsts(); bool isDrained(); void doCommit(const InstSeqNum seqNum); - void doSquash(const InstSeqNum seqNum); + void doSquash(SquashInfo squashInfo); uint32_t getIQInsts(); + uint32_t getIQInsts(ThreadID tid); SchedulerStats& getStats() { return stats; } }; diff --git a/src/cpu/o3/smt_sched.hh b/src/cpu/o3/smt_sched.hh index d5222e758d..e6b00ab4d8 100644 --- a/src/cpu/o3/smt_sched.hh +++ b/src/cpu/o3/smt_sched.hh @@ -28,6 +28,8 @@ class InstsCounter uint64_t getCounter(ThreadID tid) { return counter[tid]; } void setCounter(ThreadID tid, uint64_t value) { counter[tid] = value; } + void incCounter(ThreadID tid, uint64_t value = 1) { counter[tid] += value; } + void decCounter(ThreadID tid, uint64_t value = 1) { counter[tid] -= value; } }; class SMTScheduler @@ -36,7 +38,8 @@ class SMTScheduler int numThreads; public: SMTScheduler(int numThreads) : numThreads(numThreads) {} - virtual ThreadID getThread(); + virtual ~SMTScheduler() = default; + virtual ThreadID getThread() = 0; }; @@ -124,7 +127,28 @@ class MultiPrioritySched : public SMTScheduler } }; +class IndependentIQICountScheduler : public 
SMTScheduler { +private: + InstsCounter* counter; // Counter for this IQ only +public: + IndependentIQICountScheduler(int numThreads, InstsCounter* counter) + : SMTScheduler(numThreads), counter(counter){} + + ThreadID getThread() override { + ThreadID selectedTid = 0; + uint64_t maxCount = counter->getCounter(0); + + for (ThreadID tid = 1; tid < numThreads; ++tid) { + uint64_t count = counter->getCounter(tid); + if (count > maxCount) { + maxCount = count; + selectedTid = tid; + } + } + return selectedTid; + } +}; }} #endif From 81b9cc0cb6ce8efdfab3a3b2e35cd9a5019290db Mon Sep 17 00:00:00 2001 From: tastynoob <934348725@qq.com> Date: Tue, 17 Mar 2026 14:21:10 +0800 Subject: [PATCH 02/38] cpu-o3: fix smt framework --- configs/common/Options.py | 8 ++-- configs/common/xiangshan.py | 5 +++ src/cpu/o3/comm.hh | 22 +++++++--- src/cpu/o3/commit.cc | 10 +++-- src/cpu/o3/commit.hh | 2 +- src/cpu/o3/cpu.cc | 16 +++---- src/cpu/o3/decode.cc | 7 +-- src/cpu/o3/decode.hh | 2 +- src/cpu/o3/fetch.cc | 66 +++++++++++++++++------------ src/cpu/o3/fetch.hh | 2 +- src/cpu/o3/iew.cc | 12 +++--- src/cpu/o3/iew.hh | 2 +- src/cpu/o3/inst_queue.cc | 2 +- src/cpu/o3/issue_queue.cc | 17 +++++--- src/cpu/o3/issue_queue.hh | 4 +- src/cpu/o3/lsq.cc | 12 ++++++ src/cpu/o3/lsq.hh | 4 ++ src/cpu/o3/rename.cc | 7 +-- src/cpu/o3/rename.hh | 2 +- src/cpu/pred/btb/decoupled_bpred.cc | 35 +++++++++++++-- src/cpu/pred/btb/decoupled_bpred.hh | 5 +-- 21 files changed, 157 insertions(+), 85 deletions(-) diff --git a/configs/common/Options.py b/configs/common/Options.py index 441840b5c8..4c314ddbf9 100644 --- a/configs/common/Options.py +++ b/configs/common/Options.py @@ -349,16 +349,14 @@ def addCommonOptions(parser, configure_xiangshan=False): "that are present under any of the roots. If not given, dump all " "stats. 
") + parser.add_argument("--smt", action="store_true", default=False, + help=""" RISCV SMT support, which requires multitThread-supported gcpt restore and diff-ref-so""") + if configure_xiangshan: return # Following options are not available in XiangShan parser.add_argument("--checker", action="store_true") - parser.add_argument("--smt", action="store_true", default=False, - help=""" - Only used if multiple programs are specified. If true, - then the number of threads per cpu is same as the - number of programs.""") parser.add_argument( "--elastic-trace-en", action="store_true", help="""Enable capture of data dependency and instruction diff --git a/configs/common/xiangshan.py b/configs/common/xiangshan.py index ca5362d449..3e2c0fa4a2 100644 --- a/configs/common/xiangshan.py +++ b/configs/common/xiangshan.py @@ -439,7 +439,12 @@ def _finish_xiangshan_system(args, test_sys, TestCPUClass, ruby): test_sys.cpu = [TestCPUClass(clk_domain=test_sys.cpu_clk_domain, cpu_id=i) for i in range(np)] # Configure MMU for trace-aware FS mode + if args.smt: + test_sys.multi_thread = True + for cpu in test_sys.cpu: + if args.smt: + cpu.numThreads = 2 cpu.mmu.pma_checker = PMAChecker( uncacheable=[AddrRange(0, size=0x80000000)]) cpu.mmu.functional = args.functional_tlb diff --git a/src/cpu/o3/comm.hh b/src/cpu/o3/comm.hh index f15257426f..ade70ed5e3 100644 --- a/src/cpu/o3/comm.hh +++ b/src/cpu/o3/comm.hh @@ -187,14 +187,23 @@ struct SquashVersion return (version + 1) % versionLimit; } bool largerThan(uint8_t other) const { - bool larger = version > other && version - other <= maxInflightSquash; - bool wrapped_larger = - version + versionLimit > other && - version + versionLimit - other <= maxInflightSquash; - if (!(larger || wrapped_larger || (version == other))) { + const uint8_t distance = (version + versionLimit - other) % versionLimit; + if (distance == 0) { + return false; + } + + if (distance <= maxInflightSquash) { + return true; + } + + if (versionLimit - distance <= 
maxInflightSquash) { + return false; + } + + if (version != other) { panic("SquashVersion: %d, other: %d\n", version, other); } - return larger || wrapped_larger; + return false; } void update(uint8_t v) { version = v; @@ -205,6 +214,7 @@ struct SquashVersion struct ResolveQueueEntry { + ThreadID resolvedTid; uint64_t resolvedFTQId; std::vector resolvedInstPC; }; diff --git a/src/cpu/o3/commit.cc b/src/cpu/o3/commit.cc index e7036301b4..f0040c0d4f 100644 --- a/src/cpu/o3/commit.cc +++ b/src/cpu/o3/commit.cc @@ -1997,10 +1997,10 @@ Commit::squashInflightAndUpdateVersion(ThreadID tid) fixedbuffer[tid].clear(); - localSquashVer.update(localSquashVer.nextVersion()); - toIEW->commitInfo[tid].squashVersion = localSquashVer; + localSquashVer[tid].update(localSquashVer[tid].nextVersion()); + toIEW->commitInfo[tid].squashVersion = localSquashVer[tid]; DPRINTF(Commit, "Updating squash version to %u\n", - localSquashVer.getVersion()); + localSquashVer[tid].getVersion()); } void @@ -2021,7 +2021,9 @@ Commit::markCompletedInsts() fromIEW->insts[inst_num]->setCanCommit(); auto &inst = fromIEW->insts[inst_num]; - panic_if(!rob->findInst(0, inst->seqNum), "[sn:%llu] Committed instruction not found in ROB", + panic_if(!rob->findInst(inst->threadNumber, inst->seqNum), + "[tid:%i] [sn:%llu] Committed instruction not found in ROB", + inst->threadNumber, inst->seqNum); } } diff --git a/src/cpu/o3/commit.hh b/src/cpu/o3/commit.hh index 4cb184af98..465732ea0e 100644 --- a/src/cpu/o3/commit.hh +++ b/src/cpu/o3/commit.hh @@ -431,7 +431,7 @@ class Commit /** Wire to read information from rename queue. */ TimeBuffer::wire fromRename; - SquashVersion localSquashVer; + SquashVersion localSquashVer[MaxThreads]; public: /** ROB interface. 
*/ diff --git a/src/cpu/o3/cpu.cc b/src/cpu/o3/cpu.cc index 502f02c25e..162a3ad341 100644 --- a/src/cpu/o3/cpu.cc +++ b/src/cpu/o3/cpu.cc @@ -135,13 +135,6 @@ CPU::CPU(const BaseO3CPUParams ¶ms) cpuStats(this), valuePred(params.valuePred) { - fatal_if(FullSystem && params.numThreads > 1, - "SMT is not supported in O3 in full system mode currently."); - - fatal_if(!FullSystem && params.numThreads < params.workload.size(), - "More workload items (%d) than threads (%d) on CPU %s.", - params.workload.size(), params.numThreads, name()); - if (!params.switched_out) { _status = Running; } else { @@ -206,7 +199,10 @@ CPU::CPU(const BaseO3CPUParams ¶ms) ThreadID active_threads; if (FullSystem) { - active_threads = 1; + // FS-SMT still uses one shared workload/system image, but the O3 core + // must provision per-thread architectural state for every hardware + // thread context exposed by the CPU. + active_threads = numThreads; } else { active_threads = params.workload.size(); @@ -283,9 +279,7 @@ CPU::CPU(const BaseO3CPUParams ¶ms) for (ThreadID tid = 0; tid < numThreads; ++tid) { if (FullSystem) { - // SMT is not supported in FS mode yet. 
- assert(numThreads == 1); - thread[tid] = new ThreadState(this, 0, NULL); + thread[tid] = new ThreadState(this, tid, NULL); } else { if (tid < params.workload.size()) { DPRINTF(O3CPU, "Workload[%i] process is %#x", tid, diff --git a/src/cpu/o3/decode.cc b/src/cpu/o3/decode.cc index ecb274b152..d76d0fbc60 100644 --- a/src/cpu/o3/decode.cc +++ b/src/cpu/o3/decode.cc @@ -401,7 +401,7 @@ Decode::moveInstsToBuffer() for (int i = 0; i < insts_from_fetch; ++i) { const DynInstPtr &inst = stallBuffer.front(); assert(tid == inst->threadNumber); - if (localSquashVer.largerThan(inst->getVersion())) { + if (localSquashVer[tid].largerThan(inst->getVersion())) { inst->setSquashed(); } assert(!fixedbuffer[inst->threadNumber].full()); @@ -419,9 +419,10 @@ Decode::checkSquash() DPRINTF(Decode, "[tid:%i] Squashing instructions due to squash " "from commit.\n", i); squash(i); - localSquashVer.update(fromCommit->commitInfo[i].squashVersion.getVersion()); + localSquashVer[i].update( + fromCommit->commitInfo[i].squashVersion.getVersion()); DPRINTF(Decode, "Updating squash version to %u\n", - localSquashVer.getVersion()); + localSquashVer[i].getVersion()); } } } diff --git a/src/cpu/o3/decode.hh b/src/cpu/o3/decode.hh index a510d8dd9d..0f55d838b4 100644 --- a/src/cpu/o3/decode.hh +++ b/src/cpu/o3/decode.hh @@ -293,7 +293,7 @@ class Decode void setAllStalls(StallReason decodeStall); - SquashVersion localSquashVer; + SquashVersion localSquashVer[MaxThreads]; }; } // namespace o3 diff --git a/src/cpu/o3/fetch.cc b/src/cpu/o3/fetch.cc index 3c00c5937d..fba856f813 100644 --- a/src/cpu/o3/fetch.cc +++ b/src/cpu/o3/fetch.cc @@ -656,8 +656,8 @@ Fetch::processCacheCompletion(PacketPtr pkt) } // Verify fetchBufferPC alignment with the supplying FSQ entry. 
- if (threads[tid].valid && dbpbtb->ftqHasFetching(0)) { - const auto &stream = dbpbtb->ftqFetchingTarget(0); + if (threads[tid].valid && dbpbtb->ftqHasFetching(tid)) { + const auto &stream = dbpbtb->ftqFetchingTarget(tid); if (threads[tid].startPC != stream.startPC) { panic("fetchBufferPC %#x should be aligned with FSQ startPC %#x", threads[tid].startPC, stream.startPC); @@ -793,7 +793,7 @@ Fetch::lookupAndUpdateNextPC(const DynInstPtr &inst, PCStateBase &next_pc) // Decoupled+BTB-only: compute next PC directly from the supplying FSQ entry. ThreadID tid = inst->threadNumber; assert(dbpbtb); - assert(dbpbtb->ftqHasFetching(0)); + assert(dbpbtb->ftqHasFetching(tid)); const auto &stream = dbpbtb->ftqFetchingTarget(tid); const Addr curr_pc = next_pc.instAddr(); @@ -1002,7 +1002,7 @@ Fetch::handleTranslationFault(ThreadID tid, const RequestPtr &mem_req, const Fau // We will use a nop in order to carry the fault. DynInstPtr instruction = buildInst(tid, nopStaticInstPtr, nullptr, fetch_pc, fetch_pc, false); - instruction->setVersion(localSquashVer); + instruction->setVersion(localSquashVer[tid]); instruction->setNotAnInst(); instruction->setPredTarg(fetch_pc); @@ -1522,35 +1522,42 @@ Fetch::handleIEWSignals() return; } - auto &incoming = fromIEW->iewInfo->resolvedCFIs; const bool had_pending_resolve = !resolveQueue.empty(); - uint8_t enqueueSize = fromIEW->iewInfo->resolvedCFIs.size(); uint8_t enqueueCount = 0; + uint8_t enqueueSize = 0; + + for (ThreadID tid = 0; tid < numThreads; ++tid) { + enqueueSize += fromIEW->iewInfo[tid].resolvedCFIs.size(); + } if (resolveQueueSize && resolveQueue.size() > resolveQueueSize - 4) { fetchStats.resolveQueueFullEvents++; fetchStats.resolveEnqueueFailEvent += enqueueSize; } else { + for (ThreadID tid = 0; tid < numThreads; ++tid) { + auto &incoming = fromIEW->iewInfo[tid].resolvedCFIs; + for (const auto &resolved : incoming) { + bool merged = false; + for (auto &queued : resolveQueue) { + if (queued.resolvedTid == tid && + 
queued.resolvedFTQId == resolved.ftqId) { + queued.resolvedInstPC.push_back(resolved.pc); + merged = true; + break; + } + } - for (const auto &resolved : incoming) { - bool merged = false; - for (auto &queued : resolveQueue) { - if (queued.resolvedFTQId == resolved.ftqId) { - queued.resolvedInstPC.push_back(resolved.pc); - merged = true; - break; + if (merged) { + continue; } - } - if (merged) { - continue; + ResolveQueueEntry new_entry; + new_entry.resolvedTid = tid; + new_entry.resolvedFTQId = resolved.ftqId; + new_entry.resolvedInstPC.push_back(resolved.pc); + resolveQueue.push_back(std::move(new_entry)); + enqueueCount++; } - - ResolveQueueEntry new_entry; - new_entry.resolvedFTQId = resolved.ftqId; - new_entry.resolvedInstPC.push_back(resolved.pc); - resolveQueue.push_back(std::move(new_entry)); - enqueueCount++; } fetchStats.resolveEnqueueCount.sample(enqueueCount); } @@ -1562,12 +1569,13 @@ Fetch::handleIEWSignals() // and fetch consuming them as predictor resolved updates. if (had_pending_resolve && !resolveQueue.empty()) { auto &entry = resolveQueue.front(); + ThreadID tid = entry.resolvedTid; unsigned int stream_id = entry.resolvedFTQId; - dbpbtb->prepareResolveUpdateEntries(stream_id, 0); + dbpbtb->prepareResolveUpdateEntries(stream_id, tid); for (const auto resolvedInstPC : entry.resolvedInstPC) { - dbpbtb->markCFIResolved(stream_id, resolvedInstPC, 0); + dbpbtb->markCFIResolved(stream_id, resolvedInstPC, tid); } - bool success = dbpbtb->resolveUpdate(stream_id, 0); + bool success = dbpbtb->resolveUpdate(stream_id, tid); if (success) { dbpbtb->notifyResolveSuccess(); resolveQueue.pop_front(); @@ -1612,8 +1620,10 @@ Fetch::handleCommitSignals(ThreadID tid) squash(*fromCommit->commitInfo[tid].pc, squash_seq, squash_inst, tid); - localSquashVer.update(fromCommit->commitInfo[tid].squashVersion.getVersion()); - DPRINTF(Fetch, "Updating squash version to %u\n", localSquashVer.getVersion()); + localSquashVer[tid].update( + 
fromCommit->commitInfo[tid].squashVersion.getVersion()); + DPRINTF(Fetch, "Updating squash version to %u\n", + localSquashVer[tid].getVersion()); auto mispred_inst = fromCommit->commitInfo[tid].mispredictInst; @@ -1924,7 +1934,7 @@ Fetch::processSingleInstruction(ThreadID tid, PCStateBase &pc, tid, waitForVsetvl); } - instruction->setVersion(localSquashVer); + instruction->setVersion(localSquashVer[tid]); ppFetch->notify(instruction); numInst++; diff --git a/src/cpu/o3/fetch.hh b/src/cpu/o3/fetch.hh index 6e114487cf..1d8c3e88aa 100644 --- a/src/cpu/o3/fetch.hh +++ b/src/cpu/o3/fetch.hh @@ -1126,7 +1126,7 @@ class Fetch statistics::Scalar traceMetaCleanupCommitCalls; } fetchStats; - SquashVersion localSquashVer; + SquashVersion localSquashVer[MaxThreads]; public: const FetchStatGroup &getFetchStats() { return fetchStats; } diff --git a/src/cpu/o3/iew.cc b/src/cpu/o3/iew.cc index d9c815b86c..7ea9c872ba 100644 --- a/src/cpu/o3/iew.cc +++ b/src/cpu/o3/iew.cc @@ -822,8 +822,10 @@ IEW::checkSquash() for (int i = 0; i < numThreads; i++) { if (fromCommit->commitInfo[i].squash) { squash(i); - localSquashVer.update(fromCommit->commitInfo[i].squashVersion.getVersion()); - DPRINTF(IEW, "Updating squash version to %u\n", localSquashVer.getVersion()); + localSquashVer[i].update( + fromCommit->commitInfo[i].squashVersion.getVersion()); + DPRINTF(IEW, "Updating squash version to %u\n", + localSquashVer[i].getVersion()); fetchRedirect[i] = false; iewStats.stallEvents[ROBWalk]++; @@ -854,7 +856,7 @@ IEW::moveInstsToBuffer() for (int i = 0; i < insts_from_rename; ++i) { const DynInstPtr &inst = fromRename->insts[i]; assert(inst->threadNumber == tid); - if (localSquashVer.largerThan(inst->getVersion())) { + if (localSquashVer[tid].largerThan(inst->getVersion())) { inst->setSquashed(); } else { fixedbuffer[tid].push_back(inst); @@ -958,9 +960,9 @@ IEW::dispatchInsts() toRename->iewInfo[tid].robHeadStallReason = checkDispatchStall(tid, NumDQ, nullptr, -1); 
toRename->iewInfo[tid].lqHeadStallReason = - ldstQueue.lqEmpty() ? StallReason::NoStall : checkLSQStall(tid, true); + ldstQueue.lqEmpty(tid) ? StallReason::NoStall : checkLSQStall(tid, true); toRename->iewInfo[tid].sqHeadStallReason = - ldstQueue.sqEmpty() ? StallReason::NoStall : checkLSQStall(tid, false); + ldstQueue.sqEmpty(tid) ? StallReason::NoStall : checkLSQStall(tid, false); toRename->iewInfo[tid].blockReason = blockReason; } } diff --git a/src/cpu/o3/iew.hh b/src/cpu/o3/iew.hh index fc357dfb28..e23d0fb490 100644 --- a/src/cpu/o3/iew.hh +++ b/src/cpu/o3/iew.hh @@ -405,7 +405,7 @@ class IEW /** Scoreboard pointer. */ Scoreboard* scoreboard; - SquashVersion localSquashVer{0}; + SquashVersion localSquashVer[MaxThreads]; /** Value predictor */ valuepred::VPUnit *valuePred; diff --git a/src/cpu/o3/inst_queue.cc b/src/cpu/o3/inst_queue.cc index 29573959cf..72f99bbb8e 100644 --- a/src/cpu/o3/inst_queue.cc +++ b/src/cpu/o3/inst_queue.cc @@ -758,7 +758,7 @@ InstructionQueue::commit(const InstSeqNum &inst, ThreadID tid) { DPRINTF(IQ, "[tid:%i] Committing instructions older than [sn:%llu]\n", tid,inst); - scheduler->doCommit(inst); + scheduler->doCommit(inst, tid); } int diff --git a/src/cpu/o3/issue_queue.cc b/src/cpu/o3/issue_queue.cc index 50fa7a5eb5..f2d09e17de 100644 --- a/src/cpu/o3/issue_queue.cc +++ b/src/cpu/o3/issue_queue.cc @@ -834,11 +834,16 @@ IssueQue::insertNonSpec(const DynInstPtr& inst) } void -IssueQue::doCommit(const InstSeqNum seqNum) +IssueQue::doCommit(const InstSeqNum seqNum, ThreadID tid) { - while (!instList.empty() && instList.front()->seqNum <= seqNum) { - assert(instList.front()->isIssued()); - instList.pop_front(); + for (auto it = instList.begin(); it != instList.end();) { + const auto &inst = *it; + if (inst->threadNumber == tid && inst->seqNum <= seqNum) { + assert(inst->isIssued()); + it = instList.erase(it); + } else { + ++it; + } } } @@ -1678,10 +1683,10 @@ Scheduler::isDrained() } void -Scheduler::doCommit(const InstSeqNum seqNum) 
+Scheduler::doCommit(const InstSeqNum seqNum, ThreadID tid) { for (auto it : issueQues) { - it->doCommit(seqNum); + it->doCommit(seqNum, tid); } } diff --git a/src/cpu/o3/issue_queue.hh b/src/cpu/o3/issue_queue.hh index b1ab4f361a..a91da979db 100644 --- a/src/cpu/o3/issue_queue.hh +++ b/src/cpu/o3/issue_queue.hh @@ -253,7 +253,7 @@ class IssueQue : public SimObject void retryMem(const DynInstPtr& inst); bool idle(); - void doCommit(const InstSeqNum inst); + void doCommit(const InstSeqNum inst, ThreadID tid); void doSquash(SquashInfo squashInfo); int getIssueStages() { return scheduleToExecDelay; } @@ -395,7 +395,7 @@ class Scheduler : public SimObject uint32_t getCorrectedOpLat(const DynInstPtr& inst); bool hasReadyInsts(); bool isDrained(); - void doCommit(const InstSeqNum seqNum); + void doCommit(const InstSeqNum seqNum, ThreadID tid); void doSquash(SquashInfo squashInfo); uint32_t getIQInsts(); uint32_t getIQInsts(ThreadID tid); diff --git a/src/cpu/o3/lsq.cc b/src/cpu/o3/lsq.cc index e7685c6a0b..0f3b005a8f 100644 --- a/src/cpu/o3/lsq.cc +++ b/src/cpu/o3/lsq.cc @@ -1393,6 +1393,12 @@ LSQ::lqEmpty() const return true; } +bool +LSQ::lqEmpty(ThreadID tid) const +{ + return thread[tid].lqEmpty(); +} + bool LSQ::sqEmpty() const { @@ -1409,6 +1415,12 @@ LSQ::sqEmpty() const return true; } +bool +LSQ::sqEmpty(ThreadID tid) const +{ + return thread[tid].sqEmpty(); +} + bool LSQ::lqFull() { diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh index e83794b6b3..504b4d4561 100644 --- a/src/cpu/o3/lsq.hh +++ b/src/cpu/o3/lsq.hh @@ -938,8 +938,12 @@ class LSQ bool isEmpty() const; /** Returns if all of the LQs are empty. */ bool lqEmpty() const; + /** Returns if the LQ of a given thread is empty. */ + bool lqEmpty(ThreadID tid) const; /** Returns if all of the SQs are empty. */ bool sqEmpty() const; + /** Returns if the SQ of a given thread is empty. */ + bool sqEmpty(ThreadID tid) const; /** Returns if any of the LQs are full. 
*/ bool lqFull(); diff --git a/src/cpu/o3/rename.cc b/src/cpu/o3/rename.cc index 0cdc70f935..0b9a1b47a8 100644 --- a/src/cpu/o3/rename.cc +++ b/src/cpu/o3/rename.cc @@ -601,7 +601,7 @@ Rename::moveInstsToBuffer() for (int i = 0; i < insts_from_decode; ++i) { const DynInstPtr &inst = fromDecode->insts[i]; assert(inst->threadNumber == tid); - if (localSquashVer.largerThan(inst->getVersion())) { + if (localSquashVer[tid].largerThan(inst->getVersion())) { inst->setSquashed(); } else { assert(!fixedbuffer[tid].full()); @@ -626,9 +626,10 @@ Rename::checkSquash() squash(fromCommit->commitInfo[i].doneSeqNum, i); - localSquashVer.update(fromCommit->commitInfo[i].squashVersion.getVersion()); + localSquashVer[i].update( + fromCommit->commitInfo[i].squashVersion.getVersion()); DPRINTF(Rename, "Updating squash version to %u\n", - localSquashVer.getVersion()); + localSquashVer[i].getVersion()); } } } diff --git a/src/cpu/o3/rename.hh b/src/cpu/o3/rename.hh index 26769e0b5f..4e83cc0919 100644 --- a/src/cpu/o3/rename.hh +++ b/src/cpu/o3/rename.hh @@ -451,7 +451,7 @@ class Rename StallReason checkRenameStallFromIEW(ThreadID tid); - SquashVersion localSquashVer; + SquashVersion localSquashVer[MaxThreads]; /** Value predictor */ valuepred::VPUnit *valuePred; diff --git a/src/cpu/pred/btb/decoupled_bpred.cc b/src/cpu/pred/btb/decoupled_bpred.cc index a1fee43d87..8ed265af90 100644 --- a/src/cpu/pred/btb/decoupled_bpred.cc +++ b/src/cpu/pred/btb/decoupled_bpred.cc @@ -7,6 +7,7 @@ #include "cpu/o3/cpu.hh" #include "cpu/o3/dyn_inst.hh" #include "cpu/pred/btb/folded_hist.hh" +#include "cpu/thread_context.hh" #include "debug/BTB.hh" #include "debug/DecoupleBPHist.hh" #include "debug/DecoupleBPVerbose.hh" @@ -45,7 +46,7 @@ DecoupledBPUWithBTB::DecoupledBPUWithBTB(const DecoupledBPUWithBTBParams &p) // uras(p.uras), bpDBSwitches(p.bpDBSwitches), numStages(p.numStages), - ftq(2, p.ftq_size), + ftq(p.numThreads, p.ftq_size), historyManager(16), // TODO: fix this 
resolveBlockThreshold(p.resolveBlockThreshold), dbpBtbStats(this, p.numStages, p.fsq_size, maxInstsNum) @@ -115,6 +116,26 @@ DecoupledBPUWithBTB::DecoupledBPUWithBTB(const DecoupledBPUWithBTBParams &p) }); } +ThreadID +DecoupledBPUWithBTB::scheduleThread() +{ + for (ThreadID offset = 0; offset < numThreads; ++offset) { + const ThreadID tid = (nextPredictTid + offset) % numThreads; + + if (cpu) { + auto *tc = cpu->getContext(tid); + if (!tc || tc->status() != gem5::ThreadContext::Active) { + continue; + } + } + + nextPredictTid = (tid + 1) % numThreads; + return tid; + } + + return InvalidThreadID; +} + void DecoupledBPUWithBTB::tick() @@ -122,6 +143,9 @@ DecoupledBPUWithBTB::tick() DPRINTF(Override, "DecoupledBPUWithBTB::tick()\n"); ThreadID curTid = scheduleThread(); + if (curTid == InvalidThreadID) { + return; + } // On squash, reset state if there was a valid prediction. bool squashOccurred = false; @@ -428,8 +452,13 @@ DecoupledBPUWithBTB::handleSquash(ThreadID tid, unsigned target_id, // Find the target being squashed if (!ftq.hasTarget(target_id, tid)) { - assert(!ftq.empty(tid)); - DPRINTF(DecoupleBP, "The squashing target is insane, ignore squash on it"); + DPRINTF(DecoupleBP, + "Ignore squash for tid %u on missing FTQ target %u; " + "recovering predictor state from redirect PC %#lx\n", + tid, target_id, redirect_pc); + clearPreds(tid); + threads[tid].validprediction = false; + threads[tid].s0PC = redirect_pc; return; } diff --git a/src/cpu/pred/btb/decoupled_bpred.hh b/src/cpu/pred/btb/decoupled_bpred.hh index 288450001f..134258f77c 100644 --- a/src/cpu/pred/btb/decoupled_bpred.hh +++ b/src/cpu/pred/btb/decoupled_bpred.hh @@ -75,8 +75,7 @@ class DecoupledBPUWithBTB : public BPredUnit // FetchTargetId fetchHeadFtqId{1}; // next FSQ id to be consumed by fetch CPU *cpu; - - const int numThreads = 2; + ThreadID nextPredictTid = 0; unsigned predictWidth; // max predict width, default 64 unsigned maxInstsNum; @@ -145,7 +144,7 @@ class DecoupledBPUWithBTB : 
public BPredUnit unsigned resolveDequeueFailCounter{0}; const unsigned resolveBlockThreshold; - ThreadID scheduleThread() { return 0; } + ThreadID scheduleThread(); void processNewPrediction(ThreadID tid); From c96339acd38d48ac851c6ee800bec9d1c32b5807 Mon Sep 17 00:00:00 2001 From: tastynoob <934348725@qq.com> Date: Wed, 18 Mar 2026 14:28:24 +0800 Subject: [PATCH 03/38] cpu-o3: support shared-address-space fs smt --- .gitignore | 8 +- configs/common/xiangshan.py | 12 +- src/cpu/base.cc | 123 +++++++++++------- src/cpu/base.hh | 17 +-- src/cpu/difftest.cc | 23 ++-- src/cpu/difftest.hh | 7 ++ src/cpu/o3/commit.cc | 3 +- src/cpu/o3/cpu.cc | 9 +- src/cpu/o3/cpu.hh | 2 +- src/cpu/o3/decode.cc | 83 +++++++++--- src/cpu/o3/decode.hh | 3 + src/cpu/o3/fetch.cc | 90 +++++++------ src/cpu/o3/fetch.hh | 8 +- src/cpu/o3/inst_queue.cc | 1 + src/cpu/o3/lsq.hh | 1 - src/cpu/o3/lsq_unit.cc | 3 +- src/cpu/o3/rename.cc | 32 +++-- src/cpu/o3/rename.hh | 4 +- src/cpu/pred/btb/abtb.cc | 3 +- src/cpu/pred/btb/abtb.hh | 2 +- src/cpu/pred/btb/btb_ittage.cc | 107 ++++++++++++---- src/cpu/pred/btb/btb_ittage.hh | 28 +++-- src/cpu/pred/btb/btb_mgsc.cc | 187 ++++++++++++++++++++-------- src/cpu/pred/btb/btb_mgsc.hh | 39 ++++-- src/cpu/pred/btb/btb_tage.cc | 117 +++++++++++------ src/cpu/pred/btb/btb_tage.hh | 37 +++--- src/cpu/pred/btb/btb_tage_ub.cc | 3 +- src/cpu/pred/btb/btb_tage_ub.hh | 2 +- src/cpu/pred/btb/btb_ubtb.hh | 2 +- src/cpu/pred/btb/decoupled_bpred.cc | 69 ++++++---- src/cpu/pred/btb/decoupled_bpred.hh | 13 +- src/cpu/pred/btb/mbtb.cc | 3 +- src/cpu/pred/btb/mbtb.hh | 2 +- src/cpu/pred/btb/microtage.cc | 154 ++++++++++++++++------- src/cpu/pred/btb/microtage.hh | 37 +++--- src/cpu/pred/btb/ras.cc | 3 +- src/cpu/pred/btb/ras.hh | 2 +- src/cpu/pred/btb/timed_base_pred.hh | 5 +- src/cpu/pred/btb/uras.cc | 3 +- src/cpu/pred/btb/uras.hh | 4 +- src/cpu/simple/base.cc | 7 +- src/cpu/simple/base.hh | 2 +- src/sim/system.cc | 4 +- src/sim/system.hh | 5 + 44 files changed, 860 
insertions(+), 409 deletions(-) diff --git a/.gitignore b/.gitignore index 0b3dca3746..6a03374ea1 100644 --- a/.gitignore +++ b/.gitignore @@ -70,4 +70,10 @@ package.json microbench/build/ microbench/output/ -microbench/dramsim3* \ No newline at end of file +microbench/dramsim3* + +*.bin +*.db +*.log +*.gz +*.zstd \ No newline at end of file diff --git a/configs/common/xiangshan.py b/configs/common/xiangshan.py index 3e2c0fa4a2..368f6cd884 100644 --- a/configs/common/xiangshan.py +++ b/configs/common/xiangshan.py @@ -290,7 +290,7 @@ def resolve_xiangshan_ref_so(args: argparse.Namespace): if args.difftest_ref_so is not None: ref_so = args.difftest_ref_so print("Obtained ref_so from args.difftest_ref_so: ", ref_so) - elif args.num_cpus > 1 and "GCBV_MULTI_CORE_REF_SO" in os.environ: + elif (args.num_cpus > 1 or args.smt) and "GCBV_MULTI_CORE_REF_SO" in os.environ: ref_so = os.environ["GCBV_MULTI_CORE_REF_SO"] print("Obtained ref_so from GCBV_MULTI_CORE_REF_SO: ", ref_so) elif "GCBV_REF_SO" in os.environ: @@ -330,12 +330,12 @@ def config_xiangshan_inputs(args: argparse.Namespace, sys): if args.raw_cpt: # If using raw binary, no restorer is needed. 
gcpt_restorer = None - elif args.num_cpus > 1: + elif args.num_cpus > 1 or args.smt: if "GCB_MULTI_CORE_RESTORER" in os.environ: gcpt_restorer = os.environ["GCB_MULTI_CORE_RESTORER"] print("Obtained gcpt_restorer from GCB_MULTI_CORE_RESTORER: ", gcpt_restorer) else: - fatal("Plz set $GCB_MULTI_CORE_RESTORER when model Xiangshan with multi-core") + fatal("Plz set $GCB_MULTI_CORE_RESTORER when model Xiangshan with multi-context difftest") elif args.restore_rvv_cpt: if "GCBV_RESTORER" in os.environ: gcpt_restorer = os.environ["GCBV_RESTORER"] @@ -355,8 +355,8 @@ def config_xiangshan_inputs(args: argparse.Namespace, sys): print("Obtained gcpt_restorer from args.gcpt_restorer: ", args.gcpt_restorer) gcpt_restorer = args.gcpt_restorer - if args.num_cpus > 1: - print("Simulating a multi-core system, demanding a larger GCPT restorer size (2M).") + if args.num_cpus > 1 or args.smt: + print("Simulating a multi-context system, demanding a larger GCPT restorer size (2M).") sys.gcpt_restorer_size_limit = 2**20 elif args.restore_rvv_cpt: print("Simulating single core with RVV, demanding GCPT restorer size of 0x1000.") @@ -403,7 +403,7 @@ def config_difftest(cpu_list, args, sys): if not args.enable_difftest: return else: - if len(cpu_list) > 1: + if len(cpu_list) > 1 or args.smt: sys.enable_mem_dedup = True for cpu in cpu_list: cpu.enable_mem_dedup = True diff --git a/src/cpu/base.cc b/src/cpu/base.cc index 63c0e7964a..83a2a27686 100644 --- a/src/cpu/base.cc +++ b/src/cpu/base.cc @@ -43,6 +43,7 @@ #include "cpu/base.hh" +#include #include #include #include @@ -208,40 +209,50 @@ BaseCPU::BaseCPU(const Params &p, bool is_checker) "of threads (%i).\n", params().isa.size(), numThreads); } - diffAllStates = std::make_shared(); + diffAllStates.resize(numThreads); if (enableDifftest) { assert(params().difftest_ref_so.length() > 2); - diffAllStates->diff.nemu_reg = &(diffAllStates->referenceRegFile); - diffAllStates->diff.nemu_this_pc = 0x80000000u; - diffAllStates->diff.cpu_id = 
params().cpu_id; - warn("cpu_id set to %d\n", params().cpu_id); - - if (params().difftest_ref_so.find("spike") != std::string::npos) { - assert(!system->multiCore()); - diffAllStates->proxy = new SpikeProxy( - params().cpu_id, params().difftest_ref_so.c_str(), - params().nemuSDimg.size() && params().nemuSDCptBin.size()); - } else { - diffAllStates->proxy = - new NemuProxy(params().cpu_id, params().difftest_ref_so.c_str(), - params().nemuSDimg.size() && params().nemuSDCptBin.size(), system->enabledMemDedup(), - system->multiCore()); - } + for (ThreadID tid = 0; tid < numThreads; ++tid) { + diffAllStates[tid] = std::make_shared(); + auto diff_state = diffAllStates[tid]; + diff_state->diff.nemu_reg = &(diff_state->referenceRegFile); + diff_state->diff.nemu_this_pc = 0x80000000u; + diff_state->diff.cpu_id = difftestHartId(tid); + warn("difftest hart id set to %d for tid %d\n", + diff_state->diff.cpu_id, tid); + + if (params().difftest_ref_so.find("spike") != std::string::npos) { + assert(!system->multiContextDifftest()); + diff_state->proxy = new SpikeProxy( + params().cpu_id, params().difftest_ref_so.c_str(), + params().nemuSDimg.size() && params().nemuSDCptBin.size()); + } else { + diff_state->proxy = + new NemuProxy(params().cpu_id, params().difftest_ref_so.c_str(), + params().nemuSDimg.size() && params().nemuSDCptBin.size(), + system->enabledMemDedup(), + system->multiContextDifftest()); + } - warn("Difftest is enabled with ref so: %s.\n", params().difftest_ref_so.c_str()); + warn("Difftest is enabled with ref so: %s.\n", + params().difftest_ref_so.c_str()); - diffAllStates->proxy->regcpy(&(diffAllStates->gem5RegFile), REF_TO_DUT); - diffAllStates->diff.dynamic_config.ignore_illegal_mem_access = false; - diffAllStates->diff.dynamic_config.debug_difftest = false; - diffAllStates->proxy->update_config(&diffAllStates->diff.dynamic_config); - if (params().nemuSDimg.size() && params().nemuSDCptBin.size()) { - diffAllStates->proxy->sdcard_init(params().nemuSDimg.c_str(), 
- params().nemuSDCptBin.c_str()); + diff_state->proxy->regcpy(&(diff_state->gem5RegFile), REF_TO_DUT); + diff_state->diff.dynamic_config.ignore_illegal_mem_access = false; + diff_state->diff.dynamic_config.debug_difftest = false; + diff_state->proxy->update_config(&diff_state->diff.dynamic_config); + if (params().nemuSDimg.size() && params().nemuSDCptBin.size()) { + diff_state->proxy->sdcard_init(params().nemuSDimg.c_str(), + params().nemuSDCptBin.c_str()); + } + diff_state->diff.will_handle_intr = false; } - diffAllStates->diff.will_handle_intr = false; } else { warn("Difftest is disabled\n"); - diffAllStates->hasCommit = true; + for (ThreadID tid = 0; tid < numThreads; ++tid) { + diffAllStates[tid] = std::make_shared(); + diffAllStates[tid]->hasCommit = true; + } } if (dumpCommitFlag) { @@ -404,11 +415,14 @@ BaseCPU::startup() if (powerState->get() == enums::PwrState::UNDEFINED) powerState->set(enums::PwrState::ON); - if (system->multiCore()) { + if (system->multiContextDifftest()) { goldenMemPtr = system->getGoldenMemPtr(); _goldenMemManager = system->getGoldenMemManager(); - diffAllStates->proxy->initState(params().cpu_id, goldenMemPtr); + for (ThreadID tid = 0; tid < numThreads; ++tid) { + diffAllStates[tid]->proxy->initState(difftestHartId(tid), + goldenMemPtr); + } } else { goldenMemPtr = nullptr; _goldenMemManager = nullptr; @@ -702,7 +716,7 @@ BaseCPU::takeOverFrom(BaseCPU *oldCPU) if (enable_diff) { warn("Take over difftest state to new CPU\n"); enableDifftest = enable_diff; - takeOverDiffAllStates(diff_all); + takeOverDiffAllStates(std::move(diff_all)); } } @@ -865,6 +879,12 @@ BaseCPU::GlobalStats::GlobalStats(statistics::Group *parent) hostOpRate = simOps / hostSeconds; } +int +BaseCPU::difftestHartId(ThreadID tid) const +{ + return params().cpu_id * numThreads + tid; +} + void BaseCPU::csrDiffMessage(uint64_t gem5_val, uint64_t ref_val, int error_num, uint64_t &error_reg, InstSeqNum seq, std::string error_csr_name, int &diff_at) @@ -883,6 +903,8 @@ 
BaseCPU::csrDiffMessage(uint64_t gem5_val, uint64_t ref_val, int error_num, uint std::pair BaseCPU::diffWithNEMU(ThreadID tid, InstSeqNum seq) { + auto diffAllStates = this->diffAllStates[tid]; + int diff_at = DiffAt::NoneDiff; bool npc_match = false; bool is_mmio = diffInfo.curInstStrictOrdered; @@ -966,7 +988,7 @@ BaseCPU::diffWithNEMU(ThreadID tid, InstSeqNum seq) if (enableRVV) { if (diffInfo.inst->isVector()) { - readGem5Regs(); + readGem5Regs(tid); uint64_t* nemu_val = (uint64_t*)&(diffAllStates->referenceRegFile.vr[0]); uint64_t* gem5_val = (uint64_t*)&(diffAllStates->gem5RegFile.vr[0]); bool maybe_error = false; @@ -1431,7 +1453,8 @@ BaseCPU::diffWithNEMU(ThreadID tid, InstSeqNum seq) diffInfo.physEffAddr, diffInfo.effSize); } - if (system->multiCore() && (diffInfo.inst->isLoad() || diffInfo.inst->isAtomic()) && + if (system->multiContextDifftest() && + (diffInfo.inst->isLoad() || diffInfo.inst->isAtomic()) && _goldenMemManager->inPmem(diffInfo.physEffAddr)) { warn("Difference on %s instr found in multicore mode, check in golden memory\n", diffInfo.inst->isLoad() ? 
"load" : "amo"); @@ -1517,9 +1540,10 @@ BaseCPU::clearDiffMismatch(ThreadID tid, InstSeqNum seq) { void BaseCPU::reportDiffMismatch(ThreadID tid, InstSeqNum seq) { + auto diffAllStates = this->diffAllStates[tid]; warn("%s", diffMsg.str()); diffAllStates->proxy->isa_reg_display(); - displayGem5Regs(); + displayGem5Regs(tid); warn("start dump last %lu committed msg\n", diffInfo.lastCommittedMsg.size()); while (diffInfo.lastCommittedMsg.size()) { auto &inst = diffInfo.lastCommittedMsg.front(); @@ -1531,6 +1555,8 @@ BaseCPU::reportDiffMismatch(ThreadID tid, InstSeqNum seq) void BaseCPU::difftestStep(ThreadID tid, InstSeqNum seq) { + auto diffAllStates = this->diffAllStates[tid]; + bool should_diff = false; DPRINTF(DumpCommit, "[sn:%llu] %#lx, %s\n", seq, diffInfo.pc->instAddr(), diffInfo.inst->disassemble(diffInfo.pc->instAddr())); @@ -1550,10 +1576,10 @@ BaseCPU::difftestStep(ThreadID tid, InstSeqNum seq) should_diff = true; if (!diffAllStates->hasCommit && diffInfo.pc->instAddr() == 0x80000000u) { diffAllStates->hasCommit = true; - readGem5Regs(); + readGem5Regs(tid); diffAllStates->gem5RegFile.pc = diffInfo.pc->instAddr(); if (noHypeMode) { - auto start = pmemStart + pmemSize * diffAllStates->diff.cpu_id; + auto start = pmemStart + pmemSize * difftestHartId(tid); warn("Start memcpy to NEMU from %#lx, size=%lu\n", (uint64_t)start, pmemSize); diffAllStates->proxy->memcpy(0x80000000u, start, pmemSize, DUT_TO_REF); } else if (enableMemDedup) { @@ -1603,9 +1629,10 @@ BaseCPU::difftestStep(ThreadID tid, InstSeqNum seq) } void -BaseCPU::displayGem5Regs() +BaseCPU::displayGem5Regs(ThreadID tid) { - readGem5Regs(); + auto diffAllStates = this->diffAllStates[tid]; + readGem5Regs(tid); std::string str; //reg for (size_t i = 0; i < 32; i++) @@ -1712,8 +1739,9 @@ BaseCPU::displayGem5Regs() } void -BaseCPU::difftestRaiseIntr(uint64_t no) +BaseCPU::difftestRaiseIntr(uint64_t no, ThreadID tid) { + auto diffAllStates = this->diffAllStates[tid]; diffAllStates->diff.will_handle_intr = 
true; diffAllStates->proxy->raise_intr(no); } @@ -1721,19 +1749,24 @@ BaseCPU::difftestRaiseIntr(uint64_t no) void BaseCPU::clearGuideExecInfo() { - diffAllStates->diff.guide.force_raise_exception = false; - diffAllStates->diff.guide.force_set_jump_target = false; + for (auto &diffAllStates : this->diffAllStates) { + diffAllStates->diff.guide.force_raise_exception = false; + diffAllStates->diff.guide.force_set_jump_target = false; + } } void BaseCPU::enableDiffPrint() { - diffAllStates->diff.dynamic_config.debug_difftest = true; - diffAllStates->proxy->update_config(&diffAllStates->diff.dynamic_config); + for (auto &diffAllStates : this->diffAllStates) { + diffAllStates->diff.dynamic_config.debug_difftest = true; + diffAllStates->proxy->update_config(&diffAllStates->diff.dynamic_config); + } } -void BaseCPU::setSCSuccess(bool success, paddr_t addr) +void BaseCPU::setSCSuccess(bool success, paddr_t addr, ThreadID tid) { + auto diffAllStates = this->diffAllStates[tid]; diffAllStates->diff.sync.lrscValid = success; diffAllStates->diff.sync.lrscAddr = addr; // used for spike diff } @@ -1742,6 +1775,8 @@ void BaseCPU::setExceptionGuideExecInfo(uint64_t exception_num, uint64_t mtval, uint64_t stval, bool force_set_jump_target, uint64_t jump_target, ThreadID tid) { + auto diffAllStates = this->diffAllStates[tid]; + auto &gd = diffAllStates->diff.guide; gd.force_raise_exception = true; gd.exception_num = exception_num; @@ -1769,7 +1804,7 @@ BaseCPU::setExceptionGuideExecInfo(uint64_t exception_num, uint64_t mtval, uint6 void BaseCPU::checkL1DRefill(Addr paddr, const uint8_t* refill_data, size_t size) { assert(size == 64); - if (system->multiCore()) { + if (system->multiContextDifftest()) { uint8_t *golden_ptr = (uint8_t *)_goldenMemManager->guestToHost(paddr); if (memcmp(golden_ptr, refill_data, size)) { panic("Refill data diff with Golden addr %#lx with size %d\n", paddr, size); diff --git a/src/cpu/base.hh b/src/cpu/base.hh index 8fe6d55d61..3d3e8e5a85 100644 --- 
a/src/cpu/base.hh +++ b/src/cpu/base.hh @@ -693,7 +693,7 @@ class BaseCPU : public ClockedObject bool enableRVV{false}; bool enableRVHDIFF{false}; bool enableSkipCSR{false}; - std::shared_ptr diffAllStates{}; + std::vector> diffAllStates{}; enum diffRegConfig { @@ -701,7 +701,7 @@ class BaseCPU : public ClockedObject diffCsrNum = 36, }; - virtual void readGem5Regs() + virtual void readGem5Regs(ThreadID tid) { panic("difftest:readGem5Regs() is not implemented\n"); } @@ -709,6 +709,7 @@ class BaseCPU : public ClockedObject void csrDiffMessage(uint64_t gem5_val, uint64_t ref_val, int error_num, uint64_t &error_reg, InstSeqNum seq, std::string error_csr_name,int &diff_at); std::pair diffWithNEMU(ThreadID tid, InstSeqNum seq); + int difftestHartId(ThreadID tid) const; std::stringstream diffMsg; void reportDiffMismatch(ThreadID tid, InstSeqNum seq); @@ -779,11 +780,11 @@ class BaseCPU : public ClockedObject inline bool difftestEnabled() const { return enableDifftest; } - void displayGem5Regs(); + void displayGem5Regs(ThreadID tid); - void difftestRaiseIntr(uint64_t no); + void difftestRaiseIntr(uint64_t no, ThreadID tid = 0); - void setSCSuccess(bool success, paddr_t addr); + void setSCSuccess(bool success, paddr_t addr, ThreadID tid); void setExceptionGuideExecInfo(uint64_t exception_num, uint64_t mtval, uint64_t stval, // force set jump target @@ -793,14 +794,14 @@ class BaseCPU : public ClockedObject void enableDiffPrint(); - std::pair> getDiffAllStates() + std::pair>> getDiffAllStates() { return std::make_pair(enableDifftest, diffAllStates); } - void takeOverDiffAllStates(std::shared_ptr diffAllStates) + void takeOverDiffAllStates(std::vector> diffAllStates) { - this->diffAllStates = diffAllStates; + this->diffAllStates = std::move(diffAllStates); } int committedInstNum = 0; diff --git a/src/cpu/difftest.cc b/src/cpu/difftest.cc index 7293e51b9a..63665f194b 100644 --- a/src/cpu/difftest.cc +++ b/src/cpu/difftest.cc @@ -149,6 +149,12 @@ NemuProxy::NemuProxy(int 
coreid, const char *ref_so, bool enable_sdcard_diff, bo #endif multiCore = multi_core; + if (multiCore) { + nemuSetHartId = (void (*)(int))dlsym(handle, "difftest_set_mhartid"); + assert(nemuSetHartId); + nemuPutGmaddr = (void (*)(uint8_t *))dlsym(handle, "difftest_put_gmaddr"); + assert(nemuPutGmaddr); + } if (enable_sdcard_diff) { sdcard_init = (void (*)(const char *, const char *))dlsym( @@ -168,15 +174,18 @@ void NemuProxy::initState(int coreid, uint8_t *golden_mem) { if (multiCore) { - auto nemu_difftest_set_mhartid = (void (*)(int))dlsym(handle, "difftest_set_mhartid"); warn("Setting mhartid to %d\n", coreid); - assert(nemu_difftest_set_mhartid); - nemu_difftest_set_mhartid(coreid); - - auto nemu_difftest_put_gmaddr = (void (*)(uint8_t *ptr))dlsym(handle, "difftest_put_gmaddr"); + setHartId(coreid); warn("Setting gmaddr to %#lx\n", (uint64_t) golden_mem); - assert(nemu_difftest_put_gmaddr); - nemu_difftest_put_gmaddr(golden_mem); + nemuPutGmaddr(golden_mem); + } +} + +void +NemuProxy::setHartId(int coreid) +{ + if (multiCore) { + nemuSetHartId(coreid); } } diff --git a/src/cpu/difftest.hh b/src/cpu/difftest.hh index af4eee4d96..7d91201b4f 100644 --- a/src/cpu/difftest.hh +++ b/src/cpu/difftest.hh @@ -195,6 +195,7 @@ class RefProxy void (*sdcard_init)(const char *img_path, const char *sd_cpt_bin_path) = nullptr; virtual void initState(int coreid, uint8_t *golden_mem) = 0; + virtual void setHartId(int coreid) = 0; protected: bool multiCore; @@ -208,6 +209,11 @@ class NemuProxy : public RefProxy NemuProxy(int coreid, const char *ref_so, bool enable_sdcard_diff, bool enable_mem_dedup, bool multi_core); void initState(int coreid, uint8_t *golden_mem) override; + void setHartId(int coreid) override; + + private: + void (*nemuSetHartId)(int) = nullptr; + void (*nemuPutGmaddr)(uint8_t *) = nullptr; }; @@ -217,6 +223,7 @@ class SpikeProxy : public RefProxy SpikeProxy(int coreid, const char *ref_so, bool enable_sdcard_diff); void initState(int coreid, uint8_t 
*golden_mem) override { panic("Not implemented\n"); } + void setHartId(int coreid) override { panic("Not implemented\n"); } }; #define DIFFTEST_WIDTH 8 diff --git a/src/cpu/o3/commit.cc b/src/cpu/o3/commit.cc index f0040c0d4f..ad42b0c7fe 100644 --- a/src/cpu/o3/commit.cc +++ b/src/cpu/o3/commit.cc @@ -1843,7 +1843,8 @@ Commit::commitHead(const DynInstPtr &head_inst, unsigned inst_num) if (head_inst->isStoreConditional()) { DPRINTF(Commit, "[tid:%i] [sn:%llu] Store Conditional success: %i\n", tid, head_inst->seqNum, head_inst->lockedWriteSuccess()); - cpu->setSCSuccess(head_inst->lockedWriteSuccess(), head_inst->physEffAddr); + cpu->setSCSuccess(head_inst->lockedWriteSuccess(), + head_inst->physEffAddr, tid); } // Update the commit rename map diff --git a/src/cpu/o3/cpu.cc b/src/cpu/o3/cpu.cc index 162a3ad341..5961aed7b1 100644 --- a/src/cpu/o3/cpu.cc +++ b/src/cpu/o3/cpu.cc @@ -1735,12 +1735,13 @@ CPU::htmSendAbortSignal(ThreadID tid, uint64_t htm_uid, } void -CPU::readGem5Regs() +CPU::readGem5Regs(ThreadID tid) { + auto diffAllStates = this->diffAllStates[tid]; for (int i = 0; i < 32; i++) { - diffAllStates->gem5RegFile[i] = readArchIntReg(i, 0); - diffAllStates->gem5RegFile[i + 32] = readArchFloatReg(i, 0); - readArchVecReg(i, (uint64_t*)&diffAllStates->gem5RegFile.vr[i], 0); + diffAllStates->gem5RegFile[i] = readArchIntReg(i, tid); + diffAllStates->gem5RegFile[i + 32] = readArchFloatReg(i, tid); + readArchVecReg(i, (uint64_t*)&diffAllStates->gem5RegFile.vr[i], tid); } } diff --git a/src/cpu/o3/cpu.hh b/src/cpu/o3/cpu.hh index fae5eea4d4..8ca2b276e3 100644 --- a/src/cpu/o3/cpu.hh +++ b/src/cpu/o3/cpu.hh @@ -740,7 +740,7 @@ class CPU : public BaseCPU HtmFailureFaultCause cause) override; //difftest virtual function - void readGem5Regs() override; + void readGem5Regs(ThreadID tid) override; private: /** Value predictor */ diff --git a/src/cpu/o3/decode.cc b/src/cpu/o3/decode.cc index d76d0fbc60..93ede3d673 100644 --- a/src/cpu/o3/decode.cc +++ 
b/src/cpu/o3/decode.cc @@ -72,6 +72,7 @@ Decode::Decode(CPU *_cpu, const BaseO3CPUParams ¶ms) iewToDecodeDelay(params.iewToDecodeDelay), commitToDecodeDelay(params.commitToDecodeDelay), fetchToDecodeDelay(params.fetchToDecodeDelay), + decodeToFetchDelay(params.decodeToFetchDelay), decodeWidth(params.decodeWidth), numThreads(params.numThreads), enableLoadFusion(params.enable_loadFusion), @@ -86,8 +87,15 @@ Decode::Decode(CPU *_cpu, const BaseO3CPUParams ¶ms) for (int i=0;i(decodeWidth); } - stallBuffer = boost::circular_buffer(decodeWidth * (fetchToDecodeDelay + 1)); - eachstallSize = boost::circular_buffer(fetchToDecodeDelay + 1); + // This buffer preserves the fetch->decode pipeline contents when decode + // stalls while TimeBuffer keeps advancing. Its depth matches the original + // forward pipeline window; fetch is backpressured before full to absorb + // both the decode->fetch feedback delay and the request already issued in + // the current cycle before decode computes backpressure. + const auto stallGroupDepth = fetchToDecodeDelay + 1; + stallBuffer = boost::circular_buffer( + decodeWidth * stallGroupDepth); + eachstallSize = boost::circular_buffer(stallGroupDepth); decodeStalls.resize(decodeWidth, StallReason::NoStall); @@ -373,6 +381,38 @@ Decode::updateActivate() void Decode::moveInstsToBuffer() { + auto tryMoveHeadGroupToFixedBuffer = [&]() -> bool { + if (stallBuffer.empty()) { + return false; + } + + // stallbuffer moves to fixedbuffer in strict FIFO order. 
+ ThreadID tid = stallBuffer.front()->threadNumber; + if (!fixedbuffer[tid].empty()) { + return false; + } + + int insts_from_stall = eachstallSize.front(); + eachstallSize.pop_front(); + for (int i = 0; i < insts_from_stall; ++i) { + const DynInstPtr &inst = stallBuffer.front(); + assert(tid == inst->threadNumber); + if (localSquashVer[tid].largerThan(inst->getVersion())) { + inst->setSquashed(); + } + assert(!fixedbuffer[inst->threadNumber].full()); + fixedbuffer[inst->threadNumber].push_back(inst); + stallBuffer.pop_front(); + } + + return true; + }; + + // Model one stage advance before latching the next cycle's input so a + // full stall buffer can still accept a new fetch bundle when its head + // group moves forward in the same cycle. + const bool moved_group = tryMoveHeadGroupToFixedBuffer(); + // do not support mixed thread instructions in one fetch group int insts_from_fetch = fromFetch->size; if (insts_from_fetch != 0) { @@ -392,23 +432,12 @@ Decode::moveInstsToBuffer() if (stallBuffer.empty()) { return; } - // stallbuffer move to fixedbuffer - ThreadID tid = stallBuffer.front()->threadNumber; - if (!fixedbuffer[tid].empty()) - return; - insts_from_fetch = eachstallSize.front(); - eachstallSize.pop_front(); - for (int i = 0; i < insts_from_fetch; ++i) { - const DynInstPtr &inst = stallBuffer.front(); - assert(tid == inst->threadNumber); - if (localSquashVer[tid].largerThan(inst->getVersion())) { - inst->setSquashed(); - } - assert(!fixedbuffer[inst->threadNumber].full()); - fixedbuffer[inst->threadNumber].push_back(inst); - stallBuffer.pop_front(); - } + // If nothing advanced before latching new input, allow the current head + // (possibly the just-arrived group) to fill an empty stage this cycle. 
+ if (!moved_group) { + tryMoveHeadGroupToFixedBuffer(); + } } void @@ -443,13 +472,27 @@ Decode::tick() // check threads stall & status ThreadID tid = InvalidThreadID; ThreadID blocked_tid = InvalidThreadID; + const bool fifoBackpressured = + !stallBuffer.empty() && + eachstallSize.size() + decodeToFetchDelay + 1 >= + eachstallSize.capacity(); + const ThreadID fifoHeadTid = + !stallBuffer.empty() ? stallBuffer.front()->threadNumber : InvalidThreadID; + const StallReason fifoBlockReason = + (fifoBackpressured && fifoHeadTid != InvalidThreadID && + stallSig->blockDecode[fifoHeadTid]) ? + stallSig->decodeBlockReason[fifoHeadTid] : + (fifoBackpressured ? StallReason::OtherFragStall : + StallReason::NoStall); for (int i = 0; i < numThreads; i++) { bool block = stallSig->blockDecode[i]; bool active = !block && !fixedbuffer[i].empty(); - stallSig->blockFetch[i] = block; + stallSig->blockFetch[i] = block || fifoBackpressured; stallSig->fetchBlockReason[i] = - block ? stallSig->decodeBlockReason[i] : StallReason::NoStall; + stallSig->blockFetch[i] ? + (block ? stallSig->decodeBlockReason[i] : fifoBlockReason) : + StallReason::NoStall; toFetch->decodeInfo[i].blockReason = stallSig->fetchBlockReason[i]; if (active) { if (tid == InvalidThreadID) diff --git a/src/cpu/o3/decode.hh b/src/cpu/o3/decode.hh index 0f55d838b4..c548fad3c7 100644 --- a/src/cpu/o3/decode.hh +++ b/src/cpu/o3/decode.hh @@ -236,6 +236,9 @@ class Decode /** Fetch to decode delay. */ Cycles fetchToDecodeDelay; + /** Decode to fetch feedback delay for stage backpressure. */ + Cycles decodeToFetchDelay; + /** The width of decode, in instructions. 
*/ unsigned decodeWidth; diff --git a/src/cpu/o3/fetch.cc b/src/cpu/o3/fetch.cc index fba856f813..d2381123ab 100644 --- a/src/cpu/o3/fetch.cc +++ b/src/cpu/o3/fetch.cc @@ -98,7 +98,6 @@ Fetch::Fetch(CPU *_cpu, const BaseO3CPUParams ¶ms) fetchWidth(params.fetchWidth), decodeWidth(params.decodeWidth), retryPkt(), - retryTid(InvalidThreadID), cacheBlkSize(cpu->cacheLineSize()), fetchBufferSize(params.fetchBufferSize), fetchQueueSize(params.fetchQueueSize), @@ -460,6 +459,10 @@ Fetch::resetStage() { numInst = 0; interruptPending = false; + for (auto *pkt : retryPkt) { + delete pkt; + } + retryPkt.clear(); cacheBlocked = false; priorityList.clear(); @@ -489,7 +492,9 @@ Fetch::resetStage() } assert(dbpbtb); - dbpbtb->resetPC(threads[0].fetchpc->instAddr()); + for (ThreadID tid = 0; tid < numThreads; ++tid) { + dbpbtb->resetPC(tid, threads[tid].fetchpc->instAddr()); + } } bool @@ -587,8 +592,11 @@ Fetch::processMultiCacheLineCompletion(ThreadID tid, PacketPtr pkt) DPRINTF(Fetch, "[tid:%i] Waiting for remaining packets. 
Completed: %d, Total: %d\n", tid, threads[tid].cacheReq.completedPackets, threads[tid].cacheReq.packets.size()); - // Note: retry is handled completely by the standard gem5 recvReqRetry mechanism - // No need to handle retry here to avoid duplicate packet sending + if (cacheBlocked && !retryPkt.empty()) { + DPRINTF(Fetch, "[tid:%i] Cache response arrived with queued retries pending; " + "trying one response-driven retry pass\n", tid); + retryPendingIcacheRequests(); + } return false; // Return false to indicate we're still waiting } @@ -687,7 +695,6 @@ Fetch::drainSanityCheck() const { assert(isDrained()); assert(retryPkt.size() == 0); - assert(retryTid == InvalidThreadID); assert(!cacheBlocked); assert(!interruptPending); @@ -939,6 +946,16 @@ Fetch::handleSuccessfulTranslation(ThreadID tid, const RequestPtr &mem_req, Addr fetchStats.cacheLines++; + if (cacheBlocked) { + DPRINTF(Fetch, "[tid:%i] I-cache port already waiting for retry, queueing %#lx\n", + tid, mem_req->getVaddr()); + + updateCacheRequestStatusByRequest(tid, mem_req, CacheWaitRetry); + setAllFetchStalls(StallReason::IcacheStall); + retryPkt.push_back(data_pkt); + return; + } + // Access the cache. if (!icachePort.sendTimingReq(data_pkt)) { DPRINTF(Fetch, "[tid:%i] Out of MSHRs!\n", tid); @@ -950,7 +967,6 @@ Fetch::handleSuccessfulTranslation(ThreadID tid, const RequestPtr &mem_req, Addr mem_req->getVaddr()); setAllFetchStalls(StallReason::IcacheStall); retryPkt.push_back(data_pkt); - retryTid = tid; cacheBlocked = true; } else { DPRINTF(Fetch, "[tid:%i] Doing Icache access.\n", tid); @@ -1110,15 +1126,17 @@ Fetch::doSquash(PCStateBase &new_pc, const DynInstPtr squashInst, const InstSeqN // Reset the cache request after cancelling threads[tid].cacheReq.reset(); - // Get rid of the retrying packet if it was from this thread. - if (retryTid == tid) { - assert(cacheBlocked); - for (auto it : retryPkt) { - delete it; + // Drop any retry packets that belong to this squashed thread. 
+ for (auto it = retryPkt.begin(); it != retryPkt.end();) { + if (cpu->contextToThread((*it)->req->contextId()) == tid) { + delete *it; + it = retryPkt.erase(it); + } else { + ++it; } - retryPkt.clear(); - retryTid = InvalidThreadID; - cacheBlocked = false; // clear cache blocked + } + if (retryPkt.empty()) { + cacheBlocked = false; } if (squashInst && !squashInst->isControl()) { @@ -1577,11 +1595,11 @@ Fetch::handleIEWSignals() } bool success = dbpbtb->resolveUpdate(stream_id, tid); if (success) { - dbpbtb->notifyResolveSuccess(); + dbpbtb->notifyResolveSuccess(tid); resolveQueue.pop_front(); fetchStats.resolveDequeueCount++; } else { - dbpbtb->notifyResolveFailure(); + dbpbtb->notifyResolveFailure(tid); } } } @@ -1731,8 +1749,8 @@ Fetch::buildInst(ThreadID tid, StaticInstPtr staticInst, instruction->isMov()); assert(dbpbtb); DPRINTF(DecoupleBP, "Set instruction %lu with fetch id %lu\n", - instruction->seqNum, dbpbtb->ftqHeadId(0)); - instruction->setFtqId(dbpbtb->ftqHeadId(0)); + instruction->seqNum, dbpbtb->ftqHeadId(tid)); + instruction->setFtqId(dbpbtb->ftqHeadId(tid)); #if TRACING_ON if (trace) { @@ -2087,36 +2105,32 @@ Fetch::sendNextCacheRequest(ThreadID tid, const PCStateBase &pc_state) { void Fetch::recvReqRetry() { - if (retryPkt.size() == 0) { - assert(retryTid == InvalidThreadID); + if (retryPkt.empty()) { // Access has been squashed since it was sent out. Just clear // the cache being blocked. cacheBlocked = false; return; } assert(cacheBlocked); - assert(retryTid != InvalidThreadID); - // Note: In multi-cacheline fetch, overall status may not be CacheWaitRetry - // if some requests have progressed while others still need retry. - // The presence of retryPkt itself indicates retry is needed. 
+ retryPendingIcacheRequests(); +} - for (auto it = retryPkt.begin(); it != retryPkt.end();) { - if (icachePort.sendTimingReq(*it)) { - // Use new cache state management with specific RequestPtr - updateCacheRequestStatusByRequest(retryTid, (*it)->req, CacheWaitResponse); - // Notify Fetch Request probe when a retryPkt is successfully sent. - // Note that notify must be called before retryPkt is set to NULL. - ppFetchRequestSent->notify((*it)->req); - it = retryPkt.erase(it); - } else { - it++; +void +Fetch::retryPendingIcacheRequests() +{ + while (!retryPkt.empty()) { + PacketPtr pkt = retryPkt.front(); + if (!icachePort.sendTimingReq(pkt)) { + return; } - } - if (retryPkt.size() == 0) { - retryTid = InvalidThreadID; - cacheBlocked = false; + const ThreadID tid = cpu->contextToThread(pkt->req->contextId()); + updateCacheRequestStatusByRequest(tid, pkt->req, CacheWaitResponse); + ppFetchRequestSent->notify(pkt->req); + retryPkt.erase(retryPkt.begin()); } + + cacheBlocked = false; } void diff --git a/src/cpu/o3/fetch.hh b/src/cpu/o3/fetch.hh index 1d8c3e88aa..0061b87912 100644 --- a/src/cpu/o3/fetch.hh +++ b/src/cpu/o3/fetch.hh @@ -322,6 +322,9 @@ class Fetch /** Reset this pipeline stage */ void resetStage(); + /** Retry queued I-cache packets once, stopping at the first new block. */ + void retryPendingIcacheRequests(); + /** Changes the status of this stage to active, and indicates this * to the CPU. */ @@ -676,12 +679,9 @@ class Fetch /** Is the cache blocked? If so no threads can access it. */ bool cacheBlocked; - /** The packet that is waiting to be retried. */ + /** Packets waiting for the next cache-issued retry callback. */ std::vector retryPkt; - /** The thread that is waiting on the cache to tell fetch to retry. */ - ThreadID retryTid; - /** Cache block size. 
*/ unsigned int cacheBlkSize; diff --git a/src/cpu/o3/inst_queue.cc b/src/cpu/o3/inst_queue.cc index 72f99bbb8e..db8ec407f4 100644 --- a/src/cpu/o3/inst_queue.cc +++ b/src/cpu/o3/inst_queue.cc @@ -53,6 +53,7 @@ #include "cpu/o3/dyn_inst.hh" #include "cpu/o3/dyn_inst_ptr.hh" #include "cpu/o3/fu_pool.hh" +#include "cpu/o3/iew.hh" #include "cpu/o3/issue_queue.hh" #include "cpu/o3/limits.hh" #include "debug/IQ.hh" diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh index 504b4d4561..604df7c0f1 100644 --- a/src/cpu/o3/lsq.hh +++ b/src/cpu/o3/lsq.hh @@ -1143,7 +1143,6 @@ class LSQ std::vector dcacheRefillDataRead; std::vector dcacheRefillDataWrite; std::vector dcacheRefillTagWrite; - bool isDcacheRefillTagWrite() const { for (auto stage : dcacheRefillTagWrite) { diff --git a/src/cpu/o3/lsq_unit.cc b/src/cpu/o3/lsq_unit.cc index 433f3c17a8..9cfc4d791f 100644 --- a/src/cpu/o3/lsq_unit.cc +++ b/src/cpu/o3/lsq_unit.cc @@ -349,7 +349,8 @@ LSQUnit::completeDataAccess(PacketPtr pkt) if (inst->isLoad() || inst->isAtomic()) { Addr addr = pkt->getAddr(); auto [enable_diff, diff_all_states] = cpu->getDiffAllStates(); - if (system->multiCore() && enable_diff && !request->_sbufferBypass && + if (system->multiContextDifftest() && enable_diff && + !request->_sbufferBypass && cpu->goldenMemManager()->inPmem(addr)) { // check data with golden mem uint8_t *golden_data = (uint8_t *)cpu->goldenMemManager()->guestToHost(addr); diff --git a/src/cpu/o3/rename.cc b/src/cpu/o3/rename.cc index 0b9a1b47a8..84e3e0e031 100644 --- a/src/cpu/o3/rename.cc +++ b/src/cpu/o3/rename.cc @@ -79,6 +79,8 @@ Rename::Rename(CPU *_cpu, const BaseO3CPUParams ¶ms) fixedbuffer[tid] = boost::circular_buffer(renameWidth); renameMap[tid] = nullptr; stalls[tid] = {false, false}; + finalCommitSeq[tid] = 0; + releaseSeq[tid] = 0; } assert(decodeToRenameDelay == 1); @@ -261,6 +263,8 @@ Rename::resetStage() for (ThreadID tid = 0; tid < numThreads; tid++) { stalls[tid].iew = false; + finalCommitSeq[tid] = 0; + releaseSeq[tid] 
= 0; } } @@ -416,7 +420,15 @@ Rename::tick() updateActivate(); - if (wroteToTimeBuffer || releaseSeq < finalCommitSeq) { + bool release_pending = false; + for (ThreadID tid = 0; tid < numThreads; ++tid) { + if (releaseSeq[tid] < finalCommitSeq[tid]) { + release_pending = true; + break; + } + } + + if (wroteToTimeBuffer || release_pending) { DPRINTF(Activity, "Activity this cycle.\n"); cpu->activityThisCycle(); } @@ -427,21 +439,23 @@ Rename::releasePhysRegs() { // Release physical registers up to releaseWidth auto threads = activeThreads->begin(); - if (releaseSeq + releaseWidth < finalCommitSeq) { - releaseSeq += releaseWidth; - } else { - releaseSeq = finalCommitSeq; - } while (threads != activeThreads->end()) { ThreadID tid = *threads++; - removeFromHistory(releaseSeq, tid); + if (releaseSeq[tid] + releaseWidth < finalCommitSeq[tid]) { + releaseSeq[tid] += releaseWidth; + } else { + releaseSeq[tid] = finalCommitSeq[tid]; + } + + removeFromHistory(releaseSeq[tid], tid); // If we committed this cycle then doneSeqNum will be > 0 if (fromCommit->commitInfo[tid].doneSeqNum != 0 && !fromCommit->commitInfo[tid].squash) { - finalCommitSeq = fromCommit->commitInfo[tid].doneSeqNum; - releaseSeq = historyBuffer->empty() ? 0 : historyBuffer[tid].back().instSeqNum; + finalCommitSeq[tid] = fromCommit->commitInfo[tid].doneSeqNum; + releaseSeq[tid] = + historyBuffer[tid].empty() ? 
0 : historyBuffer[tid].back().instSeqNum; } } } diff --git a/src/cpu/o3/rename.hh b/src/cpu/o3/rename.hh index 4e83cc0919..861b0f82c2 100644 --- a/src/cpu/o3/rename.hh +++ b/src/cpu/o3/rename.hh @@ -277,9 +277,9 @@ class Rename */ std::list historyBuffer[MaxThreads]; - InstSeqNum finalCommitSeq = 0; + InstSeqNum finalCommitSeq[MaxThreads] = {}; - InstSeqNum releaseSeq = 0; + InstSeqNum releaseSeq[MaxThreads] = {}; void tryFreePReg(PhysRegIdPtr phys_reg); diff --git a/src/cpu/pred/btb/abtb.cc b/src/cpu/pred/btb/abtb.cc index c4876e8158..aeafc9bb38 100644 --- a/src/cpu/pred/btb/abtb.cc +++ b/src/cpu/pred/btb/abtb.cc @@ -313,8 +313,9 @@ AheadBTB::putPCHistory(Addr startAddr, } std::shared_ptr -AheadBTB::getPredictionMeta() +AheadBTB::getPredictionMeta(ThreadID tid) { + (void)tid; // Lazy-initialize meta so callers never observe a null pointer // This avoids early-cycle crashes when prediction hasn't populated meta yet if (!meta) { diff --git a/src/cpu/pred/btb/abtb.hh b/src/cpu/pred/btb/abtb.hh index 9e7abc6260..677f5f7f32 100644 --- a/src/cpu/pred/btb/abtb.hh +++ b/src/cpu/pred/btb/abtb.hh @@ -147,7 +147,7 @@ class AheadBTB : public TimedBaseBTBPredictor /** Get prediction BTBMeta * @return Returns the prediction meta */ - std::shared_ptr getPredictionMeta() override; + std::shared_ptr getPredictionMeta(ThreadID tid = 0) override; // not used void specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred) override; diff --git a/src/cpu/pred/btb/btb_ittage.cc b/src/cpu/pred/btb/btb_ittage.cc index aed825e1e3..e625650d10 100644 --- a/src/cpu/pred/btb/btb_ittage.cc +++ b/src/cpu/pred/btb/btb_ittage.cc @@ -37,6 +37,8 @@ ittageStats(this, p.numPredictors) tableIndexMasks.resize(numPredictors); tableTagBits.resize(numPredictors); tableTagMasks.resize(numPredictors); + threadHistory.resize(MaxThreads); + threadMeta.resize(MaxThreads); for (unsigned int i = 0; i < p.numPredictors; ++i) { //initialize ittage predictor assert(tableSizes.size() >= 
numPredictors); @@ -52,9 +54,15 @@ ittageStats(this, p.numPredictors) assert(tablePcShifts.size() >= numPredictors); - tagFoldedHist.push_back(PathFoldedHist((int)histLengths[i], (int)tableTagBits[i], (int)16)); - altTagFoldedHist.push_back(PathFoldedHist((int)histLengths[i], (int)tableTagBits[i]-1, (int)16)); - indexFoldedHist.push_back(PathFoldedHist((int)histLengths[i], (int)tableIndexBits[i], (int)16)); + for (ThreadID tid = 0; tid < MaxThreads; ++tid) { + auto &state = threadHistory[tid]; + state.tagFoldedHist.emplace_back( + (int)histLengths[i], (int)tableTagBits[i], (int)16); + state.altTagFoldedHist.emplace_back( + (int)histLengths[i], (int)tableTagBits[i] - 1, (int)16); + state.indexFoldedHist.emplace_back( + (int)histLengths[i], (int)tableIndexBits[i], (int)16); + } } // useAlt.resize(128); // for (unsigned i = 0; i < useAlt.size(); ++i) { @@ -63,6 +71,27 @@ ittageStats(this, p.numPredictors) usefulResetCnt = 0; } +ThreadID +BTBITTAGE::predictorTid(const std::vector &stagePreds) const +{ + assert(!stagePreds.empty()); + return stagePreds.front().tid; +} + +BTBITTAGE::ThreadHistoryState & +BTBITTAGE::historyState(ThreadID tid) +{ + assert(tid < threadHistory.size()); + return threadHistory[tid]; +} + +const BTBITTAGE::ThreadHistoryState & +BTBITTAGE::historyState(ThreadID tid) const +{ + assert(tid < threadHistory.size()); + return threadHistory[tid]; +} + void BTBITTAGE::tickStart() { @@ -72,7 +101,8 @@ void BTBITTAGE::tick() {} void -BTBITTAGE::lookupHelper(Addr startAddr, const std::vector &btbEntries, IndirectTargets& results) +BTBITTAGE::lookupHelper(Addr startAddr, const std::vector &btbEntries, + IndirectTargets& results, ThreadID tid) { DPRINTF(ITTAGE, "lookupHelper startAddr: %#lx\n", startAddr); std::vector preds; @@ -149,7 +179,7 @@ BTBITTAGE::lookupHelper(Addr startAddr, const std::vector &btbEntries, } // Note: predTargetHit will be updated in the update phase when we know the actual target TagePrediction pred(btb_entry.pc, main_info, 
alt_info, use_alt, main_target); - meta->preds[btb_entry.pc] = pred; + threadMeta[tid]->preds[btb_entry.pc] = pred; } } } @@ -161,17 +191,19 @@ BTBITTAGE::dryRunCycle(Addr startPC) { void BTBITTAGE::putPCHistory(Addr stream_start, const bitset &history, std::vector &stagePreds) { + const ThreadID tid = predictorTid(stagePreds); + const auto &state = historyState(tid); if (debugPC == stream_start) { debugFlag = true; } DPRINTF(ITTAGE, "putPCHistory startAddr: %#lx\n", stream_start); // clear old metas - meta = std::make_shared(); + threadMeta[tid] = std::make_shared(); // assign history for meta - meta->tagFoldedHist = tagFoldedHist; - meta->altTagFoldedHist = altTagFoldedHist; - meta->indexFoldedHist = indexFoldedHist; + threadMeta[tid]->tagFoldedHist = state.tagFoldedHist; + threadMeta[tid]->altTagFoldedHist = state.altTagFoldedHist; + threadMeta[tid]->indexFoldedHist = state.indexFoldedHist; lookupEntries.clear(); lookupIndices.clear(); @@ -180,8 +212,9 @@ BTBITTAGE::putPCHistory(Addr stream_start, const bitset &history, std::vectorusefulMask = std::move(useful_mask); + threadMeta[tid]->usefulMask = std::move(useful_mask); for (int s = getDelay(); s < stagePreds.size(); s++) { auto &stage_pred = stagePreds[s]; stage_pred.indirectTargets.clear(); - lookupHelper(stream_start, stage_pred.btbEntries, stage_pred.indirectTargets); + lookupHelper(stream_start, stage_pred.btbEntries, + stage_pred.indirectTargets, tid); } DPRINTF(ITTAGE, "putPCHistory end\n"); debugFlag = false; } std::shared_ptr -BTBITTAGE::getPredictionMeta() { - return meta; +BTBITTAGE::getPredictionMeta(ThreadID tid) { + if (tid >= threadMeta.size()) { + return nullptr; + } + return threadMeta[tid]; } void @@ -419,7 +456,9 @@ BTBITTAGE::getTageTag(Addr pc, int t, uint64_t foldedHist, uint64_t altFoldedHis Addr BTBITTAGE::getTageTag(Addr pc, int t) { - return getTageTag(pc, t, tagFoldedHist[t].get(), altTagFoldedHist[t].get()); + const auto &state = historyState(0); + return getTageTag(pc, t, 
state.tagFoldedHist[t].get(), + state.altTagFoldedHist[t].get()); } Addr @@ -436,7 +475,7 @@ BTBITTAGE::getTageIndex(Addr pc, int t, uint64_t foldedHist) Addr BTBITTAGE::getTageIndex(Addr pc, int t) { - return getTageIndex(pc, t, indexFoldedHist[t].get()); + return getTageIndex(pc, t, historyState(0).indexFoldedHist[t].get()); } bool @@ -477,8 +516,10 @@ BTBITTAGE::satDecrement(int min, short &counter) * @param target The target address of the branch */ void -BTBITTAGE::doUpdateHist(const boost::dynamic_bitset<> &history, bool taken, Addr pc, Addr target) +BTBITTAGE::doUpdateHist(const boost::dynamic_bitset<> &history, bool taken, + Addr pc, Addr target, ThreadID tid) { + auto &state = historyState(tid); if (debug::ITTAGEHistory) { // if debug flag is off, do not use to_string since it's too slow std::string buf; boost::to_string(history, buf); @@ -491,7 +532,9 @@ BTBITTAGE::doUpdateHist(const boost::dynamic_bitset<> &history, bool taken, Addr for (int t = 0; t < numPredictors; t++) { for (int type = 0; type < 3; type++) { - auto &foldedHist = type == 0 ? indexFoldedHist[t] : type == 1 ? tagFoldedHist[t] : altTagFoldedHist[t]; + auto &foldedHist = type == 0 ? state.indexFoldedHist[t] + : type == 1 ? 
state.tagFoldedHist[t] + : state.altTagFoldedHist[t]; // since we have folded path history, we can put arbitrary shamt here, and it wouldn't make a difference foldedHist.update(history, 2, taken, pc, target); DPRINTF(ITTAGEHistory, "t: %d, type: %d, foldedHist _folded 0x%lx\n", t, type, foldedHist.get()); @@ -502,7 +545,7 @@ BTBITTAGE::doUpdateHist(const boost::dynamic_bitset<> &history, bool taken, Addr bool BTBITTAGE::tageHit() { - auto meta = getPredictionMeta(); + auto meta = getPredictionMeta(0); auto preds = std::static_pointer_cast(meta)->preds; bool hit = false; for (auto & [pc, pred] : preds) { @@ -530,7 +573,7 @@ void BTBITTAGE::specUpdatePHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred) { auto [pc, target, taken] = pred.getPHistInfo(); - doUpdateHist(history, taken, pc, target); + doUpdateHist(history, taken, pc, target, pred.tid); } /** @@ -549,18 +592,28 @@ BTBITTAGE::specUpdatePHist(const boost::dynamic_bitset<> &history, FullBTBPredic void BTBITTAGE::recoverPHist(const boost::dynamic_bitset<> &history, const FetchTarget &entry, int shamt, bool cond_taken) { + auto &state = historyState(entry.tid); std::shared_ptr predMeta = std::static_pointer_cast(entry.predMetas[getComponentIdx()]); for (int i = 0; i < numPredictors; i++) { - tagFoldedHist[i].recover(predMeta->tagFoldedHist[i]); - altTagFoldedHist[i].recover(predMeta->altTagFoldedHist[i]); - indexFoldedHist[i].recover(predMeta->indexFoldedHist[i]); + state.tagFoldedHist[i].recover(predMeta->tagFoldedHist[i]); + state.altTagFoldedHist[i].recover(predMeta->altTagFoldedHist[i]); + state.indexFoldedHist[i].recover(predMeta->indexFoldedHist[i]); } - doUpdateHist(history, cond_taken, entry.getControlPC(), entry.getTakenTarget()); + doUpdateHist(history, cond_taken, entry.getControlPC(), + entry.getTakenTarget(), entry.tid); } void BTBITTAGE::checkFoldedHist(const boost::dynamic_bitset<> &hist, const char * when) { + checkFoldedHist(hist, 0, when); +} + +void 
+BTBITTAGE::checkFoldedHist(const boost::dynamic_bitset<> &hist, ThreadID tid, + const char * when) +{ + auto &state = historyState(tid); if (debugFlag) { DPRINTF(ITTAGE, "checking folded history when %s\n", when); std::string hist_str; @@ -571,7 +624,9 @@ BTBITTAGE::checkFoldedHist(const boost::dynamic_bitset<> &hist, const char * whe for (int type = 0; type < 2; type++) { DPRINTF(ITTAGE, "t: %d, type: %d\n", t, type); std::string buf2, buf3; - auto &foldedHist = type == 0 ? indexFoldedHist[t] : type == 1 ? tagFoldedHist[t] : altTagFoldedHist[t]; + auto &foldedHist = type == 0 ? state.indexFoldedHist[t] + : type == 1 ? state.tagFoldedHist[t] + : state.altTagFoldedHist[t]; foldedHist.check(hist); } } diff --git a/src/cpu/pred/btb/btb_ittage.hh b/src/cpu/pred/btb/btb_ittage.hh index e86b45817b..8269fdaeb6 100644 --- a/src/cpu/pred/btb/btb_ittage.hh +++ b/src/cpu/pred/btb/btb_ittage.hh @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -10,6 +11,7 @@ #include "base/statistics.hh" #include "base/types.hh" #include "cpu/inst_seq.hh" +#include "cpu/o3/limits.hh" #include "cpu/pred/btb/common.hh" #include "cpu/pred/btb/folded_hist.hh" #include "cpu/pred/btb/timed_base_pred.hh" @@ -30,6 +32,7 @@ class BTBITTAGE : public TimedBaseBTBPredictor { using defer = std::shared_ptr; using bitset = boost::dynamic_bitset<>; + static constexpr unsigned MaxThreads = o3::MaxThreads; public: typedef BTBITTAGEParams Params; @@ -99,7 +102,7 @@ class BTBITTAGE : public TimedBaseBTBPredictor const boost::dynamic_bitset<> &history, std::vector &stagePreds) override; - std::shared_ptr getPredictionMeta() override; + std::shared_ptr getPredictionMeta(ThreadID tid = 0) override; // speculative update 3 folded history, according history and pred.taken // the other specUpdateHist methods are left blank @@ -116,11 +119,13 @@ class BTBITTAGE : public TimedBaseBTBPredictor // check folded hists after speculative update and recover void checkFoldedHist(const bitset &history, const char 
*when); + void checkFoldedHist(const bitset &history, ThreadID tid, const char *when); private: // return provided - void lookupHelper(Addr stream_start, const std::vector &btbEntries, IndirectTargets& results); + void lookupHelper(Addr stream_start, const std::vector &btbEntries, + IndirectTargets& results, ThreadID tid); // use blockPC Addr getTageIndex(Addr pc, int table); @@ -139,7 +144,8 @@ class BTBITTAGE : public TimedBaseBTBPredictor } // Update branch history - void doUpdateHist(const bitset &history, bool taken, Addr pc, Addr target); + void doUpdateHist(const bitset &history, bool taken, Addr pc, Addr target, + ThreadID tid); const unsigned numPredictors; @@ -151,9 +157,14 @@ class BTBITTAGE : public TimedBaseBTBPredictor std::vector tableTagMasks; std::vector tablePcShifts; std::vector histLengths; - std::vector tagFoldedHist; - std::vector altTagFoldedHist; - std::vector indexFoldedHist; + struct ThreadHistoryState + { + std::vector tagFoldedHist; + std::vector altTagFoldedHist; + std::vector indexFoldedHist; + }; + + std::vector threadHistory; LFSR64 allocLFSR; @@ -261,7 +272,10 @@ class BTBITTAGE : public TimedBaseBTBPredictor } } TageMeta; - std::shared_ptr meta; + std::vector> threadMeta; + ThreadID predictorTid(const std::vector &stagePreds) const; + ThreadHistoryState &historyState(ThreadID tid); + const ThreadHistoryState &historyState(ThreadID tid) const; public: diff --git a/src/cpu/pred/btb/btb_mgsc.cc b/src/cpu/pred/btb/btb_mgsc.cc index b2b7726a5f..f0a3837191 100755 --- a/src/cpu/pred/btb/btb_mgsc.cc +++ b/src/cpu/pred/btb/btb_mgsc.cc @@ -60,41 +60,64 @@ BTBMGSC::initStorage() assert(isPowerOf2(numCtrsPerLine)); numCtrsPerLineBits = log2i(numCtrsPerLine); + threadHistory.resize(MaxThreads); + threadMeta.resize(MaxThreads); + auto bwTableSize = allocPredTable(bwTable, bwTableNum, bwTableIdxWidth); - for (unsigned int i = 0; i < bwTableNum; ++i) { - indexBwFoldedHist.push_back(GlobalBwFoldedHist(bwHistLen[i], bwTableIdxWidth - 
numCtrsPerLineBits, 16)); + for (ThreadID tid = 0; tid < MaxThreads; ++tid) { + auto &state = threadHistory[tid]; + for (unsigned int i = 0; i < bwTableNum; ++i) { + state.indexBwFoldedHist.emplace_back( + bwHistLen[i], bwTableIdxWidth - numCtrsPerLineBits, 16); + } } bwIndex.resize(bwTableNum); auto lTableSize = allocPredTable(lTable, lTableNum, lTableIdxWidth); - indexLFoldedHist.resize(numEntriesFirstLocalHistories); - for (unsigned int i = 0; i < lTableNum; ++i) { - for (unsigned int k = 0; k < numEntriesFirstLocalHistories; ++k) { - indexLFoldedHist[k].push_back(LocalFoldedHist(lHistLen[i], lTableIdxWidth - numCtrsPerLineBits, 16)); + for (ThreadID tid = 0; tid < MaxThreads; ++tid) { + auto &state = threadHistory[tid]; + state.indexLFoldedHist.resize(numEntriesFirstLocalHistories); + for (unsigned int i = 0; i < lTableNum; ++i) { + for (unsigned int k = 0; k < numEntriesFirstLocalHistories; ++k) { + state.indexLFoldedHist[k].push_back(LocalFoldedHist( + lHistLen[i], lTableIdxWidth - numCtrsPerLineBits, 16)); + } } } lIndex.resize(lTableNum); auto iTableSize = allocPredTable(iTable, iTableNum, iTableIdxWidth); - for (unsigned int i = 0; i < iTableNum; ++i) { - assert(iHistLen[i] >= 0); - assert(static_cast(iHistLen[i]) < 63); - assert(pow2(static_cast(iHistLen[i])) <= iTableSize); - indexIFoldedHist.push_back(ImliFoldedHist(iHistLen[i], iTableIdxWidth - numCtrsPerLineBits, 16)); + for (ThreadID tid = 0; tid < MaxThreads; ++tid) { + auto &state = threadHistory[tid]; + for (unsigned int i = 0; i < iTableNum; ++i) { + assert(iHistLen[i] >= 0); + assert(static_cast(iHistLen[i]) < 63); + assert(pow2(static_cast(iHistLen[i])) <= iTableSize); + state.indexIFoldedHist.emplace_back( + iHistLen[i], iTableIdxWidth - numCtrsPerLineBits, 16); + } } iIndex.resize(iTableNum); auto gTableSize = allocPredTable(gTable, gTableNum, gTableIdxWidth); - for (unsigned int i = 0; i < gTableNum; ++i) { - assert(gTable.size() >= gTableNum); - 
indexGFoldedHist.push_back(GlobalFoldedHist(gHistLen[i], gTableIdxWidth - numCtrsPerLineBits, 16)); + for (ThreadID tid = 0; tid < MaxThreads; ++tid) { + auto &state = threadHistory[tid]; + for (unsigned int i = 0; i < gTableNum; ++i) { + assert(gTable.size() >= gTableNum); + state.indexGFoldedHist.emplace_back( + gHistLen[i], gTableIdxWidth - numCtrsPerLineBits, 16); + } } gIndex.resize(gTableNum); auto pTableSize = allocPredTable(pTable, pTableNum, pTableIdxWidth); - for (unsigned int i = 0; i < pTableNum; ++i) { - assert(pTable.size() >= pTableNum); - indexPFoldedHist.push_back(PathFoldedHist(pHistLen[i], pTableIdxWidth - numCtrsPerLineBits, 2)); + for (ThreadID tid = 0; tid < MaxThreads; ++tid) { + auto &state = threadHistory[tid]; + for (unsigned int i = 0; i < pTableNum; ++i) { + assert(pTable.size() >= pTableNum); + state.indexPFoldedHist.emplace_back( + pHistLen[i], pTableIdxWidth - numCtrsPerLineBits, 2); + } } pIndex.resize(pTableNum); @@ -219,6 +242,27 @@ BTBMGSC::BTBMGSC(const Params &p) #endif BTBMGSC::~BTBMGSC() {} +ThreadID +BTBMGSC::predictorTid(const std::vector &stagePreds) const +{ + assert(!stagePreds.empty()); + return stagePreds.front().tid; +} + +BTBMGSC::ThreadHistoryState & +BTBMGSC::historyState(ThreadID tid) +{ + assert(tid < threadHistory.size()); + return threadHistory[tid]; +} + +const BTBMGSC::ThreadHistoryState & +BTBMGSC::historyState(ThreadID tid) const +{ + assert(tid < threadHistory.size()); + return threadHistory[tid]; +} + // Set up tracing for debugging void BTBMGSC::setTrace() @@ -357,34 +401,41 @@ BTBMGSC::calculateWeightScaleDiff(int total_sum, int scale_percsum, int percsum) * @return TagePrediction containing main and alternative predictions */ BTBMGSC::MgscPrediction -BTBMGSC::generateSinglePrediction(const BTBEntry &btb_entry, const Addr &startPC, const TageInfoForMGSC &tage_info) +BTBMGSC::generateSinglePrediction(const BTBEntry &btb_entry, const Addr &startPC, + const TageInfoForMGSC &tage_info, + ThreadID tid) { 
DPRINTF(MGSC, "generateSinglePrediction for btbEntry: %#lx, always taken %d\n", btb_entry.pc, btb_entry.alwaysTaken); + const auto &state = historyState(tid); // Calculate indices for all tables for (unsigned int i = 0; i < bwTableNum; ++i) { - bwIndex[i] = getHistIndex(startPC, bwTableIdxWidth - numCtrsPerLineBits, indexBwFoldedHist[i].get()); + bwIndex[i] = getHistIndex(startPC, bwTableIdxWidth - numCtrsPerLineBits, + state.indexBwFoldedHist[i].get()); } for (unsigned int i = 0; i < lTableNum; ++i) { lIndex[i] = getHistIndex(startPC, lTableIdxWidth - numCtrsPerLineBits, - indexLFoldedHist[getPcIndex(startPC, log2(numEntriesFirstLocalHistories))][i].get()); + state.indexLFoldedHist[getPcIndex(startPC, log2(numEntriesFirstLocalHistories))][i].get()); } // std::string buf; // boost::to_string(indexLFoldedHist[getPcIndex(startPC, log2(numEntriesFirstLocalHistories))][0].getAsBitset(), buf); // DPRINTF(MGSC, "startPC: %#lx, local index: %d, local_folded_hist: %s\n", startPC, lIndex[0], buf.c_str()); for (unsigned int i = 0; i < iTableNum; ++i) { - iIndex[i] = getHistIndex(startPC, iTableIdxWidth - numCtrsPerLineBits, indexIFoldedHist[i].get()); + iIndex[i] = getHistIndex(startPC, iTableIdxWidth - numCtrsPerLineBits, + state.indexIFoldedHist[i].get()); } for (unsigned int i = 0; i < gTableNum; ++i) { - gIndex[i] = getHistIndex(startPC, gTableIdxWidth - numCtrsPerLineBits, indexGFoldedHist[i].get()); + gIndex[i] = getHistIndex(startPC, gTableIdxWidth - numCtrsPerLineBits, + state.indexGFoldedHist[i].get()); } for (unsigned int i = 0; i < pTableNum; ++i) { - pIndex[i] = getHistIndex(startPC, pTableIdxWidth - numCtrsPerLineBits, indexPFoldedHist[i].get()); + pIndex[i] = getHistIndex(startPC, pTableIdxWidth - numCtrsPerLineBits, + state.indexPFoldedHist[i].get()); } for (unsigned int i = 0; i < biasTableNum; ++i) { @@ -478,7 +529,8 @@ BTBMGSC::generateSinglePrediction(const BTBEntry &btb_entry, const Addr &startPC */ void BTBMGSC::lookupHelper(const Addr &startPC, const 
std::vector &btbEntries, - const std::unordered_map &tageInfoForMgscs, CondTakens &results) + const std::unordered_map &tageInfoForMgscs, + CondTakens &results, ThreadID tid) { DPRINTF(MGSC, "lookupHelper startAddr: %#lx\n", startPC); @@ -488,8 +540,9 @@ BTBMGSC::lookupHelper(const Addr &startPC, const std::vector &btbEntri if (btb_entry.isCond && btb_entry.valid) { auto tage_info = tageInfoForMgscs.find(btb_entry.pc); if (tage_info != tageInfoForMgscs.end()) { - auto pred = generateSinglePrediction(btb_entry, startPC, tage_info->second); - meta->preds[btb_entry.pc] = pred; + auto pred = generateSinglePrediction(btb_entry, startPC, + tage_info->second, tid); + threadMeta[tid]->preds[btb_entry.pc] = pred; results.push_back({btb_entry.pc, pred.taken || btb_entry.alwaysTaken}); } else { assert(false); @@ -514,6 +567,8 @@ void BTBMGSC::putPCHistory(Addr stream_start, const boost::dynamic_bitset<> &history, std::vector &stagePreds) { + const ThreadID tid = predictorTid(stagePreds); + const auto &state = historyState(tid); DPRINTF(MGSC, "putPCHistory startAddr: %#lx\n", stream_start); // IMPORTANT: when this function is called, @@ -525,25 +580,29 @@ BTBMGSC::putPCHistory(Addr stream_start, const boost::dynamic_bitset<> &history, } // Clear old prediction metadata and save current history state - meta = std::make_shared(); - meta->indexBwFoldedHist = indexBwFoldedHist; - meta->indexLFoldedHist = indexLFoldedHist; - meta->indexIFoldedHist = indexIFoldedHist; - meta->indexGFoldedHist = indexGFoldedHist; - meta->indexPFoldedHist = indexPFoldedHist; + threadMeta[tid] = std::make_shared(); + threadMeta[tid]->indexBwFoldedHist = state.indexBwFoldedHist; + threadMeta[tid]->indexLFoldedHist = state.indexLFoldedHist; + threadMeta[tid]->indexIFoldedHist = state.indexIFoldedHist; + threadMeta[tid]->indexGFoldedHist = state.indexGFoldedHist; + threadMeta[tid]->indexPFoldedHist = state.indexPFoldedHist; for (int s = getDelay(); s < stagePreds.size(); s++) { // TODO: only lookup once 
for one btb entry in different stages auto &stage_pred = stagePreds[s]; stage_pred.condTakens.clear(); - lookupHelper(stream_start, stage_pred.btbEntries, stage_pred.tageInfoForMgscs, stage_pred.condTakens); + lookupHelper(stream_start, stage_pred.btbEntries, + stage_pred.tageInfoForMgscs, stage_pred.condTakens, tid); } } std::shared_ptr -BTBMGSC::getPredictionMeta() +BTBMGSC::getPredictionMeta(ThreadID tid) { - return meta; + if (tid >= threadMeta.size()) { + return nullptr; + } + return threadMeta[tid]; } /** @@ -1092,10 +1151,11 @@ BTBMGSC::doUpdateHist(const boost::dynamic_bitset<> &history, int shamt, bool ta void BTBMGSC::specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred) { + auto &state = historyState(pred.tid); int shamt; bool cond_taken; std::tie(shamt, cond_taken) = pred.getHistInfo(); - doUpdateHist(history, shamt, cond_taken, indexGFoldedHist); // use global history to update G folded history + doUpdateHist(history, shamt, cond_taken, state.indexGFoldedHist); // use global history to update G folded history } /** @@ -1113,8 +1173,9 @@ BTBMGSC::specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPredictio void BTBMGSC::specUpdatePHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred) { + auto &state = historyState(pred.tid); auto [pc, target, taken] = pred.getPHistInfo(); - doUpdateHist(history, 2, taken, indexPFoldedHist, pc, target); // only path history needs pc! + doUpdateHist(history, 2, taken, state.indexPFoldedHist, pc, target); // only path history needs pc! 
} @@ -1133,10 +1194,11 @@ BTBMGSC::specUpdatePHist(const boost::dynamic_bitset<> &history, FullBTBPredicti void BTBMGSC::specUpdateBwHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred) { + auto &state = historyState(pred.tid); int shamt; bool cond_taken; std::tie(shamt, cond_taken) = pred.getBwHistInfo(); - doUpdateHist(history, shamt, cond_taken, indexBwFoldedHist); + doUpdateHist(history, shamt, cond_taken, state.indexBwFoldedHist); } /** @@ -1154,12 +1216,13 @@ BTBMGSC::specUpdateBwHist(const boost::dynamic_bitset<> &history, FullBTBPredict void BTBMGSC::specUpdateIHist(FullBTBPrediction &pred) { + auto &state = historyState(pred.tid); int shamt; bool cond_taken; std::tie(shamt, cond_taken) = pred.getBwHistInfo(); // IMLI uses counter only, pass empty bitset (not used by ImliFoldedHist::update) boost::dynamic_bitset<> dummy; - doUpdateHist(dummy, shamt, cond_taken, indexIFoldedHist); + doUpdateHist(dummy, shamt, cond_taken, state.indexIFoldedHist); } /** @@ -1177,11 +1240,12 @@ BTBMGSC::specUpdateIHist(FullBTBPrediction &pred) void BTBMGSC::specUpdateLHist(const std::vector> &history, FullBTBPrediction &pred) { + auto &state = historyState(pred.tid); int shamt; bool cond_taken; std::tie(shamt, cond_taken) = pred.getHistInfo(); doUpdateHist(history[getPcIndex(pred.bbStart, log2(numEntriesFirstLocalHistories))], shamt, cond_taken, - indexLFoldedHist[getPcIndex(pred.bbStart, log2(numEntriesFirstLocalHistories))]); + state.indexLFoldedHist[getPcIndex(pred.bbStart, log2(numEntriesFirstLocalHistories))]); } /** @@ -1203,11 +1267,12 @@ BTBMGSC::recoverHist(const boost::dynamic_bitset<> &history, const FetchTarget & if (!isEnabled()) { return; // No recover when disabled } + auto &state = historyState(entry.tid); std::shared_ptr predMeta = std::static_pointer_cast(entry.predMetas[getComponentIdx()]); for (int i = 0; i < gTableNum; i++) { - indexGFoldedHist[i].recover(predMeta->indexGFoldedHist[i]); + 
state.indexGFoldedHist[i].recover(predMeta->indexGFoldedHist[i]); } - doUpdateHist(history, shamt, cond_taken, indexGFoldedHist); + doUpdateHist(history, shamt, cond_taken, state.indexGFoldedHist); } /** @@ -1229,11 +1294,13 @@ BTBMGSC::recoverPHist(const boost::dynamic_bitset<> &history, const FetchTarget if (!isEnabled()) { return; // No recover when disabled } + auto &state = historyState(entry.tid); std::shared_ptr predMeta = std::static_pointer_cast(entry.predMetas[getComponentIdx()]); for (int i = 0; i < pTableNum; i++) { - indexPFoldedHist[i].recover(predMeta->indexPFoldedHist[i]); + state.indexPFoldedHist[i].recover(predMeta->indexPFoldedHist[i]); } - doUpdateHist(history, 2, cond_taken, indexPFoldedHist, entry.getControlPC(), entry.getTakenTarget()); + doUpdateHist(history, 2, cond_taken, state.indexPFoldedHist, + entry.getControlPC(), entry.getTakenTarget()); } /** @@ -1255,11 +1322,12 @@ BTBMGSC::recoverBwHist(const boost::dynamic_bitset<> &history, const FetchTarget if (!isEnabled()) { return; // No recover when disabled } + auto &state = historyState(entry.tid); std::shared_ptr predMeta = std::static_pointer_cast(entry.predMetas[getComponentIdx()]); for (int i = 0; i < bwTableNum; i++) { - indexBwFoldedHist[i].recover(predMeta->indexBwFoldedHist[i]); + state.indexBwFoldedHist[i].recover(predMeta->indexBwFoldedHist[i]); } - doUpdateHist(history, shamt, cond_taken, indexBwFoldedHist); + doUpdateHist(history, shamt, cond_taken, state.indexBwFoldedHist); } /** @@ -1281,13 +1349,14 @@ BTBMGSC::recoverIHist(const FetchTarget &entry, int shamt, bool cond_taken) if (!isEnabled()) { return; // No recover when disabled } + auto &state = historyState(entry.tid); std::shared_ptr predMeta = std::static_pointer_cast(entry.predMetas[getComponentIdx()]); for (int i = 0; i < iTableNum; i++) { - indexIFoldedHist[i].recover(predMeta->indexIFoldedHist[i]); + state.indexIFoldedHist[i].recover(predMeta->indexIFoldedHist[i]); } // IMLI uses counter only, pass empty bitset 
(not used by ImliFoldedHist::update) boost::dynamic_bitset<> dummy; - doUpdateHist(dummy, shamt, cond_taken, indexIFoldedHist); + doUpdateHist(dummy, shamt, cond_taken, state.indexIFoldedHist); } /** @@ -1310,14 +1379,15 @@ BTBMGSC::recoverLHist(const std::vector> &history, const if (!isEnabled()) { return; // No recover when disabled } + auto &state = historyState(entry.tid); std::shared_ptr predMeta = std::static_pointer_cast(entry.predMetas[getComponentIdx()]); for (unsigned int k = 0; k < numEntriesFirstLocalHistories; ++k) { for (int i = 0; i < lTableNum; i++) { - indexLFoldedHist[k][i].recover(predMeta->indexLFoldedHist[k][i]); + state.indexLFoldedHist[k][i].recover(predMeta->indexLFoldedHist[k][i]); } } doUpdateHist(history[getPcIndex(entry.startPC, log2(numEntriesFirstLocalHistories))], shamt, cond_taken, - indexLFoldedHist[getPcIndex(entry.startPC, log2(numEntriesFirstLocalHistories))]); + state.indexLFoldedHist[getPcIndex(entry.startPC, log2(numEntriesFirstLocalHistories))]); } #ifndef UNIT_TEST @@ -1438,6 +1508,15 @@ void BTBMGSC::checkFoldedHist(const boost::dynamic_bitset<> &Ghistory, const boost::dynamic_bitset<> &PHistory, const std::vector> &LHistory, const char *when) { + checkFoldedHist(Ghistory, PHistory, LHistory, 0, when); +} + +void +BTBMGSC::checkFoldedHist(const boost::dynamic_bitset<> &Ghistory, const boost::dynamic_bitset<> &PHistory, + const std::vector> &LHistory, + ThreadID tid, const char *when) +{ + auto &state = historyState(tid); DPRINTF(MGSC, "checking folded history when %s\n", when); if (debug::MGSC) { std::string hist_str; @@ -1445,17 +1524,17 @@ BTBMGSC::checkFoldedHist(const boost::dynamic_bitset<> &Ghistory, const boost::d DPRINTF(MGSC, "history:\t%s\n", hist_str.c_str()); } for (int t = 0; t < gTableNum; t++) { - auto &foldedHist = indexGFoldedHist[t]; + auto &foldedHist = state.indexGFoldedHist[t]; foldedHist.check(Ghistory); } for (int t = 0; t < pTableNum; t++) { - auto &foldedHist = indexPFoldedHist[t]; + auto 
&foldedHist = state.indexPFoldedHist[t]; foldedHist.check(PHistory); } for (int t = 0; t < lTableNum; t++) { - assert(LHistory.size() == indexLFoldedHist.size()); + assert(LHistory.size() == state.indexLFoldedHist.size()); for (int i = 0; i < LHistory.size(); i++) { - auto &foldedHist = indexLFoldedHist[i][t]; + auto &foldedHist = state.indexLFoldedHist[i][t]; foldedHist.check(LHistory[i]); } } diff --git a/src/cpu/pred/btb/btb_mgsc.hh b/src/cpu/pred/btb/btb_mgsc.hh index 100fc639a4..6ff29b13c8 100755 --- a/src/cpu/pred/btb/btb_mgsc.hh +++ b/src/cpu/pred/btb/btb_mgsc.hh @@ -14,6 +14,7 @@ #include "base/sat_counter.hh" #include "base/types.hh" +#include "cpu/o3/limits.hh" #include "cpu/pred/btb/common.hh" #include "cpu/pred/btb/folded_hist.hh" #include "cpu/pred/btb/timed_base_pred.hh" @@ -39,6 +40,7 @@ namespace test { class BTBMGSC : public TimedBaseBTBPredictor { + static constexpr unsigned MaxThreads = o3::MaxThreads; public: #ifdef UNIT_TEST BTBMGSC(); @@ -157,7 +159,7 @@ class BTBMGSC : public TimedBaseBTBPredictor void putPCHistory(Addr startAddr, const boost::dynamic_bitset<> &history, std::vector &stagePreds) override; - std::shared_ptr getPredictionMeta() override; + std::shared_ptr getPredictionMeta(ThreadID tid = 0) override; // speculative update all folded history, according history and pred.taken void specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred) override; @@ -191,6 +193,9 @@ class BTBMGSC : public TimedBaseBTBPredictor // check folded hists after speculative update and recover void checkFoldedHist(const boost::dynamic_bitset<> &Ghistory, const boost::dynamic_bitset<> &PHistory, const std::vector> &LHistory, const char *when); // Check GHR folded + void checkFoldedHist(const boost::dynamic_bitset<> &Ghistory, const boost::dynamic_bitset<> &PHistory, + const std::vector> &LHistory, + ThreadID tid, const char *when); // Check GHR folded // Calculate MGSC weight index Addr getPcIndex(Addr pc, unsigned tableIndexBits); @@ 
-247,7 +252,8 @@ class BTBMGSC : public TimedBaseBTBPredictor // Look up predictions in MGSC tables for a stream of instructions void lookupHelper(const Addr &stream_start, const std::vector &btbEntries, - const std::unordered_map &tageInfoForMgscs, CondTakens &results); + const std::unordered_map &tageInfoForMgscs, + CondTakens &results, ThreadID tid); // Calculate MGSC history index with folded history Addr getHistIndex(Addr pc, unsigned tableIndexBits, uint64_t foldedHist); @@ -277,7 +283,8 @@ class BTBMGSC : public TimedBaseBTBPredictor // Helper method to generate prediction for a single BTB entry MgscPrediction generateSinglePrediction(const BTBEntry &btb_entry, const Addr &startPC, - const TageInfoForMGSC &tage_info); + const TageInfoForMGSC &tage_info, + ThreadID tid); // Helper method to prepare BTB entries for update std::vector prepareUpdateEntries(const FetchTarget &stream); @@ -353,12 +360,16 @@ class BTBMGSC : public TimedBaseBTBPredictor bool enablePCThreshold; Addr focusBranchPC; - // Folded history for index calculation - std::vector indexBwFoldedHist; - std::vector> indexLFoldedHist; - std::vector indexIFoldedHist; - std::vector indexGFoldedHist; - std::vector indexPFoldedHist; + struct ThreadHistoryState + { + std::vector indexBwFoldedHist; + std::vector> indexLFoldedHist; + std::vector indexIFoldedHist; + std::vector indexGFoldedHist; + std::vector indexPFoldedHist; + }; + + std::vector threadHistory; // The actual MGSC prediction tables (table x index x line) std::vector>> bwTable; @@ -552,8 +563,9 @@ class BTBMGSC : public TimedBaseBTBPredictor static const std::unordered_map &preds(const BTBMGSC &mgsc) { - assert(mgsc.meta); - return mgsc.meta->preds; + assert(!mgsc.threadMeta.empty()); + assert(mgsc.threadMeta[0]); + return mgsc.threadMeta[0]->preds; } }; #endif @@ -594,7 +606,10 @@ class BTBMGSC : public TimedBaseBTBPredictor } } MgscMeta; - std::shared_ptr meta; + std::vector> threadMeta; + ThreadID predictorTid(const std::vector 
&stagePreds) const; + ThreadHistoryState &historyState(ThreadID tid); + const ThreadHistoryState &historyState(ThreadID tid) const; }; // Close conditional namespace wrapper for testing diff --git a/src/cpu/pred/btb/btb_tage.cc b/src/cpu/pred/btb/btb_tage.cc index 85db48441a..c81bfb1a1d 100644 --- a/src/cpu/pred/btb/btb_tage.cc +++ b/src/cpu/pred/btb/btb_tage.cc @@ -155,6 +155,9 @@ tageStats(this, p.numPredictors, p.numBanks) tableTagBits.resize(numPredictors); tableTagMasks.resize(numPredictors); + threadHistory.resize(MaxThreads); + threadMeta.resize(MaxThreads); + for (unsigned int i = 0; i < numPredictors; ++i) { //initialize ittage predictor assert(tableSizes.size() >= numPredictors); @@ -173,17 +176,14 @@ tageStats(this, p.numPredictors, p.numBanks) tableTagMasks[i].resize(tableTagBits[i], true); assert(tablePcShifts.size() >= numPredictors); - const auto historyType = usePathHistory ? HistoryType::PATH : HistoryType::GLOBAL; - tagFoldedHist.emplace_back((int)histLengths[i], (int)tableTagBits[i], - 16, historyType); - altTagFoldedHist.emplace_back((int)histLengths[i], - (int)tableTagBits[i] - 1, 16, - historyType); - indexFoldedHist.emplace_back((int)histLengths[i], - (int)tableIndexBits[i], 16, - historyType); + for (ThreadID tid = 0; tid < MaxThreads; ++tid) { + auto &state = threadHistory[tid]; + state.tagFoldedHist.emplace_back((int)histLengths[i], (int)tableTagBits[i], 16, historyType); + state.altTagFoldedHist.emplace_back((int)histLengths[i], (int)tableTagBits[i] - 1, 16, historyType); + state.indexFoldedHist.emplace_back((int)histLengths[i], (int)tableIndexBits[i], 16, historyType); + } } usefulResetCnt = 0; @@ -202,6 +202,27 @@ BTBTAGE::~BTBTAGE() { } +ThreadID +BTBTAGE::predictorTid(const std::vector &stagePreds) const +{ + assert(!stagePreds.empty()); + return stagePreds.front().tid; +} + +BTBTAGE::ThreadHistoryState & +BTBTAGE::historyState(ThreadID tid) +{ + assert(tid < threadHistory.size()); + return threadHistory[tid]; +} + +const 
BTBTAGE::ThreadHistoryState & +BTBTAGE::historyState(ThreadID tid) const +{ + assert(tid < threadHistory.size()); + return threadHistory[tid]; +} + // Set up tracing for debugging void BTBTAGE::setTrace() @@ -275,8 +296,10 @@ BTBTAGE::tickStart() {} BTBTAGE::TagePrediction BTBTAGE::generateSinglePrediction(const BTBEntry &btb_entry, const Addr &startPC, - std::shared_ptr predMeta) { + std::shared_ptr predMeta, + ThreadID tid) { DPRINTF(TAGE, "generateSinglePrediction for btbEntry: %#lx\n", btb_entry.pc); + const auto &state = historyState(tid); // Find main and alternative predictions bool provided = false; @@ -292,10 +315,11 @@ BTBTAGE::generateSinglePrediction(const BTBEntry &btb_entry, // Calculate index and tag: use snapshot if provided, otherwise use current folded history // Tag includes position XOR (like RTL: tag = tempTag ^ cfiPosition) Addr index = predMeta ? getTageIndex(startPC, i, predMeta->indexFoldedHist[i].get()) - : getTageIndex(startPC, i); + : getTageIndex(startPC, i, state.indexFoldedHist[i].get()); Addr tag = predMeta ? 
getTageTag(startPC, i, predMeta->tagFoldedHist[i].get(), predMeta->altTagFoldedHist[i].get(), position) - : getTageTag(startPC, i, position); + : getTageTag(startPC, i, state.tagFoldedHist[i].get(), + state.altTagFoldedHist[i].get(), position); bool match = false; // for each table, only one way can be matched TageEntry matching_entry; @@ -391,7 +415,8 @@ BTBTAGE::generateSinglePrediction(const BTBEntry &btb_entry, */ void BTBTAGE::lookupHelper(const Addr &startPC, const std::vector &btbEntries, - std::unordered_map &tageInfoForMgscs, CondTakens& results) + std::unordered_map &tageInfoForMgscs, + CondTakens& results, ThreadID tid) { DPRINTF(TAGE, "lookupHelper startAddr: %#lx\n", startPC); @@ -399,8 +424,8 @@ BTBTAGE::lookupHelper(const Addr &startPC, const std::vector &btbEntri for (auto &btb_entry : btbEntries) { // Only predict for valid conditional branches if (btb_entry.isCond && btb_entry.valid) { - auto pred = generateSinglePrediction(btb_entry, startPC); - meta->preds[btb_entry.pc] = pred; + auto pred = generateSinglePrediction(btb_entry, startPC, nullptr, tid); + threadMeta[tid]->preds[btb_entry.pc] = pred; tageStats.updateStatsWithTagePrediction(pred, true); results.push_back({btb_entry.pc, pred.taken || btb_entry.alwaysTaken}); tageInfoForMgscs[btb_entry.pc].tage_pred_taken = pred.taken; @@ -442,6 +467,8 @@ BTBTAGE::dryRunCycle(Addr startPC) { */ void BTBTAGE::putPCHistory(Addr startPC, const bitset &history, std::vector &stagePreds) { + const ThreadID tid = predictorTid(stagePreds); + const auto &state = historyState(tid); // Record prediction bank for next tick's conflict detection lastPredBankId = getBankId(startPC); predBankValid = true; @@ -459,24 +486,28 @@ BTBTAGE::putPCHistory(Addr startPC, const bitset &history, std::vector(); - meta->tagFoldedHist = tagFoldedHist; - meta->altTagFoldedHist = altTagFoldedHist; - meta->indexFoldedHist = indexFoldedHist; - meta->history = history; + threadMeta[tid] = std::make_shared(); + 
threadMeta[tid]->tagFoldedHist = state.tagFoldedHist; + threadMeta[tid]->altTagFoldedHist = state.altTagFoldedHist; + threadMeta[tid]->indexFoldedHist = state.indexFoldedHist; + threadMeta[tid]->history = history; for (int s = getDelay(); s < stagePreds.size(); s++) { // TODO: only lookup once for one btb entry in different stages auto &stage_pred = stagePreds[s]; stage_pred.condTakens.clear(); - lookupHelper(startPC, stage_pred.btbEntries, stage_pred.tageInfoForMgscs, stage_pred.condTakens); + lookupHelper(startPC, stage_pred.btbEntries, stage_pred.tageInfoForMgscs, + stage_pred.condTakens, tid); } } std::shared_ptr -BTBTAGE::getPredictionMeta() { - return meta; +BTBTAGE::getPredictionMeta(ThreadID tid) { + if (tid >= threadMeta.size()) { + return nullptr; + } + return threadMeta[tid]; } /** @@ -1042,7 +1073,9 @@ BTBTAGE::getTageTag(Addr pc, int t, uint64_t foldedHist, uint64_t altFoldedHist, Addr BTBTAGE::getTageTag(Addr pc, int t, Addr position) { - return getTageTag(pc, t, tagFoldedHist[t].get(), altTagFoldedHist[t].get(), position); + const auto &state = historyState(0); + return getTageTag(pc, t, state.tagFoldedHist[t].get(), + state.altTagFoldedHist[t].get(), position); } Addr @@ -1062,7 +1095,7 @@ BTBTAGE::getTageIndex(Addr pc, int t, uint64_t foldedHist) Addr BTBTAGE::getTageIndex(Addr pc, int t) { - return getTageIndex(pc, t, indexFoldedHist[t].get()); + return getTageIndex(pc, t, historyState(0).indexFoldedHist[t].get()); } bool @@ -1125,8 +1158,9 @@ BTBTAGE::getBankId(Addr pc) const */ void BTBTAGE::doUpdateHist(const boost::dynamic_bitset<> &history, int shamt, - bool taken, Addr pc, Addr target) + bool taken, Addr pc, Addr target, ThreadID tid) { + auto &state = historyState(tid); if (debug::TAGEHistory) { // if debug flag is off, do not use to_string since it's too slow std::string buf; boost::to_string(history, buf); @@ -1149,7 +1183,10 @@ BTBTAGE::doUpdateHist(const boost::dynamic_bitset<> &history, int shamt, for (int t = 0; t < numPredictors; 
t++) { for (int type = 0; type < 3; type++) { - auto &foldedHist = type == 0 ? indexFoldedHist[t] : type == 1 ? tagFoldedHist[t] : altTagFoldedHist[t]; + auto &foldedHist = type == 0 ? state.indexFoldedHist[t] + : type == 1 ? state.tagFoldedHist[t] + : state.altTagFoldedHist[t]; + // since we have folded path history, we can put arbitrary shamt here, and it wouldn't make a difference foldedHist.update(history, shamt, taken, pc, target); DPRINTF(TAGEHistory, "t: %d, type: %d, foldedHist _folded 0x%lx\n", t, type, foldedHist.get()); } @@ -1177,7 +1214,7 @@ BTBTAGE::specUpdateHist(const boost::dynamic_bitset<> &history, } auto [shamt, taken] = pred.getHistInfo(); - doUpdateHist(history, shamt, taken, 0, 0); + doUpdateHist(history, shamt, taken, 0, 0, pred.tid); } void @@ -1188,7 +1225,7 @@ BTBTAGE::specUpdatePHist(const boost::dynamic_bitset<> &history, FullBTBPredicti } auto [pc, target, taken] = pred.getPHistInfo(); - doUpdateHist(history, 2, taken, pc, target); + doUpdateHist(history, 2, taken, pc, target, pred.tid); } void @@ -1197,9 +1234,9 @@ BTBTAGE::recoverFoldedHist(const FetchTarget &entry) auto predMeta = std::static_pointer_cast(entry.predMetas[getComponentIdx()]); for (int i = 0; i < numPredictors; i++) { - tagFoldedHist[i].recover(predMeta->tagFoldedHist[i]); - altTagFoldedHist[i].recover(predMeta->altTagFoldedHist[i]); - indexFoldedHist[i].recover(predMeta->indexFoldedHist[i]); + threadHistory[entry.tid].tagFoldedHist[i].recover(predMeta->tagFoldedHist[i]); + threadHistory[entry.tid].altTagFoldedHist[i].recover(predMeta->altTagFoldedHist[i]); + threadHistory[entry.tid].indexFoldedHist[i].recover(predMeta->indexFoldedHist[i]); } } @@ -1225,7 +1262,7 @@ BTBTAGE::recoverHist(const boost::dynamic_bitset<> &history, } recoverFoldedHist(entry); - doUpdateHist(history, shamt, cond_taken, 0, 0); + doUpdateHist(history, shamt, cond_taken, 0, 0, entry.tid); } void @@ -1238,13 +1275,21 @@ BTBTAGE::recoverPHist(const boost::dynamic_bitset<> &history, 
recoverFoldedHist(entry); doUpdateHist(history, 2, cond_taken, entry.getControlPC(), - entry.getTakenTarget()); + entry.getTakenTarget(), entry.tid); } // Check folded history after speculative update and recovery void BTBTAGE::checkFoldedHist(const boost::dynamic_bitset<> &hist, const char * when) { + checkFoldedHist(hist, 0, when); +} + +void +BTBTAGE::checkFoldedHist(const boost::dynamic_bitset<> &hist, ThreadID tid, + const char * when) +{ + auto &state = historyState(tid); DPRINTF(TAGE, "checking folded history when %s\n", when); if (debug::TAGEHistory) { std::string hist_str; @@ -1254,7 +1299,9 @@ BTBTAGE::checkFoldedHist(const boost::dynamic_bitset<> &hist, const char * when) for (int t = 0; t < numPredictors; t++) { for (int type = 0; type < 3; type++) { std::string buf2, buf3; - auto &foldedHist = type == 0 ? indexFoldedHist[t] : type == 1 ? tagFoldedHist[t] : altTagFoldedHist[t]; + auto &foldedHist = type == 0 ? state.indexFoldedHist[t] + : type == 1 ? state.tagFoldedHist[t] + : state.altTagFoldedHist[t]; foldedHist.check(hist); } } diff --git a/src/cpu/pred/btb/btb_tage.hh b/src/cpu/pred/btb/btb_tage.hh index abd4bf0f49..33bd6826ae 100644 --- a/src/cpu/pred/btb/btb_tage.hh +++ b/src/cpu/pred/btb/btb_tage.hh @@ -4,12 +4,14 @@ #include #include #include +#include #include #include #include "base/sat_counter.hh" #include "base/types.hh" #include "cpu/inst_seq.hh" +#include "cpu/o3/limits.hh" #include "cpu/pred/btb/common.hh" #include "cpu/pred/btb/folded_hist.hh" #include "cpu/pred/btb/timed_base_pred.hh" @@ -43,6 +45,7 @@ class BTBTAGE : public TimedBaseBTBPredictor { using defer = std::shared_ptr; using bitset = boost::dynamic_bitset<>; + static constexpr unsigned MaxThreads = o3::MaxThreads; public: #ifdef UNIT_TEST // Test constructor @@ -140,7 +143,7 @@ class BTBTAGE : public TimedBaseBTBPredictor const boost::dynamic_bitset<> &history, std::vector &stagePreds) override; - std::shared_ptr getPredictionMeta() override; + std::shared_ptr 
getPredictionMeta(ThreadID tid = 0) override; // Update folded history from GHR when configured in direction-history mode. void specUpdateHist(const boost::dynamic_bitset<> &history, @@ -167,6 +170,7 @@ class BTBTAGE : public TimedBaseBTBPredictor // check folded hists after speculative update and recover virtual void checkFoldedHist(const bitset &history, const char *when); + void checkFoldedHist(const bitset &history, ThreadID tid, const char *when); #ifndef UNIT_TEST protected: @@ -174,7 +178,8 @@ class BTBTAGE : public TimedBaseBTBPredictor // Look up predictions in TAGE tables for a stream of instructions void lookupHelper(const Addr &startPC, const std::vector &btbEntries, - std::unordered_map &tageInfoForMgscs, CondTakens& results); + std::unordered_map &tageInfoForMgscs, + CondTakens& results, ThreadID tid); // Calculate TAGE index for a given PC and table Addr getTageIndex(Addr pc, int table); @@ -204,7 +209,7 @@ class BTBTAGE : public TimedBaseBTBPredictor // Update branch history void doUpdateHist(const bitset &history, int shamt, bool taken, - Addr pc, Addr target); + Addr pc, Addr target, ThreadID tid); void recoverFoldedHist(const FetchTarget &entry); // Number of TAGE predictor tables @@ -231,17 +236,16 @@ class BTBTAGE : public TimedBaseBTBPredictor // History lengths for each table std::vector histLengths; - // Folded history for tag calculation - std::vector tagFoldedHist; - - // Folded history for alternative tag calculation - std::vector altTagFoldedHist; + const bool usePathHistory; - // Folded history for index calculation - std::vector indexFoldedHist; + struct ThreadHistoryState + { + std::vector tagFoldedHist; + std::vector altTagFoldedHist; + std::vector indexFoldedHist; + }; - // Select whether BTBTAGE consumes PHR or GHR folded history. 
- const bool usePathHistory; + std::vector threadHistory; // Linear feedback shift register for allocation LFSR64 allocLFSR; @@ -461,7 +465,8 @@ private: // If predMeta is nullptr, use current folded history (prediction path) TagePrediction generateSinglePrediction(const BTBEntry &btb_entry, const Addr &startPC, - const std::shared_ptr predMeta = nullptr); + const std::shared_ptr predMeta = nullptr, + ThreadID tid = 0); // Helper method to prepare BTB entries for update std::vector prepareUpdateEntries(const FetchTarget &stream); @@ -486,7 +491,11 @@ private: unsigned getLRUVictim(int table, Addr index); unsigned getNumWays(unsigned table) const; - std::shared_ptr meta; + std::vector> threadMeta; + + ThreadID predictorTid(const std::vector &stagePreds) const; + ThreadHistoryState &historyState(ThreadID tid); + const ThreadHistoryState &historyState(ThreadID tid) const; }; // Close conditional namespace wrapper for testing diff --git a/src/cpu/pred/btb/btb_tage_ub.cc b/src/cpu/pred/btb/btb_tage_ub.cc index e1a1cf698a..d324bd962b 100644 --- a/src/cpu/pred/btb/btb_tage_ub.cc +++ b/src/cpu/pred/btb/btb_tage_ub.cc @@ -322,8 +322,9 @@ BTBTAGEUpperBound::putPCHistory(Addr startAddr, const bitset &history, } std::shared_ptr -BTBTAGEUpperBound::getPredictionMeta() +BTBTAGEUpperBound::getPredictionMeta(ThreadID tid) { + (void)tid; return ubMeta; } diff --git a/src/cpu/pred/btb/btb_tage_ub.hh b/src/cpu/pred/btb/btb_tage_ub.hh index f97792c713..b4aae9e7cc 100644 --- a/src/cpu/pred/btb/btb_tage_ub.hh +++ b/src/cpu/pred/btb/btb_tage_ub.hh @@ -95,7 +95,7 @@ class BTBTAGEUpperBound : public BTBTAGE const boost::dynamic_bitset<> &history, std::vector &stagePreds) override; - std::shared_ptr getPredictionMeta() override; + std::shared_ptr getPredictionMeta(ThreadID tid = 0) override; void specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred) override; diff --git a/src/cpu/pred/btb/btb_ubtb.hh b/src/cpu/pred/btb/btb_ubtb.hh index 5c394ac9cc..649641b420 
100644 --- a/src/cpu/pred/btb/btb_ubtb.hh +++ b/src/cpu/pred/btb/btb_ubtb.hh @@ -141,7 +141,7 @@ class UBTB : public TimedBaseBTBPredictor /** Get prediction BTBMeta * @return Returns the prediction meta */ - std::shared_ptr getPredictionMeta() override + std::shared_ptr getPredictionMeta(ThreadID tid = 0) override { return meta; } diff --git a/src/cpu/pred/btb/decoupled_bpred.cc b/src/cpu/pred/btb/decoupled_bpred.cc index 8ed265af90..2e272047eb 100644 --- a/src/cpu/pred/btb/decoupled_bpred.cc +++ b/src/cpu/pred/btb/decoupled_bpred.cc @@ -47,7 +47,6 @@ DecoupledBPUWithBTB::DecoupledBPUWithBTB(const DecoupledBPUWithBTBParams &p) bpDBSwitches(p.bpDBSwitches), numStages(p.numStages), ftq(p.numThreads, p.ftq_size), - historyManager(16), // TODO: fix this resolveBlockThreshold(p.resolveBlockThreshold), dbpBtbStats(this, p.numStages, p.fsq_size, maxInstsNum) { @@ -87,6 +86,12 @@ DecoupledBPUWithBTB::DecoupledBPUWithBTB(const DecoupledBPUWithBTBParams &p) printf("\n"); } + historyManagers.reserve(numThreads); + resolveDequeueFailCounters.assign(numThreads, 0); + for (ThreadID tid = 0; tid < numThreads; ++tid) { + historyManagers.emplace_back(16); + } + for (int tid=0;tid= resolveBlockThreshold) { - blockPredictionOnce(); - resolveDequeueFailCounter = 0; + auto &failCounter = resolveDequeueFailCounters[tid]; + failCounter++; + if (failCounter >= resolveBlockThreshold) { + blockPredictionOnce(tid); + failCounter = 0; } } void -DecoupledBPUWithBTB::blockPredictionOnce() +DecoupledBPUWithBTB::blockPredictionOnce(ThreadID tid) { - // smtTODO - threads[0].blockPredictionPending = true; + threads[tid].blockPredictionPending = true; } void @@ -808,7 +813,7 @@ DecoupledBPUWithBTB::createFetchTargetEntry(ThreadID tid) // Save predictors' metadata for (int i = 0; i < numComponents; i++) { - entry.predMetas[i] = components[i]->getPredictionMeta(); + entry.predMetas[i] = components[i]->getPredictionMeta(tid); } // Initialize default resolution state @@ -843,7 +848,8 @@ 
DecoupledBPUWithBTB::fillAheadPipeline(FetchTarget &entry) } void -DecoupledBPUWithBTB::checkHistory(const boost::dynamic_bitset<> &history) +DecoupledBPUWithBTB::checkHistory(const boost::dynamic_bitset<> &history, + ThreadID tid) { // This function performs a crucial validation of branch history consistency // It rebuilds the "ideal" history from HistoryManager's records and compares @@ -854,7 +860,7 @@ DecoupledBPUWithBTB::checkHistory(const boost::dynamic_bitset<> &history) boost::dynamic_bitset<> ideal_hash_hist(historyBits, 0); // Iterate through all speculative history entries stored in HistoryManager - for (const auto entry: historyManager.getSpeculativeHist()) { + for (const auto entry: historyManagers[tid].getSpeculativeHist()) { // Only process entries that have non-zero shift amount (actual branches) if (entry.shamt != 0) { // Accumulate total history bits @@ -897,6 +903,12 @@ DecoupledBPUWithBTB::resetPC(Addr new_pc) threads[i].s0PC = new_pc; } +void +DecoupledBPUWithBTB::resetPC(ThreadID tid, Addr new_pc) +{ + threads[tid].s0PC = new_pc; +} + Addr DecoupledBPUWithBTB::getPreservedReturnAddr(const DynInstPtr &dynInst) { @@ -944,7 +956,7 @@ DecoupledBPUWithBTB::updateHistoryForPrediction(FetchTarget &entry) histShiftIn(shamt, taken, s0History); // Update history manager and verify TAGE folded history - historyManager.addSpeculativeHist( + historyManagers[tid].addSpeculativeHist( entry.startPC, shamt, taken, entry.predBranchInfo, ftq.backId(tid) + 1); // Get prediction information for global backward history updates @@ -968,17 +980,18 @@ DecoupledBPUWithBTB::updateHistoryForPrediction(FetchTarget &entry) #ifndef NDEBUG if (tage->isEnabled()) { tage->checkFoldedHist( - tage->usesPathHistory() ? s0PHistory : s0History, + tage->usesPathHistory() ? 
s0PHistory : s0History, tid, "speculative update"); } if (ittage->isEnabled()) { - ittage->checkFoldedHist(s0PHistory, "speculative update"); + ittage->checkFoldedHist(s0PHistory, tid, "speculative update"); } if (microtage->isEnabled()) { - microtage->checkFoldedHist(s0PHistory, "speculative update"); + microtage->checkFoldedHist(s0PHistory, tid, "speculative update"); } if (mgsc->isEnabled()) { - mgsc->checkFoldedHist(s0History, s0PHistory, s0LHistory, "speculative update"); + mgsc->checkFoldedHist(s0History, s0PHistory, s0LHistory, tid, + "speculative update"); } #endif } @@ -1054,32 +1067,34 @@ DecoupledBPUWithBTB::recoverHistoryForSquash( // Update history manager with appropriate branch info if (squash_type == SQUASH_CTRL) { - historyManager.squash(target_id, real_shamt, real_taken, target.exeBranchInfo); + historyManagers[tid].squash(target_id, real_shamt, real_taken, + target.exeBranchInfo); } else { - historyManager.squash(target_id, real_shamt, real_taken, BranchInfo()); + historyManagers[tid].squash(target_id, real_shamt, real_taken, + BranchInfo()); } // Perform history consistency checks when not a fast build variant #ifndef NDEBUG - checkHistory(s0History); + checkHistory(s0History, tid); if (tage->isEnabled()) { tage->checkFoldedHist( - tage->usesPathHistory() ? s0PHistory : s0History, + tage->usesPathHistory() ? s0PHistory : s0History, tid, squash_type == SQUASH_CTRL ? "control squash" : squash_type == SQUASH_OTHER ? "non control squash" : "trap squash"); } if (ittage->isEnabled()) { - ittage->checkFoldedHist(s0PHistory, + ittage->checkFoldedHist(s0PHistory, tid, squash_type == SQUASH_CTRL ? "control squash" : squash_type == SQUASH_OTHER ? "non control squash" : "trap squash"); } if (microtage->isEnabled()) { - microtage->checkFoldedHist(s0PHistory, + microtage->checkFoldedHist(s0PHistory, tid, squash_type == SQUASH_CTRL ? "control squash" : squash_type == SQUASH_OTHER ? 
"non control squash" : "trap squash"); } if (mgsc->isEnabled()) { - mgsc->checkFoldedHist(s0History, s0PHistory, s0LHistory, + mgsc->checkFoldedHist(s0History, s0PHistory, s0LHistory, tid, squash_type == SQUASH_CTRL ? "control squash" : squash_type == SQUASH_OTHER ? "non control squash" : "trap squash"); } diff --git a/src/cpu/pred/btb/decoupled_bpred.hh b/src/cpu/pred/btb/decoupled_bpred.hh index 134258f77c..38adad4115 100644 --- a/src/cpu/pred/btb/decoupled_bpred.hh +++ b/src/cpu/pred/btb/decoupled_bpred.hh @@ -140,8 +140,8 @@ class DecoupledBPUWithBTB : public BPredUnit bool blockPredictionPending{false}; } threads[MaxThreads]; - HistoryManager historyManager; - unsigned resolveDequeueFailCounter{0}; + std::vector historyManagers; + std::vector resolveDequeueFailCounters; const unsigned resolveBlockThreshold; ThreadID scheduleThread(); @@ -424,7 +424,7 @@ class DecoupledBPUWithBTB : public BPredUnit void overrideStats(OverrideReason overrideReason); - void checkHistory(const boost::dynamic_bitset<> &history); + void checkHistory(const boost::dynamic_bitset<> &history, ThreadID tid); Addr getPreservedReturnAddr(const DynInstPtr &dynInst); @@ -703,6 +703,7 @@ class DecoupledBPUWithBTB : public BPredUnit unsigned control_inst_size = 0); void resetPC(Addr new_pc); + void resetPC(ThreadID tid, Addr new_pc); // Helper functions for update bool resolveUpdate(unsigned &target_id, ThreadID tid); @@ -710,9 +711,9 @@ class DecoupledBPUWithBTB : public BPredUnit void markCFIResolved(unsigned &target, uint64_t resolvedInstPC, ThreadID tid); void updatePredictorComponents(FetchTarget &target); void updateStatistics(const FetchTarget &target); - void notifyResolveSuccess(); - void notifyResolveFailure(); - void blockPredictionOnce(); + void notifyResolveSuccess(ThreadID tid); + void notifyResolveFailure(ThreadID tid); + void blockPredictionOnce(ThreadID tid); /** * @brief Types of control flow instructions for misprediction tracking diff --git a/src/cpu/pred/btb/mbtb.cc 
b/src/cpu/pred/btb/mbtb.cc index 4ab8445677..abd2923739 100644 --- a/src/cpu/pred/btb/mbtb.cc +++ b/src/cpu/pred/btb/mbtb.cc @@ -313,8 +313,9 @@ MBTB::putPCHistory(Addr startAddr, } std::shared_ptr -MBTB::getPredictionMeta() +MBTB::getPredictionMeta(ThreadID tid) { + (void)tid; return meta; } diff --git a/src/cpu/pred/btb/mbtb.hh b/src/cpu/pred/btb/mbtb.hh index d736d0f55c..b4f587a141 100644 --- a/src/cpu/pred/btb/mbtb.hh +++ b/src/cpu/pred/btb/mbtb.hh @@ -147,7 +147,7 @@ class MBTB : public TimedBaseBTBPredictor /** Get prediction BTBMeta * @return Returns the prediction meta */ - std::shared_ptr getPredictionMeta() override; + std::shared_ptr getPredictionMeta(ThreadID tid = 0) override; // not used void specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred) override; diff --git a/src/cpu/pred/btb/microtage.cc b/src/cpu/pred/btb/microtage.cc index fb8dd2f139..7fd88b0845 100644 --- a/src/cpu/pred/btb/microtage.cc +++ b/src/cpu/pred/btb/microtage.cc @@ -95,6 +95,9 @@ tageStats(this, p.numPredictors, p.numBanks) } // Initialize base table for fallback predictions + threadHistory.resize(MaxThreads); + threadMeta.resize(MaxThreads); + for (unsigned int i = 0; i < numPredictors; ++i) { //initialize ittage predictor assert(tableSizes.size() >= numPredictors); @@ -111,9 +114,15 @@ tageStats(this, p.numPredictors, p.numBanks) assert(tablePcShifts.size() >= numPredictors); - tagFoldedHist.push_back(PathFoldedHist((int)histLengths[i], (int)tableTagBits[i], 16)); - altTagFoldedHist.push_back(PathFoldedHist((int)histLengths[i], (int)tableTagBits[i]-1, 16)); - indexFoldedHist.push_back(PathFoldedHist((int)histLengths[i], (int)tableIndexBits[i], 16)); + for (ThreadID tid = 0; tid < MaxThreads; ++tid) { + auto &state = threadHistory[tid]; + state.tagFoldedHist.emplace_back( + (int)histLengths[i], (int)tableTagBits[i], 16); + state.altTagFoldedHist.emplace_back( + (int)histLengths[i], (int)tableTagBits[i] - 1, 16); + state.indexFoldedHist.emplace_back( + 
(int)histLengths[i], (int)tableIndexBits[i], 16); + } } usefulResetCnt = 0; @@ -127,6 +136,27 @@ MicroTAGE::~MicroTAGE() { } +ThreadID +MicroTAGE::predictorTid(const std::vector &stagePreds) const +{ + assert(!stagePreds.empty()); + return stagePreds.front().tid; +} + +MicroTAGE::ThreadHistoryState & +MicroTAGE::historyState(ThreadID tid) +{ + assert(tid < threadHistory.size()); + return threadHistory[tid]; +} + +const MicroTAGE::ThreadHistoryState & +MicroTAGE::historyState(ThreadID tid) const +{ + assert(tid < threadHistory.size()); + return threadHistory[tid]; +} + // Set up tracing for debugging void MicroTAGE::setTrace() @@ -181,8 +211,10 @@ MicroTAGE::tickStart() {} MicroTAGE::TagePrediction MicroTAGE::generateSinglePrediction(const BTBEntry &btb_entry, const Addr &startPC, - std::shared_ptr predMeta) { + std::shared_ptr predMeta, + ThreadID tid) { DPRINTF(UTAGE, "generateSinglePrediction for btbEntry: %#lx\n", btb_entry.pc); + const auto &state = historyState(tid); bool provided = false; TageTableInfo main_info; @@ -196,10 +228,11 @@ MicroTAGE::generateSinglePrediction(const BTBEntry &btb_entry, // Tag includes position XOR (like RTL: tag = tempTag ^ cfiPosition) Addr index = predMeta ? getTageIndex(startPC, i, predMeta->indexFoldedHist[i].get()) - : getTageIndex(startPC, i); + : getTageIndex(startPC, i, state.indexFoldedHist[i].get()); Addr tag = predMeta ? 
getTageTag(startPC, i, predMeta->tagFoldedHist[i].get(),predMeta->altTagFoldedHist[i].get(), position) - : getTageTag(startPC, i, tagFoldedHist[i].get(),altTagFoldedHist[i].get(), position); + : getTageTag(startPC, i, state.tagFoldedHist[i].get(), + state.altTagFoldedHist[i].get(), position); bool match = false; // for each table, only one way can be matched TageEntry matching_entry; @@ -255,7 +288,8 @@ MicroTAGE::generateSinglePrediction(const BTBEntry &btb_entry, * @return Map of branch PC addresses to their predicted outcomes */ void -MicroTAGE::lookupHelper(const Addr &startPC, const std::vector &btbEntries, CondTakens& results) +MicroTAGE::lookupHelper(const Addr &startPC, const std::vector &btbEntries, + CondTakens& results, ThreadID tid) { DPRINTF(UTAGE, "lookupHelper startAddr: %#lx\n", startPC); @@ -263,8 +297,9 @@ MicroTAGE::lookupHelper(const Addr &startPC, const std::vector &btbEnt for (auto &btb_entry : btbEntries) { // Only predict for valid conditional branches if (btb_entry.isCond && btb_entry.valid) { - auto pred = generateSinglePrediction(btb_entry, startPC); - meta->preds[btb_entry.pc] = pred; + auto pred = generateSinglePrediction(btb_entry, startPC, nullptr, + tid); + threadMeta[tid]->preds[btb_entry.pc] = pred; tageStats.updateStatsWithTagePrediction(pred, true); results.push_back({btb_entry.pc, pred.taken || btb_entry.alwaysTaken}); } @@ -295,6 +330,8 @@ MicroTAGE::dryRunCycle(Addr startPC) { */ void MicroTAGE::putPCHistory(Addr startPC, const bitset &history, std::vector &stagePreds) { + const ThreadID tid = predictorTid(stagePreds); + const auto &state = historyState(tid); // Record prediction bank for next tick's conflict detection lastPredBankId = getBankId(startPC); predBankValid = true; @@ -312,30 +349,36 @@ MicroTAGE::putPCHistory(Addr startPC, const bitset &history, std::vector(); - meta->tagFoldedHist = tagFoldedHist; - meta->altTagFoldedHist = altTagFoldedHist; - meta->indexFoldedHist = indexFoldedHist; - 
meta->aheadIndexFoldedHistValid = !aheadindexFoldedHist.empty(); - if (meta->aheadIndexFoldedHistValid) { - meta->aheadIndexFoldedHist = aheadindexFoldedHist.front(); + threadMeta[tid] = std::make_shared(); + threadMeta[tid]->tagFoldedHist = state.tagFoldedHist; + threadMeta[tid]->altTagFoldedHist = state.altTagFoldedHist; + threadMeta[tid]->indexFoldedHist = state.indexFoldedHist; + threadMeta[tid]->aheadIndexFoldedHistValid = + !state.aheadIndexFoldedHist.empty(); + if (threadMeta[tid]->aheadIndexFoldedHistValid) { + threadMeta[tid]->aheadIndexFoldedHist = + state.aheadIndexFoldedHist.front(); } else { - meta->aheadIndexFoldedHist.clear(); + threadMeta[tid]->aheadIndexFoldedHist.clear(); } - meta->history = history; + threadMeta[tid]->history = history; for (int s = getDelay(); s < stagePreds.size(); s++) { // TODO: only lookup once for one btb entry in different stages auto &stage_pred = stagePreds[s]; stage_pred.condTakens.clear(); - lookupHelper(startPC, stage_pred.btbEntries, stage_pred.condTakens); + lookupHelper(startPC, stage_pred.btbEntries, stage_pred.condTakens, + tid); } } std::shared_ptr -MicroTAGE::getPredictionMeta() { - return meta; +MicroTAGE::getPredictionMeta(ThreadID tid) { + if (tid >= threadMeta.size()) { + return nullptr; + } + return threadMeta[tid]; } /** @@ -783,7 +826,7 @@ MicroTAGE::getTageIndex(Addr pc, int t, uint64_t foldedHist) Addr MicroTAGE::getTageIndex(Addr pc, int t) { - return getTageIndex(pc, t, indexFoldedHist[t].get()); + return getTageIndex(pc, t, historyState(0).indexFoldedHist[t].get()); } bool @@ -849,23 +892,26 @@ MicroTAGE::getBankId(Addr pc) const * @param taken Whether the branch was taken */ void -MicroTAGE::doUpdateHist(const boost::dynamic_bitset<> &history, bool taken, Addr pc, Addr target) +MicroTAGE::doUpdateHist(const boost::dynamic_bitset<> &history, bool taken, + Addr pc, Addr target, ThreadID tid) { + auto &state = historyState(tid); if (debug::TAGEHistory) { // if debug flag is off, do not use to_string 
since it's too slow std::string buf; boost::to_string(history, buf); DPRINTF(TAGEHistory, "in doUpdateHist, taken %d, pc %#lx, history %s\n", taken, pc, buf.c_str()); } - if (!aheadindexFoldedHist.empty()) { - indexFoldedHist = aheadindexFoldedHist.front(); + if (!state.aheadIndexFoldedHist.empty()) { + state.indexFoldedHist = state.aheadIndexFoldedHist.front(); } if (!taken) { - if (debug::TAGEHistory && !aheadindexFoldedHist.empty()) { + if (debug::TAGEHistory && !state.aheadIndexFoldedHist.empty()) { bool mismatch = false; for (int t = 0; t < numPredictors; t++) { - if (indexFoldedHist[t].get() != aheadindexFoldedHist.front()[t].get()) { + if (state.indexFoldedHist[t].get() != + state.aheadIndexFoldedHist.front()[t].get()) { mismatch = true; break; } @@ -881,22 +927,23 @@ MicroTAGE::doUpdateHist(const boost::dynamic_bitset<> &history, bool taken, Addr for (int t = 0; t < numPredictors; t++) { // Update tag folded history immediately so tag calculation always sees current history. - tagFoldedHist[t].update(history, 2, taken, pc, target); - altTagFoldedHist[t].update(history, 2, taken, pc, target); + state.tagFoldedHist[t].update(history, 2, taken, pc, target); + state.altTagFoldedHist[t].update(history, 2, taken, pc, target); DPRINTF(TAGEHistory, "t: %d, tag 0x%lx, altTag 0x%lx\n", - t, tagFoldedHist[t].get(), altTagFoldedHist[t].get()); + t, state.tagFoldedHist[t].get(), + state.altTagFoldedHist[t].get()); } // Prepare next-cycle index folded history and delay its visibility by one cycle. 
- auto nextIndexFoldedHist = indexFoldedHist; + auto nextIndexFoldedHist = state.indexFoldedHist; for (int t = 0; t < numPredictors; t++) { nextIndexFoldedHist[t].update(history, 2, taken, pc, target); DPRINTF(TAGEHistory, "t: %d, index foldedHist(next) _folded 0x%lx\n", t, nextIndexFoldedHist[t].get()); } - aheadindexFoldedHist.push(nextIndexFoldedHist); - if (aheadindexFoldedHist.size() > 1) { - aheadindexFoldedHist.pop(); + state.aheadIndexFoldedHist.push(nextIndexFoldedHist); + if (state.aheadIndexFoldedHist.size() > 1) { + state.aheadIndexFoldedHist.pop(); } } @@ -916,7 +963,7 @@ void MicroTAGE::specUpdatePHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred) { auto [pc, target, taken] = pred.getPHistInfo(); - doUpdateHist(history, taken, pc, target); + doUpdateHist(history, taken, pc, target, pred.tid); } /** @@ -936,6 +983,7 @@ void MicroTAGE::recoverPHist(const boost::dynamic_bitset<> &history, const FetchTarget &entry, int shamt, bool cond_taken) { + auto &state = historyState(entry.tid); std::shared_ptr predMeta = std::static_pointer_cast(entry.predMetas[getComponentIdx()]); if (!predMeta) { DPRINTF(UTAGE, "recoverPHist: no prediction metadata, cannot recover\n"); @@ -943,21 +991,22 @@ MicroTAGE::recoverPHist(const boost::dynamic_bitset<> &history, } // Restore current folded index history exactly to prediction-time state. for (int i = 0; i < numPredictors; i++) { - indexFoldedHist[i].recover(predMeta->indexFoldedHist[i]); + state.indexFoldedHist[i].recover(predMeta->indexFoldedHist[i]); } // Restore delayed index folded history slot exactly to prediction-time state. 
- while (!aheadindexFoldedHist.empty()) { - aheadindexFoldedHist.pop(); + while (!state.aheadIndexFoldedHist.empty()) { + state.aheadIndexFoldedHist.pop(); } if (predMeta->aheadIndexFoldedHistValid) { assert(predMeta->aheadIndexFoldedHist.size() == numPredictors); - aheadindexFoldedHist.push(predMeta->aheadIndexFoldedHist); + state.aheadIndexFoldedHist.push(predMeta->aheadIndexFoldedHist); } if (debug::TAGEHistory) { bool queue_valid_mismatch = - (predMeta->aheadIndexFoldedHistValid != !aheadindexFoldedHist.empty()); + (predMeta->aheadIndexFoldedHistValid != + !state.aheadIndexFoldedHist.empty()); if (queue_valid_mismatch) { DPRINTF(TAGEHistory, "recoverPHist: ahead queue valid mismatch after restore, cond_taken %d\n", @@ -966,16 +1015,25 @@ MicroTAGE::recoverPHist(const boost::dynamic_bitset<> &history, } for (int i = 0; i < numPredictors; i++) { - altTagFoldedHist[i].recover(predMeta->altTagFoldedHist[i]); - tagFoldedHist[i].recover(predMeta->tagFoldedHist[i]); + state.altTagFoldedHist[i].recover(predMeta->altTagFoldedHist[i]); + state.tagFoldedHist[i].recover(predMeta->tagFoldedHist[i]); } - doUpdateHist(history, cond_taken, entry.getControlPC(), entry.getTakenTarget()); + doUpdateHist(history, cond_taken, entry.getControlPC(), + entry.getTakenTarget(), entry.tid); } // Check folded history after speculative update and recovery void MicroTAGE::checkFoldedHist(const boost::dynamic_bitset<> &hist, const char * when) { + checkFoldedHist(hist, 0, when); +} + +void +MicroTAGE::checkFoldedHist(const boost::dynamic_bitset<> &hist, ThreadID tid, + const char * when) +{ + auto &state = historyState(tid); DPRINTF(UTAGE, "checking folded history when %s\n", when); if (debug::TAGEHistory) { std::string hist_str; @@ -987,13 +1045,13 @@ MicroTAGE::checkFoldedHist(const boost::dynamic_bitset<> &hist, const char * whe // aheadindexFoldedHist in doUpdateHist(). 
During consistency checks // right after speculative/recovery updates, compare against the staged // next-cycle value when available. - if (!aheadindexFoldedHist.empty()) { - aheadindexFoldedHist.front()[t].check(hist); + if (!state.aheadIndexFoldedHist.empty()) { + state.aheadIndexFoldedHist.front()[t].check(hist); } else { - indexFoldedHist[t].check(hist); + state.indexFoldedHist[t].check(hist); } - tagFoldedHist[t].check(hist); - altTagFoldedHist[t].check(hist); + state.tagFoldedHist[t].check(hist); + state.altTagFoldedHist[t].check(hist); } } diff --git a/src/cpu/pred/btb/microtage.hh b/src/cpu/pred/btb/microtage.hh index b99258face..3a5fcc518c 100644 --- a/src/cpu/pred/btb/microtage.hh +++ b/src/cpu/pred/btb/microtage.hh @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -11,6 +12,7 @@ #include "base/sat_counter.hh" #include "base/types.hh" #include "cpu/inst_seq.hh" +#include "cpu/o3/limits.hh" #include "cpu/pred/btb/folded_hist.hh" #include "cpu/pred/btb/timed_base_pred.hh" @@ -42,6 +44,7 @@ namespace test { class MicroTAGE : public TimedBaseBTBPredictor { using bitset = boost::dynamic_bitset<>; + static constexpr unsigned MaxThreads = o3::MaxThreads; public: #ifdef UNIT_TEST // Test constructor @@ -121,7 +124,7 @@ class MicroTAGE : public TimedBaseBTBPredictor const boost::dynamic_bitset<> &history, std::vector &stagePreds) override; - std::shared_ptr getPredictionMeta() override; + std::shared_ptr getPredictionMeta(ThreadID tid = 0) override; // speculative update 3 folded history, according history and pred.taken // the other specUpdateHist methods are left blank @@ -157,13 +160,15 @@ class MicroTAGE : public TimedBaseBTBPredictor // check folded hists after speculative update and recover void checkFoldedHist(const bitset &history, const char *when); + void checkFoldedHist(const bitset &history, ThreadID tid, const char *when); #ifndef UNIT_TEST private: #endif // Look up predictions in TAGE tables for a stream of 
instructions - void lookupHelper(const Addr &startPC, const std::vector &btbEntries, CondTakens& results); + void lookupHelper(const Addr &startPC, const std::vector &btbEntries, + CondTakens& results, ThreadID tid); // Calculate TAGE index for a given PC and table Addr getTageIndex(Addr pc, int table); @@ -183,7 +188,8 @@ class MicroTAGE : public TimedBaseBTBPredictor unsigned getBankId(Addr pc) const; // Update branch history - void doUpdateHist(const bitset &history, bool taken, Addr pc, Addr target); + void doUpdateHist(const bitset &history, bool taken, Addr pc, Addr target, + ThreadID tid); // Number of TAGE predictor tables const unsigned numPredictors; @@ -203,14 +209,15 @@ class MicroTAGE : public TimedBaseBTBPredictor // History lengths for each table std::vector histLengths; - // Folded history for tag calculation - std::vector tagFoldedHist; - - // Folded history for alternative tag calculation - std::vector altTagFoldedHist; + struct ThreadHistoryState + { + std::vector tagFoldedHist; + std::vector altTagFoldedHist; + std::vector indexFoldedHist; + std::queue> aheadIndexFoldedHist; + }; - // Folded history for index calculation - std::vector indexFoldedHist; + std::vector threadHistory; // Maximum history length, not used unsigned maxHistLen; @@ -257,8 +264,6 @@ class MicroTAGE : public TimedBaseBTBPredictor unsigned lastPredBankId; // Bank ID of last prediction bool predBankValid; // Whether lastPredBankId is valid - std::queue> aheadindexFoldedHist; - #ifdef UNIT_TEST typedef uint64_t Scalar; #else @@ -349,7 +354,8 @@ private: // If predMeta is nullptr, use current folded history (prediction path) TagePrediction generateSinglePrediction(const BTBEntry &btb_entry, const Addr &startPC, - const std::shared_ptr predMeta = nullptr); + const std::shared_ptr predMeta = nullptr, + ThreadID tid = 0); // Helper method to prepare BTB entries for update std::vector prepareUpdateEntries(const FetchTarget &stream); @@ -370,7 +376,10 @@ private: uint64_t 
&allocated_index, uint64_t &allocated_way); - std::shared_ptr meta; + std::vector> threadMeta; + ThreadID predictorTid(const std::vector &stagePreds) const; + ThreadHistoryState &historyState(ThreadID tid); + const ThreadHistoryState &historyState(ThreadID tid) const; }; // Close conditional namespace wrapper for testing diff --git a/src/cpu/pred/btb/ras.cc b/src/cpu/pred/btb/ras.cc index 4dabf6dabf..7f279bdb8e 100644 --- a/src/cpu/pred/btb/ras.cc +++ b/src/cpu/pred/btb/ras.cc @@ -116,8 +116,9 @@ BTBRAS::putPCHistory(Addr startAddr, const boost::dynamic_bitset<> &history, } std::shared_ptr -BTBRAS::getPredictionMeta() +BTBRAS::getPredictionMeta(ThreadID tid) { + (void)tid; return meta; } diff --git a/src/cpu/pred/btb/ras.hh b/src/cpu/pred/btb/ras.hh index 0055446013..b0b31c6d94 100644 --- a/src/cpu/pred/btb/ras.hh +++ b/src/cpu/pred/btb/ras.hh @@ -94,7 +94,7 @@ namespace btb_pred { void putPCHistory(Addr startAddr, const boost::dynamic_bitset<> &history, std::vector &stagePreds) override; - std::shared_ptr getPredictionMeta() override; + std::shared_ptr getPredictionMeta(ThreadID tid = 0) override; void specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred) override; diff --git a/src/cpu/pred/btb/timed_base_pred.hh b/src/cpu/pred/btb/timed_base_pred.hh index fce1a6aef1..db611fef25 100644 --- a/src/cpu/pred/btb/timed_base_pred.hh +++ b/src/cpu/pred/btb/timed_base_pred.hh @@ -61,7 +61,10 @@ class TimedBaseBTBPredictor: public SimObject const boost::dynamic_bitset<> &history, std::vector &stagePreds) {} - virtual std::shared_ptr getPredictionMeta() { return nullptr; } + virtual std::shared_ptr getPredictionMeta(ThreadID tid = 0) + { + return nullptr; + } virtual void specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred) {} virtual void specUpdatePHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred) {} diff --git a/src/cpu/pred/btb/uras.cc b/src/cpu/pred/btb/uras.cc index c507956d0e..53825d818a 100644 
--- a/src/cpu/pred/btb/uras.cc +++ b/src/cpu/pred/btb/uras.cc @@ -85,8 +85,9 @@ BTBuRAS::putPCHistory(Addr startAddr, const boost::dynamic_bitset<> &history, } std::shared_ptr -BTBuRAS::getPredictionMeta() +BTBuRAS::getPredictionMeta(ThreadID tid) { + (void)tid; std::shared_ptr meta_void_ptr = std::make_shared(meta); return meta_void_ptr; } diff --git a/src/cpu/pred/btb/uras.hh b/src/cpu/pred/btb/uras.hh index cdcde96b54..4ba12b3099 100644 --- a/src/cpu/pred/btb/uras.hh +++ b/src/cpu/pred/btb/uras.hh @@ -43,7 +43,7 @@ class BTBuRAS : public TimedBaseBTBPredictor void putPCHistory(Addr startAddr, const boost::dynamic_bitset<> &history, std::vector &stagePreds) override; - std::shared_ptr getPredictionMeta() override; + std::shared_ptr getPredictionMeta(ThreadID tid = 0) override; void specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred) override; @@ -161,4 +161,4 @@ struct NonSpecRASTrace : public Record { } // namespace branch_prediction } // namespace gem5 -#endif // __CPU_PRED_BTB_URAS_HH__ \ No newline at end of file +#endif // __CPU_PRED_BTB_URAS_HH__ diff --git a/src/cpu/simple/base.cc b/src/cpu/simple/base.cc index fc91c8d2f3..27adf7f598 100644 --- a/src/cpu/simple/base.cc +++ b/src/cpu/simple/base.cc @@ -519,13 +519,14 @@ BaseSimpleCPU::readMiscReg(int misc_reg, ThreadID tid) } void -BaseSimpleCPU::readGem5Regs() +BaseSimpleCPU::readGem5Regs(ThreadID tid) { + auto diffAllStates = this->diffAllStates[tid]; for (int i = 0; i < 32; i++) { diffAllStates->gem5RegFile[i] = - threadContexts[curThread]->getReg(RegId(IntRegClass, i)); + threadContexts[tid]->getReg(RegId(IntRegClass, i)); diffAllStates->gem5RegFile[i + 32] = - threadContexts[curThread]->getReg(RegId(FloatRegClass, i)); + threadContexts[tid]->getReg(RegId(FloatRegClass, i)); } } diff --git a/src/cpu/simple/base.hh b/src/cpu/simple/base.hh index b289ac778f..bcdd7c9066 100644 --- a/src/cpu/simple/base.hh +++ b/src/cpu/simple/base.hh @@ -207,7 +207,7 @@ class BaseSimpleCPU : 
public BaseCPU RegVal readMiscReg(int misc_reg, ThreadID tid) override; - void readGem5Regs() override; + void readGem5Regs(ThreadID tid) override; }; } // namespace gem5 diff --git a/src/sim/system.cc b/src/sim/system.cc index 7bc4ec37ce..c640334f4d 100644 --- a/src/sim/system.cc +++ b/src/sim/system.cc @@ -562,8 +562,8 @@ void System::initState() } // have to initiate golden memory after checkpoint restored - if (numCPUs > 1 && enableDifftest) { - warn("Creating golden memory for multi-core difftest\n"); + if (multiContextDifftest()) { + warn("Creating golden memory for multi-context difftest\n"); assert(enableMemDedup); goldenMem = dedupMemManager.createCopyOnWriteBranch(); goldenMemManager.initGoldenMem(physmem.getStartaddr(), memSize(), goldenMem); diff --git a/src/sim/system.hh b/src/sim/system.hh index db49b66926..1dca935d6e 100644 --- a/src/sim/system.hh +++ b/src/sim/system.hh @@ -416,6 +416,11 @@ class System : public SimObject, public PCEventScope bool multiCore() const { return numCPUs > 1; } + bool multiContextDifftest() const + { + return enableDifftest && (multiCore() || multiThread); + } + uint8_t *getGoldenMemPtr() const { return goldenMem; } GoldenGloablMem *getGoldenMemManager() { return &goldenMemManager; } From 099bf7c52b656428ba12efce12c94af34eed2efa Mon Sep 17 00:00:00 2001 From: tastynoob <934348725@qq.com> Date: Wed, 18 Mar 2026 16:03:01 +0800 Subject: [PATCH 04/38] cpu-pred: fix unit test compile --- src/cpu/pred/btb/common.hh | 4 +++- src/cpu/pred/btb/test/btb_tage.test.cc | 10 +++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/cpu/pred/btb/common.hh b/src/cpu/pred/btb/common.hh index dc327c8589..b61e459ff6 100644 --- a/src/cpu/pred/btb/common.hh +++ b/src/cpu/pred/btb/common.hh @@ -323,7 +323,8 @@ struct FetchTarget int s3Source; // which stage the prediction comes from FetchTarget() - : startPC(0), + : tid(0), + startPC(0), predTaken(false), predEndPC(0), predBranchInfo(BranchInfo()), @@ -472,6 +473,7 @@ struct 
FullBTBPrediction int s3Source; FullBTBPrediction() : + tid(0), bbStart(0), btbEntries(), condTakens(), diff --git a/src/cpu/pred/btb/test/btb_tage.test.cc b/src/cpu/pred/btb/test/btb_tage.test.cc index 75514e2d3a..e945065e9f 100644 --- a/src/cpu/pred/btb/test/btb_tage.test.cc +++ b/src/cpu/pred/btb/test/btb_tage.test.cc @@ -374,7 +374,7 @@ TEST_F(BTBTAGETest, HistoryUpdate) { // Test case 1: Update with taken branch (PHR shifts in 2 bits from PC hash) // Correct order: first update folded histories with pre-update PHR, then mutate PHR - tage->doUpdateHist(history, 2, true, pc, target); + tage->doUpdateHist(history, 2, true, pc, target, 0); applyPathHistoryTaken(history, pc, target); // Verify folded history matches the ideal fold of the updated PHR @@ -382,7 +382,7 @@ TEST_F(BTBTAGETest, HistoryUpdate) { // Test case 2: Update with not-taken branch (PHR unchanged, folded update is no-op) boost::dynamic_bitset<> before_not_taken = history; - tage->doUpdateHist(history, 2, false, pc, target); + tage->doUpdateHist(history, 2, false, pc, target, 0); // Verify folded history remains consistent tage->checkFoldedHist(history, "not-taken update"); @@ -615,9 +615,9 @@ TEST_F(BTBTAGETest, HistoryRecoveryCorrectness) { // Verify recovery produced the expected history for (int i = 0; i < tage->numPredictors; i++) { - tage->tagFoldedHist[i].check(expectedHistory); - tage->altTagFoldedHist[i].check(expectedHistory); - tage->indexFoldedHist[i].check(expectedHistory); + tage->threadHistory[0].tagFoldedHist[i].check(expectedHistory); + tage->threadHistory[0].altTagFoldedHist[i].check(expectedHistory); + tage->threadHistory[0].indexFoldedHist[i].check(expectedHistory); } } From 9def4665a400eab1a8fa1504b9e2f9b0a09cd7d1 Mon Sep 17 00:00:00 2001 From: tastynoob <934348725@qq.com> Date: Mon, 23 Mar 2026 11:32:56 +0800 Subject: [PATCH 05/38] cpu-o3: integrate FS-SMT support changes Change-Id: I7690e69545b01ca4a8ba3e751f6cab7665f8767e --- configs/common/FSConfig.py | 18 +- 
configs/common/xiangshan.py | 5 +- src/cpu/base.cc | 79 +++- src/cpu/base.hh | 13 + src/cpu/o3/commit.cc | 681 +++++++++++++++------------- src/cpu/o3/commit.hh | 4 +- src/cpu/o3/fetch.cc | 46 +- src/cpu/o3/lsq.cc | 28 +- src/cpu/o3/lsq.hh | 20 +- src/cpu/o3/lsq_unit.cc | 56 ++- src/cpu/o3/rob.cc | 16 +- src/cpu/o3/rob.hh | 1 + src/cpu/pred/BranchPredictor.py | 1 + src/cpu/pred/btb/decoupled_bpred.cc | 5 +- src/cpu/pred/btb/decoupled_bpred.hh | 7 +- src/cpu/pred/btb/ras.cc | 326 +++++++------ src/cpu/pred/btb/ras.hh | 75 +-- src/dev/riscv/HartCtrl.py | 13 + src/dev/riscv/SConscript | 2 + src/dev/riscv/hart_ctrl.cc | 98 ++++ src/dev/riscv/hart_ctrl.hh | 33 ++ 21 files changed, 978 insertions(+), 549 deletions(-) create mode 100644 src/dev/riscv/HartCtrl.py create mode 100644 src/dev/riscv/hart_ctrl.cc create mode 100644 src/dev/riscv/hart_ctrl.hh diff --git a/configs/common/FSConfig.py b/configs/common/FSConfig.py index dc66ed7833..d650b82f70 100644 --- a/configs/common/FSConfig.py +++ b/configs/common/FSConfig.py @@ -657,18 +657,23 @@ def makeBareMetalRiscvSystem(mem_mode, mdesc=None, cmdline=None): self.system_port = self.membus.cpu_side_ports return self -def makeBareMetalXiangshanSystem(mem_mode, mdesc=None, cmdline=None, np=1, ruby=False): - self = makeXiangshanPlatformSystem(mem_mode, mdesc, np=np, ruby=ruby) +def makeBareMetalXiangshanSystem(mem_mode, mdesc=None, cmdline=None, np=1, + ruby=False, num_threads=None): + self = makeXiangshanPlatformSystem(mem_mode, mdesc, np=np, ruby=ruby, + num_threads=num_threads) self.workload = RiscvBareMetal() self.workload.reset_vect = 0x80000000 return self -def makeXiangshanPlatformSystem(mem_mode, mdesc=None, np=1, ruby=False): +def makeXiangshanPlatformSystem(mem_mode, mdesc=None, np=1, ruby=False, + num_threads=None): self = System() if not mdesc: # generic system mdesc = SysConfig() + if num_threads is None: + num_threads = np self.mem_mode = mem_mode self.mem_ranges = [AddrRange(start=0x80000000, size=mdesc.mem())] 
print(self.mem_ranges) @@ -687,7 +692,11 @@ def makeXiangshanPlatformSystem(mem_mode, mdesc=None, np=1, ruby=False): self.lint = Clint() self.lint.pio = self.iobus.mem_side_ports self.lint.pio_addr = 0x38000000 - self.lint.num_threads = np + self.lint.num_threads = num_threads + + self.hartctrl = HartCtrl() + self.hartctrl.pio = self.iobus.mem_side_ports + self.hartctrl.num_threads = num_threads self.mmcs = NemuMMC() self.mmcs.pio = self.iobus.mem_side_ports @@ -700,6 +709,7 @@ def makeXiangshanPlatformSystem(mem_mode, mdesc=None, np=1, ruby=False): AddrRange(self.uartlite.pio_addr, self.uartlite.pio_addr + self.uartlite.pio_size), AddrRange(self.lint.pio_addr, self.lint.pio_addr + self.lint.pio_size), + AddrRange(self.hartctrl.pio_addr, self.hartctrl.pio_addr + self.hartctrl.pio_size), AddrRange(self.mmcs.pio_addr, self.mmcs.pio_addr + self.mmcs.pio_size), AddrRange(self.plic.pio_addr, self.plic.pio_addr + self.plic.pio_size), ] diff --git a/configs/common/xiangshan.py b/configs/common/xiangshan.py index 368f6cd884..6abcd5ea39 100644 --- a/configs/common/xiangshan.py +++ b/configs/common/xiangshan.py @@ -827,8 +827,11 @@ def build_xiangshan_system(args): TestCPUClass = get_xiangshan_cpu_class(args) ruby = bool(hasattr(args, 'ruby') and args.ruby) + num_threads = np * (2 if getattr(args, 'smt', False) else 1) - test_sys = makeBareMetalXiangshanSystem('timing', SysConfig(mem=args.mem_size), None, np=np, ruby=ruby) + test_sys = makeBareMetalXiangshanSystem( + 'timing', SysConfig(mem=args.mem_size), None, np=np, ruby=ruby, + num_threads=num_threads) if hasattr(args, 'enable_trace_mode') and args.enable_trace_mode: if bool(getattr(args, 'trace_timing_ptw', False)): diff --git a/src/cpu/base.cc b/src/cpu/base.cc index 83a2a27686..68808f3b3a 100644 --- a/src/cpu/base.cc +++ b/src/cpu/base.cc @@ -210,6 +210,7 @@ BaseCPU::BaseCPU(const Params &p, bool is_checker) } diffAllStates.resize(numThreads); + recentCommittedStores.resize(numThreads); if (enableDifftest) { 
assert(params().difftest_ref_so.length() > 2); for (ThreadID tid = 0; tid < numThreads; ++tid) { @@ -431,6 +432,33 @@ BaseCPU::startup() } +void +BaseCPU::recordCommittedStore(ThreadID tid, const o3::DynInstPtr &inst) +{ + RecentCommittedStore recent; + + if (!system->multiContextDifftest() || !_goldenMemManager || + !inst->isStore() || inst->isAtomic() || + (inst->isStoreConditional() && !inst->lockedWriteSuccess()) || + !inst->memData || inst->effSize == 0 || + inst->effSize > sizeof(recent.data) || + !_goldenMemManager->inPmem(inst->physEffAddr)) { + return; + } + + auto &recent_history = recentCommittedStores.at(tid); + recent.valid = true; + recent.addr = inst->physEffAddr; + recent.size = inst->effSize; + recent.seq = inst->seqNum; + std::memcpy(recent.data, inst->memData, recent.size); + recent_history.push_back(recent); + constexpr size_t max_store_history = 16; + if (recent_history.size() > max_store_history) { + recent_history.pop_front(); + } +} + probing::PMUUPtr BaseCPU::pmuProbePoint(const char *name) { @@ -1459,10 +1487,31 @@ BaseCPU::diffWithNEMU(ThreadID tid, InstSeqNum seq) warn("Difference on %s instr found in multicore mode, check in golden memory\n", diffInfo.inst->isLoad() ? 
"load" : "amo"); uint8_t *golden_ptr = diffInfo.goldenValue; + const RecentCommittedStore *matched_recent_store = nullptr; + if (diffInfo.inst->isLoad()) { + const auto &recent_history = recentCommittedStores.at(tid); + for (auto it = recent_history.rbegin(); + it != recent_history.rend(); ++it) { + if (!it->valid || + it->addr != diffInfo.physEffAddr || + it->size != diffInfo.effSize || + it->seq >= seq || + (seq - it->seq) > 256) { + continue; + } + if (memcmp(it->data, &gem5_val, + diffInfo.effSize) == 0) { + matched_recent_store = &(*it); + break; + } + } + } // a lambda function to sync memory and register from golden results to ref - auto sync_mem_reg = [&]() { - diffAllStates->proxy->memcpy(diffInfo.physEffAddr, golden_ptr, diffInfo.effSize, + auto sync_mem_reg = [&](const uint8_t *mem_src) { + diffAllStates->proxy->memcpy(diffInfo.physEffAddr, + const_cast(mem_src), + diffInfo.effSize, DIFFTEST_TO_REF); diffAllStates->referenceRegFile[dest_tag] = gem5_val; diffAllStates->proxy->regcpy(&(diffAllStates->referenceRegFile), DUT_TO_REF); @@ -1470,7 +1519,16 @@ BaseCPU::diffWithNEMU(ThreadID tid, InstSeqNum seq) if (diffInfo.inst->isLoad() && memcmp(golden_ptr, &gem5_val, diffInfo.effSize) == 0) { DPRINTF(Diff, "Load content matched in golden memory. Sync from golden to ref\n"); - sync_mem_reg(); + sync_mem_reg(golden_ptr); + continue; + } else if (matched_recent_store) { + DPRINTF(Diff, + "Load content matched recent committed store " + "[sn:%llu] at addr %#lx. 
Syncing ref from the " + "store snapshot for this hart.\n", + matched_recent_store->seq, + diffInfo.physEffAddr); + sync_mem_reg(matched_recent_store->data); continue; } else if (diffInfo.inst->isAtomic()) { DPRINTF(Diff, "Golden mem old value: %#lx, GEM5 old value: %#lx\n", diffInfo.amoOldGoldenValue, @@ -1478,7 +1536,7 @@ BaseCPU::diffWithNEMU(ThreadID tid, InstSeqNum seq) DPRINTF(Diff, "New golden value: %#lx\n", *(uint64_t *)golden_ptr); if (memcmp(&diffInfo.amoOldGoldenValue, &gem5_val, diffInfo.effSize) == 0) { DPRINTF(Diff, "Atomic encountered, old value matched. Sync from golden to ref\n"); - sync_mem_reg(); + sync_mem_reg(golden_ptr); continue; } else { warn("Atomic old value not matched!\n"); @@ -1583,9 +1641,16 @@ BaseCPU::difftestStep(ThreadID tid, InstSeqNum seq) warn("Start memcpy to NEMU from %#lx, size=%lu\n", (uint64_t)start, pmemSize); diffAllStates->proxy->memcpy(0x80000000u, start, pmemSize, DUT_TO_REF); } else if (enableMemDedup) { - warn("Let ref share a COW mirror of root memory\n"); - assert(diffAllStates->proxy->ref_get_backed_memory); - diffAllStates->proxy->ref_get_backed_memory(system->createCopyOnWriteBranch(), pmemSize); + if (system->multiContextDifftest()) { + warn("Let ref share the multi-context golden memory\n"); + assert(goldenMemPtr); + assert(diffAllStates->proxy->ref_get_backed_memory); + diffAllStates->proxy->ref_get_backed_memory(goldenMemPtr, pmemSize); + } else { + warn("Let ref share a COW mirror of root memory\n"); + assert(diffAllStates->proxy->ref_get_backed_memory); + diffAllStates->proxy->ref_get_backed_memory(system->createCopyOnWriteBranch(), pmemSize); + } } else { warn("MemDedup disabled, copying pmem to NEMU\n"); warn("Start memcpy to NEMU from %#lx, size=%lu\n", (uint64_t)pmemStart, pmemSize); diff --git a/src/cpu/base.hh b/src/cpu/base.hh index 3d3e8e5a85..feaf6e13cd 100644 --- a/src/cpu/base.hh +++ b/src/cpu/base.hh @@ -42,6 +42,7 @@ #ifndef __CPU_BASE_HH__ #define __CPU_BASE_HH__ +#include #include #include 
@@ -138,6 +139,16 @@ struct DiffAllStates class BaseCPU : public ClockedObject { protected: + struct RecentCommittedStore + { + bool valid = false; + Addr addr = 0; + size_t size = 0; + InstSeqNum seq = 0; + uint8_t data[16] = {}; + }; + + std::vector> recentCommittedStores; const unsigned IntRegIndexBase = 0; const unsigned FPRegIndexBase = 32; @@ -778,6 +789,8 @@ class BaseCPU : public ClockedObject void difftestStep(ThreadID tid, InstSeqNum seq); + void recordCommittedStore(ThreadID tid, const o3::DynInstPtr &inst); + inline bool difftestEnabled() const { return enableDifftest; } void displayGem5Regs(ThreadID tid); diff --git a/src/cpu/o3/commit.cc b/src/cpu/o3/commit.cc index ad42b0c7fe..e1b20025ce 100644 --- a/src/cpu/o3/commit.cc +++ b/src/cpu/o3/commit.cc @@ -42,6 +42,7 @@ #include "cpu/o3/commit.hh" #include +#include #include #include #include @@ -104,32 +105,35 @@ Commit::Commit(CPU *_cpu, branch_prediction::BPredUnit *_bp, const BaseO3CPUPara : commitPolicy(params.smtCommitPolicy), stuckCheckEvent([this]() { static std::vector debug_insts; - if (cpu->curCycle() - this->lastCommitCycle > 40000) { - if (traceMaybeExitOnPipelineDrainFromStuckCheck()) { - return; - } - if (auto inst = rob->readHeadInst(0)) { - warn("can't commit inst %s\n", inst->genDisassembly()); - debug_insts.insert( - debug_insts.begin(), rob->getInstList(0).begin(), - rob->getInstList(0).end()); - warn("dump rob front 10 insts\n"); - int i = 0; - for (auto inst = debug_insts.begin(); - inst != debug_insts.end() && i < 10; inst++, i++) { - warn("%s\n", (*inst)->genDisassembly()); + for (ThreadID tid = 0; tid < numThreads; tid++) { + if (cpu->curCycle() - this->lastCommitCycle[tid] > 40000) { + if (traceMaybeExitOnPipelineDrainFromStuckCheck()) { + return; } - } else { - warn("rob was empty, may be fetch or rename stuck\n"); + + if (auto inst = rob->readHeadInst(0)) { + warn("can't commit inst %s\n", inst->genDisassembly()); + debug_insts.insert( + debug_insts.begin(), 
rob->getInstList(tid).begin(), + rob->getInstList(tid).end()); + warn("dump rob front 10 insts\n"); + int i = 0; + for (auto inst = debug_insts.begin(); + inst != debug_insts.end() && i < 10; inst++, i++) { + warn("%s\n", (*inst)->genDisassembly()); + } + } else { + warn("rob was empty, may be fetch or rename stuck\n"); + } + panic( + "Commit stage is stucked for more than 40,000 cycles!\n" + "Thread: %d Last commit cycle: %lu, current cycle: %lu, suggested " + "--debug-start=%llu --debug-end=%llu\n", tid, + lastCommitCycle[tid], cpu->curCycle(), + cpu->cyclesToTicks(Cycles(lastCommitCycle[tid] - 200)), + cpu->cyclesToTicks(Cycles(lastCommitCycle[tid] + 200))); } - panic( - "Commit stage is stucked for more than 40,000 cycles!\n" - "Last commit cycle: %lu, current cycle: %lu, suggested " - "--debug-start=%llu --debug-end=%llu\n", - lastCommitCycle, cpu->curCycle(), - cpu->cyclesToTicks(Cycles(lastCommitCycle - 200)), - cpu->cyclesToTicks(Cycles(lastCommitCycle + 200))); } cpu->schedule(this->stuckCheckEvent, cpu->clockEdge(Cycles(40010))); }, "CommitStuckCheckEvent"), @@ -1204,349 +1208,395 @@ Commit::commitInsts() DPRINTF(Commit, "Trying to commit instructions in the ROB.\n"); unsigned num_committed = 0; + std::array num_committed_per_thread = {}; + std::array commit_width_per_thread = {}; DynInstPtr head_inst; - int commit_width = rob->countInstsOfGroups(commitWidth); + int commit_width = 0; + for (ThreadID tid : *activeThreads) { + commit_width_per_thread[tid] = + rob->countInstsOfGroups(tid, commitWidth); + commit_width += commit_width_per_thread[tid]; + } if (commit_width >= 0) { cpu->activityThisCycle(); } - // Commit as many instructions as possible until the commit bandwidth - // limit is reached, or it becomes impossible to commit any more. 
- while (num_committed < commit_width) { - // hardware transactionally memory - // If executing within a transaction, - // need to handle interrupts specially - - ThreadID commit_thread = getCommittingThread(); - - // Check for any interrupt that we've already squashed for - // and start processing it. - if (interrupt != NoFault) { - // If inside a transaction, postpone interrupts - if (executingHtmTransaction(commit_thread)) { - cpu->clearInterrupts(0); - toIEW->commitInfo[0].clearInterrupt = true; - interrupt = NoFault; - avoidQuiesceLiveLock = true; - } else { - handleInterrupt(); - } + // Commit each thread independently for up to its local commit window. + for (ThreadID commit_thread : *activeThreads) { + if (commitStatus[commit_thread] != Running && + commitStatus[commit_thread] != Idle && + commitStatus[commit_thread] != FetchTrapPending) { + continue; } - // ThreadID commit_thread = getCommittingThread(); - - if (commit_thread == -1) - break; - - head_inst = rob->readHeadInst(commit_thread); - - if (!rob->isHeadGroupReady(commit_thread)) { - if (debug::Commit && head_inst->readyToCommit()) { - InstSeqNum seqnum = rob->getHeadGroupLastDoneSeq(commit_thread); - DPRINTF( - Commit, - "[sn:%llu] Head is ready to commit, but the group is not all ready, last done inst [sn:%llu]\n", - head_inst->seqNum, seqnum); + while (num_committed < commit_width && + num_committed_per_thread[commit_thread] < + commit_width_per_thread[commit_thread]) { + // hardware transactionally memory + // If executing within a transaction, + // need to handle interrupts specially + + // Check for any interrupt that we've already squashed for + // and start processing it. 
+ if (interrupt != NoFault) { + // If inside a transaction, postpone interrupts + if (executingHtmTransaction(commit_thread)) { + cpu->clearInterrupts(0); + toIEW->commitInfo[0].clearInterrupt = true; + interrupt = NoFault; + avoidQuiesceLiveLock = true; + } else { + handleInterrupt(); + } } - break; - } - ThreadID tid = head_inst->threadNumber; - - assert(tid == commit_thread); - - DPRINTF(Commit, - "Trying to commit head instruction, [tid:%i] [sn:%llu]\n", - tid, head_inst->seqNum); + head_inst = rob->readHeadInst(commit_thread); + + if (!rob->isHeadGroupReady(commit_thread)) { + if (debug::Commit && head_inst->readyToCommit()) { + InstSeqNum seqnum = + rob->getHeadGroupLastDoneSeq(commit_thread); + DPRINTF( + Commit, + "[sn:%llu] Head is ready to commit, but the group " + "is not all ready, last done inst [sn:%llu]\n", + head_inst->seqNum, seqnum); + } + break; + } - // If the head instruction is squashed, it is ready to retire - // (be removed from the ROB) at any time. - if (head_inst->isSquashed()) { + ThreadID tid = head_inst->threadNumber; - DPRINTF(Commit, "Retiring squashed instruction from " - "ROB.\n"); + assert(tid == commit_thread); - rob->retireHead(commit_thread); + DPRINTF(Commit, + "Trying to commit head instruction, [tid:%i] [sn:%llu]\n", + tid, head_inst->seqNum); - ++stats.commitSquashedInsts; - // Notify potential listeners that this instruction is squashed - ppSquash->notify(head_inst); + // If the head instruction is squashed, it is ready to retire + // (be removed from the ROB) at any time. + if (head_inst->isSquashed()) { - // Record that the number of ROB entries has changed. - changedROBNumEntries[tid] = true; - } else { - set(pc[tid], head_inst->pcState()); - traceMaybeInjectCtrlFlowChangeFault(tid, head_inst); + DPRINTF(Commit, "Retiring squashed instruction from " + "ROB.\n"); - // Try to commit the head instruction. 
- bool commit_success = commitHead(head_inst, num_committed); + rob->retireHead(commit_thread); - if (commit_success) { - cpu->perfCCT->updateInstPos(head_inst->seqNum, PerfRecord::AtCommit); - auto res = head_inst->getResult(); - if (res.is()) { - cpu->perfCCT->updateInstMeta(head_inst->seqNum, InstDetail::Result, res.as()); - } - cpu->perfCCT->commitMeta(head_inst->seqNum); + ++stats.commitSquashedInsts; + // Notify potential listeners that this instruction is squashed + ppSquash->notify(head_inst); - DPRINTF(CommitTrace, "CT: %s\n", head_inst->genDisassembly()); + // Record that the number of ROB entries has changed. + changedROBNumEntries[tid] = true; + } else { + set(pc[tid], head_inst->pcState()); + traceMaybeInjectCtrlFlowChangeFault(tid, head_inst); + + // Try to commit the head instruction. + bool commit_success = commitHead(head_inst, + num_committed_per_thread[tid]); + + if (commit_success) { + cpu->perfCCT->updateInstPos(head_inst->seqNum, + PerfRecord::AtCommit); + auto res = head_inst->getResult(); + if (res.is()) { + cpu->perfCCT->updateInstMeta( + head_inst->seqNum, InstDetail::Result, + res.as()); + } + cpu->perfCCT->commitMeta(head_inst->seqNum); - if (ismispred) { - ismispred = false; - stats.recovery_bubble += (cpu->curCycle() - lastCommitCycle) * renameWidth; - } - if (head_inst->mispredicted()) { - ismispred = true; - } + DPRINTF(CommitTrace, "CT [tid:%d]: %s\n", + head_inst->threadNumber, + head_inst->genDisassembly()); - lastCommitCycle = cpu->curCycle(); - const auto &head_rv_pc = head_inst->pcState().as(); - if (bp->isBTB()) { - auto dbbtb = dynamic_cast(bp); - bool miss = head_inst->mispredicted(); - if (head_inst->isReturn()) { - DPRINTF(RAS, "commit inst PC %x miss %d real target %x pred target %x\n", - head_inst->pcState().instAddr(), miss, - head_rv_pc.npc(), *(head_inst->predPC)); + if (ismispred) { + ismispred = false; + stats.recovery_bubble += + (cpu->curCycle() - lastCommitCycle[tid]) * + renameWidth; + } + if 
(head_inst->mispredicted()) { + ismispred = true; } - // FIXME: ignore mret/sret/uret in correspond with RTL - if (!head_inst->isNonSpeculative() && head_inst->isControl()) { - dbbtb->commitBranch(head_inst, miss); - if (!head_inst->isReturn() && head_inst->isIndirectCtrl() && miss) { - misPredIndirect[head_inst->pcState().instAddr()]++; + lastCommitCycle[tid] = cpu->curCycle(); + const auto &head_rv_pc = + head_inst->pcState().as(); + if (bp->isBTB()) { + auto dbbtb = dynamic_cast< + branch_prediction::btb_pred:: + DecoupledBPUWithBTB *>(bp); + bool miss = head_inst->mispredicted(); + if (head_inst->isReturn()) { + DPRINTF(RAS, "commit inst PC %x miss %d real target %x pred target %x\n", + head_inst->pcState().instAddr(), miss, + head_rv_pc.npc(), *(head_inst->predPC)); } - } - dbbtb->notifyInstCommit(head_inst); - } - if (traceMaybeExitOnLastTraceInst(head_inst)) { - return; - } - if (head_inst->isUpdateVsstatusSd()) { - auto v = cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VIRMODE, tid); - RiscvISA::HSTATUS hstatus = cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_HSTATUS, tid); - RiscvISA::VSSTATUS vsstatus = - cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, tid); - RiscvISA::VSSTATUS32 vsstatus32 = - cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, tid); - - if (v) { - if (hstatus.vsxl ==1) { - vsstatus32.sd = (vsstatus32.fs == 3) || (vsstatus32.vs == 3); - cpu->setMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, (RegVal)vsstatus32, tid); - } else { - vsstatus.sd = (vsstatus.fs == 3) || (vsstatus.vs == 3); - cpu->setMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, (RegVal)vsstatus, tid); + // FIXME: ignore mret/sret/uret in correspond with RTL + if (!head_inst->isNonSpeculative() && head_inst->isControl()) { + dbbtb->commitBranch(head_inst, miss); + if (!head_inst->isReturn() && + head_inst->isIndirectCtrl() && miss) { + misPredIndirect[head_inst->pcState().instAddr()]++; + } } + 
dbbtb->notifyInstCommit(head_inst); } + if (traceMaybeExitOnLastTraceInst(head_inst)) { + return; + } - } - if (head_inst->isUpdateMstatusSd()) { - updateMstatusSd(tid); - } - - ++num_committed; - stats.committedInstType[tid][head_inst->opClass()]++; - ppCommit->notify(head_inst); - - // hardware transactional memory - - // update nesting depth - if (head_inst->isHtmStart()) - htmStarts[tid]++; + if (head_inst->isUpdateVsstatusSd()) { + auto v = cpu->readMiscRegNoEffect( + RiscvISA::MiscRegIndex::MISCREG_VIRMODE, tid); + RiscvISA::HSTATUS hstatus = + cpu->readMiscRegNoEffect( + RiscvISA::MiscRegIndex::MISCREG_HSTATUS, tid); + RiscvISA::VSSTATUS vsstatus = + cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, tid); + RiscvISA::VSSTATUS32 vsstatus32 = + cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, tid); + + if (v) { + if (hstatus.vsxl ==1) { + vsstatus32.sd = (vsstatus32.fs == 3) || (vsstatus32.vs == 3); + cpu->setMiscRegNoEffect( + RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, + (RegVal)vsstatus32, tid); + } else { + vsstatus.sd = (vsstatus.fs == 3) || (vsstatus.vs == 3); + cpu->setMiscRegNoEffect( + RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, + (RegVal)vsstatus, tid); + } + } - // sanity check - if (head_inst->inHtmTransactionalState()) { - assert(executingHtmTransaction(tid)); - } else { - assert(!executingHtmTransaction(tid)); - } + } + if (head_inst->isUpdateMstatusSd()) { + updateMstatusSd(tid); + } - // update nesting depth - if (head_inst->isHtmStop()) - htmStops[tid]++; + ++num_committed; + ++num_committed_per_thread[tid]; + stats.committedInstType[tid][head_inst->opClass()]++; + ppCommit->notify(head_inst); - changedROBNumEntries[tid] = true; + // hardware transactional memory - // Set the doneSeqNum to the youngest committed instruction. 
- toIEW->commitInfo[tid].doneSeqNum = head_inst->seqNum; + // update nesting depth + if (head_inst->isHtmStart()) + htmStarts[tid]++; - if (head_inst->getFtqId() > 1) { - toIEW->commitInfo[tid].doneFtqId = head_inst->getFtqId() - 1; - } - committedTargetId = head_inst->getFtqId(); - committedLoopIter = head_inst->getLoopIteration(); - - if (tid == 0) - canHandleInterrupts = !head_inst->isDelayedCommit(); - - // at this point store conditionals should either have - // been completed or predicated false - assert(!head_inst->isStoreConditional() || - head_inst->isCompleted() || - !head_inst->readPredicate()); - - // Updates misc. registers. - head_inst->updateMiscRegs(); - if (head_inst->staticInst->isVectorConfig()) { - auto vset = static_cast(head_inst->staticInst.get()); - if (!(vset->vtypeIsImm)) { - auto tc = head_inst->tcBase(); - RiscvISA::VTYPE new_vtype = head_inst->readMiscReg(RiscvISA::MISCREG_VTYPE); - tc->getDecoderPtr()->as().setVtype(new_vtype); - } - if (hasExecutedYoungerInst(tid, head_inst->seqNum)) { - DPRINTF(Commit, - "[tid:%i] [sn:%llu] Vector config committed with executed younger instructions in " - "ROB, squash younger instructions.\n", - tid, head_inst->seqNum); - squashAfter(tid, head_inst); - } - } - if (head_inst->isFloating() && head_inst->isLoad()){ - RiscvISA::STATUS status = cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_STATUS, tid); - status.sd = 1; - status.fs = 3; - cpu->setMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_STATUS, (RegVal)status, tid); - } - if (head_inst->isUpdateVsstatusSd()) { - auto v = cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VIRMODE, tid); - RiscvISA::HSTATUS hstatus = cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_HSTATUS, tid); - RiscvISA::VSSTATUS vsstatus = - cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, tid); - RiscvISA::VSSTATUS32 vsstatus32 = - cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, tid); - - if (v) { - if (hstatus.vsxl 
==1) { - vsstatus32.sd = (vsstatus32.fs == 3) || (vsstatus.vs == 3); - cpu->setMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, (RegVal)vsstatus32, tid); - } else { - vsstatus.sd = (vsstatus.fs == 3) || (vsstatus.vs == 3); - cpu->setMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, (RegVal)vsstatus, tid); - } + // sanity check + if (head_inst->inHtmTransactionalState()) { + assert(executingHtmTransaction(tid)); + } else { + assert(!executingHtmTransaction(tid)); } - } + // update nesting depth + if (head_inst->isHtmStop()) + htmStops[tid]++; - if (cpu->difftestEnabled()) { - diffInst(tid, head_inst); - } + changedROBNumEntries[tid] = true; + + // Set the doneSeqNum to the youngest committed instruction. + toIEW->commitInfo[tid].doneSeqNum = head_inst->seqNum; - if (head_inst->isLoad()) { - Addr load_pc = head_inst->pcState().instAddr(); - Addr load_addr = head_inst->physEffAddr; - char buffer[8] = {0}; - if (head_inst->memData) { - std::memcpy(buffer, head_inst->memData, - std::min(head_inst->effSize, - sizeof(buffer))); + if (head_inst->getFtqId() > 1) { + toIEW->commitInfo[tid].doneFtqId = head_inst->getFtqId() - 1; } - Addr load_value = *((uint64_t *)buffer); - bool hit = loadTripleCounter.update(load_pc, load_addr, load_value); - if (hit) { - // same PC && same addr && same value - stats.loadTriple++; + committedTargetId = head_inst->getFtqId(); + committedLoopIter = head_inst->getLoopIteration(); + + if (tid == 0) + canHandleInterrupts = !head_inst->isDelayedCommit(); + + // at this point store conditionals should either have + // been completed or predicated false + assert(!head_inst->isStoreConditional() || + head_inst->isCompleted() || + !head_inst->readPredicate()); + + // Updates misc. registers. 
+ head_inst->updateMiscRegs(); + if (head_inst->staticInst->isVectorConfig()) { + auto vset = static_cast( + head_inst->staticInst.get()); + if (!(vset->vtypeIsImm)) { + auto tc = head_inst->tcBase(); + RiscvISA::VTYPE new_vtype = + head_inst->readMiscReg( + RiscvISA::MISCREG_VTYPE); + tc->getDecoderPtr()->as().setVtype(new_vtype); + } + if (hasExecutedYoungerInst(tid, head_inst->seqNum)) { + DPRINTF(Commit, + "[tid:%i] [sn:%llu] Vector config " + "committed with executed younger " + "instructions in ROB, squash younger " + "instructions.\n", + tid, head_inst->seqNum); + squashAfter(tid, head_inst); + } } - // EA reuse: compare to last committed EA of same static load - auto itEA = lastLoadEA.find(load_pc); - if (itEA != lastLoadEA.end() && itEA->second == load_addr) { - stats.loadEAReused++; + if (head_inst->isFloating() && head_inst->isLoad()) { + RiscvISA::STATUS status = cpu->readMiscRegNoEffect( + RiscvISA::MiscRegIndex::MISCREG_STATUS, tid); + status.sd = 1; + status.fs = 3; + cpu->setMiscRegNoEffect( + RiscvISA::MiscRegIndex::MISCREG_STATUS, + (RegVal)status, tid); } - lastLoadEA[load_pc] = load_addr; - // Producer stability: only if this load had a forwarding producer - if (head_inst->hasProducerStorePC()) { - stats.loadsWithProducer++; - const Addr prodPC = head_inst->producerStorePC(); - auto itP = lastLoadProducerStorePC.find(load_pc); - if (itP != lastLoadProducerStorePC.end() && itP->second == prodPC) { - stats.producerStable++; + if (head_inst->isUpdateVsstatusSd()) { + auto v = cpu->readMiscRegNoEffect( + RiscvISA::MiscRegIndex::MISCREG_VIRMODE, tid); + RiscvISA::HSTATUS hstatus = + cpu->readMiscRegNoEffect( + RiscvISA::MiscRegIndex::MISCREG_HSTATUS, tid); + RiscvISA::VSSTATUS vsstatus = + cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, tid); + RiscvISA::VSSTATUS32 vsstatus32 = + cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, tid); + + if (v) { + if (hstatus.vsxl ==1) { + vsstatus32.sd = (vsstatus32.fs == 3) || 
(vsstatus.vs == 3); + cpu->setMiscRegNoEffect( + RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, + (RegVal)vsstatus32, tid); + } else { + vsstatus.sd = (vsstatus.fs == 3) || (vsstatus.vs == 3); + cpu->setMiscRegNoEffect( + RiscvISA::MiscRegIndex::MISCREG_VSSTATUS, + (RegVal)vsstatus, tid); + } } - lastLoadProducerStorePC[load_pc] = prodPC; - // optional: clear after use to avoid confusing later stages - head_inst->clearProducerStorePC(); } - } + if (cpu->difftestEnabled()) { + diffInst(tid, head_inst); + } + + if (head_inst->isLoad()) { + Addr load_pc = head_inst->pcState().instAddr(); + Addr load_addr = head_inst->physEffAddr; + char buffer[8] = {0}; + if (head_inst->memData) { + std::memcpy(buffer, head_inst->memData, + std::min(head_inst->effSize, + sizeof(buffer))); + } + Addr load_value = *((uint64_t *)buffer); + bool hit = loadTripleCounter.update(load_pc, load_addr, load_value); + if (hit) { + // same PC && same addr && same value + stats.loadTriple++; + } + // EA reuse: compare to last committed EA of same static load + auto itEA = lastLoadEA.find(load_pc); + if (itEA != lastLoadEA.end() && itEA->second == load_addr) { + stats.loadEAReused++; + } + lastLoadEA[load_pc] = load_addr; + // Producer stability: only if this load had a forwarding producer + if (head_inst->hasProducerStorePC()) { + stats.loadsWithProducer++; + const Addr prodPC = head_inst->producerStorePC(); + auto itP = lastLoadProducerStorePC.find(load_pc); + if (itP != lastLoadProducerStorePC.end() && itP->second == prodPC) { + stats.producerStable++; + } + lastLoadProducerStorePC[load_pc] = prodPC; + + // optional: clear after use to avoid confusing later stages + head_inst->clearProducerStorePC(); + } + } - // Check instruction execution if it successfully commits and - // is not carrying a fault. 
- if (cpu->checker) { - cpu->checker->verify(head_inst); - } - cpu->traceFunctions(pc[tid]->instAddr()); - traceOnCommit(tid, head_inst); + // Check instruction execution if it successfully commits and + // is not carrying a fault. + if (cpu->checker) { + cpu->checker->verify(head_inst); + } - head_inst->staticInst->advancePC(*pc[tid]); + cpu->traceFunctions(pc[tid]->instAddr()); + traceOnCommit(tid, head_inst); - // Keep track of the last sequence number commited - lastCommitedSeqNum[tid] = head_inst->seqNum; + head_inst->staticInst->advancePC(*pc[tid]); - // If this is an instruction that doesn't play nicely with - // others squash everything and restart fetch - if (head_inst->isSquashAfter()) - squashAfter(tid, head_inst); + // Keep track of the last sequence number commited + lastCommitedSeqNum[tid] = head_inst->seqNum; - if (drainPending) { - if (pc[tid]->microPC() == 0 && interrupt == NoFault && - !thread[tid]->trapPending) { - // Last architectually committed instruction. - // Squash the pipeline, stall fetch, and use - // drainImminent to disable interrupts - DPRINTF(Drain, "Draining: %i:%s\n", tid, *pc[tid]); + // If this is an instruction that doesn't play nicely with + // others squash everything and restart fetch + if (head_inst->isSquashAfter()) squashAfter(tid, head_inst); - cpu->commitDrained(tid); - drainImminent = true; - } - } - bool onInstBoundary = !head_inst->isMicroop() || - head_inst->isLastMicroop() || - !head_inst->isDelayedCommit(); - - if (onInstBoundary) { - int count = 0; - Addr oldpc; - // Make sure we're not currently updating state while - // handling PC events. 
- assert(!thread[tid]->noSquashFromTC && - !thread[tid]->trapPending); - do { - oldpc = pc[tid]->instAddr(); - thread[tid]->pcEventQueue.service( - oldpc, thread[tid]->getTC()); - count++; - } while (oldpc != pc[tid]->instAddr()); - if (count > 1) { - DPRINTF(Commit, - "PC skip function event, stopping commit\n"); - break; - } - traceOnMacroCommit(tid); + if (drainPending) { + if (pc[tid]->microPC() == 0 && interrupt == NoFault && + !thread[tid]->trapPending) { + // Last architectually committed instruction. + // Squash the pipeline, stall fetch, and use + // drainImminent to disable interrupts + DPRINTF(Drain, "Draining: %i:%s\n", tid, *pc[tid]); + squashAfter(tid, head_inst); + cpu->commitDrained(tid); + drainImminent = true; + } } - // Check if an instruction just enabled interrupts and we've - // previously had an interrupt pending that was not handled - // because interrupts were subsequently disabled before the - // pipeline reached a place to handle the interrupt. In that - // case squash now to make sure the interrupt is handled. - // - // If we don't do this, we might end up in a live lock - // situation. - if (!interrupt && avoidQuiesceLiveLock && - onInstBoundary && cpu->checkInterrupts(0)) - squashAfter(tid, head_inst); - } else { - DPRINTF(Commit, "Unable to commit head instruction PC:%s " - "[tid:%i] [sn:%llu].\n", - head_inst->pcState(), tid ,head_inst->seqNum); - break; + bool onInstBoundary = !head_inst->isMicroop() || + head_inst->isLastMicroop() || + !head_inst->isDelayedCommit(); + + if (onInstBoundary) { + int count = 0; + Addr oldpc; + // Make sure we're not currently updating state while + // handling PC events. 
+ assert(!thread[tid]->noSquashFromTC && + !thread[tid]->trapPending); + do { + oldpc = pc[tid]->instAddr(); + thread[tid]->pcEventQueue.service( + oldpc, thread[tid]->getTC()); + count++; + } while (oldpc != pc[tid]->instAddr()); + if (count > 1) { + DPRINTF(Commit, + "PC skip function event, stopping commit\n"); + break; + } + traceOnMacroCommit(tid); + } + + // Check if an instruction just enabled interrupts and we've + // previously had an interrupt pending that was not handled + // because interrupts were subsequently disabled before the + // pipeline reached a place to handle the interrupt. In that + // case squash now to make sure the interrupt is handled. + // + // If we don't do this, we might end up in a live lock + // situation. + if (!interrupt && avoidQuiesceLiveLock && + onInstBoundary && cpu->checkInterrupts(0)) + squashAfter(tid, head_inst); + } else { + DPRINTF(Commit, "Unable to commit head instruction PC:%s " + "[tid:%i] [sn:%llu].\n", + head_inst->pcState(), tid ,head_inst->seqNum); + break; + } } } } @@ -1596,6 +1646,8 @@ Commit::diffInst(ThreadID tid, const DynInstPtr &inst) { cpu->diffInfo.physEffAddr = inst->physEffAddr; cpu->diffInfo.effSize = inst->effSize; cpu->diffInfo.goldenValue = inst->getGolden(); + cpu->diffInfo.amoOldGoldenValue = inst->getAmoOldGoldenValue(); + cpu->recordCommittedStore(tid, inst); cpu->difftestStep(tid, inst->seqNum); } @@ -1990,6 +2042,13 @@ Commit::squashInflightAndUpdateVersion(ThreadID tid) DPRINTF(Commit, "Squashing in-flight renamed instructions\n"); for (unsigned i_idx = 0; i_idx < fromRename->size; i_idx++) { const DynInstPtr &inst = fromRename->insts[i_idx]; + if (inst->threadNumber != tid) { + DPRINTF(Commit, + "[tid:%i] [sn:%llu] Preserving other-thread in-flight " + "instruction during squash for tid %i\n", + inst->threadNumber, inst->seqNum, tid); + continue; + } DPRINTF(Commit, "[tid:%i] [sn:%llu] Squashing in-flight " "instruction PC %s\n", inst->threadNumber, inst->seqNum, inst->pcState()); diff 
--git a/src/cpu/o3/commit.hh b/src/cpu/o3/commit.hh index 465732ea0e..3c83b610e5 100644 --- a/src/cpu/o3/commit.hh +++ b/src/cpu/o3/commit.hh @@ -196,7 +196,7 @@ class Commit }; std::list branchLog; - uint64_t lastCommitCycle = 0; + uint64_t lastCommitCycle[MaxThreads] = {0}; EventFunctionWrapper stuckCheckEvent; @@ -215,8 +215,6 @@ class Commit /** Returns the name of the Commit. */ std::string name() const; - uint64_t getLastCommitCycle() const { return lastCommitCycle; } - /** Registers probes. */ void regProbePoints(); diff --git a/src/cpu/o3/fetch.cc b/src/cpu/o3/fetch.cc index d2381123ab..f95738bd2c 100644 --- a/src/cpu/o3/fetch.cc +++ b/src/cpu/o3/fetch.cc @@ -592,10 +592,34 @@ Fetch::processMultiCacheLineCompletion(ThreadID tid, PacketPtr pkt) DPRINTF(Fetch, "[tid:%i] Waiting for remaining packets. Completed: %d, Total: %d\n", tid, threads[tid].cacheReq.completedPackets, threads[tid].cacheReq.packets.size()); - if (cacheBlocked && !retryPkt.empty()) { - DPRINTF(Fetch, "[tid:%i] Cache response arrived with queued retries pending; " - "trying one response-driven retry pass\n", tid); - retryPendingIcacheRequests(); + bool waitingOnRetry = false; + for (const auto status : threads[tid].cacheReq.requestStatus) { + if (status == CacheWaitRetry) { + waitingOnRetry = true; + break; + } + } + + if (waitingOnRetry && cacheBlocked && !retryPkt.empty()) { + PacketPtr queuedPkt = retryPkt.front(); + const ThreadID queuedTid = + cpu->contextToThread(queuedPkt->req->contextId()); + const bool sameThreadRetry = queuedTid == tid && + threads[tid].cacheReq.findRequestIndex(queuedPkt->req) != SIZE_MAX; + + if (sameThreadRetry && icachePort.sendTimingReq(queuedPkt)) { + DPRINTF(Fetch, + "[tid:%i] Retrying matching queued I-cache packet %#lx " + "after sibling response\n", + tid, queuedPkt->req->getVaddr()); + updateCacheRequestStatusByRequest(tid, queuedPkt->req, + CacheWaitResponse); + ppFetchRequestSent->notify(queuedPkt->req); + retryPkt.erase(retryPkt.begin()); + if 
(retryPkt.empty()) { + cacheBlocked = false; + } + } } return false; // Return false to indicate we're still waiting @@ -2094,8 +2118,22 @@ Fetch::sendNextCacheRequest(ThreadID tid, const PCStateBase &pc_state) { assert(dbpbtb); const auto &stream = dbpbtb->ftqFetchingTarget(tid); const Addr start_pc = stream.startPC; + const Addr current_pc = pc_state.instAddr(); threads[tid].startPC = start_pc; + if (current_pc < stream.startPC || + current_pc >= stream.predEndPC) { + auto &reset_pc = threads[tid].fetchpc->as(); + reset_pc.pc(stream.startPC); + reset_pc.npc(stream.startPC + 4); + reset_pc.uReset(); + DPRINTF(Fetch, + "[tid:%i] Resetting fetch PC to new FTQ stream start %s " + "(previous PC %#lx outside [%#lx, %#lx))\n", + tid, *threads[tid].fetchpc, current_pc, + stream.startPC, stream.predEndPC); + } + DPRINTF(Fetch, "[tid:%i] Issuing a pipelined I-cache access for new FSQ entry, " "starting at PC %#x (endPC %#x; original PC %s)\n", tid, start_pc, stream.predEndPC, pc_state); diff --git a/src/cpu/o3/lsq.cc b/src/cpu/o3/lsq.cc index 0f3b005a8f..e070f076d0 100644 --- a/src/cpu/o3/lsq.cc +++ b/src/cpu/o3/lsq.cc @@ -387,6 +387,7 @@ LSQ::LSQ(CPU *cpu_ptr, IEW *iew_ptr, const BaseO3CPUParams ¶ms) params.StoreCompletionWidth); thread[tid].init(cpu, iew_ptr, params, this, tid); thread[tid].setDcachePort(&dcachePort); + _storeBufferFlushing[tid] = false; } std::vector store_buffer_entries; @@ -757,18 +758,26 @@ LSQ::processWriteback() if (storeBufferBlocked()) { - // dont offload store to sbuffer when sbuffer is flushing DPRINTF(StoreBuffer, "Store buffer is blocking, skip SQ offload\n"); return; } + std::vector offload_quota(numThreads, 0); std::vector offload_demand(numThreads, 0); std::vector requester_tids; requester_tids.reserve(activeThreads->size()); + uint32_t sbuffer_flush_bitset = 0; + for (ThreadID tid : *activeThreads) { + bool sbuffer_flushing = storeBufferFlushing(tid); + sbuffer_flush_bitset |= (sbuffer_flushing << tid); + } + for (ThreadID tid : 
*activeThreads) { offload_demand[tid] = thread[tid].countStoreBufferOffloadableEntries( maxStoreBufferEntriesAcceptedFromSQPerCycle); - if (offload_demand[tid] != 0) { + // when other thread is flushing sbuffer, stop current thread sq offloading + bool conti = (sbuffer_flush_bitset & ~(1 << tid)) == 0; + if (conti && offload_demand[tid] != 0) { requester_tids.push_back(tid); } } @@ -812,17 +821,20 @@ LSQ::processWriteback() ThreadID tid = *threads++; thread[tid].offloadToStoreBuffer(offload_quota[tid]); } -} -void -LSQ::storeBufferWriteback() -{ - bool can_evict = true; + // If the store buffer is flushing and no entries remain to be sent, + // clear the flushing state to avoid deadlock. if (storeBufferFlushing() && storeBuffer.size() == 0) [[unlikely]] { assert(storeBuffer.unsentSize() == 0); clearStoreBufferFlushing(); cpu->activityThisCycle(); } +} + +void +LSQ::storeBufferWriteback() +{ + bool can_evict = true; // write request will stall one cycle // so 2 cycle send one write request @@ -1536,7 +1548,7 @@ LSQ::hasStoresToWB(ThreadID tid) bool LSQ::flushStores(ThreadID tid) { - _storeBufferFlushing = true; + _storeBufferFlushing[tid] = true; // TODO:high performance shared SMT storebuffer flushing bool t = !hasStoresToWB(tid) && storeBufferEmpty(); return t; diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh index 604df7c0f1..fc2c73a80c 100644 --- a/src/cpu/o3/lsq.hh +++ b/src/cpu/o3/lsq.hh @@ -64,6 +64,7 @@ #include "cpu/inst_seq.hh" #include "cpu/o3/dyn_inst_ptr.hh" #include "cpu/o3/dyn_inst_xsmeta.hh" +#include "cpu/o3/limits.hh" #include "cpu/utils.hh" #include "enums/SMTQueuePolicy.hh" #include "mem/packet.hh" @@ -1081,8 +1082,21 @@ class LSQ bool getDcacheWriteStall() { return dcacheWriteStall; } StoreBuffer &getStoreBuffer() { return storeBuffer; } bool storeBufferEmpty() const { return storeBuffer.size() == 0; } - bool storeBufferFlushing() const { return _storeBufferFlushing; } - void clearStoreBufferFlushing() { _storeBufferFlushing = false; } + bool 
storeBufferFlushing(ThreadID tid) const { return _storeBufferFlushing[tid]; } + bool storeBufferFlushing() const + { + for (auto tid : *activeThreads) { + if (_storeBufferFlushing[tid]) + return true; + } + return false; + } + void clearStoreBufferFlushing(ThreadID tid) { _storeBufferFlushing[tid] = false; } + void clearStoreBufferFlushing() { + for (auto tid : *activeThreads) { + _storeBufferFlushing[tid] = false; + } + } uint32_t getSbufferEvictThreshold() const { return sbufferEvictThreshold; } uint32_t getSbufferEntries() const { return sbufferEntries; } uint64_t getStoreBufferInactiveCycles() const @@ -1171,7 +1185,7 @@ class LSQ const uint64_t storeBufferInactiveThreshold; const uint32_t maxStoreBufferEntriesAcceptedFromSQPerCycle = 2; StoreBuffer storeBuffer; - bool _storeBufferFlushing = false; + bool _storeBufferFlushing[MaxThreads] = {false}; uint64_t storeBufferWritebackInactive = 0; StoreBufferEntry *blockedSbufferEntry = nullptr; ThreadID nextStoreBufferOffloadTid = InvalidThreadID; diff --git a/src/cpu/o3/lsq_unit.cc b/src/cpu/o3/lsq_unit.cc index 9cfc4d791f..6be535e5df 100644 --- a/src/cpu/o3/lsq_unit.cc +++ b/src/cpu/o3/lsq_unit.cc @@ -349,20 +349,47 @@ LSQUnit::completeDataAccess(PacketPtr pkt) if (inst->isLoad() || inst->isAtomic()) { Addr addr = pkt->getAddr(); auto [enable_diff, diff_all_states] = cpu->getDiffAllStates(); + if (system->multiContextDifftest() && enable_diff && + request->_sbufferBypass && + inst->isLoad() && + cpu->goldenMemManager()->inPmem(addr)) { + // A store-forwarded load may legitimately observe a value that + // is newer than the current shared golden memory snapshot. + // Keep the observed value on the instruction so difftest can + // repair the reference state for this hart if needed. 
+ inst->setGolden(pkt->getPtr()); + } if (system->multiContextDifftest() && enable_diff && !request->_sbufferBypass && cpu->goldenMemManager()->inPmem(addr)) { - // check data with golden mem - uint8_t *golden_data = (uint8_t *)cpu->goldenMemManager()->guestToHost(addr); uint8_t *loaded_data = pkt->getPtr(); size_t size = pkt->getSize(); - if (memcmp(golden_data, loaded_data, size) == 0) { - assert(size == inst->effSize); - inst->setGolden(golden_data); + assert(size == inst->effSize); + + if (inst->isAtomic()) { + uint8_t *golden_old = + reinterpret_cast(inst->getAmoOldGoldenValuePtr()); + cpu->goldenMemManager()->readGoldenMem(addr, golden_old, size); + if (memcmp(golden_old, loaded_data, size) != 0) { + panic("[tid:%d] [sn:%llu] Atomic old value error at addr %#lx, " + "size %d. %s\n", + inst->threadNumber, inst->seqNum, addr, size, + goldenDiffStr(loaded_data, golden_old, size).c_str()); + } } else { - panic("Data error at addr %#lx, size %d. %s\n", - addr, size, - goldenDiffStr(loaded_data, golden_data, size).c_str()); + // check data with golden mem + uint8_t *golden_data = + (uint8_t *)cpu->goldenMemManager()->guestToHost(addr); + if (memcmp(golden_data, loaded_data, size) == 0) { + inst->setGolden(golden_data); + } else { + DPRINTF(Diff, + "[tid:%d] [sn:%llu] Load sees value different from " + "current golden memory at addr %#lx, size %d. " + "Treating as concurrent update window. 
%s\n", + inst->threadNumber, inst->seqNum, addr, size, + goldenDiffStr(loaded_data, golden_data, size).c_str()); + } } } } @@ -2016,6 +2043,7 @@ LSQUnit::offloadToStoreBuffer(uint32_t max_entries) { assert(!lsq->storeBufferBlocked()); if (isStoreBlocked) return; + if (max_entries == 0) return; uint32_t accepted_entries = 0; while (storesToWB > 0 && @@ -2527,23 +2555,21 @@ LSQUnit::completeStore(typename StoreQueue::iterator store_idx, bool from_sbuffe request->_size); } else { uint8_t tmp_data[8]; - memset(tmp_data, 0, 8); - memcpy(tmp_data, store_inst->memData, request->_size); + memset(tmp_data, 0, sizeof(tmp_data)); assert(request->req()->getAtomicOpFunctor()); - // read golden memory to get the global latest value before this AMO is executed for further compare - cpu->goldenMemManager()->readGoldenMem(paddr, - store_inst->getAmoOldGoldenValuePtr(), request->_size); + // The AMO response returns the old memory value. Capture it on the + // instruction so commit/difftest can use a per-inst copy under SMT. 
cpu->diffInfo.amoOldGoldenValue = store_inst->getAmoOldGoldenValue(); + memcpy(tmp_data, store_inst->getAmoOldGoldenValuePtr(), request->_size); - // before amo operate on golden memory (*(request->req()->getAtomicOpFunctor()))(tmp_data); - // after amo operate on golden memory DPRINTF(LSQUnit, "AMO writing to golden memory at addr %#x, data %#lx, mask %#x, size %d\n", paddr, *((uint64_t *)(tmp_data)), 0xff, request->_size); cpu->goldenMemManager()->updateGoldenMem(paddr, tmp_data, 0xff, request->_size); + store_inst->setGolden(tmp_data); } } diff --git a/src/cpu/o3/rob.cc b/src/cpu/o3/rob.cc index 4e007804c2..410d7dcfac 100644 --- a/src/cpu/o3/rob.cc +++ b/src/cpu/o3/rob.cc @@ -297,15 +297,23 @@ ROB::countInsts(ThreadID tid) return instList[tid].size(); } +uint32_t +ROB::countInstsOfGroups(ThreadID tid, int groups) +{ + int sum = 0; + auto it = threadGroups[tid].begin(); + for (int i = 0; i < groups && it != threadGroups[tid].end(); i++, it++) { + sum += *it; + } + return sum; +} + uint32_t ROB::countInstsOfGroups(int groups) { int sum = 0; for (ThreadID tid = 0; tid < numThreads; tid++) { - auto it = threadGroups[tid].begin(); - for (int i = 0; i < groups && it != threadGroups[tid].end(); i++, it++) { - sum += *it; - } + sum += countInstsOfGroups(tid, groups); } return sum; } diff --git a/src/cpu/o3/rob.hh b/src/cpu/o3/rob.hh index d9b3e9999b..1fdcbf0857 100644 --- a/src/cpu/o3/rob.hh +++ b/src/cpu/o3/rob.hh @@ -256,6 +256,7 @@ class ROB return sum; } + uint32_t countInstsOfGroups(ThreadID tid, int groups); uint32_t countInstsOfGroups(int groups); bool (ROB::*allocateNewGroup)(const DynInstPtr inst, ThreadID tid); diff --git a/src/cpu/pred/BranchPredictor.py b/src/cpu/pred/BranchPredictor.py index f25b77be68..3171928e1b 100644 --- a/src/cpu/pred/BranchPredictor.py +++ b/src/cpu/pred/BranchPredictor.py @@ -1021,6 +1021,7 @@ class BTBRAS(TimedBaseBTBPredictor): cxx_class = 'gem5::branch_prediction::btb_pred::BTBRAS' cxx_header = 'cpu/pred/btb/ras.hh' + numThreads = 
Param.Unsigned(Parent.numThreads, "Number of threads") numEntries = Param.Unsigned(32, "Number of entries in the RAS") ctrWidth = Param.Unsigned(8, "Width of the counter") numInflightEntries = Param.Unsigned(384, "Number of inflight entries") diff --git a/src/cpu/pred/btb/decoupled_bpred.cc b/src/cpu/pred/btb/decoupled_bpred.cc index 2e272047eb..bb87772263 100644 --- a/src/cpu/pred/btb/decoupled_bpred.cc +++ b/src/cpu/pred/btb/decoupled_bpred.cc @@ -212,11 +212,12 @@ DecoupledBPUWithBTB::requestNewPrediction(ThreadID tid) DPRINTF(Override, "Requesting new prediction for PC %#lx\n", thread.s0PC); - - // Initialize prediction state for each stage + // Reset all stage-local prediction fields before components fill them. + clearPreds(tid); for (int i = 0; i < numStages; i++) { predsOfEachStage[i].tid = tid; predsOfEachStage[i].bbStart = thread.s0PC; + predsOfEachStage[i].predSource = i; } // Query each predictor component with current PC and history diff --git a/src/cpu/pred/btb/decoupled_bpred.hh b/src/cpu/pred/btb/decoupled_bpred.hh index 38adad4115..2552ce9e44 100644 --- a/src/cpu/pred/btb/decoupled_bpred.hh +++ b/src/cpu/pred/btb/decoupled_bpred.hh @@ -187,10 +187,9 @@ class DecoupledBPUWithBTB : public BPredUnit void generateFinalPredAndCreateBubbles(ThreadID tid); void clearPreds(ThreadID tid) { - for (auto &stagePred : threads[tid].predsOfEachStage) { - stagePred.condTakens.clear(); - stagePred.indirectTargets.clear(); - stagePred.btbEntries.clear(); + for (int i = 0; i < threads[tid].predsOfEachStage.size(); ++i) { + threads[tid].predsOfEachStage[i] = FullBTBPrediction(); + threads[tid].predsOfEachStage[i].predSource = i; } } diff --git a/src/cpu/pred/btb/ras.cc b/src/cpu/pred/btb/ras.cc index 7f279bdb8e..8dd5b80aea 100644 --- a/src/cpu/pred/btb/ras.cc +++ b/src/cpu/pred/btb/ras.cc @@ -21,28 +21,13 @@ namespace btb_pred { : TimedBaseBTBPredictor(), numEntries(numEntries), ctrWidth(ctrWidth), - numInflightEntries(numInflightEntries) + 
numInflightEntries(numInflightEntries), + maxCtr((1 << ctrWidth) - 1), + numThreads(1), + threadStates(numThreads) { - // Initialize RAS state - ssp = 0; - nsp = 0; - sctr = 0; - stack.resize(numEntries); - maxCtr = (1 << ctrWidth) - 1; - TOSW = 0; - TOSR = 0; - inflightPtrDec(TOSR); - BOS = 0; - inflightStack.resize(numInflightEntries); - - // Initialize stack entries - for (auto &entry : stack) { - entry.data.ctr = 0; - entry.data.retAddr = 0x80000000L; - } - for (auto &entry : inflightStack) { - entry.data.ctr = 0; - entry.data.retAddr = 0x80000000L; + for (auto &state : threadStates) { + initThreadState(state); } } #else @@ -51,49 +36,61 @@ namespace btb_pred { : TimedBaseBTBPredictor(p), numEntries(p.numEntries), ctrWidth(p.ctrWidth), - numInflightEntries(p.numInflightEntries), - rasStats(this) + numInflightEntries(p.numInflightEntries), + maxCtr((1 << ctrWidth) - 1), + numThreads(p.numThreads), + threadStates(numThreads), + rasStats(this) { - // Initialize RAS state - ssp = 0; - nsp = 0; - sctr = 0; - stack.resize(numEntries); - maxCtr = (1 << ctrWidth) - 1; - TOSW = 0; - TOSR = 0; - inflightPtrDec(TOSR); - BOS = 0; - inflightStack.resize(numInflightEntries); - - // Initialize stack entries - for (auto &entry : stack) { - entry.data.ctr = 0; - entry.data.retAddr = 0x80000000L; - } - for (auto &entry : inflightStack) { - entry.data.ctr = 0; - entry.data.retAddr = 0x80000000L; + for (auto &state : threadStates) { + initThreadState(state); } } #endif void -BTBRAS::checkCorrectness() { +BTBRAS::initThreadState(ThreadRASState &state) +{ + state.TOSW = 0; + state.TOSR = 0; + inflightPtrDec(state.TOSR); + state.BOS = 0; + state.ssp = 0; + state.nsp = 0; + state.sctr = 0; + state.meta.reset(); + + state.stack.resize(numEntries); + state.inflightStack.resize(numInflightEntries); + + for (auto &entry : state.stack) { + entry.data.ctr = 0; + entry.data.retAddr = 0x80000000L; + } + for (auto &entry : state.inflightStack) { + entry.data.ctr = 0; + entry.data.retAddr = 
0x80000000L; + entry.nos = 0; + } +} + +void +BTBRAS::checkCorrectness(ThreadID tid) { + auto &state = threadStates[tid]; /* - auto tosr = TOSR; - int checkssp = ssp; - while (inflightInRange(tosr)) { - if (!inflightStack[tosr].data.ctr) { + auto tosr = state.TOSR; + int checkssp = state.ssp; + while (inflightInRange(state, tosr)) { + if (!state.inflightStack[tosr].data.ctr) { checkssp = (checkssp - 1 + numEntries) % numEntries; } else { // just dec sctr, fixme here } - tosr = inflightStack[tosr].nos; + tosr = state.inflightStack[tosr].nos; } - if (checkssp != (nsp + numEntries - 1) % numEntries) { + if (checkssp != (state.nsp + numEntries - 1) % numEntries) { DPRINTF(RAS, "NSP and SSP check failed\n"); - printStack("checkCorrectness"); + printStack("checkCorrectness", tid); }*/ } @@ -102,15 +99,19 @@ BTBRAS::putPCHistory(Addr startAddr, const boost::dynamic_bitset<> &history, std::vector &stagePreds) { assert(getDelay() < stagePreds.size()); - meta = std::make_shared(); + const ThreadID tid = stagePreds.back().tid; + assert(tid < numThreads); + auto &state = threadStates[tid]; + state.meta = std::make_shared(); DPRINTFR(RAS, "putPC startAddr %lx", startAddr); - // checkCorrectness(); + // checkCorrectness(tid); + auto top = getTop_meta(tid); for (int i = getDelay(); i < stagePreds.size(); i++) { - stagePreds[i].returnTarget = getTop_meta().retAddr; // stack[sp].retAddr; + stagePreds[i].returnTarget = top.retAddr; } /* if (stagePreds.back().btbEntry.slots[0].isCall || stagePreds.back().btbEntry.slots[0].isReturn || stagePreds.back().btbEntry.slots[1].isCall || stagePreds.back().btbEntry.slots[1].isReturn) { - printStack("putPCHistory"); + printStack("putPCHistory", tid); } */ } @@ -118,13 +119,19 @@ BTBRAS::putPCHistory(Addr startAddr, const boost::dynamic_bitset<> &history, std::shared_ptr BTBRAS::getPredictionMeta(ThreadID tid) { - (void)tid; - return meta; + if (tid >= threadStates.size()) { + return nullptr; + } + return threadStates[tid].meta; } void 
BTBRAS::specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPrediction &pred) { + const ThreadID tid = pred.tid; + assert(tid < numThreads); + auto &state = threadStates[tid]; + assert(state.meta); // do push & pops on prediction // pred.returnTarget = stack[sp].retAddr; auto takenEntry = pred.getTakenEntry(); @@ -132,11 +139,11 @@ BTBRAS::specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPrediction if (takenEntry.isCall) { Addr retAddr = takenEntry.pc + takenEntry.size; - push(retAddr); + push(tid, retAddr); } if (takenEntry.isReturn) { // do pop - pop(); + pop(tid); } if (takenEntry.isCall) { DPRINTFR(RAS, "IsCall spec PC %lx\n", takenEntry.pc); @@ -146,36 +153,39 @@ BTBRAS::specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPrediction } if (takenEntry.isCall || takenEntry.isReturn) - printStack("after specUpdateHist"); - DPRINTFR(RAS, "meta TOSR %d TOSW %d\n", meta->TOSR, meta->TOSW); + printStack("after specUpdateHist", tid); + DPRINTFR(RAS, "meta TOSR %d TOSW %d\n", state.meta->TOSR, state.meta->TOSW); } void BTBRAS::recoverHist(const boost::dynamic_bitset<> &history, const FetchTarget &entry, int shamt, bool cond_taken) { + const ThreadID tid = entry.tid; + assert(tid < numThreads); + auto &state = threadStates[tid]; auto takenEntry = entry.exeBranchInfo; /* if (takenEntry.isCall || takenEntry.isReturn) { - printStack("before recoverHist"); + printStack("before recoverHist", tid); }*/ // recover sp and tos first auto meta_ptr = std::static_pointer_cast(entry.predMetas[getComponentIdx()]); DPRINTF(RAS, "recover called, meta TOSR %d TOSW %d ssp %d sctr %u entry PC %lx end PC %lx\n", meta_ptr->TOSR, meta_ptr->TOSW, meta_ptr->ssp, meta_ptr->sctr, entry.startPC, entry.predEndPC); - TOSR = meta_ptr->TOSR; - TOSW = meta_ptr->TOSW; - ssp = meta_ptr->ssp; - sctr = meta_ptr->sctr; + state.TOSR = meta_ptr->TOSR; + state.TOSW = meta_ptr->TOSW; + state.ssp = meta_ptr->ssp; + state.sctr = meta_ptr->sctr; Addr retAddr = takenEntry.pc + 
takenEntry.size; // do push & pops on control squash if (entry.exeTaken) { if (takenEntry.isCall) { - push(retAddr); + push(tid, retAddr); } if (takenEntry.isReturn) { - pop(); + pop(tid); //TOSW = (TOSR + 1) % numInflightEntries; } } @@ -187,7 +197,7 @@ BTBRAS::recoverHist(const boost::dynamic_bitset<> &history, const FetchTarget &e DPRINTF(RAS, "IsRet expect target %lx, preded %lx, pred taken %d pred target %lx\n", takenEntry.target, meta_ptr->target, entry.predTaken, entry.predBranchInfo.target); } - printStack("after recoverHist"); + printStack("after recoverHist", tid); } } @@ -195,83 +205,89 @@ BTBRAS::recoverHist(const boost::dynamic_bitset<> &history, const FetchTarget &e void BTBRAS::update(const FetchTarget &entry) { + const ThreadID tid = entry.tid; + assert(tid < numThreads); + auto &state = threadStates[tid]; auto meta_ptr = std::static_pointer_cast(entry.predMetas[getComponentIdx()]); auto takenEntry = entry.exeBranchInfo; if (entry.exeTaken) { - if (meta_ptr->ssp != nsp || meta_ptr->sctr != stack[nsp].data.ctr) { + if (meta_ptr->ssp != state.nsp || meta_ptr->sctr != state.stack[state.nsp].data.ctr) { DPRINTF(RAS, "ssp and nsp mismatch, recovering, ssp = %d, sctr = %d, nsp = %d, nctr = %d\n", - meta_ptr->ssp, meta_ptr->sctr, nsp, stack[nsp].data.ctr); - nsp = meta_ptr->ssp; + meta_ptr->ssp, meta_ptr->sctr, state.nsp, state.stack[state.nsp].data.ctr); + state.nsp = meta_ptr->ssp; } else DPRINTF(RAS, "ssp and nsp match, ssp = %d, sctr = %d, nsp = %d, nctr = %d\n", - meta_ptr->ssp, meta_ptr->sctr, nsp, stack[nsp].data.ctr); + meta_ptr->ssp, meta_ptr->sctr, state.nsp, state.stack[state.nsp].data.ctr); if (takenEntry.isCall) { DPRINTF(RAS, "real update call BTB hit %d meta TOSR %d TOSW %d\n entry PC %lx", entry.isHit, meta_ptr->TOSR, meta_ptr->TOSW, entry.startPC); Addr retAddr = takenEntry.pc + takenEntry.size; - push_stack(retAddr); - BOS = inflightPtrPlus1(meta_ptr->TOSW); + push_stack(tid, retAddr); + state.BOS = inflightPtrPlus1(meta_ptr->TOSW); } if 
(takenEntry.isReturn) { DPRINTF(RAS, "update ret entry PC %lx\n", entry.startPC); - pop_stack(); + pop_stack(tid); } } if (takenEntry.isCall || takenEntry.isReturn) { - printStack("after update(commit)"); + printStack("after update(commit)", tid); } } void -BTBRAS::push_stack(Addr retAddr) +BTBRAS::push_stack(ThreadID tid, Addr retAddr) { - auto tos = stack[nsp]; + auto &state = threadStates[tid]; + auto tos = state.stack[state.nsp]; if (tos.data.retAddr == retAddr && tos.data.ctr < maxCtr) { - stack[nsp].data.ctr++; + state.stack[state.nsp].data.ctr++; } else { // push new entry - ptrInc(nsp); - stack[nsp].data.retAddr = retAddr; - stack[nsp].data.ctr = 0; + ptrInc(state.nsp); + state.stack[state.nsp].data.retAddr = retAddr; + state.stack[state.nsp].data.ctr = 0; } // ++ndepth; } void -BTBRAS::push(Addr retAddr) +BTBRAS::push(ThreadID tid, Addr retAddr) { + auto &state = threadStates[tid]; rasStats.Pushes++; DPRINTF(RAS, "doing push "); // update ssp and sctr first // meta has recorded their old value - auto topAddr = getTop(); - if (retAddr == topAddr.retAddr && sctr < maxCtr) { - sctr++; + auto topAddr = getTop(tid); + if (retAddr == topAddr.retAddr && state.sctr < maxCtr) { + state.sctr++; } else { - ptrInc(ssp); - sctr = 0; + ptrInc(state.ssp); + state.sctr = 0; // do not update non-spec stack here } // push will always enter inflight queue RASInflightEntry t; t.data.retAddr = retAddr; - t.data.ctr = sctr; - t.nos = TOSR; - inflightStack[TOSW] = t; - TOSR = TOSW; - inflightPtrInc(TOSW); + t.data.ctr = state.sctr; + t.nos = state.TOSR; + state.inflightStack[state.TOSW] = t; + state.TOSR = state.TOSW; + inflightPtrInc(state.TOSW); } void -BTBRAS::pop_stack() +BTBRAS::pop_stack(ThreadID tid) { + auto &state = threadStates[tid]; //if (ndepth) { - auto tos = stack[nsp]; + auto tos = state.stack[state.nsp]; if (tos.data.ctr > 0) { - stack[nsp].data.ctr--; + state.stack[state.nsp].data.ctr--; } else { - ptrDec(nsp); + ptrDec(state.nsp); } //--ndepth; //} else { @@ 
-281,30 +297,31 @@ BTBRAS::pop_stack() } void -BTBRAS::pop() +BTBRAS::pop(ThreadID tid) { + auto &state = threadStates[tid]; // DPRINTFR(RAS, "doing pop ndepth = %d", ndepth); rasStats.Pops++; // pop may need to deal with committed stack - if (inflightInRange(TOSR)) { - DPRINTF(RAS, "Select from inflight, addr %lx\n", inflightStack[TOSR].data.retAddr); - TOSR = inflightStack[TOSR].nos; - if (sctr > 0) { - sctr--; + if (inflightInRange(state, state.TOSR)) { + DPRINTF(RAS, "Select from inflight, addr %lx\n", state.inflightStack[state.TOSR].data.retAddr); + state.TOSR = state.inflightStack[state.TOSR].nos; + if (state.sctr > 0) { + state.sctr--; } else { - ptrDec(ssp); - auto newTop = getTop(); - sctr = newTop.ctr; + ptrDec(state.ssp); + auto newTop = getTop(tid); + state.sctr = newTop.ctr; } } else /*if (ndepth)*/ { // TOSR not valid, operate on committed stack DPRINTF(RAS, "in committed range\n"); - if (sctr > 0) { - sctr--; + if (state.sctr > 0) { + state.sctr--; } else { - ptrDec(ssp); - auto newTop = getTop(); - sctr = newTop.ctr; + ptrDec(state.ssp); + auto newTop = getTop(tid); + state.sctr = newTop.ctr; } } //else { @@ -352,12 +369,12 @@ BTBRAS::inflightPtrPlus1(int ptr) { } bool -BTBRAS::inflightInRange(int &ptr) +BTBRAS::inflightInRange(const ThreadRASState &state, int ptr) { - if (TOSW > BOS) { - return ptr >= BOS && ptr < TOSW; - } else if (TOSW < BOS) { - return ptr < TOSW || ptr >= BOS; + if (state.TOSW > state.BOS) { + return ptr >= state.BOS && ptr < state.TOSW; + } else if (state.TOSW < state.BOS) { + return ptr < state.TOSW || ptr >= state.BOS; } else { // empty inflight queue return false; @@ -365,64 +382,79 @@ BTBRAS::inflightInRange(int &ptr) } BTBRAS::RASEssential -BTBRAS::getTop() +BTBRAS::getTop(ThreadID tid) { + auto &state = threadStates[tid]; // results may come from two sources: inflight queue and committed stack - if (inflightInRange(TOSR)) { + if (inflightInRange(state, state.TOSR)) { // result come from inflight queue - DPRINTF(RAS, 
"Select from inflight, addr %lx\n", inflightStack[TOSR].data.retAddr); + DPRINTF(RAS, "Select from inflight, addr %lx\n", + state.inflightStack[state.TOSR].data.retAddr); // additional check: if nos is out of bound, check if commit stack top == inflight[nos] /* - if (!inflightInRange(inflightStack[TOSR].nos)) { - auto top = stack[nsp]; - if (top.data.retAddr != inflightStack[inflightStack[TOSR].nos].data.retAddr || top.data.ctr != inflightStack[inflightStack[TOSR].nos].data.ctr) { + if (!inflightInRange(state, state.inflightStack[state.TOSR].nos)) { + auto top = state.stack[state.nsp]; + if (top.data.retAddr != + state.inflightStack[ + state.inflightStack[state.TOSR].nos].data.retAddr || + top.data.ctr != + state.inflightStack[ + state.inflightStack[state.TOSR].nos].data.ctr) { // inflight[nos] is not the same as stack[nsp] DPRINTF(RAS, "Error: inflight[nos] is not the same as stack[nsp]\n"); - printStack("Error case stack dump"); + printStack("Error case stack dump", tid); } }*/ - return inflightStack[TOSR].data; + return state.inflightStack[state.TOSR].data; } else { // result come from commit queue - DPRINTF(RAS, "Select from stack, addr %lx\n", stack[ssp].data.retAddr); - return stack[ssp].data; + DPRINTF(RAS, "Select from stack, addr %lx\n", state.stack[state.ssp].data.retAddr); + return state.stack[state.ssp].data; } } BTBRAS::RASEssential -BTBRAS::getTop_meta() { +BTBRAS::getTop_meta(ThreadID tid) { + auto &state = threadStates[tid]; + assert(state.meta); // results may come from two sources: inflight queue and committed stack - if (inflightInRange(TOSR)) { + if (inflightInRange(state, state.TOSR)) { // result come from inflight queue - DPRINTF(RAS, "Select from inflight, addr %lx\n", inflightStack[TOSR].data.retAddr); - meta->ssp = ssp; - meta->sctr = sctr; - meta->TOSR = TOSR; - meta->TOSW = TOSW; - meta->target = inflightStack[TOSR].data.retAddr; + DPRINTF(RAS, "Select from inflight, addr %lx\n", + state.inflightStack[state.TOSR].data.retAddr); + 
state.meta->ssp = state.ssp; + state.meta->sctr = state.sctr; + state.meta->TOSR = state.TOSR; + state.meta->TOSW = state.TOSW; + state.meta->target = state.inflightStack[state.TOSR].data.retAddr; // additional check: if nos is out of bound, check if commit stack top == inflight[nos] /* - if (!inflightInRange(inflightStack[TOSR].nos)) { - auto top = stack[nsp]; - if (top.data.retAddr != inflightStack[inflightStack[TOSR].nos].data.retAddr || top.data.ctr != inflightStack[inflightStack[TOSR].nos].data.ctr) { + if (!inflightInRange(state, state.inflightStack[state.TOSR].nos)) { + auto top = state.stack[state.nsp]; + if (top.data.retAddr != + state.inflightStack[ + state.inflightStack[state.TOSR].nos].data.retAddr || + top.data.ctr != + state.inflightStack[ + state.inflightStack[state.TOSR].nos].data.ctr) { // inflight[nos] is not the same as stack[nsp] DPRINTF(RAS, "Error: inflight[nos] is not the same as stack[nsp]\n"); - printStack("Error case stack dump"); + printStack("Error case stack dump", tid); } }*/ - return inflightStack[TOSR].data; + return state.inflightStack[state.TOSR].data; } else { // result come from commit queue - meta->ssp = ssp; - meta->sctr = sctr; - meta->TOSR = TOSR; - meta->TOSW = TOSW; - meta->target = stack[ssp].data.retAddr; - DPRINTF(RAS, "Select from stack, addr %lx\n", stack[ssp].data.retAddr); - return stack[ssp].data; + state.meta->ssp = state.ssp; + state.meta->sctr = state.sctr; + state.meta->TOSR = state.TOSR; + state.meta->TOSW = state.TOSW; + state.meta->target = state.stack[state.ssp].data.retAddr; + DPRINTF(RAS, "Select from stack, addr %lx\n", state.stack[state.ssp].data.retAddr); + return state.stack[state.ssp].data; } } diff --git a/src/cpu/pred/btb/ras.hh b/src/cpu/pred/btb/ras.hh index b0b31c6d94..19bb1f0e15 100644 --- a/src/cpu/pred/btb/ras.hh +++ b/src/cpu/pred/btb/ras.hh @@ -112,14 +112,28 @@ namespace btb_pred { Addr getTopAddrFromMetas(const FetchTarget &stream); private: + struct ThreadRASState + { + int TOSW = 0; // 
inflight pointer to the write top of stack + int TOSR = 0; // inflight pointer to the read top of stack + int BOS = 0; // inflight pointer to the bottom of stack + int ssp = 0; // speculative stack pointer + int nsp = 0; // committed stack pointer + int sctr = 0; + std::vector stack; + std::vector inflightStack; + std::shared_ptr meta; + }; - void push(Addr retAddr); + void initThreadState(ThreadRASState &state); - void pop(); + void push(ThreadID tid, Addr retAddr); - void push_stack(Addr retAddr); - - void pop_stack(); + void pop(ThreadID tid); + + void push_stack(ThreadID tid, Addr retAddr); + + void pop_stack(ThreadID tid); void ptrInc(int &ptr); @@ -129,38 +143,43 @@ namespace btb_pred { void inflightPtrDec(int &ptr); - bool inflightInRange(int &ptr); + bool inflightInRange(const ThreadRASState &state, int ptr); int inflightPtrPlus1(int ptr); - void checkCorrectness(); + void checkCorrectness(ThreadID tid); - RASEssential getTop(); + RASEssential getTop(ThreadID tid); - RASEssential getTop_meta(); + RASEssential getTop_meta(ThreadID tid); - void printStack(const char *when) { - DPRINTF(RAS, "printStack when %s: \n", when); + void printStack(const char *when, ThreadID tid) { + auto &state = threadStates[tid]; + DPRINTF(RAS, "[tid:%u] printStack when %s: \n", tid, when); for (int i = 0; i < numEntries; i++) { - DPRINTFR(RAS, "entry [%d], retAddr %#lx, ctr %d", i, stack[i].data.retAddr, stack[i].data.ctr); - if (ssp == i) { + DPRINTFR(RAS, "entry [%d], retAddr %#lx, ctr %d", i, + state.stack[i].data.retAddr, state.stack[i].data.ctr); + if (state.ssp == i) { DPRINTFR(RAS, " <-- SSP"); } - if (nsp == i) { + if (state.nsp == i) { DPRINTFR(RAS, " <-- NSP"); } DPRINTFR(RAS, "\n"); } DPRINTFR(RAS, "non-volatile stack:\n"); for (int i = 0; i < numInflightEntries; i++) { - DPRINTFR(RAS, "entry [%d] retAddr %#lx, ctr %u nos %d", i, inflightStack[i].data.retAddr, inflightStack[i].data.ctr, inflightStack[i].nos); - if (TOSW == i) { + DPRINTFR(RAS, "entry [%d] retAddr %#lx, 
ctr %u nos %d", i, + state.inflightStack[i].data.retAddr, + state.inflightStack[i].data.ctr, + state.inflightStack[i].nos); + if (state.TOSW == i) { DPRINTFR(RAS, " <-- TOSW"); } - if (TOSR == i) { + if (state.TOSR == i) { DPRINTFR(RAS, " <-- TOSR"); } - if (BOS == i) { + if (state.BOS == i) { DPRINTFR(RAS, " <-- BOS"); } DPRINTFR(RAS, "\n"); @@ -190,27 +209,11 @@ namespace btb_pred { unsigned numInflightEntries; - int TOSW; // inflight pointer to the write top of stack - - int TOSR; // inflight pointer to the read top of stack - - int BOS; // inflight pointer to the bottom of stack - int maxCtr; - int ssp; // spec sp - - int nsp; // non-spec sp - - int sctr; - - //int ndepth; - - std::vector stack; - - std::vector inflightStack; + unsigned numThreads; - std::shared_ptr meta; + std::vector threadStates; #ifdef UNIT_TEST typedef uint64_t Scalar; diff --git a/src/dev/riscv/HartCtrl.py b/src/dev/riscv/HartCtrl.py new file mode 100644 index 0000000000..242c10cccd --- /dev/null +++ b/src/dev/riscv/HartCtrl.py @@ -0,0 +1,13 @@ +from m5.params import * +from m5.proxy import * + +from m5.objects.Device import BasicPioDevice + + +class HartCtrl(BasicPioDevice): + type = 'HartCtrl' + cxx_header = "dev/riscv/hart_ctrl.hh" + cxx_class = 'gem5::HartCtrl' + pio_addr = 0x39001000 + pio_size = Param.Addr(0x1000, "Hart control register space size") + num_threads = Param.Int("Number of threads in the system.") diff --git a/src/dev/riscv/SConscript b/src/dev/riscv/SConscript index 15bf707400..267399e9c0 100755 --- a/src/dev/riscv/SConscript +++ b/src/dev/riscv/SConscript @@ -34,6 +34,7 @@ SimObject('HiFive.py', sim_objects=['HiFive', 'GenericRiscvPciHost'], SimObject('LupV.py', sim_objects=['LupV'], tags='riscv isa') SimObject('Clint.py', sim_objects=['Clint'], tags='riscv isa') SimObject('Lint.py', sim_objects=['Lint'], tags='riscv isa') +SimObject('HartCtrl.py', sim_objects=['HartCtrl'], tags='riscv isa') SimObject('PlicDevice.py', sim_objects=['PlicIntDevice'], tags='riscv isa') 
SimObject('Plic.py', sim_objects=['Plic'], tags='riscv isa') SimObject('RTC.py', sim_objects=['RiscvRTC'], tags='riscv isa') @@ -55,6 +56,7 @@ Source('hifive.cc', tags='riscv isa') Source('lupv.cc', tags='riscv isa') Source('clint.cc', tags='riscv isa') Source('lint.cc', tags='riscv isa') +Source('hart_ctrl.cc', tags='riscv isa') Source('plic_device.cc', tags='riscv isa') Source('plic.cc', tags='riscv isa') Source('rtc.cc', tags='riscv isa') diff --git a/src/dev/riscv/hart_ctrl.cc b/src/dev/riscv/hart_ctrl.cc new file mode 100644 index 0000000000..b0afe6c8a9 --- /dev/null +++ b/src/dev/riscv/hart_ctrl.cc @@ -0,0 +1,98 @@ +#include "dev/riscv/hart_ctrl.hh" + +#include "cpu/thread_context.hh" +#include "mem/packet_access.hh" +#include "sim/system.hh" + +namespace gem5 +{ + +HartCtrl::HartCtrl(const Params &p) + : BasicPioDevice(p, p.pio_size), + hartResetState(p.num_threads, 1) +{ + if (!hartResetState.empty()) { + // Hart 0 is the boot hart and is considered released by default. + hartResetState[0] = 0; + } +} + +Tick +HartCtrl::read(PacketPtr pkt) +{ + assert(pkt->getAddr() >= pioAddr && pkt->getAddr() < pioAddr + pioSize); + assert(pkt->getSize() > 0 && pkt->getSize() <= sizeof(uint64_t)); + + const Addr offset = pkt->getAddr() - pioAddr; + panic_if(offset % sizeof(uint64_t) != 0, + "HartCtrl only supports 64-bit aligned accesses: addr=%#lx", + pkt->getAddr()); + + const ThreadID tid = offset / sizeof(uint64_t); + panic_if(tid >= hartResetState.size(), + "HartCtrl access out of range: tid=%u addr=%#lx", + tid, pkt->getAddr()); + + pkt->setLE(hartResetState[tid]); + pkt->makeAtomicResponse(); + return pioDelay; +} + +Tick +HartCtrl::write(PacketPtr pkt) +{ + assert(pkt->getAddr() >= pioAddr && pkt->getAddr() < pioAddr + pioSize); + assert(pkt->getSize() > 0 && pkt->getSize() <= sizeof(uint64_t)); + + const Addr offset = pkt->getAddr() - pioAddr; + panic_if(offset % sizeof(uint64_t) != 0, + "HartCtrl only supports 64-bit aligned accesses: addr=%#lx", + 
pkt->getAddr()); + + const ThreadID tid = offset / sizeof(uint64_t); + panic_if(tid >= hartResetState.size(), + "HartCtrl access out of range: tid=%u addr=%#lx", + tid, pkt->getAddr()); + + uint64_t value = 0; + switch (pkt->getSize()) { + case sizeof(uint8_t): + value = pkt->getLE(); + break; + case sizeof(uint16_t): + value = pkt->getLE(); + break; + case sizeof(uint32_t): + value = pkt->getLE(); + break; + case sizeof(uint64_t): + value = pkt->getLE(); + break; + default: + panic("Unsupported HartCtrl write size %u\n", pkt->getSize()); + } + + hartResetState[tid] = value; + + if (value == 0) { + tryWakeHart(tid); + } + + pkt->makeAtomicResponse(); + return pioDelay; +} + +void +HartCtrl::tryWakeHart(ThreadID tid) +{ + panic_if(tid >= sys->threads.size(), + "HartCtrl wake target %u out of system thread range %zu", + tid, sys->threads.size()); + + auto *tc = sys->threads[tid]; + panic_if(!tc, "HartCtrl target %u has no thread context", tid); + + tc->activate(); +} + +} // namespace gem5 diff --git a/src/dev/riscv/hart_ctrl.hh b/src/dev/riscv/hart_ctrl.hh new file mode 100644 index 0000000000..5fe47306f6 --- /dev/null +++ b/src/dev/riscv/hart_ctrl.hh @@ -0,0 +1,33 @@ +// +// Created for Xiangshan bare-metal hart control MMIO. 
+// + +#ifndef GEM5_HART_CTRL_HH +#define GEM5_HART_CTRL_HH + +#include + +#include "dev/io_device.hh" +#include "params/HartCtrl.hh" + +namespace gem5 +{ + +class HartCtrl : public BasicPioDevice +{ + public: + typedef HartCtrlParams Params; + explicit HartCtrl(const Params &p); + + Tick read(PacketPtr pkt) override; + Tick write(PacketPtr pkt) override; + + private: + void tryWakeHart(ThreadID tid); + + std::vector hartResetState; +}; + +} // namespace gem5 + +#endif // GEM5_HART_CTRL_HH From 76939fc592731101b6bd34df00989f64db7e0e09 Mon Sep 17 00:00:00 2001 From: tastynoob <934348725@qq.com> Date: Fri, 27 Mar 2026 18:31:54 +0800 Subject: [PATCH 06/38] cpu-o3: fix smt shared sbuffer Change-Id: Ifbeb947f5f27ebdc9dc39dcfe0172eaa308f8e6f --- src/cpu/base.cc | 74 +++++-- src/cpu/base.hh | 11 + src/cpu/o3/commit.cc | 16 +- src/cpu/o3/iew.hh | 4 + src/cpu/o3/lsq.cc | 477 ++++++++++++++++++++++++++++++++++++++--- src/cpu/o3/lsq.hh | 67 +++++- src/cpu/o3/lsq_unit.cc | 383 ++++++++++++++++++++++++++++----- src/cpu/o3/lsq_unit.hh | 14 +- 8 files changed, 929 insertions(+), 117 deletions(-) diff --git a/src/cpu/base.cc b/src/cpu/base.cc index 68808f3b3a..264e17bf4d 100644 --- a/src/cpu/base.cc +++ b/src/cpu/base.cc @@ -211,6 +211,7 @@ BaseCPU::BaseCPU(const Params &p, bool is_checker) diffAllStates.resize(numThreads); recentCommittedStores.resize(numThreads); + syncVisibleStoreReplayArmed.resize(numThreads, false); if (enableDifftest) { assert(params().difftest_ref_so.length() > 2); for (ThreadID tid = 0; tid < numThreads; ++tid) { @@ -1484,12 +1485,23 @@ BaseCPU::diffWithNEMU(ThreadID tid, InstSeqNum seq) if (system->multiContextDifftest() && (diffInfo.inst->isLoad() || diffInfo.inst->isAtomic()) && _goldenMemManager->inPmem(diffInfo.physEffAddr)) { - warn("Difference on %s instr found in multicore mode, check in golden memory\n", - diffInfo.inst->isLoad() ? 
"load" : "amo"); - uint8_t *golden_ptr = diffInfo.goldenValue; + DPRINTF(Diff, + "Difference on %s instr found in multicore mode, " + "check in golden memory\n", + diffInfo.inst->isLoad() ? "load" : "amo"); + uint8_t current_golden_data[16] = {}; + panic_if(diffInfo.effSize > sizeof(current_golden_data), + "Unexpected large mem diff size: %u\n", + diffInfo.effSize); + _goldenMemManager->readGoldenMem(diffInfo.physEffAddr, + current_golden_data, + diffInfo.effSize); + uint8_t *golden_ptr = current_golden_data; + uint8_t *exec_golden_ptr = diffInfo.goldenValue; const RecentCommittedStore *matched_recent_store = nullptr; if (diffInfo.inst->isLoad()) { - const auto &recent_history = recentCommittedStores.at(tid); + const auto &recent_history = + recentCommittedStores.at(tid); for (auto it = recent_history.rbegin(); it != recent_history.rend(); ++it) { if (!it->valid || @@ -1506,21 +1518,39 @@ BaseCPU::diffWithNEMU(ThreadID tid, InstSeqNum seq) } } } + auto sync_reg = [&]() { + diffAllStates->referenceRegFile[dest_tag] = gem5_val; + diffAllStates->proxy->regcpy( + &(diffAllStates->referenceRegFile), DUT_TO_REF); + }; - // a lambda function to sync memory and register from golden results to ref + // Sync both memory and register when the value is already + // globally visible in golden memory. auto sync_mem_reg = [&](const uint8_t *mem_src) { diffAllStates->proxy->memcpy(diffInfo.physEffAddr, const_cast(mem_src), diffInfo.effSize, DIFFTEST_TO_REF); - diffAllStates->referenceRegFile[dest_tag] = gem5_val; - diffAllStates->proxy->regcpy(&(diffAllStates->referenceRegFile), DUT_TO_REF); + sync_reg(); }; - if (diffInfo.inst->isLoad() && memcmp(golden_ptr, &gem5_val, diffInfo.effSize) == 0) { - DPRINTF(Diff, "Load content matched in golden memory. Sync from golden to ref\n"); + if (diffInfo.inst->isLoad() && + memcmp(golden_ptr, &gem5_val, + diffInfo.effSize) == 0) { + DPRINTF(Diff, + "Load content matched in golden memory. 
" + "Sync from golden to ref\n"); sync_mem_reg(golden_ptr); continue; + } else if (diffInfo.inst->isLoad() && exec_golden_ptr && + memcmp(exec_golden_ptr, &gem5_val, + diffInfo.effSize) == 0) { + DPRINTF(Diff, + "Load content matched the execution-time " + "golden snapshot. Sync from the recorded " + "snapshot to ref\n"); + sync_mem_reg(exec_golden_ptr); + continue; } else if (matched_recent_store) { DPRINTF(Diff, "Load content matched recent committed store " @@ -1534,13 +1564,22 @@ BaseCPU::diffWithNEMU(ThreadID tid, InstSeqNum seq) DPRINTF(Diff, "Golden mem old value: %#lx, GEM5 old value: %#lx\n", diffInfo.amoOldGoldenValue, gem5_val); DPRINTF(Diff, "New golden value: %#lx\n", *(uint64_t *)golden_ptr); - if (memcmp(&diffInfo.amoOldGoldenValue, &gem5_val, diffInfo.effSize) == 0) { + if (memcmp(&diffInfo.amoOldGoldenValue, &gem5_val, + diffInfo.effSize) == 0) { DPRINTF(Diff, "Atomic encountered, old value matched. Sync from golden to ref\n"); sync_mem_reg(golden_ptr); continue; - } else { - warn("Atomic old value not matched!\n"); } + } else if (diffInfo.inst->isLoad()) { + DPRINTF(Diff, + "Unresolved shared-memory load mismatch at " + "addr=%#lx gem5=%#lx current_golden=%#lx " + "exec_snapshot=%#lx; falling back to normal " + "difftest reporting.\n", + diffInfo.physEffAddr, gem5_val, + *(uint64_t *)golden_ptr, + exec_golden_ptr ? 
+ *(uint64_t *)exec_golden_ptr : 0); } } @@ -1638,25 +1677,22 @@ BaseCPU::difftestStep(ThreadID tid, InstSeqNum seq) diffAllStates->gem5RegFile.pc = diffInfo.pc->instAddr(); if (noHypeMode) { auto start = pmemStart + pmemSize * difftestHartId(tid); - warn("Start memcpy to NEMU from %#lx, size=%lu\n", (uint64_t)start, pmemSize); diffAllStates->proxy->memcpy(0x80000000u, start, pmemSize, DUT_TO_REF); } else if (enableMemDedup) { if (system->multiContextDifftest()) { - warn("Let ref share the multi-context golden memory\n"); assert(goldenMemPtr); assert(diffAllStates->proxy->ref_get_backed_memory); - diffAllStates->proxy->ref_get_backed_memory(goldenMemPtr, pmemSize); + diffAllStates->proxy->ref_get_backed_memory( + system->createCopyOnWriteBranch(), pmemSize); + diffAllStates->proxy->memcpy_init( + 0x80000000u, goldenMemPtr, pmemSize, DUT_TO_REF); } else { - warn("Let ref share a COW mirror of root memory\n"); assert(diffAllStates->proxy->ref_get_backed_memory); diffAllStates->proxy->ref_get_backed_memory(system->createCopyOnWriteBranch(), pmemSize); } } else { - warn("MemDedup disabled, copying pmem to NEMU\n"); - warn("Start memcpy to NEMU from %#lx, size=%lu\n", (uint64_t)pmemStart, pmemSize); diffAllStates->proxy->memcpy_init(0x80000000u, pmemStart, pmemSize, DUT_TO_REF); } - warn("Start regcpy to NEMU\n"); diffAllStates->proxy->regcpy(&(diffAllStates->gem5RegFile), DUT_TO_REF); } } diff --git a/src/cpu/base.hh b/src/cpu/base.hh index feaf6e13cd..21c13388db 100644 --- a/src/cpu/base.hh +++ b/src/cpu/base.hh @@ -149,6 +149,7 @@ class BaseCPU : public ClockedObject }; std::vector> recentCommittedStores; + std::vector syncVisibleStoreReplayArmed; const unsigned IntRegIndexBase = 0; const unsigned FPRegIndexBase = 32; @@ -790,6 +791,16 @@ class BaseCPU : public ClockedObject void difftestStep(ThreadID tid, InstSeqNum seq); void recordCommittedStore(ThreadID tid, const o3::DynInstPtr &inst); + void armSyncVisibleStoreReplay(ThreadID tid) + { + 
syncVisibleStoreReplayArmed.at(tid) = true; + } + bool consumeSyncVisibleStoreReplay(ThreadID tid) + { + bool armed = syncVisibleStoreReplayArmed.at(tid); + syncVisibleStoreReplayArmed.at(tid) = false; + return armed; + } inline bool difftestEnabled() const { return enableDifftest; } diff --git a/src/cpu/o3/commit.cc b/src/cpu/o3/commit.cc index e1b20025ce..e289754896 100644 --- a/src/cpu/o3/commit.cc +++ b/src/cpu/o3/commit.cc @@ -1482,6 +1482,11 @@ Commit::commitInsts() } + if (head_inst->isReadBarrier() || + head_inst->isWriteBarrier()) { + cpu->armSyncVisibleStoreReplay(tid); + } + if (cpu->difftestEnabled()) { diffInst(tid, head_inst); } @@ -1678,9 +1683,12 @@ Commit::commitHead(const DynInstPtr &head_inst, unsigned inst_num) // Memory-ordering instructions such as sfence.vma must not execute // until older stores are visible; otherwise page-table updates may // race with the TLB invalidation. - if ((head_inst->isMemRef() || head_inst->isReturn() || - head_inst->isReadBarrier() || head_inst->isWriteBarrier()) && - (inst_num > 0 || !iewStage->flushStores(tid))) { + const bool needs_store_drain = + head_inst->isMemRef() || head_inst->isReturn() || + head_inst->isReadBarrier() || head_inst->isWriteBarrier(); + const bool stores_drained = + !needs_store_drain || iewStage->flushStores(tid, head_inst->seqNum); + if (needs_store_drain && (inst_num > 0 || !stores_drained)) { DPRINTF(Commit, "[tid:%i] [sn:%llu] " "Waiting for all stores to writeback.\n", @@ -1734,7 +1742,7 @@ Commit::commitHead(const DynInstPtr &head_inst, unsigned inst_num) if (inst_fault != NoFault) { traceLogInstFault(head_inst, inst_fault); - if (!iewStage->flushStores(tid) || inst_num > 0) { + if (!iewStage->flushStores(tid, head_inst->seqNum) || inst_num > 0) { DPRINTF(Commit, "[tid:%i] [sn:%llu] " "Stores outstanding, fault must wait.\n", diff --git a/src/cpu/o3/iew.hh b/src/cpu/o3/iew.hh index e23d0fb490..94cfbcb8cc 100644 --- a/src/cpu/o3/iew.hh +++ b/src/cpu/o3/iew.hh @@ -260,6 +260,10 @@ 
class IEW * the store queue or the store buffer to write back to. */ bool flushStores(ThreadID tid) { return ldstQueue.flushStores(tid); } + bool flushStores(ThreadID tid, InstSeqNum seq_num) + { + return ldstQueue.flushStores(tid, seq_num); + } /** Check if we need to squash after a load/store/branch is executed. */ void SquashCheckAfterExe(DynInstPtr inst); diff --git a/src/cpu/o3/lsq.cc b/src/cpu/o3/lsq.cc index e070f076d0..9cc59f560d 100644 --- a/src/cpu/o3/lsq.cc +++ b/src/cpu/o3/lsq.cc @@ -89,23 +89,96 @@ LSQ::DcachePort::DcachePort(LSQ *_lsq, CPU *_cpu) : std::list LSQ::SingleDataRequest::singleList; +namespace +{ + +bool +storeBufferEntryEligibleForLoad(const LSQ::StoreBufferEntry *entry, + ThreadID load_tid, InstSeqNum load_seq, + uint64_t visible_generation) +{ + if (!entry) { + return false; + } + + if (entry->tid == load_tid) { + return entry->seqNum < load_seq; + } + + return entry->generation != 0 && entry->generation <= visible_generation; +} + +bool +storeBufferByteEligibleForLoad(const LSQ::StoreBufferEntry *entry, + size_t byte_idx, ThreadID load_tid, + InstSeqNum load_seq, + uint64_t visible_generation) +{ + if (!entry) { + return false; + } + + if (entry->tid == load_tid) { + return entry->seqNum < load_seq; + } + + if (!entry->sending) { + return false; + } + + return byte_idx < entry->byteGenerations.size() && + entry->byteGenerations[byte_idx] != 0 && + entry->byteGenerations[byte_idx] <= visible_generation; +} + +uint64_t +storeBufferEligibleGeneration(const LSQ::StoreBufferEntry *entry, + ThreadID load_tid, InstSeqNum load_seq, + uint64_t visible_generation) +{ + if (!entry) { + return 0; + } + + uint64_t best_generation = 0; + if (storeBufferEntryEligibleForLoad(entry, load_tid, load_seq, + visible_generation)) { + best_generation = entry->generation; + } + if (storeBufferEntryEligibleForLoad(entry->vice, load_tid, load_seq, + visible_generation)) { + best_generation = std::max(best_generation, entry->vice->generation); + } + return 
best_generation; +} + +} // anonymous namespace + void -LSQ::StoreBufferEntry::reset(ThreadID tid, uint64_t block_vaddr, uint64_t block_paddr, +LSQ::StoreBufferEntry::reset(ThreadID tid, InstSeqNum seq_num, + uint64_t block_vaddr, uint64_t block_paddr, uint64_t offset, uint8_t *datas, uint64_t size, - const std::vector &mask) + const std::vector &mask, + uint64_t generation) { std::fill(validMask.begin(), validMask.begin() + offset, false); + std::fill(byteGenerations.begin(), byteGenerations.end(), 0); for (int i = 0; i < size; i++) { validMask[offset + i] = mask[i]; + if (mask[i]) { + byteGenerations[offset + i] = generation; + } } std::fill(validMask.begin() + offset + size, validMask.end(), false); memcpy(blockDatas.data() + offset, datas, size); this->tid = tid; + this->seqNum = seq_num; this->blockVaddr = block_vaddr; this->blockPaddr = block_paddr; + this->generation = generation; this->sending = false; this->request = nullptr; this->vice = nullptr; @@ -113,19 +186,23 @@ LSQ::StoreBufferEntry::reset(ThreadID tid, uint64_t block_vaddr, uint64_t block_ void LSQ::StoreBufferEntry::merge(uint64_t offset, uint8_t *datas, uint64_t size, - const std::vector &mask) + const std::vector &mask, + uint64_t generation) { assert(offset + size <= validMask.size()); for (uint64_t i = 0; i < size; ++i) { if (mask[i]) { blockDatas[offset + i] = datas[i]; validMask[offset + i] = true; + byteGenerations[offset + i] = generation; } } } bool -LSQ::StoreBufferEntry::recordForward(RequestPtr req, LSQRequest *lsqreq) +LSQ::StoreBufferEntry::recordForward(RequestPtr req, LSQRequest *lsqreq, + ThreadID load_tid, InstSeqNum load_seq, + uint64_t visible_generation) { int offset = req->getPaddr() & (validMask.size() - 1); // the offset in the split request @@ -136,13 +213,21 @@ LSQ::StoreBufferEntry::recordForward(RequestPtr req, LSQRequest *lsqreq) bool full_forward = true; for (int i = 0; i < req->getSize(); i++) { assert(goffset + i < lsqreq->_size); - if (vice && 
vice->validMask[offset + i]) { + const bool vice_eligible = + vice && vice->validMask[offset + i] && + storeBufferByteEligibleForLoad(vice, offset + i, load_tid, + load_seq, visible_generation); + const bool self_eligible = + validMask[offset + i] && + storeBufferByteEligibleForLoad(this, offset + i, load_tid, + load_seq, visible_generation); + if (vice_eligible) { // vice is newer assert(vice->blockVaddr == blockVaddr); lsqreq->SBforwardPackets.push_back( LSQRequest::FWDPacket{ .idx = goffset + i, .byte = vice->blockDatas[offset + i]}); - } else if (validMask[offset + i]) { + } else if (self_eligible) { lsqreq->SBforwardPackets.push_back( LSQRequest::FWDPacket{ .idx = goffset + i, .byte = blockDatas[offset + i]}); @@ -182,6 +267,40 @@ LSQ::StoreBuffer::size() const return _size; } +uint64_t +LSQ::StoreBuffer::size(ThreadID tid) const +{ + uint64_t count = 0; + for (size_t index = 0; index < data_vec.size(); ++index) { + if (!data_vld[index]) { + continue; + } + + auto *entry = data_vec[index]; + if (entry && entry->tid == tid) { + ++count; + } + } + return count; +} + +uint64_t +LSQ::StoreBuffer::size(ThreadID tid, InstSeqNum seq_num) const +{ + uint64_t count = 0; + for (size_t index = 0; index < data_vec.size(); ++index) { + if (!data_vld[index]) { + continue; + } + + auto *entry = data_vec[index]; + if (entry && entry->tid == tid && entry->seqNum < seq_num) { + ++count; + } + } + return count; +} + uint64_t LSQ::StoreBuffer::unsentSize() const { @@ -243,6 +362,47 @@ LSQ::StoreBuffer::getEvict() return data_vec[index]; } +LSQ::StoreBufferEntry * +LSQ::StoreBuffer::getEvict(const bool *eligible_tids, size_t num_threads) +{ + return getEvict(eligible_tids, nullptr, num_threads); +} + +LSQ::StoreBufferEntry * +LSQ::StoreBuffer::getEvict(const bool *eligible_tids, + const InstSeqNum *eligible_seq, + size_t num_threads) +{ + if (eligible_tids == nullptr && eligible_seq == nullptr) { + return getEvict(); + } + + for (auto it = lru_index.rbegin(); it != 
lru_index.rend(); ++it) { + auto *entry = data_vec[*it]; + if (!entry) { + continue; + } + + const ThreadID tid = entry->tid; + if (tid >= num_threads) { + continue; + } + if (eligible_tids && !eligible_tids[tid]) { + continue; + } + if (eligible_seq && + eligible_seq[tid] != static_cast(-1) && + entry->seqNum >= eligible_seq[tid]) { + continue; + } + + lru_index.erase(std::find(lru_index.begin(), lru_index.end(), *it)); + return entry; + } + + return nullptr; +} + LSQ::StoreBufferEntry * LSQ::StoreBuffer::createVice(StoreBufferEntry *entry) { @@ -766,17 +926,17 @@ LSQ::processWriteback() std::vector offload_demand(numThreads, 0); std::vector requester_tids; requester_tids.reserve(activeThreads->size()); - uint32_t sbuffer_flush_bitset = 0; - for (ThreadID tid : *activeThreads) { - bool sbuffer_flushing = storeBufferFlushing(tid); - sbuffer_flush_bitset |= (sbuffer_flushing << tid); - } for (ThreadID tid : *activeThreads) { offload_demand[tid] = thread[tid].countStoreBufferOffloadableEntries( maxStoreBufferEntriesAcceptedFromSQPerCycle); - // when other thread is flushing sbuffer, stop current thread sq offloading - bool conti = (sbuffer_flush_bitset & ~(1 << tid)) == 0; + // During a global sbuffer flush, only threads that requested the + // flush may keep draining older committed stores from their SQ. + // If both SMT threads are flushing simultaneously, both must still be + // allowed to make forward progress, otherwise they can deadlock while + // waiting on each other's flush bit. + const bool conti = + !storeBufferFlushing() || storeBufferFlushing(tid); if (conti && offload_demand[tid] != 0) { requester_tids.push_back(tid); } @@ -822,11 +982,14 @@ LSQ::processWriteback() thread[tid].offloadToStoreBuffer(offload_quota[tid]); } - // If the store buffer is flushing and no entries remain to be sent, - // clear the flushing state to avoid deadlock. 
- if (storeBufferFlushing() && storeBuffer.size() == 0) [[unlikely]] { - assert(storeBuffer.unsentSize() == 0); - clearStoreBufferFlushing(); + // A fence/flush only waits for the requesting thread's sbuffer domain. + for (ThreadID tid = 0; tid < numThreads; ++tid) { + if (!storeBufferFlushing(tid) || + !storeBufferEmpty(tid, _storeBufferFlushBeforeSeq[tid])) { + continue; + } + + clearStoreBufferFlushing(tid); cpu->activityThisCycle(); } } @@ -874,12 +1037,23 @@ LSQ::storeBufferWriteback() } if (cause) { - StoreBufferEntry *entry = storeBuffer.getEvict(); + StoreBufferEntry *entry = nullptr; + if (*cause == StoreBufferEvictCause::Flush) { + entry = storeBuffer.getEvict( + _storeBufferFlushing, _storeBufferFlushBeforeSeq, + numThreads); + } else { + entry = storeBuffer.getEvict(); + } + if (!entry) { + /* Disabled with the broad sbuffer watchdog above. */ + return; + } + /* Disabled with the broad sbuffer watchdog above. */ auto &owner_unit = thread[entry->tid]; recordStoreBufferEviction(*cause); DPRINTF(StoreBuffer, "Evicting sbuffer entry[%#x]\n", entry->blockPaddr); - if (debug::StoreBuffer) { DPRINTFR(StoreBuffer, "Dumping sbuffer entry data\n"); for (int i = 0; i < owner_unit.cacheLineSize(); i++) { @@ -969,6 +1143,20 @@ void LSQ::completeSbufferEvict(PacketPtr pkt) { auto request = dynamic_cast(pkt->senderState); + const Addr block_paddr = request->sbuffer_entry->blockPaddr; + invalidateOtherThreadStoreBufferBytes(request->sbuffer_entry->tid, + request->mainReq()->getPaddr(), + request->mainReq()->getByteEnable(), + request->sbuffer_entry->generation); + markStoreBufferBlockVisible(block_paddr, + request->sbuffer_entry->generation); + const bool replay_executed_loads = + cpu->consumeSyncVisibleStoreReplay(request->sbuffer_entry->tid); + notifyOtherThreadsStoreVisible(request->sbuffer_entry->tid, + request->mainReq()->getPaddr(), + request->mainReq()->getByteEnable(), + request->sbuffer_entry->seqNum, + replay_executed_loads); if (cpu->goldenMemManager() && 
cpu->goldenMemManager()->inPmem(request->mainReq()->getPaddr())) { Addr paddr = request->mainReq()->getPaddr(); @@ -980,6 +1168,7 @@ LSQ::completeSbufferEvict(PacketPtr pkt) } storeBuffer.release(request->sbuffer_entry); + reclaimStoreBufferBlockMetadata(block_paddr); DPRINTF(StoreBuffer, "finish entry[%#x] evict to cache, sbuffer size: %d, " "unsentsize: %d\n", @@ -1142,7 +1331,6 @@ LSQ::recvTimingResp(PacketPtr pkt) LSQRequest *request = dynamic_cast(pkt->senderState); panic_if(!request, "Got packet back with unknown sender state\n"); - thread[request->_port.lsqID].recvTimingResp(pkt); if (pkt->isInvalidate()) { @@ -1546,12 +1734,245 @@ LSQ::hasStoresToWB(ThreadID tid) return thread.at(tid).hasStoresToWB(); } -bool LSQ::flushStores(ThreadID tid) +bool +LSQ::hasStoresToWBBefore(ThreadID tid, InstSeqNum seq_num) +{ + return thread.at(tid).hasStoresToWBBefore(seq_num); +} + +bool +LSQ::flushStores(ThreadID tid) +{ + _storeBufferFlushing[tid] = true; + _storeBufferFlushBeforeSeq[tid] = static_cast(-1); + const bool has_stores = hasStoresToWB(tid); + const bool sbuffer_empty = + storeBufferEmpty(tid, _storeBufferFlushBeforeSeq[tid]); + if (!has_stores && sbuffer_empty) { + clearStoreBufferFlushing(tid); + return true; + } + + return false; +} + +bool +LSQ::flushStores(ThreadID tid, InstSeqNum seq_num) { _storeBufferFlushing[tid] = true; - // TODO:high performance shared SMT storebuffer flushing - bool t = !hasStoresToWB(tid) && storeBufferEmpty(); - return t; + _storeBufferFlushBeforeSeq[tid] = seq_num; + const bool has_older_stores = hasStoresToWBBefore(tid, seq_num); + const bool sbuffer_empty = storeBufferEmpty(tid, seq_num); + if (!has_older_stores && sbuffer_empty) { + clearStoreBufferFlushing(tid); + return true; + } + + return false; +} + +void +LSQ::requestGlobalStoreBufferFlush() +{ + for (ThreadID tid = 0; tid < numThreads; ++tid) { + _storeBufferFlushing[tid] = true; + _storeBufferFlushBeforeSeq[tid] = static_cast(-1); + } +} + +bool 
+LSQ::storeBufferHasConflict(ThreadID tid, Addr block_paddr) const +{ + for (ThreadID other_tid = 0; other_tid < numThreads; ++other_tid) { + if (other_tid == tid) { + continue; + } + + if (storeBuffer.get(other_tid, block_paddr)) { + return true; + } + } + + return false; +} + +uint64_t +LSQ::bumpStoreBufferBlockVersion(Addr block_paddr) +{ + auto &version = storeBufferBlockVersion[block_paddr]; + ++version; + if (version == 0) { + version = 1; + } + return version; +} + +uint64_t +LSQ::currentStoreBufferBlockVersion(Addr block_paddr) const +{ + auto it = storeBufferBlockVersion.find(block_paddr); + return it == storeBufferBlockVersion.end() ? 0 : it->second; +} + +void +LSQ::markStoreBufferBlockVisible(Addr block_paddr, uint64_t generation) +{ + auto &visible = storeBufferVisibleVersion[block_paddr]; + visible = std::max(visible, generation); + reclaimStoreBufferBlockMetadata(block_paddr); +} + +uint64_t +LSQ::currentStoreBufferVisibleVersion(Addr block_paddr) const +{ + auto it = storeBufferVisibleVersion.find(block_paddr); + return it == storeBufferVisibleVersion.end() ? 
0 : it->second; +} + +LSQ::StoreBufferEntry * +LSQ::findForwardingStoreBufferEntry(Addr block_paddr, ThreadID load_tid, + InstSeqNum load_seq) const +{ + StoreBufferEntry *best_entry = nullptr; + uint64_t best_generation = 0; + const auto visible_generation = + currentStoreBufferVisibleVersion(block_paddr); + + for (ThreadID tid = 0; tid < numThreads; ++tid) { + auto entry = storeBuffer.get(tid, block_paddr); + if (!entry) { + continue; + } + + const uint64_t entry_generation = + storeBufferEligibleGeneration(entry, load_tid, load_seq, + visible_generation); + if (entry_generation == 0) { + continue; + } + + if (!best_entry || entry_generation > best_generation) { + best_entry = entry; + best_generation = entry_generation; + } + } + + return best_entry; +} + +bool +LSQ::hasLiveStoreBufferBlock(Addr block_paddr) const +{ + for (ThreadID tid = 0; tid < numThreads; ++tid) { + if (storeBuffer.get(tid, block_paddr)) { + return true; + } + } + return false; +} + +void +LSQ::reclaimStoreBufferBlockMetadata(Addr block_paddr) +{ + if (hasLiveStoreBufferBlock(block_paddr)) { + return; + } + + auto version_it = storeBufferBlockVersion.find(block_paddr); + if (version_it == storeBufferBlockVersion.end()) { + storeBufferVisibleVersion.erase(block_paddr); + return; + } + + auto visible_it = storeBufferVisibleVersion.find(block_paddr); + const uint64_t visible_generation = + visible_it == storeBufferVisibleVersion.end() ? 
0 : visible_it->second; + if (visible_generation < version_it->second) { + return; + } + + storeBufferBlockVersion.erase(version_it); + if (visible_it != storeBufferVisibleVersion.end()) { + storeBufferVisibleVersion.erase(visible_it); + } +} + +void +LSQ::invalidateOtherThreadStoreBufferBytes( + ThreadID tid, Addr paddr, const std::vector &mask, + uint64_t generation) +{ + const Addr cache_block_mask = + ~((static_cast(cpu->cacheLineSize())) - 1); + const Addr block_paddr = paddr & cache_block_mask; + const Addr offset = paddr & ~cache_block_mask; + auto invalidate_entry = [&](StoreBufferEntry *entry) { + if (!entry || offset + mask.size() > entry->validMask.size()) { + return; + } + + if (!entry->sending) { + return; + } + + for (size_t i = 0; i < mask.size(); ++i) { + if (mask[i] && + entry->byteGenerations[offset + i] != 0 && + entry->byteGenerations[offset + i] <= generation) { + entry->validMask[offset + i] = false; + } + } + }; + + for (ThreadID other_tid = 0; other_tid < numThreads; ++other_tid) { + if (other_tid == tid) { + continue; + } + + auto entry = storeBuffer.get(other_tid, block_paddr); + if (!entry) { + continue; + } + + invalidate_entry(entry); + invalidate_entry(entry->vice); + } +} + +void +LSQ::notifyOtherThreadsStoreVisible(ThreadID tid, Addr store_paddr, + const std::vector &byte_enable, + InstSeqNum store_seq, + bool replay_executed_loads) +{ + if (numThreads <= 1) { + return; + } + + Request::Flags flags; + const Addr cache_block_mask = + ~((static_cast(cpu->cacheLineSize())) - 1); + RequestPtr req = std::make_shared( + store_paddr & cache_block_mask, cpu->cacheLineSize(), flags, + cpu->dataRequestorId()); + Packet pkt(req, MemCmd::InvalidateReq); + + for (ThreadID context_id = 0; context_id < numThreads; ++context_id) { + gem5::ThreadContext *tc = cpu->getContext(context_id); + bool no_squash = cpu->thread[context_id]->noSquashFromTC; + cpu->thread[context_id]->noSquashFromTC = true; + tc->getIsaPtr()->handleLockedSnoop(&pkt, 
cache_block_mask); + cpu->thread[context_id]->noSquashFromTC = no_squash; + } + + for (ThreadID other_tid = 0; other_tid < numThreads; ++other_tid) { + if (other_tid == tid) { + continue; + } + thread[other_tid].checkLocalStoreVisible(store_paddr, byte_enable, + store_seq, + replay_executed_loads); + } } int @@ -2110,11 +2531,6 @@ LSQ::LSQRequest::forward() LSQ::LSQRequest::~LSQRequest() { - if (isAnyOutstandingRequest()) { - warn("numInTranslationFragments = %u, _numOutstandingPackets = %u\n", - numInTranslationFragments, _numOutstandingPackets); - std::raise(SIGINT); - } assert(!isAnyOutstandingRequest()); if (_inst && _inst->savedRequest == this) { DPRINTF(LSQ, "inst [sn:%llu] Deleting LSQRequest, savedRequest\n", _inst->seqNum); @@ -2205,7 +2621,6 @@ LSQ::SingleDataRequest::recvTimingResp(PacketPtr pkt) mainReq()->isUncacheable(), cacheHit, *((uint64_t*)buffer)); } - if (isLoad()) { auto it = std::find(lsqUnit()->inflightLoads.begin(), lsqUnit()->inflightLoads.end(), this); if (it != lsqUnit()->inflightLoads.end()) { diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh index fc2c73a80c..83f47b5b91 100644 --- a/src/cpu/o3/lsq.hh +++ b/src/cpu/o3/lsq.hh @@ -147,10 +147,13 @@ class LSQ public: const int index; ThreadID tid; + InstSeqNum seqNum = 0; Addr blockVaddr; Addr blockPaddr; std::vector blockDatas; std::vector validMask; + std::vector byteGenerations; + uint64_t generation = 0; bool sending; // the another same addr entry when sending // another cannot sending until self sending finished @@ -162,16 +165,20 @@ class LSQ { blockDatas.resize(size, 0); validMask.resize(size, false); + byteGenerations.resize(size, 0); } - void reset(ThreadID tid, uint64_t block_vaddr, uint64_t block_paddr, - uint64_t offset, uint8_t *datas, uint64_t size, - const std::vector &mask); + void reset(ThreadID tid, InstSeqNum seq_num, uint64_t block_vaddr, + uint64_t block_paddr, uint64_t offset, uint8_t *datas, + uint64_t size, const std::vector &mask, + uint64_t generation); void 
merge(uint64_t offset, uint8_t *datas, uint64_t size, - const std::vector &mask); + const std::vector &mask, uint64_t generation); - bool recordForward(RequestPtr req, LSQRequest *lsqreq); + bool recordForward(RequestPtr req, LSQRequest *lsqreq, + ThreadID load_tid, InstSeqNum load_seq, + uint64_t visible_generation); }; class StoreBuffer @@ -198,12 +205,19 @@ class LSQ void setData(std::vector &data_vec); bool full() const; uint64_t size() const; + uint64_t size(ThreadID tid) const; + uint64_t size(ThreadID tid, InstSeqNum seq_num) const; uint64_t unsentSize() const; StoreBufferEntry *getEmpty(); void insert(StoreBufferEntry *entry); StoreBufferEntry *get(ThreadID tid, uint64_t addr) const; void update(int index); StoreBufferEntry *getEvict(); + StoreBufferEntry *getEvict(const bool *eligible_tids, + size_t num_threads); + StoreBufferEntry *getEvict(const bool *eligible_tids, + const InstSeqNum *eligible_seq, + size_t num_threads); StoreBufferEntry *createVice(StoreBufferEntry *entry); void release(StoreBufferEntry *entry); }; @@ -351,6 +365,8 @@ class LSQ AtomicOpFunctorPtr _amo_op; bool _hasStaleTranslation; bool _sbufferBypass; + bool _goldenSnapshotCaptured = false; + uint64_t _storeBufferGeneration = 0; struct FWDPacket { @@ -477,6 +493,7 @@ class LSQ RequestPtr req(int idx = 0) { return _reqs.at(idx); } const RequestPtr req(int idx = 0) const { return _reqs.at(idx); } + size_t numReqs() const { return _reqs.size(); } Addr getVaddr(int idx = 0) const { return req(idx)->getVaddr(); } virtual void initiateTranslation() = 0; @@ -977,9 +994,29 @@ class LSQ * to memory. 
*/ bool hasStoresToWB(ThreadID tid); + bool hasStoresToWBBefore(ThreadID tid, InstSeqNum seq_num); // true if all stores are flushed bool flushStores(ThreadID tid); + bool flushStores(ThreadID tid, InstSeqNum seq_num); + void requestGlobalStoreBufferFlush(); + bool storeBufferHasConflict(ThreadID tid, Addr block_paddr) const; + uint64_t bumpStoreBufferBlockVersion(Addr block_paddr); + uint64_t currentStoreBufferBlockVersion(Addr block_paddr) const; + void markStoreBufferBlockVisible(Addr block_paddr, uint64_t generation); + uint64_t currentStoreBufferVisibleVersion(Addr block_paddr) const; + StoreBufferEntry *findForwardingStoreBufferEntry(Addr block_paddr, + ThreadID load_tid, + InstSeqNum load_seq) const; + bool hasLiveStoreBufferBlock(Addr block_paddr) const; + void reclaimStoreBufferBlockMetadata(Addr block_paddr); + void invalidateOtherThreadStoreBufferBytes( + ThreadID tid, Addr paddr, const std::vector &mask, + uint64_t generation); + void notifyOtherThreadsStoreVisible(ThreadID tid, Addr store_paddr, + const std::vector &byte_enable, + InstSeqNum store_seq, + bool replay_executed_loads); /** Returns the number of stores a specific thread has to write back. 
*/ int numStoresToSbuffer(ThreadID tid); @@ -1082,6 +1119,14 @@ class LSQ bool getDcacheWriteStall() { return dcacheWriteStall; } StoreBuffer &getStoreBuffer() { return storeBuffer; } bool storeBufferEmpty() const { return storeBuffer.size() == 0; } + bool storeBufferEmpty(ThreadID tid) const + { + return storeBuffer.size(tid) == 0; + } + bool storeBufferEmpty(ThreadID tid, InstSeqNum seq_num) const + { + return storeBuffer.size(tid, seq_num) == 0; + } bool storeBufferFlushing(ThreadID tid) const { return _storeBufferFlushing[tid]; } bool storeBufferFlushing() const { @@ -1091,10 +1136,15 @@ class LSQ } return false; } - void clearStoreBufferFlushing(ThreadID tid) { _storeBufferFlushing[tid] = false; } + void clearStoreBufferFlushing(ThreadID tid) + { + _storeBufferFlushing[tid] = false; + _storeBufferFlushBeforeSeq[tid] = static_cast(-1); + } void clearStoreBufferFlushing() { for (auto tid : *activeThreads) { _storeBufferFlushing[tid] = false; + _storeBufferFlushBeforeSeq[tid] = static_cast(-1); } } uint32_t getSbufferEvictThreshold() const { return sbufferEvictThreshold; } @@ -1185,7 +1235,12 @@ class LSQ const uint64_t storeBufferInactiveThreshold; const uint32_t maxStoreBufferEntriesAcceptedFromSQPerCycle = 2; StoreBuffer storeBuffer; + std::unordered_map storeBufferBlockVersion; + std::unordered_map storeBufferVisibleVersion; bool _storeBufferFlushing[MaxThreads] = {false}; + InstSeqNum _storeBufferFlushBeforeSeq[MaxThreads] = { + static_cast(-1) + }; uint64_t storeBufferWritebackInactive = 0; StoreBufferEntry *blockedSbufferEntry = nullptr; ThreadID nextStoreBufferOffloadTid = InvalidThreadID; diff --git a/src/cpu/o3/lsq_unit.cc b/src/cpu/o3/lsq_unit.cc index 6be535e5df..9d170af470 100644 --- a/src/cpu/o3/lsq_unit.cc +++ b/src/cpu/o3/lsq_unit.cc @@ -367,22 +367,24 @@ LSQUnit::completeDataAccess(PacketPtr pkt) assert(size == inst->effSize); if (inst->isAtomic()) { - uint8_t *golden_old = - reinterpret_cast(inst->getAmoOldGoldenValuePtr()); - 
cpu->goldenMemManager()->readGoldenMem(addr, golden_old, size); - if (memcmp(golden_old, loaded_data, size) != 0) { - panic("[tid:%d] [sn:%llu] Atomic old value error at addr %#lx, " - "size %d. %s\n", - inst->threadNumber, inst->seqNum, addr, size, - goldenDiffStr(loaded_data, golden_old, size).c_str()); - } + uint8_t current_golden[8] = {}; + panic_if(size > sizeof(current_golden), + "Unexpected AMO size %u at addr %#lx\n", + size, addr); + cpu->goldenMemManager()->readGoldenMem(addr, current_golden, + size); + + // Preserve the DUT-observed old value until completeStore() + // derives the post-AMO memory image. The golden old-value + // snapshot used by difftest is captured when the request + // is first sent, before later concurrent updates can + // advance shared memory. + inst->setGolden(loaded_data); } else { // check data with golden mem uint8_t *golden_data = (uint8_t *)cpu->goldenMemManager()->guestToHost(addr); - if (memcmp(golden_data, loaded_data, size) == 0) { - inst->setGolden(golden_data); - } else { + if (memcmp(golden_data, loaded_data, size) != 0) { DPRINTF(Diff, "[tid:%d] [sn:%llu] Load sees value different from " "current golden memory at addr %#lx, size %d. 
" @@ -980,6 +982,103 @@ LSQUnit::checkSnoop(PacketPtr pkt) return; } +namespace +{ + +bool +overlapsVisibleStore(const o3::LSQ::LSQRequest *load_req, Addr store_paddr, + const std::vector &store_byte_enable) +{ + if (!load_req) { + return false; + } + + for (size_t req_idx = 0; req_idx < load_req->numReqs(); ++req_idx) { + const auto req = load_req->req(req_idx); + if (!req->hasPaddr()) { + continue; + } + + const Addr load_start = req->getPaddr(); + const Addr load_end = load_start + req->getSize(); + for (size_t byte_idx = 0; byte_idx < store_byte_enable.size(); + ++byte_idx) { + if (!store_byte_enable[byte_idx]) { + continue; + } + + const Addr byte_addr = store_paddr + byte_idx; + if (byte_addr >= load_start && byte_addr < load_end) { + return true; + } + } + } + + return false; +} + +} // anonymous namespace + +void +LSQUnit::checkLocalStoreVisible(Addr store_paddr, + const std::vector &store_byte_enable, + InstSeqNum store_seq, + bool replay_executed_loads) +{ + [[maybe_unused]] const InstSeqNum visible_store_seq = store_seq; + [[maybe_unused]] const bool replay_visible_loads = replay_executed_loads; + + if (loadQueue.empty()) { + return; + } + + const Addr block_addr = store_paddr & cacheBlockMask; + DynInstPtr oldest_violator = memDepViolator; + + for (auto it = loadQueue.begin(); it != loadQueue.end(); ++it) { + DynInstPtr ld_inst = it->instruction(); + if (!ld_inst || ld_inst->isSquashed() || ld_inst->needReplay() || + !ld_inst->effAddrValid() || ld_inst->strictlyOrdered()) { + continue; + } + + LSQRequest *request = ld_inst->savedRequest; + if (!request || !request->isCacheBlockHit(block_addr, cacheBlockMask)) { + continue; + } + if (!overlapsVisibleStore(request, store_paddr, store_byte_enable)) { + continue; + } + if (ld_inst->memReqFlags & Request::LLSC) { + ld_inst->tcBase()->getIsaPtr()->handleLockedSnoopHit(ld_inst.get()); + } + + if (ld_inst->isExecuted()) { + DPRINTF(LSQUnit, + "Local visible store ignores already executed load " + "[sn:%lli] on 
addr %#x\n", + ld_inst->seqNum, store_paddr); + continue; + } + + ld_inst->hitExternalSnoop(true); + ld_inst->possibleLoadViolation(true); + DPRINTF(LSQUnit, + "Local visible store replays not-yet-executed load [sn:%lli] " + "on addr %#x\n", + ld_inst->seqNum, store_paddr); + ld_inst->setNukeReplay(); + loadSetReplay(ld_inst, request, true); + } + + if (oldest_violator && + (!memDepViolator || oldest_violator->seqNum < memDepViolator->seqNum)) { + memDepViolator = oldest_violator; + cpu->activityThisCycle(); + iewStage->SquashCheckAfterExe(oldest_violator); + } +} + Fault LSQUnit::checkViolations(typename LoadQueue::iterator& loadIt, const DynInstPtr& inst) @@ -1102,10 +1201,7 @@ LSQUnit::loadSetReplay(DynInstPtr inst, LSQRequest* request, bool dropReqNow) // clear request in loadQueue loadQueue[inst->lqIdx].setRequest(nullptr); if (dropReqNow) { - // discard this request request->discard(); - // TODO: is this essential? - inst->savedRequest = nullptr; } DPRINTF(LoadPipeline, "Load [sn:%ld] set replay, dropReqNow: %d\n", inst->seqNum, dropReqNow); @@ -1523,9 +1619,9 @@ LSQUnit::executeLoadPipeSx() else if (inst->needCacheMissReplay()) iewStage->cacheMissLdReplay(inst); else if (inst->needMdpAddrReplay()) iewStage->mdpAddrReplayPipeDone(inst); else if (inst->needNukeReplay()) { - if (inst->cacheHit()) { + if (inst->savedRequest && inst->cacheHit()) { loadSetReplay(inst, inst->savedRequest, true); - } else if (inst->hasPendingCacheReq()) { + } else if (inst->savedRequest && inst->hasPendingCacheReq()) { loadSetReplay(inst, inst->savedRequest, false); } inst->issueQue->retryMem(inst); @@ -1902,7 +1998,31 @@ LSQUnit::commitStores(InstSeqNum &youngest_inst) if (x.instruction()->seqNum > youngest_inst) { break; } - assert(x.instruction()->isSplitStoreAddr() ? x.splitStoreFinish() : true); + // Commit can publish a new squash to IEW one cycle after IEW has + // already received an older doneMemSeqNum. 
If that stale + // doneMemSeqNum reaches here in the same cycle that ROB marks this + // store squashed, do not advance SQ writeback state past the + // squashed entry; IEW's next-cycle squash will remove it. + if (x.instruction()->isSquashed()) { + break; + } + if (x.instruction()->isSplitStoreAddr() && !x.splitStoreFinish()) { + panic("Split store reached commitStores unfinished: tid=%d " + "seq=%llu pc=%#lx youngest=%llu canCommit=%d " + "executed=%d squashed=%d addrReady=%d dataReady=%d " + "staFinish=%d stdFinish=%d canWB=%d completed=%d\n", + x.instruction()->threadNumber, + static_cast( + x.instruction()->seqNum), + x.instruction()->pcState().instAddr(), + static_cast(youngest_inst), + x.instruction()->readyToCommit(), + x.instruction()->isExecuted(), + x.instruction()->isSquashed(), + x.addrReady(), x.dataReady(), + x.staFinish(), x.stdFinish(), + x.canWB(), x.completed()); + } DPRINTF(LSQUnit, "Marking store as able to write back, PC " "%s [sn:%lli]\n", x.instruction()->pcState(), @@ -1915,6 +2035,31 @@ LSQUnit::commitStores(InstSeqNum &youngest_inst) } } +bool +LSQUnit::hasStoresToWBBefore(InstSeqNum seq_num) const +{ + if (storesToWB == 0) { + return false; + } + + for (auto it = storeQueue.begin(); it != storeQueue.end(); ++it) { + if (!it->valid() || !it->instruction()) { + continue; + } + + const auto &inst = it->instruction(); + if (inst->seqNum >= seq_num) { + break; + } + + if (it->canWB() && !it->completed()) { + return true; + } + } + + return false; +} + bool LSQUnit::writebackBlockedStore() { @@ -1922,8 +2067,25 @@ LSQUnit::writebackBlockedStore() return false; } - storeWBIt->request()->sendPacketToCache(); - if (storeWBIt->request()->isSent()) { + auto *request = storeWBIt->request(); + const auto &inst = storeWBIt->instruction(); + + if (request->mainReq()->hasPaddr() && + system->multiContextDifftest() && inst->isAtomic() && + cpu->goldenMemManager() && + cpu->goldenMemManager()->inPmem(request->mainReq()->getPaddr())) { + uint8_t 
issue_golden[8] = {}; + panic_if(request->_size > sizeof(issue_golden), + "Unexpected AMO size %u at addr %#lx\n", + request->_size, request->mainReq()->getPaddr()); + cpu->goldenMemManager()->readGoldenMem( + request->mainReq()->getPaddr(), issue_golden, request->_size); + std::memcpy(inst->getAmoOldGoldenValuePtr(), issue_golden, + request->_size); + } + + request->sendPacketToCache(); + if (request->isSent()) { storePostSend(); } return isStoreBlocked; @@ -1934,6 +2096,7 @@ LSQUnit::directStoreToCache() { DynInstPtr inst = storeWBIt->instruction(); LSQRequest* request = storeWBIt->request(); + if ((request->mainReq()->isLLSC() || request->mainReq()->isRelease()) && (storeWBIt.idx() != storeQueue.head())) { DPRINTF(LSQUnit, "Store idx:%i PC:%s to Addr:%#x " @@ -1982,6 +2145,28 @@ LSQUnit::directStoreToCache() } } + if (request->mainReq()->hasPaddr()) { + if (request->_storeBufferGeneration == 0) { + const Addr block_paddr = + request->mainReq()->getPaddr() & cacheBlockMask; + request->_storeBufferGeneration = + lsq->bumpStoreBufferBlockVersion(block_paddr); + } + + if (system->multiContextDifftest() && inst->isAtomic() && + cpu->goldenMemManager() && + cpu->goldenMemManager()->inPmem(request->mainReq()->getPaddr())) { + uint8_t issue_golden[8] = {}; + panic_if(request->_size > sizeof(issue_golden), + "Unexpected AMO size %u at addr %#lx\n", + request->_size, request->mainReq()->getPaddr()); + cpu->goldenMemManager()->readGoldenMem( + request->mainReq()->getPaddr(), issue_golden, request->_size); + std::memcpy(inst->getAmoOldGoldenValuePtr(), issue_golden, + request->_size); + } + } + if (request->mainReq()->isLocalAccess()) { assert(!inst->isStoreConditional()); assert(!inst->inHtmTransactionalState()); @@ -2074,17 +2259,20 @@ LSQUnit::offloadToStoreBuffer(uint32_t max_entries) request->mainReq()->isRelease() || request->mainReq()->isStrictlyOrdered() || inst->isStoreConditional()) { - DPRINTF(StoreBuffer, "Find atomic/SC store [sn:%llu]\n", 
storeWBIt->instruction()->seqNum); if (!(storeWBIt.idx() == storeQueue.head())) { - DPRINTF(StoreBuffer, "atomic/SC store waiting\n"); break; } - if (!storeBufferEmpty()) { - DPRINTF(StoreBuffer, "sbuffer need flush\n"); + if (request->mainReq()->hasPaddr()) { + const Addr block_paddr = + request->mainReq()->getPaddr() & cacheBlockMask; + if (lsq->storeBufferHasConflict(lsqID, block_paddr)) { + lsq->requestGlobalStoreBufferFlush(); + break; + } + } + if (!storeBufferEmpty(lsqID)) { lsq->flushStores(lsqID); break; - } else { - DPRINTF(StoreBuffer, "sbuffer finishing flushed\n"); } bool contin = directStoreToCache(); if (isStoreBlocked) { @@ -2107,8 +2295,9 @@ LSQUnit::offloadToStoreBuffer(uint32_t max_entries) uint64_t offset = vaddr - vbase; DPRINTF(LSQUnit, "Spilt store idx %d [sn:%lli] insert into sbuffer\n", i, inst->seqNum); assert(offset + req->getSize() <= storeWBIt->size()); - bool success = insertStoreBuffer(vaddr, paddr, (uint8_t *)storeWBIt->data() + offset, req->getSize(), - req->getByteEnable()); + bool success = insertStoreBuffer( + vaddr, paddr, (uint8_t *)storeWBIt->data() + offset, + req->getSize(), req->getByteEnable(), inst->seqNum); if (success) { request->_numOutstandingPackets++; } else { @@ -2128,8 +2317,9 @@ LSQUnit::offloadToStoreBuffer(uint32_t max_entries) Addr vaddr = request->getVaddr(); Addr paddr = request->mainReq()->getPaddr(); DPRINTF(LSQUnit, "Store [sn:%lli] insert into sbuffer\n", inst->seqNum); - bool success = insertStoreBuffer(vaddr, paddr, (uint8_t *)storeWBIt->data(), request->_size, - request->mainReq()->getByteEnable()); + bool success = insertStoreBuffer( + vaddr, paddr, (uint8_t *)storeWBIt->data(), request->_size, + request->mainReq()->getByteEnable(), inst->seqNum); if (!success) { break; } @@ -2141,7 +2331,10 @@ LSQUnit::offloadToStoreBuffer(uint32_t max_entries) } } -bool LSQUnit::insertStoreBuffer(Addr vaddr, Addr paddr, uint8_t* datas, uint64_t size, const std::vector& mask) +bool +LSQUnit::insertStoreBuffer(Addr 
vaddr, Addr paddr, uint8_t* datas, + uint64_t size, const std::vector& mask, + InstSeqNum store_seq) { auto &storeBuffer = lsq->getStoreBuffer(); // access range must in a cache block @@ -2149,15 +2342,19 @@ bool LSQUnit::insertStoreBuffer(Addr vaddr, Addr paddr, uint8_t* datas, uint64_t Addr blockVaddr = vaddr & cacheBlockMask; Addr blockPaddr = paddr & cacheBlockMask; Addr offset = paddr & ~cacheBlockMask; + // check request is not already in the storebuffer auto entry = storeBuffer.get(lsqID, blockPaddr); + const auto generation = lsq->bumpStoreBufferBlockVersion(blockPaddr); + if (entry) { if (entry->sending) { if (entry->vice) { // merge into vice stats.sbufferMerge++; entry = entry->vice; - entry->merge(offset, datas, size, mask); + entry->merge(offset, datas, size, mask, generation); + entry->generation = generation; DPRINTF(StoreBuffer, "Merging vice entry[%#x] for addr %#x\n", blockPaddr, paddr); } else { @@ -2170,7 +2367,9 @@ bool LSQUnit::insertStoreBuffer(Addr vaddr, Addr paddr, uint8_t* datas, uint64_t stats.sbufferNewline++; stats.sbufferCreateVice++; auto vice = storeBuffer.createVice(entry); - vice->reset(lsqID, blockVaddr, blockPaddr, offset, datas, size, mask); + vice->reset(lsqID, store_seq, blockVaddr, blockPaddr, offset, + datas, size, mask, generation); + vice->generation = generation; DPRINTF(StoreBuffer, "Create new vice entry[%#x] for addr %#x\n", blockPaddr, paddr); } @@ -2178,7 +2377,9 @@ bool LSQUnit::insertStoreBuffer(Addr vaddr, Addr paddr, uint8_t* datas, uint64_t // merge into unsent stats.sbufferMerge++; storeBuffer.update(entry->index); - entry->merge(offset, datas, size, mask); + entry->merge(offset, datas, size, mask, generation); + entry->seqNum = std::max(entry->seqNum, store_seq); + entry->generation = generation; DPRINTF(StoreBuffer, "Merging entry[%#x] for addr %#x\n", blockPaddr, paddr); } @@ -2192,7 +2393,9 @@ bool LSQUnit::insertStoreBuffer(Addr vaddr, Addr paddr, uint8_t* datas, uint64_t // insert stats.sbufferNewline++; 
auto entry = storeBuffer.getEmpty(); - entry->reset(lsqID, blockVaddr, blockPaddr, offset, datas, size, mask); + entry->reset(lsqID, store_seq, blockVaddr, blockPaddr, offset, datas, + size, mask, generation); + entry->generation = generation; storeBuffer.insert(entry); DPRINTF(StoreBuffer, "Create new entry[%#x] for addr %#x\n", blockPaddr, paddr); @@ -2411,6 +2614,7 @@ LSQUnit::squash(const InstSeqNum &squashed_num) break; } } + } uint64_t @@ -2538,11 +2742,41 @@ LSQUnit::completeStore(typename StoreQueue::iterator store_idx, bool from_sbuffe * store queue. */ DynInstPtr store_inst = store_idx->instruction(); auto request = store_idx->request(); - DPRINTF(LSQUnit, "Completing store [sn:%lli], idx:%i, store head " "idx:%i\n", store_inst->seqNum, store_idx.idx() - 1, storeQueue.head() - 1); + if (!from_sbuffer && + (!store_inst->isStoreConditional() || store_inst->lockedWriteSuccess()) && + request->mainReq()->hasPaddr()) { + const Addr block_paddr = request->mainReq()->getPaddr() & cacheBlockMask; + auto generation = request->_storeBufferGeneration; + const bool replay_executed_loads = + store_inst->isAtomic() || cpu->consumeSyncVisibleStoreReplay(lsqID); + if (generation == 0) { + generation = lsq->bumpStoreBufferBlockVersion(block_paddr); + } + lsq->invalidateOtherThreadStoreBufferBytes( + lsqID, request->mainReq()->getPaddr(), + request->mainReq()->getByteEnable(), generation); + lsq->markStoreBufferBlockVisible(block_paddr, generation); + lsq->notifyOtherThreadsStoreVisible(lsqID, + request->mainReq()->getPaddr(), + request->mainReq()->getByteEnable(), store_inst->seqNum, + replay_executed_loads); + } + + if (from_sbuffer && + (!store_inst->isStoreConditional() || store_inst->lockedWriteSuccess()) && + request->mainReq()->hasPaddr()) { + auto generation = request->_storeBufferGeneration; + if (generation == 0) { + generation = lsq->bumpStoreBufferBlockVersion( + request->mainReq()->getPaddr() & cacheBlockMask); + request->_storeBufferGeneration = generation; + 
} + } + if (!from_sbuffer && (!store_inst->isStoreConditional() || store_inst->lockedWriteSuccess()) && cpu->goldenMemManager() && @@ -2559,9 +2793,10 @@ LSQUnit::completeStore(typename StoreQueue::iterator store_idx, bool from_sbuffe assert(request->req()->getAtomicOpFunctor()); // The AMO response returns the old memory value. Capture it on the - // instruction so commit/difftest can use a per-inst copy under SMT. - cpu->diffInfo.amoOldGoldenValue = store_inst->getAmoOldGoldenValue(); - memcpy(tmp_data, store_inst->getAmoOldGoldenValuePtr(), request->_size); + // instruction so commit/difftest can use a per-inst golden copy + // under SMT, but derive the new memory image from the DUT-observed + // old value captured in goldenData. + memcpy(tmp_data, store_inst->getGolden(), request->_size); (*(request->req()->getAtomicOpFunctor()))(tmp_data); @@ -2675,11 +2910,15 @@ LSQUnit::trySendPacket(bool isLoad, PacketPtr data_pkt, bool &bank_conflict, boo request->packetSent(); if (isLoad) { - auto &storeBuffer = lsq->getStoreBuffer(); - auto entry = storeBuffer.get(lsqID, pkt->getAddr() & cacheBlockMask); + const Addr block_addr = pkt->getAddr() & cacheBlockMask; + auto entry = lsq->findForwardingStoreBufferEntry( + block_addr, lsqID, request->instruction()->seqNum); if (entry) { DPRINTF(StoreBuffer, "sbuffer entry[%#x] coverage %s\n", entry->blockPaddr, pkt->print()); - if (entry->recordForward(pkt->req, request)) { + if (entry->recordForward( + pkt->req, request, lsqID, + request->instruction()->seqNum, + lsq->currentStoreBufferVisibleVersion(block_addr))) { assert(request->isSplit()); // here must be split request stats.sbufferFullForward++; } else if (!request->SBforwardPackets.empty()) { @@ -2864,8 +3103,12 @@ LSQUnit::dumpInsts() const for (auto it = storeQueue.begin(); it != storeQueue.end(); ++it) { if (it->valid()) { const DynInstPtr &inst(it->instruction()); - cprintf("idx:%d %s.[sn:%llu] %s\n", it.idx(), inst->pcState(), inst->seqNum, - it->addrReady() ? 
"AddrReady" : "Not AddrReady"); + cprintf("idx:%d %s.[sn:%llu] %s squashed=%d canWB=%d completed=%d " + "dataReady=%d staFinish=%d stdFinish=%d\n", + it.idx(), inst->pcState(), inst->seqNum, + it->addrReady() ? "AddrReady" : "Not AddrReady", + inst->isSquashed(), it->canWB(), it->completed(), + it->dataReady(), it->staFinish(), it->stdFinish()); } } @@ -3097,19 +3340,37 @@ LSQUnit::read(LSQRequest *request, ssize_t load_idx) } if (request) { + request->SBforwardPackets.clear(); request->SQforwardPackets.clear(); + request->_sbufferBypass = false; + if (!load_inst->hasPendingCacheReq()) { + request->_goldenSnapshotCaptured = false; + } } // Check the SQ for any previous stores that might lead to forwarding auto store_it = load_inst->sqIt; - panic_if(store_it < storeWBIt, "[sn:%llu] Load instruction's store index is younger than store writeback index", - load_inst->seqNum); - // End once we've reached the top of the LSQ - while (store_it != storeWBIt && !load_inst->isDataPrefetch()) { + if (storeWBIt.dereferenceable()) { + panic_if(store_it < storeWBIt, + "[sn:%llu] Load instruction's store index is younger than " + "store writeback index", + load_inst->seqNum); + } + // End once we've reached the top of the LSQ. If storeWBIt is end(), there + // is no outstanding SQ forwarding window to scan. 
+ while (storeWBIt.dereferenceable() && + store_it != storeWBIt && + !load_inst->isDataPrefetch()) { // Move the index to one younger store_it--; assert(store_it->valid()); assert(store_it->instruction()->seqNum < load_inst->seqNum); + auto store_req = store_it->request(); + + if (store_it->completed()) { + continue; + } + int store_size = store_it->size(); // Cache maintenance instructions go down via the store @@ -3244,9 +3505,6 @@ LSQUnit::read(LSQRequest *request, ssize_t load_idx) "addr %#x, data: %#lx\n", store_it->instruction()->seqNum, load_inst->seqNum, request->mainReq()->getPaddr(), *((uint64_t*)buffer)); } - - - load_inst->setFullForward(); // Don't need to do anything special for split loads. @@ -3298,11 +3556,13 @@ LSQUnit::read(LSQRequest *request, ssize_t load_idx) // sbuffer forward if (!load_inst->isDataPrefetch() && !request->isSplit()) { Addr blk_addr = request->mainReq()->getPaddr() & cacheBlockMask; - int offset = request->mainReq()->getPaddr() & ~cacheBlockMask; - auto &storeBuffer = lsq->getStoreBuffer(); - auto entry = storeBuffer.get(lsqID, blk_addr); + auto entry = lsq->findForwardingStoreBufferEntry( + blk_addr, lsqID, load_inst->seqNum); if (entry) { - if (entry->recordForward(request->mainReq(), request)) { + if (entry->recordForward(request->mainReq(), request, lsqID, + load_inst->seqNum, + lsq->currentStoreBufferVisibleVersion( + blk_addr))) { // full forward // no need to send to cache stats.sbufferFullForward++; @@ -3317,7 +3577,6 @@ LSQUnit::read(LSQRequest *request, ssize_t load_idx) DPRINTF(LoadPipeline, "Load [sn:%llu] forward from sbuffer, data: %lx\n", load_inst->seqNum, *((uint64_t*)buffer)); } - return NoFault; } } @@ -3363,9 +3622,21 @@ LSQUnit::read(LSQRequest *request, ssize_t load_idx) } else { DPRINTF(LoadPipeline, "Load [sn:%llu] sendPacketToCache\n", load_inst->seqNum); // if cannot forward from bus, do real cache access + bool should_capture_golden = + system->multiContextDifftest() && + cpu->goldenMemManager() && + 
cpu->goldenMemManager()->inPmem(request->mainReq()->getPaddr()) && + !request->_goldenSnapshotCaptured; request->buildPackets(); // if the cache is not blocked, do cache access request->sendPacketToCache(); + if (request->isSent() && should_capture_golden) { + uint8_t *issue_golden = + (uint8_t *)cpu->goldenMemManager()->guestToHost( + request->mainReq()->getPaddr()); + load_inst->setGolden(issue_golden); + request->_goldenSnapshotCaptured = true; + } if (!request->isSent() && !load_inst->needBankConflictReplay() && !load_inst->needMshrArbFailReplay() && !load_inst->needMshrAliasFailReplay() &&!load_inst->needHitInWriteBufferReplay()) { iewStage->blockMemInst(load_inst); diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh index 4dace5eb99..76496f94d1 100644 --- a/src/cpu/o3/lsq_unit.hh +++ b/src/cpu/o3/lsq_unit.hh @@ -212,6 +212,8 @@ class LSQUnit bool addrReady() const { return _addrReady; } bool dataReady() const { return _dataReady; } + bool staFinish() const { return _staFinish; } + bool stdFinish() const { return _stdFinish; } bool canForwardToLoad() const { return _addrReady && _dataReady; } bool splitStoreFinish() const { return _staFinish && _stdFinish; } @@ -326,6 +328,10 @@ class LSQUnit * of the intermediate invalidate. */ void checkSnoop(PacketPtr pkt); + void checkLocalStoreVisible(Addr store_paddr, + const std::vector &store_byte_enable, + InstSeqNum store_seq, + bool replay_executed_loads); /** Iq issues a load to load pipeline. */ void issueToLoadPipe(const DynInstPtr &inst); @@ -353,9 +359,12 @@ class LSQUnit /** Writes back stores. 
*/ void offloadToStoreBuffer(uint32_t max_entries); - bool insertStoreBuffer(Addr vaddr, Addr paddr, uint8_t* datas, uint64_t size, const std::vector& mask); + bool insertStoreBuffer(Addr vaddr, Addr paddr, uint8_t* datas, + uint64_t size, const std::vector& mask, + InstSeqNum store_seq); bool storeBufferEmpty() { return lsq->storeBufferEmpty(); } + bool storeBufferEmpty(ThreadID tid) { return lsq->storeBufferEmpty(tid); } bool storeBufferSQWillFull() const { return storeQueue.size() > sqFullUpperLimit; @@ -438,6 +447,9 @@ class LSQUnit /** Returns if there are any stores to writeback. */ bool hasStoresToWB() { return storesToWB > 0; } + /** Returns if there are older stores/atomics still pending writeback. */ + bool hasStoresToWBBefore(InstSeqNum seq_num) const; + /** Returns the number of stores to writeback. */ int numStoresToSbuffer() { return storesToWB; } From 8485ee612541f8c96b985d8644df548c0236a52e Mon Sep 17 00:00:00 2001 From: tastynoob <934348725@qq.com> Date: Mon, 30 Mar 2026 17:37:59 +0800 Subject: [PATCH 07/38] cpu-o3: fix storeData uop squash Change-Id: I146d1ac20d06015e98713f30bae71fef3f5d7bcf --- src/cpu/o3/iew.cc | 5 +++++ src/cpu/o3/lsq.cc | 8 ++++++++ src/cpu/o3/lsq.hh | 1 + src/cpu/o3/lsq_unit.cc | 28 ++++++++++++++++++++++++++++ src/cpu/o3/lsq_unit.hh | 1 + 5 files changed, 43 insertions(+) diff --git a/src/cpu/o3/iew.cc b/src/cpu/o3/iew.cc index 7ea9c872ba..412fc9ab57 100644 --- a/src/cpu/o3/iew.cc +++ b/src/cpu/o3/iew.cc @@ -1576,6 +1576,11 @@ IEW::executeInsts() // executing ppExecute->notify(inst); + if (inst->isSplitStoreData() && + ldstQueue.splitStoreAddrSquashed(inst)) { + inst->setSquashed(); + } + // Check if the instruction is squashed; if so then skip it if (inst->isSquashed()) { DPRINTF(IEW, "Execute: Instruction was squashed. 
PC: %s, [tid:%i]" diff --git a/src/cpu/o3/lsq.cc b/src/cpu/o3/lsq.cc index 9cc59f560d..c7d0c60a9d 100644 --- a/src/cpu/o3/lsq.cc +++ b/src/cpu/o3/lsq.cc @@ -850,6 +850,14 @@ LSQ::insertStore(const DynInstPtr &store_inst) thread[tid].insertStore(store_inst); } +bool +LSQ::splitStoreAddrSquashed(const DynInstPtr &inst) +{ + ThreadID tid = inst->threadNumber; + + return thread[tid].splitStoreAddrSquashed(inst); +} + void LSQ::issueToLoadPipe(const DynInstPtr &inst) { diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh index 83f47b5b91..257cf48354 100644 --- a/src/cpu/o3/lsq.hh +++ b/src/cpu/o3/lsq.hh @@ -841,6 +841,7 @@ class LSQ void insertLoad(const DynInstPtr &load_inst); /** Inserts a store into the LSQ. */ void insertStore(const DynInstPtr &store_inst); + bool splitStoreAddrSquashed(const DynInstPtr &inst); /** Executes an amo inst. */ Fault executeAmo(const DynInstPtr &inst); diff --git a/src/cpu/o3/lsq_unit.cc b/src/cpu/o3/lsq_unit.cc index 9d170af470..76978531c1 100644 --- a/src/cpu/o3/lsq_unit.cc +++ b/src/cpu/o3/lsq_unit.cc @@ -833,6 +833,30 @@ LSQUnit::insertStore(const DynInstPtr& store_inst) storeQueue.back().set(store_inst); } +bool +LSQUnit::splitStoreAddrSquashed(const DynInstPtr &inst) +{ + if (!inst->isSplitStoreData()) { + return false; + } + + if (!storeQueue.isValidIdx(inst->sqIdx)) { + return true; + } + + auto sq_it = storeQueue.getIterator(inst->sqIdx); + if (!sq_it->valid()) { + return true; + } + + const auto &sta_inst = sq_it->instruction(); + if (!sta_inst || sta_inst->seqNum != inst->seqNum) { + return true; + } + + return sta_inst->isSquashed(); +} + bool LSQUnit::pipeLineNukeCheck(const DynInstPtr &load_inst, const DynInstPtr &store_inst) { @@ -1780,6 +1804,10 @@ LSQUnit::executeStorePipeSx() continue; } + if (splitStoreAddrSquashed(inst)) { + inst->setSquashed(); + } + if (inst->isSquashed()) { DPRINTF(StorePipeline, "Execute: Instruction was squashed. 
PC: %s, [tid:%i]" " [sn:%llu]\n", inst->pcState(), inst->threadNumber, diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh index 76496f94d1..633c952d8f 100644 --- a/src/cpu/o3/lsq_unit.hh +++ b/src/cpu/o3/lsq_unit.hh @@ -304,6 +304,7 @@ class LSQUnit void insertLoad(const DynInstPtr &load_inst); /** Inserts a store instruction. */ void insertStore(const DynInstPtr &store_inst); + bool splitStoreAddrSquashed(const DynInstPtr &inst); /** Check for ordering violations in the LSQ. For a store squash if we * ever find a conflicting load. For a load, only squash if we From 1fcfb256aabf7f5307229df1120935b16cd61098 Mon Sep 17 00:00:00 2001 From: tastynoob <934348725@qq.com> Date: Tue, 31 Mar 2026 11:26:19 +0800 Subject: [PATCH 08/38] cpu-o3: fix squash drain and wakeup recovery Change-Id: Icc05a7320ee5bf1495ef98694c3e92847613d79e --- src/cpu/o3/commit.cc | 2 +- src/cpu/o3/issue_queue.cc | 26 +++++++++++++++++++------- src/cpu/o3/issue_queue.hh | 3 ++- src/cpu/o3/rename.cc | 7 +++++-- src/cpu/o3/rob.cc | 30 ++++++++++++++++++++++++++++++ src/cpu/o3/rob.hh | 5 +++++ 6 files changed, 62 insertions(+), 11 deletions(-) diff --git a/src/cpu/o3/commit.cc b/src/cpu/o3/commit.cc index e289754896..746a39872b 100644 --- a/src/cpu/o3/commit.cc +++ b/src/cpu/o3/commit.cc @@ -1283,7 +1283,7 @@ Commit::commitInsts() DPRINTF(Commit, "Retiring squashed instruction from " "ROB.\n"); - rob->retireHead(commit_thread); + rob->drainSquashedHead(commit_thread); ++stats.commitSquashedInsts; // Notify potential listeners that this instruction is squashed diff --git a/src/cpu/o3/issue_queue.cc b/src/cpu/o3/issue_queue.cc index f2d09e17de..d8eaaae2cb 100644 --- a/src/cpu/o3/issue_queue.cc +++ b/src/cpu/o3/issue_queue.cc @@ -393,7 +393,9 @@ IssueQue::checkScoreboard(const DynInstPtr& inst) } // check bypass data ready or not if (!scheduler->bypassScoreboard[src->flatIndex()]) [[unlikely]] { - auto dst_inst = scheduler->getInstByDstReg(src->flatIndex()); + auto dst_inst = 
scheduler->getInstByDstReg(src->flatIndex(), + inst->threadNumber, + inst->seqNum); assert(dst_inst); if (!dst_inst->isLoad()) panic("dst[sn:%llu] is not load, src[sn:%llu]", dst_inst->seqNum, inst->seqNum); warn_once( @@ -1270,18 +1272,28 @@ Scheduler::ready(OpClass op, int disp_seq) } DynInstPtr -Scheduler::getInstByDstReg(RegIndex flatIdx) +Scheduler::getInstByDstReg(RegIndex flatIdx, ThreadID tid, + InstSeqNum consumerSeqNum) { + DynInstPtr candidate = nullptr; + for (auto iq : issueQues) { - for (auto& inst : iq->instList) { - for (auto i = 0; i < inst->numDestRegs(); i++) { - if (inst->renamedDestIdx(i)->flatIndex() == flatIdx) { - return inst; + for (auto &inst : iq->instList) { + if (inst->threadNumber != tid || inst->seqNum >= consumerSeqNum) { + continue; + } + for (int i = 0; i < inst->numDestRegs(); i++) { + if (inst->renamedDestIdx(i)->flatIndex() != flatIdx) { + continue; + } + if (!candidate || inst->seqNum > candidate->seqNum) { + candidate = inst; } } } } - return nullptr; + + return candidate; } void diff --git a/src/cpu/o3/issue_queue.hh b/src/cpu/o3/issue_queue.hh index a91da979db..a4416663a0 100644 --- a/src/cpu/o3/issue_queue.hh +++ b/src/cpu/o3/issue_queue.hh @@ -372,7 +372,8 @@ class Scheduler : public SimObject void issueAndSelect(); void lookahead(std::deque& insts); bool ready(const DynInstPtr& inst, int disp_seq); - DynInstPtr getInstByDstReg(RegIndex flatIdx); + DynInstPtr getInstByDstReg(RegIndex flatIdx, ThreadID tid, + InstSeqNum consumerSeqNum); void addProducer(const DynInstPtr& inst); // return true if insert successful diff --git a/src/cpu/o3/rename.cc b/src/cpu/o3/rename.cc index 84e3e0e031..02c4f40144 100644 --- a/src/cpu/o3/rename.cc +++ b/src/cpu/o3/rename.cc @@ -449,9 +449,12 @@ Rename::releasePhysRegs() } removeFromHistory(releaseSeq[tid], tid); - // If we committed this cycle then doneSeqNum will be > 0 + // doneSeqNum is also reused as a squash-progress marker while the + // ROB is walking younger entries. 
Only real commit progress should + // release physical registers. if (fromCommit->commitInfo[tid].doneSeqNum != 0 && - !fromCommit->commitInfo[tid].squash) { + !fromCommit->commitInfo[tid].squash && + !fromCommit->commitInfo[tid].robSquashing) { finalCommitSeq[tid] = fromCommit->commitInfo[tid].doneSeqNum; releaseSeq[tid] = diff --git a/src/cpu/o3/rob.cc b/src/cpu/o3/rob.cc index 410d7dcfac..d57ea8b0df 100644 --- a/src/cpu/o3/rob.cc +++ b/src/cpu/o3/rob.cc @@ -428,6 +428,36 @@ ROB::retireHead(ThreadID tid) cpu->removeFrontInst(head_inst); } +void +ROB::drainSquashedHead(ThreadID tid) +{ + stats.writes++; + + assert(numInstsInROB > 0); + + InstIt head_it = instList[tid].begin(); + + DynInstPtr head_inst = std::move(*head_it); + instList[tid].erase(head_it); + + assert(head_inst->readyToCommit()); + assert(head_inst->isSquashed()); + + DPRINTF(ROB, "[tid:%i] Draining squashed head instruction, " + "instruction PC %s, [sn:%llu]\n", tid, head_inst->pcState(), + head_inst->seqNum); + + --numInstsInROB; + + commitGroup(head_inst, tid); + + head_inst->clearInROB(); + + updateHead(); + + cpu->removeFrontInst(head_inst); +} + bool ROB::isHeadGroupReady(ThreadID tid) { diff --git a/src/cpu/o3/rob.hh b/src/cpu/o3/rob.hh index 1fdcbf0857..94b93d2593 100644 --- a/src/cpu/o3/rob.hh +++ b/src/cpu/o3/rob.hh @@ -164,6 +164,11 @@ class ROB */ void retireHead(ThreadID tid); + /** Drains a squashed head instruction from a specific thread without + * marking it committed. + */ + void drainSquashedHead(ThreadID tid); + /** Is the oldest instruction across all threads ready. 
*/ // bool isHeadReady(); From a516e81504b6c7a4984c70e0ed13e5e8090ce736 Mon Sep 17 00:00:00 2001 From: mhnGitHubz <66786667+mhnGitHubz@users.noreply.github.com> Date: Tue, 31 Mar 2026 19:51:53 +0800 Subject: [PATCH 09/38] cpu-o3: fix iew smt squash (#809) Co-authored-by: mo haonan --- src/cpu/o3/iew.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/cpu/o3/iew.cc b/src/cpu/o3/iew.cc index 412fc9ab57..a98f92f1ff 100644 --- a/src/cpu/o3/iew.cc +++ b/src/cpu/o3/iew.cc @@ -830,7 +830,6 @@ IEW::checkSquash() fetchRedirect[i] = false; iewStats.stallEvents[ROBWalk]++; setAllStalls(StallReason::CommitSquash); - return; } if (fromCommit->commitInfo[i].robSquashing) { From 0b4960f828010acb6a9a00836bcc6f118f33561c Mon Sep 17 00:00:00 2001 From: tastynoob <934348725@qq.com> Date: Thu, 2 Apr 2026 10:50:17 +0800 Subject: [PATCH 10/38] cpu-o3: fix smt fetch squash & load wakeup & iq init Change-Id: Ic416b537a4f2c87059c92c7b5be81618b1898e22 --- src/arch/riscv/tlb.cc | 1 - src/cpu/o3/issue_queue.cc | 1 + src/cpu/o3/lsq.cc | 42 +++++++++++++++++++++++++++++ src/cpu/o3/lsq.hh | 6 +++++ src/cpu/o3/lsq_unit.cc | 16 ++++++++--- src/cpu/o3/lsq_unit.hh | 2 +- src/cpu/o3/smt_sched.hh | 8 +++--- src/cpu/pred/btb/decoupled_bpred.cc | 3 ++- src/cpu/pred/btb/ftq.cc | 15 +++++++++++ src/cpu/pred/btb/ftq.hh | 1 + 10 files changed, 85 insertions(+), 10 deletions(-) diff --git a/src/arch/riscv/tlb.cc b/src/arch/riscv/tlb.cc index 96077f8273..050e0735b8 100644 --- a/src/arch/riscv/tlb.cc +++ b/src/arch/riscv/tlb.cc @@ -2146,7 +2146,6 @@ TLB::doTranslate(const RequestPtr &req, ThreadContext *tc, return NoFault; } - PrivilegeMode TLB::getMemPriv(ThreadContext *tc, BaseMMU::Mode mode) { diff --git a/src/cpu/o3/issue_queue.cc b/src/cpu/o3/issue_queue.cc index d8eaaae2cb..c3739031aa 100644 --- a/src/cpu/o3/issue_queue.cc +++ b/src/cpu/o3/issue_queue.cc @@ -1750,6 +1750,7 @@ Scheduler::initIQICountSmtScheduler(int numThreads) InstsCounter* counter = iq->getInstsCounter(); assert(counter); 
iq->initIndependentIQICountScheduler(numThreads); + iq->selector->setparent(this, iq); } } diff --git a/src/cpu/o3/lsq.cc b/src/cpu/o3/lsq.cc index c7d0c60a9d..3d72ae1930 100644 --- a/src/cpu/o3/lsq.cc +++ b/src/cpu/o3/lsq.cc @@ -2038,6 +2038,48 @@ LSQ::dumpInsts(ThreadID tid) const thread.at(tid).dumpInsts(); } +void +LSQ::dumpStoreBufferState(ThreadID tid, InstSeqNum seq_num) const +{ + cprintf("Store buffer state for tid %i:\n", tid); + cprintf(" flushing=%d flushBeforeSeq=%llu\n", + _storeBufferFlushing[tid], + static_cast(_storeBufferFlushBeforeSeq[tid])); + cprintf(" storesToWB=%d hasStoresToWBBefore=%d\n", + thread.at(tid).numStoresToSbuffer(), + thread.at(tid).hasStoresToWBBefore(seq_num)); + cprintf(" sbufferSize(tid)=%llu sbufferSizeBeforeSeq=%llu\n", + static_cast(storeBuffer.size(tid)), + static_cast(storeBuffer.size(tid, seq_num))); +} + +void +LSQ::dumpStoreBuffer(ThreadID tid) const +{ + cprintf("Store buffer entries for tid %i:\n", tid); + const auto &entries = storeBuffer.entries(); + for (size_t index = 0; index < entries.size(); ++index) { + if (!storeBuffer.valid(index)) { + continue; + } + + auto *entry = entries[index]; + if (!entry || entry->tid != tid) { + continue; + } + + cprintf(" idx:%d seq:%llu paddr:%#lx vaddr:%#lx sending=%d vice=%d generation=%llu request=%p\n", + entry->index, + static_cast(entry->seqNum), + entry->blockPaddr, + entry->blockVaddr, + entry->sending, + entry->vice != nullptr, + static_cast(entry->generation), + entry->request); + } +} + bool LSQ::isMisaligned(const DynInstPtr& inst, Addr vaddr, int size) { diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh index 257cf48354..66038bf154 100644 --- a/src/cpu/o3/lsq.hh +++ b/src/cpu/o3/lsq.hh @@ -208,6 +208,8 @@ class LSQ uint64_t size(ThreadID tid) const; uint64_t size(ThreadID tid, InstSeqNum seq_num) const; uint64_t unsentSize() const; + const std::vector &entries() const { return data_vec; } + bool valid(size_t index) const { return data_vld.at(index); } 
StoreBufferEntry *getEmpty(); void insert(StoreBufferEntry *entry); StoreBufferEntry *get(ThreadID tid, uint64_t addr) const; @@ -1033,6 +1035,10 @@ class LSQ void dumpInsts() const; /** Debugging function to print out instructions from a specific thread. */ void dumpInsts(ThreadID tid) const; + /** Debugging function to print store-buffer flush state for a thread. */ + void dumpStoreBufferState(ThreadID tid, InstSeqNum seq_num) const; + /** Debugging function to print store-buffer entries for a thread. */ + void dumpStoreBuffer(ThreadID tid) const; bool isMisaligned(const DynInstPtr& inst, Addr vaddr, int size); diff --git a/src/cpu/o3/lsq_unit.cc b/src/cpu/o3/lsq_unit.cc index 76978531c1..389931080d 100644 --- a/src/cpu/o3/lsq_unit.cc +++ b/src/cpu/o3/lsq_unit.cc @@ -1571,10 +1571,20 @@ LSQUnit::executeLoadPipeSx() fault = loadDoTranslate(inst); break; case 1: - iewStage->getScheduler()->specWakeUpFromLoadPipe(inst); - // Loads will mark themselves as executed, and their writeback - // event adds the instruction to the queue to commit fault = loadDoSendRequest(inst); + if (fault == NoFault && + !inst->replayOrSkipFollowingPipe() && + inst->readPredicate() && + inst->readMemAccPredicate() && + inst->savedRequest && + inst->savedRequest->isTranslationComplete() && + inst->savedRequest->isMemAccessRequired()) { + iewStage->getScheduler()->specWakeUpFromLoadPipe( + inst); + } + // Loads will mark themselves as executed, and their + // writeback event adds the instruction to the queue + // to commit. iewStage->SquashCheckAfterExe(inst); break; case 2: diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh index 633c952d8f..2bfa53db38 100644 --- a/src/cpu/o3/lsq_unit.hh +++ b/src/cpu/o3/lsq_unit.hh @@ -452,7 +452,7 @@ class LSQUnit bool hasStoresToWBBefore(InstSeqNum seq_num) const; /** Returns the number of stores to writeback. 
*/ - int numStoresToSbuffer() { return storesToWB; } + int numStoresToSbuffer() const { return storesToWB; } /** Update loadCompletedIdx and storeCompletedIdx */ void updateCompletedIdx(); diff --git a/src/cpu/o3/smt_sched.hh b/src/cpu/o3/smt_sched.hh index e6b00ab4d8..74198c44fd 100644 --- a/src/cpu/o3/smt_sched.hh +++ b/src/cpu/o3/smt_sched.hh @@ -137,12 +137,12 @@ public: ThreadID getThread() override { ThreadID selectedTid = 0; - uint64_t maxCount = counter->getCounter(0); - + uint64_t minCount = counter->getCounter(0); + for (ThreadID tid = 1; tid < numThreads; ++tid) { uint64_t count = counter->getCounter(tid); - if (count > maxCount) { - maxCount = count; + if (count < minCount) { + minCount = count; selectedTid = tid; } } diff --git a/src/cpu/pred/btb/decoupled_bpred.cc b/src/cpu/pred/btb/decoupled_bpred.cc index bb87772263..01e9b78ac3 100644 --- a/src/cpu/pred/btb/decoupled_bpred.cc +++ b/src/cpu/pred/btb/decoupled_bpred.cc @@ -330,7 +330,7 @@ DecoupledBPUWithBTB::generateFinalPredAndCreateBubbles(ThreadID tid) if (ubtb->isEnabled()) { ubtb->updateUsingS3Pred(predsOfEachStage[numStages - 1]); } - if (abtb->isEnabled() && ftq.backId(tid)) { + if (abtb->isEnabled() && !ftq.empty(tid)) { auto previous_block_startpc = ftq.back(tid).startPC; abtb->updateUsingS3Pred(predsOfEachStage[numStages - 1], previous_block_startpc); } else if (abtb->isEnabled()) { @@ -462,6 +462,7 @@ DecoupledBPUWithBTB::handleSquash(ThreadID tid, unsigned target_id, "Ignore squash for tid %u on missing FTQ target %u; " "recovering predictor state from redirect PC %#lx\n", tid, target_id, redirect_pc); + ftq.clear(tid); clearPreds(tid); threads[tid].validprediction = false; threads[tid].s0PC = redirect_pc; diff --git a/src/cpu/pred/btb/ftq.cc b/src/cpu/pred/btb/ftq.cc index 3642ef7162..b8abfe7996 100644 --- a/src/cpu/pred/btb/ftq.cc +++ b/src/cpu/pred/btb/ftq.cc @@ -1,3 +1,5 @@ +#include + #include "ftq.hh" namespace gem5 @@ -53,6 +55,19 @@ FetchTargetQueue::squashAfter(FetchTargetId 
squashId, ThreadID tid) queue[tid].fetchptr = squashId + 1; } +void +FetchTargetQueue::clear(ThreadID tid) +{ + const FetchTargetId nextTargetId = std::max( + queue[tid].fetchptr, + queue[tid].baseTargetId + + static_cast(queue[tid].cap.size())); + + queue[tid].cap.clear(); + queue[tid].baseTargetId = nextTargetId; + queue[tid].fetchptr = nextTargetId; +} + } } diff --git a/src/cpu/pred/btb/ftq.hh b/src/cpu/pred/btb/ftq.hh index c43d071447..c762cd0b83 100644 --- a/src/cpu/pred/btb/ftq.hh +++ b/src/cpu/pred/btb/ftq.hh @@ -80,6 +80,7 @@ public: void finishTarget(ThreadID tid); void commitTarget(ThreadID tid); void squashAfter(FetchTargetId targetId, ThreadID tid); + void clear(ThreadID tid); }; } From b11b00ed446a86b72aac2c423573e0701e93a864 Mon Sep 17 00:00:00 2001 From: tastynoob <934348725@qq.com> Date: Thu, 2 Apr 2026 20:45:22 +0800 Subject: [PATCH 11/38] cpu-o3: fix lsq request lifetime and store completion Change-Id: Ieb232b296c7c99ea216c14c23f135e6e081870a6 --- src/cpu/o3/iew.cc | 4 +- src/cpu/o3/lsq.cc | 67 ++++++++++++++++++++++ src/cpu/o3/lsq.hh | 20 +++++++ src/cpu/o3/lsq_unit.cc | 124 ++++++++++++++++++++++++++++++----------- src/cpu/o3/lsq_unit.hh | 10 ++++ 5 files changed, 190 insertions(+), 35 deletions(-) diff --git a/src/cpu/o3/iew.cc b/src/cpu/o3/iew.cc index a98f92f1ff..04c2b893ca 100644 --- a/src/cpu/o3/iew.cc +++ b/src/cpu/o3/iew.cc @@ -1714,8 +1714,8 @@ IEW::writebackInsts() DynInstPtr inst = toCommit->insts[inst_num]; ThreadID tid = inst->threadNumber; - if (inst->savedRequest && inst->isLoad()) { - inst->pf_source = inst->savedRequest->mainReq()->getPFSource(); + if (inst->isLoad()) { + inst->pf_source = ldstQueue.getLoadPFSource(inst); } DPRINTF(IEW, "Sending instructions to commit, [sn:%lli] PC %s.\n", diff --git a/src/cpu/o3/lsq.cc b/src/cpu/o3/lsq.cc index 3d72ae1930..4fe227f6ac 100644 --- a/src/cpu/o3/lsq.cc +++ b/src/cpu/o3/lsq.cc @@ -1695,6 +1695,29 @@ LSQ::getLSQHeadInst(ThreadID tid, bool isLoad) } } +int +LSQ::getLoadPFSource(const 
DynInstPtr &inst) const +{ + if (!inst || !inst->isLoad() || inst->lqIdx < 0) { + return -1; + } + + const auto &entry = thread[inst->threadNumber].loadQueue[inst->lqIdx]; + auto *request = entry.request(); + if (!request) { + return -1; + } + + // A load can retire through a split request or after replay/discard has + // detached some request state. Prefetch source is best-effort metadata, so + // only query a live sub-request when one still exists. + if (request->numReqs() == 0) { + return -1; + } + + return request->req()->getPFSource(); +} + bool LSQ::isStalled() { @@ -2371,6 +2394,12 @@ LSQ::SplitDataRequest::mainReq() return _mainReq; } +RequestPtr +LSQ::SplitDataRequest::mainReq() const +{ + return _mainReq; +} + void LSQ::SplitDataRequest::initiateTranslation() { @@ -2579,9 +2608,47 @@ LSQ::LSQRequest::forward() } } +void +LSQ::LSQRequest::detachLSQEntry() +{ + if (!_inst) { + return; + } + + if (isLoad() && _inst->lqIdx >= 0 && + _port.loadQueue[_inst->lqIdx].request() == this) { + DPRINTF(LSQ, "inst [sn:%llu] Detach LSQRequest from LQ entry\n", + _inst->seqNum); + _port.loadQueue[_inst->lqIdx].setRequest(nullptr); + } else if ((isAtomic() || _inst->isStore()) && _inst->sqIdx >= 0 && + _port.storeQueue[_inst->sqIdx].request() == this) { + DPRINTF(LSQ, "inst [sn:%llu] Detach LSQRequest from SQ entry\n", + _inst->seqNum); + _port.storeQueue[_inst->sqIdx].setRequest(nullptr); + } +} + +void +LSQ::LSQRequest::detachInflightLoad() +{ + if (!isLoad()) { + return; + } + + auto &inflight = _port.inflightLoads; + auto it = std::find(inflight.begin(), inflight.end(), this); + if (it != inflight.end()) { + DPRINTF(LSQ, "inst [sn:%llu] Detach LSQRequest from inflightLoads\n", + _inst ? 
_inst->seqNum : 0); + inflight.erase(it); + } +} + LSQ::LSQRequest::~LSQRequest() { assert(!isAnyOutstandingRequest()); + detachLSQEntry(); + detachInflightLoad(); if (_inst && _inst->savedRequest == this) { DPRINTF(LSQ, "inst [sn:%llu] Deleting LSQRequest, savedRequest\n", _inst->seqNum); _inst->savedRequest = nullptr; diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh index 66038bf154..159eaa0ab5 100644 --- a/src/cpu/o3/lsq.hh +++ b/src/cpu/o3/lsq.hh @@ -389,6 +389,14 @@ class LSQ /** Install the request in the LQ/SQ. */ void install(); + /** If the request is still installed in the current LQ/SQ slot, + * detach that slot so later scans do not observe a discarded or + * deleted request through the queue entry. */ + void detachLSQEntry(); + + /** Remove the request from the in-flight load tracker if present. */ + void detachInflightLoad(); + bool squashed() const override; @@ -516,6 +524,13 @@ class LSQ return req(); } + virtual RequestPtr + mainReq() const + { + assert (_reqs.size() == 1); + return req(); + } + /** * Test if there is any in-flight translation or mem access request */ @@ -655,6 +670,8 @@ class LSQ void discard() { + detachLSQEntry(); + detachInflightLoad(); release(Flag::Discarded); } @@ -786,6 +803,7 @@ class LSQ virtual bool isCacheBlockHit(Addr blockAddr, Addr cacheBlockMask); virtual RequestPtr mainReq(); + virtual RequestPtr mainReq() const; virtual PacketPtr mainPacket(); virtual std::string name() const { return "SplitDataRequest"; } }; @@ -979,6 +997,8 @@ class LSQ /** Returns whether the head instruction of sq has completed*/ const DynInstPtr& getLSQHeadInst(ThreadID tid, bool isLoad); + int getLoadPFSource(const DynInstPtr &inst) const; + /** * Returns if the LSQ is stalled due to a memory operation that must be * replayed. 
diff --git a/src/cpu/o3/lsq_unit.cc b/src/cpu/o3/lsq_unit.cc index 389931080d..467fd73160 100644 --- a/src/cpu/o3/lsq_unit.cc +++ b/src/cpu/o3/lsq_unit.cc @@ -182,10 +182,10 @@ LSQUnit::SQEntry::setStatus(SplitStoreStatus status) LSQUnit::WritebackRegEvent::WritebackRegEvent(const DynInstPtr &_inst, PacketPtr _pkt, LSQUnit *lsq_ptr) : Event(Default_Pri, AutoDelete), - inst(_inst), pkt(_pkt), lsqPtr(lsq_ptr) + inst(_inst), request(_inst->savedRequest), pkt(_pkt), lsqPtr(lsq_ptr) { - assert(_inst->savedRequest); - _inst->savedRequest->writebackScheduled(); + assert(request); + request->writebackScheduled(); } void @@ -195,8 +195,8 @@ LSQUnit::WritebackRegEvent::process() lsqPtr->writebackReg(inst, pkt); - assert(inst->savedRequest); - inst->savedRequest->writebackDone(); + assert(request); + request->writebackDone(); delete pkt; } @@ -833,6 +833,20 @@ LSQUnit::insertStore(const DynInstPtr& store_inst) storeQueue.back().set(store_inst); } +LSQUnit::LSQRequest * +LSQUnit::currentLoadRequest(const DynInstPtr &inst) +{ + return (inst && inst->lqIdx >= 0) ? loadQueue[inst->lqIdx].request() + : nullptr; +} + +LSQUnit::LSQRequest * +LSQUnit::currentStoreRequest(const DynInstPtr &inst) +{ + return (inst && inst->sqIdx >= 0) ? 
storeQueue[inst->sqIdx].request() + : nullptr; +} + bool LSQUnit::splitStoreAddrSquashed(const DynInstPtr &inst) { @@ -866,9 +880,10 @@ LSQUnit::pipeLineNukeCheck(const DynInstPtr &load_inst, const DynInstPtr &store_ Addr store_eff_addr1 = store_inst->physEffAddr >> depCheckShift; Addr store_eff_addr2 = (store_inst->physEffAddr + store_inst->effSize - 1) >> depCheckShift; - LSQRequest* store_req = store_inst->savedRequest; + LSQRequest* store_req = currentStoreRequest(store_inst); + LSQRequest* load_req = currentLoadRequest(load_inst); // Dont perform pipe line nuke check for split load - bool load_is_splited = load_inst->savedRequest && load_inst->savedRequest->isSplit(); + bool load_is_splited = load_req && load_req->isSplit(); bool load_need_check = !load_is_splited && load_inst->effAddrValid() && (load_inst->lqIt >= store_inst->lqIt); bool store_need_check = store_req && store_req->isTranslationComplete() && @@ -948,7 +963,7 @@ LSQUnit::checkSnoop(PacketPtr pkt) DynInstPtr ld_inst = iter->instruction(); assert(ld_inst); - LSQRequest *request = ld_inst->savedRequest; + LSQRequest *request = iter->request(); // Check that this snoop didn't just invalidate our lock flag if (ld_inst->effAddrValid() && request && @@ -962,7 +977,7 @@ LSQUnit::checkSnoop(PacketPtr pkt) while (++iter != loadQueue.end()) { ld_inst = iter->instruction(); assert(ld_inst); - request = ld_inst->savedRequest;// iter->request(); + request = iter->request(); if (!ld_inst->effAddrValid() || ld_inst->strictlyOrdered()) continue; @@ -1066,7 +1081,10 @@ LSQUnit::checkLocalStoreVisible(Addr store_paddr, continue; } - LSQRequest *request = ld_inst->savedRequest; + LSQRequest *request = it->request(); + // Replay/cancel paths can leave the dyninst carrying a stale + // savedRequest pointer after the active LQ request has been replaced + // or dropped. Only the current queue entry request is safe here. 
if (!request || !request->isCacheBlockHit(block_addr, cacheBlockMask)) { continue; } @@ -1107,8 +1125,27 @@ Fault LSQUnit::checkViolations(typename LoadQueue::iterator& loadIt, const DynInstPtr& inst) { + LSQRequest *request = nullptr; + if (inst->isLoad()) { + if (inst->lqIdx >= 0) { + request = loadQueue[inst->lqIdx].request(); + } + } else if (inst->isStore() || inst->isAtomic()) { + if (inst->sqIdx >= 0) { + request = storeQueue[inst->sqIdx].request(); + } + } + + // Replay/cancel paths can drop the active LSQ request before the + // instruction is retried. In that window the dyninst may still carry a + // stale savedRequest pointer, so only the current LSQ entry request is + // safe to inspect here. + if (!request) { + return NoFault; + } + auto saved_it = loadIt; - for (auto req0 : inst->savedRequest->_reqs) { + for (auto req0 : request->_reqs) { Addr inst_eff_addr1 = req0->getPaddr() >> depCheckShift; Addr inst_eff_addr2 = (req0->getPaddr() + req0->getSize() - 1) >> depCheckShift; @@ -1222,6 +1259,7 @@ LSQUnit::loadSetReplay(DynInstPtr inst, LSQRequest* request, bool dropReqNow) // Reset DTB translation state inst->translationStarted(false); inst->translationCompleted(false); + inst->savedRequest = nullptr; // clear request in loadQueue loadQueue[inst->lqIdx].setRequest(nullptr); if (dropReqNow) { @@ -1291,8 +1329,9 @@ LSQUnit::loadDoTranslate(const DynInstPtr &inst) DPRINTF(LoadPipeline, "Load [sn:%llu] setTLBMissReplay\n", inst->seqNum); } - if (inst->savedRequest && inst->savedRequest->isTranslationComplete()) { - inst->setNormalLd(inst->savedRequest->isNormalLd()); + if (auto *request = currentLoadRequest(inst); + request && request->isTranslationComplete()) { + inst->setNormalLd(request->isNormalLd()); cpu->perfCCT->updateInstMeta(inst->seqNum, InstDetail::VAddress, inst->effAddr); cpu->perfCCT->updateInstMeta(inst->seqNum, InstDetail::PAddress, inst->physEffAddr); @@ -1307,7 +1346,7 @@ LSQUnit::loadDoSendRequest(const DynInstPtr &inst) 
DPRINTF(LoadPipeline, "loadDoSendRequest: load [sn:%lli]\n", inst->seqNum); assert(!inst->isSquashed()); Fault load_fault = inst->getFault(); - LSQRequest* request = inst->savedRequest; + LSQRequest* request = currentLoadRequest(inst); if (inst->effAddrValid()) { for (int i = 0; i < storePipeSx[1]->size; i++) { @@ -1353,9 +1392,9 @@ LSQUnit::loadDoSendRequest(const DynInstPtr &inst) } if (load_fault != NoFault && inst->translationCompleted() && - inst->savedRequest->isPartialFault() - && !inst->savedRequest->isComplete()) { - assert(inst->savedRequest->isSplit()); + request && request->isPartialFault() + && !request->isComplete()) { + assert(request->isSplit()); // If we have a partial fault where the mem access is not complete yet // then the cache must have been blocked. This load will be re-executed // when the cache gets unblocked. We will handle the fault when the @@ -1398,7 +1437,7 @@ LSQUnit::loadDoRecvData(const DynInstPtr &inst) DPRINTF(LoadPipeline, "loadDoRecvData: load [sn:%lli]\n", inst->seqNum); assert(!inst->isSquashed()); - LSQRequest* request = inst->savedRequest; + LSQRequest* request = currentLoadRequest(inst); bool earlyWakeupCacheMissReplay = false; if (inst->wakeUpEarly()) { @@ -1513,7 +1552,7 @@ LSQUnit::loadDoRecvData(const DynInstPtr &inst) // No nuke happens, prepare the inst data // assert(request->isNormalLd() ? 
!request->isAnyOutstandingRequest() : true); - request = inst->savedRequest; + request = currentLoadRequest(inst); if (inst->fullForward()) { DPRINTF(LoadPipeline, "Load [sn:%llu] fullForward\n", inst->seqNum); assert(request); @@ -1570,15 +1609,16 @@ LSQUnit::executeLoadPipeSx() case 0: fault = loadDoTranslate(inst); break; - case 1: + case 1: { fault = loadDoSendRequest(inst); + auto *request = currentLoadRequest(inst); if (fault == NoFault && !inst->replayOrSkipFollowingPipe() && inst->readPredicate() && inst->readMemAccPredicate() && - inst->savedRequest && - inst->savedRequest->isTranslationComplete() && - inst->savedRequest->isMemAccessRequired()) { + request && + request->isTranslationComplete() && + request->isMemAccessRequired()) { iewStage->getScheduler()->specWakeUpFromLoadPipe( inst); } @@ -1587,6 +1627,7 @@ LSQUnit::executeLoadPipeSx() // to commit. iewStage->SquashCheckAfterExe(inst); break; + } case 2: fault = loadDoRecvData(inst); @@ -1653,10 +1694,12 @@ LSQUnit::executeLoadPipeSx() else if (inst->needCacheMissReplay()) iewStage->cacheMissLdReplay(inst); else if (inst->needMdpAddrReplay()) iewStage->mdpAddrReplayPipeDone(inst); else if (inst->needNukeReplay()) { - if (inst->savedRequest && inst->cacheHit()) { - loadSetReplay(inst, inst->savedRequest, true); - } else if (inst->savedRequest && inst->hasPendingCacheReq()) { - loadSetReplay(inst, inst->savedRequest, false); + if (auto *request = currentLoadRequest(inst); request) { + if (inst->cacheHit()) { + loadSetReplay(inst, request, true); + } else if (inst->hasPendingCacheReq()) { + loadSetReplay(inst, request, false); + } } inst->issueQue->retryMem(inst); } @@ -1686,7 +1729,10 @@ LSQUnit::executeLoadPipeSx() } if (i == loadPipeStages - 1 && !inst->needReplay()) { - if (inst->isNormalLd() || !inst->readMemAccPredicate()) iewStage->readyToFinish(inst); + if (inst->isExecuted() && + (inst->isNormalLd() || !inst->readMemAccPredicate())) { + iewStage->readyToFinish(inst); + } 
iewStage->activityThisCycle(); inst->endPipelining(); DPRINTF(LoadPipeline, "Load [sn:%llu] ready to finish\n", @@ -2734,7 +2780,8 @@ LSQUnit::writebackReg(const DynInstPtr &inst, PacketPtr pkt) if (!htm_fault) { assert(dynamic_cast(inst->fault.get()) != nullptr || - inst->savedRequest->isPartialFault()); + (currentLoadRequest(inst) && + currentLoadRequest(inst)->isPartialFault())); } else if (!pkt->htmTransactionFailedInCache()) { // Situation in which the instruction has a hardware @@ -2755,8 +2802,12 @@ LSQUnit::writebackReg(const DynInstPtr &inst, PacketPtr pkt) } } - if (!inst->savedRequest->isNormalLd()) { - // Need to insert instruction into queue to commit + const bool finish_after_writeback = + !inst->isNormalLd() || !inst->inPipe(); + if (finish_after_writeback) { + // Normal loads usually wait for the last pipe stage to enqueue commit. + // If the response arrives after the load has already drained from the + // pipe, writeback must finish the instruction here. iewStage->readyToFinish(inst); iewStage->activityThisCycle(); } @@ -2780,13 +2831,19 @@ LSQUnit::completeStore(typename StoreQueue::iterator store_idx, bool from_sbuffe * store queue. */ DynInstPtr store_inst = store_idx->instruction(); auto request = store_idx->request(); + // Predicated-off or zero-sized stores can legitimately reach completion + // without ever materializing a backing memory request. 
+ const bool has_main_request = + request && request->numReqs() > 0; + const bool has_paddr = + has_main_request && request->mainReq()->hasPaddr(); DPRINTF(LSQUnit, "Completing store [sn:%lli], idx:%i, store head " "idx:%i\n", store_inst->seqNum, store_idx.idx() - 1, storeQueue.head() - 1); if (!from_sbuffer && (!store_inst->isStoreConditional() || store_inst->lockedWriteSuccess()) && - request->mainReq()->hasPaddr()) { + has_paddr) { const Addr block_paddr = request->mainReq()->getPaddr() & cacheBlockMask; auto generation = request->_storeBufferGeneration; const bool replay_executed_loads = @@ -2806,7 +2863,7 @@ LSQUnit::completeStore(typename StoreQueue::iterator store_idx, bool from_sbuffe if (from_sbuffer && (!store_inst->isStoreConditional() || store_inst->lockedWriteSuccess()) && - request->mainReq()->hasPaddr()) { + has_paddr) { auto generation = request->_storeBufferGeneration; if (generation == 0) { generation = lsq->bumpStoreBufferBlockVersion( @@ -2818,6 +2875,7 @@ LSQUnit::completeStore(typename StoreQueue::iterator store_idx, bool from_sbuffe if (!from_sbuffer && (!store_inst->isStoreConditional() || store_inst->lockedWriteSuccess()) && cpu->goldenMemManager() && + has_paddr && cpu->goldenMemManager()->inPmem(request->mainReq()->getPaddr())) { Addr paddr = request->mainReq()->getPaddr(); if (!store_inst->isAtomic()) { diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh index 2bfa53db38..837cc65506 100644 --- a/src/cpu/o3/lsq_unit.hh +++ b/src/cpu/o3/lsq_unit.hh @@ -154,6 +154,7 @@ class LSQUnit } LSQRequest* request() { return _request; } + const LSQRequest* request() const { return _request; } void setRequest(LSQRequest* r) { _request = r; } bool hasRequest() { return _request != nullptr; } /** Member accessors. */ @@ -390,6 +391,12 @@ class LSQUnit /** Check if there exists raw nuke between load and store. 
*/ bool pipeLineNukeCheck(const DynInstPtr &load_inst, const DynInstPtr &store_inst); + /** Returns the current request attached to an active LQ entry. */ + LSQRequest *currentLoadRequest(const DynInstPtr &inst); + + /** Returns the current request attached to an active SQ entry. */ + LSQRequest *currentStoreRequest(const DynInstPtr &inst); + /** Returns the number of free LQ entries. */ unsigned numFreeLoadEntries(); @@ -583,6 +590,9 @@ class LSQUnit /** Instruction whose results are being written back. */ DynInstPtr inst; + /** Request that owns the delayed writeback lifecycle. */ + LSQRequest *request; + /** The packet that would have been sent to memory. */ PacketPtr pkt; From 40bf365d1d262123eb1328740099996f3ff4ebd2 Mon Sep 17 00:00:00 2001 From: tastynoob <934348725@qq.com> Date: Thu, 2 Apr 2026 20:45:46 +0800 Subject: [PATCH 12/38] arch-riscv: fix agnostic vector load fill Change-Id: I11b460b6a6554998d052a020a02d84eb2b0664ad --- .../riscv/isa/vector/base/vector_mem.temp.isa | 28 +++++++++++++++++++ .../isa/vector/simple/vector_mem.temp.isa | 28 +++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/src/arch/riscv/isa/vector/base/vector_mem.temp.isa b/src/arch/riscv/isa/vector/base/vector_mem.temp.isa index e97eef0940..2448a9ad95 100644 --- a/src/arch/riscv/isa/vector/base/vector_mem.temp.isa +++ b/src/arch/riscv/isa/vector/base/vector_mem.temp.isa @@ -1,5 +1,24 @@ output header {{ +#define FILL_AGNOSTIC_ELEM(Vd_bytes, elem_idx, eewb_) \ + std::fill_n((Vd_bytes) + (elem_idx) * (eewb_), (eewb_), 0xff) + +#define APPLY_VLOAD_AGNOSTIC(Vd_bytes, elem_num_per_vreg_, eewb_) \ + do { \ + for (size_t _i = 0; _i < vmi.re - vmi.rs; ++_i) { \ + const uint32_t _vdElemIdx = \ + (vmi.rs % (elem_num_per_vreg_)) + _i; \ + const size_t _ei = _i + vmi.rs; \ + const bool _is_tail = _ei >= rVl; \ + const bool _is_masked = !this->vm && !_is_tail && \ + !elem_mask(v0, _ei); \ + if ((_is_tail && machInst.vtype8.vta) || \ + (_is_masked && machInst.vtype8.vma)) { \ + 
FILL_AGNOSTIC_ELEM((Vd_bytes), _vdElemIdx, (eewb_)); \ + } \ + } \ + } while (0) + inline uint32_t calc_memsize(uint32_t rs, uint32_t re, uint32_t sew, uint32_t rVl) { uint32_t vend = std::min(rVl, re); @@ -147,6 +166,7 @@ Fault { %(op_decl)s; %(op_rd)s; + auto VdBytes = tmp_d0.as(); Addr EA; // EA = Rs1 + vmi.offset; @@ -172,6 +192,8 @@ Fault %(memacc_code)s; } + APPLY_VLOAD_AGNOSTIC(VdBytes, elem_num_per_vreg, eew / 8); + %(op_wb)s; return fault; } @@ -261,6 +283,7 @@ Fault %(op_decl)s; %(op_rd)s; + auto VdBytes = tmp_d0.as(); #if %(is_vecWhole)s // VM_REQUIRED(); @@ -299,6 +322,11 @@ Fault } } +#if %(is_vecWhole)s +#else + APPLY_VLOAD_AGNOSTIC(VdBytes, elem_num_per_vreg, eewb); +#endif + %(vfof_get_code)s; %(op_wb)s; return NoFault; diff --git a/src/arch/riscv/isa/vector/simple/vector_mem.temp.isa b/src/arch/riscv/isa/vector/simple/vector_mem.temp.isa index a8e5b71f99..4b64f5dac0 100644 --- a/src/arch/riscv/isa/vector/simple/vector_mem.temp.isa +++ b/src/arch/riscv/isa/vector/simple/vector_mem.temp.isa @@ -1,5 +1,24 @@ output header {{ +#define FILL_AGNOSTIC_ELEM(Vd_bytes, elem_idx, eewb_) \ + std::fill_n((Vd_bytes) + (elem_idx) * (eewb_), (eewb_), 0xff) + +#define APPLY_VLOAD_AGNOSTIC(Vd_bytes, elem_num_per_vreg_, eewb_) \ + do { \ + for (size_t _i = 0; _i < vmi.re - vmi.rs; ++_i) { \ + const uint32_t _vdElemIdx = \ + (vmi.rs % (elem_num_per_vreg_)) + _i; \ + const size_t _ei = _i + vmi.rs; \ + const bool _is_tail = _ei >= rVl; \ + const bool _is_masked = !this->vm && !_is_tail && \ + !elem_mask(v0, _ei); \ + if ((_is_tail && machInst.vtype8.vta) || \ + (_is_masked && machInst.vtype8.vma)) { \ + FILL_AGNOSTIC_ELEM((Vd_bytes), _vdElemIdx, (eewb_)); \ + } \ + } \ + } while (0) + inline uint32_t calc_memsize(uint32_t rs, uint32_t re, uint32_t sew, uint32_t rVl) { uint32_t vend = std::min(rVl, re); @@ -147,6 +166,7 @@ Fault { %(op_decl)s; %(op_rd)s; + auto VdBytes = tmp_d0.as(); Addr EA; // EA = Rs1 + vmi.offset; @@ -172,6 +192,8 @@ Fault %(memacc_code)s; } + 
APPLY_VLOAD_AGNOSTIC(VdBytes, elem_num_per_vreg, eew / 8); + %(op_wb)s; return fault; } @@ -261,6 +283,7 @@ Fault %(op_decl)s; %(op_rd)s; + auto VdBytes = tmp_d0.as(); #if %(is_vecWhole)s // VM_REQUIRED(); @@ -299,6 +322,11 @@ Fault } } +#if %(is_vecWhole)s +#else + APPLY_VLOAD_AGNOSTIC(VdBytes, elem_num_per_vreg, eewb); +#endif + %(vfof_get_code)s; %(op_wb)s; return NoFault; From a125904b9f30ee07218925db89d72eacea2bca46 Mon Sep 17 00:00:00 2001 From: tastynoob <934348725@qq.com> Date: Wed, 8 Apr 2026 20:06:40 +0800 Subject: [PATCH 13/38] cpu: add asid hash to decoupled btb Change-Id: Ice8e66d841a40e8c8420bd4756237eb9399d1642 --- src/cpu/pred/btb/abtb.cc | 61 ++++++++++++++++++----------- src/cpu/pred/btb/abtb.hh | 14 ++++--- src/cpu/pred/btb/btb_ittage.cc | 33 +++++++++------- src/cpu/pred/btb/btb_ittage.hh | 11 +++--- src/cpu/pred/btb/btb_tage.cc | 57 +++++++++++++++------------ src/cpu/pred/btb/btb_tage.hh | 15 ++++--- src/cpu/pred/btb/btb_ubtb.cc | 43 ++++++++++++-------- src/cpu/pred/btb/btb_ubtb.hh | 13 +++--- src/cpu/pred/btb/common.hh | 48 +++++++++++++++++++++++ src/cpu/pred/btb/decoupled_bpred.cc | 17 ++++++++ src/cpu/pred/btb/decoupled_bpred.hh | 1 + src/cpu/pred/btb/mbtb.cc | 27 +++++++------ src/cpu/pred/btb/mbtb.hh | 16 ++++---- src/cpu/pred/btb/microtage.cc | 48 ++++++++++++++--------- src/cpu/pred/btb/microtage.hh | 13 +++--- 15 files changed, 274 insertions(+), 143 deletions(-) diff --git a/src/cpu/pred/btb/abtb.cc b/src/cpu/pred/btb/abtb.cc index aeafc9bb38..8013900e83 100644 --- a/src/cpu/pred/btb/abtb.cc +++ b/src/cpu/pred/btb/abtb.cc @@ -166,28 +166,42 @@ AheadBTB::setTrace() std::vector AheadBTB::processEntries(const std::vector& entries, Addr startAddr) { - int hitNum = entries.size(); - bool hit = hitNum > 0; + auto processed_entries = entries; + // Sort by instruction order + std::sort(processed_entries.begin(), processed_entries.end(), + [](const BTBEntry &a, const BTBEntry &b) { + return a.pc < b.pc; + }); + + auto it = 
std::remove_if(processed_entries.begin(), processed_entries.end(), + [startAddr](const BTBEntry &e) { + return e.pc < startAddr; + }); + processed_entries.erase(it, processed_entries.end()); + + Addr abtb_end = (startAddr + predictWidth) & + ~mask(floorLog2(predictWidth) - 1); + it = std::remove_if(processed_entries.begin(), processed_entries.end(), + [abtb_end](const BTBEntry &e) { + return e.pc >= abtb_end; + }); + processed_entries.erase(it, processed_entries.end()); + + int hitNum = processed_entries.size(); + bool hit = hitNum > 0; + // Update prediction statistics if (hit) { DPRINTF(ABTB, "BTB: lookup hit, dumping hit entry\n"); btbStats.predHit += hitNum; - for (auto &entry: entries) { + for (auto &entry: processed_entries) { printTickedBTBEntry(entry); } } else { btbStats.predMiss++; DPRINTF(ABTB, "BTB: lookup miss\n"); } - - auto processed_entries = entries; - - // Sort by instruction order - std::sort(processed_entries.begin(), processed_entries.end(), - [](const BTBEntry &a, const BTBEntry &b) { - return a.pc < b.pc; - }); return processed_entries; } @@ -299,12 +313,13 @@ AheadBTB::putPCHistory(Addr startAddr, std::vector &stagePreds) { meta = std::make_shared(); + const uint8_t asidHash = stagePreds.empty() ? 
0 : stagePreds.front().asidHash; // Lookup all matching entries in BTB - auto find_entries = lookup(startAddr); - + auto find_entries = lookup(startAddr, asidHash); + // Process BTB entries auto processed_entries = processEntries(find_entries, startAddr); - + // Fill predictions for each pipeline stage fillStagePredictions(processed_entries, stagePreds); @@ -343,13 +358,13 @@ AheadBTB::recoverHist(const boost::dynamic_bitset<> &history, const FetchTarget * @return Vector of matching BTB entries */ std::vector -AheadBTB::lookupSingleBlock(Addr block_pc) +AheadBTB::lookupSingleBlock(Addr block_pc, uint8_t asidHash) { std::vector res; if (block_pc & 0x1) { return res; // ignore false hit when lowest bit is 1 } - Addr btb_idx = getIndex(block_pc); + Addr btb_idx = getIndex(block_pc, asidHash); auto btb_set = btb[btb_idx]; assert(btb_idx < numSets); // AheadBTB always uses ahead-pipelined implementation: @@ -357,7 +372,7 @@ AheadBTB::lookupSingleBlock(Addr block_pc) DPRINTF(AheadPipeline, "AheadBTB: pushing set for ahead-pipelined stages, idx %ld\n", btb_idx); aheadReadBtbEntries.push(std::make_tuple(block_pc, btb_idx, btb_set)); - Addr tag_curStartpc = getTag(block_pc);// abtb uses current FB pc to get tag + Addr tag_curStartpc = getTag(block_pc, asidHash);// abtb uses current FB pc to get tag Addr pc = 0; Addr idx_prvStartpc = 0;// abtb uses previous FB pc to get index BTBSet set; @@ -392,7 +407,7 @@ AheadBTB::lookupSingleBlock(Addr block_pc) } std::vector -AheadBTB::lookup(Addr block_pc) +AheadBTB::lookup(Addr block_pc, uint8_t asidHash) { std::vector res; if (block_pc & 0x1) { @@ -400,7 +415,7 @@ AheadBTB::lookup(Addr block_pc) } // AheadBTB always uses single block lookup - res = lookupSingleBlock(block_pc); + res = lookupSingleBlock(block_pc, asidHash); return res; } @@ -594,12 +609,12 @@ AheadBTB::updateUsingS3Pred(FullBTBPrediction &s3Pred, const Addr previousPC) for (auto &entry : entries_to_update) { Addr startPC = s3Pred.bbStart; - Addr btb_tag = 
getTag(startPC); // use last pc to get tag + Addr btb_tag = getTag(startPC, s3Pred.asidHash); // use last pc to get tag if (previousPC == 0) { DPRINTF(ABTB, "AheadBTB: no previous PC, skipping update\n"); return; } - Addr btb_idx = getIndex(previousPC); // use last pc to get idx + Addr btb_idx = getIndex(previousPC, s3Pred.asidHash); // use last pc to get idx BranchInfo takenbranchinfo; takenbranchinfo.pc = s3Pred.getTakenEntry().pc; takenbranchinfo.target = s3Pred.getTakenEntry().target; @@ -670,7 +685,7 @@ AheadBTB::update(const FetchTarget &stream) // 4. Update BTB entries - each entry uses its own PC to calculate index and tag for (auto &entry : entries_to_update) { Addr startPC = stream.getRealStartPC(); - Addr btb_tag = getTag(startPC); // use current pc to get tag + Addr btb_tag = getTag(startPC, stream.asidHash); // use current pc to get tag // AheadBTB always uses ahead-pipelined update logic Addr previousPC = getPreviousPC(stream); @@ -678,7 +693,7 @@ AheadBTB::update(const FetchTarget &stream) DPRINTF(ABTB, "AheadBTB: no previous PC, skipping update\n"); return; } - Addr btb_idx = getIndex(previousPC); // use last pc to get idx + Addr btb_idx = getIndex(previousPC, stream.asidHash); // use last pc to get idx entry.source = getComponentIdx(); // mark the entry source as AheadBTB updateBTBEntry(btb_idx, btb_tag, entry, stream.exeBranchInfo, stream.exeTaken); } diff --git a/src/cpu/pred/btb/abtb.hh b/src/cpu/pred/btb/abtb.hh index 677f5f7f32..e5e29f7ffd 100644 --- a/src/cpu/pred/btb/abtb.hh +++ b/src/cpu/pred/btb/abtb.hh @@ -224,8 +224,9 @@ class AheadBTB : public TimedBaseBTBPredictor * @param inst_PC The branch to look up. * @return Returns the index into the BTB. 
*/ - inline Addr getIndex(Addr instPC) { - return (instPC >> idxShiftAmt) & idxMask; + inline Addr getIndex(Addr instPC, uint8_t asidHash) { + Addr baseIndex = (instPC >> idxShiftAmt) & idxMask; + return xorAsidHashIntoIndex(baseIndex, floorLog2(numSets), asidHash); } /** Returns the tag bits of a given address. @@ -234,8 +235,9 @@ class AheadBTB : public TimedBaseBTBPredictor * @param inst_PC The branch's address. * @return Returns the tag bits. */ - inline Addr getTag(Addr instPC) { - return (instPC >> tagShiftAmt) & tagMask; + inline Addr getTag(Addr instPC, uint8_t asidHash) { + Addr baseTag = (instPC >> tagShiftAmt) & tagMask; + return injectAsidHashIntoTag(baseTag, tagBits, asidHash); } @@ -365,13 +367,13 @@ class AheadBTB : public TimedBaseBTBPredictor * @param inst_PC The address of the block to look up. * @return Returns all hit BTB entries. */ - std::vector lookup(Addr block_pc); + std::vector lookup(Addr block_pc, uint8_t asidHash); /** Helper function to lookup entries in a single block * @param block_pc The aligned PC to lookup * @return Vector of matching BTB entries */ - std::vector lookupSingleBlock(Addr block_pc); + std::vector lookupSingleBlock(Addr block_pc, uint8_t asidHash); /** The BTB structure: * - Organized as numSets sets diff --git a/src/cpu/pred/btb/btb_ittage.cc b/src/cpu/pred/btb/btb_ittage.cc index e625650d10..dd5bc40008 100644 --- a/src/cpu/pred/btb/btb_ittage.cc +++ b/src/cpu/pred/btb/btb_ittage.cc @@ -102,8 +102,9 @@ BTBITTAGE::tick() {} void BTBITTAGE::lookupHelper(Addr startAddr, const std::vector &btbEntries, - IndirectTargets& results, ThreadID tid) + IndirectTargets& results, ThreadID tid, uint8_t asidHash) { + (void)asidHash; DPRINTF(ITTAGE, "lookupHelper startAddr: %#lx\n", startAddr); std::vector preds; for (auto &btb_entry : btbEntries) { @@ -192,6 +193,7 @@ BTBITTAGE::dryRunCycle(Addr startPC) { void BTBITTAGE::putPCHistory(Addr stream_start, const bitset &history, std::vector &stagePreds) { const ThreadID tid = 
predictorTid(stagePreds); + const uint8_t asidHash = stagePreds.empty() ? 0 : stagePreds.front().asidHash; const auto &state = historyState(tid); if (debugPC == stream_start) { debugFlag = true; @@ -212,9 +214,9 @@ BTBITTAGE::putPCHistory(Addr stream_start, const bitset &history, std::vector> floorLog2(blockSize)); - return (pcBits ^ foldedHist) & mask; + return xorAsidHashIntoIndex((pcBits ^ foldedHist) & mask, tableIndexBits[t], asidHash); } Addr -BTBITTAGE::getTageIndex(Addr pc, int t) +BTBITTAGE::getTageIndex(Addr pc, int t, uint8_t asidHash) { - return getTageIndex(pc, t, historyState(0).indexFoldedHist[t].get()); + return getTageIndex(pc, t, historyState(0).indexFoldedHist[t].get(), asidHash); } bool diff --git a/src/cpu/pred/btb/btb_ittage.hh b/src/cpu/pred/btb/btb_ittage.hh index 8269fdaeb6..7db7e39350 100644 --- a/src/cpu/pred/btb/btb_ittage.hh +++ b/src/cpu/pred/btb/btb_ittage.hh @@ -125,19 +125,20 @@ class BTBITTAGE : public TimedBaseBTBPredictor // return provided void lookupHelper(Addr stream_start, const std::vector &btbEntries, - IndirectTargets& results, ThreadID tid); + IndirectTargets& results, ThreadID tid, uint8_t asidHash); // use blockPC - Addr getTageIndex(Addr pc, int table); + Addr getTageIndex(Addr pc, int table, uint8_t asidHash = 0); // use blockPC (uint64_t version for performance) - Addr getTageIndex(Addr pc, int table, uint64_t foldedHist); + Addr getTageIndex(Addr pc, int table, uint64_t foldedHist, uint8_t asidHash = 0); // use blockPC - Addr getTageTag(Addr pc, int table); + Addr getTageTag(Addr pc, int table, uint8_t asidHash = 0); // use blockPC (uint64_t version for performance) - Addr getTageTag(Addr pc, int table, uint64_t foldedHist, uint64_t altFoldedHist); + Addr getTageTag(Addr pc, int table, uint64_t foldedHist, uint64_t altFoldedHist, + uint8_t asidHash = 0); Addr getOffset(Addr pc) { return (pc & (blockSize - 1)) >> 1; diff --git a/src/cpu/pred/btb/btb_tage.cc b/src/cpu/pred/btb/btb_tage.cc index c81bfb1a1d..7623e591c3 
100644 --- a/src/cpu/pred/btb/btb_tage.cc +++ b/src/cpu/pred/btb/btb_tage.cc @@ -297,7 +297,8 @@ BTBTAGE::TagePrediction BTBTAGE::generateSinglePrediction(const BTBEntry &btb_entry, const Addr &startPC, std::shared_ptr predMeta, - ThreadID tid) { + ThreadID tid, + uint8_t asidHash) { DPRINTF(TAGE, "generateSinglePrediction for btbEntry: %#lx\n", btb_entry.pc); const auto &state = historyState(tid); @@ -314,12 +315,13 @@ BTBTAGE::generateSinglePrediction(const BTBEntry &btb_entry, for (int i = numPredictors - 1; i >= 0; --i) { // Calculate index and tag: use snapshot if provided, otherwise use current folded history // Tag includes position XOR (like RTL: tag = tempTag ^ cfiPosition) - Addr index = predMeta ? getTageIndex(startPC, i, predMeta->indexFoldedHist[i].get()) - : getTageIndex(startPC, i, state.indexFoldedHist[i].get()); + Addr index = predMeta ? getTageIndex(startPC, i, predMeta->indexFoldedHist[i].get(), asidHash) + : getTageIndex(startPC, i, state.indexFoldedHist[i].get(), asidHash); Addr tag = predMeta ? 
getTageTag(startPC, i, - predMeta->tagFoldedHist[i].get(), predMeta->altTagFoldedHist[i].get(), position) + predMeta->tagFoldedHist[i].get(), predMeta->altTagFoldedHist[i].get(), + position, asidHash) : getTageTag(startPC, i, state.tagFoldedHist[i].get(), - state.altTagFoldedHist[i].get(), position); + state.altTagFoldedHist[i].get(), position, asidHash); bool match = false; // for each table, only one way can be matched TageEntry matching_entry; @@ -416,7 +418,7 @@ BTBTAGE::generateSinglePrediction(const BTBEntry &btb_entry, void BTBTAGE::lookupHelper(const Addr &startPC, const std::vector &btbEntries, std::unordered_map &tageInfoForMgscs, - CondTakens& results, ThreadID tid) + CondTakens& results, ThreadID tid, uint8_t asidHash) { DPRINTF(TAGE, "lookupHelper startAddr: %#lx\n", startPC); @@ -424,7 +426,7 @@ BTBTAGE::lookupHelper(const Addr &startPC, const std::vector &btbEntri for (auto &btb_entry : btbEntries) { // Only predict for valid conditional branches if (btb_entry.isCond && btb_entry.valid) { - auto pred = generateSinglePrediction(btb_entry, startPC, nullptr, tid); + auto pred = generateSinglePrediction(btb_entry, startPC, nullptr, tid, asidHash); threadMeta[tid]->preds[btb_entry.pc] = pred; tageStats.updateStatsWithTagePrediction(pred, true); results.push_back({btb_entry.pc, pred.taken || btb_entry.alwaysTaken}); @@ -468,6 +470,7 @@ BTBTAGE::dryRunCycle(Addr startPC) { void BTBTAGE::putPCHistory(Addr startPC, const bitset &history, std::vector &stagePreds) { const ThreadID tid = predictorTid(stagePreds); + const uint8_t asidHash = stagePreds.empty() ? 
0 : stagePreds.front().asidHash; const auto &state = historyState(tid); // Record prediction bank for next tick's conflict detection lastPredBankId = getBankId(startPC); @@ -497,7 +500,7 @@ BTBTAGE::putPCHistory(Addr startPC, const bitset &history, std::vector meta, + uint8_t asidHash, AllocationTraceInfo &allocInfo) { // Match RTL victim priority: // 1) invalid way @@ -738,9 +743,9 @@ BTBTAGE::handleNewEntryAllocation(const Addr &startPC, unsigned position = getBranchIndexInBlock(entry.pc, startPC); for (unsigned ti = start_table; ti < numPredictors; ++ti) { - Addr newIndex = getTageIndex(startPC, ti, meta->indexFoldedHist[ti].get()); + Addr newIndex = getTageIndex(startPC, ti, meta->indexFoldedHist[ti].get(), asidHash); Addr newTag = getTageTag(startPC, ti, - meta->tagFoldedHist[ti].get(), meta->altTagFoldedHist[ti].get(), position); + meta->tagFoldedHist[ti].get(), meta->altTagFoldedHist[ti].get(), position, asidHash); auto &set = tageTable[ti][newIndex]; @@ -917,10 +922,12 @@ BTBTAGE::update(const FetchTarget &stream) { TagePrediction recomputed; if (updateOnRead || !has_original_pred) { - // Reconstruct providers when update-on-read is enabled or when a new - // BTB entry lacks prediction-time metadata. 
- recomputed = generateSinglePrediction(btb_entry, startAddr, predMeta); - if (has_original_pred && recomputed.taken != original_pred.taken) { + // Re-read providers using snapshot (do not rely on prediction-time main/alt) + recomputed = generateSinglePrediction(btb_entry, startAddr, predMeta, + stream.tid, stream.asidHash); + // Track differences for statistics + auto it = predMeta->preds.find(btb_entry.pc); + if (has_original_pred && it != predMeta->preds.end() && recomputed.taken != original_pred.taken) { hasRecomputedVsOriginalDiff = true; } } else { // otherwise, use the prediction from the prediction-time main/alt @@ -944,7 +951,8 @@ BTBTAGE::update(const FetchTarget &stream) { start_table = main_info.table + 1; // start from the table after the main prediction table } handleNewEntryAllocation(startAddr, btb_entry, actual_taken, - start_table, predMeta, allocInfo); + start_table, predMeta, stream.asidHash, + allocInfo); } #ifndef UNIT_TEST @@ -1051,7 +1059,8 @@ BTBTAGE::updateCounter(bool taken, unsigned width, short &counter) { // Calculate TAGE tag with folded history - optimized version using bitwise operations Addr -BTBTAGE::getTageTag(Addr pc, int t, uint64_t foldedHist, uint64_t altFoldedHist, Addr position) +BTBTAGE::getTageTag(Addr pc, int t, uint64_t foldedHist, uint64_t altFoldedHist, + Addr position, uint8_t asidHash) { // Create mask for tableTagBits[t] to limit result size Addr mask = (1ULL << tableTagBits[t]) - 1; @@ -1067,19 +1076,20 @@ BTBTAGE::getTageTag(Addr pc, int t, uint64_t foldedHist, uint64_t altFoldedHist, Addr altTagBits = (altFoldedHist << 1) & mask; // XOR all components together, including position (like RTL) - return pcBits ^ foldedBits ^ altTagBits ^ position; + return injectAsidHashIntoTag(pcBits ^ foldedBits ^ altTagBits ^ position, + tableTagBits[t], asidHash); } Addr -BTBTAGE::getTageTag(Addr pc, int t, Addr position) +BTBTAGE::getTageTag(Addr pc, int t, Addr position, uint8_t asidHash) { const auto &state = historyState(0); 
return getTageTag(pc, t, state.tagFoldedHist[t].get(), - state.altTagFoldedHist[t].get(), position); + state.altTagFoldedHist[t].get(), position, asidHash); } Addr -BTBTAGE::getTageIndex(Addr pc, int t, uint64_t foldedHist) +BTBTAGE::getTageIndex(Addr pc, int t, uint64_t foldedHist, uint8_t asidHash) { // Create mask for tableIndexBits[t] to limit result size Addr mask = (1ULL << tableIndexBits[t]) - 1; @@ -1088,14 +1098,13 @@ BTBTAGE::getTageIndex(Addr pc, int t, uint64_t foldedHist) Addr pcBits = (pc >> pcShift) & mask; Addr foldedBits = foldedHist & mask; - // Support non-power-of-two table sizes when tuning capacities. - return (pcBits ^ foldedBits) % tableSizes[t]; + return xorAsidHashIntoIndex(pcBits ^ foldedBits, tableIndexBits[t], asidHash) % tableSizes[t]; } Addr -BTBTAGE::getTageIndex(Addr pc, int t) +BTBTAGE::getTageIndex(Addr pc, int t, uint8_t asidHash) { - return getTageIndex(pc, t, historyState(0).indexFoldedHist[t].get()); + return getTageIndex(pc, t, historyState(0).indexFoldedHist[t].get(), asidHash); } bool diff --git a/src/cpu/pred/btb/btb_tage.hh b/src/cpu/pred/btb/btb_tage.hh index 33bd6826ae..42650a6ea1 100644 --- a/src/cpu/pred/btb/btb_tage.hh +++ b/src/cpu/pred/btb/btb_tage.hh @@ -179,21 +179,22 @@ class BTBTAGE : public TimedBaseBTBPredictor // Look up predictions in TAGE tables for a stream of instructions void lookupHelper(const Addr &startPC, const std::vector &btbEntries, std::unordered_map &tageInfoForMgscs, - CondTakens& results, ThreadID tid); + CondTakens& results, ThreadID tid, uint8_t asidHash); // Calculate TAGE index for a given PC and table - Addr getTageIndex(Addr pc, int table); + Addr getTageIndex(Addr pc, int table, uint8_t asidHash = 0); // Calculate TAGE index with folded history (uint64_t version for performance) - Addr getTageIndex(Addr pc, int table, uint64_t foldedHist); + Addr getTageIndex(Addr pc, int table, uint64_t foldedHist, uint8_t asidHash = 0); // Calculate TAGE tag for a given PC and table // position: 
branch position within the block (xored into tag like RTL) - Addr getTageTag(Addr pc, int table, Addr position = 0); + Addr getTageTag(Addr pc, int table, Addr position = 0, uint8_t asidHash = 0); // Calculate TAGE tag with folded history (uint64_t version for performance) // position: branch position within the block (xored into tag like RTL) - Addr getTageTag(Addr pc, int table, uint64_t foldedHist, uint64_t altFoldedHist, Addr position = 0); + Addr getTageTag(Addr pc, int table, uint64_t foldedHist, uint64_t altFoldedHist, + Addr position = 0, uint8_t asidHash = 0); // Get offset within a block for a given PC Addr getOffset(Addr pc) { @@ -466,7 +467,8 @@ private: TagePrediction generateSinglePrediction(const BTBEntry &btb_entry, const Addr &startPC, const std::shared_ptr predMeta = nullptr, - ThreadID tid = 0); + ThreadID tid = 0, + uint8_t asidHash = 0); // Helper method to prepare BTB entries for update std::vector prepareUpdateEntries(const FetchTarget &stream); @@ -483,6 +485,7 @@ private: bool actual_taken, unsigned main_table, std::shared_ptr meta, + uint8_t asidHash, AllocationTraceInfo &allocInfo); diff --git a/src/cpu/pred/btb/btb_ubtb.cc b/src/cpu/pred/btb/btb_ubtb.cc index 755d8d8460..5f809713c3 100644 --- a/src/cpu/pred/btb/btb_ubtb.cc +++ b/src/cpu/pred/btb/btb_ubtb.cc @@ -137,7 +137,8 @@ void UBTB::putPCHistory(Addr startAddr, const boost::dynamic_bitset<> &history, std::vector &stagePreds) { meta = std::make_shared(); - auto it = lookup(startAddr); + const uint8_t asidHash = stagePreds.empty() ? 0 : stagePreds.front().asidHash; + auto it = lookup(startAddr, asidHash); auto& entry = meta->hit_entry; entry = (it != ubtb.end()) ? 
*it : TickedUBTBEntry(); @@ -151,23 +152,29 @@ UBTB::putPCHistory(Addr startAddr, const boost::dynamic_bitset<> &history, std:: } UBTB::UBTBIter -UBTB::lookup(Addr startAddr) +UBTB::lookup(Addr startAddr, uint8_t asidHash) { if (startAddr & 0x1) { return ubtb.end(); // ignore false hit when lowest bit is 1 } - Addr current_tag = getTag(startAddr); + Addr current_tag = getTag(startAddr, asidHash); + Addr block_end = (startAddr + predictWidth) & ~mask(floorLog2(predictWidth) - 1); DPRINTF(UBTB, "UBTB: Doing tag comparison for tag %#lx\n", current_tag); auto it = std::find_if(ubtb.begin(), ubtb.end(), - [current_tag](const TickedUBTBEntry &way) { return way.valid && way.tag == current_tag; }); + [current_tag, startAddr, block_end](const TickedUBTBEntry &way) { + return way.valid && way.tag == current_tag && + way.pc >= startAddr && way.pc < block_end; + }); if (it != ubtb.end()) { // Found a hit - verify no duplicates - auto duplicate = std::find_if(std::next(it), ubtb.end(), [current_tag](const TickedUBTBEntry &way) { - return way.valid && way.tag == current_tag; + auto duplicate = std::find_if(std::next(it), ubtb.end(), + [current_tag, startAddr, block_end](const TickedUBTBEntry &way) { + return way.valid && way.tag == current_tag && + way.pc >= startAddr && way.pc < block_end; }); if (duplicate != ubtb.end()) { DPRINTF(UBTB, "UBTB: Multiple hits found in uBTB for the same tag %#lx\n", current_tag); @@ -184,7 +191,8 @@ UBTB::lookup(Addr startAddr) void -UBTB::replaceOldEntry(UBTBIter oldEntryIter, const BTBEntry &newTakenEntry, Addr startAddr) +UBTB::replaceOldEntry(UBTBIter oldEntryIter, const BTBEntry &newTakenEntry, + Addr startAddr, uint8_t asidHash) { assert(newTakenEntry.valid); TickedUBTBEntry newEntry = TickedUBTBEntry(newTakenEntry, curTick()); @@ -192,7 +200,7 @@ UBTB::replaceOldEntry(UBTBIter oldEntryIter, const BTBEntry &newTakenEntry, Addr newEntry.target = newTakenEntry.target; newEntry.ctr = 0; // have a bug here:ubtb will accept ctr from mbtb, reset 
it to 0 at here // important: update tag (mbtb and ubtb have different tags, even diffferent tag length) - newEntry.tag = getTag(startAddr); + newEntry.tag = getTag(startAddr, asidHash); *oldEntryIter = newEntry; } @@ -213,13 +221,14 @@ UBTB::updateUsingS3Pred(FullBTBPrediction &s3Pred) auto startAddr = s3Pred.bbStart; UBTBIter oldEntryIter = lastPred.hit_entry; takenEntry.source = getComponentIdx(); - updateNewEntry(oldEntryIter, takenEntry, startAddr); + updateNewEntry(oldEntryIter, takenEntry, startAddr, s3Pred.asidHash); } -void UBTB::updateNewEntry(UBTBIter oldEntryIter, const BTBEntry &takenEntry, const Addr startAddr) +void UBTB::updateNewEntry(UBTBIter oldEntryIter, const BTBEntry &takenEntry, + const Addr startAddr, uint8_t asidHash) { //using the FB final taken branch to update uBTB if (oldEntryIter != ubtb.end()) { @@ -259,7 +268,7 @@ void UBTB::updateNewEntry(UBTBIter oldEntryIter, const BTBEntry &takenEntry, con } // Replace the entry with the new prediction - replaceOldEntry(toBeReplacedIter, takenEntry, startAddr); + replaceOldEntry(toBeReplacedIter, takenEntry, startAddr, asidHash); } else if (oldEntryIter != ubtb.end() && takenEntry.valid) { ubtbStats.s1Hits3Taken++; @@ -269,7 +278,7 @@ void UBTB::updateNewEntry(UBTBIter oldEntryIter, const BTBEntry &takenEntry, con updateUCtr(oldEntryIter->uctr, false); if (oldEntryIter->uctr == 0) { // replace the old entry with the new one - replaceOldEntry(oldEntryIter, takenEntry, startAddr); + replaceOldEntry(oldEntryIter, takenEntry, startAddr, asidHash); } } else { // S0 and S3 predict the same (brpc and target) @@ -294,13 +303,15 @@ UBTB::update(const FetchTarget &stream) // Use BTBEntry instead of BranchInfo; make it invalid when not taken BTBEntry takenEntry = stream.exeTaken ? 
BTBEntry(stream.exeBranchInfo) : BTBEntry(); auto startAddr = stream.getRealStartPC(); - Addr oldtag = getTag(startAddr); + Addr oldtag = getTag(startAddr, stream.asidHash); + Addr block_end = (startAddr + predictWidth) & ~mask(floorLog2(predictWidth) - 1); UBTBIter oldEntryIter = ubtb.end(); oldEntryIter = meta->hit_entry.valid ? - std::find_if(ubtb.begin(), ubtb.end(), [oldtag](const TickedUBTBEntry &e) { - return e.valid && e.tag == oldtag; + std::find_if(ubtb.begin(), ubtb.end(), [oldtag, startAddr, block_end](const TickedUBTBEntry &e) { + return e.valid && e.tag == oldtag && + e.pc >= startAddr && e.pc < block_end; }) : ubtb.end(); if (stream.exeTaken) { @@ -315,7 +326,7 @@ UBTB::update(const FetchTarget &stream) // Verify uBTB state assert(ubtb.size() <= numEntries); if (!usingS3Pred) { - updateNewEntry(oldEntryIter, takenEntry, startAddr); + updateNewEntry(oldEntryIter, takenEntry, startAddr, stream.asidHash); } } diff --git a/src/cpu/pred/btb/btb_ubtb.hh b/src/cpu/pred/btb/btb_ubtb.hh index 649641b420..4898cec009 100644 --- a/src/cpu/pred/btb/btb_ubtb.hh +++ b/src/cpu/pred/btb/btb_ubtb.hh @@ -218,8 +218,9 @@ class UBTB : public TimedBaseBTBPredictor * @param startPC The start address of the fetch block * @return Returns the tag bits. 
*/ - inline Addr getTag(Addr startPC) { - return (startPC >> 1) & tagMask; + inline Addr getTag(Addr startPC, uint8_t asidHash) { + Addr baseTag = (startPC >> 1) & tagMask; + return injectAsidHashIntoTag(baseTag, tagBits, asidHash); } void updateUCtr(unsigned &ctr, bool inc) { @@ -231,7 +232,7 @@ class UBTB : public TimedBaseBTBPredictor * @param startAddr The FB start address to look up * @return Iterator to the matching entry if found, or ubtb.end() if not found */ - UBTBIter lookup(Addr startAddr); + UBTBIter lookup(Addr startAddr, uint8_t asidHash); /** helper method called by putPCHistory: Check uBTB entry pc range and update statistics * @param entry The uBTB entry to check @@ -251,10 +252,12 @@ class UBTB : public TimedBaseBTBPredictor * @param oldEntry Iterator to the entry to replace * @param newPrediction The new prediction to store */ - void replaceOldEntry(UBTBIter oldEntryIter, const BTBEntry &newTakenEntry, Addr startAddr); + void replaceOldEntry(UBTBIter oldEntryIter, const BTBEntry &newTakenEntry, + Addr startAddr, uint8_t asidHash); //using the FB final taken branch to update uBTB - void updateNewEntry(UBTBIter oldEntryIter, const BTBEntry &takenEntry, const Addr startAddr); + void updateNewEntry(UBTBIter oldEntryIter, const BTBEntry &takenEntry, + const Addr startAddr, uint8_t asidHash); /** The uBTB structure: diff --git a/src/cpu/pred/btb/common.hh b/src/cpu/pred/btb/common.hh index b61e459ff6..e40dee3cf2 100644 --- a/src/cpu/pred/btb/common.hh +++ b/src/cpu/pred/btb/common.hh @@ -1,6 +1,7 @@ #ifndef __CPU_PRED_BTB_STREAM_STRUCT_HH__ #define __CPU_PRED_BTB_STREAM_STRUCT_HH__ +#include #include #include @@ -18,6 +19,49 @@ namespace branch_prediction { namespace btb_pred { +inline uint8_t +foldAsidHash16To4(uint16_t asid) +{ + return (asid & 0xf) ^ ((asid >> 4) & 0xf) ^ + ((asid >> 8) & 0xf) ^ ((asid >> 12) & 0xf); +} + +inline Addr +expandAsidHash(uint8_t asid_hash, unsigned bits) +{ + if (bits == 0) { + return 0; + } + + Addr expanded = 0; + for 
(unsigned shift = 0; shift < bits; shift += 4) { + expanded |= static_cast(asid_hash) << shift; + } + return expanded & mask(bits); +} + +inline Addr +injectAsidHashIntoTag(Addr base_tag, unsigned tag_bits, uint8_t asid_hash) +{ + if (tag_bits == 0) { + return 0; + } + + const unsigned hash_bits = std::min(4, tag_bits); + const Addr hash_mask = mask(hash_bits); + return (base_tag & ~hash_mask) | (static_cast(asid_hash) & hash_mask); +} + +inline Addr +xorAsidHashIntoIndex(Addr base_index, unsigned index_bits, uint8_t asid_hash) +{ + if (index_bits == 0) { + return 0; + } + + return (base_index ^ expandAsidHash(asid_hash, index_bits)) & mask(index_bits); +} + enum EndType { END_CALL=0, @@ -276,6 +320,7 @@ using IndirectTargets = std::vector>; struct FetchTarget { ThreadID tid; + uint8_t asidHash; Addr startPC; // start pc of the stream bool predTaken; // whether the FetchTarget has taken branch Addr predEndPC; // predicted stream end pc (fall through pc) @@ -324,6 +369,7 @@ struct FetchTarget FetchTarget() : tid(0), + asidHash(0), startPC(0), predTaken(false), predEndPC(0), @@ -453,6 +499,7 @@ struct FetchTarget struct FullBTBPrediction { ThreadID tid; + uint8_t asidHash; Addr bbStart; std::vector btbEntries; // for BTB, only assigned when hit, sorted by inst order // for conditional branch predictors, mapped with lowest bits of branches @@ -474,6 +521,7 @@ struct FullBTBPrediction FullBTBPrediction() : tid(0), + asidHash(0), bbStart(0), btbEntries(), condTakens(), diff --git a/src/cpu/pred/btb/decoupled_bpred.cc b/src/cpu/pred/btb/decoupled_bpred.cc index 01e9b78ac3..aec2222806 100644 --- a/src/cpu/pred/btb/decoupled_bpred.cc +++ b/src/cpu/pred/btb/decoupled_bpred.cc @@ -2,6 +2,7 @@ #include +#include "arch/riscv/regs/misc.hh" #include "base/debug_helper.hh" #include "base/output.hh" #include "cpu/o3/cpu.hh" @@ -22,6 +23,19 @@ namespace branch_prediction namespace btb_pred { +uint8_t +DecoupledBPUWithBTB::getThreadAsidHash(ThreadID tid) const +{ + if (!cpu) { + 
return 0; + } + + const RegVal satp = + cpu->readMiscRegNoEffect(RiscvISA::MiscRegIndex::MISCREG_SATP, tid); + const uint16_t asid = (satp >> 44) & mask(16); + return foldAsidHash16To4(asid); +} + void DecoupledBPUWithBTB::consumeFetchTarget(unsigned fetched_inst_num, ThreadID tid) { @@ -209,6 +223,7 @@ DecoupledBPUWithBTB::requestNewPrediction(ThreadID tid) { auto& thread = threads[tid]; auto& predsOfEachStage = threads[tid].predsOfEachStage; + const uint8_t asid_hash = getThreadAsidHash(tid); DPRINTF(Override, "Requesting new prediction for PC %#lx\n", thread.s0PC); @@ -216,6 +231,7 @@ DecoupledBPUWithBTB::requestNewPrediction(ThreadID tid) clearPreds(tid); for (int i = 0; i < numStages; i++) { predsOfEachStage[i].tid = tid; + predsOfEachStage[i].asidHash = asid_hash; predsOfEachStage[i].bbStart = thread.s0PC; predsOfEachStage[i].predSource = i; } @@ -781,6 +797,7 @@ DecoupledBPUWithBTB::createFetchTargetEntry(ThreadID tid) // Create a new fetch target entry FetchTarget entry; entry.tid = tid; + entry.asidHash = finalPred.asidHash; entry.startPC = s0PC; // Extract branch prediction information diff --git a/src/cpu/pred/btb/decoupled_bpred.hh b/src/cpu/pred/btb/decoupled_bpred.hh index 2552ce9e44..0a46c1a4e5 100644 --- a/src/cpu/pred/btb/decoupled_bpred.hh +++ b/src/cpu/pred/btb/decoupled_bpred.hh @@ -330,6 +330,7 @@ class DecoupledBPUWithBTB : public BPredUnit } void setCpu(CPU *_cpu) { cpu = _cpu; } + uint8_t getThreadAsidHash(ThreadID tid) const; void consumeFetchTarget(unsigned fetched_inst_num, ThreadID tid); diff --git a/src/cpu/pred/btb/mbtb.cc b/src/cpu/pred/btb/mbtb.cc index abd2923739..de1e764fce 100644 --- a/src/cpu/pred/btb/mbtb.cc +++ b/src/cpu/pred/btb/mbtb.cc @@ -299,8 +299,9 @@ MBTB::putPCHistory(Addr startAddr, std::vector &stagePreds) { meta = std::make_shared(); + const uint8_t asidHash = stagePreds.empty() ? 
0 : stagePreds.front().asidHash; // Lookup all matching entries in BTB - auto find_entries = lookup(startAddr, meta); + auto find_entries = lookup(startAddr, asidHash, meta); // Process BTB entries auto processed_entries = processEntries(find_entries, startAddr); @@ -335,7 +336,7 @@ MBTB::recoverHist(const boost::dynamic_bitset<> &history, const FetchTarget &ent * @return Vector of matching BTB entries */ std::vector -MBTB::lookupSingleBlock(Addr block_pc) +MBTB::lookupSingleBlock(Addr block_pc, uint8_t asidHash) { std::vector res; if (block_pc & 0x1) { @@ -346,11 +347,11 @@ MBTB::lookupSingleBlock(Addr block_pc) auto& target_sram = (sram_id == 0) ? sram0 : sram1; auto& target_mru = (sram_id == 0) ? mru0 : mru1; - Addr btb_idx = getIndex(block_pc); + Addr btb_idx = getIndex(block_pc, asidHash); auto& btb_set = target_sram[btb_idx]; assert(btb_idx < numSets); - Addr current_tag = getTag(block_pc); + Addr current_tag = getTag(block_pc, asidHash); DPRINTF(BTB, "BTB: Doing tag comparison for SRAM%d index 0x%lx tag %#lx\n", sram_id, btb_idx, current_tag); @@ -365,7 +366,7 @@ MBTB::lookupSingleBlock(Addr block_pc) } std::vector -MBTB::lookup(Addr block_pc, std::shared_ptr meta) +MBTB::lookup(Addr block_pc, uint8_t asidHash, std::shared_ptr meta) { std::vector res; if (block_pc & 0x1) { @@ -376,15 +377,15 @@ MBTB::lookup(Addr block_pc, std::shared_ptr meta) // Calculate 32B aligned address Addr alignedPC = block_pc & ~(blockSize - 1); // Lookup first 32B block - res = lookupSingleBlock(alignedPC); + res = lookupSingleBlock(alignedPC, asidHash); // Lookup next 32B block - auto nextBlockRes = lookupSingleBlock(alignedPC + blockSize); + auto nextBlockRes = lookupSingleBlock(alignedPC + blockSize, asidHash); // Merge results res.insert(res.end(), nextBlockRes.begin(), nextBlockRes.end()); // lookup victim cache if victim cache is enabled if (victimCacheSize > 0) { - auto victimResults = lookupVictimCache(block_pc); + auto victimResults = lookupVictimCache(block_pc, asidHash); 
if (!victimResults.empty()) { DPRINTF(BTB, "Victim cache hit for lookup at %#lx\n", block_pc); btbStats.victimCacheHit++; @@ -460,7 +461,7 @@ MBTB::getAndSetNewBTBEntry(FetchTarget &stream) } // Set tag and update stream metadata for use in update() - entry_to_write.tag = getTag(entry_to_write.pc); + entry_to_write.tag = getTag(entry_to_write.pc, stream.asidHash); stream.updateNewBTBEntry = entry_to_write; stream.updateIsOldEntry = is_old_entry; } @@ -508,7 +509,7 @@ MBTB::updateBTBEntry(const BTBEntry& entry, const FetchTarget &stream) auto& target_mru = (sram_id == 0) ? mru0 : mru1; // Calculate index and tag for this entry - Addr btb_idx = getIndex(entry.pc); + Addr btb_idx = getIndex(entry.pc, stream.asidHash); // Look for matching entry in the target SRAM bool found = false; @@ -564,7 +565,7 @@ MBTB::buildUpdatedEntry(const BTBEntry& req_entry, ? BTBEntry(*existing_entry) : req_entry; // Always recalculate tag based on the actual PC being written - entry_to_write.tag = getTag(entry_to_write.pc); + entry_to_write.tag = getTag(entry_to_write.pc, stream.asidHash); entry_to_write.resolved = false; // reset resolved status // Update saturating counter and alwaysTaken @@ -723,7 +724,7 @@ MBTB::prepareUpdateEntries(const FetchTarget &stream) { * Victim cache operations implementation */ std::vector -MBTB::lookupVictimCache(Addr block_pc) +MBTB::lookupVictimCache(Addr block_pc, uint8_t asidHash) { std::vector results; Addr alignedPC = block_pc & ~(blockSize - 1); @@ -735,7 +736,7 @@ MBTB::lookupVictimCache(Addr block_pc) Addr entryAlignedPC = entry.pc & ~(blockSize - 1); // Check if this entry is in either of the two 32B blocks we're looking for if (entryAlignedPC == alignedPC || entryAlignedPC == (alignedPC + blockSize)) { - Addr current_tag = getTag(entry.pc); + Addr current_tag = getTag(entry.pc, asidHash); if (entry.tag == current_tag) { results.push_back(entry); DPRINTF(BTB, "Victim cache hit for pc %#lx\n", entry.pc); diff --git a/src/cpu/pred/btb/mbtb.hh 
b/src/cpu/pred/btb/mbtb.hh index b4f587a141..3b2ec76fe4 100644 --- a/src/cpu/pred/btb/mbtb.hh +++ b/src/cpu/pred/btb/mbtb.hh @@ -215,8 +215,9 @@ class MBTB : public TimedBaseBTBPredictor * @param inst_PC The branch to look up. * @return Returns the index into the BTB. */ - inline Addr getIndex(Addr instPC) { - return (instPC >> idxShiftAmt) & idxMask; + inline Addr getIndex(Addr instPC, uint8_t asidHash) { + Addr baseIndex = (instPC >> idxShiftAmt) & idxMask; + return xorAsidHashIntoIndex(baseIndex, floorLog2(numSets), asidHash); } /** Returns the tag bits of a given address. @@ -225,8 +226,9 @@ class MBTB : public TimedBaseBTBPredictor * @param inst_PC The branch's address. * @return Returns the tag bits. */ - inline Addr getTag(Addr instPC) { - return (instPC >> tagShiftAmt) & tagMask; + inline Addr getTag(Addr instPC, uint8_t asidHash) { + Addr baseTag = (instPC >> tagShiftAmt) & tagMask; + return injectAsidHashIntoTag(baseTag, tagBits, asidHash); } /** Update the 2-bit saturating counter for conditional branches @@ -340,16 +342,16 @@ class MBTB : public TimedBaseBTBPredictor * @param inst_PC The address of the block to look up. * @return Returns all hit BTB entries. 
*/ - std::vector lookup(Addr block_pc, std::shared_ptr meta); + std::vector lookup(Addr block_pc, uint8_t asidHash, std::shared_ptr meta); /** Helper function to lookup entries in a single block * @param block_pc The aligned PC to lookup * @return Vector of matching BTB entries */ - std::vector lookupSingleBlock(Addr block_pc); + std::vector lookupSingleBlock(Addr block_pc, uint8_t asidHash); /** Victim cache operations */ - std::vector lookupVictimCache(Addr block_pc); + std::vector lookupVictimCache(Addr block_pc, uint8_t asidHash); void insertVictimCache(const TickedBTBEntry& evicted_entry); bool eraseFromVictimCacheByPC(Addr pc); diff --git a/src/cpu/pred/btb/microtage.cc b/src/cpu/pred/btb/microtage.cc index 7fd88b0845..d4cb7b4533 100644 --- a/src/cpu/pred/btb/microtage.cc +++ b/src/cpu/pred/btb/microtage.cc @@ -212,7 +212,8 @@ MicroTAGE::TagePrediction MicroTAGE::generateSinglePrediction(const BTBEntry &btb_entry, const Addr &startPC, std::shared_ptr predMeta, - ThreadID tid) { + ThreadID tid, + uint8_t asidHash) { DPRINTF(UTAGE, "generateSinglePrediction for btbEntry: %#lx\n", btb_entry.pc); const auto &state = historyState(tid); @@ -227,12 +228,13 @@ MicroTAGE::generateSinglePrediction(const BTBEntry &btb_entry, // Calculate index and tag: use snapshot if provided, otherwise use current folded history // Tag includes position XOR (like RTL: tag = tempTag ^ cfiPosition) Addr index = predMeta ? getTageIndex(startPC, i, - predMeta->indexFoldedHist[i].get()) - : getTageIndex(startPC, i, state.indexFoldedHist[i].get()); + predMeta->indexFoldedHist[i].get(), asidHash) + : getTageIndex(startPC, i, state.indexFoldedHist[i].get(), asidHash); Addr tag = predMeta ? 
getTageTag(startPC, i, - predMeta->tagFoldedHist[i].get(),predMeta->altTagFoldedHist[i].get(), position) + predMeta->tagFoldedHist[i].get(),predMeta->altTagFoldedHist[i].get(), + position, asidHash) : getTageTag(startPC, i, state.tagFoldedHist[i].get(), - state.altTagFoldedHist[i].get(), position); + state.altTagFoldedHist[i].get(), position, asidHash); bool match = false; // for each table, only one way can be matched TageEntry matching_entry; @@ -289,7 +291,7 @@ MicroTAGE::generateSinglePrediction(const BTBEntry &btb_entry, */ void MicroTAGE::lookupHelper(const Addr &startPC, const std::vector &btbEntries, - CondTakens& results, ThreadID tid) + CondTakens& results, ThreadID tid, uint8_t asidHash) { DPRINTF(UTAGE, "lookupHelper startAddr: %#lx\n", startPC); @@ -298,7 +300,7 @@ MicroTAGE::lookupHelper(const Addr &startPC, const std::vector &btbEnt // Only predict for valid conditional branches if (btb_entry.isCond && btb_entry.valid) { auto pred = generateSinglePrediction(btb_entry, startPC, nullptr, - tid); + tid, asidHash); threadMeta[tid]->preds[btb_entry.pc] = pred; tageStats.updateStatsWithTagePrediction(pred, true); results.push_back({btb_entry.pc, pred.taken || btb_entry.alwaysTaken}); @@ -331,6 +333,7 @@ MicroTAGE::dryRunCycle(Addr startPC) { void MicroTAGE::putPCHistory(Addr startPC, const bitset &history, std::vector &stagePreds) { const ThreadID tid = predictorTid(stagePreds); + const uint8_t asidHash = stagePreds.empty() ? 
0 : stagePreds.front().asidHash; const auto &state = historyState(tid); // Record prediction bank for next tick's conflict detection lastPredBankId = getBankId(startPC); @@ -368,7 +371,7 @@ MicroTAGE::putPCHistory(Addr startPC, const bitset &history, std::vector meta, + uint8_t asidHash, uint64_t &allocated_table, uint64_t &allocated_index, uint64_t &allocated_way) { @@ -549,9 +553,10 @@ MicroTAGE::handleNewEntryAllocation(const Addr &startPC, for (unsigned ti = start_table; ti < numPredictors; ++ti) { Addr newIndex = getTageIndex(startPC, ti, - meta->indexFoldedHist[ti].get()); + meta->indexFoldedHist[ti].get(), asidHash); Addr newTag = getTageTag(startPC, ti, - meta->tagFoldedHist[ti].get(), meta->altTagFoldedHist[ti].get(), position); + meta->tagFoldedHist[ti].get(), meta->altTagFoldedHist[ti].get(), + position, asidHash); auto &set = tageTable[ti][newIndex]; @@ -679,7 +684,8 @@ MicroTAGE::update(const FetchTarget &stream) { TagePrediction recomputed; if (updateOnRead) { // if update on read is enabled, re-read providers using snapshot // Re-read providers using snapshot (do not rely on prediction-time main/alt) - recomputed = generateSinglePrediction(btb_entry, startAddr, predMeta); + recomputed = generateSinglePrediction(btb_entry, startAddr, predMeta, + stream.tid, stream.asidHash); } else { // otherwise, use the prediction from the prediction-time main/alt auto pred_it = predMeta->preds.find(btb_entry.pc); if (pred_it != predMeta->preds.end()) { @@ -687,7 +693,8 @@ MicroTAGE::update(const FetchTarget &stream) { } else { DPRINTF(UTAGE, "update: missing predMeta entry for pc %#lx, recompute with snapshot\n", btb_entry.pc); - recomputed = generateSinglePrediction(btb_entry, startAddr, predMeta); + recomputed = generateSinglePrediction(btb_entry, startAddr, predMeta, + stream.tid, stream.asidHash); } } if (recomputed.mainprovided) { @@ -710,7 +717,8 @@ MicroTAGE::update(const FetchTarget &stream) { start_table = main_info.table + 1; // start from the table after 
the main prediction table } alloc_success = handleNewEntryAllocation(startAddr, btb_entry, actual_taken, - start_table, predMeta, allocated_table, allocated_index, allocated_way); + start_table, predMeta, stream.asidHash, + allocated_table, allocated_index, allocated_way); } #ifndef UNIT_TEST @@ -792,7 +800,8 @@ MicroTAGE::updateCounter(bool taken, unsigned width, short &counter) { // Calculate TAGE tag with folded history - optimized version using bitwise operations Addr -MicroTAGE::getTageTag(Addr pc, int t, uint64_t foldedHist, uint64_t altFoldedHist, Addr position) +MicroTAGE::getTageTag(Addr pc, int t, uint64_t foldedHist, uint64_t altFoldedHist, + Addr position, uint8_t asidHash) { // Create mask for tableTagBits[t] to limit result size Addr mask = (1ULL << tableTagBits[t]) - 1; @@ -807,11 +816,12 @@ MicroTAGE::getTageTag(Addr pc, int t, uint64_t foldedHist, uint64_t altFoldedHis Addr altTagBits = (altFoldedHist << 1) & mask; // XOR all components together, including position (like RTL) - return pcBits ^ foldedBits ^ position ^ altTagBits; + return injectAsidHashIntoTag(pcBits ^ foldedBits ^ position ^ altTagBits, + tableTagBits[t], asidHash); } Addr -MicroTAGE::getTageIndex(Addr pc, int t, uint64_t foldedHist) +MicroTAGE::getTageIndex(Addr pc, int t, uint64_t foldedHist, uint8_t asidHash) { // Create mask for tableIndexBits[t] to limit result size Addr mask = (1ULL << tableIndexBits[t]) - 1; @@ -820,13 +830,13 @@ MicroTAGE::getTageIndex(Addr pc, int t, uint64_t foldedHist) Addr pcBits = (pc >> pcShift) & mask; Addr foldedBits = foldedHist & mask; - return pcBits ^ foldedBits; + return xorAsidHashIntoIndex(pcBits ^ foldedBits, tableIndexBits[t], asidHash); } Addr -MicroTAGE::getTageIndex(Addr pc, int t) +MicroTAGE::getTageIndex(Addr pc, int t, uint8_t asidHash) { - return getTageIndex(pc, t, historyState(0).indexFoldedHist[t].get()); + return getTageIndex(pc, t, historyState(0).indexFoldedHist[t].get(), asidHash); } bool diff --git 
a/src/cpu/pred/btb/microtage.hh b/src/cpu/pred/btb/microtage.hh index 3a5fcc518c..51dc756746 100644 --- a/src/cpu/pred/btb/microtage.hh +++ b/src/cpu/pred/btb/microtage.hh @@ -168,17 +168,18 @@ class MicroTAGE : public TimedBaseBTBPredictor // Look up predictions in TAGE tables for a stream of instructions void lookupHelper(const Addr &startPC, const std::vector &btbEntries, - CondTakens& results, ThreadID tid); + CondTakens& results, ThreadID tid, uint8_t asidHash); // Calculate TAGE index for a given PC and table - Addr getTageIndex(Addr pc, int table); + Addr getTageIndex(Addr pc, int table, uint8_t asidHash = 0); // Calculate TAGE index with folded history (uint64_t version for performance) - Addr getTageIndex(Addr pc, int table, uint64_t foldedHist); + Addr getTageIndex(Addr pc, int table, uint64_t foldedHist, uint8_t asidHash = 0); // Calculate TAGE tag with folded history (uint64_t version for performance) // position: branch position within the block (xored into tag like RTL) - Addr getTageTag(Addr pc, int table, uint64_t foldedHist, uint64_t altFoldedHist, Addr position = 0); + Addr getTageTag(Addr pc, int table, uint64_t foldedHist, uint64_t altFoldedHist, + Addr position = 0, uint8_t asidHash = 0); // Get branch index within a prediction block unsigned getBranchIndexInBlock(Addr branchPC, Addr startPC); @@ -355,7 +356,8 @@ private: TagePrediction generateSinglePrediction(const BTBEntry &btb_entry, const Addr &startPC, const std::shared_ptr predMeta = nullptr, - ThreadID tid = 0); + ThreadID tid = 0, + uint8_t asidHash = 0); // Helper method to prepare BTB entries for update std::vector prepareUpdateEntries(const FetchTarget &stream); @@ -372,6 +374,7 @@ private: bool actual_taken, unsigned main_table, std::shared_ptr meta, + uint8_t asidHash, uint64_t &allocated_table, uint64_t &allocated_index, uint64_t &allocated_way); From c79fec16c5483d6a59d7a3c23c4f33f72a80f449 Mon Sep 17 00:00:00 2001 From: tastynoob <934348725@qq.com> Date: Thu, 9 Apr 2026 
11:50:38 +0800 Subject: [PATCH 14/38] cpu-o3: fix smt thread-local inst stop threshold Use per-thread committed instruction counts for O3 warmup and stat-dump stop checks in SMT mode, instead of summing instructions across threads. Change-Id: I6ecd5f96a18ce9aa96d0712a9e05f3d8dedcbac4 --- src/cpu/o3/cpu.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/cpu/o3/cpu.cc b/src/cpu/o3/cpu.cc index 5961aed7b1..9433193c06 100644 --- a/src/cpu/o3/cpu.cc +++ b/src/cpu/o3/cpu.cc @@ -1377,10 +1377,10 @@ CPU::instDone(ThreadID tid, const DynInstPtr &inst) cpi_r.roll(1); } - uint64_t committedInsts = totalInsts(); + const uint64_t committedThreadInsts = thread[tid]->numInst; if (this->nextDumpInstCount && !dump_done - && committedInsts >= this->nextDumpInstCount) { + && committedThreadInsts >= this->nextDumpInstCount) { fprintf(stderr, "Will trigger stat dump and reset\n"); statistics::schedStatEvent(true, true, curTick(), 0); scheduleInstStop(tid,0,"Will trigger stat dump and reset"); @@ -1394,7 +1394,8 @@ CPU::instDone(ThreadID tid, const DynInstPtr &inst) // Check for instruction-count-based events. 
thread[tid]->comInstEventQueue.serviceEvents(thread[tid]->numInst); - if (this->warmupInstCount && !warmup_done && committedInsts >= this->warmupInstCount) { + if (this->warmupInstCount && !warmup_done && + committedThreadInsts >= this->warmupInstCount) { fprintf(stderr, "Will trigger stat dump and reset\n"); statistics::schedStatEvent(true, true, curTick(), 0); scheduleInstStop(tid,0,"Will trigger stat dump and reset"); From 1105968e1019cae1bcfe54d6b9d3dffc9f095b8a Mon Sep 17 00:00:00 2001 From: mhnGitHubz <66786667+mhnGitHubz@users.noreply.github.com> Date: Thu, 9 Apr 2026 11:51:18 +0800 Subject: [PATCH 15/38] cpu-o3: fix Decoder scheduler,thread 1 count is incorrect (#816) Co-authored-by: mo haonan --- src/cpu/o3/decode.cc | 26 ++++++++++++++++++++++++++ src/cpu/o3/decode.hh | 4 ++++ src/cpu/o3/fetch.cc | 30 +++++++++++++++++++++++++++++- src/cpu/o3/fetch.hh | 8 ++++++++ src/cpu/o3/iew.cc | 7 ++++--- 5 files changed, 71 insertions(+), 4 deletions(-) diff --git a/src/cpu/o3/decode.cc b/src/cpu/o3/decode.cc index 93ede3d673..0d36e05a85 100644 --- a/src/cpu/o3/decode.cc +++ b/src/cpu/o3/decode.cc @@ -138,8 +138,14 @@ Decode::DecodeStats::DecodeStats(CPU *cpu) : statistics::Group(cpu, "decode"), ADD_STAT(idleCycles, statistics::units::Cycle::get(), "Number of cycles decode is idle"), + ADD_STAT(smtidleCycles, statistics::units::Cycle::get(), + "Number of cycles fetch was idle per tid"), ADD_STAT(blockedCycles, statistics::units::Cycle::get(), "Number of cycles decode is blocked"), + ADD_STAT(smtblockedCycles, statistics::units::Cycle::get(), + "Number of cycles fetch has spent blocked per tid"), + ADD_STAT(smtnotactiveCycles, statistics::units::Cycle::get(), + "Number of cycles fetch no active per tid"), ADD_STAT(runCycles, statistics::units::Cycle::get(), "Number of cycles decode is running"), ADD_STAT(unblockCycles, statistics::units::Cycle::get(), @@ -179,6 +185,16 @@ Decode::DecodeStats::DecodeStats(CPU *cpu) mispredictedByPC.flags(statistics::total); 
mispredictedByNPC.flags(statistics::total); fusedInsts.init(128).flags(statistics::nozero); + + smtidleCycles + .init(4) + .flags(statistics::total); + smtblockedCycles + .init(4) + .flags(statistics::total); + smtnotactiveCycles + .init(4) + .flags(statistics::total); } void @@ -488,6 +504,15 @@ Decode::tick() bool block = stallSig->blockDecode[i]; bool active = !block && !fixedbuffer[i].empty(); + if(block){ + ++stats.smtblockedCycles[i]; + } + + if(!active) + { + ++stats.smtnotactiveCycles[i]; + } + stallSig->blockFetch[i] = block || fifoBackpressured; stallSig->fetchBlockReason[i] = stallSig->blockFetch[i] ? @@ -583,6 +608,7 @@ Decode::decodeInsts(ThreadID tid) " early.\n",tid); // Should I change the status to idle? ++stats.idleCycles; + ++stats.smtidleCycles[tid]; StallReason stall = StallReason::NoStall; for (auto iter : fromFetch->fetchStallReason) { diff --git a/src/cpu/o3/decode.hh b/src/cpu/o3/decode.hh index c548fad3c7..f2e39b56a6 100644 --- a/src/cpu/o3/decode.hh +++ b/src/cpu/o3/decode.hh @@ -259,8 +259,12 @@ class Decode /** Stat for total number of idle cycles. */ statistics::Scalar idleCycles; + + statistics::Vector smtidleCycles; /** Stat for total number of blocked cycles. */ statistics::Scalar blockedCycles; + statistics::Vector smtblockedCycles; + statistics::Vector smtnotactiveCycles; /** Stat for total number of normal running cycles. */ statistics::Scalar runCycles; /** Stat for total number of unblocking cycles. 
*/ diff --git a/src/cpu/o3/fetch.cc b/src/cpu/o3/fetch.cc index f95738bd2c..ff31aa9bb9 100644 --- a/src/cpu/o3/fetch.cc +++ b/src/cpu/o3/fetch.cc @@ -204,8 +204,12 @@ Fetch::FetchStatGroup::FetchStatGroup(CPU *cpu, Fetch *fetch) "Number of cycles fetch has spent waiting for tlb"), ADD_STAT(idleCycles, statistics::units::Cycle::get(), "Number of cycles fetch was idle"), + ADD_STAT(smtidleCycles, statistics::units::Cycle::get(), + "Number of cycles fetch was idle per tid"), ADD_STAT(blockedCycles, statistics::units::Cycle::get(), "Number of cycles fetch has spent blocked"), + ADD_STAT(smtblockedCycles, statistics::units::Cycle::get(), + "Number of cycles fetch has spent blocked per tid"), ADD_STAT(miscStallCycles, statistics::units::Cycle::get(), "Number of cycles fetch has spent waiting on interrupts, or bad " "addresses, or out of MSHRs"), @@ -241,6 +245,10 @@ Fetch::FetchStatGroup::FetchStatGroup(CPU *cpu, Fetch *fetch) "Distribution of fetch status"), ADD_STAT(decodeStalls, statistics::units::Count::get(), "Number of decode stalls"), + ADD_STAT(smtdecodeStalls, statistics::units::Count::get(), + "Number of decode stalls per tid"), + ADD_STAT(smtftqempty, statistics::units::Count::get(), + "Number of ftq empty per tid"), ADD_STAT(decodeStallRate, statistics::units::Rate< statistics::units::Count, statistics::units::Cycle>::get(), "Number of decode stalls per cycle", @@ -336,6 +344,18 @@ Fetch::FetchStatGroup::FetchStatGroup(CPU *cpu, Fetch *fetch) } decodeStalls .prereq(decodeStalls); + smtdecodeStalls + .init(fetch->numThreads) + .flags(statistics::total); + smtftqempty + .init(fetch->numThreads) + .flags(statistics::total); + smtidleCycles + .init(fetch->numThreads) + .flags(statistics::total); + smtblockedCycles + .init(fetch->numThreads) + .flags(statistics::total); decodeStallRate .flags(statistics::total); fetchBubbles @@ -1401,9 +1421,12 @@ Fetch::sendInstructionsToDecode() for (int i = 0; i < numThreads; i++) { if (!stallSig->blockFetch[i]) { 
any_thread_active = true; - break; + //break; + }else{ + fetchStats.smtdecodeStalls[i]++; } } + if (!any_thread_active) { // All threads are blocked, no instructions to send ThreadID blocked_tid = InvalidThreadID; @@ -1427,6 +1450,7 @@ Fetch::sendInstructionsToDecode() } ThreadID tid =selectUnstalledThread(); + DPRINTF(Fetch, "select Unstalled [tid:%i]\n",tid); // fetch totally stalled if (stallSig->blockFetch[tid]) { @@ -1512,6 +1536,7 @@ Fetch::measureFrontendBubbles(unsigned insts_to_decode, ThreadID tid) if (stallSig->blockFetch[tid]) { fetchStats.decodeStalls++; + //fetchStats.smtdecodeStalls[tid]++; } } @@ -1849,6 +1874,7 @@ Fetch::prepareFetchAddress(ThreadID tid, bool &status_change) } else { if (fetchStatus[tid] == Idle) { ++fetchStats.idleCycles; + ++fetchStats.smtidleCycles[tid]; DPRINTF(Fetch, "[tid:%i] Fetch is idle!\n", tid); } // Status is Idle, so fetch should do nothing. @@ -2111,6 +2137,7 @@ Fetch::sendNextCacheRequest(ThreadID tid, const PCStateBase &pc_state) { } if (ftqEmpty(tid)) { + ++fetchStats.smtftqempty[tid]; DPRINTF(Fetch, "[tid:%i] No FSQ entry available for next fetch\n", tid); return; } @@ -2183,6 +2210,7 @@ Fetch::profileStall(ThreadID tid) DPRINTF(Fetch, "Fetch has no active thread!\n"); } else if (fetchStatus[tid] == Blocked) { ++fetchStats.blockedCycles; + ++fetchStats.smtblockedCycles[tid]; DPRINTF(Fetch, "[tid:%i] Fetch is blocked!\n", tid); } else if (fetchStatus[tid] == Squashing) { ++fetchStats.squashCycles; diff --git a/src/cpu/o3/fetch.hh b/src/cpu/o3/fetch.hh index 0061b87912..18e6159022 100644 --- a/src/cpu/o3/fetch.hh +++ b/src/cpu/o3/fetch.hh @@ -1054,8 +1054,12 @@ class Fetch * the pipeline. */ statistics::Scalar idleCycles; + + statistics::Vector smtidleCycles; /** Total number of cycles spent blocked. */ statistics::Scalar blockedCycles; + + statistics::Vector smtblockedCycles; /** Total number of cycles spent in any other state. 
*/ statistics::Scalar miscStallCycles; /** Total number of cycles spent in waiting for drains. */ @@ -1091,6 +1095,10 @@ class Fetch statistics::Vector fetchStatusDist; /** Number of decode stalls */ statistics::Scalar decodeStalls; + + statistics::Vector smtdecodeStalls; + + statistics::Vector smtftqempty; /** Number of decode stalls per cycle */ statistics::Formula decodeStallRate; /** Unutilized issue-pipeline slots while there is no backend-stall */ diff --git a/src/cpu/o3/iew.cc b/src/cpu/o3/iew.cc index 04c2b893ca..a9a1a14565 100644 --- a/src/cpu/o3/iew.cc +++ b/src/cpu/o3/iew.cc @@ -1547,6 +1547,9 @@ IEW::executeInsts() while (threads != end) { ThreadID tid = *threads++; fetchRedirect[tid] = false; + toFetch->iewInfo[tid].ldstqCount=ldstQueue.getCount(tid); + toFetch->iewInfo[tid].robCount= rob->getThreadEntries(tid); + toFetch->iewInfo[tid].iqCount= scheduler->getIQInsts(tid); } // Uncomment this if you want to see all available instructions. @@ -1557,9 +1560,7 @@ IEW::executeInsts() ThreadID tid = *activeThreads->begin(); toFetch->iewInfo[tid].resolvedCFIs.clear(); - toFetch->iewInfo[tid].ldstqCount=ldstQueue.getCount(tid); - toFetch->iewInfo[tid].robCount= rob->getThreadEntries(tid); - toFetch->iewInfo[tid].iqCount= scheduler->getIQInsts(tid); + // Execute/writeback any instructions that are available. 
int insts_to_execute = fromIssue->size; fromIssue->size = 0; From 325d970614beb9434d6648fdf776c39cb412e33c Mon Sep 17 00:00:00 2001 From: tastynoob <934348725@qq.com> Date: Thu, 9 Apr 2026 19:18:03 +0800 Subject: [PATCH 16/38] cpu: add shared lsq and ftq modes for smt Change-Id: Idea57378fbe47dedd654f168141d0284faee716f --- configs/example/smt_idealkmhv3.py | 47 +++++ src/cpu/o3/BaseO3CPU.py | 7 +- src/cpu/o3/SConscript | 3 +- src/cpu/o3/lsq.cc | 276 +++++++++++++++++++++++----- src/cpu/o3/lsq.hh | 67 +++---- src/cpu/o3/lsq_unit.cc | 10 +- src/cpu/o3/lsq_unit.hh | 10 +- src/cpu/pred/BranchPredictor.py | 11 ++ src/cpu/pred/SConscript | 2 +- src/cpu/pred/btb/decoupled_bpred.cc | 94 +++++++++- src/cpu/pred/btb/decoupled_bpred.hh | 22 ++- 11 files changed, 454 insertions(+), 95 deletions(-) create mode 100644 configs/example/smt_idealkmhv3.py diff --git a/configs/example/smt_idealkmhv3.py b/configs/example/smt_idealkmhv3.py new file mode 100644 index 0000000000..a83681506f --- /dev/null +++ b/configs/example/smt_idealkmhv3.py @@ -0,0 +1,47 @@ +from m5.objects import Root + +from m5.util import addToPath + +addToPath('../') + +from common import Simulation +from common.xiangshan import build_xiangshan_system, xiangshan_system_init +from idealkmhv3 import setKmhV3IdealParams + + +def setSharedLSQParams(args, system): + setKmhV3IdealParams(args, system) + + for cpu in system.cpu: + # Reuse the ideal KMHV3 LSQ-related sizes, but interpret them as a + # shared SMT-wide pool. For example, LQEntries=128 means both threads + # compete for a total of 128 load entries instead of 128 each. The + # same shared-mode accounting applies to SQ/RARQ/RAWQ. Likewise, + # branchPred.ftq_size is interpreted as a shared SMT-wide FTQ pool. + # Keep FTQ partitioned by default so one thread cannot monopolize the + # shared target queue and starve the other thread's frontend. 
+ cpu.smtLSQMode = 'Shared' + cpu.smtLSQPolicy = 'Dynamic' + cpu.branchPred.smtFTQMode = 'Shared' + cpu.branchPred.smtFTQPolicy = 'Partitioned' + + +if __name__ == '__m5_main__': + FutureClass = None + + args = xiangshan_system_init() + + assert not args.external_memory_system + + args.smt = True + args.bp_type = 'DecoupledBPUWithBTB' + args.l2_size = '2MB' + + Simulation.setMemClass(args) + + test_sys = build_xiangshan_system(args) + setSharedLSQParams(args, test_sys) + + root = Root(full_system=True, system=test_sys) + + Simulation.run_vanilla(args, root, test_sys, FutureClass) diff --git a/src/cpu/o3/BaseO3CPU.py b/src/cpu/o3/BaseO3CPU.py index f6f46d85b8..b1f6979368 100644 --- a/src/cpu/o3/BaseO3CPU.py +++ b/src/cpu/o3/BaseO3CPU.py @@ -53,6 +53,9 @@ class SMTFetchPolicy(ScopedEnum): class SMTQueuePolicy(ScopedEnum): vals = [ 'Dynamic', 'Partitioned', 'Threshold' ] +class SMTLSQMode(ScopedEnum): + vals = [ 'Independent', 'Shared' ] + class CommitPolicy(ScopedEnum): vals = [ 'RoundRobin', 'OldestReady' ] @@ -233,8 +236,10 @@ def support_take_over(cls): smtNumFetchingThreads = Param.Unsigned(1, "SMT Number of Fetching Threads") smtFetchPolicy = Param.SMTFetchPolicy('RoundRobin', "SMT Fetch policy") + smtLSQMode = Param.SMTLSQMode('Independent', + "SMT LSQ mode: per-thread independent or shared quota") smtLSQPolicy = Param.SMTQueuePolicy('Partitioned', - "SMT LSQ Sharing Policy") + "SMT shared LSQ allocation policy") smtLSQThreshold = Param.Int(100, "SMT LSQ Threshold Sharing Parameter") smtIQPolicy = Param.SMTQueuePolicy('Partitioned', "SMT IQ Sharing Policy") diff --git a/src/cpu/o3/SConscript b/src/cpu/o3/SConscript index 463a8cdfc0..3c2902a6b4 100755 --- a/src/cpu/o3/SConscript +++ b/src/cpu/o3/SConscript @@ -35,7 +35,8 @@ if env['CONF']['TARGET_ISA'] != 'null': 'IssuePort', 'IssueQue', 'BaseSelector', 'PAgeSelector', 'SMTBasedSelector', 'Scheduler']) SimObject('FuncUnitConfig.py', sim_objects=[]) SimObject('BaseO3CPU.py', sim_objects=['BaseO3CPU'], enums=[ - 
'SMTFetchPolicy', 'SMTQueuePolicy', 'CommitPolicy', 'ROBWalkPolicy', 'ROBCompressPolicy', 'PerfRecord']) + 'SMTFetchPolicy', 'SMTQueuePolicy', 'SMTLSQMode', 'CommitPolicy', + 'ROBWalkPolicy', 'ROBCompressPolicy', 'PerfRecord']) Source('commit.cc') Source('cpu.cc') diff --git a/src/cpu/o3/lsq.cc b/src/cpu/o3/lsq.cc index 4fe227f6ac..a341c1eaa0 100644 --- a/src/cpu/o3/lsq.cc +++ b/src/cpu/o3/lsq.cc @@ -484,15 +484,15 @@ LSQ::LSQ(CPU *cpu_ptr, IEW *iew_ptr, const BaseO3CPUParams ¶ms) _storeWbStage(params.StoreWbStage), waitingForStaleTranslation(false), staleTranslationWaitTxnId(0), + lsqMode(params.smtLSQMode), lsqPolicy(params.smtLSQPolicy), + smtLSQThreshold(params.smtLSQThreshold), stats(nullptr), LQEntries(params.LQEntries), SQEntries(params.SQEntries), enqueueWidth(params.renameWidth), - maxLQEntries(maxLSQAllocation(lsqPolicy, LQEntries, params.numThreads, - params.smtLSQThreshold)), - maxSQEntries(maxLSQAllocation(lsqPolicy, SQEntries, params.numThreads, - params.smtLSQThreshold)), + RARQEntries(params.RARQEntries), + RAWQEntries(params.RAWQEntries), dcachePort(this, cpu_ptr), numThreads(params.numThreads) { @@ -518,30 +518,37 @@ LSQ::LSQ(CPU *cpu_ptr, IEW *iew_ptr, const BaseO3CPUParams ¶ms) //************ Handle SMT Parameters *********** //********************************************** - /* Run SMT olicy checks. 
*/ + if (lsqMode == SMTLSQMode::Independent) { + DPRINTF(LSQ, "LSQ mode set to Independent: each thread gets up to " + "%u LQ, %u SQ, %u RARQ and %u RAWQ entries\n", + LQEntries, SQEntries, RARQEntries, RAWQEntries); + } else if (lsqMode == SMTLSQMode::Shared) { + panic_if(lsqPolicy == SMTQueuePolicy::Threshold && + smtLSQThreshold == 0, + "SMT LSQ threshold must be non-zero in shared threshold mode"); + if (lsqPolicy == SMTQueuePolicy::Dynamic) { - DPRINTF(LSQ, "LSQ sharing policy set to Dynamic\n"); - } else if (lsqPolicy == SMTQueuePolicy::Partitioned) { - DPRINTF(Fetch, "LSQ sharing policy set to Partitioned: " - "%i entries per LQ | %i entries per SQ\n", - maxLQEntries,maxSQEntries); - } else if (lsqPolicy == SMTQueuePolicy::Threshold) { - - assert(params.smtLSQThreshold > params.LQEntries); - assert(params.smtLSQThreshold > params.SQEntries); - - DPRINTF(LSQ, "LSQ sharing policy set to Threshold: " - "%i entries per LQ | %i entries per SQ\n", - maxLQEntries,maxSQEntries); + DPRINTF(LSQ, "LSQ mode set to Shared/Dynamic: %u LQ and %u SQ " + "entries are shared across active SMT threads, along " + "with %u RARQ and %u RAWQ entries\n", + LQEntries, SQEntries, RARQEntries, RAWQEntries); + } else if (lsqPolicy == SMTQueuePolicy::Partitioned) { + DPRINTF(LSQ, "LSQ mode set to Shared/Partitioned\n"); + } else if (lsqPolicy == SMTQueuePolicy::Threshold) { + DPRINTF(LSQ, "LSQ mode set to Shared/Threshold: threshold=%u\n", + smtLSQThreshold); + } else { + panic("Invalid LSQ sharing policy. Options are: Dynamic, " + "Partitioned, Threshold"); + } } else { - panic("Invalid LSQ sharing policy. Options are: Dynamic, " - "Partitioned, Threshold"); + panic("Invalid SMT LSQ mode. 
Options are: Independent, Shared"); } thread.reserve(numThreads); // TODO: Parameterize the load/store pipeline stages for (ThreadID tid = 0; tid < numThreads; tid++) { - thread.emplace_back(maxLQEntries, maxSQEntries, + thread.emplace_back(LQEntries, SQEntries, params.LdPipeStages, params.StPipeStages, params.RARQEntries, params.RAWQEntries, params.RARDequeuePerCycle, params.RAWDequeuePerCycle, params.LoadCompletionWidth, params.StoreCompletionWidth); @@ -778,13 +785,13 @@ LSQ::notifyDcacheRefill(Addr addr) unsigned LSQ::getFreeLQEntries(ThreadID tid) { - return thread[tid].numFreeLoadEntries(); + return logicalFreeLoadEntries(tid); } unsigned LSQ::getFreeSQEntries(ThreadID tid) { - return thread[tid].numFreeStoreEntries(); + return logicalFreeStoreEntries(tid); } unsigned @@ -1240,7 +1247,9 @@ LSQ::getStoreHeadSeqNum(ThreadID tid) int LSQ::getCount(ThreadID tid) { return thread.at(tid).getCount(); } -int LSQ::numLoads(ThreadID tid) { return thread.at(tid).numLoads(); } +int LSQ::numLoads(ThreadID tid) const { return thread.at(tid).numLoads(); } +int LSQ::numRAREntries(ThreadID tid) const { return thread.at(tid).numRAREntries(); } +int LSQ::numRAWEntries(ThreadID tid) const { return thread.at(tid).numRAWEntries(); } int LSQ::anyInflightLoadsNotComplete() { @@ -1273,7 +1282,7 @@ LSQ::anyStoreNotExecute() return false; } -int LSQ::numStores(ThreadID tid) { return thread.at(tid).numStores(); } +int LSQ::numStores(ThreadID tid) const { return thread.at(tid).numStores(); } int LSQ::numHtmStarts(ThreadID tid) const @@ -1471,9 +1480,8 @@ LSQ::getCount() return total; } - int -LSQ::numLoads() +LSQ::numLoads() const { unsigned total = 0; @@ -1490,7 +1498,24 @@ LSQ::numLoads() } int -LSQ::numStores() +LSQ::numRAREntries() const +{ + unsigned total = 0; + + std::list::iterator threads = activeThreads->begin(); + std::list::iterator end = activeThreads->end(); + + while (threads != end) { + ThreadID tid = *threads++; + + total += numRAREntries(tid); + } + + return total; +} + 
+int +LSQ::numStores() const { unsigned total = 0; @@ -1506,9 +1531,149 @@ LSQ::numStores() return total; } +int +LSQ::numRAWEntries() const +{ + unsigned total = 0; + + std::list::iterator threads = activeThreads->begin(); + std::list::iterator end = activeThreads->end(); + + while (threads != end) { + ThreadID tid = *threads++; + + total += numRAWEntries(tid); + } + + return total; +} + +bool +LSQ::sharedLSQMode() const +{ + return lsqMode == SMTLSQMode::Shared; +} + +unsigned +LSQ::activeLSQThreads() const +{ + if (!activeThreads || activeThreads->empty()) { + return numThreads; + } + return activeThreads->size(); +} + +unsigned +LSQ::sharedLSQAllocation(unsigned entries) const +{ + const unsigned active_threads = std::max(1U, activeLSQThreads()); + + switch (lsqPolicy) { + case SMTQueuePolicy::Dynamic: + return entries; + case SMTQueuePolicy::Partitioned: + return entries / active_threads; + case SMTQueuePolicy::Threshold: + return active_threads == 1 ? entries : + std::min(entries, smtLSQThreshold); + default: + panic("Invalid LSQ sharing policy. Options are: Dynamic, " + "Partitioned, Threshold"); + } +} + +unsigned +LSQ::logicalMaxLoadEntries(ThreadID tid) const +{ + return sharedLSQMode() ? sharedLSQAllocation(LQEntries) : LQEntries; +} + +unsigned +LSQ::logicalMaxStoreEntries(ThreadID tid) const +{ + return sharedLSQMode() ? sharedLSQAllocation(SQEntries) : SQEntries; +} + +unsigned +LSQ::logicalMaxRAREntries(ThreadID tid) const +{ + return sharedLSQMode() ? sharedLSQAllocation(RARQEntries) : RARQEntries; +} + +unsigned +LSQ::logicalMaxRAWEntries(ThreadID tid) const +{ + return sharedLSQMode() ? 
sharedLSQAllocation(RAWQEntries) : RAWQEntries; +} + +unsigned +LSQ::logicalFreeLoadEntries(ThreadID tid) const +{ + const unsigned thread_free = std::max(0, + static_cast(logicalMaxLoadEntries(tid)) - thread[tid].numLoads()); + if (!sharedLSQMode()) { + return thread_free; + } + + const unsigned shared_used = numLoads(); + const unsigned shared_free = std::max( + 0, static_cast(LQEntries) - static_cast(shared_used)); + return std::min(thread_free, shared_free); +} + +unsigned +LSQ::logicalFreeStoreEntries(ThreadID tid) const +{ + const unsigned thread_free = std::max(0, + static_cast(logicalMaxStoreEntries(tid)) - thread[tid].numStores()); + if (!sharedLSQMode()) { + return thread_free; + } + + const unsigned shared_used = numStores(); + const unsigned shared_free = std::max( + 0, static_cast(SQEntries) - static_cast(shared_used)); + return std::min(thread_free, shared_free); +} + +unsigned +LSQ::logicalFreeRAREntries(ThreadID tid) const +{ + const unsigned thread_free = std::max(0, + static_cast(logicalMaxRAREntries(tid)) - numRAREntries(tid)); + if (!sharedLSQMode()) { + return thread_free; + } + + const unsigned shared_used = numRAREntries(); + const unsigned shared_free = std::max( + 0, static_cast(RARQEntries) - static_cast(shared_used)); + return std::min(thread_free, shared_free); +} + +unsigned +LSQ::logicalFreeRAWEntries(ThreadID tid) const +{ + const unsigned thread_free = std::max(0, + static_cast(logicalMaxRAWEntries(tid)) - numRAWEntries(tid)); + if (!sharedLSQMode()) { + return thread_free; + } + + const unsigned shared_used = numRAWEntries(); + const unsigned shared_free = std::max( + 0, static_cast(RAWQEntries) - static_cast(shared_used)); + return std::min(thread_free, shared_free); +} + unsigned LSQ::numFreeLoadEntries() { + if (sharedLSQMode()) { + const unsigned used = numLoads(); + return used < LQEntries ? 
LQEntries - used : 0; + } + unsigned total = 0; std::list::iterator threads = activeThreads->begin(); @@ -1526,6 +1691,11 @@ LSQ::numFreeLoadEntries() unsigned LSQ::numFreeStoreEntries() { + if (sharedLSQMode()) { + const unsigned used = numStores(); + return used < SQEntries ? SQEntries - used : 0; + } + unsigned total = 0; std::list::iterator threads = activeThreads->begin(); @@ -1543,18 +1713,22 @@ LSQ::numFreeStoreEntries() unsigned LSQ::numFreeLoadEntries(ThreadID tid) { - return thread[tid].numFreeLoadEntries(); + return logicalFreeLoadEntries(tid); } unsigned LSQ::numFreeStoreEntries(ThreadID tid) { - return thread[tid].numFreeStoreEntries(); + return logicalFreeStoreEntries(tid); } bool LSQ::isFull() { + if (sharedLSQMode()) { + return lqFull() || sqFull(); + } + std::list::iterator threads = activeThreads->begin(); std::list::iterator end = activeThreads->end(); @@ -1571,12 +1745,12 @@ LSQ::isFull() bool LSQ::isFull(ThreadID tid) { - //@todo: Change to Calculate All Entries for - //Dynamic Policy - if (lsqPolicy == SMTQueuePolicy::Dynamic) - return isFull(); - else - return thread[tid].lqFull() || thread[tid].sqFull(); + if (sharedLSQMode()) { + return logicalFreeLoadEntries(tid) == 0 || + logicalFreeStoreEntries(tid) == 0; + } + + return thread[tid].lqFull() || thread[tid].sqFull(); } bool @@ -1632,6 +1806,10 @@ LSQ::sqEmpty(ThreadID tid) const bool LSQ::lqFull() { + if (sharedLSQMode()) { + return numFreeLoadEntries() == 0; + } + std::list::iterator threads = activeThreads->begin(); std::list::iterator end = activeThreads->end(); @@ -1648,17 +1826,20 @@ LSQ::lqFull() bool LSQ::lqFull(ThreadID tid) { - //@todo: Change to Calculate All Entries for - //Dynamic Policy - if (lsqPolicy == SMTQueuePolicy::Dynamic) - return lqFull(); - else - return thread[tid].lqFull(); + if (sharedLSQMode()) { + return logicalFreeLoadEntries(tid) == 0; + } + + return thread[tid].lqFull(); } bool LSQ::sqFull() { + if (sharedLSQMode()) { + return numFreeStoreEntries() == 0; + } 
+ std::list::iterator threads = activeThreads->begin(); std::list::iterator end = activeThreads->end(); @@ -1675,12 +1856,11 @@ LSQ::sqFull() bool LSQ::sqFull(ThreadID tid) { - //@todo: Change to Calculate All Entries for - //Dynamic Policy - if (lsqPolicy == SMTQueuePolicy::Dynamic) - return sqFull(); - else - return thread[tid].sqFull(); + if (sharedLSQMode()) { + return logicalFreeStoreEntries(tid) == 0; + } + + return thread[tid].sqFull(); } const DynInstPtr& diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh index 159eaa0ab5..28cb6e0146 100644 --- a/src/cpu/o3/lsq.hh +++ b/src/cpu/o3/lsq.hh @@ -66,6 +66,7 @@ #include "cpu/o3/dyn_inst_xsmeta.hh" #include "cpu/o3/limits.hh" #include "cpu/utils.hh" +#include "enums/SMTLSQMode.hh" #include "enums/SMTQueuePolicy.hh" #include "mem/packet.hh" #include "mem/port.hh" @@ -928,18 +929,28 @@ class LSQ int getCount(ThreadID tid); /** Returns the total number of loads in the load queue. */ - int numLoads(); + int numLoads() const; /** Returns the total number of loads for a single thread. */ - int numLoads(ThreadID tid); + int numLoads(ThreadID tid) const; int anyInflightLoadsNotComplete(); bool anyStoreNotExecute(); /** Returns the total number of stores in the store queue. */ - int numStores(); + int numStores() const; /** Returns the total number of stores for a single thread. */ - int numStores(ThreadID tid); + int numStores(ThreadID tid) const; + + /** Returns the total number of entries in the RAR queue. */ + int numRAREntries() const; + /** Returns the total number of RAR queue entries for a single thread. */ + int numRAREntries(ThreadID tid) const; + + /** Returns the total number of entries in the RAW queue. */ + int numRAWEntries() const; + /** Returns the total number of RAW queue entries for a single thread. 
*/ + int numRAWEntries(ThreadID tid) const; // hardware transactional memory @@ -1207,6 +1218,18 @@ class LSQ unsigned getFreeSQEntries(ThreadID tid); unsigned getAndResetLastSQPopEntries(ThreadID tid); + bool sharedLSQMode() const; + unsigned activeLSQThreads() const; + unsigned sharedLSQAllocation(unsigned entries) const; + unsigned logicalMaxLoadEntries(ThreadID tid) const; + unsigned logicalMaxStoreEntries(ThreadID tid) const; + unsigned logicalFreeLoadEntries(ThreadID tid) const; + unsigned logicalFreeStoreEntries(ThreadID tid) const; + unsigned logicalMaxRAREntries(ThreadID tid) const; + unsigned logicalMaxRAWEntries(ThreadID tid) const; + unsigned logicalFreeRAREntries(ThreadID tid) const; + unsigned logicalFreeRAWEntries(ThreadID tid) const; + /** Is D-cache blocked? */ bool cacheBlocked() const; /** Set D-cache blocked status */ @@ -1292,30 +1315,13 @@ class LSQ Addr staleTranslationWaitTxnId; /** The LSQ policy for SMT mode. */ + SMTLSQMode lsqMode; + + /** The LSQ allocation policy used in shared mode. */ SMTQueuePolicy lsqPolicy; - /** Auxiliary function to calculate per-thread max LSQ allocation limit. - * Depending on a policy, number of entries and possibly number of threads - * and threshold, this function calculates how many resources each thread - * can occupy at most. - */ - static uint32_t - maxLSQAllocation(SMTQueuePolicy pol, uint32_t entries, - uint32_t numThreads, uint32_t SMTThreshold) - { - if (pol == SMTQueuePolicy::Dynamic) { - return entries; - } else if (pol == SMTQueuePolicy::Partitioned) { - //@todo:make work if part_amt doesnt divide evenly. - return entries / numThreads; - } else if (pol == SMTQueuePolicy::Threshold) { - //Divide up by threshold amount - //@todo: Should threads check the max and the total - //amount of the LSQ - return SMTThreshold; - } - return 0; - } + /** The per-thread threshold used in shared threshold mode. 
*/ + unsigned smtLSQThreshold; struct LSQStats : public statistics::Group { @@ -1352,11 +1358,10 @@ class LSQ /** Max number of memory instructions that may enter LSQ in one cycle. */ const unsigned enqueueWidth; - /** Max LQ Size - Used to Enforce Sharing Policies. */ - unsigned maxLQEntries; - - /** Max SQ Size - Used to Enforce Sharing Policies. */ - unsigned maxSQEntries; + /** Total Size of RARQ Entries. */ + unsigned RARQEntries; + /** Total Size of RAWQ Entries. */ + unsigned RAWQEntries; /** Data port. */ DcachePort dcachePort; diff --git a/src/cpu/o3/lsq_unit.cc b/src/cpu/o3/lsq_unit.cc index 467fd73160..5112ee8c40 100644 --- a/src/cpu/o3/lsq_unit.cc +++ b/src/cpu/o3/lsq_unit.cc @@ -1478,11 +1478,13 @@ LSQUnit::loadDoRecvData(const DynInstPtr &inst) const bool trackRAR = loadCompletedIdx != loadQueue.tail() && inst->isNormalLd() && inst->lqIt.idx() > loadCompletedIdx + 1; - const bool rarReplay = trackRAR && RARQueue.size() >= maxRARQEntries; + const bool rarReplay = + trackRAR && lsq->logicalFreeRAREntries(lsqID) == 0; const bool trackRAW = storeCompletedIdx != storeQueue.tail() && inst->isNormalLd() && inst->sqIt.idx() > storeCompletedIdx + 1; - const bool rawReplay = trackRAW && RAWQueue.size() >= maxRAWQEntries; + const bool rawReplay = + trackRAW && lsq->logicalFreeRAWEntries(lsqID) == 0; if (cacheMissReplay) { inst->markReplayFlag(LdStReplayType::CacheMissReplay); @@ -3853,7 +3855,7 @@ LSQUnit::processReplayQueues() // Collect instructions from RAR replay queue when space available assert(RARQueue.size() <= maxRARQEntries); - const int freeRARSize = maxRARQEntries - RARQueue.size(); + const int freeRARSize = lsq->logicalFreeRAREntries(lsqID); const int maxRARCollect = std::min(freeRARSize, (int)rarDequeuePerCycle - RARReplayCount); for (int i = 0; i < maxRARCollect && !RARReplayQueue.empty(); ++i) { DynInstPtr inst = RARReplayQueue.front(); @@ -3863,7 +3865,7 @@ LSQUnit::processReplayQueues() // Collect instructions from RAW replay queue when space 
available assert(RAWQueue.size() <= maxRAWQEntries); - const int freeRAWSize = maxRAWQEntries - RAWQueue.size(); + const int freeRAWSize = lsq->logicalFreeRAWEntries(lsqID); const int maxRAWCollect = std::min(freeRAWSize, (int)rawDequeuePerCycle - RAWReplayCount); for (int i = 0; i < maxRAWCollect && !RAWReplayQueue.empty(); ++i) { DynInstPtr inst = RAWReplayQueue.front(); diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh index 837cc65506..fd2ff7d172 100644 --- a/src/cpu/o3/lsq_unit.hh +++ b/src/cpu/o3/lsq_unit.hh @@ -410,10 +410,16 @@ class LSQUnit unsigned getAndResetLastClockSQPopEntries(); /** Returns the number of loads in the LQ. */ - int numLoads() { return loadQueue.size(); } + int numLoads() const { return loadQueue.size(); } /** Returns the number of stores in the SQ. */ - int numStores() { return storeQueue.size(); } + int numStores() const { return storeQueue.size(); } + + /** Returns the number of entries in the per-thread RAR queue. */ + int numRAREntries() const { return RARQueue.size(); } + + /** Returns the number of entries in the per-thread RAW queue. 
*/ + int numRAWEntries() const { return RAWQueue.size(); } // hardware transactional memory int numHtmStarts() const { return htmStarts; } diff --git a/src/cpu/pred/BranchPredictor.py b/src/cpu/pred/BranchPredictor.py index 3171928e1b..4b1b48c097 100644 --- a/src/cpu/pred/BranchPredictor.py +++ b/src/cpu/pred/BranchPredictor.py @@ -959,6 +959,12 @@ class DecoupledBPUWithFTB(BranchPredictor): enableJumpAheadPredictor = Param.Bool(False, "Use jump ahead predictor to skip no-need-to-predict blocks") enableTwoTaken = Param.Bool(False, "Enable predicting two taken blocks per cycle") +class SMTFTQMode(ScopedEnum): + vals = [ 'Independent', 'Shared' ] + +class SMTFTQPolicy(ScopedEnum): + vals = [ 'Dynamic', 'Partitioned', 'Threshold' ] + class TimedBaseBTBPredictor(SimObject): type = 'TimedBaseBTBPredictor' cxx_class = 'gem5::branch_prediction::btb_pred::TimedBaseBTBPredictor' @@ -1188,6 +1194,11 @@ class DecoupledBPUWithBTB(BranchPredictor): # n = 2 ftq_size = Param.Unsigned(128, "Fetch target queue size") + smtFTQMode = Param.SMTFTQMode('Independent', + "SMT FTQ mode: per-thread independent or shared quota") + smtFTQPolicy = Param.SMTFTQPolicy('Partitioned', + "SMT shared FTQ allocation policy") + smtFTQThreshold = Param.Int(100, "SMT FTQ Threshold Sharing Parameter") fsq_size = Param.Unsigned(64, "Fetch stream queue size") maxHistLen = Param.Unsigned(970, "The length of history") diff --git a/src/cpu/pred/SConscript b/src/cpu/pred/SConscript index a7b3ff30f1..ae3ab2da7f 100644 --- a/src/cpu/pred/SConscript +++ b/src/cpu/pred/SConscript @@ -51,7 +51,7 @@ SimObject('BranchPredictor.py', sim_objects=[ 'AheadBTB', 'MBTB', 'UBTB', 'DecoupledBPUWithBTB', 'TimedBaseBTBPredictor', 'BTBRAS', 'BTBTAGE', 'BTBTAGEUpperBound', 'MicroTAGE', - 'BTBITTAGE', 'BTBMGSC'], enums=["BpType"]) + 'BTBITTAGE', 'BTBMGSC'], enums=["BpType", "SMTFTQMode", "SMTFTQPolicy"]) DebugFlag('Indirect') Source('bpred_unit.cc') diff --git a/src/cpu/pred/btb/decoupled_bpred.cc 
b/src/cpu/pred/btb/decoupled_bpred.cc index aec2222806..4ba14a04ed 100644 --- a/src/cpu/pred/btb/decoupled_bpred.cc +++ b/src/cpu/pred/btb/decoupled_bpred.cc @@ -1,5 +1,6 @@ #include "cpu/pred/btb/decoupled_bpred.hh" +#include #include #include "arch/riscv/regs/misc.hh" @@ -60,10 +61,20 @@ DecoupledBPUWithBTB::DecoupledBPUWithBTB(const DecoupledBPUWithBTBParams &p) // uras(p.uras), bpDBSwitches(p.bpDBSwitches), numStages(p.numStages), + ftqEntries(p.ftq_size), + ftqMode(p.smtFTQMode), + ftqPolicy(p.smtFTQPolicy), + smtFTQThreshold(p.smtFTQThreshold), ftq(p.numThreads, p.ftq_size), resolveBlockThreshold(p.resolveBlockThreshold), dbpBtbStats(this, p.numStages, p.fsq_size, maxInstsNum) { + panic_if(ftqMode == SMTFTQMode::Shared && + ftqPolicy == SMTFTQPolicy::Threshold && + smtFTQThreshold > ftqEntries, + "SMT FTQ threshold (%u) exceeds total FTQ entries (%u)", + smtFTQThreshold, ftqEntries); + if (bpDBSwitches.size() > 0) { initDB(); } @@ -135,6 +146,85 @@ DecoupledBPUWithBTB::DecoupledBPUWithBTB(const DecoupledBPUWithBTBParams &p) }); } +bool +DecoupledBPUWithBTB::sharedFTQMode() const +{ + return ftqMode == SMTFTQMode::Shared; +} + +unsigned +DecoupledBPUWithBTB::activeFTQThreads() const +{ + if (!sharedFTQMode()) { + return 1; + } + + if (!cpu) { + return std::max(1u, numThreads); + } + + return std::max(1, cpu->numActiveThreads()); +} + +unsigned +DecoupledBPUWithBTB::totalFTQEntries() const +{ + unsigned total = 0; + for (ThreadID tid = 0; tid < numThreads; ++tid) { + total += ftq.size(tid); + } + return total; +} + +unsigned +DecoupledBPUWithBTB::sharedFTQAllocation(unsigned entries) const +{ + const unsigned active_threads = activeFTQThreads(); + + switch (ftqPolicy) { + case SMTFTQPolicy::Dynamic: + return entries; + case SMTFTQPolicy::Partitioned: + return entries / active_threads; + case SMTFTQPolicy::Threshold: + return active_threads == 1 ? 
entries : std::min(entries, smtFTQThreshold); + default: + panic("Invalid SMT FTQ sharing policy"); + } +} + +unsigned +DecoupledBPUWithBTB::logicalMaxFTQEntries(ThreadID tid) const +{ + if (!sharedFTQMode()) { + return ftqEntries; + } + + return sharedFTQAllocation(ftqEntries); +} + +unsigned +DecoupledBPUWithBTB::logicalFreeFTQEntries(ThreadID tid) const +{ + const unsigned local_max = logicalMaxFTQEntries(tid); + const unsigned local_used = ftq.size(tid); + const unsigned local_free = local_used >= local_max ? 0 : local_max - local_used; + + if (!sharedFTQMode()) { + return local_free; + } + + const unsigned total_used = totalFTQEntries(); + const unsigned shared_free = total_used >= ftqEntries ? 0 : ftqEntries - total_used; + return std::min(local_free, shared_free); +} + +bool +DecoupledBPUWithBTB::ftqFull(ThreadID tid) const +{ + return logicalFreeFTQEntries(tid) == 0; +} + ThreadID DecoupledBPUWithBTB::scheduleThread() { @@ -187,7 +277,7 @@ DecoupledBPUWithBTB::tick() } // 1. Request new prediction if FSQ not full and we are idle - if (!threads[curTid].validprediction && !ftq.full(curTid)) { + if (!threads[curTid].validprediction && !ftqFull(curTid)) { if (threads[curTid].blockPredictionPending) { DPRINTF(Override, "Prediction blocked to prioritize resolve update\n"); dbpBtbStats.predictionBlockedForUpdate++; @@ -394,7 +484,7 @@ DecoupledBPUWithBTB::processNewPrediction(ThreadID tid) // Monitor FSQ size for statistics dbpBtbStats.fsqEntryDist.sample(ftq.size(tid), 1); - if (ftq.full(tid)) { + if (ftqFull(tid)) { dbpBtbStats.fsqFullCannotEnq++; DPRINTF(Override, "FSQ is full (%lu entries)\n", ftq.size(tid)); return; diff --git a/src/cpu/pred/btb/decoupled_bpred.hh b/src/cpu/pred/btb/decoupled_bpred.hh index 0a46c1a4e5..380e2e6eb5 100644 --- a/src/cpu/pred/btb/decoupled_bpred.hh +++ b/src/cpu/pred/btb/decoupled_bpred.hh @@ -19,20 +19,20 @@ #include "cpu/pred/btb/btb_mgsc.hh" #include "cpu/pred/btb/btb_tage.hh" #include "cpu/pred/btb/btb_ubtb.hh" +#include 
"cpu/pred/btb/common.hh" #include "cpu/pred/btb/ftq.hh" +#include "cpu/pred/btb/history_manager.hh" #include "cpu/pred/btb/mbtb.hh" #include "cpu/pred/btb/microtage.hh" #include "cpu/pred/btb/ras.hh" -#include "cpu/pred/general_arch_db.hh" - -// #include "cpu/pred/btb/uras.hh" -#include "cpu/pred/btb/common.hh" -#include "cpu/pred/btb/history_manager.hh" #include "cpu/pred/btb/timed_base_pred.hh" +#include "cpu/pred/general_arch_db.hh" #include "cpu/timebuf.hh" #include "debug/DBPBTBStats.hh" #include "debug/DecoupleBP.hh" #include "debug/DecoupleBPProbe.hh" +#include "enums/SMTFTQMode.hh" +#include "enums/SMTFTQPolicy.hh" #include "params/DecoupledBPUWithBTB.hh" namespace gem5 @@ -121,6 +121,10 @@ class DecoupledBPUWithBTB : public BPredUnit // std::vector predsOfEachStage{}; unsigned numComponents{}; unsigned numStages{}; + unsigned ftqEntries; + SMTFTQMode ftqMode; + SMTFTQPolicy ftqPolicy; + unsigned smtFTQThreshold; FetchTargetQueue ftq; @@ -144,6 +148,14 @@ class DecoupledBPUWithBTB : public BPredUnit std::vector resolveDequeueFailCounters; const unsigned resolveBlockThreshold; + bool sharedFTQMode() const; + unsigned activeFTQThreads() const; + unsigned totalFTQEntries() const; + unsigned sharedFTQAllocation(unsigned entries) const; + unsigned logicalMaxFTQEntries(ThreadID tid) const; + unsigned logicalFreeFTQEntries(ThreadID tid) const; + bool ftqFull(ThreadID tid) const; + ThreadID scheduleThread(); void processNewPrediction(ThreadID tid); From cc766e3127d5fa1487a1ef2943666a20ba1d7daa Mon Sep 17 00:00:00 2001 From: tastynoob <934348725@qq.com> Date: Mon, 13 Apr 2026 12:26:39 +0800 Subject: [PATCH 17/38] cpu: avoid full memcpy_init for dedup difftest Change-Id: Ic49b3e7ab82a32b81427026ce8d185e7ebeaba76 --- src/cpu/base.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/cpu/base.cc b/src/cpu/base.cc index 264e17bf4d..2c64a00014 100644 --- a/src/cpu/base.cc +++ b/src/cpu/base.cc @@ -1684,8 +1684,6 @@ BaseCPU::difftestStep(ThreadID tid, InstSeqNum seq) 
assert(diffAllStates->proxy->ref_get_backed_memory); diffAllStates->proxy->ref_get_backed_memory( system->createCopyOnWriteBranch(), pmemSize); - diffAllStates->proxy->memcpy_init( - 0x80000000u, goldenMemPtr, pmemSize, DUT_TO_REF); } else { assert(diffAllStates->proxy->ref_get_backed_memory); diffAllStates->proxy->ref_get_backed_memory(system->createCopyOnWriteBranch(), pmemSize); From ea80c0a5e88ddecb0a76ed1f51bb7baf266cd707 Mon Sep 17 00:00:00 2001 From: tastynoob <934348725@qq.com> Date: Mon, 13 Apr 2026 17:01:59 +0800 Subject: [PATCH 18/38] cpu: apply asid hash to mgsc lookups Change-Id: I4ee24d52f8ffe8fdc0a2086d97bcbc9be10860cb --- src/cpu/pred/btb/btb_mgsc.cc | 89 ++++++++++++++++++----------- src/cpu/pred/btb/btb_mgsc.hh | 18 +++--- src/cpu/pred/btb/decoupled_bpred.cc | 12 +++- 3 files changed, 75 insertions(+), 44 deletions(-) diff --git a/src/cpu/pred/btb/btb_mgsc.cc b/src/cpu/pred/btb/btb_mgsc.cc index f0a3837191..fa51bbdb33 100755 --- a/src/cpu/pred/btb/btb_mgsc.cc +++ b/src/cpu/pred/btb/btb_mgsc.cc @@ -345,10 +345,12 @@ BTBMGSC::calculatePercsum(const std::vector>> & * @return Found weight or 0 if not found */ int -BTBMGSC::findWeight(const std::vector &weightTable, Addr pc) +BTBMGSC::findWeight(const std::vector &weightTable, Addr pc, + uint8_t asidHash) { auto mask = (1 << weightTableIdxWidth) - 1; auto pcHash = ((pc >> instShiftAmt) ^ ((pc >> instShiftAmt) >> 2)) & mask; + pcHash = xorAsidHashIntoIndex(pcHash, weightTableIdxWidth, asidHash); auto &entry = weightTable[pcHash]; return entry; } @@ -369,10 +371,12 @@ BTBMGSC::calculateScaledPercsum(int weight, int percsum) * @return Found threshold or default value if not found */ int -BTBMGSC::findThreshold(const std::vector &thresholdTable, Addr pc) +BTBMGSC::findThreshold(const std::vector &thresholdTable, Addr pc, + uint8_t asidHash) { auto mask = (1 << thresholdTablelogSize) - 1; auto pcHash = ((pc >> instShiftAmt) ^ ((pc >> instShiftAmt) >> 2)) & mask; + pcHash = xorAsidHashIntoIndex(pcHash, 
thresholdTablelogSize, asidHash); auto &entry = thresholdTable[pcHash]; return entry; } @@ -403,7 +407,7 @@ BTBMGSC::calculateWeightScaleDiff(int total_sum, int scale_percsum, int percsum) BTBMGSC::MgscPrediction BTBMGSC::generateSinglePrediction(const BTBEntry &btb_entry, const Addr &startPC, const TageInfoForMGSC &tage_info, - ThreadID tid) + ThreadID tid, uint8_t asidHash) { DPRINTF(MGSC, "generateSinglePrediction for btbEntry: %#lx, always taken %d\n", btb_entry.pc, btb_entry.alwaysTaken); @@ -412,12 +416,15 @@ BTBMGSC::generateSinglePrediction(const BTBEntry &btb_entry, const Addr &startPC // Calculate indices for all tables for (unsigned int i = 0; i < bwTableNum; ++i) { bwIndex[i] = getHistIndex(startPC, bwTableIdxWidth - numCtrsPerLineBits, - state.indexBwFoldedHist[i].get()); + state.indexBwFoldedHist[i].get(), asidHash); } + const Addr localHistoryIndex = + getPcIndex(startPC, log2(numEntriesFirstLocalHistories), asidHash); for (unsigned int i = 0; i < lTableNum; ++i) { lIndex[i] = getHistIndex(startPC, lTableIdxWidth - numCtrsPerLineBits, - state.indexLFoldedHist[getPcIndex(startPC, log2(numEntriesFirstLocalHistories))][i].get()); + state.indexLFoldedHist[localHistoryIndex][i].get(), + asidHash); } // std::string buf; // boost::to_string(indexLFoldedHist[getPcIndex(startPC, log2(numEntriesFirstLocalHistories))][0].getAsBitset(), buf); @@ -425,46 +432,46 @@ BTBMGSC::generateSinglePrediction(const BTBEntry &btb_entry, const Addr &startPC for (unsigned int i = 0; i < iTableNum; ++i) { iIndex[i] = getHistIndex(startPC, iTableIdxWidth - numCtrsPerLineBits, - state.indexIFoldedHist[i].get()); + state.indexIFoldedHist[i].get(), asidHash); } for (unsigned int i = 0; i < gTableNum; ++i) { gIndex[i] = getHistIndex(startPC, gTableIdxWidth - numCtrsPerLineBits, - state.indexGFoldedHist[i].get()); + state.indexGFoldedHist[i].get(), asidHash); } for (unsigned int i = 0; i < pTableNum; ++i) { pIndex[i] = getHistIndex(startPC, pTableIdxWidth - numCtrsPerLineBits, - 
state.indexPFoldedHist[i].get()); + state.indexPFoldedHist[i].get(), asidHash); } for (unsigned int i = 0; i < biasTableNum; ++i) { biasIndex[i] = getBiasIndex(startPC, biasTableIdxWidth - numCtrsPerLineBits, tage_info.tage_main_taken, - tage_info.tage_pred_conf_low); + tage_info.tage_pred_conf_low, asidHash); } int bw_percsum = enableBwTable ? calculatePercsum(bwTable, bwIndex, bwTableNum, btb_entry.pc) : 0; - int bw_weight = findWeight(bwWeightTable, btb_entry.pc); + int bw_weight = findWeight(bwWeightTable, btb_entry.pc, asidHash); int bw_scaled_percsum = calculateScaledPercsum(bw_weight, bw_percsum); int l_percsum = enableLTable ? calculatePercsum(lTable, lIndex, lTableNum, btb_entry.pc) : 0; - int l_weight = findWeight(lWeightTable, btb_entry.pc); + int l_weight = findWeight(lWeightTable, btb_entry.pc, asidHash); int l_scaled_percsum = calculateScaledPercsum(l_weight, l_percsum); int i_percsum = enableITable ? calculatePercsum(iTable, iIndex, iTableNum, btb_entry.pc) : 0; - int i_weight = findWeight(iWeightTable, btb_entry.pc); + int i_weight = findWeight(iWeightTable, btb_entry.pc, asidHash); int i_scaled_percsum = calculateScaledPercsum(i_weight, i_percsum); int g_percsum = enableGTable ? calculatePercsum(gTable, gIndex, gTableNum, btb_entry.pc) : 0; - int g_weight = findWeight(gWeightTable, btb_entry.pc); + int g_weight = findWeight(gWeightTable, btb_entry.pc, asidHash); int g_scaled_percsum = calculateScaledPercsum(g_weight, g_percsum); int p_percsum = enablePTable ? calculatePercsum(pTable, pIndex, pTableNum, btb_entry.pc) : 0; - int p_weight = findWeight(pWeightTable, btb_entry.pc); + int p_weight = findWeight(pWeightTable, btb_entry.pc, asidHash); int p_scaled_percsum = calculateScaledPercsum(p_weight, p_percsum); int bias_percsum = enableBiasTable ? 
calculatePercsum(biasTable, biasIndex, biasTableNum, btb_entry.pc) : 0; - int bias_weight = findWeight(biasWeightTable, btb_entry.pc); + int bias_weight = findWeight(biasWeightTable, btb_entry.pc, asidHash); int bias_scaled_percsum = calculateScaledPercsum(bias_weight, bias_percsum); // Calculate total sum of all weighted percsums @@ -473,7 +480,8 @@ BTBMGSC::generateSinglePrediction(const BTBEntry &btb_entry, const Addr &startPC // Find thresholds // pc-indexed threshold table (only if enabled) - int p_update_thres = enablePCThreshold ? findThreshold(pUpdateThreshold, btb_entry.pc) : 0; + int p_update_thres = + enablePCThreshold ? findThreshold(pUpdateThreshold, btb_entry.pc, asidHash) : 0; int total_thres = (updateThreshold / 8) + p_update_thres; // Threshold is used as a confidence gate; avoid negative values which @@ -530,7 +538,7 @@ BTBMGSC::generateSinglePrediction(const BTBEntry &btb_entry, const Addr &startPC void BTBMGSC::lookupHelper(const Addr &startPC, const std::vector &btbEntries, const std::unordered_map &tageInfoForMgscs, - CondTakens &results, ThreadID tid) + CondTakens &results, ThreadID tid, uint8_t asidHash) { DPRINTF(MGSC, "lookupHelper startAddr: %#lx\n", startPC); @@ -541,7 +549,8 @@ BTBMGSC::lookupHelper(const Addr &startPC, const std::vector &btbEntri auto tage_info = tageInfoForMgscs.find(btb_entry.pc); if (tage_info != tageInfoForMgscs.end()) { auto pred = generateSinglePrediction(btb_entry, startPC, - tage_info->second, tid); + tage_info->second, tid, + asidHash); threadMeta[tid]->preds[btb_entry.pc] = pred; results.push_back({btb_entry.pc, pred.taken || btb_entry.alwaysTaken}); } else { @@ -569,6 +578,7 @@ BTBMGSC::putPCHistory(Addr stream_start, const boost::dynamic_bitset<> &history, { const ThreadID tid = predictorTid(stagePreds); const auto &state = historyState(tid); + const uint8_t asidHash = stagePreds.empty() ? 
0 : stagePreds.front().asidHash; DPRINTF(MGSC, "putPCHistory startAddr: %#lx\n", stream_start); // IMPORTANT: when this function is called, @@ -592,7 +602,8 @@ BTBMGSC::putPCHistory(Addr stream_start, const boost::dynamic_bitset<> &history, auto &stage_pred = stagePreds[s]; stage_pred.condTakens.clear(); lookupHelper(stream_start, stage_pred.btbEntries, - stage_pred.tageInfoForMgscs, stage_pred.condTakens, tid); + stage_pred.tageInfoForMgscs, stage_pred.condTakens, tid, + asidHash); } } @@ -705,10 +716,11 @@ BTBMGSC::updateWeightTable(std::vector &weightTable, Addr tableIndex, A * @param update_direction Direction to update (true=increment, false=decrement) */ void -BTBMGSC::updatePCThresholdTable(Addr pc, bool update_direction) +BTBMGSC::updatePCThresholdTable(Addr pc, uint8_t asidHash, bool update_direction) { auto mask = (1 << thresholdTablelogSize) - 1; auto pcHash = ((pc >> instShiftAmt) ^ ((pc >> instShiftAmt) >> 2)) & mask; + pcHash = xorAsidHashIntoIndex(pcHash, thresholdTablelogSize, asidHash); auto &entry = pUpdateThreshold[pcHash]; updateCounter(update_direction, pUpdateThresholdWidth, entry); } @@ -876,10 +888,11 @@ BTBMGSC::updateSinglePredictor(const BTBEntry &entry, bool actual_taken, const M } #endif - // Only update tables if prediction was wrong or confidence was low + // Only update tables if prediction was wrong or confidence was low if (sc_pred_taken != actual_taken || abs(total_sum) < (total_thres / 2)) { // get weight table index from startPC - Addr weightTableIdx = getPcIndex(stream.startPC, weightTableIdxWidth); + Addr weightTableIdx = getPcIndex(stream.startPC, weightTableIdxWidth, + stream.asidHash); bool threshold_inc = (sc_pred_taken != actual_taken); if (threshold_inc) { mgscStats.pcThresholdInc++; @@ -921,7 +934,8 @@ BTBMGSC::updateSinglePredictor(const BTBEntry &entry, bool actual_taken, const M // Update PC-indexed threshold table (only if enabled) if (enablePCThreshold) { - updatePCThresholdTable(entry.pc, sc_pred_taken != 
actual_taken); + updatePCThresholdTable(entry.pc, stream.asidHash, + sc_pred_taken != actual_taken); } // Update global threshold table @@ -1007,7 +1021,8 @@ BTBMGSC::updateCounter(bool taken, unsigned width, uint64_t &counter); Addr -BTBMGSC::getHistIndex(Addr pc, unsigned tableIndexBits, uint64_t foldedHist) +BTBMGSC::getHistIndex(Addr pc, unsigned tableIndexBits, uint64_t foldedHist, + uint8_t asidHash) { // Create mask to limit result size to tableIndexBits Addr mask = (1ULL << tableIndexBits) - 1; @@ -1016,11 +1031,12 @@ BTBMGSC::getHistIndex(Addr pc, unsigned tableIndexBits, uint64_t foldedHist) Addr pcBits = (pc >> floorLog2(blockSize)) & mask; Addr foldedBits = foldedHist & mask; - return pcBits ^ foldedBits; + return xorAsidHashIntoIndex(pcBits ^ foldedBits, tableIndexBits, asidHash); } Addr -BTBMGSC::getBiasIndex(Addr pc, unsigned tableIndexBits, bool lowbit0, bool lowbit1) +BTBMGSC::getBiasIndex(Addr pc, unsigned tableIndexBits, bool lowbit0, + bool lowbit1, uint8_t asidHash) { // Create mask for tableIndexBits-2 to extract PC bits Addr mask = (1ULL << (tableIndexBits - 2)) - 1; @@ -1028,17 +1044,18 @@ BTBMGSC::getBiasIndex(Addr pc, unsigned tableIndexBits, bool lowbit0, bool lowbi // Extract lower bits of PC directly and combine with low bits Addr pcBits = (pc >> floorLog2(blockSize)) & mask; unsigned index = (pcBits << 2) + (lowbit1 << 1) + lowbit0; - return index; + return xorAsidHashIntoIndex(index, tableIndexBits, asidHash); } Addr -BTBMGSC::getPcIndex(Addr pc, unsigned tableIndexBits) +BTBMGSC::getPcIndex(Addr pc, unsigned tableIndexBits, uint8_t asidHash) { // Create mask to extract tableIndexBits from PC Addr mask = (1ULL << tableIndexBits) - 1; // Extract lower bits of PC directly without bitset - return (pc >> floorLog2(blockSize)) & mask; + Addr baseIndex = (pc >> floorLog2(blockSize)) & mask; + return xorAsidHashIntoIndex(baseIndex, tableIndexBits, asidHash); } template @@ -1244,8 +1261,10 @@ BTBMGSC::specUpdateLHist(const std::vector> 
&history, Fu int shamt; bool cond_taken; std::tie(shamt, cond_taken) = pred.getHistInfo(); - doUpdateHist(history[getPcIndex(pred.bbStart, log2(numEntriesFirstLocalHistories))], shamt, cond_taken, - state.indexLFoldedHist[getPcIndex(pred.bbStart, log2(numEntriesFirstLocalHistories))]); + const Addr localHistoryIndex = + getPcIndex(pred.bbStart, log2(numEntriesFirstLocalHistories), pred.asidHash); + doUpdateHist(history[localHistoryIndex], shamt, cond_taken, + state.indexLFoldedHist[localHistoryIndex]); } /** @@ -1386,9 +1405,11 @@ BTBMGSC::recoverLHist(const std::vector> &history, const state.indexLFoldedHist[k][i].recover(predMeta->indexLFoldedHist[k][i]); } } - doUpdateHist(history[getPcIndex(entry.startPC, log2(numEntriesFirstLocalHistories))], shamt, cond_taken, - state.indexLFoldedHist[getPcIndex(entry.startPC, log2(numEntriesFirstLocalHistories))]); - } + const Addr localHistoryIndex = + getPcIndex(entry.startPC, log2(numEntriesFirstLocalHistories), entry.asidHash); + doUpdateHist(history[localHistoryIndex], shamt, cond_taken, + state.indexLFoldedHist[localHistoryIndex]); +} #ifndef UNIT_TEST // Constructor for TAGE statistics diff --git a/src/cpu/pred/btb/btb_mgsc.hh b/src/cpu/pred/btb/btb_mgsc.hh index 6ff29b13c8..99c4c3be98 100755 --- a/src/cpu/pred/btb/btb_mgsc.hh +++ b/src/cpu/pred/btb/btb_mgsc.hh @@ -198,7 +198,7 @@ class BTBMGSC : public TimedBaseBTBPredictor ThreadID tid, const char *when); // Check GHR folded // Calculate MGSC weight index - Addr getPcIndex(Addr pc, unsigned tableIndexBits); + Addr getPcIndex(Addr pc, unsigned tableIndexBits, uint8_t asidHash = 0); private: // Utility functions for reducing code duplication @@ -211,7 +211,7 @@ class BTBMGSC : public TimedBaseBTBPredictor /** * Find weight in a weight table for a given PC */ - int findWeight(const std::vector &weightTable, Addr pc); + int findWeight(const std::vector &weightTable, Addr pc, uint8_t asidHash); /** * Calculate scaled percsum using weight @@ -221,7 +221,7 @@ class BTBMGSC 
: public TimedBaseBTBPredictor /** * Find threshold in a threshold table for a given PC */ - int findThreshold(const std::vector &thresholdTable, Addr pc); + int findThreshold(const std::vector &thresholdTable, Addr pc, uint8_t asidHash); /** * Calculate if weight scale causes prediction difference @@ -243,7 +243,7 @@ class BTBMGSC : public TimedBaseBTBPredictor /** * Update a threshold table and allocate new entry if needed */ - void updatePCThresholdTable(Addr pc, bool update_direction); + void updatePCThresholdTable(Addr pc, uint8_t asidHash, bool update_direction); /** * Update the global threshold table and allocate new entry if needed @@ -253,13 +253,15 @@ class BTBMGSC : public TimedBaseBTBPredictor // Look up predictions in MGSC tables for a stream of instructions void lookupHelper(const Addr &stream_start, const std::vector &btbEntries, const std::unordered_map &tageInfoForMgscs, - CondTakens &results, ThreadID tid); + CondTakens &results, ThreadID tid, uint8_t asidHash); // Calculate MGSC history index with folded history - Addr getHistIndex(Addr pc, unsigned tableIndexBits, uint64_t foldedHist); + Addr getHistIndex(Addr pc, unsigned tableIndexBits, uint64_t foldedHist, + uint8_t asidHash = 0); // Calculate MGSC bias index - Addr getBiasIndex(Addr pc, unsigned tableIndexBits, bool lowbit0, bool lowbit1); + Addr getBiasIndex(Addr pc, unsigned tableIndexBits, bool lowbit0, bool lowbit1, + uint8_t asidHash = 0); // Get offset within a block for a given PC Addr getOffset(Addr pc) { return (pc & (blockSize - 1)) >> 1; } @@ -284,7 +286,7 @@ class BTBMGSC : public TimedBaseBTBPredictor // Helper method to generate prediction for a single BTB entry MgscPrediction generateSinglePrediction(const BTBEntry &btb_entry, const Addr &startPC, const TageInfoForMGSC &tage_info, - ThreadID tid); + ThreadID tid, uint8_t asidHash); // Helper method to prepare BTB entries for update std::vector prepareUpdateEntries(const FetchTarget &stream); diff --git 
a/src/cpu/pred/btb/decoupled_bpred.cc b/src/cpu/pred/btb/decoupled_bpred.cc index 4ba14a04ed..07e3b138e1 100644 --- a/src/cpu/pred/btb/decoupled_bpred.cc +++ b/src/cpu/pred/btb/decoupled_bpred.cc @@ -1083,8 +1083,12 @@ DecoupledBPUWithBTB::updateHistoryForPrediction(FetchTarget &entry) pHistShiftIn(2, p_taken, s0PHistory, p_pc, p_target); // Update local history + const Addr localHistoryIndex = + mgsc->getPcIndex(finalPred.bbStart, + log2(mgsc->getNumEntriesFirstLocalHistories()), + finalPred.asidHash); histShiftIn(shamt, taken, - s0LHistory[mgsc->getPcIndex(finalPred.bbStart, log2(mgsc->getNumEntriesFirstLocalHistories()))]); + s0LHistory[localHistoryIndex]); #ifndef NDEBUG if (tage->isEnabled()) { @@ -1171,8 +1175,12 @@ DecoupledBPUWithBTB::recoverHistoryForSquash( histShiftIn(real_bw_shamt, real_bw_taken, s0BwHistory); // Update local history with actual outcome + const Addr localHistoryIndex = + mgsc->getPcIndex(target.startPC, + log2(mgsc->getNumEntriesFirstLocalHistories()), + target.asidHash); histShiftIn(real_shamt, real_taken, - s0LHistory[mgsc->getPcIndex(target.startPC, log2(mgsc->getNumEntriesFirstLocalHistories()))]); + s0LHistory[localHistoryIndex]); // Update history manager with appropriate branch info if (squash_type == SQUASH_CTRL) { From 491baad33712990d27f83e73ee7fa35e238475db Mon Sep 17 00:00:00 2001 From: mhnGitHubz <66786667+mhnGitHubz@users.noreply.github.com> Date: Tue, 14 Apr 2026 14:53:14 +0800 Subject: [PATCH 19/38] cpu-o3: add backend SMT PMU (#827) Co-authored-by: mo haonan --- src/cpu/o3/iew.cc | 19 ++++++++++++- src/cpu/o3/iew.hh | 2 ++ src/cpu/o3/issue_queue.cc | 10 +++++-- src/cpu/o3/issue_queue.hh | 1 + src/cpu/o3/rename.cc | 57 +++++++++++++++++++++++++++++++-------- src/cpu/o3/rename.hh | 14 +++++++--- 6 files changed, 85 insertions(+), 18 deletions(-) diff --git a/src/cpu/o3/iew.cc b/src/cpu/o3/iew.cc index a9a1a14565..96fded9794 100644 --- a/src/cpu/o3/iew.cc +++ b/src/cpu/o3/iew.cc @@ -207,6 +207,8 @@ 
IEW::IEWStats::IEWStats(CPU *cpu) "Average fanout of values written-back"), ADD_STAT(stallEvents, statistics::units::Count::get(), "Number of events the IEW has stalled"), + ADD_STAT(smtStallEvents, statistics::units::Count::get(), + "Number of events the IEW has stalled per thread"), ADD_STAT(fetchStallReason, statistics::units::Count::get(), "Number of fetch stall reasons each tick (Total)"), ADD_STAT(decodeStallReason, statistics::units::Count::get(), @@ -243,6 +245,11 @@ IEW::IEWStats::IEWStats(CPU *cpu) stallEvents .init(StallEventCount) .flags(statistics::total); + + smtStallEvents + .init(StallEventCount,0,cpu->numThreads-1,1) + .flags(statistics::total); + dispDist.init(0,10,1).flags(statistics::nozero); @@ -257,6 +264,7 @@ IEW::IEWStats::IEWStats(CPU *cpu) for (int i = 0; i < StallEventCount; i++) { stallEvents.subname(i, stall_event_str[static_cast(i)]); + smtStallEvents.subname(i, stall_event_str[static_cast(i)]); } fetchStallReason @@ -829,6 +837,7 @@ IEW::checkSquash() fetchRedirect[i] = false; iewStats.stallEvents[ROBWalk]++; + iewStats.smtStallEvents[ROBWalk].sample(i); setAllStalls(StallReason::CommitSquash); } @@ -837,6 +846,7 @@ IEW::checkSquash() wroteToTimeBuffer = true; iewStats.stallEvents[ROBWalk]++; + iewStats.smtStallEvents[ROBWalk].sample(i); setAllStalls(StallReason::CommitSquash); } } @@ -1014,6 +1024,7 @@ IEW::dispatchInstFromRename(ThreadID tid) DPRINTF(IEW, "[tid:%i] Dispatch: %s has become full.\n", tid, inst->isLoad() ? 
"LQ" : "SQ"); iewStats.stallEvents[LSQFull]++; + iewStats.smtStallEvents[LSQFull].sample(tid); ++iewStats.lsqFullEvents; dispatch_stalls.push(checkDispatchStall(tid, NumDQ, inst, disp_seq)); @@ -1025,6 +1036,8 @@ IEW::dispatchInstFromRename(ThreadID tid) if (!scheduler->ready(inst, disp_seq)) { DPRINTF(IEW, "[tid:%i] Dispatch: IQ is full or bwFull.\n", tid); iewStats.stallEvents[IQFull]++; + iewStats.smtStallEvents[IQFull].sample(tid); + ++iewStats.iqFullEvents; dispatch_stalls.push(checkDispatchStall(tid, NumDQ, inst, disp_seq)); @@ -1158,6 +1171,8 @@ IEW::dispatchInstFromRename(ThreadID tid) DPRINTF(IEW,"[tid:%i] Dispatch: Bandwidth Full. Blocking.\n", tid); iewStats.stallEvents[DispBWFull]++; + iewStats.smtStallEvents[DispBWFull].sample(tid); + } } @@ -1270,6 +1285,7 @@ IEW::classifyInstToDispQue(ThreadID tid) if (!insts_to_dispatch.empty()) { DPRINTF(IEW,"[tid:%i] Dispatch: Bandwidth Full. Blocking.\n", tid); iewStats.stallEvents[DispBWFull]++; + iewStats.smtStallEvents[DispBWFull].sample(tid); } } @@ -1302,6 +1318,7 @@ IEW::dispatchInstFromDispQue() DPRINTF(IEW, "[tid:%i] Dispatch: IQ is full or bwFull.\n", tid); iewStats.stallEvents[IQFull]++; + iewStats.smtStallEvents[IQFull].sample(tid); ++iewStats.iqFullEvents; break; } @@ -1314,7 +1331,7 @@ IEW::dispatchInstFromDispQue() inst->isLoad() ? "LQ" : "SQ"); iewStats.stallEvents[LSQFull]++; - + iewStats.smtStallEvents[LSQFull].sample(tid); ++iewStats.lsqFullEvents; break; } diff --git a/src/cpu/o3/iew.hh b/src/cpu/o3/iew.hh index 94cfbcb8cc..c621e62ebc 100644 --- a/src/cpu/o3/iew.hh +++ b/src/cpu/o3/iew.hh @@ -571,6 +571,8 @@ class IEW statistics::Vector stallEvents; + statistics::VectorDistribution smtStallEvents; + /** Distribution of number of fetch stall reasons each tick. */ statistics::Vector fetchStallReason; /** Distribution of number of decode stall reasons each tick. 
*/ diff --git a/src/cpu/o3/issue_queue.cc b/src/cpu/o3/issue_queue.cc index c3739031aa..d83083a45f 100644 --- a/src/cpu/o3/issue_queue.cc +++ b/src/cpu/o3/issue_queue.cc @@ -224,7 +224,8 @@ IssueQue::IssueQueStats::IssueQueStats(statistics::Group* parent, IssueQue* que, ADD_STAT(issueDist, statistics::units::Count::get(), "distruibution of issue"), ADD_STAT(portissued, statistics::units::Count::get(), "count each port issues"), ADD_STAT(portBusy, statistics::units::Count::get(), "count each port busy cycles"), - ADD_STAT(avgInsts, statistics::units::Count::get(), "average insts") + ADD_STAT(avgInsts, statistics::units::Count::get(), "average insts"), + ADD_STAT(instsNum, statistics::units::Count::get(), "insts per thread") { insertDist.init(que->inports + 1).flags(statistics::nozero); issueDist.init(que->outports + 1).flags(statistics::nozero); @@ -235,6 +236,7 @@ IssueQue::IssueQueStats::IssueQueStats(statistics::Group* parent, IssueQue* que, loadmiss.flags(statistics::nozero); arbFailed.flags(statistics::nozero); issueOccupy.flags(statistics::nozero); + instsNum.flags(statistics::nozero); } IssueQue::IssueQue(const IssueQueParams& params) @@ -375,6 +377,7 @@ IssueQue::setCPU(CPU* cpu) this->cpu = cpu; _name = cpu->name() + ".scheduler." + getName(); iqstats = new IssueQueStats(cpu, this, "scheduler." 
+ this->getName()); + iqstats->instsNum.init(cpu->numThreads); } void @@ -903,7 +906,10 @@ IssueQue::incInIQInstsCounter(ThreadID tid) { if (instsCounter) { instsCounter->incCounter(tid); - } + } + if (iqstats) { + iqstats->instsNum[tid]++; + } } void diff --git a/src/cpu/o3/issue_queue.hh b/src/cpu/o3/issue_queue.hh index a4416663a0..6c6d9f8fbf 100644 --- a/src/cpu/o3/issue_queue.hh +++ b/src/cpu/o3/issue_queue.hh @@ -206,6 +206,7 @@ class IssueQue : public SimObject statistics::Vector portissued; statistics::Vector portBusy; statistics::Average avgInsts; + statistics::Vector instsNum; }* iqstats = nullptr; void replay(const DynInstPtr& inst); diff --git a/src/cpu/o3/rename.cc b/src/cpu/o3/rename.cc index 02c4f40144..33d7852a87 100644 --- a/src/cpu/o3/rename.cc +++ b/src/cpu/o3/rename.cc @@ -66,9 +66,9 @@ Rename::Rename(CPU *_cpu, const BaseO3CPUParams ¶ms) renameWidth(params.renameWidth), releaseWidth(params.phyregReleaseWidth), numThreads(params.numThreads), - stats(_cpu), + stats(_cpu, this), valuePred(params.valuePred), - enableSelectiveVPFlush(params.enableSelectiveVPFlush) + enableSelectiveVPFlush(params.enableSelectiveVPFlush) { if (renameWidth > MaxWidth) fatal("renameWidth (%d) is larger than compiled limit (%d),\n" @@ -94,8 +94,8 @@ Rename::name() const return cpu->name() + ".rename"; } -Rename::RenameStats::RenameStats(statistics::Group *parent) - : statistics::Group(parent, "rename"), +Rename::RenameStats::RenameStats(CPU *cpu, Rename *rename) + : statistics::Group(cpu, "rename"), ADD_STAT(squashCycles, statistics::units::Cycle::get(), "Number of cycles rename is squashing"), ADD_STAT(idleCycles, statistics::units::Cycle::get(), @@ -109,7 +109,7 @@ Rename::RenameStats::RenameStats(statistics::Group *parent) ADD_STAT(unblockCycles, statistics::units::Cycle::get(), "Number of cycles rename is unblocking"), ADD_STAT(renamedInsts, statistics::units::Count::get(), - "Number of instructions processed by rename"), + "Number of instructions processed by rename 
per thread"), ADD_STAT(squashedInsts, statistics::units::Count::get(), "Number of squashed instructions processed by rename"), ADD_STAT(ROBFullEvents, statistics::units::Count::get(), @@ -149,7 +149,9 @@ Rename::RenameStats::RenameStats(statistics::Group *parent) ADD_STAT(constantFolded, statistics::units::Count::get(), "count of insts eliminated by constant folding"), ADD_STAT(stallEvents, statistics::units::Count::get(), - "count of stall events") + "count of stall events"), + ADD_STAT(smtStallEvents, statistics::units::Count::get(), + "Number of events the Rename has stalled per thread") { squashCycles.prereq(squashCycles); idleCycles.prereq(idleCycles); @@ -158,14 +160,12 @@ Rename::RenameStats::RenameStats(statistics::Group *parent) runCycles.prereq(idleCycles); unblockCycles.prereq(unblockCycles); - renamedInsts.prereq(renamedInsts); squashedInsts.prereq(squashedInsts); ROBFullEvents.prereq(ROBFullEvents); IQFullEvents.prereq(IQFullEvents); LQFullEvents.prereq(LQFullEvents); SQFullEvents.prereq(SQFullEvents); - fullRegistersEvents.prereq(fullRegistersEvents); renamedOperands.prereq(renamedOperands); lookups.prereq(lookups); @@ -182,7 +182,13 @@ Rename::RenameStats::RenameStats(statistics::Group *parent) moveEliminated.flags(statistics::total); constantFolded.flags(statistics::total); + renamedInsts.init(cpu->numThreads).flags(statistics::total); + fullRegistersEvents.init(cpu->numThreads).flags(statistics::total); + stallEvents.init(StallEventCount).flags(statistics::total); + smtStallEvents + .init(StallEventCount,0,cpu->numThreads-1,1) + .flags(statistics::total); std::map < StallEvent, const char* > stall_event_str = { { ROBWalk, "ROBWalk"}, { IEWStall, "IEWStall"}, @@ -196,6 +202,7 @@ Rename::RenameStats::RenameStats(statistics::Group *parent) for (int i = 0; i < StallEventCount; i++) { stallEvents.subname(i, stall_event_str[static_cast(i)]); + smtStallEvents.subname(i, stall_event_str[static_cast(i)]); } } @@ -361,10 +368,23 @@ Rename::tick() 
block_reason = checkRenameStallFromIEW(i); if (block_reason == StallReason::NoStall) { block_reason = StallReason::RegFull; - ++stats.fullRegistersEvents; + ++stats.fullRegistersEvents[i]; stats.stallEvents[RegFull]++; } } + + if (block_reason == StallReason::ROBFull) { + stats.smtStallEvents[ROBFull].sample(i); + } else if (block_reason == StallReason::RegFull) { + stats.smtStallEvents[RegFull].sample(i); + } else if (block_reason == StallReason::SerializeStall) { + stats.smtStallEvents[SerializeInst].sample(i); + } else if ( block_reason == StallReason::MemDQBandwidth || + block_reason == StallReason::IntDQBandwidth || + block_reason == StallReason::FVDQBandwidth) { + stats.smtStallEvents[BWFull].sample(i); + } + DPRINTF(Rename, "[tid:%i] blockRename: %i, canRename: %i, block: %i, active: %i\n", i, stallSig->blockRename[i], can_rename, block, active); @@ -402,6 +422,7 @@ Rename::tick() renameInsts(tid); if (stallSig->blockRename[tid]) { setAllStalls(stallSig->renameBlockReason[tid]); + stats.smtStallEvents[stallSig->renameBlockReason[tid]].sample(tid); } else if (toIEW->size > 0 && renameStalls[0] == StallReason::NoStall) { for (int i = 0; i < renameStalls.size(); i++) { if (i < toIEW->size) { @@ -584,8 +605,9 @@ Rename::renameInsts(ThreadID tid) breakRename = checkRenameStallFromIEW(tid); if (breakRename == StallReason::NoStall) { breakRename = StallReason::RegFull; - ++stats.fullRegistersEvents; + ++stats.fullRegistersEvents[tid]; stats.stallEvents[RegFull]++; + // stats.smtStallEvents[RegFull].sample(tid); } } blockReason = breakRename; @@ -599,7 +621,20 @@ Rename::renameInsts(ThreadID tid) } else if (breakRename != StallReason::NoStall) { setAllStalls(breakRename); } - stats.renamedInsts += renamed_insts; + + stats.renamedInsts[tid] += renamed_insts; + + if (breakRename == StallReason::ROBFull) { + stats.smtStallEvents[ROBFull].sample(tid); + } else if (breakRename == StallReason::RegFull) { + stats.smtStallEvents[RegFull].sample(tid); + } else if 
(breakRename == StallReason::SerializeStall) { + stats.smtStallEvents[SerializeInst].sample(tid); + } else if ( breakRename == StallReason::MemDQBandwidth || + breakRename == StallReason::IntDQBandwidth || + breakRename == StallReason::FVDQBandwidth) { + stats.smtStallEvents[BWFull].sample(tid); + } // If we wrote to the time buffer, record this. if (toIEWIndex) { diff --git a/src/cpu/o3/rename.hh b/src/cpu/o3/rename.hh index 861b0f82c2..a8d555a019 100644 --- a/src/cpu/o3/rename.hh +++ b/src/cpu/o3/rename.hh @@ -382,7 +382,8 @@ class Rename struct RenameStats : public statistics::Group { - RenameStats(statistics::Group *parent); + // RenameStats(statistics::Group *parent); + RenameStats(CPU *cpu, Rename *rename); /** Stat for total number of cycles spent squashing. */ statistics::Scalar squashCycles; @@ -397,8 +398,10 @@ class Rename statistics::Scalar runCycles; /** Stat for total number of cycles spent unblocking. */ statistics::Scalar unblockCycles; - /** Stat for total number of renamed instructions. */ - statistics::Scalar renamedInsts; + // /** Stat for total number of renamed instructions. */ + // statistics::Scalar renamedInsts; + /** Stat for total number of renamed instructions per thread. */ + statistics::Vector renamedInsts; /** Stat for total number of squashed instructions that rename * discards. */ statistics::Scalar squashedInsts; @@ -416,7 +419,7 @@ class Rename statistics::Scalar SQFullEvents; /** Stat for total number of times that rename runs out of free * registers to use to rename. */ - statistics::Scalar fullRegistersEvents; + statistics::Vector fullRegistersEvents; /** Stat for total number of renamed destination registers. */ statistics::Scalar renamedOperands; /** Stat for total number of source register rename lookups. 
*/ @@ -441,6 +444,9 @@ class Rename statistics::Scalar constantFolded; statistics::Vector stallEvents; + + statistics::VectorDistribution smtStallEvents; + } stats; std::vector renameStalls; From 97922239963c2be889709a1f7333f1aacabfd49f Mon Sep 17 00:00:00 2001 From: tastynoob <934348725@qq.com> Date: Wed, 15 Apr 2026 10:59:49 +0800 Subject: [PATCH 20/38] cpu-o3: enlarge smt l3 to 32MB Change-Id: I653f16b0adefdfcc978f54791e21adbf74ecd84e --- configs/example/smt_idealkmhv3.py | 1 + 1 file changed, 1 insertion(+) diff --git a/configs/example/smt_idealkmhv3.py b/configs/example/smt_idealkmhv3.py index a83681506f..dbbe66f814 100644 --- a/configs/example/smt_idealkmhv3.py +++ b/configs/example/smt_idealkmhv3.py @@ -36,6 +36,7 @@ def setSharedLSQParams(args, system): args.smt = True args.bp_type = 'DecoupledBPUWithBTB' args.l2_size = '2MB' + args.l3_size = '32MB' Simulation.setMemClass(args) From d7beabef943ef74ca02289d6d7ae213e44b9884f Mon Sep 17 00:00:00 2001 From: tastynoob <934348725@qq.com> Date: Mon, 27 Apr 2026 14:34:42 +0800 Subject: [PATCH 21/38] cpu: Isolate ABTB pipeline for SMT Change-Id: I8060ec785799940c4190fdba32b349226f44795d --- src/cpu/pred/BranchPredictor.py | 2 +- src/cpu/pred/btb/abtb.cc | 98 ++++++++++++++++++++++-------- src/cpu/pred/btb/abtb.hh | 48 +++++++++++---- src/cpu/pred/btb/test/abtb.test.cc | 53 +++++++++++++++- 4 files changed, 158 insertions(+), 43 deletions(-) diff --git a/src/cpu/pred/BranchPredictor.py b/src/cpu/pred/BranchPredictor.py index 4b1b48c097..3359188b81 100644 --- a/src/cpu/pred/BranchPredictor.py +++ b/src/cpu/pred/BranchPredictor.py @@ -1000,7 +1000,7 @@ class AheadBTB(TimedBaseBTBPredictor): numEntries = Param.Unsigned(1024, "Number of entries in the BTB") tagBits = Param.Unsigned(38, "Number of bits in the tag") instShiftAmt = Param.Unsigned(1, "Amount to shift PC to get inst bits") - numThreads = Param.Unsigned(1, "Number of threads") + numThreads = Param.Unsigned(Parent.numThreads, "Number of threads") numWays = 
Param.Unsigned(8, "Number of ways per set") aheadPipelinedStages = Param.Unsigned(1, "Number of stages ahead pipelined") entryHalfAligned = Param.Bool(False, "Whether the entries are half-aligned") diff --git a/src/cpu/pred/btb/abtb.cc b/src/cpu/pred/btb/abtb.cc index 8013900e83..e0bf942724 100644 --- a/src/cpu/pred/btb/abtb.cc +++ b/src/cpu/pred/btb/abtb.cc @@ -66,12 +66,16 @@ namespace test { */ #ifdef UNIT_TEST // Test constructor for unit testing mode - fixed ahead-pipelined configuration -AheadBTB::AheadBTB(unsigned numEntries, unsigned tagBits, unsigned numWays, unsigned numDelay) +AheadBTB::AheadBTB(unsigned numEntries, unsigned tagBits, unsigned numWays, + unsigned numDelay, unsigned numThreads) : TimedBaseBTBPredictor(), numEntries(numEntries), numWays(numWays), + numThreads(numThreads), + threadStates(numThreads), tagBits(tagBits) { + usingS3Pred = false; setNumDelay(numDelay); this->aheadPipelinedStages = 1; // fixed ahead-pipelined stages = 1 #else @@ -80,6 +84,8 @@ AheadBTB::AheadBTB(const Params &p) : TimedBaseBTBPredictor(p), numEntries(p.numEntries), numWays(p.numWays), + numThreads(p.numThreads), + threadStates(p.numThreads), tagBits(p.tagBits), usingS3Pred(p.usingS3Pred), btbStats(this) @@ -91,6 +97,7 @@ AheadBTB::AheadBTB(const Params &p) // AheadBTB always uses single instruction alignment: | tag | idx | instShiftAmt idxShiftAmt = 1; + assert(numThreads > 0); assert(numEntries % numWays == 0); numSets = numEntries / numWays; // AheadBTB always uses ahead-pipelined stages = 1 @@ -130,6 +137,27 @@ AheadBTB::AheadBTB(const Params &p) #endif } +ThreadID +AheadBTB::predictorTid(const std::vector &stagePreds) const +{ + assert(!stagePreds.empty()); + return stagePreds.front().tid; +} + +AheadBTB::ThreadState & +AheadBTB::threadState(ThreadID tid) +{ + assert(tid < threadStates.size()); + return threadStates[tid]; +} + +const AheadBTB::ThreadState & +AheadBTB::threadState(ThreadID tid) const +{ + assert(tid < threadStates.size()); + return 
threadStates[tid]; +} + #ifndef UNIT_TEST void AheadBTB::tickStart() @@ -297,14 +325,17 @@ AheadBTB::fillStagePredictions(const std::vector& entries, */ void AheadBTB::updatePredictionMeta(const std::vector& entries, - std::vector& stagePreds) + std::vector& stagePreds, + ThreadID tid) { + auto &state = threadState(tid); + // Save current BTB entries for (auto e: entries) { - meta->hit_entries.push_back(BTBEntry(e)); + state.meta->hit_entries.push_back(BTBEntry(e)); } - lastPredEntries = meta->hit_entries; + state.lastPredEntries = state.meta->hit_entries; } void @@ -312,10 +343,12 @@ AheadBTB::putPCHistory(Addr startAddr, const boost::dynamic_bitset<> &history, std::vector &stagePreds) { - meta = std::make_shared(); + const ThreadID tid = predictorTid(stagePreds); + auto &state = threadState(tid); + state.meta = std::make_shared(); const uint8_t asidHash = stagePreds.empty() ? 0 : stagePreds.front().asidHash; // Lookup all matching entries in BTB - auto find_entries = lookup(startAddr, asidHash); + auto find_entries = lookup(startAddr, tid, asidHash); // Process BTB entries auto processed_entries = processEntries(find_entries, startAddr); @@ -324,19 +357,22 @@ AheadBTB::putPCHistory(Addr startAddr, fillStagePredictions(processed_entries, stagePreds); // Update metadata for later stages - updatePredictionMeta(processed_entries, stagePreds); + updatePredictionMeta(processed_entries, stagePreds, tid); } std::shared_ptr AheadBTB::getPredictionMeta(ThreadID tid) { - (void)tid; + if (tid >= threadStates.size()) { + return nullptr; + } + auto &state = threadStates[tid]; // Lazy-initialize meta so callers never observe a null pointer // This avoids early-cycle crashes when prediction hasn't populated meta yet - if (!meta) { - meta = std::make_shared(); + if (!state.meta) { + state.meta = std::make_shared(); } - return meta; + return state.meta; } void @@ -345,9 +381,10 @@ AheadBTB::specUpdateHist(const boost::dynamic_bitset<> &history, FullBTBPredicti void 
AheadBTB::recoverHist(const boost::dynamic_bitset<> &history, const FetchTarget &entry, int shamt, bool cond_taken) { + auto &state = threadState(entry.tid); // clear ahead pipeline first - while (!aheadReadBtbEntries.empty()) { - aheadReadBtbEntries.pop(); + while (!state.aheadReadBtbEntries.empty()) { + state.aheadReadBtbEntries.pop(); } } @@ -358,19 +395,22 @@ AheadBTB::recoverHist(const boost::dynamic_bitset<> &history, const FetchTarget * @return Vector of matching BTB entries */ std::vector -AheadBTB::lookupSingleBlock(Addr block_pc, uint8_t asidHash) +AheadBTB::lookupSingleBlock(Addr block_pc, ThreadID tid, uint8_t asidHash) { std::vector res; if (block_pc & 0x1) { return res; // ignore false hit when lowest bit is 1 } + auto &state = threadState(tid); Addr btb_idx = getIndex(block_pc, asidHash); auto btb_set = btb[btb_idx]; assert(btb_idx < numSets); // AheadBTB always uses ahead-pipelined implementation: // memory access with previous block PC, tag compare with current PC - DPRINTF(AheadPipeline, "AheadBTB: pushing set for ahead-pipelined stages, idx %ld\n", btb_idx); - aheadReadBtbEntries.push(std::make_tuple(block_pc, btb_idx, btb_set)); + DPRINTF(AheadPipeline, + "AheadBTB: [tid:%u] pushing set for ahead-pipelined stages, idx %ld\n", + tid, btb_idx); + state.aheadReadBtbEntries.push(std::make_tuple(block_pc, btb_idx, btb_set)); Addr tag_curStartpc = getTag(block_pc, asidHash);// abtb uses current FB pc to get tag Addr pc = 0; @@ -378,21 +418,24 @@ AheadBTB::lookupSingleBlock(Addr block_pc, uint8_t asidHash) BTBSet set; // AheadBTB always uses ahead-pipelined logic (aheadPipelinedStages > 0) // only if the ahead-pipeline is filled can we use the entry - if (aheadReadBtbEntries.size() >= aheadPipelinedStages+1) { + if (state.aheadReadBtbEntries.size() >= aheadPipelinedStages+1) { // +1 because we pushed a new set in this cycle before // in case there are push without corresponding pop - assert(aheadReadBtbEntries.size() == aheadPipelinedStages+1); - 
std::tie(pc, idx_prvStartpc, set) = aheadReadBtbEntries.front(); - DPRINTF(AheadPipeline, "AheadBTB: ahead-pipeline filled, using set %ld from pc %#lx\n", - idx_prvStartpc, pc); + assert(state.aheadReadBtbEntries.size() == aheadPipelinedStages+1); + std::tie(pc, idx_prvStartpc, set) = state.aheadReadBtbEntries.front(); + DPRINTF(AheadPipeline, + "AheadBTB: [tid:%u] ahead-pipeline filled, using set %ld from pc %#lx\n", + tid, idx_prvStartpc, pc); DPRINTF(AheadPipeline, "AheadBTB: dumping btb set\n"); for (auto &entry : set) { printTickedBTBEntry(entry); } - aheadReadBtbEntries.pop(); + state.aheadReadBtbEntries.pop(); } else { - DPRINTF(AheadPipeline, "AheadBTB: ahead-pipeline not filled, only have %ld sets read," - " skipping tag compare, assigning miss\n", aheadReadBtbEntries.size()); + DPRINTF(AheadPipeline, + "AheadBTB: [tid:%u] ahead-pipeline not filled, only have %ld sets read," + " skipping tag compare, assigning miss\n", + tid, state.aheadReadBtbEntries.size()); } DPRINTF(ABTB, "BTB: Doing tag comparison for index 0x%lx tag %#lx\n", idx_prvStartpc, tag_curStartpc); @@ -407,7 +450,7 @@ AheadBTB::lookupSingleBlock(Addr block_pc, uint8_t asidHash) } std::vector -AheadBTB::lookup(Addr block_pc, uint8_t asidHash) +AheadBTB::lookup(Addr block_pc, ThreadID tid, uint8_t asidHash) { std::vector res; if (block_pc & 0x1) { @@ -415,7 +458,7 @@ AheadBTB::lookup(Addr block_pc, uint8_t asidHash) } // AheadBTB always uses single block lookup - res = lookupSingleBlock(block_pc, asidHash); + res = lookupSingleBlock(block_pc, tid, asidHash); return res; } @@ -603,7 +646,8 @@ AheadBTB::updateUsingS3Pred(FullBTBPrediction &s3Pred, const Addr previousPC) (s3Pred.bbStart + predictWidth) & ~mask(floorLog2(predictWidth)-1); // AheadBTB use S3 prediction for update - auto old_entries= processOldEntries(lastPredEntries, end_inst_pc); + auto &state = threadState(s3Pred.tid); + auto old_entries= processOldEntries(state.lastPredEntries, end_inst_pc); auto entries_to_update = 
collectEntriesToUpdateFromS3Pred(old_entries,s3Pred); diff --git a/src/cpu/pred/btb/abtb.hh b/src/cpu/pred/btb/abtb.hh index e5e29f7ffd..233489dab1 100644 --- a/src/cpu/pred/btb/abtb.hh +++ b/src/cpu/pred/btb/abtb.hh @@ -40,7 +40,10 @@ #ifndef __CPU_PRED_BTB_BTB_HH__ #define __CPU_PRED_BTB_BTB_HH__ +#include #include +#include +#include #include "base/types.hh" #include "cpu/pred/btb/common.hh" @@ -83,7 +86,8 @@ class AheadBTB : public TimedBaseBTBPredictor #ifdef UNIT_TEST // Test constructor - fixed ahead-pipelined configuration - AheadBTB(unsigned numEntries, unsigned tagBits, unsigned numWays, unsigned numDelay); + AheadBTB(unsigned numEntries, unsigned tagBits, unsigned numWays, + unsigned numDelay, unsigned numThreads = 1); #else // Production constructor typedef AheadBTBParams Params; @@ -260,15 +264,30 @@ class AheadBTB : public TimedBaseBTBPredictor } }BTBMeta; - std::shared_ptr meta; // metadata for BTB, set in putPCHistory, used in update - /** - * lastPredEntries is using in updateusingS3pred() to store the hit entries during prediction - * it is using to hold the hit entries for later use in S3 update - * because in gem5 generat pred and updateusingS3pred finish in the same cycle - * so we can use this instead of using BTBMeta + * Per-thread ABTB prediction-time state. The BTB storage itself remains + * shared, but the ahead-read pipeline must not be shared across SMT + * threads because index read and tag compare occur in different cycles. */ - std::vector lastPredEntries; // cached hit entries for the latest prediction + struct ThreadState + { + // metadata for BTB, set in putPCHistory, used in update + std::shared_ptr meta; + + /** + * lastPredEntries is used in updateUsingS3Pred() to store hit entries + * during prediction. It holds the hit entries for later S3 update. + * Because gem5 generate pred and updateUsingS3Pred finish in the same + * cycle, we can use this instead of BTBMeta. 
+ */ + std::vector lastPredEntries; + + std::queue> aheadReadBtbEntries; + }; + + ThreadID predictorTid(const std::vector &stagePreds) const; + ThreadState &threadState(ThreadID tid); + const ThreadState &threadState(ThreadID tid) const; /** Process BTB entries for prediction * @param entries Vector of BTB entries to process @@ -289,7 +308,8 @@ class AheadBTB : public TimedBaseBTBPredictor * @param entries Processed BTB entries */ void updatePredictionMeta(const std::vector& entries, - std::vector& stagePreds); + std::vector& stagePreds, + ThreadID tid); /** Process prediction metadata and old entries * @param meta BTB metadata from prediction @@ -367,13 +387,15 @@ class AheadBTB : public TimedBaseBTBPredictor * @param inst_PC The address of the block to look up. * @return Returns all hit BTB entries. */ - std::vector lookup(Addr block_pc, uint8_t asidHash); + std::vector lookup(Addr block_pc, ThreadID tid, + uint8_t asidHash); /** Helper function to lookup entries in a single block * @param block_pc The aligned PC to lookup * @return Vector of matching BTB entries */ - std::vector lookupSingleBlock(Addr block_pc, uint8_t asidHash); + std::vector lookupSingleBlock(Addr block_pc, ThreadID tid, + uint8_t asidHash); /** The BTB structure: * - Organized as numSets sets @@ -389,12 +411,12 @@ class AheadBTB : public TimedBaseBTBPredictor */ std::vector mruList; - std::queue> aheadReadBtbEntries; - /** BTB configuration parameters */ unsigned numEntries; // Total number of entries unsigned numWays; // Number of ways per set unsigned numSets; // Number of sets (numEntries/numWays) + unsigned numThreads; // Number of SMT threads with isolated pipeline state + std::vector threadStates; #ifdef UNIT_TEST uint64_t blockSize{32}; // max size in byte of a Fetch Block diff --git a/src/cpu/pred/btb/test/abtb.test.cc b/src/cpu/pred/btb/test/abtb.test.cc index 36c7cc1bd4..654d70cdfb 100644 --- a/src/cpu/pred/btb/test/abtb.test.cc +++ b/src/cpu/pred/btb/test/abtb.test.cc @@ -19,6 
+19,8 @@ namespace test FetchTarget createStream(Addr startPC, FullBTBPrediction &pred, AheadBTB *abtb) { FetchTarget stream; + stream.tid = pred.tid; + stream.asidHash = pred.asidHash; stream.startPC = startPC; Addr fallThroughAddr = pred.getFallThrough(abtb->predictWidth); stream.isHit = pred.btbEntries.size() > 0; // TODO: fix isHit and falseHit @@ -26,7 +28,7 @@ FetchTarget createStream(Addr startPC, FullBTBPrediction &pred, AheadBTB *abtb) stream.predBTBEntries = pred.btbEntries; stream.predTaken = pred.isTaken(); stream.predEndPC = fallThroughAddr; - stream.predMetas[0] = abtb->getPredictionMeta(); + stream.predMetas[0] = abtb->getPredictionMeta(stream.tid); return stream; } @@ -39,13 +41,27 @@ void resolveStream(FetchTarget &stream, bool taken, Addr brPc, Addr target, bool stream.exeTaken = taken; } -FullBTBPrediction makePrediction(Addr startPC, AheadBTB *abtb) { +FullBTBPrediction makePrediction(Addr startPC, AheadBTB *abtb, + ThreadID tid = 0, uint8_t asidHash = 0) { std::vector stagePreds(2); // 2 stages + for (int i = 0; i < stagePreds.size(); i++) { + stagePreds[i].tid = tid; + stagePreds[i].asidHash = asidHash; + stagePreds[i].bbStart = startPC; + stagePreds[i].predSource = i; + } boost::dynamic_bitset<> history(8, 0); // history does not matter for BTB abtb->putPCHistory(startPC, history, stagePreds); return stagePreds[1]; } +void clearAheadPipeline(AheadBTB *abtb, ThreadID tid) { + FetchTarget stream; + stream.tid = tid; + boost::dynamic_bitset<> history(8, 0); + abtb->recoverHist(history, stream, 0, false); +} + void updateBTB(FetchTarget &stream, AheadBTB *abtb, MBTB *mbtb) { mbtb->getAndSetNewBTBEntry(stream); // usually called by mbtb, here for testing purpose abtb->update(stream); @@ -151,6 +167,39 @@ TEST_F(ABTBTest, AliasAvoidance){ EXPECT_EQ(pred_C_test.btbEntries.size(), 0); } +TEST_F(ABTBTest, AheadPipelineIsThreadIsolated){ + AheadBTB twoThreadAbtb(1024, 20, 1, 0, 2); + + Addr t0PrevPC = 0x1000; + Addr t0StartPC = 0x2000; + Addr t0BrPC = 
0x2004; + Addr t0Target = 0x3000; + Addr t1PrevPC = 0x1040; + + // Train a thread-0 ABTB entry indexed by t0PrevPC and tagged by t0StartPC. + auto pred_t0 = makePrediction(t0StartPC, &twoThreadAbtb, 0); + auto stream_t0 = createStream(t0StartPC, pred_t0, &twoThreadAbtb); + stream_t0.previousPCs.push(t0PrevPC); + resolveStream(stream_t0, true, t0BrPC, t0Target, true); + updateBTB(stream_t0, &twoThreadAbtb, mbtb); + + clearAheadPipeline(&twoThreadAbtb, 0); + clearAheadPipeline(&twoThreadAbtb, 1); + + // Interleave another thread between thread 0's previous/current blocks. + // With a shared ahead FIFO, thread 0's current lookup would consume the + // set read by thread 1 and miss the trained entry. + makePrediction(t0PrevPC, &twoThreadAbtb, 0); + makePrediction(t1PrevPC, &twoThreadAbtb, 1); + auto pred_t0_test = makePrediction(t0StartPC, &twoThreadAbtb, 0); + + EXPECT_EQ(pred_t0_test.btbEntries.size(), 1); + if (!pred_t0_test.btbEntries.empty()) { + EXPECT_EQ(pred_t0_test.btbEntries[0].pc, t0BrPC); + EXPECT_EQ(pred_t0_test.btbEntries[0].target, t0Target); + } +} + } // namespace test } // namespace btb_pred } // namespace branch_prediction From 9845df977d53861866b8611b21b34c38534a120e Mon Sep 17 00:00:00 2001 From: tastynoob <934348725@qq.com> Date: Mon, 27 Apr 2026 16:32:44 +0800 Subject: [PATCH 22/38] cpu-o3: isolate MDP replay tracking per thread Change-Id: Ia8509f4290de8b2c7e337892506c69a8d399d493 --- src/cpu/o3/inst_queue.cc | 64 +++++++++++++++++++++++++--------------- src/cpu/o3/inst_queue.hh | 5 +++- src/cpu/o3/lsq_unit.cc | 28 ++++++++++++++++++ src/cpu/o3/lsq_unit.hh | 18 +++++++++++ 4 files changed, 91 insertions(+), 24 deletions(-) diff --git a/src/cpu/o3/inst_queue.cc b/src/cpu/o3/inst_queue.cc index db8ec407f4..dda79556dc 100644 --- a/src/cpu/o3/inst_queue.cc +++ b/src/cpu/o3/inst_queue.cc @@ -119,6 +119,17 @@ InstructionQueue::MdpAddrReplayLdInst::MdpAddrReplayLdInst( { } +bool +InstructionQueue::hasMdpAddrReplayInsts() const +{ + for (const auto 
&replay_ld_insts : mdpAddrReplayLdInsts) { + if (!replay_ld_insts.empty()) { + return true; + } + } + return false; +} + InstructionQueue::InstructionQueue(CPU *cpu_ptr, IEW *iew_ptr, const BaseO3CPUParams ¶ms) : cpu(cpu_ptr), @@ -384,7 +395,9 @@ InstructionQueue::resetState() deferredMemInsts.clear(); cacheMissLdInsts.clear(); stlfFailLdInsts.clear(); - mdpAddrReplayLdInsts.clear(); + for (auto &replay_ld_insts : mdpAddrReplayLdInsts) { + replay_ld_insts.clear(); + } blockedMemInsts.clear(); retryMemInsts.clear(); wbOutstanding = 0; @@ -683,7 +696,7 @@ InstructionQueue::scheduleReadyInsts() // removed from the code below. if (total_issued || !retryMemInsts.empty() || !deferredMemInsts.empty() || !cacheMissLdInsts.empty() || !stlfFailLdInsts.empty() || - !mdpAddrReplayLdInsts.empty()) { + hasMdpAddrReplayInsts()) { cpu->activityThisCycle(); } else { DPRINTF(IQ, "Not able to schedule any instructions.\n"); @@ -708,14 +721,15 @@ InstructionQueue::resolveMdpAddrReplayStoreAddr(const DynInstPtr &store_inst) const ThreadID tid = store_inst->threadNumber; const InstSeqNum store_sn = store_inst->seqNum; + auto &replay_ld_insts = mdpAddrReplayLdInsts[tid]; - for (auto it = mdpAddrReplayLdInsts.begin(); it != mdpAddrReplayLdInsts.end();) { + for (auto it = replay_ld_insts.begin(); it != replay_ld_insts.end();) { if (!it->inst || it->inst->isSquashed()) { - it = mdpAddrReplayLdInsts.erase(it); + it = replay_ld_insts.erase(it); continue; } - if (it->inst->threadNumber != tid || it->strict) { + if (it->strict) { ++it; continue; } @@ -725,7 +739,7 @@ InstructionQueue::resolveMdpAddrReplayStoreAddr(const DynInstPtr &store_inst) DPRINTF(IQ, "Load[sn:%llu] MDP addr replay ready (store[sn:%llu] addr ready)\n", it->inst->seqNum, store_sn); it->inst->issueQue->retryMem(it->inst); - it = mdpAddrReplayLdInsts.erase(it); + it = replay_ld_insts.erase(it); continue; } ++it; @@ -982,8 +996,10 @@ InstructionQueue::mdpAddrReplayRegister( return; } + auto &replay_ld_insts = 
mdpAddrReplayLdInsts[load_inst->threadNumber]; + // Avoid duplicate registration for the same dynamic inst. - for (const auto &entry : mdpAddrReplayLdInsts) { + for (const auto &entry : replay_ld_insts) { if (entry.inst && entry.inst->seqNum == load_inst->seqNum) { return; } @@ -991,7 +1007,7 @@ InstructionQueue::mdpAddrReplayRegister( DPRINTF(IQ, "Load[sn:%llu] MDP addr replay register, wait %lu stores\n", load_inst->seqNum, store_seq_nums.size()); - mdpAddrReplayLdInsts.emplace_back(load_inst, store_seq_nums); + replay_ld_insts.emplace_back(load_inst, store_seq_nums); } void @@ -1002,7 +1018,9 @@ InstructionQueue::mdpAddrReplayRegisterStrict(const DynInstPtr &load_inst, return; } - for (const auto &entry : mdpAddrReplayLdInsts) { + auto &replay_ld_insts = mdpAddrReplayLdInsts[load_inst->threadNumber]; + + for (const auto &entry : replay_ld_insts) { if (entry.inst && entry.inst->seqNum == load_inst->seqNum) { return; } @@ -1010,7 +1028,7 @@ InstructionQueue::mdpAddrReplayRegisterStrict(const DynInstPtr &load_inst, DPRINTF(IQ, "Load[sn:%llu] MDP strict addr replay register, wait storeCompletedIdx >= %lu\n", load_inst->seqNum, required_store_completed_idx); - mdpAddrReplayLdInsts.emplace_back(load_inst, required_store_completed_idx); + replay_ld_insts.emplace_back(load_inst, required_store_completed_idx); } void @@ -1021,8 +1039,8 @@ InstructionQueue::mdpAddrReplayPipeDone(const DynInstPtr &load_inst) } const ThreadID tid = load_inst->threadNumber; - for (auto it = mdpAddrReplayLdInsts.begin(); it != mdpAddrReplayLdInsts.end(); - ++it) { + auto &replay_ld_insts = mdpAddrReplayLdInsts[tid]; + for (auto it = replay_ld_insts.begin(); it != replay_ld_insts.end(); ++it) { if (!it->inst || it->inst->seqNum != load_inst->seqNum) { continue; } @@ -1033,13 +1051,13 @@ InstructionQueue::mdpAddrReplayPipeDone(const DynInstPtr &load_inst) DPRINTF(IQ, "Load[sn:%llu] MDP strict addr replay ready (pipeDone)\n", load_inst->seqNum); load_inst->issueQue->retryMem(load_inst); - 
mdpAddrReplayLdInsts.erase(it); + replay_ld_insts.erase(it); } } else if (it->storeSeqNums.empty()) { DPRINTF(IQ, "Load[sn:%llu] MDP addr replay ready (pipeDone)\n", load_inst->seqNum); load_inst->issueQue->retryMem(load_inst); - mdpAddrReplayLdInsts.erase(it); + replay_ld_insts.erase(it); } return; } @@ -1059,19 +1077,20 @@ InstructionQueue::mdpAddrReplayUpdateStoreCompletedIdx( } mdpStoreCompletedIdx[tid] = store_completed_idx; + auto &replay_ld_insts = mdpAddrReplayLdInsts[tid]; - for (auto it = mdpAddrReplayLdInsts.begin(); it != mdpAddrReplayLdInsts.end();) { + for (auto it = replay_ld_insts.begin(); it != replay_ld_insts.end();) { if (!it->inst || it->inst->isSquashed()) { - it = mdpAddrReplayLdInsts.erase(it); + it = replay_ld_insts.erase(it); continue; } - if (it->inst->threadNumber == tid && it->strict && it->pipeDone && + if (it->strict && it->pipeDone && store_completed_idx >= it->requiredStoreCompletedIdx) { DPRINTF(IQ, "Load[sn:%llu] MDP strict addr replay ready (storeCompletedIdx=%lu)\n", it->inst->seqNum, store_completed_idx); it->inst->issueQue->retryMem(it->inst); - it = mdpAddrReplayLdInsts.erase(it); + it = replay_ld_insts.erase(it); continue; } ++it; @@ -1127,11 +1146,10 @@ InstructionQueue::doSquash(ThreadID tid) squashInfo.squashSn = squashedSeqNum[tid]; scheduler->doSquash(squashInfo); - for (auto it = mdpAddrReplayLdInsts.begin(); it != mdpAddrReplayLdInsts.end();) { - if (!it->inst || - (it->inst->threadNumber == tid && - it->inst->seqNum > squashedSeqNum[tid])) { - it = mdpAddrReplayLdInsts.erase(it); + auto &replay_ld_insts = mdpAddrReplayLdInsts[tid]; + for (auto it = replay_ld_insts.begin(); it != replay_ld_insts.end();) { + if (!it->inst || it->inst->seqNum > squashedSeqNum[tid]) { + it = replay_ld_insts.erase(it); } else { ++it; } diff --git a/src/cpu/o3/inst_queue.hh b/src/cpu/o3/inst_queue.hh index f163ebb28e..1f3790e286 100644 --- a/src/cpu/o3/inst_queue.hh +++ b/src/cpu/o3/inst_queue.hh @@ -42,6 +42,7 @@ #ifndef 
__CPU_O3_INST_QUEUE_HH__ #define __CPU_O3_INST_QUEUE_HH__ +#include #include #include #include @@ -373,10 +374,12 @@ class InstructionQueue MdpAddrReplayLdInst(const DynInstPtr &inst, size_t required_store_completed_idx); }; - std::list mdpAddrReplayLdInsts; + std::array, MaxThreads> mdpAddrReplayLdInsts; size_t mdpStoreCompletedIdx[MaxThreads] = {}; + bool hasMdpAddrReplayInsts() const; + /** List of instructions that have been cache blocked. */ std::list blockedMemInsts; diff --git a/src/cpu/o3/lsq_unit.cc b/src/cpu/o3/lsq_unit.cc index 5112ee8c40..127212ecb4 100644 --- a/src/cpu/o3/lsq_unit.cc +++ b/src/cpu/o3/lsq_unit.cc @@ -593,6 +593,18 @@ LSQUnit::LSQUnitStats::LSQUnitStats(statistics::Group *parent) "Number of load-load violation events"), ADD_STAT(stLdViolation, statistics::units::Count::get(), "Number of store-load violation events"), + ADD_STAT(rawMemOrderViolation, statistics::units::Count::get(), + "Number of RAW memory ordering violations"), + ADD_STAT(rawViolationMdpNoPred, statistics::units::Count::get(), + "Number of RAW violations where replay-based MDP had no producer prediction"), + ADD_STAT(rawViolationMdpHit, statistics::units::Count::get(), + "Number of RAW violations where replay-based MDP predicted the violating store"), + ADD_STAT(rawViolationMdpMiss, statistics::units::Count::get(), + "Number of RAW violations where replay-based MDP predicted other stores only"), + ADD_STAT(rawViolationMdpStrict, statistics::units::Count::get(), + "Number of RAW violations where replay-based MDP used strict wait"), + ADD_STAT(loadOrderViolation, statistics::units::Count::get(), + "Number of load-load or snoop ordering violations"), ADD_STAT(busForwardSuccess, statistics::units::Count::get(), "Number of successfully forwarding from bus"), ADD_STAT(cacheMissReplayEarly, statistics::units::Count::get(), @@ -1185,6 +1197,7 @@ LSQUnit::checkViolations(typename LoadQueue::iterator& loadIt, ++stats.memOrderViolation; ++stats.ldLdViolation; + 
++stats.loadOrderViolation; return std::make_shared( "Detected fault with inst [sn:%lli] and " @@ -1237,6 +1250,19 @@ LSQUnit::checkViolations(typename LoadQueue::iterator& loadIt, ++stats.stLdViolation; countedStLdViolationThisCycle = true; } + ++stats.rawMemOrderViolation; + if (ld_inst->mdpPredStrictWait) { + ++stats.rawViolationMdpStrict; + } else if (ld_inst->mdpProducingStores.empty()) { + ++stats.rawViolationMdpNoPred; + } else if (std::find(ld_inst->mdpProducingStores.begin(), + ld_inst->mdpProducingStores.end(), + inst->seqNum) != + ld_inst->mdpProducingStores.end()) { + ++stats.rawViolationMdpHit; + } else { + ++stats.rawViolationMdpMiss; + } return std::make_shared( "Detected fault with " @@ -1353,6 +1379,7 @@ LSQUnit::loadDoSendRequest(const DynInstPtr &inst) auto& store_inst = storePipeSx[1]->insts[i]; if (pipeLineNukeCheck(inst, store_inst)) { DPRINTF(LoadPipeline, "Load [sn:%llu] Nuke need replay\n", inst->seqNum); + ++stats.pipeRawNukeReplay; inst->setProducerStorePC(store_inst->pcState().instAddr()); inst->setNukeReplay(); return NoFault; @@ -1532,6 +1559,7 @@ LSQUnit::loadDoRecvData(const DynInstPtr &inst) return fault; case LdStReplayType::NukeReplay: DPRINTF(LoadPipeline, "Load [sn:%llu] Nuke need replay\n", inst->seqNum); + ++stats.pipeRawNukeReplay; return fault; default: panic("Unsupported load replay type selected in s2"); diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh index fd2ff7d172..7752b5003e 100644 --- a/src/cpu/o3/lsq_unit.hh +++ b/src/cpu/o3/lsq_unit.hh @@ -817,6 +817,24 @@ class LSQUnit /** Total number of store-load violation events. */ statistics::Scalar stLdViolation; + /** RAW memory ordering violations caused by a younger load. */ + statistics::Scalar rawMemOrderViolation; + + /** RAW violations where replay-based MDP had no producer prediction. */ + statistics::Scalar rawViolationMdpNoPred; + + /** RAW violations where replay-based MDP predicted the violating store. 
*/ + statistics::Scalar rawViolationMdpHit; + + /** RAW violations where replay-based MDP predicted other stores only. */ + statistics::Scalar rawViolationMdpMiss; + + /** RAW violations where replay-based MDP used strict wait. */ + statistics::Scalar rawViolationMdpStrict; + + /** Load-load/snoop ordering violations. */ + statistics::Scalar loadOrderViolation; + /** Tota number of successfully forwarding from bus. */ statistics::Scalar busForwardSuccess; From e16b2a27dffb8cd7a73e5d4eaeb5882723a4ef6d Mon Sep 17 00:00:00 2001 From: mhnGitHubz <66786667+mhnGitHubz@users.noreply.github.com> Date: Mon, 27 Apr 2026 16:34:28 +0800 Subject: [PATCH 23/38] cpu-o3: 1. Add scheduler starvation prevention mechanism; 2. Modify sbuffer resource allocation mechanism; 3. Vectorize waitForVsetvl (#844) Co-authored-by: mo haonan --- src/cpu/o3/fetch.cc | 39 +++++++++++++++++++++++++++------------ src/cpu/o3/fetch.hh | 2 +- src/cpu/o3/issue_queue.cc | 21 +++++++++++++++++++-- src/cpu/o3/lsq.cc | 12 +++++++++--- src/cpu/o3/lsq.hh | 1 + src/cpu/o3/lsq_unit.cc | 5 ++++- src/cpu/o3/lsq_unit.hh | 2 +- src/cpu/o3/smt_sched.hh | 18 ++++++++++++++---- 8 files changed, 76 insertions(+), 24 deletions(-) diff --git a/src/cpu/o3/fetch.cc b/src/cpu/o3/fetch.cc index ff31aa9bb9..2fdf1076f5 100644 --- a/src/cpu/o3/fetch.cc +++ b/src/cpu/o3/fetch.cc @@ -502,6 +502,7 @@ Fetch::resetStage() fetchQueue[tid].clear(); priorityList.push_back(tid); + waitForVsetvl[tid] = false; } wroteToTimeBuffer = false; @@ -1329,14 +1330,12 @@ Fetch::initializeTickState() // for each thread. 
bool updated_status = checkSignalsAndUpdate(tid); status_change = status_change || updated_status; + if (fromCommit->commitInfo[tid].emptyROB) { + waitForVsetvl[tid] = false; + } } DPRINTF(Fetch, "Running stage.\n"); - - if (fromCommit->commitInfo[0].emptyROB) { - waitForVsetvl = false; - } - return status_change; } @@ -1391,22 +1390,32 @@ Fetch::selectUnstalledThread() // if (numThreads == 1) { // return 0; // } + ThreadID selected = -1; + bool all_stalled = true; for (ThreadID tid = 0; tid < numThreads; ++tid) { - if (!stallSig->blockFetch[tid]) { + if (!stallSig->blockFetch[tid] &&fetchQueue[tid].size() > 0) { lsqCounter->setCounter(tid, fromIEW->iewInfo[tid].ldstqCount); iqCounter->setCounter(tid, fromIEW->iewInfo[tid].iqCount); robCounter->setCounter(tid, fromIEW->iewInfo[tid].robCount); + all_stalled = false; - } else { + }else { lsqCounter->setCounter(tid, UINT64_MAX); iqCounter->setCounter(tid, UINT64_MAX); robCounter->setCounter(tid, UINT64_MAX); } + + if(all_stalled) + { + selected = -1; + }else{ + selected = decodeScheduler->getThread(); + } DPRINTF(Fetch, "lsqCounter->setCounter: %d iqCounter->setCounter: %d robCounter->setCounter: %d\n",fromIEW->iewInfo[tid].ldstqCount,fromIEW->iewInfo[tid].iqCount,fromIEW->iewInfo[tid].robCount); } - ThreadID selected = decodeScheduler->getThread(); + return selected; } @@ -1450,6 +1459,12 @@ Fetch::sendInstructionsToDecode() } ThreadID tid =selectUnstalledThread(); + + if(tid == -1) + { + DPRINTF(Fetch, "All threads are stalled, no thread selected.\n"); + return; + } DPRINTF(Fetch, "select Unstalled [tid:%i]\n",tid); // fetch totally stalled @@ -1997,9 +2012,9 @@ Fetch::processSingleInstruction(ThreadID tid, PCStateBase &pc, // Special handling for RISC-V vector configuration instructions. 
if (staticInst->isVectorConfig()) { - waitForVsetvl = dec_ptr->stall(); - DPRINTF(Fetch, "[tid:%i] Vector config instruction, waitForVsetvl=%d\n", - tid, waitForVsetvl); + waitForVsetvl[tid] = dec_ptr->stall(); + DPRINTF(Fetch, "[tid:%i] Vector config instruction, waitForVsetvl[tid]=%d\n", + tid, waitForVsetvl[tid]); } instruction->setVersion(localSquashVer[tid]); @@ -2076,7 +2091,7 @@ Fetch::performInstructionFetch(ThreadID tid) // For decoupled frontend (including trace mode), check FTQ availability StallReason stall = StallReason::NoStall; while (numInst < fetchWidth && fetchQueue[tid].size() < fetchQueueSize && - !predictedBranch && !ftqEmpty(tid) && !waitForVsetvl) { + !predictedBranch && !ftqEmpty(tid) && !waitForVsetvl[tid]) { // Check memory needs and supply bytes to decoder if required stall = checkMemoryNeeds(tid, pc_state, curMacroop); diff --git a/src/cpu/o3/fetch.hh b/src/cpu/o3/fetch.hh index 18e6159022..c76bb8d77f 100644 --- a/src/cpu/o3/fetch.hh +++ b/src/cpu/o3/fetch.hh @@ -1141,7 +1141,7 @@ public: private: - bool waitForVsetvl = false; + bool waitForVsetvl [MaxThreads]; /** Value predictor */ valuepred::VPUnit *valuePred; diff --git a/src/cpu/o3/issue_queue.cc b/src/cpu/o3/issue_queue.cc index d83083a45f..bf5bb1ea47 100644 --- a/src/cpu/o3/issue_queue.cc +++ b/src/cpu/o3/issue_queue.cc @@ -646,6 +646,13 @@ IssueQue::selectInst() selectQ.clear(); for (int pi = 0; pi < outports; pi++) { auto readyQ = readyQs[pi]; + // iq->getInstsCounter()->getCounter(tid) + int iqcount = 0; + for (auto it = readyQ->begin(); it != readyQ->end(); ++it) { + DPRINTF(Schedule, "readyQ for port %d has [sn:%llu] %s [tid:%u]\n", pi, (*it)->seqNum, + (*it)->genDisassembly(), (*it)->threadNumber); + } + selector->begin(readyQ); for (auto it = selector->select(readyQ->begin(), pi); it != readyQ->end(); it = selector->select(it, pi)) { auto& inst = *it; @@ -659,7 +666,15 @@ IssueQue::selectInst() uint64_t busy_bit = (lat > 63 ? 
-1 : (1llu << lat)); if (!(portBusy[pi] & busy_bit)) { DPRINTF(Schedule, "[sn %ld] was selected\n", inst->seqNum); - + for (ThreadID tid = 0; tid < MaxThreads; tid++) { + if (inst->threadNumber == tid) { + independentIQICountScheduler->scheduleNum[tid]++; + } else { + independentIQICountScheduler->scheduleNum[tid] = 0; + } + } + DPRINTF(Schedule, "smtScheduler->scheduleNum[0]=%d, smtScheduler->scheduleNum[1]=%d\n", + independentIQICountScheduler->scheduleNum[0], independentIQICountScheduler->scheduleNum[1]); // get regfile write port for (int i = 0; i < inst->numDestRegs(); i++) { auto pdst = inst->renamedDestIdx(i); @@ -786,7 +801,7 @@ IssueQue::insert(const DynInstPtr& inst) cpu->perfCCT->updateInstPos(inst->seqNum, PerfRecord::AtIssueQue); - DPRINTF(Schedule, "[sn:%llu] %s insert into %s\n", inst->seqNum, enums::OpClassStrings[inst->opClass()], iqname); + DPRINTF(Schedule, "[tid:%u] [sn:%llu] %s insert into %s\n", inst->threadNumber, inst->seqNum, enums::OpClassStrings[inst->opClass()], iqname); selector->allocate(inst); inst->issueQue = this; instList.emplace_back(inst); @@ -906,6 +921,7 @@ IssueQue::incInIQInstsCounter(ThreadID tid) { if (instsCounter) { instsCounter->incCounter(tid); + DPRINTF(Schedule, "Thread %d: incInIQInstsCounter to %d\n", tid, instsCounter->getCounter(tid)); } if (iqstats) { iqstats->instsNum[tid]++; @@ -917,6 +933,7 @@ IssueQue::decInIQInstsCounter(ThreadID tid) { if (instsCounter) { instsCounter->decCounter(tid); + DPRINTF(Schedule, "Thread %d: decInIQInstsCounter to %d\n", tid, instsCounter->getCounter(tid)); } } diff --git a/src/cpu/o3/lsq.cc b/src/cpu/o3/lsq.cc index a341c1eaa0..448530ab69 100644 --- a/src/cpu/o3/lsq.cc +++ b/src/cpu/o3/lsq.cc @@ -940,6 +940,7 @@ LSQ::processWriteback() std::vector offload_quota(numThreads, 0); std::vector offload_demand(numThreads, 0); std::vector requester_tids; + std::vector offload_fail(numThreads, false); requester_tids.reserve(activeThreads->size()); for (ThreadID tid : *activeThreads) { @@ 
-992,9 +993,14 @@ LSQ::processWriteback() } } threads = activeThreads->begin(); - while (threads != end) { - ThreadID tid = *threads++; - thread[tid].offloadToStoreBuffer(offload_quota[tid]); + for (ThreadID tid = 0; tid < numThreads; ++tid) { + thread[(nextStoreBufferInsertTid + tid) % numThreads].offloadToStoreBuffer(offload_quota[(nextStoreBufferInsertTid + tid) % numThreads], offload_fail); + } + + for (ThreadID tid = 0; tid < numThreads; ++tid) { + if (offload_fail[tid]) { + nextStoreBufferInsertTid = tid; + } } // A fence/flush only waits for the requesting thread's sbuffer domain. diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh index 28cb6e0146..397f372c26 100644 --- a/src/cpu/o3/lsq.hh +++ b/src/cpu/o3/lsq.hh @@ -1294,6 +1294,7 @@ class LSQ uint64_t storeBufferWritebackInactive = 0; StoreBufferEntry *blockedSbufferEntry = nullptr; ThreadID nextStoreBufferOffloadTid = InvalidThreadID; + ThreadID nextStoreBufferInsertTid = 0; bool enableBankConflictCheck; bool sbufferBankWriteAccurately; diff --git a/src/cpu/o3/lsq_unit.cc b/src/cpu/o3/lsq_unit.cc index 127212ecb4..c0e9283c55 100644 --- a/src/cpu/o3/lsq_unit.cc +++ b/src/cpu/o3/lsq_unit.cc @@ -2338,7 +2338,7 @@ LSQUnit::countStoreBufferOffloadableEntries(uint32_t max_entries) const } void -LSQUnit::offloadToStoreBuffer(uint32_t max_entries) +LSQUnit::offloadToStoreBuffer(uint32_t max_entries, std::vector& offload_fail) { assert(!lsq->storeBufferBlocked()); if (isStoreBlocked) return; @@ -2415,6 +2415,7 @@ LSQUnit::offloadToStoreBuffer(uint32_t max_entries) if (success) { request->_numOutstandingPackets++; } else { + offload_fail[lsqID] = true; break; } } @@ -2435,6 +2436,7 @@ LSQUnit::offloadToStoreBuffer(uint32_t max_entries) vaddr, paddr, (uint8_t *)storeWBIt->data(), request->_size, request->mainReq()->getByteEnable(), inst->seqNum); if (!success) { + offload_fail[lsqID] = true; break; } ++accepted_entries; @@ -2501,6 +2503,7 @@ LSQUnit::insertStoreBuffer(Addr vaddr, Addr paddr, uint8_t* datas, // 
create new entry if (storeBuffer.full()) { stats.sbufferFull++; + // lsq->nextStoreBufferInsertTid = lsqID; DPRINTF(StoreBuffer, "Insert %#x failed due to sbuffer full\n", paddr); return false; } diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh index 7752b5003e..e199d089fc 100644 --- a/src/cpu/o3/lsq_unit.hh +++ b/src/cpu/o3/lsq_unit.hh @@ -359,7 +359,7 @@ class LSQUnit uint32_t countStoreBufferOffloadableEntries(uint32_t max_entries) const; /** Writes back stores. */ - void offloadToStoreBuffer(uint32_t max_entries); + void offloadToStoreBuffer(uint32_t max_entries, std::vector& offload_fail); bool insertStoreBuffer(Addr vaddr, Addr paddr, uint8_t* datas, uint64_t size, const std::vector& mask, diff --git a/src/cpu/o3/smt_sched.hh b/src/cpu/o3/smt_sched.hh index 74198c44fd..7fcbf5733f 100644 --- a/src/cpu/o3/smt_sched.hh +++ b/src/cpu/o3/smt_sched.hh @@ -131,23 +131,33 @@ class IndependentIQICountScheduler : public SMTScheduler { private: InstsCounter* counter; // Counter for this IQ only + public: IndependentIQICountScheduler(int numThreads, InstsCounter* counter) : SMTScheduler(numThreads), counter(counter){} ThreadID getThread() override { ThreadID selectedTid = 0; - uint64_t minCount = counter->getCounter(0); - + uint64_t maxCount = counter->getCounter(0); + if(scheduleNum[0] >= 100){ + selectedTid = 1; + return selectedTid; + } for (ThreadID tid = 1; tid < numThreads; ++tid) { + if(scheduleNum[tid] >= 100){ + selectedTid = 0; + return selectedTid; + } uint64_t count = counter->getCounter(tid); - if (count < minCount) { - minCount = count; + if (count > maxCount) { + maxCount = count; selectedTid = tid; } } return selectedTid; } + int scheduleNum[MaxThreads]; + }; }} From 032f5757fbdc09ca9c1a6daf564f49015058d6a2 Mon Sep 17 00:00:00 2001 From: tastynoob <934348725@qq.com> Date: Mon, 27 Apr 2026 16:59:09 +0800 Subject: [PATCH 24/38] cpu: preserve BTB tag bits when hashing ASID Change-Id: I30d0a510565c278349b4d4b915e84ead21f8c6ae --- 
src/cpu/pred/btb/common.hh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/cpu/pred/btb/common.hh b/src/cpu/pred/btb/common.hh index e40dee3cf2..0e603a6f04 100644 --- a/src/cpu/pred/btb/common.hh +++ b/src/cpu/pred/btb/common.hh @@ -49,7 +49,8 @@ injectAsidHashIntoTag(Addr base_tag, unsigned tag_bits, uint8_t asid_hash) const unsigned hash_bits = std::min(4, tag_bits); const Addr hash_mask = mask(hash_bits); - return (base_tag & ~hash_mask) | (static_cast(asid_hash) & hash_mask); + return (base_tag ^ (static_cast(asid_hash) & hash_mask)) & + mask(tag_bits); } inline Addr From 036db121e7a7fc26efb0610d8003afb4dccc8893 Mon Sep 17 00:00:00 2001 From: tastynoob <934348725@qq.com> Date: Tue, 28 Apr 2026 10:40:24 +0800 Subject: [PATCH 25/38] cpu-o3: isolate committed stream state per thread Change-Id: Id6528ad1ec2b2ad7a26cd0ef18030b8f671c3302 --- src/cpu/o3/commit.cc | 10 ++++++---- src/cpu/o3/commit.hh | 4 ++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/cpu/o3/commit.cc b/src/cpu/o3/commit.cc index 746a39872b..668f222553 100644 --- a/src/cpu/o3/commit.cc +++ b/src/cpu/o3/commit.cc @@ -188,6 +188,8 @@ Commit::Commit(CPU *_cpu, branch_prediction::BPredUnit *_bp, const BaseO3CPUPara htmStarts[tid] = 0; htmStops[tid] = 0; traceCommitIndex[tid] = 0; + committedTargetId[tid] = 1; + committedLoopIter[tid] = 0; fixedbuffer[tid] = boost::circular_buffer(renameWidth); } interrupt = NoFault; @@ -725,8 +727,8 @@ Commit::squashAll(ThreadID tid) set(toIEW->commitInfo[tid].pc, pc[tid]); - toIEW->commitInfo[tid].squashedTargetId = committedTargetId; - toIEW->commitInfo[tid].squashedLoopIter = committedLoopIter; + toIEW->commitInfo[tid].squashedTargetId = committedTargetId[tid]; + toIEW->commitInfo[tid].squashedLoopIter = committedLoopIter[tid]; cpu->mmu->useNewPriv(cpu->getContext(tid)); @@ -1412,8 +1414,8 @@ Commit::commitInsts() if (head_inst->getFtqId() > 1) { toIEW->commitInfo[tid].doneFtqId = head_inst->getFtqId() - 1; } - 
committedTargetId = head_inst->getFtqId(); - committedLoopIter = head_inst->getLoopIteration(); + committedTargetId[tid] = head_inst->getFtqId(); + committedLoopIter[tid] = head_inst->getLoopIteration(); if (tid == 0) canHandleInterrupts = !head_inst->isDelayedCommit(); diff --git a/src/cpu/o3/commit.hh b/src/cpu/o3/commit.hh index 3c83b610e5..27ac59157e 100644 --- a/src/cpu/o3/commit.hh +++ b/src/cpu/o3/commit.hh @@ -567,8 +567,8 @@ class Commit // committed Stream and Target - uint64_t committedTargetId{1}; - uint64_t committedLoopIter{}; + uint64_t committedTargetId[MaxThreads]; + uint64_t committedLoopIter[MaxThreads]; struct CommitStats : public statistics::Group { From ada41dc4e5a04217fc793e7cfd5d7eab473bff37 Mon Sep 17 00:00:00 2001 From: tastynoob <934348725@qq.com> Date: Tue, 28 Apr 2026 16:08:07 +0800 Subject: [PATCH 26/38] cpu: Isolate VPU state per thread Change-Id: Ica716ed754083ab470c1eace992837b454547b55 --- src/cpu/o3/commit.cc | 5 +-- src/cpu/o3/fetch.cc | 1 + src/cpu/valuepred/ValuePredictor.py | 1 + src/cpu/valuepred/enhanced_stride.cc | 41 ++++++++++++++++--------- src/cpu/valuepred/enhanced_stride.hh | 7 +++-- src/cpu/valuepred/ideal_constant_lvp.cc | 9 +++++- src/cpu/valuepred/ideal_constant_lvp.hh | 5 +-- src/cpu/valuepred/valuepred_metadata.hh | 3 ++ src/cpu/valuepred/valuepred_unit.cc | 16 +++++++++- src/cpu/valuepred/valuepred_unit.hh | 8 ++++- 10 files changed, 72 insertions(+), 24 deletions(-) diff --git a/src/cpu/o3/commit.cc b/src/cpu/o3/commit.cc index 668f222553..2257e1bd46 100644 --- a/src/cpu/o3/commit.cc +++ b/src/cpu/o3/commit.cc @@ -707,7 +707,7 @@ Commit::squashAll(ThreadID tid) changedROBNumEntries[tid] = true; if (valuePred) - valuePred->squash(squashed_inst); + valuePred->squash(tid, squashed_inst); // Send back the sequence number of the squashed instruction. 
toIEW->commitInfo[tid].doneSeqNum = squashed_inst; @@ -1099,7 +1099,7 @@ Commit::commit() changedROBNumEntries[tid] = true; if (valuePred) - valuePred->squash(squashed_inst); + valuePred->squash(tid, squashed_inst); toIEW->commitInfo[tid].doneSeqNum = squashed_inst; toIEW->commitInfo[tid].doneMemSeqNum = squashed_inst; @@ -1928,6 +1928,7 @@ Commit::commitHead(const DynInstPtr &head_inst, unsigned inst_num) VPDataStructFactory::buildUpdateMetaData(valuePred->getValuePredictorType()); updateMetaData->pc = head_inst->getPC(); updateMetaData->seq_no = head_inst->seqNum; + updateMetaData->tid = tid; updateMetaData->actualValue = head_inst->actualValue; updateMetaData->isMisprediction = head_inst->vpMisprediction; valuePred->updateValuePredictor(updateMetaData); diff --git a/src/cpu/o3/fetch.cc b/src/cpu/o3/fetch.cc index 2fdf1076f5..0642e5a587 100644 --- a/src/cpu/o3/fetch.cc +++ b/src/cpu/o3/fetch.cc @@ -2061,6 +2061,7 @@ Fetch::processSingleInstruction(ThreadID tid, PCStateBase &pc, vpPredMetaData->pc = instruction->getPC(); vpPredMetaData->seq_no = instruction->seqNum; + vpPredMetaData->tid = tid; instruction->vpResult = valuePred->valuePredict(vpPredMetaData); delete vpPredMetaData; } diff --git a/src/cpu/valuepred/ValuePredictor.py b/src/cpu/valuepred/ValuePredictor.py index 1b586060ad..65bc7b606c 100644 --- a/src/cpu/valuepred/ValuePredictor.py +++ b/src/cpu/valuepred/ValuePredictor.py @@ -11,6 +11,7 @@ class ValuePredictor(SimObject): cxx_class = "gem5::valuepred::VPUnit" cxx_header = "cpu/valuepred/valuepred_unit.hh" abstract = True + numThreads = Param.Unsigned(Parent.numThreads, "Number of threads") class EStride(ValuePredictor): type = "EStride" diff --git a/src/cpu/valuepred/enhanced_stride.cc b/src/cpu/valuepred/enhanced_stride.cc index ffbce18571..5707e2b40c 100644 --- a/src/cpu/valuepred/enhanced_stride.cc +++ b/src/cpu/valuepred/enhanced_stride.cc @@ -108,7 +108,7 @@ EStride::EStride(const Params ¶ms) logMaxConfidence(params.logMaxConfidence), 
MAXCONFIDENCE(1 << logMaxConfidence), confidenceThreshold(static_cast(params.thresholdPercent * MAXCONFIDENCE)), - inflightWindow(params.inflightWindowTagLength, params.idealWindow), + inflightWindows(), enableTimeMsgInUpdate(params.enableTimeMsgInUpdate), esstats(this) { @@ -122,9 +122,17 @@ EStride::EStride(const Params ¶ms) gem5_assert(params.inflightWindowTagLength, "EStride inflightWindowTagLength must > 0 \n"); // init stats - ESTables.resize(ways); - for (auto &table : ESTables) { - table.resize(entryCounts); + inflightWindows.reserve(numThreads); + for (ThreadID tid = 0; tid < numThreads; ++tid) { + inflightWindows.emplace_back(params.inflightWindowTagLength, params.idealWindow); + } + + ESTables.resize(numThreads); + for (auto &threadTables : ESTables) { + threadTables.resize(ways); + for (auto &table : threadTables) { + table.resize(entryCounts); + } } esstats.allocate.init(ways, entryCounts); @@ -289,8 +297,9 @@ EStride::doPredict(ESPredMetaData *esPredMetaData, int inflights) int way; uint32_t index; ESEntry entryCopy; + const ThreadID tid = esPredMetaData->tid; for (int i = 0; i < ways; ++i) { - const ESEntry &entry = ESTables[i][indexEachWays[i]]; + const ESEntry &entry = ESTables[tid][i][indexEachWays[i]]; if (!compareTags(entry.tag, tagEachWays[i])) { found = true; way = i; @@ -329,10 +338,11 @@ EStride::valuePredict(VPPredMetaData *predMetaData) { gem5_assert(predMetaData, "can't pass nullptr to vpunit\n"); ESPredMetaData *esPredMetaData = dynamic_cast(predMetaData); + assertValidTid(esPredMetaData->tid); // value prediction - int inflights = inflightWindow.addToInflightWindow(esPredMetaData->pc); + int inflights = inflightWindows[esPredMetaData->tid].addToInflightWindow(esPredMetaData->pc); esstats.inflightSH.sample(inflights, 1); return doPredict(esPredMetaData, inflights); @@ -343,10 +353,12 @@ EStride::updateValuePredictor(VPUpdateMetaData *updateMetaData) { gem5_assert(updateMetaData, "can't pass nullptr to vpunit\n"); ESUpdateMetaData 
*esUpdateMetaData = dynamic_cast(updateMetaData); + assertValidTid(esUpdateMetaData->tid); + const ThreadID tid = esUpdateMetaData->tid; // the first step update inflights window - inflightWindow.removeFromWindow(esUpdateMetaData->pc, esUpdateMetaData->seq_no); + inflightWindows[tid].removeFromWindow(esUpdateMetaData->pc, esUpdateMetaData->seq_no); // Given the nature of the current hash method, the same PC gets the // same hash value every time it is computed. So instead of storing @@ -387,7 +399,7 @@ EStride::updateValuePredictor(VPUpdateMetaData *updateMetaData) int way; uint32_t index; for (size_t i = 0; i < ways; ++i) { - const ESEntry &entry = ESTables[i][indexEachWays[i]]; + const ESEntry &entry = ESTables[tid][i][indexEachWays[i]]; // todo maybe change the occupied if (!compareTags(entry.tag, tagEachWays[i])) { found = true; @@ -400,7 +412,7 @@ EStride::updateValuePredictor(VPUpdateMetaData *updateMetaData) if (found) { // update - ESEntry &entry = ESTables[way][index]; + ESEntry &entry = ESTables[tid][way][index]; DPRINTF(EStride, "[way: %d index: %u][confidence: %d useful: %d lastValue: %lu]\n", way, index, entry.confidence, entry.useful, entry.lastValue); @@ -468,7 +480,7 @@ EStride::updateValuePredictor(VPUpdateMetaData *updateMetaData) // first find no confidence for (size_t i = 0; i < ways; ++i) { - ESEntry &entry = ESTables[wayBegin][indexEachWays[wayBegin]]; + ESEntry &entry = ESTables[tid][wayBegin][indexEachWays[wayBegin]]; if (entry.confidence == 0) { DPRINTF(EStride, "allocate by confidence: [way: %d index: %u] \n", wayBegin, indexEachWays[wayBegin]); esstats.allocate[wayBegin][indexEachWays[wayBegin]]++; @@ -485,7 +497,7 @@ EStride::updateValuePredictor(VPUpdateMetaData *updateMetaData) // second find not useful for (size_t i = 0; i < ways; ++i) { - ESEntry &entry = ESTables[wayBegin][indexEachWays[wayBegin]]; + ESEntry &entry = ESTables[tid][wayBegin][indexEachWays[wayBegin]]; if (entry.useful == 0) { DPRINTF(EStride, "allocate by useful: 
[way: %d index: %u] \n", wayBegin, indexEachWays[wayBegin]); esstats.allocate[wayBegin][indexEachWays[wayBegin]]++; @@ -501,7 +513,7 @@ EStride::updateValuePredictor(VPUpdateMetaData *updateMetaData) } // can't allocate, just random dec some useful count - ESEntry &entry = ESTables[wayBegin][indexEachWays[wayBegin]]; + ESEntry &entry = ESTables[tid][wayBegin][indexEachWays[wayBegin]]; DPRINTF(EStride, "try dec useful \n"); if (tryDecUseful(entry) == 0) { DPRINTF(EStride, "[dec useful count]=> way: %d index: %d", wayBegin, indexEachWays[wayBegin]); @@ -518,9 +530,10 @@ EStride::specUpdateValuePredictor(VPSpecUpdateMetaData *specUpdateMetaData) } void -EStride::squash(const uint64_t seq_no) +EStride::squash(ThreadID tid, const uint64_t seq_no) { - inflightWindow.squash(seq_no); + assertValidTid(tid); + inflightWindows[tid].squash(seq_no); } } diff --git a/src/cpu/valuepred/enhanced_stride.hh b/src/cpu/valuepred/enhanced_stride.hh index 5fc89c1730..d67509122d 100644 --- a/src/cpu/valuepred/enhanced_stride.hh +++ b/src/cpu/valuepred/enhanced_stride.hh @@ -110,12 +110,13 @@ class EStride : public VPUnit const int logMaxConfidence; const int MAXCONFIDENCE; const int confidenceThreshold; - InflightWindow inflightWindow; + std::vector inflightWindows; const bool enableTimeMsgInUpdate; private: - std::vector> ESTables; + // [tid][way][index] + std::vector>> ESTables; private: // This function really implements the prediction function. @@ -155,7 +156,7 @@ class EStride : public VPUnit // speculative updates may no longer be needed. 
virtual void specUpdateValuePredictor(VPSpecUpdateMetaData *specUpdateMetaData) override; - virtual void squash(const uint64_t seq_no) override; + virtual void squash(ThreadID tid, const uint64_t seq_no) override; virtual ValuePredType getValuePredictorType() override { return ValuePredType::EStride; } diff --git a/src/cpu/valuepred/ideal_constant_lvp.cc b/src/cpu/valuepred/ideal_constant_lvp.cc index 1d70397d19..823ad202b4 100644 --- a/src/cpu/valuepred/ideal_constant_lvp.cc +++ b/src/cpu/valuepred/ideal_constant_lvp.cc @@ -12,6 +12,7 @@ namespace valuepred IdealConstantLVP::IdealConstantLVP(const Params ¶ms) : VPUnit(params), + idealConstTables(params.numThreads), satCounterBits(params.satCounterBits), resetConfidence(params.resetConfidence) { @@ -20,6 +21,8 @@ IdealConstantLVP::IdealConstantLVP(const Params ¶ms) VPResult IdealConstantLVP::valuePredict(VPPredMetaData *predMetaData) { + assertValidTid(predMetaData->tid); + auto &idealConstTable = idealConstTables[predMetaData->tid]; auto it = idealConstTable.find(predMetaData->pc); if (it != idealConstTable.end()) { if (it->second.confidence.isSaturated()) { @@ -32,6 +35,8 @@ IdealConstantLVP::valuePredict(VPPredMetaData *predMetaData) void IdealConstantLVP::updateValuePredictor(VPUpdateMetaData *updateMetaData) { + assertValidTid(updateMetaData->tid); + auto &idealConstTable = idealConstTables[updateMetaData->tid]; auto it = idealConstTable.find(updateMetaData->pc); if (it == idealConstTable.end()) { // Not found, allocate a new entry @@ -63,8 +68,10 @@ IdealConstantLVP::specUpdateValuePredictor(VPSpecUpdateMetaData *specUpdateMetaD } void -IdealConstantLVP::squash(const uint64_t seq_no) +IdealConstantLVP::squash(ThreadID tid, const uint64_t seq_no) { + (void)tid; + (void)seq_no; // Do nothing } diff --git a/src/cpu/valuepred/ideal_constant_lvp.hh b/src/cpu/valuepred/ideal_constant_lvp.hh index de8eaba0fd..5d380670e4 100644 --- a/src/cpu/valuepred/ideal_constant_lvp.hh +++ 
b/src/cpu/valuepred/ideal_constant_lvp.hh @@ -2,6 +2,7 @@ #define __IDEAL_CONSTANT_LVP_HH__ #include +#include #include "base/sat_counter.hh" #include "base/types.hh" @@ -30,7 +31,7 @@ class IdealConstantLVP : public VPUnit } }; - std::unordered_map idealConstTable; + std::vector> idealConstTables; const unsigned satCounterBits; const bool resetConfidence; @@ -46,7 +47,7 @@ class IdealConstantLVP : public VPUnit void specUpdateValuePredictor(VPSpecUpdateMetaData *specUpdateMetaData) override; - void squash(const uint64_t seq_no) override; + void squash(ThreadID tid, const uint64_t seq_no) override; virtual ValuePredType getValuePredictorType() override { return ValuePredType::IdealConstantLVP; } }; diff --git a/src/cpu/valuepred/valuepred_metadata.hh b/src/cpu/valuepred/valuepred_metadata.hh index d5cfc2a975..75454d34aa 100644 --- a/src/cpu/valuepred/valuepred_metadata.hh +++ b/src/cpu/valuepred/valuepred_metadata.hh @@ -15,6 +15,7 @@ class VPPredMetaData public: Addr pc; uint64_t seq_no; + ThreadID tid = 0; virtual ~VPPredMetaData() {}; }; @@ -23,6 +24,7 @@ class VPUpdateMetaData public: Addr pc; uint64_t seq_no; + ThreadID tid = 0; RegVal actualValue; bool isMisprediction; virtual ~VPUpdateMetaData() {}; @@ -31,6 +33,7 @@ class VPUpdateMetaData class VPSpecUpdateMetaData { public: + ThreadID tid = 0; virtual ~VPSpecUpdateMetaData() {}; }; diff --git a/src/cpu/valuepred/valuepred_unit.cc b/src/cpu/valuepred/valuepred_unit.cc index 97c7c7ebda..cd7ece5812 100644 --- a/src/cpu/valuepred/valuepred_unit.cc +++ b/src/cpu/valuepred/valuepred_unit.cc @@ -1,5 +1,6 @@ #include "cpu/valuepred/valuepred_unit.hh" +#include "base/logging.hh" #include "base/stats/group.hh" #include "base/stats/units.hh" @@ -9,7 +10,20 @@ namespace gem5 namespace valuepred { -VPUnit::VPUnit(const Params ¶ms) : SimObject(params), stats(this) {} +VPUnit::VPUnit(const Params ¶ms) + : SimObject(params), + numThreads(params.numThreads), + stats(this) +{ + gem5_assert(numThreads > 0, "Value predictor 
needs at least one thread\n"); +} + +void +VPUnit::assertValidTid(ThreadID tid) const +{ + gem5_assert(tid < numThreads, "%s got invalid tid %u, numThreads=%u\n", + name().c_str(), static_cast(tid), numThreads); +} VPUnit::ValuePredUnitStats::ValuePredUnitStats(VPUnit *vp) : statistics::Group(vp), diff --git a/src/cpu/valuepred/valuepred_unit.hh b/src/cpu/valuepred/valuepred_unit.hh index 4380cdeaed..52d36b0121 100644 --- a/src/cpu/valuepred/valuepred_unit.hh +++ b/src/cpu/valuepred/valuepred_unit.hh @@ -4,6 +4,7 @@ #include #include "base/statistics.hh" +#include "base/types.hh" #include "cpu/valuepred/valuepred_metadata.hh" #include "enums/ValuePredType.hh" #include "params/ValuePredictor.hh" @@ -23,6 +24,11 @@ class VPUnit : public SimObject private: using Params = ValuePredictorParams; + protected: + const unsigned numThreads; + + void assertValidTid(ThreadID tid) const; + public: VPUnit(const Params ¶ms); @@ -38,7 +44,7 @@ class VPUnit : public SimObject virtual void specUpdateValuePredictor(VPSpecUpdateMetaData *specupdateMetadata) = 0; // If predict error, squash the inflight instructions in value predictor. - virtual void squash(const uint64_t seq_no) = 0; + virtual void squash(ThreadID tid, const uint64_t seq_no) = 0; // Get the value predictor type virtual ValuePredType getValuePredictorType() = 0; From 004a04603639c09dc1894be5617d6d495f0ee3e6 Mon Sep 17 00:00:00 2001 From: tastynoob <934348725@qq.com> Date: Wed, 29 Apr 2026 10:14:53 +0800 Subject: [PATCH 27/38] mem-cache: Preserve prefetch context Carry the triggering request context through prefetch metadata and final HardPFReq creation. Use context-aware prefetch queue matching so SMT threads with the same virtual address do not squash or deduplicate each other's prefetches. 
Validation: - scons build/RISCV/gem5.opt -j16 - git diff --check Change-Id: I6a40826b47bff2e7ee8c7748eecd8622b44ca3c3 --- src/mem/cache/prefetch/base.cc | 29 +++++++++++++++--- src/mem/cache/prefetch/base.hh | 52 ++++++++++++++++++++++++++++++-- src/mem/cache/prefetch/queued.cc | 22 ++++++++++---- src/mem/cache/prefetch/worker.cc | 6 ++-- 4 files changed, 94 insertions(+), 15 deletions(-) diff --git a/src/mem/cache/prefetch/base.cc b/src/mem/cache/prefetch/base.cc index 0fd233b369..109c75c6b0 100644 --- a/src/mem/cache/prefetch/base.cc +++ b/src/mem/cache/prefetch/base.cc @@ -63,7 +63,11 @@ namespace prefetch Base::PrefetchInfo::PrefetchInfo(PacketPtr pkt, Addr addr, bool miss) : address(addr), pc(pkt->req->hasPC() ? pkt->req->getPC() : 0), - requestorId(pkt->req->requestorId()), validPC(pkt->req->hasPC()), + requestorId(pkt->req->requestorId()), + _contextId(pkt->req->hasContextId() ? + pkt->req->contextId() : InvalidContextID), + validContextId(pkt->req->hasContextId()), + validPC(pkt->req->hasPC()), secure(pkt->isSecure()), size(pkt->req->getSize()), write(pkt->isWrite()), paddress(pkt->req->getPaddr()), cacheMiss(miss) { @@ -86,7 +90,11 @@ Base::PrefetchInfo::PrefetchInfo( PacketPtr pkt, Addr addr, bool miss, Request::XsMetadata xsMeta ) : address(addr), pc(pkt->req->hasPC() ? pkt->req->getPC() : 0), - requestorId(pkt->req->requestorId()), validPC(pkt->req->hasPC()), + requestorId(pkt->req->requestorId()), + _contextId(pkt->req->hasContextId() ? 
+ pkt->req->contextId() : InvalidContextID), + validContextId(pkt->req->hasContextId()), + validPC(pkt->req->hasPC()), secure(pkt->isSecure()), size(pkt->req->getSize()), write(pkt->isWrite()), paddress(pkt->req->getPaddr()), cacheMiss(miss), xsMetadata(xsMeta) { @@ -107,6 +115,7 @@ Base::PrefetchInfo::PrefetchInfo( Base::PrefetchInfo::PrefetchInfo(PrefetchInfo const &pfi, Addr addr) : address(addr), pc(pfi.pc), requestorId(pfi.requestorId), + _contextId(pfi._contextId), validContextId(pfi.validContextId), validPC(pfi.validPC), secure(pfi.secure), size(pfi.size), write(pfi.write), paddress(pfi.paddress), cacheMiss(pfi.cacheMiss), data(nullptr),data_ptr(nullptr) @@ -114,6 +123,7 @@ Base::PrefetchInfo::PrefetchInfo(PrefetchInfo const &pfi, Addr addr) } Base::PrefetchInfo::PrefetchInfo(PrefetchInfo_old const &pfi) : address(pfi.address), pc(pfi.pc), requestorId(pfi.requestorId), + _contextId(pfi._contextId), validContextId(pfi.validContextId), validPC(pfi.validPC), secure(pfi.secure), size(pfi.size), write(pfi.write), paddress(pfi.paddress), cacheMiss(pfi.cacheMiss), data(nullptr),data_ptr(nullptr) @@ -121,7 +131,11 @@ Base::PrefetchInfo::PrefetchInfo(PrefetchInfo_old const &pfi) } Base::PrefetchInfo_old::PrefetchInfo_old(PacketPtr pkt, Addr addr, bool miss) : address(addr), pc(pkt->req->hasPC() ? pkt->req->getPC() : 0), - requestorId(pkt->req->requestorId()), validPC(pkt->req->hasPC()), + requestorId(pkt->req->requestorId()), + _contextId(pkt->req->hasContextId() ? + pkt->req->contextId() : InvalidContextID), + validContextId(pkt->req->hasContextId()), + validPC(pkt->req->hasPC()), secure(pkt->isSecure()), size(pkt->req->getSize()), write(pkt->isWrite()), paddress(pkt->req->getPaddr()), cacheMiss(miss) { @@ -144,7 +158,11 @@ Base::PrefetchInfo_old::PrefetchInfo_old( PacketPtr pkt, Addr addr, bool miss, Request::XsMetadata xsMeta ) : address(addr), pc(pkt->req->hasPC() ? 
pkt->req->getPC() : 0), - requestorId(pkt->req->requestorId()), validPC(pkt->req->hasPC()), + requestorId(pkt->req->requestorId()), + _contextId(pkt->req->hasContextId() ? + pkt->req->contextId() : InvalidContextID), + validContextId(pkt->req->hasContextId()), + validPC(pkt->req->hasPC()), secure(pkt->isSecure()), size(pkt->req->getSize()), write(pkt->isWrite()), paddress(pkt->req->getPaddr()), cacheMiss(miss), xsMetadata(xsMeta) { @@ -164,6 +182,7 @@ Base::PrefetchInfo_old::PrefetchInfo_old( } Base::PrefetchInfo_old::PrefetchInfo_old(PrefetchInfo_old const &other) : address(other.address), pc(other.pc), requestorId(other.requestorId), + _contextId(other._contextId), validContextId(other.validContextId), validPC(other.validPC), secure(other.secure), size(other.size), write(other.write), paddress(other.paddress), cacheMiss(other.cacheMiss), data(nullptr),data_ptr(nullptr) @@ -172,6 +191,7 @@ Base::PrefetchInfo_old::PrefetchInfo_old(PrefetchInfo_old const &other) } Base::PrefetchInfo_old::PrefetchInfo_old(PrefetchInfo_old const &pfi, Addr addr) : address(addr), pc(pfi.pc), requestorId(pfi.requestorId), + _contextId(pfi._contextId), validContextId(pfi.validContextId), validPC(pfi.validPC), secure(pfi.secure), size(pfi.size), write(pfi.write), paddress(pfi.paddress), cacheMiss(pfi.cacheMiss), data(nullptr),data_ptr(nullptr) @@ -179,6 +199,7 @@ Base::PrefetchInfo_old::PrefetchInfo_old(PrefetchInfo_old const &pfi, Addr addr) } Base::PrefetchInfo_old::PrefetchInfo_old(PrefetchInfo const &pfi) : address(pfi.address), pc(pfi.pc), requestorId(pfi.requestorId), + _contextId(pfi._contextId), validContextId(pfi.validContextId), validPC(pfi.validPC), secure(pfi.secure), size(pfi.size), write(pfi.write), paddress(pfi.paddress), cacheMiss(pfi.cacheMiss), data(nullptr),data_ptr(nullptr) diff --git a/src/mem/cache/prefetch/base.hh b/src/mem/cache/prefetch/base.hh index 2ba250fa48..f1a6fde397 100644 --- a/src/mem/cache/prefetch/base.hh +++ b/src/mem/cache/prefetch/base.hh @@ -163,6 
+163,10 @@ class Base : public ClockedObject Addr pc; /** The requestor ID that generated this address. */ RequestorID requestorId; + /** The thread context that generated this address. */ + ContextID _contextId; + /** Whether the thread context is valid. */ + bool validContextId; /** Validity bit for the PC of this address. */ bool validPC; /** Whether this address targets the secure memory space. */ @@ -242,6 +246,17 @@ class Base : public ClockedObject return requestorId; } + bool hasContextId() const + { + return validContextId; + } + + ContextID contextId() const + { + assert(hasContextId()); + return _contextId; + } + /** * Gets the size of the request triggering this event * @return the size in bytes of the request triggering this event @@ -317,7 +332,16 @@ class Base : public ClockedObject bool sameAddr(PrefetchInfo const &pfi) const { return this->getAddr() == pfi.getAddr() && - this->isSecure() == pfi.isSecure(); + this->isSecure() == pfi.isSecure() && + this->sameContext(pfi); + } + + bool sameContext(PrefetchInfo const &pfi) const + { + if (hasContextId() != pfi.hasContextId()) { + return false; + } + return !hasContextId() || _contextId == pfi.contextId(); } bool sameAddr(Addr addr, bool isSecure) const @@ -407,6 +431,10 @@ class Base : public ClockedObject Addr pc; /** The requestor ID that generated this address. */ RequestorID requestorId; + /** The thread context that generated this address. */ + ContextID _contextId; + /** Whether the thread context is valid. */ + bool validContextId; /** Validity bit for the PC of this address. */ bool validPC; /** Whether this address targets the secure memory space. 
*/ @@ -486,6 +514,17 @@ class Base : public ClockedObject return requestorId; } + bool hasContextId() const + { + return validContextId; + } + + ContextID contextId() const + { + assert(hasContextId()); + return _contextId; + } + /** * Gets the size of the request triggering this event * @return the size in bytes of the request triggering this event @@ -561,7 +600,16 @@ class Base : public ClockedObject bool sameAddr(PrefetchInfo_old const &pfi) const { return this->getAddr() == pfi.getAddr() && - this->isSecure() == pfi.isSecure(); + this->isSecure() == pfi.isSecure() && + this->sameContext(pfi); + } + + bool sameContext(PrefetchInfo_old const &pfi) const + { + if (hasContextId() != pfi.hasContextId()) { + return false; + } + return !hasContextId() || _contextId == pfi.contextId(); } bool sameAddr(Addr addr, bool isSecure) const diff --git a/src/mem/cache/prefetch/queued.cc b/src/mem/cache/prefetch/queued.cc index 5fbeb76b68..2ded29bfcf 100644 --- a/src/mem/cache/prefetch/queued.cc +++ b/src/mem/cache/prefetch/queued.cc @@ -67,11 +67,21 @@ Queued::DeferredPacket::createPkt(Addr paddr, unsigned blk_size, RequestorID req /* Create a prefetch memory request */ RequestPtr req; if (owner->useVirtualAddresses && pfInfo.hasPC()) { - req = std::make_shared(pfInfo.getAddr(), blk_size, 0, - requestor_id, pfInfo.getPC(), 0); + if (pfInfo.hasContextId()) { + req = std::make_shared(pfInfo.getAddr(), blk_size, 0, + requestor_id, pfInfo.getPC(), + pfInfo.contextId()); + } else { + req = std::make_shared(); + req->setVirt(pfInfo.getAddr(), blk_size, 0, requestor_id, + pfInfo.getPC()); + } req->setPaddr(paddr); } else { req = std::make_shared(paddr, blk_size, 0, requestor_id); + if (pfInfo.hasContextId()) { + req->setContext(pfInfo.contextId()); + } } req->setFlags(Request::PREFETCH); @@ -213,7 +223,6 @@ void Queued::notify(const PacketPtr &pkt, const PrefetchInfo &pfi) { Addr blk_addr = blockAddress(pfi.getAddr()); - bool is_secure = pfi.isSecure(); bool late_in_mshr = 
pkt->missOnLatePf; // hit in pf mshr @@ -222,10 +231,10 @@ Queued::notify(const PacketPtr &pkt, const PrefetchInfo &pfi) // Squash queued prefetches if demand miss to same line if (queueSquash) { + PrefetchInfo blk_pfi(pfi, blk_addr); auto itr = pfq.begin(); while (itr != pfq.end()) { - if (itr->pfInfo.getAddr() == blk_addr && - itr->pfInfo.isSecure() == is_secure) { + if (itr->pfInfo.sameAddr(blk_pfi)) { DPRINTF(HWPrefetch, "Removing pf candidate addr: %#x " "(cl: %#x), demand request going to the same addr\n", itr->pfInfo.getAddr(), @@ -545,9 +554,10 @@ Queued::alreadyInQueue(std::list &queue, RequestPtr Queued::createPrefetchRequest(Addr addr, PrefetchInfo const &pfi, PacketPtr pkt, PrefetchSourceType pf_src, int pf_depth) { + assert(pfi.hasContextId()); RequestPtr translation_req = std::make_shared( addr, blkSize, pkt->req->getFlags(), requestorId, pfi.getPC(), - pkt->req->contextId()); + pfi.contextId()); translation_req->setFlags(Request::PF_EXCLUSIVE); translation_req->setPFSource(pf_src); translation_req->setPFDepth(pf_depth); diff --git a/src/mem/cache/prefetch/worker.cc b/src/mem/cache/prefetch/worker.cc index 6602ebe7a0..a87b6810fa 100644 --- a/src/mem/cache/prefetch/worker.cc +++ b/src/mem/cache/prefetch/worker.cc @@ -60,10 +60,10 @@ WorkerPrefetcher::transfer() auto dpp_it = localBuffer.begin(); while (count < depth && !localBuffer.empty()) { if (queueFilter) { - if (alreadyInQueue(pfq, dpp_it->pfInfo.getAddr(), dpp_it->pfInfo.isSecure(), dpp_it->priority)) { + if (alreadyInQueue(pfq, dpp_it->pfInfo, dpp_it->priority)) { DPRINTF(WorkerPref, "Worker: [%lx, %d] was already in pfq\n", dpp_it->pfInfo.getAddr(), dpp_it->pfahead_host); - } else if (alreadyInQueue(pfqMissingTranslation, dpp_it->pfInfo.getAddr(), dpp_it->pfInfo.isSecure(), + } else if (alreadyInQueue(pfqMissingTranslation, dpp_it->pfInfo, dpp_it->priority)) { DPRINTF(WorkerPref, "Worker: [%lx, %d] was already in pfq\n", dpp_it->pfInfo.getAddr(), dpp_it->pfahead_host); @@ -85,4 +85,4 @@ 
WorkerPrefetcher::transfer() } } // namespace prefetch -} // namespace gem5 \ No newline at end of file +} // namespace gem5 From 32eed52ed1e8c23c6392372962a1e759fac2c58e Mon Sep 17 00:00:00 2001 From: tastynoob <934348725@qq.com> Date: Mon, 11 May 2026 13:07:18 +0800 Subject: [PATCH 28/38] cpu-o3: Select IQ entries by dispatch age Use per-instruction dispatch age for IQ ordering and remove the old SMT-specific selector plumbing. Change-Id: I06f4cec1fcbe910652272cf1caf9366bdbbea171 --- src/cpu/o3/FuncScheduler.py | 7 +-- src/cpu/o3/SConscript | 2 +- src/cpu/o3/dyn_inst.hh | 3 ++ src/cpu/o3/iew.cc | 18 ++++++++ src/cpu/o3/iew.hh | 1 + src/cpu/o3/inst_queue.cc | 3 +- src/cpu/o3/issue_queue.cc | 92 ++----------------------------------- src/cpu/o3/issue_queue.hh | 28 +---------- src/cpu/o3/smt_sched.hh | 33 ------------- 9 files changed, 30 insertions(+), 157 deletions(-) diff --git a/src/cpu/o3/FuncScheduler.py b/src/cpu/o3/FuncScheduler.py index 7676f6d643..2e118a6734 100644 --- a/src/cpu/o3/FuncScheduler.py +++ b/src/cpu/o3/FuncScheduler.py @@ -75,11 +75,6 @@ class PAgeSelector(BaseSelector): piece = Param.Int(2, "number of instructions in a group") -class SMTBasedSelector(BaseSelector): - type = 'SMTBasedSelector' - cxx_class = 'gem5::o3::SMTBasedSelector' - cxx_header = "cpu/o3/issue_queue.hh" - class IssueQue(SimObject): type = 'IssueQue' cxx_class = 'gem5::o3::IssueQue' @@ -90,7 +85,7 @@ class IssueQue(SimObject): inports = Param.Int(2, "") scheduleToExecDelay = Param.Cycles(2, "") oports = VectorParam.IssuePort("") - sel = Param.BaseSelector(SMTBasedSelector(), "Selector for this IQ (default: age first)") + sel = Param.BaseSelector(BaseSelector(), "Selector for this IQ") class Scheduler(SimObject): type = 'Scheduler' diff --git a/src/cpu/o3/SConscript b/src/cpu/o3/SConscript index 3c2902a6b4..93d6c0b1da 100755 --- a/src/cpu/o3/SConscript +++ b/src/cpu/o3/SConscript @@ -32,7 +32,7 @@ Import('*') if env['CONF']['TARGET_ISA'] != 'null': 
SimObject('FuncScheduler.py', sim_objects=['FUPool', 'SpecWakeupChannel', - 'IssuePort', 'IssueQue', 'BaseSelector', 'PAgeSelector', 'SMTBasedSelector', 'Scheduler']) + 'IssuePort', 'IssueQue', 'BaseSelector', 'PAgeSelector', 'Scheduler']) SimObject('FuncUnitConfig.py', sim_objects=[]) SimObject('BaseO3CPU.py', sim_objects=['BaseO3CPU'], enums=[ 'SMTFetchPolicy', 'SMTQueuePolicy', 'SMTLSQMode', 'CommitPolicy', diff --git a/src/cpu/o3/dyn_inst.hh b/src/cpu/o3/dyn_inst.hh index f79a8784b4..60459642f9 100644 --- a/src/cpu/o3/dyn_inst.hh +++ b/src/cpu/o3/dyn_inst.hh @@ -1455,6 +1455,9 @@ class DynInst : public ExecContext, public RefCounted Tick lastWakeDependents = -1; Tick translatedTick = -1; + /** Dispatch age = dispatch cycle * 8 + dispatch position. */ + uint64_t ageCtr = static_cast(-1); + Tick readyTick = -1; Tick completionTick = -1; diff --git a/src/cpu/o3/iew.cc b/src/cpu/o3/iew.cc index 96fded9794..03076739e3 100644 --- a/src/cpu/o3/iew.cc +++ b/src/cpu/o3/iew.cc @@ -915,6 +915,20 @@ IEW::canInsertLDSTQue(ThreadID tid) return false; } +void +IEW::setDispatchAgeCtr(const DynInstPtr& inst, int dispatch_pos) +{ + constexpr uint64_t dispatchAgeScale = 8; + + assert(dispatch_pos >= 0); + assert(dispatch_pos < static_cast(dispatchAgeScale)); + inst->ageCtr = static_cast(cpu->curCycle()) * dispatchAgeScale + + static_cast(dispatch_pos); + DPRINTF(IEW, "[tid:%i] [sn:%llu] ageCtr=%llu at dispatch pos %d.\n", + inst->threadNumber, inst->seqNum, + static_cast(inst->ageCtr), dispatch_pos); +} + void IEW::dispatchInsts() { @@ -1055,6 +1069,8 @@ IEW::dispatchInstFromRename(ThreadID tid) inst->clearHtmTransactionalState(); } + setDispatchAgeCtr(inst, dispatched); + if (!inst->isNop() && !inst->isEliminated()) { scheduler->addProducer(inst); } @@ -1221,6 +1237,8 @@ IEW::classifyInstToDispQue(ThreadID tid) inst->clearHtmTransactionalState(); } + setDispatchAgeCtr(inst, dispatched); + if (inst->isAtomic()) { ++iewStats.dispStoreInsts; ++iewStats.dispNonSpecInsts; diff --git 
a/src/cpu/o3/iew.hh b/src/cpu/o3/iew.hh index c621e62ebc..a050d3d9d7 100644 --- a/src/cpu/o3/iew.hh +++ b/src/cpu/o3/iew.hh @@ -327,6 +327,7 @@ class IEW /** Dispatches instructions to IQ and LSQ. */ void dispatchInsts(); + void setDispatchAgeCtr(const DynInstPtr& inst, int dispatch_pos); void dispatchInstFromRename(ThreadID tid); diff --git a/src/cpu/o3/inst_queue.cc b/src/cpu/o3/inst_queue.cc index dda79556dc..2b76fdaf7e 100644 --- a/src/cpu/o3/inst_queue.cc +++ b/src/cpu/o3/inst_queue.cc @@ -163,8 +163,7 @@ InstructionQueue::InstructionQueue(CPU *cpu_ptr, IEW *iew_ptr, scheduler->setCPU(cpu_ptr, &iew_ptr->ldstQueue); scheduler->resetDepGraph(numPhysRegs); scheduler->setMemDepUnit(memDepUnit); - scheduler->initIQICountSmtScheduler(numThreads); - + resetState(); } diff --git a/src/cpu/o3/issue_queue.cc b/src/cpu/o3/issue_queue.cc index bf5bb1ea47..094c0a2fa3 100644 --- a/src/cpu/o3/issue_queue.cc +++ b/src/cpu/o3/issue_queue.cc @@ -140,61 +140,12 @@ PAgeSelector::select(ReadyQue::iterator begin, int portid) } } -void -SMTBasedSelector::setparent(Scheduler* scheduler, IssueQue* iq) -{ - BaseSelector::setparent(scheduler, iq); - - smtScheduler = iq->getIndependentIQICountScheduler(); -} - -ReadyQue::iterator -SMTBasedSelector::select(ReadyQue::iterator begin, int portid) -{ - if (begin == end) { - return end; - } - - ThreadID priorityThread = 0; - - if (smtScheduler) { - priorityThread = smtScheduler->getThread(); - - DPRINTF(Schedule, - "SMTBasedSelector: priority thread = %d\n", - priorityThread); - } - - for (auto it = begin; it != end; it++) { - auto& inst = *it; - - if (inst->threadNumber == priorityThread) { - DPRINTF(Schedule, - "[sn:%llu] selected by SMT policy (tid=%d)\n", - inst->seqNum, priorityThread); - return it; - } - } - - - for (auto it = begin; it != end; it++) { - auto& inst = *it; - - if (inst->threadNumber != priorityThread) { - DPRINTF(Schedule, - "[sn:%llu] selected by default (tid=%d, priority=%d)\n", - inst->seqNum, inst->threadNumber, 
priorityThread); - return it; - } - } - - DPRINTF(Schedule, "SMTBasedSelector: no available instruction\n"); - return begin; -} - bool IssueQue::select_policy::operator()(const DynInstPtr& a, const DynInstPtr& b) const { + if (a->ageCtr != b->ageCtr) { + return a->ageCtr < b->ageCtr; + } return a->seqNum < b->seqNum; } @@ -646,8 +597,6 @@ IssueQue::selectInst() selectQ.clear(); for (int pi = 0; pi < outports; pi++) { auto readyQ = readyQs[pi]; - // iq->getInstsCounter()->getCounter(tid) - int iqcount = 0; for (auto it = readyQ->begin(); it != readyQ->end(); ++it) { DPRINTF(Schedule, "readyQ for port %d has [sn:%llu] %s [tid:%u]\n", pi, (*it)->seqNum, (*it)->genDisassembly(), (*it)->threadNumber); @@ -666,15 +615,6 @@ IssueQue::selectInst() uint64_t busy_bit = (lat > 63 ? -1 : (1llu << lat)); if (!(portBusy[pi] & busy_bit)) { DPRINTF(Schedule, "[sn %ld] was selected\n", inst->seqNum); - for (ThreadID tid = 0; tid < MaxThreads; tid++) { - if (inst->threadNumber == tid) { - independentIQICountScheduler->scheduleNum[tid]++; - } else { - independentIQICountScheduler->scheduleNum[tid] = 0; - } - } - DPRINTF(Schedule, "smtScheduler->scheduleNum[0]=%d, smtScheduler->scheduleNum[1]=%d\n", - independentIQICountScheduler->scheduleNum[0], independentIQICountScheduler->scheduleNum[1]); // get regfile write port for (int i = 0; i < inst->numDestRegs(); i++) { auto pdst = inst->renamedDestIdx(i); @@ -937,17 +877,6 @@ IssueQue::decInIQInstsCounter(ThreadID tid) } } -void -IssueQue::initIndependentIQICountScheduler(int numThreads) -{ - assert(instsCounter != nullptr && "InstsCounter must be set first"); - - independentIQICountScheduler = new IndependentIQICountScheduler( - numThreads, instsCounter); - - DPRINTF(Schedule, "[%s] IndependentIQICountScheduler created.\n",iqname); -} - Scheduler::SpecWakeupCompletion::SpecWakeupCompletion(const DynInstPtr& inst, IssueQue* to, PendingWakeEventsType* owner) : Event(Stat_Event_Pri, AutoDelete), inst(inst), owner(owner), to_issue_queue(to) 
@@ -1146,6 +1075,7 @@ Scheduler::setCPU(CPU* cpu, LSQ* lsq) this->lsq = lsq; for (auto it : issueQues) { it->setCPU(cpu); + it->selector->setparent(this, it); } } @@ -1763,19 +1693,5 @@ Scheduler::setMainRdpOpt(bool enable) } } -void -Scheduler::initIQICountSmtScheduler(int numThreads) -{ - DPRINTF(Schedule, "Initializing IQ SMT schedulers for %d thread.\n", numThreads); - - // to do: add switch;add SMTSchedulingPolicy - for (auto iq : issueQues) { - InstsCounter* counter = iq->getInstsCounter(); - assert(counter); - iq->initIndependentIQICountScheduler(numThreads); - iq->selector->setparent(this, iq); - } -} - } } diff --git a/src/cpu/o3/issue_queue.hh b/src/cpu/o3/issue_queue.hh index 6c6d9f8fbf..15c99eeb4c 100644 --- a/src/cpu/o3/issue_queue.hh +++ b/src/cpu/o3/issue_queue.hh @@ -23,7 +23,6 @@ #include "params/IssuePort.hh" #include "params/IssueQue.hh" #include "params/PAgeSelector.hh" -#include "params/SMTBasedSelector.hh" #include "params/Scheduler.hh" #include "params/SpecWakeupChannel.hh" #include "sim/sim_object.hh" @@ -101,25 +100,11 @@ class PAgeSelector : public BaseSelector ReadyQue::iterator select(ReadyQue::iterator begin, int portid) override; }; -class SMTBasedSelector : public BaseSelector -{ - private: - IndependentIQICountScheduler* smtScheduler = nullptr; - public: - SMTBasedSelector(const SMTBasedSelectorParams& params) : BaseSelector(params) {} - void setparent(Scheduler* scheduler, IssueQue* iq) override; - void allocate(const DynInstPtr& inst) override { BaseSelector::allocate(inst);} - void deallocate(const DynInstPtr& inst) override { BaseSelector::deallocate(inst);} - ReadyQue::iterator select(ReadyQue::iterator begin, int portid) override; -}; - class IssueQue : public SimObject { friend class Scheduler; friend class BaseSelector; friend class PAgeSelector; - friend class InstsCounter; - friend class IndependentIQICountScheduler; std::string _name; const int inports; @@ -188,9 +173,8 @@ class IssueQue : public SimObject Scheduler* 
scheduler = nullptr; BaseSelector* selector = nullptr; - //iq smt scheduler + // iq per-thread occupancy counter, used for fetch-side feedback stats InstsCounter* instsCounter = nullptr; - IndependentIQICountScheduler* independentIQICountScheduler = nullptr; struct IssueQueStats : public statistics::Group { @@ -228,21 +212,12 @@ class IssueQue : public SimObject void setMainRdpOpt(bool enable) { enableMainRdpOpt = enable; } void resetDepGraph(int numPhysRegs); - void setInstsCounter(InstsCounter* counter) { instsCounter = counter;} - InstsCounter* getInstsCounter() const {return instsCounter; } void incInIQInstsCounter(ThreadID tid); void decInIQInstsCounter(ThreadID tid); bool hasInstsCounter() const { return instsCounter != nullptr; } - void initIndependentIQICountScheduler(int numThreads); - - void setIndependentIQICountScheduler( IndependentIQICountScheduler* _independentIQICountScheduler ) { - independentIQICountScheduler = _independentIQICountScheduler; - } - IndependentIQICountScheduler* getIndependentIQICountScheduler() { return independentIQICountScheduler; } - void tick(); bool ready(); int emptyEntries() const { return iqsize - instNum; } @@ -367,7 +342,6 @@ class Scheduler : public SimObject void setAllScoreBoard(PhysRegIdPtr reg); void setMemDepUnit(MemDepUnit* memDepUnit) { this->memDepUnit = memDepUnit; } void setMainRdpOpt(bool enable); - void initIQICountSmtScheduler(int numThreads); void tick(); void issueAndSelect(); diff --git a/src/cpu/o3/smt_sched.hh b/src/cpu/o3/smt_sched.hh index 7fcbf5733f..a10e404b15 100644 --- a/src/cpu/o3/smt_sched.hh +++ b/src/cpu/o3/smt_sched.hh @@ -127,38 +127,5 @@ class MultiPrioritySched : public SMTScheduler } }; -class IndependentIQICountScheduler : public SMTScheduler { -private: - InstsCounter* counter; // Counter for this IQ only - - -public: - IndependentIQICountScheduler(int numThreads, InstsCounter* counter) - : SMTScheduler(numThreads), counter(counter){} - - ThreadID getThread() override { - ThreadID 
selectedTid = 0; - uint64_t maxCount = counter->getCounter(0); - if(scheduleNum[0] >= 100){ - selectedTid = 1; - return selectedTid; - } - for (ThreadID tid = 1; tid < numThreads; ++tid) { - if(scheduleNum[tid] >= 100){ - selectedTid = 0; - return selectedTid; - } - uint64_t count = counter->getCounter(tid); - if (count > maxCount) { - maxCount = count; - selectedTid = tid; - } - } - return selectedTid; - } - int scheduleNum[MaxThreads]; - -}; - }} #endif From 70a364f8eb4bcf49933a4543c5f5caa11f6564fd Mon Sep 17 00:00:00 2001 From: Mo Haonan <66786667+mhnGitHubz@users.noreply.github.com> Date: Mon, 11 May 2026 19:44:36 +0800 Subject: [PATCH 29/38] cpu-o3: all threads have a store to offload and both fail, reset request priority. (#847) Co-authored-by: mo haonan --- src/cpu/o3/lsq.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/cpu/o3/lsq.cc b/src/cpu/o3/lsq.cc index 448530ab69..33ffdeb593 100644 --- a/src/cpu/o3/lsq.cc +++ b/src/cpu/o3/lsq.cc @@ -993,12 +993,16 @@ LSQ::processWriteback() } } threads = activeThreads->begin(); + bool has_thread_offloaded = false; for (ThreadID tid = 0; tid < numThreads; ++tid) { thread[(nextStoreBufferInsertTid + tid) % numThreads].offloadToStoreBuffer(offload_quota[(nextStoreBufferInsertTid + tid) % numThreads], offload_fail); + has_thread_offloaded |= ((offload_quota[(nextStoreBufferInsertTid + tid) % numThreads] != 0) + && !(offload_fail[(nextStoreBufferInsertTid + tid) % numThreads])); + } for (ThreadID tid = 0; tid < numThreads; ++tid) { - if (offload_fail[tid]) { + if (offload_fail[tid] && has_thread_offloaded) { nextStoreBufferInsertTid = tid; } } From f4334ca0301a02ba1ec0369617c7e6fa9e178f7d Mon Sep 17 00:00:00 2001 From: tastynoob <934348725@qq.com> Date: Tue, 19 May 2026 17:02:05 +0800 Subject: [PATCH 30/38] cpu-o3: Expose SMT borrowing tunables Change-Id: I1078cf7423248c619faeb4fde7f0a210a7d02b77 --- configs/example/smt_idealkmhv3.py | 1 + src/cpu/o3/BaseO3CPU.py | 12 ++- src/cpu/o3/comm.hh | 
127 ++++++++++++++++++++++++++++++ src/cpu/o3/commit.cc | 52 +++++++++--- src/cpu/o3/commit.hh | 6 ++ src/cpu/o3/decode.cc | 24 +++--- src/cpu/o3/fetch.cc | 80 ++++++++++++++----- src/cpu/o3/fetch.hh | 4 + src/cpu/o3/iew.cc | 44 ++++++++--- src/cpu/o3/lsq.cc | 11 ++- src/cpu/o3/rename.cc | 23 ++++-- src/cpu/o3/rob.cc | 111 ++++++++++++++++++++++++++ src/cpu/o3/rob.hh | 35 +++++++- 13 files changed, 465 insertions(+), 65 deletions(-) diff --git a/configs/example/smt_idealkmhv3.py b/configs/example/smt_idealkmhv3.py index dbbe66f814..28abc2a3e9 100644 --- a/configs/example/smt_idealkmhv3.py +++ b/configs/example/smt_idealkmhv3.py @@ -22,6 +22,7 @@ def setSharedLSQParams(args, system): # shared target queue and starve the other thread's frontend. cpu.smtLSQMode = 'Shared' cpu.smtLSQPolicy = 'Dynamic' + cpu.smtROBPolicy = 'DynamicBorrowing' cpu.branchPred.smtFTQMode = 'Shared' cpu.branchPred.smtFTQPolicy = 'Partitioned' diff --git a/src/cpu/o3/BaseO3CPU.py b/src/cpu/o3/BaseO3CPU.py index b1f6979368..88c03f1dd6 100644 --- a/src/cpu/o3/BaseO3CPU.py +++ b/src/cpu/o3/BaseO3CPU.py @@ -51,7 +51,7 @@ class SMTFetchPolicy(ScopedEnum): vals = [ 'RoundRobin', 'Branch', 'IQCount', 'LSQCount' ] class SMTQueuePolicy(ScopedEnum): - vals = [ 'Dynamic', 'Partitioned', 'Threshold' ] + vals = [ 'Dynamic', 'Partitioned', 'Threshold', 'DynamicBorrowing' ] class SMTLSQMode(ScopedEnum): vals = [ 'Independent', 'Shared' ] @@ -248,6 +248,16 @@ def support_take_over(cls): "SMT ROB Sharing Policy") smtROBThreshold = Param.Int(100, "SMT ROB Threshold Sharing Parameter") smtCommitPolicy = Param.CommitPolicy('RoundRobin', "SMT Commit Policy") + smtBorrowThrottleCycles = Param.Unsigned( + 8, "Cycles to keep a backend-stalled SMT thread throttled at fetch") + smtBorrowLdstqHighWater = Param.Unsigned( + 0, "Explicit SMT borrowing LSQ high-water threshold; 0 uses percentage") + smtBorrowLdstqHighWaterPercent = Param.Percent( + 75, "SMT borrowing LSQ high-water threshold as a percentage of LQ+SQ") + 
smtBorrowDonorHoldCycles = Param.Unsigned( + 8, "Cycles to keep an SMT thread marked as a ROB borrowing donor") + smtBorrowDonorReserveEntries = Param.Unsigned( + 8, "Minimum ROB entries reserved for a borrowing donor to resume") branchPred = Param.BranchPredictor(DecoupledBPUWithBTB(), "Branch Predictor") diff --git a/src/cpu/o3/comm.hh b/src/cpu/o3/comm.hh index ade70ed5e3..8d8c8cdd7d 100644 --- a/src/cpu/o3/comm.hh +++ b/src/cpu/o3/comm.hh @@ -350,6 +350,133 @@ struct TimeStruct CommitComm commitInfo[MaxThreads];// commit to iew, rename, fetch }; +inline bool +smtCanDonateRobHeadroom(StallReason reason) +{ + switch (reason) { + case NoStall: + case ROBFull: + case RegFull: + case MemDQBandwidth: + case IntDQBandwidth: + case FVDQBandwidth: + case VectorReadyButNotIssued: + case ScalarReadyButNotIssued: + case CommitSquash: + return false; + default: + return true; + } +} + +inline bool +smtIsMemoryPressureReason(StallReason reason) +{ + switch (reason) { + case DTlbStall: + case LoadL2Bound: + case LoadL3Bound: + case LoadMemBound: + case StoreL2Bound: + case StoreL3Bound: + case StoreMemBound: + case MemSquashed: + case MemNotReady: + case MemCommitRateLimit: + case Atomic: + case OtherMemStall: + return true; + default: + return false; + } +} + +inline bool +smtHasBorrowThrottleStall(const TimeStruct::IewComm &info) +{ + return smtCanDonateRobHeadroom(info.robHeadStallReason) || + smtCanDonateRobHeadroom(info.lqHeadStallReason) || + smtCanDonateRobHeadroom(info.sqHeadStallReason); +} + +inline bool +smtHasMemoryPressure(const TimeStruct::IewComm &info, + unsigned ldstqHighWater = 0) +{ + if (ldstqHighWater != 0 && info.ldstqCount >= ldstqHighWater) { + return true; + } + + return smtIsMemoryPressureReason(info.robHeadStallReason) || + smtIsMemoryPressureReason(info.lqHeadStallReason) || + smtIsMemoryPressureReason(info.sqHeadStallReason); +} + +inline uint64_t +smtBorrowPriority(const TimeStruct::IewComm &info) +{ + constexpr uint64_t backend_stall_penalty = 
1ULL << 48; + constexpr uint64_t memory_pressure_penalty = 1ULL << 49; + + uint64_t score = static_cast(info.robCount) + + static_cast(info.iqCount) * 2 + + static_cast(info.ldstqCount) * 4; + + if (smtHasBorrowThrottleStall(info)) { + score += backend_stall_penalty; + } + if (smtHasMemoryPressure(info)) { + score += memory_pressure_penalty; + } + + return score; +} + +struct SmtActiveThreadFreeze +{ + ThreadID previousActive = InvalidThreadID; + bool freezeCurrent = false; +}; + +class SmtActiveThreadArbiter +{ + public: + static constexpr uint64_t InvalidScore = static_cast(-1); + + SmtActiveThreadFreeze observe(ThreadID tid, uint64_t score) + { + if (score < bestScore) { + selectedTid = tid; + bestScore = score; + } + + if (freezeActive) { + SmtActiveThreadFreeze freeze; + freeze.freezeCurrent = true; + return freeze; + } + + if (firstActiveTid == InvalidThreadID) { + firstActiveTid = tid; + return {}; + } + + freezeActive = true; + SmtActiveThreadFreeze freeze; + freeze.previousActive = firstActiveTid; + freeze.freezeCurrent = true; + return freeze; + } + + ThreadID selected() const { return selectedTid; } + + private: + ThreadID selectedTid = InvalidThreadID; + ThreadID firstActiveTid = InvalidThreadID; + bool freezeActive = false; + uint64_t bestScore = InvalidScore; +}; + struct StallSignals { diff --git a/src/cpu/o3/commit.cc b/src/cpu/o3/commit.cc index 2257e1bd46..bfbe23550f 100644 --- a/src/cpu/o3/commit.cc +++ b/src/cpu/o3/commit.cc @@ -147,6 +147,7 @@ Commit::Commit(CPU *_cpu, branch_prediction::BPredUnit *_bp, const BaseO3CPUPara renameWidth(params.renameWidth), commitWidth(params.commitWidth), numThreads(params.numThreads), + smtBorrowDonorHoldCycles(params.smtBorrowDonorHoldCycles), drainPending(false), drainImminent(false), trapLatency(params.trapLatency), @@ -175,6 +176,7 @@ Commit::Commit(CPU *_cpu, branch_prediction::BPredUnit *_bp, const BaseO3CPUPara for (ThreadID tid = 0; tid < numThreads; tid++) { commitStatus[tid] = Idle; 
changedROBNumEntries[tid] = false; + borrowingDonorCycles[tid] = 0; trapSquash[tid] = false; tcSquash[tid] = false; squashAfterInst[tid] = nullptr; @@ -563,6 +565,7 @@ Commit::takeOverFrom() for (ThreadID tid = 0; tid < numThreads; tid++) { commitStatus[tid] = Idle; changedROBNumEntries[tid] = false; + borrowingDonorCycles[tid] = 0; trapSquash[tid] = false; tcSquash[tid] = false; squashAfterInst[tid] = NULL; @@ -1974,16 +1977,40 @@ Commit::moveInstsToBuffer() for (int i = 0; i < insts_from_rename; ++i) { const DynInstPtr &inst = fromRename->insts[i]; assert(inst->threadNumber == tid); - if (!inst->isSquashed()) - fixedbuffer[tid].push_back(inst); + if (!inst->isSquashed()) { + fixedbuffer[tid].push_back(inst); + } + } + } + + for (int i = 0; i < numThreads; ++i) { + bool has_buffered_rename = !fixedbuffer[i].empty(); + bool donor = false; + + if (has_buffered_rename) { + borrowingDonorCycles[i] = 0; + } else { + donor = smtHasBorrowThrottleStall(robInfoFromIEW->iewInfo[i]); + if (donor) { + borrowingDonorCycles[i] = smtBorrowDonorHoldCycles; + } else if (borrowingDonorCycles[i] > 0) { + --borrowingDonorCycles[i]; + } + donor = borrowingDonorCycles[i] > 0; } + + rob->setBorrowingDonor(i, donor); } // check threads stall & status - ThreadID tid = InvalidThreadID; + SmtActiveThreadArbiter active_arbiter; + auto freezeActiveThread = [this](ThreadID tid) { + stallSig->blockIEW[tid] = true; + stallSig->iewBlockReason[tid] = StallReason::OtherFragStall; + }; for (int i = 0; i < numThreads; i++) { bool robblock = commitStatus[i] == ROBSquashing || commitStatus[i] == TrapPending; - bool block = (rob->getMaxEntries(i) - rob->getThreadEntries(i) < fixedbuffer[i].size()) || robblock; + bool block = !rob->canAllocate(i, fixedbuffer[i].size()) || robblock; bool active = !block && !fixedbuffer[i].empty(); StallReason block_reason = StallReason::NoStall; if (robblock) { @@ -1999,16 +2026,17 @@ Commit::moveInstsToBuffer() stallSig->blockIEW[i] = block; stallSig->iewBlockReason[i] = 
block ? block_reason : StallReason::NoStall; if (active) { - if (tid == InvalidThreadID) tid = i; - else { - // if there are multiple active threads, must exhaust all threads first - // to avoid starvation of other threads and also avoid resource conflict - stallSig->blockIEW[tid] = true; - stallSig->blockIEW[i] = true; - DPRINTF(IEW, "Multiple active threads detected, blocking all threads\n"); + const auto freeze = active_arbiter.observe( + i, smtBorrowPriority(robInfoFromIEW->iewInfo[i])); + if (freeze.previousActive != InvalidThreadID) { + freezeActiveThread(freeze.previousActive); + } + if (freeze.freezeCurrent) { + freezeActiveThread(i); } } } + const ThreadID tid = active_arbiter.selected(); if (tid == InvalidThreadID) { DPRINTF(Commit, "No instructions from Rename stage.\n"); return; @@ -2028,7 +2056,7 @@ Commit::moveInstsToBuffer() rob->insertInst(inst); - assert(rob->getThreadEntries(tid) <= rob->getMaxEntries(tid)); + assert(rob->canAllocate(tid, 0)); youngestSeqNum[tid] = inst->seqNum; } else { diff --git a/src/cpu/o3/commit.hh b/src/cpu/o3/commit.hh index 27ac59157e..510b88a3b7 100644 --- a/src/cpu/o3/commit.hh +++ b/src/cpu/o3/commit.hh @@ -457,6 +457,9 @@ class Commit */ bool changedROBNumEntries[MaxThreads]; + /** Donor hysteresis for dynamic ROB borrowing. */ + unsigned borrowingDonorCycles[MaxThreads]; + /** Records if a thread has to squash this cycle due to a trap. */ bool trapSquash[MaxThreads]; @@ -497,6 +500,9 @@ class Commit /** Number of Active Threads */ const ThreadID numThreads; + /** Cycles to keep a stalled thread marked as a ROB borrowing donor. */ + const unsigned smtBorrowDonorHoldCycles; + /** Is a drain pending? 
Commit is looking for an instruction boundary while * there are no pending interrupts */ diff --git a/src/cpu/o3/decode.cc b/src/cpu/o3/decode.cc index 0d36e05a85..d24e1d1efc 100644 --- a/src/cpu/o3/decode.cc +++ b/src/cpu/o3/decode.cc @@ -486,8 +486,14 @@ Decode::tick() checkSquash(); // check threads stall & status - ThreadID tid = InvalidThreadID; ThreadID blocked_tid = InvalidThreadID; + SmtActiveThreadArbiter active_arbiter; + auto freezeActiveThread = [this](ThreadID tid) { + stallSig->blockFetch[tid] = true; + stallSig->fetchBlockReason[tid] = StallReason::OtherFragStall; + toFetch->decodeInfo[tid].blockReason = + stallSig->fetchBlockReason[tid]; + }; const bool fifoBackpressured = !stallBuffer.empty() && eachstallSize.size() + decodeToFetchDelay + 1 >= @@ -520,19 +526,19 @@ Decode::tick() StallReason::NoStall; toFetch->decodeInfo[i].blockReason = stallSig->fetchBlockReason[i]; if (active) { - if (tid == InvalidThreadID) - tid = i; - else { - // if there are multiple active threads, must exhaust all threads first - // to avoid starvation of other threads and also avoid resource conflict - stallSig->blockFetch[tid] = true; - stallSig->blockFetch[i] = true; - DPRINTF(Decode, "Multiple active threads detected, blocking all threads\n"); + const auto freeze = active_arbiter.observe( + i, smtBorrowPriority(fromIEW->iewInfo[i])); + if (freeze.previousActive != InvalidThreadID) { + freezeActiveThread(freeze.previousActive); + } + if (freeze.freezeCurrent) { + freezeActiveThread(i); } } else if (block && blocked_tid == InvalidThreadID) { blocked_tid = i; } } + const ThreadID tid = active_arbiter.selected(); if (tid == InvalidThreadID) { // all threads are stalled, no need to process if (blocked_tid != InvalidThreadID) { diff --git a/src/cpu/o3/fetch.cc b/src/cpu/o3/fetch.cc index 0642e5a587..5dc44ac398 100644 --- a/src/cpu/o3/fetch.cc +++ b/src/cpu/o3/fetch.cc @@ -116,6 +116,7 @@ Fetch::Fetch(CPU *_cpu, const BaseO3CPUParams ¶ms) "\tincrease MaxWidth in 
src/cpu/o3/limits.hh\n", fetchWidth, static_cast(MaxWidth)); + smtBorrowThrottleHoldCycles = params.smtBorrowThrottleCycles; for (int i = 0; i < MaxThreads; i++) { setThreadStatus(i, Idle); decoder[i] = nullptr; @@ -123,6 +124,13 @@ Fetch::Fetch(CPU *_cpu, const BaseO3CPUParams ¶ms) macroop[i] = nullptr; delayedCommit[i] = false; lastIcacheStall[i] = 0; + smtBorrowThrottleCycles[i] = 0; + } + smtLdstqHighWater = params.smtBorrowLdstqHighWater; + if (smtLdstqHighWater == 0) { + smtLdstqHighWater = + (params.LQEntries + params.SQEntries) * + params.smtBorrowLdstqHighWaterPercent / 100; } branchPred = params.branchPred; @@ -503,6 +511,7 @@ Fetch::resetStage() priorityList.push_back(tid); waitForVsetvl[tid] = false; + smtBorrowThrottleCycles[tid] = 0; } wroteToTimeBuffer = false; @@ -1386,36 +1395,67 @@ Fetch::handleInterrupts() ThreadID Fetch::selectUnstalledThread() { + ThreadID selected = InvalidThreadID; + bool has_candidate = false; + bool has_unthrottled_candidate = false; - // if (numThreads == 1) { - // return 0; - // } - ThreadID selected = -1; - bool all_stalled = true; for (ThreadID tid = 0; tid < numThreads; ++tid) { - if (!stallSig->blockFetch[tid] &&fetchQueue[tid].size() > 0) { - lsqCounter->setCounter(tid, fromIEW->iewInfo[tid].ldstqCount); - iqCounter->setCounter(tid, fromIEW->iewInfo[tid].iqCount); - robCounter->setCounter(tid, fromIEW->iewInfo[tid].robCount); - all_stalled = false; - - }else { + const bool candidate = !stallSig->blockFetch[tid] && + !fetchQueue[tid].empty(); + if (!candidate) { + smtBorrowThrottleCycles[tid] = 0; lsqCounter->setCounter(tid, UINT64_MAX); iqCounter->setCounter(tid, UINT64_MAX); robCounter->setCounter(tid, UINT64_MAX); - + continue; + } + has_candidate = true; + + const bool throttle_now = + smtHasBorrowThrottleStall(fromIEW->iewInfo[tid]) || + smtHasMemoryPressure(fromIEW->iewInfo[tid], smtLdstqHighWater); + if (throttle_now) { + smtBorrowThrottleCycles[tid] = smtBorrowThrottleHoldCycles; + } else if 
(smtBorrowThrottleCycles[tid] > 0) { + --smtBorrowThrottleCycles[tid]; } - if(all_stalled) - { - selected = -1; - }else{ - selected = decodeScheduler->getThread(); + const bool throttled = smtBorrowThrottleCycles[tid] > 0; + if (!throttled) { + has_unthrottled_candidate = true; } - DPRINTF(Fetch, "lsqCounter->setCounter: %d iqCounter->setCounter: %d robCounter->setCounter: %d\n",fromIEW->iewInfo[tid].ldstqCount,fromIEW->iewInfo[tid].iqCount,fromIEW->iewInfo[tid].robCount); + + lsqCounter->setCounter( + tid, throttled ? UINT64_MAX : fromIEW->iewInfo[tid].ldstqCount); + iqCounter->setCounter( + tid, throttled ? UINT64_MAX : fromIEW->iewInfo[tid].iqCount); + robCounter->setCounter( + tid, throttled ? UINT64_MAX : fromIEW->iewInfo[tid].robCount); + + DPRINTF(Fetch, + "[tid:%i] lsq=%u iq=%u rob=%u throttled=%u mem_pressure=%u hold=%u\n", + tid, fromIEW->iewInfo[tid].ldstqCount, + fromIEW->iewInfo[tid].iqCount, fromIEW->iewInfo[tid].robCount, + throttled, + smtHasMemoryPressure(fromIEW->iewInfo[tid], smtLdstqHighWater), + smtBorrowThrottleCycles[tid]); + } + + if (has_candidate && !has_unthrottled_candidate) { + for (ThreadID tid = 0; tid < numThreads; ++tid) { + if (stallSig->blockFetch[tid] || fetchQueue[tid].empty()) { + continue; + } + lsqCounter->setCounter(tid, fromIEW->iewInfo[tid].ldstqCount); + iqCounter->setCounter(tid, fromIEW->iewInfo[tid].iqCount); + robCounter->setCounter(tid, fromIEW->iewInfo[tid].robCount); + } + } + + if (has_candidate) { + selected = decodeScheduler->getThread(); } - return selected; } diff --git a/src/cpu/o3/fetch.hh b/src/cpu/o3/fetch.hh index c76bb8d77f..3c874749af 100644 --- a/src/cpu/o3/fetch.hh +++ b/src/cpu/o3/fetch.hh @@ -242,6 +242,10 @@ class Fetch InstsCounter* iqCounter; InstsCounter* robCounter; + unsigned smtBorrowThrottleCycles[MaxThreads]; + unsigned smtBorrowThrottleHoldCycles; + unsigned smtLdstqHighWater; + // Configuration parameters std::string smtDecodePolicy ="multi_priority"; int delayedSchedulerDelay; diff --git 
a/src/cpu/o3/iew.cc b/src/cpu/o3/iew.cc index 03076739e3..491ceda48a 100644 --- a/src/cpu/o3/iew.cc +++ b/src/cpu/o3/iew.cc @@ -937,8 +937,26 @@ IEW::dispatchInsts() } // check threads stall & status - ThreadID tid = InvalidThreadID; + SmtActiveThreadArbiter active_arbiter; + auto freezeActiveThread = [this](ThreadID tid) { + stallSig->blockRename[tid] = true; + stallSig->renameBlockReason[tid] = StallReason::OtherFragStall; + toRename->iewInfo[tid].blockReason = StallReason::OtherFragStall; + }; for (int i = 0; i < numThreads; i++) { + auto &iew_info = toRename->iewInfo[i]; + iew_info.robHeadStallReason = + checkDispatchStall(i, NumDQ, nullptr, -1); + iew_info.lqHeadStallReason = + ldstQueue.lqEmpty(i) ? StallReason::NoStall : + checkLSQStall(i, true); + iew_info.sqHeadStallReason = + ldstQueue.sqEmpty(i) ? StallReason::NoStall : + checkLSQStall(i, false); + iew_info.ldstqCount = ldstQueue.getCount(i); + iew_info.robCount = rob->getThreadEntries(i); + iew_info.iqCount = scheduler->getIQInsts(i); + bool ldst_block = !canInsertLDSTQue(i); bool block = stallSig->blockIEW[i] || ldst_block; bool active = !block && !fixedbuffer[i].empty(); @@ -946,25 +964,27 @@ IEW::dispatchInsts() if (stallSig->blockIEW[i]) { block_reason = stallSig->iewBlockReason[i]; } else if (ldst_block) { - block_reason = checkDispatchStall(i, NumDQ, nullptr, -1); + block_reason = iew_info.robHeadStallReason; if (block_reason == StallReason::NoStall) { block_reason = StallReason::OtherStall; } } + iew_info.blockReason = block ? block_reason : StallReason::NoStall; stallSig->blockRename[i] = block; stallSig->renameBlockReason[i] = block ? 
block_reason : StallReason::NoStall; if (active) { - if (tid == InvalidThreadID) tid = i; - else { - // if there are multiple active threads, must exhaust all threads first - // to avoid starvation of other threads and also avoid resource conflict - stallSig->blockRename[tid] = true; - stallSig->blockRename[i] = true; - DPRINTF(IEW, "Multiple active threads detected, blocking all threads\n"); + const auto freeze = + active_arbiter.observe(i, smtBorrowPriority(iew_info)); + if (freeze.previousActive != InvalidThreadID) { + freezeActiveThread(freeze.previousActive); + } + if (freeze.freezeCurrent) { + freezeActiveThread(i); } } } + const ThreadID tid = active_arbiter.selected(); if (tid != InvalidThreadID) { DPRINTF(IEW,"Processing [tid:%i]\n",tid); @@ -978,6 +998,9 @@ IEW::dispatchInsts() // check stall again if (!fixedbuffer[tid].empty()) { stallSig->blockRename[tid] = true; + stallSig->renameBlockReason[tid] = + blockReason == StallReason::NoStall ? + StallReason::OtherFragStall : blockReason; DPRINTF(IEW, "Dispatch bandwidth full, blocking thread %i\n", tid); } @@ -987,6 +1010,9 @@ IEW::dispatchInsts() toRename->iewInfo[tid].sqHeadStallReason = ldstQueue.sqEmpty(tid) ? 
StallReason::NoStall : checkLSQStall(tid, false); toRename->iewInfo[tid].blockReason = blockReason; + toRename->iewInfo[tid].ldstqCount = ldstQueue.getCount(tid); + toRename->iewInfo[tid].robCount = rob->getThreadEntries(tid); + toRename->iewInfo[tid].iqCount = scheduler->getIQInsts(tid); } } diff --git a/src/cpu/o3/lsq.cc b/src/cpu/o3/lsq.cc index 33ffdeb593..eb6bdb4ce6 100644 --- a/src/cpu/o3/lsq.cc +++ b/src/cpu/o3/lsq.cc @@ -527,7 +527,8 @@ LSQ::LSQ(CPU *cpu_ptr, IEW *iew_ptr, const BaseO3CPUParams &params) smtLSQThreshold == 0, "SMT LSQ threshold must be non-zero in shared threshold mode"); - if (lsqPolicy == SMTQueuePolicy::Dynamic) { + if (lsqPolicy == SMTQueuePolicy::Dynamic || + lsqPolicy == SMTQueuePolicy::DynamicBorrowing) { DPRINTF(LSQ, "LSQ mode set to Shared/Dynamic: %u LQ and %u SQ " "entries are shared across active SMT threads, along " "with %u RARQ and %u RAWQ entries\n", @@ -539,7 +540,7 @@ LSQ::LSQ(CPU *cpu_ptr, IEW *iew_ptr, const BaseO3CPUParams &params) smtLSQThreshold); } else { panic("Invalid LSQ sharing policy. Options are: Dynamic, " - "Partitioned, Threshold"); + "Partitioned, Threshold, DynamicBorrowing"); } } else { panic("Invalid SMT LSQ mode. Options are: Independent, Shared"); @@ -1580,6 +1581,7 @@ LSQ::sharedLSQAllocation(unsigned entries) const switch (lsqPolicy) { case SMTQueuePolicy::Dynamic: + case SMTQueuePolicy::DynamicBorrowing: return entries; case SMTQueuePolicy::Partitioned: return entries / active_threads; @@ -1588,7 +1590,7 @@ LSQ::sharedLSQAllocation(unsigned entries) const std::min(entries, smtLSQThreshold); default: panic("Invalid LSQ sharing policy.
Options are: Dynamic, " - "Partitioned, Threshold"); + "Partitioned, Threshold, DynamicBorrowing"); } } @@ -1927,7 +1929,8 @@ LSQ::isStalled() bool LSQ::isStalled(ThreadID tid) { - if (lsqPolicy == SMTQueuePolicy::Dynamic) + if (lsqPolicy == SMTQueuePolicy::Dynamic || + lsqPolicy == SMTQueuePolicy::DynamicBorrowing) return isStalled(); else return thread[tid].isStalled(); diff --git a/src/cpu/o3/rename.cc b/src/cpu/o3/rename.cc index 33d7852a87..9e00c2bbef 100644 --- a/src/cpu/o3/rename.cc +++ b/src/cpu/o3/rename.cc @@ -355,8 +355,14 @@ Rename::tick() releasePhysRegs(); // check threads stall & status - ThreadID tid = InvalidThreadID; ThreadID blocked_tid = InvalidThreadID; + SmtActiveThreadArbiter active_arbiter; + auto freezeActiveThread = [this](ThreadID tid) { + stallSig->blockDecode[tid] = true; + stallSig->decodeBlockReason[tid] = StallReason::OtherFragStall; + toDecode->renameInfo[tid].blockReason = + stallSig->decodeBlockReason[tid]; + }; for (int i = 0; i < numThreads; i++) { bool can_rename = canRename(i); bool block = stallSig->blockRename[i] || !can_rename; @@ -394,18 +400,19 @@ Rename::tick() stallSig->blockDecode[i] ? 
block_reason : StallReason::NoStall; toDecode->renameInfo[i].blockReason = stallSig->decodeBlockReason[i]; if (active) { - if (tid == InvalidThreadID) tid = i; - else { - // if there are multiple active threads, must exhaust all threads first - // to avoid starvation of other threads and also avoid resource conflict - stallSig->blockDecode[tid] = true; - stallSig->blockDecode[i] = true; - DPRINTF(Rename, "Multiple active threads detected, blocking all threads\n"); + const auto freeze = active_arbiter.observe( + i, smtBorrowPriority(fromIEW->iewInfo[i])); + if (freeze.previousActive != InvalidThreadID) { + freezeActiveThread(freeze.previousActive); + } + if (freeze.freezeCurrent) { + freezeActiveThread(i); } } else if (stallSig->blockDecode[i] && blocked_tid == InvalidThreadID) { blocked_tid = i; } } + const ThreadID tid = active_arbiter.selected(); if (tid == InvalidThreadID) { // all threads are stalled, no need to process diff --git a/src/cpu/o3/rob.cc b/src/cpu/o3/rob.cc index d57ea8b0df..6fe0c6467a 100644 --- a/src/cpu/o3/rob.cc +++ b/src/cpu/o3/rob.cc @@ -40,6 +40,7 @@ #include "cpu/o3/rob.hh" +#include #include #include "base/logging.hh" @@ -131,6 +132,7 @@ ROB::allocateGroup_kmhv3(const DynInstPtr inst, ThreadID tid) ROB::ROB(CPU *_cpu, const BaseO3CPUParams &params) : robPolicy(params.smtROBPolicy), + borrowingDonorReserveEntries(params.smtBorrowDonorReserveEntries), robWalkPolicy(params.robWalkPolicy), cpu(_cpu), numEntries(params.numROBEntries), @@ -142,6 +144,10 @@ ROB::ROB(CPU *_cpu, const BaseO3CPUParams &params) numThreads(params.numThreads), stats(_cpu) { + for (ThreadID tid = 0; tid < MaxThreads; ++tid) { + borrowingDonor[tid] = false; + } + //Figure out rob policy if (robPolicy == SMTQueuePolicy::Dynamic) { //Set Max Entries to Total ROB Capacity @@ -149,6 +155,14 @@ ROB::ROB(CPU *_cpu, const BaseO3CPUParams &params) maxEntries[tid] = numEntries; } + } else if (robPolicy == SMTQueuePolicy::DynamicBorrowing) { + DPRINTF(Fetch, "ROB sharing policy set to
DynamicBorrowing\n"); + + int part_amt = numEntries / numThreads; + for (ThreadID tid = 0; tid < numThreads; tid++) { + maxEntries[tid] = part_amt; + } + } else if (robPolicy == SMTQueuePolicy::Partitioned) { DPRINTF(Fetch, "ROB sharing policy set to Partitioned\n"); @@ -212,6 +226,7 @@ ROB::resetState() squashIt[tid] = instList[tid].end(); squashedSeqNum[tid] = 0; doneSquashing[tid] = true; + borrowingDonor[tid] = false; } numInstsInROB = 0; @@ -262,6 +277,8 @@ ROB::resetEntries() if (robPolicy == SMTQueuePolicy::Partitioned) { maxEntries[tid] = numEntries / active_threads; + } else if (robPolicy == SMTQueuePolicy::DynamicBorrowing) { + maxEntries[tid] = numEntries / active_threads; } else if (robPolicy == SMTQueuePolicy::Threshold && active_threads == 1) { maxEntries[tid] = numEntries; @@ -275,11 +292,95 @@ ROB::entryAmount(ThreadID num_threads) { if (robPolicy == SMTQueuePolicy::Partitioned) { return numEntries / num_threads; + } else if (robPolicy == SMTQueuePolicy::DynamicBorrowing) { + return numEntries / num_threads; } else { return 0; } } +unsigned +ROB::activeThreadCount() const +{ + if (!activeThreads || activeThreads->empty()) { + return numThreads == 0 ? 
1 : numThreads; + } + return activeThreads->size(); +} + +unsigned +ROB::totalEntries() const +{ + unsigned total = 0; + for (ThreadID tid = 0; tid < numThreads; ++tid) { + total += threadGroups[tid].size(); + } + return total; +} + +bool +ROB::canBorrow(ThreadID tid) const +{ + return robPolicy == SMTQueuePolicy::DynamicBorrowing && + tid < numThreads; +} + +unsigned +ROB::borrowingLimit(ThreadID tid) const +{ + if (tid >= numThreads) { + return 0; + } + + if (!canBorrow(tid)) { + return maxEntries[tid]; + } + + const unsigned active_threads = std::max(1U, activeThreadCount()); + const unsigned base = std::max(1U, numEntries / active_threads); + const unsigned donor_resume_quota = + std::min(base, borrowingDonorReserveEntries); + + unsigned reserved = 0; + for (ThreadID other = 0; other < numThreads; ++other) { + if (other == tid) { + continue; + } + + const unsigned reserve = + borrowingDonor[other] ? donor_resume_quota : base; + const unsigned used = threadGroups[other].size(); + if (used < reserve) { + reserved += reserve - used; + } + } + + if (reserved >= numEntries) { + return 0; + } + + return numEntries - reserved; +} + +bool +ROB::canAllocate(ThreadID tid, unsigned entries) const +{ + if (tid >= numThreads) { + return false; + } + + const unsigned used = threadGroups[tid].size(); + + if (robPolicy == SMTQueuePolicy::DynamicBorrowing) { + if (totalEntries() + entries > numEntries) { + return false; + } + return used + entries <= borrowingLimit(tid); + } + + return used + entries <= maxEntries[tid]; +} + int ROB::countInsts() { @@ -354,6 +455,7 @@ ROB::insertInst(const DynInstPtr &inst) assert(numInstsInROB <= numEntries * instsPerGroup); ThreadID tid = inst->threadNumber; + assert(canAllocate(tid, 1)); // allocate group bool alloc = (this->*allocateNewGroup)(inst, tid); @@ -508,6 +610,15 @@ ROB::getHeadGroupLastDoneSeq(ThreadID tid) unsigned ROB::numFreeEntries(ThreadID tid) { + if (robPolicy == SMTQueuePolicy::DynamicBorrowing) { + const unsigned limit = 
borrowingLimit(tid); + const unsigned used = threadGroups[tid].size(); + if (limit <= used) { + return 0; + } + return limit - used; + } + return maxEntries[tid] - threadGroups[tid].size(); } diff --git a/src/cpu/o3/rob.hh b/src/cpu/o3/rob.hh index 94b93d2593..e5726d0c02 100644 --- a/src/cpu/o3/rob.hh +++ b/src/cpu/o3/rob.hh @@ -90,9 +90,15 @@ class ROB /** Per-thread ROB status. */ Status robStatus[MaxThreads]; + /** Whether a thread may donate unused ROB headroom this cycle. */ + bool borrowingDonor[MaxThreads]; + /** ROB resource sharing policy for SMT mode. */ SMTQueuePolicy robPolicy; + /** Minimum entries a donor thread keeps for restarting after a stall. */ + const unsigned borrowingDonorReserveEntries; + ROBWalkPolicy robWalkPolicy; bool allocateGroup_none(const DynInstPtr inst, ThreadID tid); @@ -100,6 +106,11 @@ class ROB bool allocateGroup_MohBoE(const DynInstPtr inst, ThreadID tid); bool allocateGroup_kmhv3(const DynInstPtr inst, ThreadID tid); + unsigned activeThreadCount() const; + unsigned borrowingLimit(ThreadID tid) const; + unsigned totalEntries() const; + bool canBorrow(ThreadID tid) const; + public: /** ROB constructor. * @param _cpu The cpu object pointer. @@ -188,7 +199,19 @@ class ROB /** Returns the maximum number of entries for a specific thread. */ unsigned getMaxEntries(ThreadID tid) - { return maxEntries[tid]; } + { + if (tid >= numThreads) { + return 0; + } + return canBorrow(tid) ? borrowingLimit(tid) : maxEntries[tid]; + } + + /** Sets whether the thread may donate unused ROB headroom. */ + void setBorrowingDonor(ThreadID tid, bool donor) + { borrowingDonor[tid] = donor; } + + /** Returns whether the thread can reserve the requested ROB entries. */ + bool canAllocate(ThreadID tid, unsigned entries) const; /** Returns the number of entries being used by a specific thread. */ unsigned getThreadEntries(ThreadID tid) @@ -197,6 +220,9 @@ class ROB /** Returns if the ROB is full.
*/ bool isFull() { + if (robPolicy == SMTQueuePolicy::DynamicBorrowing) { + return totalEntries() >= numEntries; + } for (int i =0;i Date: Tue, 19 May 2026 17:19:54 +0800 Subject: [PATCH 31/38] cpu-o3: Guard empty LSQ head stall checks Change-Id: I735b9557f8f8d69f094121bb6229bd3ab7682f49 --- src/cpu/o3/iew.cc | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/cpu/o3/iew.cc b/src/cpu/o3/iew.cc index 491ceda48a..d0ac1cbdb8 100644 --- a/src/cpu/o3/iew.cc +++ b/src/cpu/o3/iew.cc @@ -2190,8 +2190,15 @@ IEW::checkDispatchStall(ThreadID tid, int dq_stall, const DynInstPtr &dispatch_i if (head_inst->isNonSpeculative()) { return StallReason::SerializeStall; } else if (head_inst->isLoad() && ldstQueue.lqFull(tid)) { + if (ldstQueue.lqEmpty(tid)) { + return StallReason::InstNotReady; + } return checkLSQStall(tid, true); - } else if ((head_inst->isStore() || head_inst->isAtomic()) && ldstQueue.sqFull(tid)) { + } else if ((head_inst->isStore() || head_inst->isAtomic()) && + ldstQueue.sqFull(tid)) { + if (ldstQueue.sqEmpty(tid)) { + return StallReason::InstNotReady; + } return checkLSQStall(tid, false); } else { return StallReason::InstNotReady; @@ -2222,6 +2229,11 @@ IEW::checkDispatchStall(ThreadID tid, int dq_stall, const DynInstPtr &dispatch_i StallReason IEW::checkLSQStall(ThreadID tid, bool isLoad) { + if ((isLoad && ldstQueue.lqEmpty(tid)) || + (!isLoad && ldstQueue.sqEmpty(tid))) { + return StallReason::InstNotReady; + } + DynInstPtr head_inst = ldstQueue.getLSQHeadInst(tid, isLoad); return checkLoadStoreInst(head_inst); } From 1d2a555f52d0c43a465afcc63185ce5a0210a5c1 Mon Sep 17 00:00:00 2001 From: tastynoob <934348725@qq.com> Date: Wed, 20 May 2026 10:35:52 +0800 Subject: [PATCH 32/38] arch-riscv: Isolate old TLB privilege by thread Change-Id: Ie6f7cfc4bd2d2d8f7aad6c9e7bace19534b77c00 --- src/arch/riscv/tlb.cc | 52 ++++++++++++++++++++++++++++++++++++------- src/arch/riscv/tlb.hh | 23 ++++++++++--------- src/cpu/o3/commit.cc | 6 ++--- 3 
files changed, 58 insertions(+), 23 deletions(-) diff --git a/src/arch/riscv/tlb.cc b/src/arch/riscv/tlb.cc index 050e0735b8..0004d18a66 100644 --- a/src/arch/riscv/tlb.cc +++ b/src/arch/riscv/tlb.cc @@ -2147,21 +2147,57 @@ TLB::doTranslate(const RequestPtr &req, ThreadContext *tc, return NoFault; } PrivilegeMode -TLB::getMemPriv(ThreadContext *tc, BaseMMU::Mode mode) +TLB::currentMemPriv(ThreadContext *tc, BaseMMU::Mode mode) { - if (use_old_priv && mode != BaseMMU::Execute) { - if (mode == BaseMMU::Execute) { - return old_priv_ex; - } else { - return old_priv_ldst; - } - } STATUS status = (STATUS)tc->readMiscReg(MISCREG_STATUS); PrivilegeMode pmode = (PrivilegeMode)tc->readMiscReg(MISCREG_PRV); if (mode != BaseMMU::Execute && status.mprv == 1) pmode = (PrivilegeMode)(RegVal)status.mpp; return pmode; } + +PrivilegeMode +TLB::getMemPriv(ThreadContext *tc, BaseMMU::Mode mode) +{ + if (mode != BaseMMU::Execute) { + const int tid = tc->threadId(); + if (tid >= 0) { + const auto thread_idx = static_cast(tid); + if (thread_idx < oldPrivByThread.size() && + oldPrivByThread[thread_idx].valid) { + return oldPrivByThread[thread_idx].ldst; + } + } + } + return currentMemPriv(tc, mode); +} + +void +TLB::setOldPriv(ThreadContext *tc) +{ + const int tid = tc->threadId(); + assert(tid >= 0); + const auto thread_idx = static_cast(tid); + if (oldPrivByThread.size() <= thread_idx) { + oldPrivByThread.resize(thread_idx + 1); + } + oldPrivByThread[thread_idx].valid = true; + oldPrivByThread[thread_idx].ldst = currentMemPriv(tc, BaseMMU::Read); +} + +void +TLB::useNewPriv(ThreadContext *tc) +{ + const int tid = tc->threadId(); + if (tid < 0) { + return; + } + const auto thread_idx = static_cast(tid); + if (thread_idx < oldPrivByThread.size()) { + oldPrivByThread[thread_idx].valid = false; + } +} + bool TLB::hasTwoStageTranslation(ThreadContext *tc, const RequestPtr &req, BaseMMU::Mode mode) { diff --git a/src/arch/riscv/tlb.hh b/src/arch/riscv/tlb.hh index 5b94852e2f..99d57e0c3d 
100644 --- a/src/arch/riscv/tlb.hh +++ b/src/arch/riscv/tlb.hh @@ -34,6 +34,7 @@ #include #include +#include #include "arch/generic/tlb.hh" #include "arch/riscv/isa.hh" @@ -42,6 +43,7 @@ #include "arch/riscv/regs/misc.hh" #include "arch/riscv/utility.hh" #include "base/statistics.hh" +#include "base/types.hh" #include "mem/request.hh" #include "params/RiscvTLB.hh" #include "sim/sim_object.hh" @@ -107,9 +109,13 @@ class TLB : public BaseTLB uint64_t lastPc; uint64_t traceFlag; - bool use_old_priv; - PrivilegeMode old_priv_ldst; - PrivilegeMode old_priv_ex; + struct OldPrivState + { + bool valid = false; + PrivilegeMode ldst = PrivilegeMode::PRV_M; + }; + + std::vector<OldPrivState> oldPrivByThread; Walker *walker; @@ -253,6 +259,7 @@ class TLB : public BaseTLB BaseMMU::Translation *translation, BaseMMU::Mode mode) override; Fault finalizePhysical(const RequestPtr &req, ThreadContext *tc, BaseMMU::Mode mode) const override; + PrivilegeMode currentMemPriv(ThreadContext *tc, BaseMMU::Mode mode); TlbEntry *lookup(Addr vpn, uint16_t asid, BaseMMU::Mode mode, bool hidden, bool sign_used, uint8_t translateMode, bool is_prefetch = false); @@ -262,14 +269,8 @@ class TLB : public BaseTLB TlbEntry *lookupL2TLB(Addr vpn, uint16_t asid, BaseMMU::Mode mode, bool hidden, int f_level, bool sign_used, uint8_t translateMode); - void setOldPriv(ThreadContext *tc) { - use_old_priv = true; - old_priv_ex = getMemPriv(tc, BaseMMU::Execute); - old_priv_ldst = getMemPriv(tc, BaseMMU::Read); - } - void useNewPriv(ThreadContext *tc) { - use_old_priv = false; - } + void setOldPriv(ThreadContext *tc); + void useNewPriv(ThreadContext *tc); std::vector tlbL2L3; // our TLB diff --git a/src/cpu/o3/commit.cc b/src/cpu/o3/commit.cc index bfbe23550f..82884044de 100644 --- a/src/cpu/o3/commit.cc +++ b/src/cpu/o3/commit.cc @@ -964,9 +964,8 @@ Commit::handleInterrupt() cpu->difftestRaiseIntr(cpu->getInterruptsNO() | (1ULL << 63)); } traceLogHandleInterrupt(); - cpu->processInterrupts(cpu->getInterrupts()); -
cpu->mmu->setOldPriv(cpu->getContext(0)); + cpu->processInterrupts(cpu->getInterrupts()); thread[0]->noSquashFromTC = false; @@ -1789,12 +1788,11 @@ Commit::commitHead(const DynInstPtr &head_inst, unsigned inst_num) // needed to update the state as soon as possible. This // prevents external agents from changing any specific state // that the trap need. + cpu->mmu->setOldPriv(cpu->getContext(tid)); cpu->trap(inst_fault, tid, head_inst->notAnInst() ? nullStaticInstPtr : head_inst->staticInst); - cpu->mmu->setOldPriv(cpu->getContext(tid)); - // Exit state update mode to avoid accidental updating. thread[tid]->noSquashFromTC = false; From 43e1cc7338d35f7f7fc95f125734b00bf3ecf965 Mon Sep 17 00:00:00 2001 From: tastynoob <934348725@qq.com> Date: Thu, 21 May 2026 11:16:15 +0800 Subject: [PATCH 33/38] cpu-o3: Fix SMT AMO difftest snapshot Change-Id: Iccef736835125a5ef35efb06c3eeced41f189b6c --- src/cpu/o3/lsq_unit.cc | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/src/cpu/o3/lsq_unit.cc b/src/cpu/o3/lsq_unit.cc index c0e9283c55..be065323b9 100644 --- a/src/cpu/o3/lsq_unit.cc +++ b/src/cpu/o3/lsq_unit.cc @@ -367,19 +367,16 @@ LSQUnit::completeDataAccess(PacketPtr pkt) assert(size == inst->effSize); if (inst->isAtomic()) { - uint8_t current_golden[8] = {}; - panic_if(size > sizeof(current_golden), - "Unexpected AMO size %u at addr %#lx\n", + panic_if(size > sizeof(uint64_t), + "Unexpected AMO size %zu at addr %#lx\n", size, addr); - cpu->goldenMemManager()->readGoldenMem(addr, current_golden, - size); - // Preserve the DUT-observed old value until completeStore() - // derives the post-AMO memory image. The golden old-value - // snapshot used by difftest is captured when the request - // is first sent, before later concurrent updates can - // advance shared memory. + // derives the post-AMO memory image. 
Keep the actual + // response value for difftest, since the request may have + // been serialized behind another hart's AMO by the cache. inst->setGolden(loaded_data); + std::memcpy(inst->getAmoOldGoldenValuePtr(), loaded_data, + size); } else { // check data with golden mem uint8_t *golden_data = @@ -2933,7 +2930,6 @@ LSQUnit::completeStore(typename StoreQueue::iterator store_idx, bool from_sbuffe paddr, *((uint64_t *)(tmp_data)), 0xff, request->_size); cpu->goldenMemManager()->updateGoldenMem(paddr, tmp_data, 0xff, request->_size); - store_inst->setGolden(tmp_data); } } From e3751dfa79aae4bf9038363facd4387f1fc4055e Mon Sep 17 00:00:00 2001 From: Mo Haonan <66786667+mhnGitHubz@users.noreply.github.com> Date: Thu, 21 May 2026 11:27:23 +0800 Subject: [PATCH 34/38] cpu-o3: reserve a store buffer resource for each thread to prevent deadlock (#858) Co-authored-by: mo haonan --- src/cpu/o3/lsq.cc | 24 +++++++++++++++++++++++- src/cpu/o3/lsq.hh | 9 +++++++-- src/cpu/o3/lsq_unit.cc | 8 ++++---- 3 files changed, 34 insertions(+), 7 deletions(-) diff --git a/src/cpu/o3/lsq.cc b/src/cpu/o3/lsq.cc index eb6bdb4ce6..1877412495 100644 --- a/src/cpu/o3/lsq.cc +++ b/src/cpu/o3/lsq.cc @@ -245,6 +245,7 @@ LSQ::StoreBuffer::setData(std::vector &data_vec) this->data_vec = data_vec; int way = data_vec.size(); _size = 0; + max_size = way; lru_index.set_capacity(way); free_list.set_capacity(way); crossRef.resize(way); @@ -255,12 +256,26 @@ LSQ::StoreBuffer::setData(std::vector &data_vec) } } +void +LSQ::StoreBuffer::setMaxThread(ThreadID _max_thread) +{ + max_thread = _max_thread; + vld_cnt_vec.resize(max_thread, 0); +} + bool LSQ::StoreBuffer::full() const { return free_list.size() == 0; } +bool +LSQ::StoreBuffer::full(ThreadID tid) const +{ + assert(vld_cnt_vec[tid] <= max_size); + return (vld_cnt_vec[tid] == (max_size - max_thread + 1)); +} + uint64_t LSQ::StoreBuffer::size() const { @@ -326,6 +341,8 @@ LSQ::StoreBuffer::insert(StoreBufferEntry *entry) assert(!data_vld[index]); 
assert(!lru_index.full()); _size++; + vld_cnt_vec[tid]++; + assert(vld_cnt_vec[tid] <= max_size); auto [it, _] = data_map.insert({hashKey(tid, addr), data_vec[index]}); crossRef[index] = it; data_vld[index] = true; @@ -411,6 +428,9 @@ LSQ::StoreBuffer::createVice(StoreBufferEntry *entry) assert(!entry->vice); entry->vice = vice; data_vld[vice->index] = true; + assert(entry->tid < max_thread); + vld_cnt_vec[entry->tid]++; + assert(vld_cnt_vec[entry->tid] <= max_size); // do not insert map and lru_index return vice; } @@ -420,6 +440,8 @@ LSQ::StoreBuffer::release(StoreBufferEntry *entry) { assert(_size > 0); _size--; + vld_cnt_vec[entry->tid]--; + assert(vld_cnt_vec[entry->tid] >= 0); int index = entry->index; data_vld[index] = false; data_map.erase(crossRef[index]); @@ -563,7 +585,7 @@ LSQ::LSQ(CPU *cpu_ptr, IEW *iew_ptr, const BaseO3CPUParams &params) store_buffer_entries.push_back(new StoreBufferEntry(cpu->cacheLineSize(), i)); } storeBuffer.setData(store_buffer_entries); - + storeBuffer.setMaxThread(numThreads); bankOccupied.resize(dcacheSetDivNum, std::vector(numBank, false)); pendingDcacheRefill.resize(dcacheSetDivNum, false); dcacheRefillDataRead.resize(dcacheSetDivNum, 0); diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh index 397f372c26..9371d49987 100644 --- a/src/cpu/o3/lsq.hh +++ b/src/cpu/o3/lsq.hh @@ -189,12 +189,15 @@ class LSQ // key = (paddr & cacheblockmask) uint64_t _size = 0; + int max_size = 0; + int max_thread = 0; std::unordered_map data_map; std::vector crossRef; boost::circular_buffer lru_index; boost::circular_buffer free_list; std::vector data_vec; std::vector data_vld; + std::vector vld_cnt_vec; uint64_t hashKey(ThreadID tid, Addr block_paddr) const { @@ -204,8 +207,10 @@ class LSQ public: void setData(std::vector &data_vec); - bool full() const; - uint64_t size() const; + void setMaxThread(ThreadID max_thread); + bool full() const; + bool full(ThreadID tid) const; + uint64_t size() const; uint64_t size(ThreadID tid) const; uint64_t
size(ThreadID tid, InstSeqNum seq_num) const; uint64_t unsentSize() const; diff --git a/src/cpu/o3/lsq_unit.cc b/src/cpu/o3/lsq_unit.cc index be065323b9..7c7074090f 100644 --- a/src/cpu/o3/lsq_unit.cc +++ b/src/cpu/o3/lsq_unit.cc @@ -2472,8 +2472,8 @@ LSQUnit::insertStoreBuffer(Addr vaddr, Addr paddr, uint8_t* datas, blockPaddr, paddr); } else { // create vice for sending entry - if (storeBuffer.full()) { - DPRINTF(StoreBuffer, "Insert %#x failed due to sbuffer full\n", paddr); + if (storeBuffer.full(lsqID) || storeBuffer.full()) { + DPRINTF(StoreBuffer, "[tid:%u] Insert %#x failed due to sbuffer full\n", lsqID, paddr); stats.sbufferFull++; return false; } @@ -2498,10 +2498,10 @@ LSQUnit::insertStoreBuffer(Addr vaddr, Addr paddr, uint8_t* datas, } } else { // create new entry - if (storeBuffer.full()) { + if (storeBuffer.full(lsqID) || storeBuffer.full()) { stats.sbufferFull++; // lsq->nextStoreBufferInsertTid = lsqID; - DPRINTF(StoreBuffer, "Insert %#x failed due to sbuffer full\n", paddr); + DPRINTF(StoreBuffer, "[tid:%u] Insert %#x failed due to sbuffer full\n", lsqID, paddr); return false; } // insert From cba429499431bcc837242eea8912af029b5e7a7f Mon Sep 17 00:00:00 2001 From: tastynoob <934348725@qq.com> Date: Thu, 21 May 2026 14:08:38 +0800 Subject: [PATCH 35/38] mem: Avoid materializing zero pages on zstd restore Change-Id: I615bad73ac28ceea5e51bd1f594f94acda1f7705 --- src/mem/physical.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mem/physical.cc b/src/mem/physical.cc index 8b4df199bf..02d4bae07c 100644 --- a/src/mem/physical.cc +++ b/src/mem/physical.cc @@ -851,9 +851,9 @@ PhysicalMemory::unserializeFromZstd(std::string filepath, unsigned store_id, lon } for (uint64_t x = 0; x < output.pos; x += sizeof(long)) { - pmem_current = (uint64_t*)(pmem + total_write_size + x); uint64_t read_data = *(decompress_file_buffer + x / sizeof(long)); - if (read_data != 0 || *pmem_current != 0) { + if (read_data != 0) { + pmem_current = 
(uint64_t*)(pmem + total_write_size + x); *pmem_current = read_data; non_zero_dword++; } From 8d75755582a8a08b6567d42ce018795f3bfd0d1c Mon Sep 17 00:00:00 2001 From: tastynoob <934348725@qq.com> Date: Mon, 25 May 2026 15:13:26 +0800 Subject: [PATCH 36/38] cpu-o3: Fix SMT decode stallbuffer backpressure Change-Id: I50ab39ce30eebfb6d129c2c0fafd8f855b536730 --- src/cpu/o3/decode.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/cpu/o3/decode.cc b/src/cpu/o3/decode.cc index d24e1d1efc..b53e6917a4 100644 --- a/src/cpu/o3/decode.cc +++ b/src/cpu/o3/decode.cc @@ -90,8 +90,7 @@ Decode::Decode(CPU *_cpu, const BaseO3CPUParams &params) // This buffer preserves the fetch->decode pipeline contents when decode // stalls while TimeBuffer keeps advancing. Its depth matches the original // forward pipeline window; fetch is backpressured before full to absorb - // both the decode->fetch feedback delay and the request already issued in - // the current cycle before decode computes backpressure. + // the fetch groups already in that window. const auto stallGroupDepth = fetchToDecodeDelay + 1; stallBuffer = boost::circular_buffer( decodeWidth * stallGroupDepth); @@ -496,7 +495,7 @@ Decode::tick() }; const bool fifoBackpressured = !stallBuffer.empty() && - eachstallSize.size() + decodeToFetchDelay + 1 >= + eachstallSize.size() + fetchToDecodeDelay >= eachstallSize.capacity(); const ThreadID fifoHeadTid = !stallBuffer.empty() ?
stallBuffer.front()->threadNumber : InvalidThreadID; From 446710fedce26caa140f092bd0007dc43a9208ff Mon Sep 17 00:00:00 2001 From: tastynoob <934348725@qq.com> Date: Mon, 25 May 2026 17:56:42 +0800 Subject: [PATCH 37/38] cpu: Fix BTBTAGE unit test history update call Change-Id: If3999d5c26bf325cfb052b100456e759388c2330 --- src/cpu/pred/btb/test/btb_tage.test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cpu/pred/btb/test/btb_tage.test.cc b/src/cpu/pred/btb/test/btb_tage.test.cc index e945065e9f..56c67c28ad 100644 --- a/src/cpu/pred/btb/test/btb_tage.test.cc +++ b/src/cpu/pred/btb/test/btb_tage.test.cc @@ -393,12 +393,12 @@ TEST_F(BTBTAGETest, GlobalHistoryModeUpdate) { BTBTAGE ghrTage(4, 2, 1024, 4, false); boost::dynamic_bitset<> ghr(64, false); - ghrTage.doUpdateHist(ghr, 1, true, 0, 0); + ghrTage.doUpdateHist(ghr, 1, true, 0, 0, 0); applyOutcomeHistory(ghr, 1, true); ghrTage.checkFoldedHist(ghr, "ghr taken update"); boost::dynamic_bitset<> before_not_taken = ghr; - ghrTage.doUpdateHist(ghr, 1, false, 0, 0); + ghrTage.doUpdateHist(ghr, 1, false, 0, 0, 0); applyOutcomeHistory(ghr, 1, false); ghrTage.checkFoldedHist(ghr, "ghr not-taken update"); From 6ac3abca4ec515877254bf9940a5e1d84eede225 Mon Sep 17 00:00:00 2001 From: tastynoob <934348725@qq.com> Date: Tue, 26 May 2026 14:43:33 +0800 Subject: [PATCH 38/38] Revert "arch-riscv: fix agnostic vector load fill" This reverts commit 40bf365d1d262123eb1328740099996f3ff4ebd2. 
--- .../riscv/isa/vector/base/vector_mem.temp.isa | 28 ------------------- .../isa/vector/simple/vector_mem.temp.isa | 28 ------------------- 2 files changed, 56 deletions(-) diff --git a/src/arch/riscv/isa/vector/base/vector_mem.temp.isa b/src/arch/riscv/isa/vector/base/vector_mem.temp.isa index 2448a9ad95..e97eef0940 100644 --- a/src/arch/riscv/isa/vector/base/vector_mem.temp.isa +++ b/src/arch/riscv/isa/vector/base/vector_mem.temp.isa @@ -1,24 +1,5 @@ output header {{ -#define FILL_AGNOSTIC_ELEM(Vd_bytes, elem_idx, eewb_) \ - std::fill_n((Vd_bytes) + (elem_idx) * (eewb_), (eewb_), 0xff) - -#define APPLY_VLOAD_AGNOSTIC(Vd_bytes, elem_num_per_vreg_, eewb_) \ - do { \ - for (size_t _i = 0; _i < vmi.re - vmi.rs; ++_i) { \ - const uint32_t _vdElemIdx = \ - (vmi.rs % (elem_num_per_vreg_)) + _i; \ - const size_t _ei = _i + vmi.rs; \ - const bool _is_tail = _ei >= rVl; \ - const bool _is_masked = !this->vm && !_is_tail && \ - !elem_mask(v0, _ei); \ - if ((_is_tail && machInst.vtype8.vta) || \ - (_is_masked && machInst.vtype8.vma)) { \ - FILL_AGNOSTIC_ELEM((Vd_bytes), _vdElemIdx, (eewb_)); \ - } \ - } \ - } while (0) - inline uint32_t calc_memsize(uint32_t rs, uint32_t re, uint32_t sew, uint32_t rVl) { uint32_t vend = std::min(rVl, re); @@ -166,7 +147,6 @@ Fault { %(op_decl)s; %(op_rd)s; - auto VdBytes = tmp_d0.as(); Addr EA; // EA = Rs1 + vmi.offset; @@ -192,8 +172,6 @@ Fault %(memacc_code)s; } - APPLY_VLOAD_AGNOSTIC(VdBytes, elem_num_per_vreg, eew / 8); - %(op_wb)s; return fault; } @@ -283,7 +261,6 @@ Fault %(op_decl)s; %(op_rd)s; - auto VdBytes = tmp_d0.as(); #if %(is_vecWhole)s // VM_REQUIRED(); @@ -322,11 +299,6 @@ Fault } } -#if %(is_vecWhole)s -#else - APPLY_VLOAD_AGNOSTIC(VdBytes, elem_num_per_vreg, eewb); -#endif - %(vfof_get_code)s; %(op_wb)s; return NoFault; diff --git a/src/arch/riscv/isa/vector/simple/vector_mem.temp.isa b/src/arch/riscv/isa/vector/simple/vector_mem.temp.isa index 4b64f5dac0..a8e5b71f99 100644 --- 
a/src/arch/riscv/isa/vector/simple/vector_mem.temp.isa +++ b/src/arch/riscv/isa/vector/simple/vector_mem.temp.isa @@ -1,24 +1,5 @@ output header {{ -#define FILL_AGNOSTIC_ELEM(Vd_bytes, elem_idx, eewb_) \ - std::fill_n((Vd_bytes) + (elem_idx) * (eewb_), (eewb_), 0xff) - -#define APPLY_VLOAD_AGNOSTIC(Vd_bytes, elem_num_per_vreg_, eewb_) \ - do { \ - for (size_t _i = 0; _i < vmi.re - vmi.rs; ++_i) { \ - const uint32_t _vdElemIdx = \ - (vmi.rs % (elem_num_per_vreg_)) + _i; \ - const size_t _ei = _i + vmi.rs; \ - const bool _is_tail = _ei >= rVl; \ - const bool _is_masked = !this->vm && !_is_tail && \ - !elem_mask(v0, _ei); \ - if ((_is_tail && machInst.vtype8.vta) || \ - (_is_masked && machInst.vtype8.vma)) { \ - FILL_AGNOSTIC_ELEM((Vd_bytes), _vdElemIdx, (eewb_)); \ - } \ - } \ - } while (0) - inline uint32_t calc_memsize(uint32_t rs, uint32_t re, uint32_t sew, uint32_t rVl) { uint32_t vend = std::min(rVl, re); @@ -166,7 +147,6 @@ Fault { %(op_decl)s; %(op_rd)s; - auto VdBytes = tmp_d0.as(); Addr EA; // EA = Rs1 + vmi.offset; @@ -192,8 +172,6 @@ Fault %(memacc_code)s; } - APPLY_VLOAD_AGNOSTIC(VdBytes, elem_num_per_vreg, eew / 8); - %(op_wb)s; return fault; } @@ -283,7 +261,6 @@ Fault %(op_decl)s; %(op_rd)s; - auto VdBytes = tmp_d0.as(); #if %(is_vecWhole)s // VM_REQUIRED(); @@ -322,11 +299,6 @@ Fault } } -#if %(is_vecWhole)s -#else - APPLY_VLOAD_AGNOSTIC(VdBytes, elem_num_per_vreg, eewb); -#endif - %(vfof_get_code)s; %(op_wb)s; return NoFault;