Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
1ab1a92
add frontend and backend smt (#791)
mhnGitHubz Mar 16, 2026
81b9cc0
cpu-o3: fix smt framework
tastynoob Mar 17, 2026
c96339a
cpu-o3: support shared-address-space fs smt
tastynoob Mar 18, 2026
099bf7c
cpu-pred: fix unit test compile
tastynoob Mar 18, 2026
9def466
cpu-o3: integrate FS-SMT support changes
tastynoob Mar 23, 2026
76939fc
cpu-o3: fix smt shared sbuffer
tastynoob Mar 27, 2026
8485ee6
cpu-o3: fix storeData uop squash
tastynoob Mar 30, 2026
1fcfb25
cpu-o3: fix squash drain and wakeup recovery
tastynoob Mar 31, 2026
a516e81
cpu-o3: fix iew smt squash (#809)
mhnGitHubz Mar 31, 2026
0b4960f
cpu-o3: fix smt fetch squash & load wakeup & iq init
tastynoob Apr 2, 2026
b11b00e
cpu-o3: fix lsq request lifetime and store completion
tastynoob Apr 2, 2026
40bf365
arch-riscv: fix agnostic vector load fill
tastynoob Apr 2, 2026
a125904
cpu: add asid hash to decoupled btb
tastynoob Apr 8, 2026
c79fec1
cpu-o3: fix smt thread-local inst stop threshold
tastynoob Apr 9, 2026
1105968
cpu-o3: fix Decoder scheduler,thread 1 count is incorrect (#816)
mhnGitHubz Apr 9, 2026
325d970
cpu: add shared lsq and ftq modes for smt
tastynoob Apr 9, 2026
cc766e3
cpu: avoid full memcpy_init for dedup difftest
tastynoob Apr 13, 2026
ea80c0a
cpu: apply asid hash to mgsc lookups
tastynoob Apr 13, 2026
491baad
cpu-o3: add backend SMT PMU (#827)
mhnGitHubz Apr 14, 2026
9792223
cpu-o3: enlarge smt l3 to 32MB
tastynoob Apr 15, 2026
d7beabe
cpu: Isolate ABTB pipeline for SMT
tastynoob Apr 27, 2026
9845df9
cpu-o3: isolate MDP replay tracking per thread
tastynoob Apr 27, 2026
e16b2a2
cpu-o3: 1. Add scheduler starvation prevention mechanism; 2. Modify s…
mhnGitHubz Apr 27, 2026
032f575
cpu: preserve BTB tag bits when hashing ASID
tastynoob Apr 27, 2026
036db12
cpu-o3: isolate committed stream state per thread
tastynoob Apr 28, 2026
ada41dc
cpu: Isolate VPU state per thread
tastynoob Apr 28, 2026
004a046
mem-cache: Preserve prefetch context
tastynoob Apr 29, 2026
32eed52
cpu-o3: Select IQ entries by dispatch age
tastynoob May 11, 2026
70a364f
cpu-o3: all threads have a store to offload and both fail, reset requ…
mhnGitHubz May 11, 2026
f4334ca
cpu-o3: Expose SMT borrowing tunables
tastynoob May 19, 2026
11d0ee4
cpu-o3: Guard empty LSQ head stall checks
tastynoob May 19, 2026
1d2a555
arch-riscv: Isolate old TLB privilege by thread
tastynoob May 20, 2026
43e1cc7
cpu-o3: Fix SMT AMO difftest snapshot
tastynoob May 21, 2026
e3751df
cpu-o3: reserve a store buffer resource for each thread to prevent de…
mhnGitHubz May 21, 2026
cba4294
mem: Avoid materializing zero pages on zstd restore
tastynoob May 21, 2026
8d75755
cpu-o3: Fix SMT decode stallbuffer backpressure
tastynoob May 25, 2026
446710f
cpu: Fix BTBTAGE unit test history update call
tastynoob May 25, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -70,4 +70,10 @@ package.json

microbench/build/
microbench/output/
microbench/dramsim3*
microbench/dramsim3*

*.bin
*.db
*.log
*.gz
*.zstd
18 changes: 14 additions & 4 deletions configs/common/FSConfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -657,18 +657,23 @@ def makeBareMetalRiscvSystem(mem_mode, mdesc=None, cmdline=None):
self.system_port = self.membus.cpu_side_ports
return self

def makeBareMetalXiangshanSystem(mem_mode, mdesc=None, cmdline=None, np=1, ruby=False):
self = makeXiangshanPlatformSystem(mem_mode, mdesc, np=np, ruby=ruby)
def makeBareMetalXiangshanSystem(mem_mode, mdesc=None, cmdline=None, np=1,
ruby=False, num_threads=None):
self = makeXiangshanPlatformSystem(mem_mode, mdesc, np=np, ruby=ruby,
num_threads=num_threads)
self.workload = RiscvBareMetal()
self.workload.reset_vect = 0x80000000
return self


def makeXiangshanPlatformSystem(mem_mode, mdesc=None, np=1, ruby=False):
def makeXiangshanPlatformSystem(mem_mode, mdesc=None, np=1, ruby=False,
num_threads=None):
self = System()
if not mdesc:
# generic system
mdesc = SysConfig()
if num_threads is None:
num_threads = np
self.mem_mode = mem_mode
self.mem_ranges = [AddrRange(start=0x80000000, size=mdesc.mem())]
print(self.mem_ranges)
Expand All @@ -687,7 +692,11 @@ def makeXiangshanPlatformSystem(mem_mode, mdesc=None, np=1, ruby=False):
self.lint = Clint()
self.lint.pio = self.iobus.mem_side_ports
self.lint.pio_addr = 0x38000000
self.lint.num_threads = np
self.lint.num_threads = num_threads

self.hartctrl = HartCtrl()
self.hartctrl.pio = self.iobus.mem_side_ports
self.hartctrl.num_threads = num_threads

self.mmcs = NemuMMC()
self.mmcs.pio = self.iobus.mem_side_ports
Expand All @@ -700,6 +709,7 @@ def makeXiangshanPlatformSystem(mem_mode, mdesc=None, np=1, ruby=False):
AddrRange(self.uartlite.pio_addr, self.uartlite.pio_addr +
self.uartlite.pio_size),
AddrRange(self.lint.pio_addr, self.lint.pio_addr + self.lint.pio_size),
AddrRange(self.hartctrl.pio_addr, self.hartctrl.pio_addr + self.hartctrl.pio_size),
AddrRange(self.mmcs.pio_addr, self.mmcs.pio_addr + self.mmcs.pio_size),
AddrRange(self.plic.pio_addr, self.plic.pio_addr + self.plic.pio_size),
]
Expand Down
8 changes: 3 additions & 5 deletions configs/common/Options.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,16 +349,14 @@ def addCommonOptions(parser, configure_xiangshan=False):
"that are present under any of the roots. If not given, dump all "
"stats. ")

parser.add_argument("--smt", action="store_true", default=False,
help=""" RISCV SMT support, which requires multitThread-supported gcpt restore and diff-ref-so""")

if configure_xiangshan:
return
# Following options are not available in XiangShan

parser.add_argument("--checker", action="store_true")
parser.add_argument("--smt", action="store_true", default=False,
help="""
Only used if multiple programs are specified. If true,
then the number of threads per cpu is same as the
number of programs.""")
parser.add_argument(
"--elastic-trace-en", action="store_true",
help="""Enable capture of data dependency and instruction
Expand Down
22 changes: 15 additions & 7 deletions configs/common/xiangshan.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,7 @@ def resolve_xiangshan_ref_so(args: argparse.Namespace):
if args.difftest_ref_so is not None:
ref_so = args.difftest_ref_so
print("Obtained ref_so from args.difftest_ref_so: ", ref_so)
elif args.num_cpus > 1 and "GCBV_MULTI_CORE_REF_SO" in os.environ:
elif (args.num_cpus > 1 or args.smt) and "GCBV_MULTI_CORE_REF_SO" in os.environ:
ref_so = os.environ["GCBV_MULTI_CORE_REF_SO"]
print("Obtained ref_so from GCBV_MULTI_CORE_REF_SO: ", ref_so)
elif "GCBV_REF_SO" in os.environ:
Expand Down Expand Up @@ -330,12 +330,12 @@ def config_xiangshan_inputs(args: argparse.Namespace, sys):
if args.raw_cpt:
# If using raw binary, no restorer is needed.
gcpt_restorer = None
elif args.num_cpus > 1:
elif args.num_cpus > 1 or args.smt:
if "GCB_MULTI_CORE_RESTORER" in os.environ:
gcpt_restorer = os.environ["GCB_MULTI_CORE_RESTORER"]
print("Obtained gcpt_restorer from GCB_MULTI_CORE_RESTORER: ", gcpt_restorer)
else:
fatal("Plz set $GCB_MULTI_CORE_RESTORER when model Xiangshan with multi-core")
fatal("Plz set $GCB_MULTI_CORE_RESTORER when model Xiangshan with multi-context difftest")
elif args.restore_rvv_cpt:
if "GCBV_RESTORER" in os.environ:
gcpt_restorer = os.environ["GCBV_RESTORER"]
Expand All @@ -355,8 +355,8 @@ def config_xiangshan_inputs(args: argparse.Namespace, sys):
print("Obtained gcpt_restorer from args.gcpt_restorer: ", args.gcpt_restorer)
gcpt_restorer = args.gcpt_restorer

if args.num_cpus > 1:
print("Simulating a multi-core system, demanding a larger GCPT restorer size (2M).")
if args.num_cpus > 1 or args.smt:
print("Simulating a multi-context system, demanding a larger GCPT restorer size (2M).")
sys.gcpt_restorer_size_limit = 2**20
elif args.restore_rvv_cpt:
print("Simulating single core with RVV, demanding GCPT restorer size of 0x1000.")
Expand Down Expand Up @@ -403,7 +403,7 @@ def config_difftest(cpu_list, args, sys):
if not args.enable_difftest:
return
else:
if len(cpu_list) > 1:
if len(cpu_list) > 1 or args.smt:
sys.enable_mem_dedup = True
for cpu in cpu_list:
cpu.enable_mem_dedup = True
Expand Down Expand Up @@ -439,7 +439,12 @@ def _finish_xiangshan_system(args, test_sys, TestCPUClass, ruby):
test_sys.cpu = [TestCPUClass(clk_domain=test_sys.cpu_clk_domain, cpu_id=i)
for i in range(np)]
# Configure MMU for trace-aware FS mode
if args.smt:
test_sys.multi_thread = True

for cpu in test_sys.cpu:
if args.smt:
cpu.numThreads = 2
cpu.mmu.pma_checker = PMAChecker(
uncacheable=[AddrRange(0, size=0x80000000)])
cpu.mmu.functional = args.functional_tlb
Expand Down Expand Up @@ -822,8 +827,11 @@ def build_xiangshan_system(args):

TestCPUClass = get_xiangshan_cpu_class(args)
ruby = bool(hasattr(args, 'ruby') and args.ruby)
num_threads = np * (2 if getattr(args, 'smt', False) else 1)

test_sys = makeBareMetalXiangshanSystem('timing', SysConfig(mem=args.mem_size), None, np=np, ruby=ruby)
test_sys = makeBareMetalXiangshanSystem(
'timing', SysConfig(mem=args.mem_size), None, np=np, ruby=ruby,
num_threads=num_threads)

if hasattr(args, 'enable_trace_mode') and args.enable_trace_mode:
if bool(getattr(args, 'trace_timing_ptw', False)):
Expand Down
49 changes: 49 additions & 0 deletions configs/example/smt_idealkmhv3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
from m5.objects import Root

from m5.util import addToPath

addToPath('../')

from common import Simulation
from common.xiangshan import build_xiangshan_system, xiangshan_system_init
from idealkmhv3 import setKmhV3IdealParams


def setSharedLSQParams(args, system):
    """Apply the ideal KMHV3 parameters, then select shared SMT queue modes.

    The LSQ-related sizes configured by ``setKmhV3IdealParams`` are reused
    unchanged but are meant to be read as one SMT-wide pool: for example,
    LQEntries=128 means both threads compete for 128 load entries in total
    rather than receiving 128 each. The same shared accounting applies to
    SQ/RARQ/RAWQ, and ``branchPred.ftq_size`` is likewise treated as a
    shared SMT-wide FTQ pool.

    Args:
        args: parsed command-line options, forwarded to setKmhV3IdealParams.
        system: the system whose ``cpu`` list is reconfigured in place.
    """
    setKmhV3IdealParams(args, system)

    # Settings written on each CPU object, in this order.
    core_settings = (
        ('smtLSQMode', 'Shared'),
        ('smtLSQPolicy', 'Dynamic'),
        ('smtROBPolicy', 'DynamicBorrowing'),
    )
    # Settings written on each CPU's branch predictor. The FTQ stays
    # partitioned so that one thread cannot monopolize the shared target
    # queue and starve the other thread's frontend.
    predictor_settings = (
        ('smtFTQMode', 'Shared'),
        ('smtFTQPolicy', 'Partitioned'),
    )

    for core in system.cpu:
        for attr_name, attr_value in core_settings:
            setattr(core, attr_name, attr_value)
        for attr_name, attr_value in predictor_settings:
            setattr(core.branchPred, attr_name, attr_value)


if __name__ == '__m5_main__':
    FutureClass = None

    args = xiangshan_system_init()

    # This script does not support an external memory system.
    assert not args.external_memory_system

    # Options this configuration always forces, regardless of the command
    # line; they are written onto args in the order listed.
    for option_name, forced_value in (
        ('smt', True),
        ('bp_type', 'DecoupledBPUWithBTB'),
        ('l2_size', '2MB'),
        ('l3_size', '32MB'),
    ):
        setattr(args, option_name, forced_value)

    Simulation.setMemClass(args)

    # Build the system first, then layer the shared-queue SMT settings on it.
    test_sys = build_xiangshan_system(args)
    setSharedLSQParams(args, test_sys)

    root = Root(full_system=True, system=test_sys)

    Simulation.run_vanilla(args, root, test_sys, FutureClass)
28 changes: 28 additions & 0 deletions src/arch/riscv/isa/vector/base/vector_mem.temp.isa
Original file line number Diff line number Diff line change
@@ -1,5 +1,24 @@
output header {{

// Overwrite one destination element with all-ones bytes.
//   Vd_bytes : byte pointer to the start of the destination register data
//   elem_idx : index of the element inside that register
//   eewb_    : element width in bytes
// Writes eewb_ bytes of 0xff starting at Vd_bytes + elem_idx * eewb_.
#define FILL_AGNOSTIC_ELEM(Vd_bytes, elem_idx, eewb_) \
std::fill_n((Vd_bytes) + (elem_idx) * (eewb_), (eewb_), 0xff)

// Apply the tail/mask agnostic fill to the elements handled by one vector
// load micro-op.
//   Vd_bytes           : byte pointer to the destination register data
//   elem_num_per_vreg_ : number of elements held by one vector register
//   eewb_              : element width in bytes
// The macro is not self-contained: it reads vmi (rs, re), rVl, v0, machInst
// and this->vm from the scope where it is expanded.
// For every element in [vmi.rs, vmi.re):
//   - the element is "tail" when its index is >= rVl;
//   - it is "masked" when the instruction is masked (this->vm is false),
//     the element is not tail, and its bit in v0 is clear;
//   - it is filled with 0xff bytes when it is tail and vtype.vta is set, or
//     masked and vtype.vma is set.
// The position inside the destination register is (vmi.rs modulo
// elem_num_per_vreg_) plus the loop offset.
#define APPLY_VLOAD_AGNOSTIC(Vd_bytes, elem_num_per_vreg_, eewb_) \
do { \
for (size_t _i = 0; _i < vmi.re - vmi.rs; ++_i) { \
const uint32_t _vdElemIdx = \
(vmi.rs % (elem_num_per_vreg_)) + _i; \
const size_t _ei = _i + vmi.rs; \
const bool _is_tail = _ei >= rVl; \
const bool _is_masked = !this->vm && !_is_tail && \
!elem_mask(v0, _ei); \
if ((_is_tail && machInst.vtype8.vta) || \
(_is_masked && machInst.vtype8.vma)) { \
FILL_AGNOSTIC_ELEM((Vd_bytes), _vdElemIdx, (eewb_)); \
} \
} \
} while (0)

inline uint32_t
calc_memsize(uint32_t rs, uint32_t re, uint32_t sew, uint32_t rVl) {
uint32_t vend = std::min(rVl, re);
Expand Down Expand Up @@ -147,6 +166,7 @@ Fault
{
%(op_decl)s;
%(op_rd)s;
auto VdBytes = tmp_d0.as<uint8_t>();

Addr EA;
// EA = Rs1 + vmi.offset;
Expand All @@ -172,6 +192,8 @@ Fault
%(memacc_code)s;
}

APPLY_VLOAD_AGNOSTIC(VdBytes, elem_num_per_vreg, eew / 8);

%(op_wb)s;
return fault;
}
Expand Down Expand Up @@ -261,6 +283,7 @@ Fault

%(op_decl)s;
%(op_rd)s;
auto VdBytes = tmp_d0.as<uint8_t>();

#if %(is_vecWhole)s
// VM_REQUIRED();
Expand Down Expand Up @@ -299,6 +322,11 @@ Fault
}
}

#if %(is_vecWhole)s
#else
APPLY_VLOAD_AGNOSTIC(VdBytes, elem_num_per_vreg, eewb);
#endif

%(vfof_get_code)s;
%(op_wb)s;
return NoFault;
Expand Down
28 changes: 28 additions & 0 deletions src/arch/riscv/isa/vector/simple/vector_mem.temp.isa
Original file line number Diff line number Diff line change
@@ -1,5 +1,24 @@
output header {{

// Overwrite one destination element with all-ones bytes.
//   Vd_bytes : byte pointer to the start of the destination register data
//   elem_idx : index of the element inside that register
//   eewb_    : element width in bytes
// Writes eewb_ bytes of 0xff starting at Vd_bytes + elem_idx * eewb_.
#define FILL_AGNOSTIC_ELEM(Vd_bytes, elem_idx, eewb_) \
std::fill_n((Vd_bytes) + (elem_idx) * (eewb_), (eewb_), 0xff)

// Apply the tail/mask agnostic fill to the elements handled by one vector
// load micro-op.
//   Vd_bytes           : byte pointer to the destination register data
//   elem_num_per_vreg_ : number of elements held by one vector register
//   eewb_              : element width in bytes
// The macro is not self-contained: it reads vmi (rs, re), rVl, v0, machInst
// and this->vm from the scope where it is expanded.
// For every element in [vmi.rs, vmi.re):
//   - the element is "tail" when its index is >= rVl;
//   - it is "masked" when the instruction is masked (this->vm is false),
//     the element is not tail, and its bit in v0 is clear;
//   - it is filled with 0xff bytes when it is tail and vtype.vta is set, or
//     masked and vtype.vma is set.
// The position inside the destination register is (vmi.rs modulo
// elem_num_per_vreg_) plus the loop offset.
#define APPLY_VLOAD_AGNOSTIC(Vd_bytes, elem_num_per_vreg_, eewb_) \
do { \
for (size_t _i = 0; _i < vmi.re - vmi.rs; ++_i) { \
const uint32_t _vdElemIdx = \
(vmi.rs % (elem_num_per_vreg_)) + _i; \
const size_t _ei = _i + vmi.rs; \
const bool _is_tail = _ei >= rVl; \
const bool _is_masked = !this->vm && !_is_tail && \
!elem_mask(v0, _ei); \
if ((_is_tail && machInst.vtype8.vta) || \
(_is_masked && machInst.vtype8.vma)) { \
FILL_AGNOSTIC_ELEM((Vd_bytes), _vdElemIdx, (eewb_)); \
} \
} \
} while (0)

inline uint32_t
calc_memsize(uint32_t rs, uint32_t re, uint32_t sew, uint32_t rVl) {
uint32_t vend = std::min(rVl, re);
Expand Down Expand Up @@ -147,6 +166,7 @@ Fault
{
%(op_decl)s;
%(op_rd)s;
auto VdBytes = tmp_d0.as<uint8_t>();

Addr EA;
// EA = Rs1 + vmi.offset;
Expand All @@ -172,6 +192,8 @@ Fault
%(memacc_code)s;
}

APPLY_VLOAD_AGNOSTIC(VdBytes, elem_num_per_vreg, eew / 8);

%(op_wb)s;
return fault;
}
Expand Down Expand Up @@ -261,6 +283,7 @@ Fault

%(op_decl)s;
%(op_rd)s;
auto VdBytes = tmp_d0.as<uint8_t>();

#if %(is_vecWhole)s
// VM_REQUIRED();
Expand Down Expand Up @@ -299,6 +322,11 @@ Fault
}
}

#if %(is_vecWhole)s
#else
APPLY_VLOAD_AGNOSTIC(VdBytes, elem_num_per_vreg, eewb);
#endif

%(vfof_get_code)s;
%(op_wb)s;
return NoFault;
Expand Down
53 changes: 44 additions & 9 deletions src/arch/riscv/tlb.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2146,23 +2146,58 @@ TLB::doTranslate(const RequestPtr &req, ThreadContext *tc,

return NoFault;
}

PrivilegeMode
TLB::getMemPriv(ThreadContext *tc, BaseMMU::Mode mode)
TLB::currentMemPriv(ThreadContext *tc, BaseMMU::Mode mode)
{
if (use_old_priv && mode != BaseMMU::Execute) {
if (mode == BaseMMU::Execute) {
return old_priv_ex;
} else {
return old_priv_ldst;
}
}
STATUS status = (STATUS)tc->readMiscReg(MISCREG_STATUS);
PrivilegeMode pmode = (PrivilegeMode)tc->readMiscReg(MISCREG_PRV);
if (mode != BaseMMU::Execute && status.mprv == 1)
pmode = (PrivilegeMode)(RegVal)status.mpp;
return pmode;
}

/**
 * Return the privilege mode to use for a memory access.
 *
 * For non-Execute accesses, a valid per-thread entry recorded by
 * setOldPriv() takes precedence and its stored load/store privilege is
 * returned. Execute accesses, threads with no entry, and threads whose
 * entry is not valid fall back to currentMemPriv().
 *
 * @param tc   Thread context issuing the access.
 * @param mode Access type (Read / Write / Execute).
 * @return The privilege mode for the access.
 */
PrivilegeMode
TLB::getMemPriv(ThreadContext *tc, BaseMMU::Mode mode)
{
    // Instruction fetches never consult the recorded privilege.
    if (mode == BaseMMU::Execute) {
        return currentMemPriv(tc, mode);
    }

    const int hartThread = tc->threadId();
    const bool hasSlot = hartThread >= 0 &&
        static_cast<size_t>(hartThread) < oldPrivByThread.size();
    if (hasSlot) {
        const auto &slot = oldPrivByThread[static_cast<size_t>(hartThread)];
        if (slot.valid) {
            return slot.ldst;
        }
    }

    return currentMemPriv(tc, mode);
}

/**
 * Record the calling thread's current load/store privilege.
 *
 * Grows the per-thread table when the thread id is beyond its current
 * size, marks the thread's entry valid, and stores the result of
 * currentMemPriv(tc, BaseMMU::Read) in it. getMemPriv() returns this
 * stored value for non-Execute accesses until useNewPriv() clears it.
 *
 * @param tc Thread context; its threadId() must be non-negative.
 */
void
TLB::setOldPriv(ThreadContext *tc)
{
    const int hartThread = tc->threadId();
    assert(hartThread >= 0);

    const size_t slotIdx = static_cast<size_t>(hartThread);
    const size_t slotsNeeded = slotIdx + 1;
    if (oldPrivByThread.size() < slotsNeeded) {
        oldPrivByThread.resize(slotsNeeded);
    }

    // Index on every write rather than holding a reference across the
    // currentMemPriv() call.
    oldPrivByThread[slotIdx].valid = true;
    oldPrivByThread[slotIdx].ldst = currentMemPriv(tc, BaseMMU::Read);
}

/**
 * Invalidate the recorded privilege of the calling thread.
 *
 * Clears the valid flag of the thread's entry if one exists. A negative
 * thread id, or an id beyond the current table size, is ignored; the table
 * is never grown here.
 *
 * @param tc Thread context whose entry is cleared.
 */
void
TLB::useNewPriv(ThreadContext *tc)
{
    const int hartThread = tc->threadId();
    if (hartThread >= 0 &&
        static_cast<size_t>(hartThread) < oldPrivByThread.size()) {
        oldPrivByThread[static_cast<size_t>(hartThread)].valid = false;
    }
}

bool
TLB::hasTwoStageTranslation(ThreadContext *tc, const RequestPtr &req, BaseMMU::Mode mode)
{
Expand Down
Loading
Loading