OpenXiangShan · tastynoob · Mar 16, 2026 · Mar 17, 2026 · Mar 18, 2026 · Mar 18, 2026
diff --git a/.gitignore b/.gitignore
@@ -70,4 +70,10 @@ package.json
 
 microbench/build/
 microbench/output/
-microbench/dramsim3*
+microbench/dramsim3*
+
+*.bin
+*.db
+*.log
+*.gz
+*.zstd
diff --git a/configs/common/FSConfig.py b/configs/common/FSConfig.py
@@ -657,18 +657,23 @@ def makeBareMetalRiscvSystem(mem_mode, mdesc=None, cmdline=None):
     self.system_port = self.membus.cpu_side_ports
     return self
 
-def makeBareMetalXiangshanSystem(mem_mode, mdesc=None, cmdline=None, np=1, ruby=False):
-    self = makeXiangshanPlatformSystem(mem_mode, mdesc, np=np, ruby=ruby)
+def makeBareMetalXiangshanSystem(mem_mode, mdesc=None, cmdline=None, np=1,
+                                 ruby=False, num_threads=None):
+    self = makeXiangshanPlatformSystem(mem_mode, mdesc, np=np, ruby=ruby,
+                                       num_threads=num_threads)
     self.workload = RiscvBareMetal()
     self.workload.reset_vect = 0x80000000
     return self
 
 
-def makeXiangshanPlatformSystem(mem_mode, mdesc=None, np=1, ruby=False):
+def makeXiangshanPlatformSystem(mem_mode, mdesc=None, np=1, ruby=False,
+                                num_threads=None):
     self = System()
     if not mdesc:
         # generic system
         mdesc = SysConfig()
+    if num_threads is None:
+        num_threads = np
     self.mem_mode = mem_mode
     self.mem_ranges = [AddrRange(start=0x80000000, size=mdesc.mem())]
     print(self.mem_ranges)
@@ -687,7 +692,11 @@ def makeXiangshanPlatformSystem(mem_mode, mdesc=None, np=1, ruby=False):
     self.lint = Clint()
     self.lint.pio = self.iobus.mem_side_ports
     self.lint.pio_addr = 0x38000000
-    self.lint.num_threads = np
+    self.lint.num_threads = num_threads
+
+    self.hartctrl = HartCtrl()
+    self.hartctrl.pio = self.iobus.mem_side_ports
+    self.hartctrl.num_threads = num_threads
 
     self.mmcs = NemuMMC()
     self.mmcs.pio = self.iobus.mem_side_ports
@@ -700,6 +709,7 @@ def makeXiangshanPlatformSystem(mem_mode, mdesc=None, np=1, ruby=False):
             AddrRange(self.uartlite.pio_addr, self.uartlite.pio_addr +
             self.uartlite.pio_size),
             AddrRange(self.lint.pio_addr, self.lint.pio_addr + self.lint.pio_size),
+            AddrRange(self.hartctrl.pio_addr, self.hartctrl.pio_addr + self.hartctrl.pio_size),
             AddrRange(self.mmcs.pio_addr, self.mmcs.pio_addr + self.mmcs.pio_size),
             AddrRange(self.plic.pio_addr, self.plic.pio_addr + self.plic.pio_size),
             ]

diff --git a/configs/common/Options.py b/configs/common/Options.py
@@ -349,16 +349,14 @@ def addCommonOptions(parser, configure_xiangshan=False):
         "that are present under any of the roots. If not given, dump all "
         "stats. ")
 
+    parser.add_argument("--smt", action="store_true", default=False,
+                        help="""RISC-V SMT support; requires a multi-thread-capable GCPT restorer and difftest ref-so""")
+
     if configure_xiangshan:
         return
     # Following options are not available in XiangShan
 
     parser.add_argument("--checker", action="store_true")
-    parser.add_argument("--smt", action="store_true", default=False,
-                        help="""
-                      Only used if multiple programs are specified. If true,
-                      then the number of threads per cpu is same as the
-                      number of programs.""")
     parser.add_argument(
         "--elastic-trace-en", action="store_true",
         help="""Enable capture of data dependency and instruction

diff --git a/configs/common/xiangshan.py b/configs/common/xiangshan.py
@@ -290,7 +290,7 @@ def resolve_xiangshan_ref_so(args: argparse.Namespace):
     if args.difftest_ref_so is not None:
         ref_so = args.difftest_ref_so
         print("Obtained ref_so from args.difftest_ref_so: ", ref_so)
-    elif args.num_cpus > 1 and "GCBV_MULTI_CORE_REF_SO" in os.environ:
+    elif (args.num_cpus > 1 or args.smt) and "GCBV_MULTI_CORE_REF_SO" in os.environ:
         ref_so = os.environ["GCBV_MULTI_CORE_REF_SO"]
         print("Obtained ref_so from GCBV_MULTI_CORE_REF_SO: ", ref_so)
     elif "GCBV_REF_SO" in os.environ:
@@ -330,12 +330,12 @@ def config_xiangshan_inputs(args: argparse.Namespace, sys):
         if args.raw_cpt:
             # If using raw binary, no restorer is needed.
             gcpt_restorer = None
-        elif args.num_cpus > 1:
+        elif args.num_cpus > 1 or args.smt:
             if "GCB_MULTI_CORE_RESTORER" in os.environ:
                 gcpt_restorer = os.environ["GCB_MULTI_CORE_RESTORER"]
                 print("Obtained gcpt_restorer from GCB_MULTI_CORE_RESTORER: ", gcpt_restorer)
             else:
-                fatal("Plz set $GCB_MULTI_CORE_RESTORER when model Xiangshan with multi-core")
+                fatal("Plz set $GCB_MULTI_CORE_RESTORER when model Xiangshan with multi-context difftest")
         elif args.restore_rvv_cpt:
             if "GCBV_RESTORER" in os.environ:
                 gcpt_restorer = os.environ["GCBV_RESTORER"]
@@ -355,8 +355,8 @@ def config_xiangshan_inputs(args: argparse.Namespace, sys):
         print("Obtained gcpt_restorer from args.gcpt_restorer: ", args.gcpt_restorer)
         gcpt_restorer = args.gcpt_restorer
 
-    if args.num_cpus > 1:
-        print("Simulating a multi-core system, demanding a larger GCPT restorer size (2M).")
+    if args.num_cpus > 1 or args.smt:
+        print("Simulating a multi-context system, demanding a larger GCPT restorer size (1M).")
         sys.gcpt_restorer_size_limit = 2**20
     elif args.restore_rvv_cpt:
         print("Simulating single core with RVV, demanding GCPT restorer size of 0x1000.")
@@ -403,7 +403,7 @@ def config_difftest(cpu_list, args, sys):
     if not args.enable_difftest:
         return
     else:
-        if len(cpu_list) > 1:
+        if len(cpu_list) > 1 or args.smt:
             sys.enable_mem_dedup = True
             for cpu in cpu_list:
                 cpu.enable_mem_dedup = True
@@ -439,7 +439,12 @@ def _finish_xiangshan_system(args, test_sys, TestCPUClass, ruby):
     test_sys.cpu = [TestCPUClass(clk_domain=test_sys.cpu_clk_domain, cpu_id=i)
                     for i in range(np)]
     # Configure MMU for trace-aware FS mode
+    if args.smt:
+        test_sys.multi_thread = True
+
     for cpu in test_sys.cpu:
+        if args.smt:
+            cpu.numThreads = 2
         cpu.mmu.pma_checker = PMAChecker(
             uncacheable=[AddrRange(0, size=0x80000000)])
         cpu.mmu.functional = args.functional_tlb
@@ -822,8 +827,11 @@ def build_xiangshan_system(args):
 
     TestCPUClass = get_xiangshan_cpu_class(args)
     ruby = bool(hasattr(args, 'ruby') and args.ruby)
+    num_threads = np * (2 if getattr(args, 'smt', False) else 1)
 
-    test_sys = makeBareMetalXiangshanSystem('timing', SysConfig(mem=args.mem_size), None, np=np, ruby=ruby)
+    test_sys = makeBareMetalXiangshanSystem(
+        'timing', SysConfig(mem=args.mem_size), None, np=np, ruby=ruby,
+        num_threads=num_threads)
 
     if hasattr(args, 'enable_trace_mode') and args.enable_trace_mode:
         if bool(getattr(args, 'trace_timing_ptw', False)):

diff --git a/configs/example/smt_idealkmhv3.py b/configs/example/smt_idealkmhv3.py
@@ -0,0 +1,49 @@
+from m5.objects import Root
+
+from m5.util import addToPath
+
+addToPath('../')
+
+from common import Simulation
+from common.xiangshan import build_xiangshan_system, xiangshan_system_init
+from idealkmhv3 import setKmhV3IdealParams
+
+
+def setSharedLSQParams(args, system):
+    """Apply the ideal KMHV3 parameters, then select SMT sharing policies."""
+    setKmhV3IdealParams(args, system)
+
+    for cpu in system.cpu:
+        # Queue sizes come from the ideal KMHV3 setup above. 'Shared' mode
+        # treats each size as one SMT-wide pool: e.g. LQEntries=128 is 128
+        # load entries contended by both threads, not 128 per thread; the
+        # same accounting covers SQ/RARQ/RAWQ and branchPred.ftq_size.
+        # The FTQ policy stays 'Partitioned' so that one thread cannot
+        # monopolize the target queue and starve the other's frontend.
+        cpu.smtLSQMode = 'Shared'
+        cpu.smtLSQPolicy = 'Dynamic'
+        cpu.smtROBPolicy = 'DynamicBorrowing'
+        cpu.branchPred.smtFTQMode = 'Shared'
+        cpu.branchPred.smtFTQPolicy = 'Partitioned'
+
+
+if __name__ == '__m5_main__':
+    FutureClass = None
+
+    args = xiangshan_system_init()
+
+    assert not args.external_memory_system
+    # Example overrides. NOTE(review): verify init did not read args.smt yet.
+    args.smt = True
+    args.bp_type = 'DecoupledBPUWithBTB'
+    args.l2_size = '2MB'
+    args.l3_size = '32MB'
+
+    Simulation.setMemClass(args)
+    # Build the base system, then apply the SMT sharing policies on top.
+    test_sys = build_xiangshan_system(args)
+    setSharedLSQParams(args, test_sys)
+
+    root = Root(full_system=True, system=test_sys)
+
+    Simulation.run_vanilla(args, root, test_sys, FutureClass)
diff --git a/src/arch/riscv/isa/vector/base/vector_mem.temp.isa b/src/arch/riscv/isa/vector/base/vector_mem.temp.isa
@@ -1,5 +1,24 @@
 output header {{
 
+#define FILL_AGNOSTIC_ELEM(Vd_bytes, elem_idx, eewb_) \
+    std::fill_n((Vd_bytes) + (elem_idx) * (eewb_), (eewb_), 0xff)
+
+#define APPLY_VLOAD_AGNOSTIC(Vd_bytes, elem_num_per_vreg_, eewb_)            \
+    do {                                                                     \
+        for (size_t _i = 0; _i < vmi.re - vmi.rs; ++_i) {                    \
+            const uint32_t _vdElemIdx =                                      \
+                (vmi.rs % (elem_num_per_vreg_)) + _i;                        \
+            const size_t _ei = _i + vmi.rs;                                  \
+            const bool _is_tail = _ei >= rVl;                                \
+            const bool _is_masked = !this->vm && !_is_tail &&                \
+                !elem_mask(v0, _ei);                                         \
+            if ((_is_tail && machInst.vtype8.vta) ||                         \
+                (_is_masked && machInst.vtype8.vma)) {                       \
+                FILL_AGNOSTIC_ELEM((Vd_bytes), _vdElemIdx, (eewb_));         \
+            }                                                                \
+        }                                                                    \
+    } while (0)
+
 inline uint32_t
 calc_memsize(uint32_t rs, uint32_t re, uint32_t sew, uint32_t rVl) {
     uint32_t vend = std::min(rVl, re);
@@ -147,6 +166,7 @@ Fault
 {
     %(op_decl)s;
     %(op_rd)s;
+    auto VdBytes = tmp_d0.as<uint8_t>();
 
     Addr EA;
     // EA = Rs1 + vmi.offset;
@@ -172,6 +192,8 @@ Fault
         %(memacc_code)s;
     }
 
+    APPLY_VLOAD_AGNOSTIC(VdBytes, elem_num_per_vreg, eew / 8);
+
     %(op_wb)s;
     return fault;
 }
@@ -261,6 +283,7 @@ Fault
 
     %(op_decl)s;
     %(op_rd)s;
+    auto VdBytes = tmp_d0.as<uint8_t>();
 
 #if %(is_vecWhole)s
     // VM_REQUIRED();
@@ -299,6 +322,11 @@ Fault
         }
     }
 
+#if %(is_vecWhole)s
+#else
+    APPLY_VLOAD_AGNOSTIC(VdBytes, elem_num_per_vreg, eewb);
+#endif
+
     %(vfof_get_code)s;
     %(op_wb)s;
     return NoFault;

diff --git a/src/arch/riscv/isa/vector/simple/vector_mem.temp.isa b/src/arch/riscv/isa/vector/simple/vector_mem.temp.isa
@@ -1,5 +1,24 @@
 output header {{
 
+#define FILL_AGNOSTIC_ELEM(Vd_bytes, elem_idx, eewb_) \
+    std::fill_n((Vd_bytes) + (elem_idx) * (eewb_), (eewb_), 0xff)
+
+#define APPLY_VLOAD_AGNOSTIC(Vd_bytes, elem_num_per_vreg_, eewb_)            \
+    do {                                                                     \
+        for (size_t _i = 0; _i < vmi.re - vmi.rs; ++_i) {                    \
+            const uint32_t _vdElemIdx =                                      \
+                (vmi.rs % (elem_num_per_vreg_)) + _i;                        \
+            const size_t _ei = _i + vmi.rs;                                  \
+            const bool _is_tail = _ei >= rVl;                                \
+            const bool _is_masked = !this->vm && !_is_tail &&                \
+                !elem_mask(v0, _ei);                                         \
+            if ((_is_tail && machInst.vtype8.vta) ||                         \
+                (_is_masked && machInst.vtype8.vma)) {                       \
+                FILL_AGNOSTIC_ELEM((Vd_bytes), _vdElemIdx, (eewb_));         \
+            }                                                                \
+        }                                                                    \
+    } while (0)
+
 inline uint32_t
 calc_memsize(uint32_t rs, uint32_t re, uint32_t sew, uint32_t rVl) {
     uint32_t vend = std::min(rVl, re);
@@ -147,6 +166,7 @@ Fault
 {
     %(op_decl)s;
     %(op_rd)s;
+    auto VdBytes = tmp_d0.as<uint8_t>();
 
     Addr EA;
     // EA = Rs1 + vmi.offset;
@@ -172,6 +192,8 @@ Fault
         %(memacc_code)s;
     }
 
+    APPLY_VLOAD_AGNOSTIC(VdBytes, elem_num_per_vreg, eew / 8);
+
     %(op_wb)s;
     return fault;
 }
@@ -261,6 +283,7 @@ Fault
 
     %(op_decl)s;
     %(op_rd)s;
+    auto VdBytes = tmp_d0.as<uint8_t>();
 
 #if %(is_vecWhole)s
     // VM_REQUIRED();
@@ -299,6 +322,11 @@ Fault
         }
     }
 
+#if %(is_vecWhole)s
+#else
+    APPLY_VLOAD_AGNOSTIC(VdBytes, elem_num_per_vreg, eewb);
+#endif
+
     %(vfof_get_code)s;
     %(op_wb)s;
     return NoFault;

diff --git a/src/arch/riscv/tlb.cc b/src/arch/riscv/tlb.cc
@@ -2146,23 +2146,89 @@ TLB::doTranslate(const RequestPtr &req, ThreadContext *tc,
 
     return NoFault;
 }
 
+/**
+ * Privilege mode that applies to an access of type @p mode according to
+ * the thread's CSR state at the time of the call.
+ *
+ * Loads and stores take mstatus.MPP instead of the current privilege
+ * level while mstatus.MPRV is set; instruction fetches never do.
+ */
 PrivilegeMode
-TLB::getMemPriv(ThreadContext *tc, BaseMMU::Mode mode)
+TLB::currentMemPriv(ThreadContext *tc, BaseMMU::Mode mode)
 {
-    if (use_old_priv && mode != BaseMMU::Execute) {
-        if (mode == BaseMMU::Execute) {
-            return old_priv_ex;
-        } else {
-            return old_priv_ldst;
-        }
-    }
     STATUS status = (STATUS)tc->readMiscReg(MISCREG_STATUS);
     PrivilegeMode pmode = (PrivilegeMode)tc->readMiscReg(MISCREG_PRV);
     if (mode != BaseMMU::Execute && status.mprv == 1)
         pmode = (PrivilegeMode)(RegVal)status.mpp;
     return pmode;
 }
+
+/**
+ * Privilege mode used to translate an access of type @p mode.
+ *
+ * While a snapshot taken by setOldPriv() is valid for the requesting
+ * thread, loads and stores use that snapshot. Instruction fetches,
+ * threads without a valid snapshot, and contexts whose threadId() is
+ * negative fall back to currentMemPriv().
+ */
+PrivilegeMode
+TLB::getMemPriv(ThreadContext *tc, BaseMMU::Mode mode)
+{
+    if (mode != BaseMMU::Execute) {
+        const int tid = tc->threadId();
+        if (tid >= 0) {
+            const auto thread_idx = static_cast<size_t>(tid);
+            if (thread_idx < oldPrivByThread.size() &&
+                oldPrivByThread[thread_idx].valid) {
+                return oldPrivByThread[thread_idx].ldst;
+            }
+        }
+    }
+    return currentMemPriv(tc, mode);
+}
+
+/**
+ * Snapshot the load/store privilege currently in effect for @p tc's
+ * thread; getMemPriv() returns it for data accesses until useNewPriv()
+ * invalidates it. The per-thread table grows on demand.
+ */
+void
+TLB::setOldPriv(ThreadContext *tc)
+{
+    const int tid = tc->threadId();
+    assert(tid >= 0);
+    if (tid < 0) {
+        // assert() is compiled out under NDEBUG; converting a negative id
+        // to size_t would wrap and index far outside the table.
+        return;
+    }
+    const auto thread_idx = static_cast<size_t>(tid);
+    if (oldPrivByThread.size() <= thread_idx) {
+        oldPrivByThread.resize(thread_idx + 1);
+    }
+    oldPrivByThread[thread_idx].valid = true;
+    oldPrivByThread[thread_idx].ldst = currentMemPriv(tc, BaseMMU::Read);
+}
+
+/**
+ * Invalidate the snapshot taken by setOldPriv() for @p tc's thread so
+ * that getMemPriv() follows the live CSR state again. No-op when the
+ * thread has no table entry or reports a negative thread id.
+ */
+void
+TLB::useNewPriv(ThreadContext *tc)
+{
+    const int tid = tc->threadId();
+    if (tid < 0) {
+        return;
+    }
+    const auto thread_idx = static_cast<size_t>(tid);
+    if (thread_idx < oldPrivByThread.size()) {
+        oldPrivByThread[thread_idx].valid = false;
+    }
+}
+
 bool
 TLB::hasTwoStageTranslation(ThreadContext *tc, const RequestPtr &req, BaseMMU::Mode mode)
 {