Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions configs/common/CacheConfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,28 @@ def _get_cache_opts(cpu, level, options):

return opts

def apply_matrix_timing_options(cpu, options):
    """Copy user-supplied matrix timing knobs from ``options`` onto ``cpu``'s ISAs.

    Args:
        cpu: Object whose optional ``isa`` attribute is an iterable of ISA
            objects. A missing or ``None`` ``isa`` is treated as "nothing
            to configure".
        options: Parsed command-line options. Each knob is read by its
            attribute name; a missing attribute or a value of ``None``
            means "not specified" and leaves the ISA value untouched.

    Returns:
        None. Matching attributes on each ISA object are updated in place.
        A knob is only written to ISA objects that already expose an
        attribute of the same name.
    """
    timing_attrs = (
        "matrix_issue_interval_cycles",
        "matrix_load_base_cycles",
        "matrix_store_base_cycles",
        "matrix_zero_cycles",
        "matrix_compute_base_cycles",
        "matrix_compute_read_cycles",
        "matrix_release_cycles",
        "matrix_local_mmu_issue_per_cycle",
        "matrix_local_mmu_arb_cycles",
        "matrix_l2_request_pipeline_cycles",
        "matrix_l2_response_pipeline_cycles",
        "matrix_local_mmu_read_latency_cycles",
        "matrix_local_mmu_write_ack_latency_cycles",
    )

    # The requested values do not depend on the ISA, so gather them once
    # instead of re-reading `options` for every ISA object.
    requested = {attr: getattr(options, attr, None) for attr in timing_attrs}
    # Compare against None explicitly: 0 is a legitimate override value.
    overrides = {
        attr: value for attr, value in requested.items() if value is not None
    }
    if not overrides:
        return

    isas = getattr(cpu, "isa", None)
    if isas is None:
        return

    for isa in isas:
        for attr, value in overrides.items():
            if hasattr(isa, attr):
                setattr(isa, attr, value)

def config_classic_l2(options, system, l2_cache_class):
# When using the classic L2 cache, the prefetcher lives inside the l2cache instead of the l2Wrapper,
# so we need to move the prefetcher from the l2Wrapper to the l2cache.
Expand Down Expand Up @@ -298,6 +320,8 @@ def config_cache(options, system):
system.memchecker = MemChecker()

for i in range(options.num_cpus):
apply_matrix_timing_options(system.cpu[i], options)

if options.caches:
icache = icache_class(**_get_cache_opts(system.cpu[i], 'l1i', options))
dcache = dcache_class(**_get_cache_opts(system.cpu[i], 'l1d', options))
Expand Down
37 changes: 37 additions & 0 deletions configs/common/Options.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,43 @@ def addCommonOptions(parser, configure_xiangshan=False):
parser.add_argument("--ideal-kmhv3", action= "store_true",
help="Use KunminghuV3 ideal params, which take priority over command-line arguments.")

# Coarse CUTE matrix timing knobs. These are intentionally behavior-level
# controls for sensitivity studies, not cycle-accurate RTL stage controls.
parser.add_argument("--matrix-issue-interval-cycles", type=int,
default=None,
help="CUTE matrix task issue interval in CPU cycles")
parser.add_argument("--matrix-load-base-cycles", type=int, default=None,
help="CUTE matrix load base latency in CPU cycles")
parser.add_argument("--matrix-store-base-cycles", type=int, default=None,
help="CUTE matrix store base latency in CPU cycles")
parser.add_argument("--matrix-zero-cycles", type=int, default=None,
help="CUTE matrix zero latency in CPU cycles")
parser.add_argument("--matrix-compute-base-cycles", type=int,
default=None,
help="Fixed CUTE matrix compute ready latency in CPU cycles")
parser.add_argument("--matrix-compute-read-cycles", type=int,
default=None,
help="Fixed CUTE matrix compute source read latency in CPU cycles")
parser.add_argument("--matrix-release-cycles", type=int, default=None,
help="CUTE matrix release latency in CPU cycles")
parser.add_argument("--matrix-local-mmu-issue-per-cycle", type=int,
default=None,
help="CUTE LocalMMU request issue throughput per CPU cycle")
parser.add_argument("--matrix-local-mmu-arb-cycles", type=int,
default=None,
help="CUTE LocalMMU arbitration latency in CPU cycles")
parser.add_argument("--matrix-l2-request-pipeline-cycles", type=int,
default=None,
help="CUTE-to-L2 request pipeline latency in CPU cycles")
parser.add_argument("--matrix-l2-response-pipeline-cycles", type=int,
default=None,
help="CUTE L2 response service interval in CPU cycles")
parser.add_argument("--matrix-local-mmu-read-latency-cycles", type=int,
default=None,
help="CUTE LocalMMU read response latency in CPU cycles")
parser.add_argument("--matrix-local-mmu-write-ack-latency-cycles",
type=int, default=None,
help="CUTE LocalMMU write acknowledgement latency in CPU cycles")

# for warmup without switching cpu
parser.add_argument("--warmup-insts-no-switch", action="store", type=int,
Expand Down
189 changes: 189 additions & 0 deletions configs/example/ai_idealkmhv3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
import argparse
import os
import sys

import m5
from m5.defines import buildEnv
from m5.objects import *
from m5.util import addToPath, fatal, warn
from m5.util.fdthelper import *

addToPath('../')

from ruby import Ruby
from common.LSQBankConflict import set_lsq_bank_conflict_cache_params

from common.FSConfig import *
from common.SysPaths import *
from common.Benchmarks import *
from common import Simulation
from common.Caches import *
from common.xiangshan import *

from m5.objects.ValuePredictor import *


# Fallback values for the coarse CUTE matrix timing knobs. Each key is the
# attribute name of the corresponding --matrix-* option on the parsed args;
# setAiMatrixTimingDefaults() applies an entry only when that attribute is
# still unset (None).
AI_MATRIX_TIMING_DEFAULTS = dict(
    matrix_issue_interval_cycles=1,
    matrix_load_base_cycles=4,
    matrix_store_base_cycles=4,
    matrix_zero_cycles=1,
    matrix_compute_base_cycles=2,
    matrix_compute_read_cycles=1,
    matrix_release_cycles=1,
    matrix_local_mmu_issue_per_cycle=1,
    matrix_local_mmu_arb_cycles=1,
    matrix_l2_request_pipeline_cycles=1,
    matrix_l2_response_pipeline_cycles=1,
    matrix_local_mmu_read_latency_cycles=20,
    matrix_local_mmu_write_ack_latency_cycles=12,
)


def setAiMatrixTimingDefaults(args):
    """Fill in every matrix timing knob that is still unset on ``args``.

    A knob counts as unset when the attribute is missing from ``args`` or is
    ``None``. Values already present (for example from a --matrix-* option
    on the command line) are left alone, so calibration sweeps keep priority
    over these coarse defaults. ``args`` is modified in place.
    """
    for knob in AI_MATRIX_TIMING_DEFAULTS:
        if getattr(args, knob, None) is not None:
            # Already provided; do not clobber it.
            continue
        setattr(args, knob, AI_MATRIX_TIMING_DEFAULTS[knob])


def setAiKmhV3IdealParams(args, system):
# Overwrite CPU, L1/L2/L3 cache and L2-bus parameters on an already-built
# system with the fixed "ideal KMHV3" values listed below.
#   args   -- parsed options; this function reads bp_type, caches, l2cache,
#             classic_l2, num_cpus, l2_slices and l3cache from it.
#   system -- the built system; its cpu list, l2_caches / l2_wrappers,
#             tol2bus_list and l3 are modified in place.
# Returns nothing. The caller in this file invokes it after the system has
# been built, so these assignments win over values configured earlier.
for cpu in system.cpu:

# fetch
cpu.mmu.itb.size = 96
cpu.fetchWidth = 32
cpu.iewToFetchDelay = 2 # for resolved update, should train branch after squash
cpu.commitToFetchDelay = 2
cpu.fetchQueueSize = 64

# decode
cpu.fetchToDecodeDelay = 5
cpu.decodeWidth = 8
cpu.enable_loadFusion = False
cpu.enableConstantFolding = False

# rename
cpu.renameWidth = 8
cpu.numPhysIntRegs = 224
cpu.numPhysFloatRegs = 256

# dispatch
cpu.enableDispatchStage = False
cpu.numDQEntries = [8, 8, 8]
cpu.dispWidth = [8, 8, 8]

# scheduler
cpu.scheduler = KMHV3Scheduler()

# rob
cpu.commitWidth = 12
cpu.squashWidth = 12
cpu.phyregReleaseWidth = 8
cpu.RobCompressPolicy = 'kmhv3'
cpu.numROBEntries = 160
cpu.CROB_instPerGroup = 2 # 1 if not using ROB compression

# lsu
cpu.StoreWbStage = 4
cpu.EnableLdMissReplay = True
cpu.EnablePipeNukeCheck = True
cpu.BankConflictCheck = True
cpu.sbufferBankWriteAccurately = True
cpu.DcacheSetDivNum = 2

# value predictor
cpu.valuePred = IdealConstantLVP()

# lsq
cpu.LQEntries = 128
cpu.SQEntries = 64
cpu.RARQEntries = 96
cpu.RAWQEntries = 56
cpu.LoadCompletionWidth = 8
cpu.StoreCompletionWidth = 4
cpu.RARDequeuePerCycle = 4
cpu.RAWDequeuePerCycle = 4
cpu.SbufferEntries = 24
cpu.SbufferEvictThreshold = 16
cpu.store_prefetch_train = False

# branch predictor
# Only the decoupled BTB-based predictor is tuned here; other bp_type
# values keep whatever predictor settings they already have.
if args.bp_type == 'DecoupledBPUWithBTB':
cpu.branchPred.ftq_size = 64
cpu.branchPred.fsq_size = 64
# TAGE table sizes and numWays tuning
cpu.branchPred.tage.tableSizes = [2048, 2048, 8192, 8192, 8192, 8192, 8192, 2048]
cpu.branchPred.tage.numWays = [2, 2, 4, 2, 2, 2, 2, 2]
# cpu.branchPred.microtage.enabled = False

# l1 cache per core
if args.caches:
cpu.icache.size = '64kB'
cpu.dcache.size = '64kB'
cpu.dcache.tag_load_read_ports = 100
cpu.dcache.mshrs = 16
cpu.dcache.simulate_dcache_refill = True
set_lsq_bank_conflict_cache_params(cpu, system)

# l2 caches
# Indexed per CPU: entry i of l2_caches / l2_wrappers / tol2bus_list is
# configured for each i in range(args.num_cpus).
if args.l2cache:
for i in range(args.num_cpus):
if args.classic_l2:
system.l2_caches[i].slice_num = 0 # 4 -> 0, no slice
else:
l2_wrapper = system.l2_wrappers[i]
l2_wrapper.data_sram_banks = 2
l2_wrapper.dir_sram_banks = 2
l2_wrapper.pipe_dir_write_stage = 4
l2_wrapper.dir_read_bypass = True
for j in range(args.l2_slices):
# Configure XSDRRIP replacement policy (DRRIP mode)
# Each slice: 2MB/4 = 512KB, 8-way, 64B line -> 1024 sets
# NOTE(review): num_sets=1024 is hard-coded from the 2MB / 4-slice
# geometry above while the loop runs over args.l2_slices; presumably
# it must be revisited if l2_size or l2_slices change -- confirm.
l2_wrapper.slices[j].inner_cache.replacement_policy = XSDRRIPRP(mode=2, num_sets=1024)
system.tol2bus_list[i].forward_latency = 0 # 3->0
system.tol2bus_list[i].response_latency = 0 # 3->0
system.tol2bus_list[i].hint_wakeup_ahead_cycles = 0 # 2->0

# ReqLayer[0]: ICache+DCache+ITB+DTB -> L2, allow 2 requests per cycle
# RespLayer[1]: L2 -> DCache, allow 2 responses per cycle
system.tol2bus_list[i].layer_bandwidth_configs = [
LayerBandwidthConfig(direction="req", port_index=0, max_per_cycle=2),
LayerBandwidthConfig(direction="resp", port_index=1, max_per_cycle=2),
]

# l3 cache
if args.l3cache:
system.l3.mshrs = 128


if __name__ == '__m5_main__':
    # Runs only when this module is executed under the name '__m5_main__'.
    FutureClass = None

    args = xiangshan_system_init()

    assert not args.external_memory_system

    # AI performance runs share the ideal KMHV3 CPU/cache envelope used by
    # idealkmhv3.py; these options are pinned regardless of the command line.
    for _option, _pinned in (
        ('bp_type', 'DecoupledBPUWithBTB'),
        ('l2_size', '2MB'),
        ('l3_size', '32MB'),
        ('enable_pf_buffer', False),
        ('enable_riscv_vector', True),
    ):
        setattr(args, _option, _pinned)
    # On top of that envelope, add the explicit coarse CUTE timing defaults.
    setAiMatrixTimingDefaults(args)

    # Pick the memory class that matches the options for the test system.
    TestMemClass = Simulation.setMemClass(args)

    test_sys = build_xiangshan_system(args)
    if args.raw_cpt and args.generic_rv_cpt:
        # Only a raw checkpoint whose image file is named linux.bin gets the
        # Linux workload configuration.
        if os.path.basename(args.generic_rv_cpt) == "linux.bin":
            configure_xiangshan_linux_workload(test_sys, args)

    # Applied last so the ideal parameters take priority over anything set
    # from command-line arguments.
    setAiKmhV3IdealParams(args, test_sys)

    root = Root(full_system=True, system=test_sys)

    Simulation.run_vanilla(args, root, test_sys, FutureClass)
25 changes: 22 additions & 3 deletions configs/example/se.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,23 @@ def get_processes(args):
if '--ruby' in sys.argv:
Ruby.define_options(parser)

def setDefaultArgs(args):
def explicitOptionDests(parser, argv):
    """Return the argparse ``dest`` names of options typed on the command line.

    This lets callers tell "the user passed this option" apart from "the
    option merely has its parser default", which ``parse_args`` alone does
    not reveal.

    Recognised spellings, mirroring what argparse itself accepts:
      * exact option strings, with or without ``=value`` (``--num-cpus=4``);
      * unambiguous long-option prefixes (``--num-c 4``), unless the parser
        was created with ``allow_abbrev=False``;
      * short options with an attached value (``-n4``).

    Args:
        parser: The ``argparse.ArgumentParser`` the options were registered on.
        argv: Full argument vector; ``argv[0]`` (the script name) is skipped
            and scanning stops at a literal ``--``.

    Returns:
        A ``set`` of ``dest`` strings, one per option found in ``argv``.

    Note:
        This is a lexical scan, not a re-parse: an option *value* that
        itself looks like a registered option is also reported.
    """
    # argparse offers no public way to enumerate registered option strings,
    # so the private ``_actions`` list is read here.
    option_to_dest = {
        option: action.dest
        for action in parser._actions
        for option in action.option_strings
    }
    allow_abbrev = getattr(parser, 'allow_abbrev', True)

    explicit = set()
    for arg in argv[1:]:
        if arg == '--':
            break
        if not arg.startswith('-'):
            continue

        option = arg.split('=', 1)[0]
        dest = option_to_dest.get(option)

        if dest is None and option.startswith('--'):
            # Long-option abbreviation: accept it only when every matching
            # option string maps to one and the same dest.
            if allow_abbrev and len(option) > 2:
                candidates = {
                    known_dest
                    for known, known_dest in option_to_dest.items()
                    if known.startswith(option)
                }
                if len(candidates) == 1:
                    dest = candidates.pop()
        elif dest is None and len(arg) > 2:
            # Short option with its value glued on, e.g. ``-n4``.
            dest = option_to_dest.get(arg[:2])

        if dest is not None:
            explicit.add(dest)
    return explicit

def setDefaultArgs(args, explicit_options):
"""Set default configurations to match xiangshan.py SE mode defaults"""

# Set defaults only if not already specified by user
Expand Down Expand Up @@ -154,7 +170,10 @@ def setDefaultArgs(args):
} # default warmup 100k instructions!

for key, value in defaults.items():
# if not hasattr(args, key) or getattr(args, key) is None:
if key in explicit_options:
continue
if key == 'l3cache' and 'no_l3cache' in explicit_options:
continue
setattr(args, key, value)

# Set dramsim3_ini path
Expand All @@ -165,7 +184,7 @@ def setDefaultArgs(args):
args = parser.parse_args()

# Set default configurations
setDefaultArgs(args)
setDefaultArgs(args, explicitOptionDests(parser, sys.argv))

multiprocesses = []
numThreads = 1
Expand Down
Loading
Loading