From 2bb62577fc7cf8330c51f2241aae89f994f7c391 Mon Sep 17 00:00:00 2001 From: Yingqi Cao Date: Tue, 13 Jan 2026 18:43:03 +0000 Subject: [PATCH 01/22] Configure the RAID1 member disks in qemu, and initialize disk selection policy when starting RAID. --- OSDK.toml | 2 - .../comps/virtio/src/device/block/device.rs | 2 +- kernel/src/fs/mod.rs | 44 +++++++++++++++---- test/Makefile | 11 ++--- test/apps/test_common.mk | 2 +- tools/qemu_args.sh | 6 ++- 6 files changed, 45 insertions(+), 22 deletions(-) diff --git a/OSDK.toml b/OSDK.toml index 5caf2ff00..2687e3257 100644 --- a/OSDK.toml +++ b/OSDK.toml @@ -62,8 +62,6 @@ qemu.args = """\ -chardev stdio,id=mux,mux=on,signal=off,logfile=qemu.log \ -drive if=none,format=raw,id=x0,file=./test/build/ext2.img \ -drive if=none,format=raw,id=x1,file=./test/build/exfat.img \ - -drive if=none,format=raw,id=r0,file=./test/build/raid1_0.img \ - -drive if=none,format=raw,id=r1,file=./test/build/raid1_1.img \ -device virtio-blk-device,drive=x0 \ -device virtio-keyboard-device \ -device virtio-serial-device \ diff --git a/kernel/comps/virtio/src/device/block/device.rs b/kernel/comps/virtio/src/device/block/device.rs index 65303b7dd..455965e79 100644 --- a/kernel/comps/virtio/src/device/block/device.rs +++ b/kernel/comps/virtio/src/device/block/device.rs @@ -132,7 +132,7 @@ impl BlockDevice { /// processes the request. 
pub fn handle_requests(&self) { let request = self.queue.dequeue(); - info!("Handle Request: {:?}", request); + // info!("Handle Request: {:?}", request); match request.type_() { BioType::Read => self.device.read(request), BioType::Write => self.device.write(request), diff --git a/kernel/src/fs/mod.rs b/kernel/src/fs/mod.rs index 46e276d2c..e70b573fb 100644 --- a/kernel/src/fs/mod.rs +++ b/kernel/src/fs/mod.rs @@ -82,20 +82,24 @@ pub fn lazy_init() { } if let Some(raid) = aster_block::get_device(raid1_device_name) { - let raid_fs = Ext2::open(raid).unwrap(); - let target_path = FsPath::try_from("/raid1").unwrap(); - if let Err(err) = self::rootfs::mount_fs_at(raid_fs, &target_path) { - error!("[raid] failed to mount RAID-1 at /raid1: {:?}", err); + + match Ext2::open(raid) { + Ok(raid_fs) => { + let target_path = FsPath::try_from("/raid1").unwrap(); + self::rootfs::mount_fs_at(raid_fs, &target_path).unwrap(); + info!("[kernel] Mounted RAID-1 at {:?} ", target_path); + } + Err(err) => { + error!("[raid] failed to mount RAID-1 at /raid1: {:?}", err); + } } - info!("[kernel] Mounted RAID-1 at {:?} ", target_path); } else { error!("[raid] failed to get RAID-1 device: {:?}", Errno::ENOENT); } } fn setup_raid1_device(raid_device_name: &str) -> Result<()> { - const RAID_MEMBER_NAMES: &[&str] = &["raid0", "raid1"]; - // const RAID_MEMBER_NAMES: &[&str] = &["raid0"]; + const RAID_MEMBER_NAMES: &[&str] = &["raid0", "raid1", "raid2"]; info!( "[raid] initializing RAID-1 '{}' with members {:?}", raid_device_name, RAID_MEMBER_NAMES @@ -119,7 +123,6 @@ fn setup_raid1_device(raid_device_name: &str) -> Result<()> { } } } - #[cfg(not(baseline_asterinas))] info!("[raid] creating selection policy"); #[cfg(not(baseline_asterinas))] @@ -147,6 +150,31 @@ fn setup_raid1_device(raid_device_name: &str) -> Result<()> { } }; + // early stop for testing + // Ok(()); + + info!("[raid] creating selection policy"); + let selection_policy = RoundRobinPolicy::new(members.clone()).unwrap(); + + 
Raid1Device::init(raid_device_name, members, selection_policy).map_err(|err| match err { + Raid1DeviceError::NotEnoughMembers => { + Error::with_message(Errno::EINVAL, "RAID-1 device requires at least two members") + } + })?; + info!("[raid] RAID-1 device created"); + + let worker = aster_block::get_device(raid_device_name).unwrap(); + // The registry stores `Arc`. Use `downcast_ref` on the captured Arc each + // iteration to call the RAID-specific helper without needing ownership of `Raid1Device`. + // TODO(Yingqi): Merge the starting of the RAID-1 thread inside block device server. + let task_fn = move || { + info!("spawn the RAID-1 device thread"); + let raid = worker.downcast_ref::().unwrap(); + loop { + raid.handle_requests(); + } + }; + crate::ThreadOptions::new(task_fn).spawn(); info!( diff --git a/test/Makefile b/test/Makefile index cf0c9d15d..b667f1f4c 100644 --- a/test/Makefile +++ b/test/Makefile @@ -23,8 +23,6 @@ INITRAMFS_IMAGE := $(BUILD_DIR)/initramfs.cpio.gz endif EXT2_IMAGE := $(BUILD_DIR)/ext2.img EXFAT_IMAGE := $(BUILD_DIR)/exfat.img -RAID1_IMAGE0 := $(BUILD_DIR)/raid1_0.img -RAID1_IMAGE1 := $(BUILD_DIR)/raid1_1.img INITRAMFS_EMPTY_DIRS := \ $(INITRAMFS)/root \ @@ -39,6 +37,7 @@ INITRAMFS_EMPTY_DIRS := \ $(INITRAMFS)/.ssh \ $(INITRAMFS)/etc/dropbear + INITRAMFS_ALL_DIRS := \ $(INITRAMFS)/etc \ $(INITRAMFS)/lib/x86_64-linux-gnu \ @@ -243,20 +242,16 @@ $(INITRAMFS)/service: @cp $(CUR_DIR)/service/* $@ $(EXT2_IMAGE): - @dd if=/dev/zero of=$(EXT2_IMAGE) bs=2G count=1 + @dd if=/dev/zero of=$(EXT2_IMAGE) bs=1G count=2 @mke2fs $(EXT2_IMAGE) $(EXFAT_IMAGE): @dd if=/dev/zero of=$(EXFAT_IMAGE) bs=64M count=1 @mkfs.exfat $(EXFAT_IMAGE) -$(RAID1_IMAGE0) $(RAID1_IMAGE1): - @dd if=/dev/zero of=$(RAID1_IMAGE0) bs=128M count=1 - @mke2fs $(RAID1_IMAGE0) - @dd if=$(RAID1_IMAGE0) of=$(RAID1_IMAGE1) .PHONY: build -build: $(INITRAMFS_IMAGE) $(EXT2_IMAGE) $(EXFAT_IMAGE) $(RAID1_IMAGE0) $(RAID1_IMAGE1) +build: $(INITRAMFS_IMAGE) $(EXT2_IMAGE) $(EXFAT_IMAGE) .PHONY: 
format format: diff --git a/test/apps/test_common.mk b/test/apps/test_common.mk index 64afa40d8..f1b337233 100644 --- a/test/apps/test_common.mk +++ b/test/apps/test_common.mk @@ -13,7 +13,7 @@ C_DEPS := $(addprefix $(DEP_OUTPUT_DIR)/,$(C_SRCS:%.c=%.d)) ASM_SRCS := $(wildcard *.S) ASM_OBJS := $(addprefix $(OBJ_OUTPUT_DIR)/,$(ASM_SRCS:%.S=%)) CC := gcc -C_FLAGS := -Wall -Werror +# C_FLAGS := -Wall -Werror .PHONY: all all: $(C_OBJS) $(ASM_OBJS) diff --git a/tools/qemu_args.sh b/tools/qemu_args.sh index 3afa64af1..403589bcf 100755 --- a/tools/qemu_args.sh +++ b/tools/qemu_args.sh @@ -89,8 +89,9 @@ COMMON_QEMU_ARGS="\ -device isa-debug-exit,iobase=0xf4,iosize=0x04 \ -drive if=none,format=raw,id=x0,file=./test/build/ext2.img \ -drive if=none,format=raw,id=x1,file=./test/build/exfat.img \ - -drive if=none,format=raw,id=r0,file=./test/build/raid1_0.img,cache=directsync \ - -drive if=none,format=raw,id=r1,file=./test/build/raid1_1.img,cache=directsync \ + -drive if=none,format=raw,id=r0,file=/dev/nvme0n1p1 \ + -drive if=none,format=raw,id=r1,file=/dev/nvme1n1p1 \ + -drive if=none,format=raw,id=r2,file=/dev/nvme2n1p1 \ " if [ "$1" = "iommu" ]; then @@ -113,6 +114,7 @@ QEMU_ARGS="\ -device virtio-blk-pci,bus=pcie.0,addr=0x7,drive=x1,serial=vexfat,disable-legacy=on,disable-modern=off,queue-size=64,num-queues=1,request-merging=off,backend_defaults=off,discard=off,write-zeroes=off,event_idx=off,indirect_desc=off,queue_reset=off$IOMMU_DEV_EXTRA \ -device virtio-blk-pci,bus=pcie.0,addr=0x8,drive=r0,serial=raid0,disable-legacy=on,disable-modern=off,queue-size=64,num-queues=1,request-merging=off,backend_defaults=off,discard=off,write-zeroes=off,event_idx=off,indirect_desc=off,queue_reset=off$IOMMU_DEV_EXTRA \ -device virtio-blk-pci,bus=pcie.0,addr=0x9,drive=r1,serial=raid1,disable-legacy=on,disable-modern=off,queue-size=64,num-queues=1,request-merging=off,backend_defaults=off,discard=off,write-zeroes=off,event_idx=off,indirect_desc=off,queue_reset=off$IOMMU_DEV_EXTRA \ + -device 
virtio-blk-pci,bus=pcie.0,addr=0xa,drive=r2,serial=raid2,disable-legacy=on,disable-modern=off,queue-size=64,num-queues=1,request-merging=off,backend_defaults=off,discard=off,write-zeroes=off,event_idx=off,indirect_desc=off,queue_reset=off$IOMMU_DEV_EXTRA \ -device virtio-net-pci,netdev=net01,disable-legacy=on,disable-modern=off$VIRTIO_NET_FEATURES$IOMMU_DEV_EXTRA \ -device virtio-serial-pci,disable-legacy=on,disable-modern=off$IOMMU_DEV_EXTRA \ -device virtconsole,chardev=mux \ From ddcbdb5874f4a85796f5db027dd2d0b9a5e49074 Mon Sep 17 00:00:00 2001 From: Yingqi Cao Date: Mon, 9 Mar 2026 21:22:40 +0000 Subject: [PATCH 02/22] Change the RAID1 prcess_read to asynchronous implementation --- Cargo.lock | 1 + kernel/comps/raid/Cargo.toml | 1 + kernel/comps/raid/src/lib.rs | 7 ++++++- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index 7e8841f56..7a3afa39a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -262,6 +262,7 @@ name = "aster-raid" version = "0.1.0" dependencies = [ "aster-block", + "aster-time", "aster-virtio", "log", "ostd", diff --git a/kernel/comps/raid/Cargo.toml b/kernel/comps/raid/Cargo.toml index c2fcd8554..766b4b60d 100644 --- a/kernel/comps/raid/Cargo.toml +++ b/kernel/comps/raid/Cargo.toml @@ -9,6 +9,7 @@ edition = "2024" ostd = { path = "../../../ostd" } aster-block = { path = "../block" } aster-virtio = { path = "../virtio" } +aster-time = { path = "../time" } log = "0.4" [lints] diff --git a/kernel/comps/raid/src/lib.rs b/kernel/comps/raid/src/lib.rs index 2c1aa40af..0ba584c48 100644 --- a/kernel/comps/raid/src/lib.rs +++ b/kernel/comps/raid/src/lib.rs @@ -28,6 +28,7 @@ pub mod selection_policies; pub mod server_traits; use alloc::{borrow::ToOwned, sync::Arc, vec::Vec}; +use ostd::task::scheduler::info; use core::{cmp, ops::Range}; use aster_block::{ @@ -148,6 +149,7 @@ impl Raid1Device { /// Dispatches a request by type. 
The RAID-1 device accepts the same BIOs as /// any `BlockDevice` and applies RAID semantics underneath. fn process_request(&self, request: BioRequest) { + // log::info!("Raid1Device process request, type: {:?}", request.type_()); match request.type_() { BioType::Read => self.process_read_async(request), BioType::Write => self.process_write(request), @@ -176,7 +178,7 @@ impl Raid1Device { #[cfg(baseline_asterinas)] fn process_read(&self, request: BioRequest) { for parent in request.bios() { - let member = self.members[0].clone(); + let member = self.selection_policy.select_block_device().unwrap(); let child = Bio::new( BioType::Read, parent.sid_range().start, @@ -236,6 +238,7 @@ impl Raid1Device { } } + /// Processes read requests asynchronously. /// /// Each `SubmittedBio` in the merged `BioRequest` is assigned to a read @@ -280,6 +283,8 @@ impl Raid1Device { let status = self.fanout_to_members(parent, BioType::Write, || Self::clone_segments(parent)); parent.complete(status); + // let status = BioStatus::Complete; + // parent.complete(status); } } From 669f9289ac7631e7316cb9800c067b27389e7696 Mon Sep 17 00:00:00 2001 From: Yingqi Cao Date: Mon, 9 Mar 2026 21:38:09 +0000 Subject: [PATCH 03/22] Resolve the long RAID1 tail latency issue by setting the RAID1 module's thread to use realtime scheduler --- kernel/src/fs/mod.rs | 43 +++++++++++++++++-------------------------- tools/qemu_args.sh | 8 ++++---- 2 files changed, 21 insertions(+), 30 deletions(-) diff --git a/kernel/src/fs/mod.rs b/kernel/src/fs/mod.rs index e70b573fb..5fcd396eb 100644 --- a/kernel/src/fs/mod.rs +++ b/kernel/src/fs/mod.rs @@ -76,6 +76,18 @@ pub fn lazy_init() { // info!("[kernel] Mount ExFat fs at {:?} ", target_path); // } + // single disk benchmark + // let nvme_device_name = "raid0"; + // if let Ok(block_device_nvme) = start_block_device(nvme_device_name) { + // let nvme_fs = Ext2::open(block_device_nvme).unwrap(); + // let target_path = FsPath::try_from("/raid1").unwrap(); + // 
self::rootfs::mount_fs_at(nvme_fs, &target_path).unwrap(); + // info!("[kernel] Mounted NVMe fs at {:?} ", target_path); + // } else { + // error!("[kernel] Failed to start NVMe block device '{}'", nvme_device_name); + // } + // return; + info!("[raid] initializing RAID-1 device: {:?}", raid1_device_name); if let Err(err) = setup_raid1_device(raid1_device_name) { error!("[raid] failed to setup RAID-1 device: {:?}", err); @@ -123,6 +135,7 @@ fn setup_raid1_device(raid_device_name: &str) -> Result<()> { } } } + #[cfg(not(baseline_asterinas))] info!("[raid] creating selection policy"); #[cfg(not(baseline_asterinas))] @@ -150,32 +163,10 @@ fn setup_raid1_device(raid_device_name: &str) -> Result<()> { } }; - // early stop for testing - // Ok(()); - - info!("[raid] creating selection policy"); - let selection_policy = RoundRobinPolicy::new(members.clone()).unwrap(); - - Raid1Device::init(raid_device_name, members, selection_policy).map_err(|err| match err { - Raid1DeviceError::NotEnoughMembers => { - Error::with_message(Errno::EINVAL, "RAID-1 device requires at least two members") - } - })?; - info!("[raid] RAID-1 device created"); - - let worker = aster_block::get_device(raid_device_name).unwrap(); - // The registry stores `Arc`. Use `downcast_ref` on the captured Arc each - // iteration to call the RAID-specific helper without needing ownership of `Raid1Device`. - // TODO(Yingqi): Merge the starting of the RAID-1 thread inside block device server. 
- let task_fn = move || { - info!("spawn the RAID-1 device thread"); - let raid = worker.downcast_ref::().unwrap(); - loop { - raid.handle_requests(); - } - }; - - crate::ThreadOptions::new(task_fn).spawn(); + crate::ThreadOptions::new(task_fn).sched_policy(crate::sched::SchedPolicy::RealTime { + rt_prio: 50.try_into().unwrap(), + rt_policy: crate::sched::RealTimePolicy::RoundRobin { base_slice_factor: None }, + }).spawn(); info!( "[raid] RAID-1 device '{}' registered and worker thread spawned", diff --git a/tools/qemu_args.sh b/tools/qemu_args.sh index 403589bcf..88e9d9100 100755 --- a/tools/qemu_args.sh +++ b/tools/qemu_args.sh @@ -20,7 +20,7 @@ VHOST=${VHOST:-"off"} VSOCK=${VSOCK:-"off"} NETDEV=${NETDEV:-"user"} -SSH_RAND_PORT=${SSH_PORT:-22} +SSH_RAND_PORT=${SSH_PORT:-61541} NGINX_RAND_PORT=${NGINX_PORT:-8080} REDIS_RAND_PORT=${REDIS_PORT:-6379} IPERF_RAND_PORT=${IPERF_PORT:-5201} @@ -89,9 +89,9 @@ COMMON_QEMU_ARGS="\ -device isa-debug-exit,iobase=0xf4,iosize=0x04 \ -drive if=none,format=raw,id=x0,file=./test/build/ext2.img \ -drive if=none,format=raw,id=x1,file=./test/build/exfat.img \ - -drive if=none,format=raw,id=r0,file=/dev/nvme0n1p1 \ - -drive if=none,format=raw,id=r1,file=/dev/nvme1n1p1 \ - -drive if=none,format=raw,id=r2,file=/dev/nvme2n1p1 \ + -drive if=none,format=raw,id=r0,file=/dev/nvme0n1p1,cache=directsync \ + -drive if=none,format=raw,id=r1,file=/dev/nvme1n1p1,cache=directsync \ + -drive if=none,format=raw,id=r2,file=/dev/nvme2n1p1,cache=directsync \ " if [ "$1" = "iommu" ]; then From e22a1a12dcc5ac6b5ec072cac608584bcaf50226 Mon Sep 17 00:00:00 2001 From: Yingqi Cao Date: Tue, 31 Mar 2026 03:39:06 +0000 Subject: [PATCH 04/22] Update the RAID1 and VirtIO module to use the new OQueue API --- .../comps/virtio/src/device/block/device.rs | 31 +++++++++++++------ .../virtio/src/device/block/server_traits.rs | 17 +++++----- kernel/comps/virtio/src/device/mod.rs | 6 ++-- kernel/src/error.rs | 2 +- 4 files changed, 33 insertions(+), 23 deletions(-) diff 
--git a/kernel/comps/virtio/src/device/block/device.rs b/kernel/comps/virtio/src/device/block/device.rs index 455965e79..a7bea5a1e 100644 --- a/kernel/comps/virtio/src/device/block/device.rs +++ b/kernel/comps/virtio/src/device/block/device.rs @@ -2,13 +2,18 @@ use alloc::{ boxed::Box, - collections::BTreeMap, + collections::{BTreeMap, VecDeque}, string::{String, ToString}, sync::Arc, vec, vec::Vec, }; -use core::{fmt::Debug, hint::spin_loop, mem::size_of}; +use core::{ + fmt::Debug, + hint::spin_loop, + mem::size_of, + sync::atomic::{AtomicU64, Ordering}, +}; use aster_block::{ BlockDeviceMeta, @@ -23,7 +28,7 @@ use log::{debug, info}; #[cfg(not(baseline_asterinas))] use ostd::orpc::framework::spawn_thread; #[cfg(not(baseline_asterinas))] -use ostd::orpc::legacy_oqueue::{OQueueRef, Producer}; +use ostd::orpc::oqueue::{ConsumableOQueue as _, ConsumableOQueueRef, OQueue as _, OQueueRef}; #[cfg(not(baseline_asterinas))] use ostd::orpc::{orpc_impl, orpc_server}; use ostd::{ @@ -65,7 +70,7 @@ pub struct BlockDevice { #[cfg(not(baseline_asterinas))] #[orpc_impl] impl server_traits::BlockIOObservable for BlockDevice { - fn bio_submission_oqueue(&self) -> OQueueRef; + fn bio_submission_oqueue(&self) -> ConsumableOQueueRef; fn bio_completion_oqueue(&self) -> OQueueRef; } @@ -151,6 +156,11 @@ impl BlockDevice { pub fn submit(&self, bio: Bio) -> Result { bio.submit(self) } + + /// Sets the logical index for this device, used to tag I/O completion stats. 
+ pub fn set_device_index(&self, index: u64) { + self.device.device_index.store(index, Ordering::Relaxed); + } } #[cfg(baseline_asterinas)] @@ -170,12 +180,13 @@ impl aster_block::BlockDevice for BlockDevice { #[cfg(not(baseline_asterinas))] impl aster_block::BlockDevice for BlockDevice { fn enqueue(&self, bio: SubmittedBio) -> Result<(), BioEnqueueError> { - let reply_handle: Box> = - self.bio_completion_oqueue().attach_producer()?; + let reply_handle = self.bio_completion_oqueue().attach_ref_producer()?; let mut bio = bio; - bio.prepare_enqueue(reply_handle, self.queue.clone()); - self.bio_submission_oqueue().produce(bio)?; + let device_index = self.device.device_index.load(Ordering::Relaxed); + bio.prepare_enqueue(reply_handle, self.queue.clone(), device_index); + let producer = self.bio_submission_oqueue().attach_value_producer()?; + producer.produce(bio); Ok(()) } @@ -197,6 +208,7 @@ struct DeviceInner { block_responses: DmaStream, id_allocator: SpinLock, submitted_requests: SpinLock>, + device_index: AtomicU64, } impl DeviceInner { @@ -245,6 +257,7 @@ impl DeviceInner { block_responses, id_allocator: SpinLock::new(IdAlloc::with_capacity(Self::QUEUE_SIZE as usize)), submitted_requests: SpinLock::new(BTreeMap::new()), + device_index: AtomicU64::new(u64::MAX), }); let cloned_device = device.clone(); @@ -273,7 +286,7 @@ impl DeviceInner { /// Handles the irq issued from the device fn handle_irq(&self) { - info!("Virtio block device handle irq"); + // info!("Virtio block device handle irq"); // When we enter the IRQs handling function, // IRQs have already been disabled, // so there is no need to call `disable_irq`. 
diff --git a/kernel/comps/virtio/src/device/block/server_traits.rs b/kernel/comps/virtio/src/device/block/server_traits.rs index c72d89eba..393366f9b 100644 --- a/kernel/comps/virtio/src/device/block/server_traits.rs +++ b/kernel/comps/virtio/src/device/block/server_traits.rs @@ -3,10 +3,7 @@ use aster_block::bio::{BlockDeviceCompletionStats, SubmittedBio}; use ostd::orpc::{ errors::RPCError, - legacy_oqueue::{ - OQueueAttachError, OQueueRef, - locking::{LockingQueue, ObservableLockingQueue}, - }, + oqueue::{ConsumableOQueue as _, ConsumableOQueueRef, OQueue as _, OQueueError, OQueueRef}, orpc_trait, }; @@ -18,9 +15,9 @@ impl From for VirtioDeviceError { } } -impl From for VirtioDeviceError { - fn from(value: OQueueAttachError) -> Self { - VirtioDeviceError::OQueueAttachError(value) +impl From for VirtioDeviceError { + fn from(value: OQueueError) -> Self { + VirtioDeviceError::OQueueError(value) } } @@ -28,14 +25,14 @@ impl From for VirtioDeviceError { pub trait BlockIOObservable { /// The OQueue containing every bio submission request. /// The submission queue doesn't needed to be observable. - fn bio_submission_oqueue(&self) -> OQueueRef { - LockingQueue::new(32) + fn bio_submission_oqueue(&self) -> ConsumableOQueueRef { + ConsumableOQueueRef::new_anonymous(32) } /// The OQueue containing every write request. 
This includes both sync and async writes and any /// other write operations on other traits fn bio_completion_oqueue(&self) -> OQueueRef { - ObservableLockingQueue::new(32, 1) + OQueueRef::new_anonymous(4096) } } diff --git a/kernel/comps/virtio/src/device/mod.rs b/kernel/comps/virtio/src/device/mod.rs index 990af57b9..6e32da023 100644 --- a/kernel/comps/virtio/src/device/mod.rs +++ b/kernel/comps/virtio/src/device/mod.rs @@ -2,7 +2,7 @@ use int_to_c_enum::TryFromInt; #[cfg(not(baseline_asterinas))] -use ostd::orpc::{errors::RPCError, legacy_oqueue::OQueueAttachError}; +use ostd::orpc::{errors::RPCError, oqueue::OQueueError}; use crate::queue::QueueError; @@ -52,9 +52,9 @@ pub enum VirtioDeviceError { /// The ORPC Errors #[cfg(not(baseline_asterinas))] RPCError(RPCError), - /// The OQueue attachment errors + /// The OQueue errors #[cfg(not(baseline_asterinas))] - OQueueAttachError(OQueueAttachError), + OQueueError(OQueueError), } impl From for VirtioDeviceError { diff --git a/kernel/src/error.rs b/kernel/src/error.rs index 89a19d919..54a9e3538 100644 --- a/kernel/src/error.rs +++ b/kernel/src/error.rs @@ -497,7 +497,7 @@ impl From for Error { Error::with_message(Errno::EINVAL, "Bio is too big") } #[cfg(not(baseline_asterinas))] - aster_block::bio::BioEnqueueError::OQueueAttachError(err) => err.into(), + aster_block::bio::BioEnqueueError::OQueueError(err) => err.into(), } } } From 0d69f437aa6f2c3b832acf09aae9616144b1bc47 Mon Sep 17 00:00:00 2001 From: Yingqi Cao Date: Tue, 31 Mar 2026 03:45:55 +0000 Subject: [PATCH 05/22] Setup ORPC data capture for RAID1 IO data --- Cargo.lock | 2 + kernel/comps/block/Cargo.toml | 1 + kernel/comps/block/src/bio.rs | 57 +++++--- kernel/comps/mariposa_data_capture/Cargo.toml | 1 + kernel/src/fs/mod.rs | 125 +++++++++++++++++- tools/qemu_args.sh | 2 + 6 files changed, 164 insertions(+), 24 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7a3afa39a..edb5fb30f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -102,6 +102,7 @@ version = 
"0.1.0" dependencies = [ "align_ext", "aster-time", + "binary_serde", "bitvec", "component", "int-to-c-enum", @@ -1172,6 +1173,7 @@ version = "0.1.0" dependencies = [ "aster-block", "aster-logger", + "aster-time", "binary_serde", "component", "log", diff --git a/kernel/comps/block/Cargo.toml b/kernel/comps/block/Cargo.toml index a0357a340..98e5d90e2 100644 --- a/kernel/comps/block/Cargo.toml +++ b/kernel/comps/block/Cargo.toml @@ -12,6 +12,7 @@ align_ext = { path = "../../../ostd/libs/align_ext" } int-to-c-enum = { path = "../../libs/int-to-c-enum" } component = { path = "../../libs/comp-sys/component" } aster-time = { path = "../time" } +binary_serde = "1.0.25" log = "0.4" bitvec = { version = "1.0.1", default-features = false, features = ["alloc"] } diff --git a/kernel/comps/block/src/bio.rs b/kernel/comps/block/src/bio.rs index aa546049b..ed3466434 100644 --- a/kernel/comps/block/src/bio.rs +++ b/kernel/comps/block/src/bio.rs @@ -1,14 +1,15 @@ // SPDX-License-Identifier: MPL-2.0 use alloc::{boxed::Box, sync::Weak}; -use core::{fmt::Display, time::Duration}; +use binary_serde::BinarySerde; +use core::fmt::Display; use align_ext::AlignExt; use aster_time::read_monotonic_time; use bitvec::array::BitArray; use int_to_c_enum::TryFromInt; #[cfg(not(baseline_asterinas))] -use ostd::orpc::legacy_oqueue::{OQueueAttachError, Producer}; +use ostd::orpc::oqueue::{OQueueError, RefProducer}; use ostd::{ Error, mm::{ @@ -25,12 +26,15 @@ use crate::{BLOCK_SIZE, SECTOR_SIZE, prelude::*, request_queue::BioRequestSingle /// Trace data for block device I/O completion. /// /// This struct captures performance metrics when a block I/O request completes. -#[derive(Clone)] +#[derive(Clone, Copy, Default, BinarySerde)] +#[repr(C)] pub struct BlockDeviceCompletionStats { - /// The latency of the I/O request (time from submission to completion). - pub latency: Duration, + /// The latency of the I/O request in microseconds. 
+ pub latency_us: u64, /// The number of outstanding requests at completion time. - pub outstanding_requests: usize, + pub outstanding_requests: u64, + /// The index of the device that produced this stat. + pub device_index: u64, } /// The unit for block I/O. @@ -145,9 +149,11 @@ impl Bio { bio_inner: self.0.clone(), #[cfg(not(baseline_asterinas))] reply_handle: None, - submission_time: None, + submission_time_us: None, #[cfg(not(baseline_asterinas))] bio_request_single_queue: None, + #[cfg(not(baseline_asterinas))] + device_index: None, }) { // Fail to submit, revert the status. let result = self.0.status.compare_exchange( @@ -200,15 +206,15 @@ pub enum BioEnqueueError { Refused, /// Too big bio TooBig, - /// OQueue attachment failures + /// OQueue error #[cfg(not(baseline_asterinas))] - OQueueAttachError(OQueueAttachError), + OQueueError(OQueueError), } #[cfg(not(baseline_asterinas))] -impl From for BioEnqueueError { - fn from(err: OQueueAttachError) -> Self { - Self::OQueueAttachError(err) +impl From for BioEnqueueError { + fn from(err: OQueueError) -> Self { + Self::OQueueError(err) } } @@ -325,12 +331,15 @@ pub struct SubmittedBio { bio_inner: Arc, #[cfg(not(baseline_asterinas))] - reply_handle: Option>>, + reply_handle: Option>, - submission_time: Option, + submission_time_us: Option, #[cfg(not(baseline_asterinas))] bio_request_single_queue: Option>, + + #[cfg(not(baseline_asterinas))] + device_index: Option, } impl core::fmt::Debug for SubmittedBio { @@ -339,7 +348,7 @@ impl core::fmt::Debug for SubmittedBio { let d = d.field("bio_inner", &self.bio_inner); #[cfg(not(baseline_asterinas))] let d = d - .field("submission_time", &self.submission_time) + .field("submission_time_us", &self.submission_time_us) .field("bio_request_single_queue", &self.bio_request_single_queue) .field( "reply_handle", @@ -391,8 +400,8 @@ impl SubmittedBio { } } - pub fn submission_time(&self) -> Option { - self.submission_time + pub fn submission_time_us(&self) -> Option { + 
self.submission_time_us } #[cfg(not(baseline_asterinas))] @@ -406,12 +415,14 @@ impl SubmittedBio { #[cfg(not(baseline_asterinas))] pub fn prepare_enqueue( &mut self, - reply_handle: Box>, + reply_handle: RefProducer, bio_request_single_queue: Arc, + device_index: u64, ) { self.reply_handle = Some(reply_handle); self.bio_request_single_queue = Some(Arc::downgrade(&bio_request_single_queue)); - self.submission_time = Some(read_monotonic_time()); + self.submission_time_us = Some(read_monotonic_time().as_micros() as u64); + self.device_index = Some(device_index); } #[cfg(not(baseline_asterinas))] @@ -419,9 +430,11 @@ impl SubmittedBio { self.reply_handle .as_ref() .unwrap() - .produce(BlockDeviceCompletionStats { - latency: read_monotonic_time() - self.submission_time.unwrap(), - outstanding_requests: self.num_outstanding_requests().unwrap_or(0), + .try_produce_ref(&BlockDeviceCompletionStats { + latency_us: read_monotonic_time().as_micros() as u64 + - self.submission_time_us.unwrap(), + outstanding_requests: self.num_outstanding_requests().unwrap_or(0) as u64, + device_index: self.device_index.unwrap_or(u64::MAX), }); } } diff --git a/kernel/comps/mariposa_data_capture/Cargo.toml b/kernel/comps/mariposa_data_capture/Cargo.toml index 771e3152b..caa0a996f 100644 --- a/kernel/comps/mariposa_data_capture/Cargo.toml +++ b/kernel/comps/mariposa_data_capture/Cargo.toml @@ -9,6 +9,7 @@ edition = "2024" component = { path = "../../libs/comp-sys/component" } aster-logger = { path = "../logger" } aster-block = { path = "../block" } +aster-time = { path = "../time" } ostd = { path = "../../../ostd" } binary_serde = "1.0.25" log = "0.4" diff --git a/kernel/src/fs/mod.rs b/kernel/src/fs/mod.rs index 5fcd396eb..626ed373b 100644 --- a/kernel/src/fs/mod.rs +++ b/kernel/src/fs/mod.rs @@ -47,7 +47,13 @@ fn start_block_device(device_name: &str) -> Result> { virtio_block_device.handle_requests(); } }; - crate::ThreadOptions::new(task_fn).spawn(); + // Elevate to RealTime 50 so these I/O 
threads are not starved by other RealTime threads. + crate::ThreadOptions::new(task_fn) + .sched_policy(crate::sched::SchedPolicy::RealTime { + rt_prio: 50.try_into().unwrap(), + rt_policy: crate::sched::RealTimePolicy::RoundRobin { base_slice_factor: None }, + }) + .spawn(); Ok(device) } else { return_errno_with_message!(Errno::ENOENT, "Device does not exist") @@ -120,10 +126,13 @@ fn setup_raid1_device(raid_device_name: &str) -> Result<()> { let mut members = Vec::with_capacity(RAID_MEMBER_NAMES.len()); // Start the RAID-1's underlying member devices. - for &name in RAID_MEMBER_NAMES { + for (index, &name) in RAID_MEMBER_NAMES.iter().enumerate() { match start_block_device(name) { Ok(device) => { info!("[raid] member '{}' online", name); + if let Some(virtio_dev) = device.downcast_ref::() { + virtio_dev.set_device_index(index as u64); + } members.push(device); } Err(err) => { @@ -136,6 +145,9 @@ fn setup_raid1_device(raid_device_name: &str) -> Result<()> { } } + // #[cfg(not(baseline_asterinas))] + setup_data_capture(&members, RAID_MEMBER_NAMES); + #[cfg(not(baseline_asterinas))] info!("[raid] creating selection policy"); #[cfg(not(baseline_asterinas))] @@ -174,3 +186,112 @@ fn setup_raid1_device(raid_device_name: &str) -> Result<()> { ); Ok(()) } + +/// Set up data capture for the RAID-1 member devices' bio completion stats. +/// +/// This starts the capture block device and uses the legacy `DataCaptureDevice` / +/// `DataCaptureFile` server to observe each member's `bio_completion_oqueue` and write the +/// serialized data to disk. 
+#[cfg(not(baseline_asterinas))] +fn setup_data_capture( + members: &[Arc], + member_names: &[&str], +) { + use aster_block::{SECTOR_SIZE, bio::BlockDeviceCompletionStats}; + use aster_virtio::device::block::server_traits::BlockIOObservable as _; + use mariposa_data_capture::{ + DataCaptureDevice as _, DataCaptureDeviceServer, DataCaptureFile as _, FileDescriptor, + ObserverRegistration, + }; + use ostd::orpc::oqueue::{OQueueBase as _, ObservationQuery}; + + // Start the capture block device + // let capture_dev = match start_block_device("capture") { + // Ok(dev) => dev, + // Err(e) => { + // error!("[capture] failed to start capture device: {:?}", e); + // return; + // } + // }; + let device_name = "capture"; + let capture_dev = aster_block::get_device(device_name).unwrap_or_else(|| { + panic!("[capture] failed to get capture device '{}'", device_name); + }); + let cloned_device = capture_dev.clone(); + let task_fn = move || { + info!("[capture] spawn the virt-io-block thread for the capturing device"); + let virtio_block_device = cloned_device.downcast_ref::().unwrap(); + loop { + virtio_block_device.handle_requests(); + } + }; + crate::ThreadOptions::new(task_fn).sched_policy(crate::sched::SchedPolicy::RealTime { + rt_prio: 50.try_into().unwrap(), + rt_policy: crate::sched::RealTimePolicy::RoundRobin { base_slice_factor: None }, + }).spawn(); + + + // Display the capture device backend info + let capture_size = capture_dev.metadata().nr_sectors * SECTOR_SIZE; + info!( + "[capture] capture device online, size = {} bytes", + capture_size + ); + + // Create the data capture device and file + let capture_device = DataCaptureDeviceServer::new(capture_dev.clone()); + let capture_path = ostd::path!(data_capture.bio_completion); + let capture_file = match capture_device.new_file(FileDescriptor { length: 65536, path: capture_path.clone() }) { // 512MB * 1024 * 1024 / 2 / 4096 (using half of the space, and number of pages here) + Ok(builder) => builder.build::(), + 
Err(e) => { + error!("[capture] failed to create capture file: {:?}", e); + return; + } + }; + + // Attach a strong observer to each RAID member's bio_completion_oqueue + // and register it directly with the capture file. + for (member, &name) in members.iter().zip(member_names.iter()) { // (member, name) + let virtio_dev = member.downcast_ref::().unwrap(); + let oqueue = virtio_dev.bio_completion_oqueue(); + let observer_path = capture_path.append(&ostd::path!({name})); + match oqueue.attach_strong_observer(ObservationQuery::identity()) { + Ok(observer) => { + let registration = ObserverRegistration { path: observer_path, observer }; + if let Err(e) = capture_file.register_observer(registration) { + error!("[capture] failed to register observer for '{}': {:?}", name, e); + } else { + info!("[capture] attached observer to '{}'", name); + } + } + Err(e) => { + error!("[capture] failed to attach observer to '{}': {:?}", name, e); + } + } + } + + // Enable capturing + if let Err(e) = capture_file.start() { + error!("[capture] failed to enable capturing: {:?}", e); + } + + // Spawn a timer task that sends TimedFlush every 10 seconds to trigger + // a flush if data has been idle for that long. 
+ let capture_file_for_timer = capture_file.clone(); + crate::ThreadOptions::new(move || { + use core::time::Duration; + use ostd::timer::Jiffies; + loop { + let target = Jiffies::elapsed().as_duration() + Duration::from_secs(5); + while Jiffies::elapsed().as_duration() < target { + ostd::task::Task::yield_now(); + } + if let Err(e) = capture_file_for_timer.timed_flush() { + log::error!("[capture] timed_flush failed: {:?}", e); + } + } + }) + .spawn(); + + info!("[capture] data capture enabled for bio completion stats"); +} diff --git a/tools/qemu_args.sh b/tools/qemu_args.sh index 88e9d9100..b654a109f 100755 --- a/tools/qemu_args.sh +++ b/tools/qemu_args.sh @@ -92,6 +92,7 @@ COMMON_QEMU_ARGS="\ -drive if=none,format=raw,id=r0,file=/dev/nvme0n1p1,cache=directsync \ -drive if=none,format=raw,id=r1,file=/dev/nvme1n1p1,cache=directsync \ -drive if=none,format=raw,id=r2,file=/dev/nvme2n1p1,cache=directsync \ + -drive if=none,format=raw,id=cap0,file=./dataset/capture.raw,cache=writeback \ " if [ "$1" = "iommu" ]; then @@ -115,6 +116,7 @@ QEMU_ARGS="\ -device virtio-blk-pci,bus=pcie.0,addr=0x8,drive=r0,serial=raid0,disable-legacy=on,disable-modern=off,queue-size=64,num-queues=1,request-merging=off,backend_defaults=off,discard=off,write-zeroes=off,event_idx=off,indirect_desc=off,queue_reset=off$IOMMU_DEV_EXTRA \ -device virtio-blk-pci,bus=pcie.0,addr=0x9,drive=r1,serial=raid1,disable-legacy=on,disable-modern=off,queue-size=64,num-queues=1,request-merging=off,backend_defaults=off,discard=off,write-zeroes=off,event_idx=off,indirect_desc=off,queue_reset=off$IOMMU_DEV_EXTRA \ -device virtio-blk-pci,bus=pcie.0,addr=0xa,drive=r2,serial=raid2,disable-legacy=on,disable-modern=off,queue-size=64,num-queues=1,request-merging=off,backend_defaults=off,discard=off,write-zeroes=off,event_idx=off,indirect_desc=off,queue_reset=off$IOMMU_DEV_EXTRA \ + -device 
virtio-blk-pci,bus=pcie.0,addr=0xb,drive=cap0,serial=capture,disable-legacy=on,disable-modern=off,queue-size=64,num-queues=1,request-merging=off,backend_defaults=off,discard=off,write-zeroes=off,event_idx=off,indirect_desc=off,queue_reset=off$IOMMU_DEV_EXTRA \ -device virtio-net-pci,netdev=net01,disable-legacy=on,disable-modern=off$VIRTIO_NET_FEATURES$IOMMU_DEV_EXTRA \ -device virtio-serial-pci,disable-legacy=on,disable-modern=off$IOMMU_DEV_EXTRA \ -device virtconsole,chardev=mux \ From 84607a77af6f3bf25c8ea9394c70f1b59e178742 Mon Sep 17 00:00:00 2001 From: Yingqi Cao Date: Tue, 31 Mar 2026 03:47:07 +0000 Subject: [PATCH 06/22] Fix the data-capture hang by disabling IRQs during OQueue strong observation, and by converting on-the-fly data writeout into a flush-all on inactivity. --- .../src/data_buffering.rs | 16 ++++- .../src/data_capture_file.rs | 60 ++++++++++++++++++- ostd/src/orpc/oqueue/implementation.rs | 4 +- 3 files changed, 73 insertions(+), 7 deletions(-) diff --git a/kernel/comps/mariposa_data_capture/src/data_buffering.rs b/kernel/comps/mariposa_data_capture/src/data_buffering.rs index 3e26b8a69..d3651883c 100644 --- a/kernel/comps/mariposa_data_capture/src/data_buffering.rs +++ b/kernel/comps/mariposa_data_capture/src/data_buffering.rs @@ -12,8 +12,8 @@ use binary_serde::{BinarySerde, Endianness}; use ostd::orpc::path::Path; /// A buffer for managing data which will be written bit by bit, but the extracted in larger blocks. -struct DataBuf { - data: Vec, +pub(crate) struct DataBuf { + pub data: Vec, } impl DataBuf { @@ -63,7 +63,7 @@ impl DataBuf { /// Handles buffering and flushing data to a block device. 
pub(crate) struct ChunkingWriteWrapper { - data_buf: DataBuf, + pub data_buf: DataBuf, pub(crate) block_device: Arc, pub(crate) current_bid: Bid, } @@ -115,9 +115,19 @@ impl ChunkingWriteWrapper { let _ = self .block_device .write_blocks_async(self.current_bid, bio_segment)?; + waiter.wait(); Ok(n_written) } + /// Flushes all complete blocks from the buffer to storage. + /// Stops once BLOCK_SIZE or fewer bytes remain, to avoid writing a partial block. + pub fn flush_all(&mut self) -> Result<(), Box> { + while self.data_buf.len() > BLOCK_SIZE { + self.flush_if_needed()?; + } + Ok(()) + } + pub fn sync(&mut self) -> Result<(), Box> { self.block_device.sync()?; Ok(()) diff --git a/kernel/comps/mariposa_data_capture/src/data_capture_file.rs b/kernel/comps/mariposa_data_capture/src/data_capture_file.rs index 61c653882..1a0e03f83 100644 --- a/kernel/comps/mariposa_data_capture/src/data_capture_file.rs +++ b/kernel/comps/mariposa_data_capture/src/data_capture_file.rs @@ -21,6 +21,8 @@ use alloc::{boxed::Box, sync::Arc, vec::Vec}; use core::{any::Any, error::Error, sync::atomic::AtomicBool}; +use aster_time::read_monotonic_time; + use aster_block::{BLOCK_SIZE, BlockDevice, id::Bid}; use binary_serde::BinarySerde; use ostd::{ @@ -63,6 +65,10 @@ pub trait DataCaptureFile: Any { fn register_observer(&self, attachment: ObserverRegistration) -> Result<(), RPCError>; /// Flush any data remaining in the output buffers to disk. fn flush(&self) -> Result<(), RPCError>; + /// Flush All data in the output buffer to disk. + fn flush_all(&self) -> Result<(), RPCError>; + /// Flush if data has been observed but not flushed for at least 10 seconds. + fn timed_flush(&self) -> Result<(), RPCError>; /// Sync writes to disk. fn sync(&self) -> Result<(), RPCError>; /// Enable capturing to this file. 
@@ -75,6 +81,9 @@ pub trait DataCaptureFile: Any { enum DataCaptureFileCommand { RegisterObserver(ObserverRegistration), Flush, + FlushAll, + /// Flush only if data has been observed but not yet flushed for at least 10 seconds. + TimedFlush, Sync, Stop, } @@ -84,6 +93,8 @@ impl core::fmt::Debug for DataCaptureFil match self { Self::RegisterObserver(arg0) => f.debug_tuple("AttachOqueue").field(arg0).finish(), Self::Flush => write!(f, "Flush"), + Self::FlushAll => write!(f, "FlushAll"), + Self::TimedFlush => write!(f, "TimedFlush"), Self::Sync => write!(f, "Sync"), Self::Stop => write!(f, "Stop"), } @@ -109,12 +120,15 @@ pub struct DataCaptureFileServerThread { impl DataCaptureFileServerThread { fn run(&self) -> Result<(), Box> { let mut data_buf_handler = - ChunkingWriteWrapper::new(BLOCK_SIZE * 2, self.block_device.clone(), self.start_bid); + ChunkingWriteWrapper::new(BLOCK_SIZE * 65536, self.block_device.clone(), self.start_bid); let mut observers: Vec> = Default::default(); // The paths of the attached OQueues. Once the header is written this is set to None and // paths are no longer collected even if more OQueues are attached. let mut paths = Some(Vec::default()); let mut block_handler = BlockOnMany::new(); + // Tracks whether unflushed data exists and when the most recent value was observed. 
+ let mut need_flush = false; + let mut latest_data_observed_us: Option = None; loop { let blockers = [(&self.command_consumer) as &dyn Blocker] @@ -140,6 +154,22 @@ impl DataCaptureFileServerThread { DataCaptureFileCommand::Sync => { data_buf_handler.sync()?; } + DataCaptureFileCommand::FlushAll => { + data_buf_handler.flush_all()?; + } + DataCaptureFileCommand::TimedFlush => { + if need_flush { + if let Some(last_us) = latest_data_observed_us { + let now_us = read_monotonic_time().as_micros() as u64; + if now_us.saturating_sub(last_us) > 5000000 { + log::info!("[capture] Timed flush triggered after {} seconds of inactivity", (now_us - last_us) as f64 / 1_000_000.0); + data_buf_handler.flush_all()?; + need_flush = false; + log::info!("[capture] Timed flush completed"); + } + } + } + } DataCaptureFileCommand::Stop => { self.server .stopped @@ -157,7 +187,14 @@ impl DataCaptureFileServerThread { for o in &observers { // We can't skip the try_strong_observe calls when not `capturing` because that // would leave the values in the OQueues and block them. - while let Ok(Some(v)) = o.try_strong_observe() { + let mut drain_count = 0usize; + while let Ok(Some(v)) = { + // Disable IRQs while holding the OQueue's SpinLock inside + // try_strong_observe to prevent deadlock with the IRQ handler + // that produces to the same OQueue (bio completion stats). 
+ let _irq_guard = ostd::trap::irq::disable_local(); + o.try_strong_observe() + } { if started { if paths.is_some() { data_buf_handler.write_header::(paths.as_ref().unwrap())?; @@ -165,7 +202,14 @@ impl DataCaptureFileServerThread { } data_buf_handler.write_value(&v); - data_buf_handler.flush_if_needed()?; + latest_data_observed_us = Some(read_monotonic_time().as_micros() as u64); + need_flush = true; + // data_buf_handler.flush_if_needed()?; + if data_buf_handler.data_buf.len() % (32 * 1024) == 0 { // 32 * 1024 + log::info!("Captured Data from OQueue to Capture Buffer, size of buffer: {}, capacity: {}", + data_buf_handler.data_buf.len(), + data_buf_handler.data_buf.data.capacity()); + } if data_buf_handler.current_bid == self.end_bid { log::warn!("Data capture ran out of space."); } @@ -189,6 +233,16 @@ impl DataCaptureFile for DataCaptureFileServer< Ok(()) } + fn flush_all(&self) -> Result<(), RPCError> { + self.command_producer.produce(DataCaptureFileCommand::FlushAll); + Ok(()) + } + + fn timed_flush(&self) -> Result<(), RPCError> { + self.command_producer.produce(DataCaptureFileCommand::TimedFlush); + Ok(()) + } + fn sync(&self) -> Result<(), RPCError> { self.command_producer.produce(DataCaptureFileCommand::Sync); Ok(()) diff --git a/ostd/src/orpc/oqueue/implementation.rs b/ostd/src/orpc/oqueue/implementation.rs index eeffe9acf..c59f84178 100644 --- a/ostd/src/orpc/oqueue/implementation.rs +++ b/ostd/src/orpc/oqueue/implementation.rs @@ -774,7 +774,9 @@ impl UntypedOQueueImplementation for OQueueImplementation bool { - let mut inner = self.inner.lock(); + // Disable IRQs before acquiring the lock to prevent deadlock with IRQ handlers + // (e.g. handle_irq → try_produce_ref) that acquire the same lock on the same CPU. + let mut inner = self.inner.disable_irq().lock(); let ObservationRingBuffer { ring_buffer, .. 
} = inner .observer_ring_buffers .get_mut(observer_id) From 01c296fe21aa1cb33ea3fcacac2c376f14bfada9 Mon Sep 17 00:00:00 2001 From: Yingqi Cao Date: Wed, 1 Apr 2026 03:47:10 +0000 Subject: [PATCH 07/22] Change synchronous flushing to asynchronous flushing --- kernel/comps/mariposa_data_capture/src/data_buffering.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/comps/mariposa_data_capture/src/data_buffering.rs b/kernel/comps/mariposa_data_capture/src/data_buffering.rs index d3651883c..ee6e74229 100644 --- a/kernel/comps/mariposa_data_capture/src/data_buffering.rs +++ b/kernel/comps/mariposa_data_capture/src/data_buffering.rs @@ -112,7 +112,7 @@ impl ChunkingWriteWrapper { let raw_data = self.data_buf.written_data(); let bio_segment = BioSegment::alloc(1, BioDirection::ToDevice); let n_written = bio_segment.writer()?.write(&mut raw_data.into()); - let _ = self + let waiter = self .block_device .write_blocks_async(self.current_bid, bio_segment)?; waiter.wait(); From 4b947631ef6e76c49a9a2b8f92e066c5a584e5a9 Mon Sep 17 00:00:00 2001 From: Yingqi Cao Date: Wed, 1 Apr 2026 04:43:54 +0000 Subject: [PATCH 08/22] Update Selector Policy to use new OQueue API --- kernel/comps/raid/src/selection_policies.rs | 22 ++++++++++++++------- ostd/src/orpc/oqueue/implementation.rs | 4 +--- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/kernel/comps/raid/src/selection_policies.rs b/kernel/comps/raid/src/selection_policies.rs index 3344fc8b0..1964e06aa 100644 --- a/kernel/comps/raid/src/selection_policies.rs +++ b/kernel/comps/raid/src/selection_policies.rs @@ -2,7 +2,7 @@ #![cfg(not(baseline_asterinas))] -use alloc::{boxed::Box, sync::Arc, vec::Vec}; +use alloc::{sync::Arc, vec::Vec}; use core::sync::atomic::{AtomicUsize, Ordering}; use aster_block::{ @@ -11,7 +11,10 @@ use aster_block::{ }; use ostd::{ Error, - orpc::{legacy_oqueue::WeakObserver, orpc_server}, + orpc::{ + oqueue::{OQueueBase, ObservationQuery}, + orpc_server, + }, sync::Mutex, 
}; @@ -85,7 +88,7 @@ impl SelectionPolicy for RoundRobinPolicy { pub struct LinnOSPolicy { read_cursor: AtomicUsize, members: Vec>, - observers: Vec>>>, + observers: Vec>>, hidden_layers: Vec<[[f32; 256]; 31]>, output_layers: Vec<[[f32; 2]; 256]>, } @@ -123,7 +126,7 @@ impl LinnOSPolicy { Mutex::new( device .bio_completion_oqueue() - .attach_weak_observer() + .attach_weak_observer(4, ObservationQuery::identity()) .expect("Failed to attach weak observer to bio_completion_oqueue"), ) }) @@ -151,7 +154,9 @@ impl SelectionPolicy for LinnOSPolicy { let idx = self.read_cursor.fetch_add(1, Ordering::Relaxed); let device_idx = idx % num_devices; let observer = self.observers[device_idx].lock(); - let completion_trace = observer.weak_observe_recent(4); // observe 4 steps in the history + let completion_trace = observer + .weak_observe_recent(4) + .expect("Failed to observe completion trace"); // Build the 31-element input feature vector: // [0..3]: current outstanding requests (3 digits, from most recent trace) @@ -169,8 +174,11 @@ impl SelectionPolicy for LinnOSPolicy { // Feature Engineering in LinnOS: Decompose numbers into digits. 
// Historical features: 4 steps, each with 3 digits outstanding + 4 digits latency for (i, trace_entry) in completion_trace.iter().enumerate().take(4) { - let outstanding = trace_entry.outstanding_requests; - let latency_us = trace_entry.latency.as_micros() as usize; + let Some(trace_entry) = trace_entry else { + continue; + }; + let outstanding = trace_entry.outstanding_requests as usize; + let latency_us = trace_entry.latency_us as usize; let base = 3 + i * 7; // Outstanding requests -> 3 digits (hundreds, tens, ones) diff --git a/ostd/src/orpc/oqueue/implementation.rs b/ostd/src/orpc/oqueue/implementation.rs index c59f84178..eeffe9acf 100644 --- a/ostd/src/orpc/oqueue/implementation.rs +++ b/ostd/src/orpc/oqueue/implementation.rs @@ -774,9 +774,7 @@ impl UntypedOQueueImplementation for OQueueImplementation bool { - // Disable IRQs before acquiring the lock to prevent deadlock with IRQ handlers - // (e.g. handle_irq → try_produce_ref) that acquire the same lock on the same CPU. - let mut inner = self.inner.disable_irq().lock(); + let mut inner = self.inner.lock(); let ObservationRingBuffer { ring_buffer, .. } = inner .observer_ring_buffers .get_mut(observer_id) From c53cd49ff14a623fd66661743506045af4ba11b5 Mon Sep 17 00:00:00 2001 From: Yingqi Cao Date: Thu, 2 Apr 2026 04:44:10 +0000 Subject: [PATCH 09/22] Change the number of outstanding requests from the num requests in the BioRequestSingleQueue to a manually tracked atomic number. Update LinnOS Policy. 
--- kernel/comps/block/src/bio.rs | 39 +++--- .../src/data_capture_file.rs | 24 +--- kernel/comps/raid/src/generate_weights.py | 125 ++++++++++++++++++ kernel/comps/raid/src/linnos_weights.rs.j2 | 39 ++++-- kernel/comps/raid/src/selection_policies.rs | 72 +++++----- .../comps/virtio/src/device/block/device.rs | 15 ++- kernel/src/fs/mod.rs | 74 ++++++++--- kernel/src/lib.rs | 4 + 8 files changed, 285 insertions(+), 107 deletions(-) create mode 100644 kernel/comps/raid/src/generate_weights.py diff --git a/kernel/comps/block/src/bio.rs b/kernel/comps/block/src/bio.rs index ed3466434..54722b244 100644 --- a/kernel/comps/block/src/bio.rs +++ b/kernel/comps/block/src/bio.rs @@ -1,6 +1,6 @@ // SPDX-License-Identifier: MPL-2.0 -use alloc::{boxed::Box, sync::Weak}; +use alloc::{boxed::Box}; use binary_serde::BinarySerde; use core::fmt::Display; @@ -31,8 +31,8 @@ use crate::{BLOCK_SIZE, SECTOR_SIZE, prelude::*, request_queue::BioRequestSingle pub struct BlockDeviceCompletionStats { /// The latency of the I/O request in microseconds. pub latency_us: u64, - /// The number of outstanding requests at completion time. - pub outstanding_requests: u64, + /// The number of outstanding 4KB pages at completion time. + pub outstanding_pages: u64, /// The index of the device that produced this stat. pub device_index: u64, } @@ -149,11 +149,12 @@ impl Bio { bio_inner: self.0.clone(), #[cfg(not(baseline_asterinas))] reply_handle: None, - submission_time_us: None, #[cfg(not(baseline_asterinas))] - bio_request_single_queue: None, + submission_time_us: None, #[cfg(not(baseline_asterinas))] device_index: None, + #[cfg(not(baseline_asterinas))] + num_pages: None, }) { // Fail to submit, revert the status. 
let result = self.0.status.compare_exchange( @@ -333,13 +334,14 @@ pub struct SubmittedBio { #[cfg(not(baseline_asterinas))] reply_handle: Option>, + #[cfg(not(baseline_asterinas))] submission_time_us: Option, #[cfg(not(baseline_asterinas))] - bio_request_single_queue: Option>, + device_index: Option, #[cfg(not(baseline_asterinas))] - device_index: Option, + num_pages: Option, } impl core::fmt::Debug for SubmittedBio { @@ -349,7 +351,7 @@ impl core::fmt::Debug for SubmittedBio { #[cfg(not(baseline_asterinas))] let d = d .field("submission_time_us", &self.submission_time_us) - .field("bio_request_single_queue", &self.bio_request_single_queue) + .field("device_index", &self.device_index) .field( "reply_handle", &self.reply_handle.as_ref().map(|_| ""), @@ -369,6 +371,12 @@ impl SubmittedBio { self.bio_inner.sid_range() } + /// Returns the number of 4KB pages covered by this bio's sector range. + pub fn num_pages(&self) -> u64 { + let sectors = self.bio_inner.sid_range().end.to_raw() - self.bio_inner.sid_range().start.to_raw(); + (sectors + 7) / 8 + } + /// Returns the slice to the memory segments. 
pub fn segments(&self) -> &[BioSegment] { self.bio_inner.segments() @@ -404,36 +412,27 @@ impl SubmittedBio { self.submission_time_us } - #[cfg(not(baseline_asterinas))] - pub fn num_outstanding_requests(&self) -> Option { - self.bio_request_single_queue - .as_ref() - .and_then(|w| w.upgrade()) - .map(|q| q.num_requests()) - } - #[cfg(not(baseline_asterinas))] pub fn prepare_enqueue( &mut self, reply_handle: RefProducer, - bio_request_single_queue: Arc, device_index: u64, ) { self.reply_handle = Some(reply_handle); - self.bio_request_single_queue = Some(Arc::downgrade(&bio_request_single_queue)); self.submission_time_us = Some(read_monotonic_time().as_micros() as u64); self.device_index = Some(device_index); + self.num_pages = Some(self.num_pages()); } #[cfg(not(baseline_asterinas))] - pub fn report_statistics(&self) { + pub fn report_statistics(&self, outstanding_pages: u64) { self.reply_handle .as_ref() .unwrap() .try_produce_ref(&BlockDeviceCompletionStats { latency_us: read_monotonic_time().as_micros() as u64 - self.submission_time_us.unwrap(), - outstanding_requests: self.num_outstanding_requests().unwrap_or(0) as u64, + outstanding_pages, device_index: self.device_index.unwrap_or(u64::MAX), }); } diff --git a/kernel/comps/mariposa_data_capture/src/data_capture_file.rs b/kernel/comps/mariposa_data_capture/src/data_capture_file.rs index 1a0e03f83..e26deec05 100644 --- a/kernel/comps/mariposa_data_capture/src/data_capture_file.rs +++ b/kernel/comps/mariposa_data_capture/src/data_capture_file.rs @@ -67,8 +67,6 @@ pub trait DataCaptureFile: Any { fn flush(&self) -> Result<(), RPCError>; /// Flush All data in the output buffer to disk. fn flush_all(&self) -> Result<(), RPCError>; - /// Flush if data has been observed but not flushed for at least 10 seconds. - fn timed_flush(&self) -> Result<(), RPCError>; /// Sync writes to disk. fn sync(&self) -> Result<(), RPCError>; /// Enable capturing to this file. 
@@ -82,8 +80,6 @@ enum DataCaptureFileCommand { RegisterObserver(ObserverRegistration), Flush, FlushAll, - /// Flush only if data has been observed but not yet flushed for at least 10 seconds. - TimedFlush, Sync, Stop, } @@ -94,7 +90,6 @@ impl core::fmt::Debug for DataCaptureFil Self::RegisterObserver(arg0) => f.debug_tuple("AttachOqueue").field(arg0).finish(), Self::Flush => write!(f, "Flush"), Self::FlushAll => write!(f, "FlushAll"), - Self::TimedFlush => write!(f, "TimedFlush"), Self::Sync => write!(f, "Sync"), Self::Stop => write!(f, "Stop"), } @@ -156,19 +151,7 @@ impl DataCaptureFileServerThread { } DataCaptureFileCommand::FlushAll => { data_buf_handler.flush_all()?; - } - DataCaptureFileCommand::TimedFlush => { - if need_flush { - if let Some(last_us) = latest_data_observed_us { - let now_us = read_monotonic_time().as_micros() as u64; - if now_us.saturating_sub(last_us) > 5000000 { - log::info!("[capture] Timed flush triggered after {} seconds of inactivity", (now_us - last_us) as f64 / 1_000_000.0); - data_buf_handler.flush_all()?; - need_flush = false; - log::info!("[capture] Timed flush completed"); - } - } - } + log::info!("[capture internal] Flush all completed"); } DataCaptureFileCommand::Stop => { self.server @@ -238,11 +221,6 @@ impl DataCaptureFile for DataCaptureFileServer< Ok(()) } - fn timed_flush(&self) -> Result<(), RPCError> { - self.command_producer.produce(DataCaptureFileCommand::TimedFlush); - Ok(()) - } - fn sync(&self) -> Result<(), RPCError> { self.command_producer.produce(DataCaptureFileCommand::Sync); Ok(()) diff --git a/kernel/comps/raid/src/generate_weights.py b/kernel/comps/raid/src/generate_weights.py new file mode 100644 index 000000000..b633b26d4 --- /dev/null +++ b/kernel/comps/raid/src/generate_weights.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: MPL-2.0 + +""" +Load trained PyTorch LinnOS models and generate the Rust weights file +using the Jinja2 template. 
+ +Usage: + python generate_weights.py \ + --models models/model_device0_lr0.001_bs32768_ep20.pt \ + models/model_device1_lr0.001_bs32768_ep20.pt \ + models/model_device2_lr0.001_bs32768_ep20.pt \ + --template kernel/comps/raid/src/linnos_weights.rs.j2 \ + --output kernel/comps/raid/src/linnos_weights.rs + +Run from the repository root. +""" + +import argparse +from pathlib import Path + +import torch +from jinja2 import Environment, FileSystemLoader + + +def load_model(path: str) -> dict: + """Load a model checkpoint and return its state dict.""" + state = torch.load(path, map_location="cpu", weights_only=False) + return state + + +def print_architecture(state: dict, device_idx: int) -> None: + """Print model architecture for sanity check.""" + print(f" Device {device_idx}:") + for name, tensor in state.items(): + print(f" {name:20s} shape={str(list(tensor.shape)):16s} dtype={tensor.dtype}") + + +def tensor_to_list(tensor: torch.Tensor) -> list: + """Convert a tensor to a nested Python list of floats.""" + return tensor.tolist() + + +def main(): + parser = argparse.ArgumentParser(description="Generate LinnOS Rust weight file from PyTorch models") + parser.add_argument( + "--models", nargs="+", required=True, + help="Paths to .pt model files, one per device in order", + ) + parser.add_argument( + "--template", required=True, + help="Path to the Jinja2 template (.rs.j2)", + ) + parser.add_argument( + "--output", required=True, + help="Path for the generated Rust file (.rs)", + ) + args = parser.parse_args() + + # Load all models + models = [] + for path in args.models: + models.append(load_model(path)) + + num_devices = len(models) + + # Sanity check: print architecture + print(f"Loaded {num_devices} model(s).\n") + print("Model architecture:") + for i, state in enumerate(models): + print_architecture(state, i) + print() + + # Extract dimensions from the first model + hidden_weight_shape = models[0]["net.0.weight"].shape # [hidden_size, 31] + hidden_size = 
hidden_weight_shape[0] + input_size = hidden_weight_shape[1] + output_size = models[0]["net.2.weight"].shape[0] # 2 + + print(f"Network: {input_size} -> {hidden_size} (ReLU) -> {output_size}") + print() + + # Extract weights and biases for each device + # Hidden weights: net.0.weight has shape [hidden_size, input_size]. + # In the Rust code, we index as hidden_weights[input][hidden], + # so we need to transpose: [input_size, hidden_size] = [31][hidden_size]. + hidden_weights = [] + hidden_biases = [] + output_weights = [] + output_biases = [] + + for i, state in enumerate(models): + # Transpose: [hidden_size, 31] -> [31, hidden_size] + hw = state["net.0.weight"].T # [31, hidden_size] + hidden_weights.append(tensor_to_list(hw)) + hidden_biases.append(tensor_to_list(state["net.0.bias"])) + + # Transpose: [2, hidden_size] -> [hidden_size, 2] + ow = state["net.2.weight"].T # [hidden_size, 2] + output_weights.append(tensor_to_list(ow)) + output_biases.append(tensor_to_list(state["net.2.bias"])) + + # Render template + template_path = Path(args.template) + env = Environment( + loader=FileSystemLoader(str(template_path.parent)), + keep_trailing_newline=True, + ) + template = env.get_template(template_path.name) + + rendered = template.render( + num_devices=num_devices, + hidden_size=hidden_size, + hidden_weights=hidden_weights, + hidden_biases=hidden_biases, + output_weights=output_weights, + output_biases=output_biases, + ) + + Path(args.output).write_text(rendered) + print(f"Generated {args.output} ({len(rendered)} bytes)") + + +if __name__ == "__main__": + main() diff --git a/kernel/comps/raid/src/linnos_weights.rs.j2 b/kernel/comps/raid/src/linnos_weights.rs.j2 index 3de554c10..7ed97f331 100644 --- a/kernel/comps/raid/src/linnos_weights.rs.j2 +++ b/kernel/comps/raid/src/linnos_weights.rs.j2 @@ -2,8 +2,8 @@ // LinnOS neural network weights hardcoded for {{ num_devices }} devices. 
// Each device has: -// - hidden layer: 31 x 256 matrix -// - output layer: 256 x 2 matrix +// - hidden layer: 31 x {{ hidden_size }} matrix + {{ hidden_size }} bias +// - output layer: {{ hidden_size }} x 2 matrix + 2 bias // // AUTO-GENERATED by generate_weights.py using Jinja2. // Do not edit this file manually. @@ -11,34 +11,57 @@ /// Number of devices with hardcoded weights. pub const NUM_DEVICES: usize = {{ num_devices }}; +/// Hidden layer size (number of neurons). +pub const HIDDEN_SIZE: usize = {{ hidden_size }}; + {% for dev in range(num_devices) %} -/// Hidden layer weights for device {{ dev }}: 31 inputs -> 256 neurons -pub static HIDDEN_WEIGHTS_{{ dev }}: [[f32; 256]; 31] = [ +/// Hidden layer weights for device {{ dev }}: 31 inputs -> {{ hidden_size }} neurons +pub static HIDDEN_WEIGHTS_{{ dev }}: [[f32; {{ hidden_size }}]; 31] = [ {% for row in hidden_weights[dev] %} [{{ row | join(', ') }}], {% endfor %} ]; +/// Hidden layer bias for device {{ dev }} +pub static HIDDEN_BIAS_{{ dev }}: [f32; {{ hidden_size }}] = [{{ hidden_biases[dev] | join(', ') }}]; + {% endfor %} {% for dev in range(num_devices) %} -/// Output layer weights for device {{ dev }}: 256 neurons -> 2 classes -pub static OUTPUT_WEIGHTS_{{ dev }}: [[f32; 2]; 256] = [ +/// Output layer weights for device {{ dev }}: {{ hidden_size }} neurons -> 2 classes +pub static OUTPUT_WEIGHTS_{{ dev }}: [[f32; 2]; {{ hidden_size }}] = [ {% for row in output_weights[dev] %} [{{ row | join(', ') }}], {% endfor %} ]; +/// Output layer bias for device {{ dev }} +pub static OUTPUT_BIAS_{{ dev }}: [f32; 2] = [{{ output_biases[dev] | join(', ') }}]; + {% endfor %} /// All hidden layer weights indexed by device. -pub static HIDDEN_WEIGHTS: [&[[f32; 256]; 31]; NUM_DEVICES] = [ +pub static HIDDEN_WEIGHTS: [&[[f32; {{ hidden_size }}]; 31]; NUM_DEVICES] = [ {% for dev in range(num_devices) %} &HIDDEN_WEIGHTS_{{ dev }}, {% endfor %} ]; +/// All hidden layer biases indexed by device. 
+pub static HIDDEN_BIASES: [&[f32; {{ hidden_size }}]; NUM_DEVICES] = [ +{% for dev in range(num_devices) %} + &HIDDEN_BIAS_{{ dev }}, +{% endfor %} +]; + /// All output layer weights indexed by device. -pub static OUTPUT_WEIGHTS: [&[[f32; 2]; 256]; NUM_DEVICES] = [ +pub static OUTPUT_WEIGHTS: [&[[f32; 2]; {{ hidden_size }}]; NUM_DEVICES] = [ {% for dev in range(num_devices) %} &OUTPUT_WEIGHTS_{{ dev }}, {% endfor %} ]; + +/// All output layer biases indexed by device. +pub static OUTPUT_BIASES: [&[f32; 2]; NUM_DEVICES] = [ +{% for dev in range(num_devices) %} + &OUTPUT_BIAS_{{ dev }}, +{% endfor %} +]; diff --git a/kernel/comps/raid/src/selection_policies.rs b/kernel/comps/raid/src/selection_policies.rs index 1964e06aa..277806396 100644 --- a/kernel/comps/raid/src/selection_policies.rs +++ b/kernel/comps/raid/src/selection_policies.rs @@ -11,14 +11,11 @@ use aster_block::{ }; use ostd::{ Error, - orpc::{ - oqueue::{OQueueBase, ObservationQuery}, - orpc_server, - }, + orpc::orpc_server, sync::Mutex, }; -use crate::server_traits::{ObservableBlockDevice, SelectionPolicy}; +use crate::server_traits::SelectionPolicy; #[derive(Debug)] #[orpc_server] @@ -73,11 +70,12 @@ impl SelectionPolicy for RoundRobinPolicy { } } -/// hidden_layers and output_layers: machine learning model weights. +/// hidden_layers, hidden_biases, output_layers, output_biases: machine learning model weights. /// There is one model per device. Each model contains three layers, an input layer, /// a hidden layer with 256 neurons, and an output layer with 2 neurons for the binary -/// classification (fast/slow). Thus, there are two matrices per device, a 31*256 matrix -/// for the hidden layer and a 256*2 matrix for the output layer. +/// classification (fast/slow). Thus, there are two weight matrices and two bias vectors +/// per device: a 31x256 matrix + 256 bias for the hidden layer and a 256x2 matrix + 2 +/// bias for the output layer. 
/// Each latency number is decomposed into 4 digits, and each number of outstanding /// request number is decomposed into 3 digits. Thus, the total number of input features /// is 3+4*(3+4) = 31. The number of history is R=4. @@ -87,10 +85,12 @@ impl SelectionPolicy for RoundRobinPolicy { #[orpc_server] pub struct LinnOSPolicy { read_cursor: AtomicUsize, - members: Vec>, + members: Vec>, observers: Vec>>, hidden_layers: Vec<[[f32; 256]; 31]>, + hidden_biases: Vec<[f32; 256]>, output_layers: Vec<[[f32; 2]; 256]>, + output_biases: Vec<[f32; 2]>, } impl core::fmt::Debug for LinnOSPolicy { @@ -107,30 +107,23 @@ impl core::fmt::Debug for LinnOSPolicy { } impl LinnOSPolicy { - pub fn new(members: Vec>) -> Result, Error> { - use crate::linnos_weights::{HIDDEN_WEIGHTS, OUTPUT_WEIGHTS}; + pub fn new( + members: Vec>, + observers: Vec>>, + ) -> Result, Error> { + use crate::linnos_weights::{HIDDEN_BIASES, HIDDEN_WEIGHTS, OUTPUT_BIASES, OUTPUT_WEIGHTS}; let num_devices = members.len(); // Copy hardcoded weights into Vecs, one entry per device let hidden_layers: Vec<[[f32; 256]; 31]> = (0..num_devices).map(|i| *HIDDEN_WEIGHTS[i]).collect(); + let hidden_biases: Vec<[f32; 256]> = + (0..num_devices).map(|i| *HIDDEN_BIASES[i]).collect(); let output_layers: Vec<[[f32; 2]; 256]> = (0..num_devices).map(|i| *OUTPUT_WEIGHTS[i]).collect(); - - // Attach one weak observer per device, each peeking 4 steps in the history. - // Wrapped in Mutex because WeakObserver is Send but not Sync. 
- let observers: Vec<_> = members - .iter() - .map(|device| { - Mutex::new( - device - .bio_completion_oqueue() - .attach_weak_observer(4, ObservationQuery::identity()) - .expect("Failed to attach weak observer to bio_completion_oqueue"), - ) - }) - .collect(); + let output_biases: Vec<[f32; 2]> = + (0..num_devices).map(|i| *OUTPUT_BIASES[i]).collect(); let server = Self::new_with(|orpc_internal, _| Self { orpc_internal, @@ -138,7 +131,9 @@ impl LinnOSPolicy { members, observers, hidden_layers, + hidden_biases, output_layers, + output_biases, }); Ok(server) @@ -159,14 +154,19 @@ impl SelectionPolicy for LinnOSPolicy { .expect("Failed to observe completion trace"); // Build the 31-element input feature vector: - // [0..3]: current outstanding requests (3 digits, from most recent trace) + // [0..3]: current outstanding pages (3 digits, from most recent trace) // For each history step i (0..4): - // [3+i*7 .. 3+i*7+3]: outstanding requests (3 digits) + // [3+i*7 .. 3+i*7+3]: outstanding pages (3 digits) // [3+i*7+3 .. 
3+i*7+7]: latency in microseconds (4 digits) let mut input = [0.0f32; 31]; - // Current outstanding requests: use most recent trace entry, decompose into 3 digits - let current_outstanding = submitted.num_outstanding_requests().unwrap_or(0); + // Current outstanding pages: use most recent trace entry, decompose into 3 digits + let current_outstanding = completion_trace + .iter() + .flatten() + .next() + .map(|t| t.outstanding_pages as usize) + .unwrap_or(0); input[0] = ((current_outstanding / 100) % 10) as f32; input[1] = ((current_outstanding / 10) % 10) as f32; input[2] = (current_outstanding % 10) as f32; @@ -177,11 +177,11 @@ impl SelectionPolicy for LinnOSPolicy { let Some(trace_entry) = trace_entry else { continue; }; - let outstanding = trace_entry.outstanding_requests as usize; + let outstanding = trace_entry.outstanding_pages as usize; let latency_us = trace_entry.latency_us as usize; let base = 3 + i * 7; - // Outstanding requests -> 3 digits (hundreds, tens, ones) + // Outstanding pages -> 3 digits (hundreds, tens, ones) input[base] = ((outstanding / 100) % 10) as f32; input[base + 1] = ((outstanding / 10) % 10) as f32; input[base + 2] = (outstanding % 10) as f32; @@ -193,11 +193,12 @@ impl SelectionPolicy for LinnOSPolicy { input[base + 6] = (latency_us % 10) as f32; } - // Hidden layer: input (31) x hidden_weights (31x256) -> hidden_out (256) + // Hidden layer: input (31) x hidden_weights (31x256) + bias (256) -> hidden_out (256) let hidden_weights = &self.hidden_layers[device_idx]; + let hidden_bias = &self.hidden_biases[device_idx]; let mut hidden_out = [0.0f32; 256]; for j in 0..256 { - let mut sum = 0.0f32; + let mut sum = hidden_bias[j]; for i in 0..31 { sum += input[i] * hidden_weights[i][j]; } @@ -205,9 +206,10 @@ impl SelectionPolicy for LinnOSPolicy { hidden_out[j] = if sum > 0.0 { sum } else { 0.0 }; } - // Output layer: hidden_out (256) x output_weights (256x2) -> output (2) + // Output layer: hidden_out (256) x output_weights (256x2) + bias 
(2) -> output (2) let output_weights = &self.output_layers[device_idx]; - let mut output = [0.0f32; 2]; + let output_bias = &self.output_biases[device_idx]; + let mut output = [output_bias[0], output_bias[1]]; for k in 0..2 { for j in 0..256 { output[k] += hidden_out[j] * output_weights[j][k]; diff --git a/kernel/comps/virtio/src/device/block/device.rs b/kernel/comps/virtio/src/device/block/device.rs index a7bea5a1e..610151008 100644 --- a/kernel/comps/virtio/src/device/block/device.rs +++ b/kernel/comps/virtio/src/device/block/device.rs @@ -184,7 +184,8 @@ impl aster_block::BlockDevice for BlockDevice { let mut bio = bio; let device_index = self.device.device_index.load(Ordering::Relaxed); - bio.prepare_enqueue(reply_handle, self.queue.clone(), device_index); + bio.prepare_enqueue(reply_handle, device_index); + self.device.inc_page_counter(bio.num_pages()); let producer = self.bio_submission_oqueue().attach_value_producer()?; producer.produce(bio); Ok(()) @@ -209,6 +210,7 @@ struct DeviceInner { id_allocator: SpinLock, submitted_requests: SpinLock>, device_index: AtomicU64, + num_outstanding_pages: AtomicU64 } impl DeviceInner { @@ -258,6 +260,7 @@ impl DeviceInner { id_allocator: SpinLock::new(IdAlloc::with_capacity(Self::QUEUE_SIZE as usize)), submitted_requests: SpinLock::new(BTreeMap::new()), device_index: AtomicU64::new(u64::MAX), + num_outstanding_pages: AtomicU64::new(0) }); let cloned_device = device.clone(); @@ -329,7 +332,11 @@ impl DeviceInner { complete_request.bio_request.bios().for_each(|bio| { bio.complete(BioStatus::Complete); #[cfg(not(baseline_asterinas))] - bio.report_statistics(); + { + let pages = bio.num_pages(); + let outstanding = self.num_outstanding_pages.fetch_sub(pages, Ordering::Relaxed) - pages; + bio.report_statistics(outstanding); + } }); } } @@ -582,6 +589,10 @@ impl DeviceInner { return; } } + + fn inc_page_counter(&self, n_pages: u64) { + self.num_outstanding_pages.fetch_add(n_pages, Ordering::Relaxed); + } } /// A submitted bio 
request for callback. diff --git a/kernel/src/fs/mod.rs b/kernel/src/fs/mod.rs index 626ed373b..6f90778c7 100644 --- a/kernel/src/fs/mod.rs +++ b/kernel/src/fs/mod.rs @@ -34,6 +34,38 @@ use crate::{ prelude::*, }; +#[cfg(not(baseline_asterinas))] +use spin::Once; + +/// Global handle to the data capture file, set during `setup_data_capture`. +#[cfg(not(baseline_asterinas))] +static DATA_CAPTURE_FILE: Once< + Arc>, +> = Once::new(); + +/// Flush all buffered capture data to disk. Call before kernel exit. +/// +/// Commands are enqueued into the server's OQueue and processed in FIFO order. +/// `stop()` spins until the server thread acknowledges, so by the time it returns, +/// the preceding `flush_all` and `sync` are guaranteed to have been processed. +#[cfg(not(baseline_asterinas))] +pub fn flush_data_capture() { + if let Some(capture_file) = DATA_CAPTURE_FILE.get() { + info!("[capture] Flushing all capture data before exit..."); + if let Err(e) = capture_file.flush_all() { + error!("[capture] flush_all failed: {:?}", e); + } + if let Err(e) = capture_file.sync() { + error!("[capture] sync failed: {:?}", e); + } + // stop() blocks until the server thread processes all preceding commands. + if let Err(e) = capture_file.stop() { + error!("[capture] stop failed: {:?}", e); + } + info!("[capture] Capture data flushed."); + } +} + /// Start a thread of the block device to pop requests from the block device's /// request queue and process them if there are any. If the request queue is empty, /// the thread will wait until there is a request in the queue. 
@@ -146,13 +178,32 @@ fn setup_raid1_device(raid_device_name: &str) -> Result<()> { } // #[cfg(not(baseline_asterinas))] - setup_data_capture(&members, RAID_MEMBER_NAMES); + // setup_data_capture(&members, RAID_MEMBER_NAMES); #[cfg(not(baseline_asterinas))] info!("[raid] creating selection policy"); - #[cfg(not(baseline_asterinas))] + // #[cfg(not(baseline_asterinas))] let selection_policy = RoundRobinPolicy::new(members.clone()).unwrap(); #[cfg(not(baseline_asterinas))] + let observers = members + .iter() + .map(|dev| { + use aster_virtio::device::block::server_traits::BlockIOObservable; + use ostd::orpc::oqueue::{OQueueBase, ObservationQuery}; + let virtio_dev = dev + .downcast_ref::() + .expect("RAID member must be a VirtIoBlockDevice for LinnOS"); + ostd::sync::Mutex::new( + virtio_dev + .bio_completion_oqueue() + .attach_weak_observer(4, ObservationQuery::identity()) + .expect("Failed to attach weak observer to bio_completion_oqueue"), + ) + }) + .collect(); + #[cfg(not(baseline_asterinas))] + let selection_policy = LinnOSPolicy::new(members.clone(), observers).unwrap(); + #[cfg(not(baseline_asterinas))] let raid1device = Raid1Device::init(raid_device_name, members, selection_policy); #[cfg(baseline_asterinas)] let raid1device = Raid1Device::init(raid_device_name, members); @@ -275,23 +326,8 @@ fn setup_data_capture( error!("[capture] failed to enable capturing: {:?}", e); } - // Spawn a timer task that sends TimedFlush every 10 seconds to trigger - // a flush if data has been idle for that long. 
- let capture_file_for_timer = capture_file.clone(); - crate::ThreadOptions::new(move || { - use core::time::Duration; - use ostd::timer::Jiffies; - loop { - let target = Jiffies::elapsed().as_duration() + Duration::from_secs(5); - while Jiffies::elapsed().as_duration() < target { - ostd::task::Task::yield_now(); - } - if let Err(e) = capture_file_for_timer.timed_flush() { - log::error!("[capture] timed_flush failed: {:?}", e); - } - } - }) - .spawn(); + // Store the capture file globally so it can be flushed on kernel exit. + DATA_CAPTURE_FILE.call_once(|| capture_file); info!("[capture] data capture enabled for bio completion stats"); } diff --git a/kernel/src/lib.rs b/kernel/src/lib.rs index 85b14c3fb..77d4110f1 100644 --- a/kernel/src/lib.rs +++ b/kernel/src/lib.rs @@ -221,6 +221,10 @@ fn init_thread() { ostd::task::halt_cpu(); } + // Flush all capture data before exiting. + #[cfg(not(baseline_asterinas))] + fs::flush_data_capture(); + // TODO: exit via qemu isa debug device should not be the only way. let exit_code = if initproc.status().exit_code() == 0 { QemuExitCode::Success From 0f8bb74c22528d816a32f81e616c6ce17e51712e Mon Sep 17 00:00:00 2001 From: Yingqi Cao Date: Mon, 6 Apr 2026 03:21:01 +0000 Subject: [PATCH 10/22] Convert RAID1 write to asynchronous. 
--- kernel/comps/raid/src/lib.rs | 65 ++++++++++++++++++- .../comps/virtio/src/device/block/device.rs | 3 + 2 files changed, 67 insertions(+), 1 deletion(-) diff --git a/kernel/comps/raid/src/lib.rs b/kernel/comps/raid/src/lib.rs index 0ba584c48..666127f51 100644 --- a/kernel/comps/raid/src/lib.rs +++ b/kernel/comps/raid/src/lib.rs @@ -152,7 +152,7 @@ impl Raid1Device { // log::info!("Raid1Device process request, type: {:?}", request.type_()); match request.type_() { BioType::Read => self.process_read_async(request), - BioType::Write => self.process_write(request), + BioType::Write => self.process_write_async(request), BioType::Flush => self.process_flush(request), BioType::Discard => self.process_discard(request), } @@ -277,6 +277,7 @@ impl Raid1Device { /// Processes write requests by fanning out to all mirrors and aggregating /// the results (all must succeed). + #[expect(dead_code)] fn process_write(&self, request: BioRequest) { for parent in request.bios() { // Submit the same write to all members. @@ -288,6 +289,68 @@ impl Raid1Device { } } + /// Processes write requests asynchronously by fanning out to all mirrors. + /// + /// Each child BIO carries a callback that atomically decrements a shared + /// counter. The last callback to fire (or the dispatch thread on submission + /// failure) completes the parent. Any failed member marks the write as + /// `IoError`; all members must succeed for `Complete` to be reported. + fn process_write_async(&self, request: BioRequest) { + use core::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; + use ostd::sync::{LocalIrqDisabled, SpinLock}; + + for parent in request.into_bios() { + let n = self.members.len(); + let remaining = Arc::new(AtomicUsize::new(n)); + let had_error = Arc::new(AtomicBool::new(false)); + + // Extract before moving parent into the guard. 
+ let start_sid = parent.sid_range().start; + let segments = parent.segments().to_vec(); + let guard = Arc::new(SpinLock::<_, LocalIrqDisabled>::new(Some(ParentGuard::new(parent)))); + + for member in &self.members { + let remaining_cb = remaining.clone(); + let had_error_cb = had_error.clone(); + let guard_cb = guard.clone(); + let remaining_err = remaining.clone(); + let had_error_err = had_error.clone(); + let guard_err = guard.clone(); + let member = member.clone(); + + let child = Bio::new_with_closure( + BioType::Write, + start_sid, + segments.clone(), + move |child_bio: &SubmittedBio| { + if child_bio.status() != BioStatus::Complete { + had_error_cb.store(true, Ordering::Release); + } + if remaining_cb.fetch_sub(1, Ordering::AcqRel) == 1 { + let status = if had_error_cb.load(Ordering::Acquire) { + BioStatus::IoError + } else { + BioStatus::Complete + }; + if let Some(g) = guard_cb.lock().take() { + g.complete(status); + } + } + }, + ); + + if member.submit(child).is_err() { + had_error_err.store(true, Ordering::Release); + if remaining_err.fetch_sub(1, Ordering::AcqRel) == 1 { + if let Some(g) = guard_err.lock().take() { + g.complete(BioStatus::IoError); + } + } + } + } + } + } + /// Propagates a flush to all members and completes after they finish. 
fn process_flush(&self, request: BioRequest) { for parent in request.bios() { diff --git a/kernel/comps/virtio/src/device/block/device.rs b/kernel/comps/virtio/src/device/block/device.rs index 610151008..16f70268d 100644 --- a/kernel/comps/virtio/src/device/block/device.rs +++ b/kernel/comps/virtio/src/device/block/device.rs @@ -186,6 +186,7 @@ impl aster_block::BlockDevice for BlockDevice { let device_index = self.device.device_index.load(Ordering::Relaxed); bio.prepare_enqueue(reply_handle, device_index); self.device.inc_page_counter(bio.num_pages()); + // log::info!("\x1b[32mIncremented\x1b[0m Page Counter by {}, new value: {}, device_index: {}, type: {:?}", bio.num_pages(), self.device.num_outstanding_pages.load(Ordering::Relaxed), device_index, bio.type_()); let producer = self.bio_submission_oqueue().attach_value_producer()?; producer.produce(bio); Ok(()) @@ -329,12 +330,14 @@ impl DeviceInner { } // Completes the bio request + // let req_type = complete_request.bio_request.type_(); complete_request.bio_request.bios().for_each(|bio| { bio.complete(BioStatus::Complete); #[cfg(not(baseline_asterinas))] { let pages = bio.num_pages(); let outstanding = self.num_outstanding_pages.fetch_sub(pages, Ordering::Relaxed) - pages; + // log::info!("\x1b[31mDecremented\x1b[0m Page Counter by {}, new value: {}, device_index: {}, type: {:?}", pages, outstanding, self.device_index.load(Ordering::Relaxed), req_type); bio.report_statistics(outstanding); } }); From b13133151af251a2e096c88397d291c7c402317d Mon Sep 17 00:00:00 2001 From: Yingqi Cao Date: Fri, 10 Apr 2026 23:33:48 +0000 Subject: [PATCH 11/22] IMPORTANT: fix oqueue panics by preventing the cursor underflow using wrapping_sub. 
--- ostd/src/orpc/oqueue/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ostd/src/orpc/oqueue/mod.rs b/ostd/src/orpc/oqueue/mod.rs index 0b298f3a4..9db2e038c 100644 --- a/ostd/src/orpc/oqueue/mod.rs +++ b/ostd/src/orpc/oqueue/mod.rs @@ -599,7 +599,7 @@ impl Sub for Cursor { type Output = Cursor; fn sub(self, rhs: usize) -> Self::Output { - Cursor(self.0 - rhs) + Cursor(self.0.wrapping_sub(rhs)) } } From 0bd648dd74c37f60987102bfb28e255af876462a Mon Sep 17 00:00:00 2001 From: Yingqi Cao Date: Sat, 11 Apr 2026 01:39:59 +0000 Subject: [PATCH 12/22] Ignoring futex error for benchmark --- kernel/src/process/posix_thread/futex.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/src/process/posix_thread/futex.rs b/kernel/src/process/posix_thread/futex.rs index 99015fb16..ddcadf4b2 100644 --- a/kernel/src/process/posix_thread/futex.rs +++ b/kernel/src/process/posix_thread/futex.rs @@ -478,7 +478,7 @@ impl FutexKey { pub fn load_val(&self, ctx: &Context) -> Result { // FIXME: how to implement a atomic load? 
- warn!("implement an atomic load"); + // warn!("implement an atomic load"); ctx.user_space().read_val(self.addr) } From c46005e5e43055242ce0d8486b96a1511825d6cc Mon Sep 17 00:00:00 2001 From: Yingqi Cao Date: Sat, 11 Apr 2026 02:59:24 +0000 Subject: [PATCH 13/22] Updated the LinnOS Weight Placeholder --- kernel/comps/raid/src/linnos_weights.rs | 26 +++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/kernel/comps/raid/src/linnos_weights.rs b/kernel/comps/raid/src/linnos_weights.rs index 43472f64b..d07ea21c2 100644 --- a/kernel/comps/raid/src/linnos_weights.rs +++ b/kernel/comps/raid/src/linnos_weights.rs @@ -17,25 +17,51 @@ pub const NUM_DEVICES: usize = 3; /// Hidden layer weights for device 0: 31 inputs -> 256 neurons pub static HIDDEN_WEIGHTS_0: [[f32; 256]; 31] = [[0.0; 256]; 31]; +/// Hidden layer bias for device 0 +pub static HIDDEN_BIAS_0: [f32; 256] = [0.0; 256]; + /// Hidden layer weights for device 1 pub static HIDDEN_WEIGHTS_1: [[f32; 256]; 31] = [[0.0; 256]; 31]; +/// Hidden layer bias for device 1 +pub static HIDDEN_BIAS_1: [f32; 256] = [0.0; 256]; + /// Hidden layer weights for device 2 pub static HIDDEN_WEIGHTS_2: [[f32; 256]; 31] = [[0.0; 256]; 31]; +/// Hidden layer bias for device 2 +pub static HIDDEN_BIAS_2: [f32; 256] = [0.0; 256]; + /// Output layer weights for device 0: 256 neurons -> 2 classes pub static OUTPUT_WEIGHTS_0: [[f32; 2]; 256] = [[0.0; 2]; 256]; +/// Output layer bias for device 0 +pub static OUTPUT_BIAS_0: [f32; 2] = [0.0; 2]; + /// Output layer weights for device 1 pub static OUTPUT_WEIGHTS_1: [[f32; 2]; 256] = [[0.0; 2]; 256]; +/// Output layer bias for device 1 +pub static OUTPUT_BIAS_1: [f32; 2] = [0.0; 2]; + /// Output layer weights for device 2 pub static OUTPUT_WEIGHTS_2: [[f32; 2]; 256] = [[0.0; 2]; 256]; +/// Output layer bias for device 2 +pub static OUTPUT_BIAS_2: [f32; 2] = [0.0; 2]; + /// All hidden layer weights indexed by device. 
pub static HIDDEN_WEIGHTS: [&[[f32; 256]; 31]; NUM_DEVICES] = [&HIDDEN_WEIGHTS_0, &HIDDEN_WEIGHTS_1, &HIDDEN_WEIGHTS_2]; +/// All hidden layer biases indexed by device. +pub static HIDDEN_BIASES: [&[f32; 256]; NUM_DEVICES] = + [&HIDDEN_BIAS_0, &HIDDEN_BIAS_1, &HIDDEN_BIAS_2]; + /// All output layer weights indexed by device. pub static OUTPUT_WEIGHTS: [&[[f32; 2]; 256]; NUM_DEVICES] = [&OUTPUT_WEIGHTS_0, &OUTPUT_WEIGHTS_1, &OUTPUT_WEIGHTS_2]; + +/// All output layer biases indexed by device. +pub static OUTPUT_BIASES: [&[f32; 2]; NUM_DEVICES] = + [&OUTPUT_BIAS_0, &OUTPUT_BIAS_1, &OUTPUT_BIAS_2]; From 53f47e014aeabb9688e33e215b9c67d4d504ac51 Mon Sep 17 00:00:00 2001 From: Yingqi Cao Date: Sat, 11 Apr 2026 03:44:40 +0000 Subject: [PATCH 14/22] Remove linnos weights from tracking --- kernel/comps/raid/src/linnos_weights.rs | 67 ------------------------- 1 file changed, 67 deletions(-) delete mode 100644 kernel/comps/raid/src/linnos_weights.rs diff --git a/kernel/comps/raid/src/linnos_weights.rs b/kernel/comps/raid/src/linnos_weights.rs deleted file mode 100644 index d07ea21c2..000000000 --- a/kernel/comps/raid/src/linnos_weights.rs +++ /dev/null @@ -1,67 +0,0 @@ -// SPDX-License-Identifier: MPL-2.0 - -// LinnOS neural network weights hardcoded for 3 devices. -// Each device has: -// - hidden layer: 31 x 256 matrix -// - output layer: 256 x 2 matrix -// -// These weights will be filled in by a Python script with trained values. -// For now, all weights are initialized to 0.0 as placeholders. -// -// The actual weights numbers are expected to be filled with the jinja2 -// templates by the Python scripts that trains the model. - -/// Number of devices with hardcoded weights. 
-pub const NUM_DEVICES: usize = 3; - -/// Hidden layer weights for device 0: 31 inputs -> 256 neurons -pub static HIDDEN_WEIGHTS_0: [[f32; 256]; 31] = [[0.0; 256]; 31]; - -/// Hidden layer bias for device 0 -pub static HIDDEN_BIAS_0: [f32; 256] = [0.0; 256]; - -/// Hidden layer weights for device 1 -pub static HIDDEN_WEIGHTS_1: [[f32; 256]; 31] = [[0.0; 256]; 31]; - -/// Hidden layer bias for device 1 -pub static HIDDEN_BIAS_1: [f32; 256] = [0.0; 256]; - -/// Hidden layer weights for device 2 -pub static HIDDEN_WEIGHTS_2: [[f32; 256]; 31] = [[0.0; 256]; 31]; - -/// Hidden layer bias for device 2 -pub static HIDDEN_BIAS_2: [f32; 256] = [0.0; 256]; - -/// Output layer weights for device 0: 256 neurons -> 2 classes -pub static OUTPUT_WEIGHTS_0: [[f32; 2]; 256] = [[0.0; 2]; 256]; - -/// Output layer bias for device 0 -pub static OUTPUT_BIAS_0: [f32; 2] = [0.0; 2]; - -/// Output layer weights for device 1 -pub static OUTPUT_WEIGHTS_1: [[f32; 2]; 256] = [[0.0; 2]; 256]; - -/// Output layer bias for device 1 -pub static OUTPUT_BIAS_1: [f32; 2] = [0.0; 2]; - -/// Output layer weights for device 2 -pub static OUTPUT_WEIGHTS_2: [[f32; 2]; 256] = [[0.0; 2]; 256]; - -/// Output layer bias for device 2 -pub static OUTPUT_BIAS_2: [f32; 2] = [0.0; 2]; - -/// All hidden layer weights indexed by device. -pub static HIDDEN_WEIGHTS: [&[[f32; 256]; 31]; NUM_DEVICES] = - [&HIDDEN_WEIGHTS_0, &HIDDEN_WEIGHTS_1, &HIDDEN_WEIGHTS_2]; - -/// All hidden layer biases indexed by device. -pub static HIDDEN_BIASES: [&[f32; 256]; NUM_DEVICES] = - [&HIDDEN_BIAS_0, &HIDDEN_BIAS_1, &HIDDEN_BIAS_2]; - -/// All output layer weights indexed by device. -pub static OUTPUT_WEIGHTS: [&[[f32; 2]; 256]; NUM_DEVICES] = - [&OUTPUT_WEIGHTS_0, &OUTPUT_WEIGHTS_1, &OUTPUT_WEIGHTS_2]; - -/// All output layer biases indexed by device. 
-pub static OUTPUT_BIASES: [&[f32; 2]; NUM_DEVICES] = - [&OUTPUT_BIAS_0, &OUTPUT_BIAS_1, &OUTPUT_BIAS_2]; From fc00a08d3ded82160a832c73c8e1eb5156b5a45d Mon Sep 17 00:00:00 2001 From: Yingqi Cao Date: Sat, 11 Apr 2026 06:58:35 +0000 Subject: [PATCH 15/22] Kept track of the number of outstanding pages in SubmittedBio, log this feature at the time the IO arrives, rather than completing, and corrected the current IO's pages feature. --- kernel/comps/block/src/bio.rs | 35 +++++++++++++++---- kernel/comps/block/src/lib.rs | 5 +++ kernel/comps/raid/src/lib.rs | 15 ++++---- kernel/comps/raid/src/selection_policies.rs | 30 ++++++++++------ kernel/comps/raid/src/server_traits.rs | 2 +- .../comps/virtio/src/device/block/device.rs | 12 ++++--- 6 files changed, 68 insertions(+), 31 deletions(-) diff --git a/kernel/comps/block/src/bio.rs b/kernel/comps/block/src/bio.rs index 54722b244..04d3ecef7 100644 --- a/kernel/comps/block/src/bio.rs +++ b/kernel/comps/block/src/bio.rs @@ -145,6 +145,7 @@ impl Bio { // enqueue to the block device // A SubmittedBio is created here from a Bio, and then pass down to the lower layers. + // Those empty fields will be set just before in the block_device.enqueue function in the prepare_enqueue function. if let Err(e) = block_device.enqueue(SubmittedBio { bio_inner: self.0.clone(), #[cfg(not(baseline_asterinas))] @@ -155,6 +156,8 @@ impl Bio { device_index: None, #[cfg(not(baseline_asterinas))] num_pages: None, + #[cfg(not(baseline_asterinas))] + outstanding_pages: None, }) { // Fail to submit, revert the status. 
let result = self.0.status.compare_exchange( @@ -342,6 +345,9 @@ pub struct SubmittedBio { #[cfg(not(baseline_asterinas))] num_pages: Option, + + #[cfg(not(baseline_asterinas))] + outstanding_pages: Option, } impl core::fmt::Debug for SubmittedBio { @@ -355,7 +361,8 @@ impl core::fmt::Debug for SubmittedBio { .field( "reply_handle", &self.reply_handle.as_ref().map(|_| ""), - ); + ) + .field("outstanding_pages", &self.outstanding_pages); d.finish() } } @@ -371,10 +378,18 @@ impl SubmittedBio { self.bio_inner.sid_range() } + /// an immutable version of the num_pages function. Panic if the num_pages field is not set yet. + pub fn get_num_pages(&self) -> u64 { + self.num_pages.expect("num_pages is not set yet") + } + /// Returns the number of 4KB pages covered by this bio's sector range. - pub fn num_pages(&self) -> u64 { - let sectors = self.bio_inner.sid_range().end.to_raw() - self.bio_inner.sid_range().start.to_raw(); - (sectors + 7) / 8 + /// Note the field num_pages is only available when calling this function, but accessing it directly is not available. + pub fn num_pages(&mut self) -> u64 { + *self.num_pages.get_or_insert_with(|| { + let sectors = self.bio_inner.sid_range().end.to_raw() - self.bio_inner.sid_range().start.to_raw(); + (sectors + 7) / 8 + }) } /// Returns the slice to the memory segments. @@ -412,27 +427,33 @@ impl SubmittedBio { self.submission_time_us } + /// Argument: + /// - `num_pages`: The number of pages covered by this bio's sector range. This is used to update the outstanding page counter in the block device, and also used for performance statistics reporting. + /// - `outstanding_pages`: The number of outstanding pages on the fly before enqueing this bio request. 
#[cfg(not(baseline_asterinas))] pub fn prepare_enqueue( &mut self, reply_handle: RefProducer, device_index: u64, + outstanding_pages: u64 ) { + self.reply_handle = Some(reply_handle); self.submission_time_us = Some(read_monotonic_time().as_micros() as u64); self.device_index = Some(device_index); - self.num_pages = Some(self.num_pages()); + self.num_pages(); // set the num_pages field + self.outstanding_pages = Some(outstanding_pages + self.num_pages.unwrap()); // accumulate the number of outstanding pages } #[cfg(not(baseline_asterinas))] - pub fn report_statistics(&self, outstanding_pages: u64) { + pub fn report_statistics(&self) { self.reply_handle .as_ref() .unwrap() .try_produce_ref(&BlockDeviceCompletionStats { latency_us: read_monotonic_time().as_micros() as u64 - self.submission_time_us.unwrap(), - outstanding_pages, + outstanding_pages: self.outstanding_pages.unwrap_or(u64::MAX), device_index: self.device_index.unwrap_or(u64::MAX), }); } diff --git a/kernel/comps/block/src/lib.rs b/kernel/comps/block/src/lib.rs index 9e89e4432..9d85f674f 100644 --- a/kernel/comps/block/src/lib.rs +++ b/kernel/comps/block/src/lib.rs @@ -58,6 +58,11 @@ pub trait BlockDevice: Send + Sync + Any + Debug { /// Returns the metadata of the block device. fn metadata(&self) -> BlockDeviceMeta; + + /// Returns the number of outstanding pages for this device. + fn num_outstanding_pages(&self) -> u64 { + 0 + } } /// Metadata for a block device. diff --git a/kernel/comps/raid/src/lib.rs b/kernel/comps/raid/src/lib.rs index 666127f51..4c2c80ab0 100644 --- a/kernel/comps/raid/src/lib.rs +++ b/kernel/comps/raid/src/lib.rs @@ -208,21 +208,20 @@ impl Raid1Device { #[cfg(not(baseline_asterinas))] fn process_read(&self, request: BioRequest) { // Submit all children first to overlap device I/O. 
- let mut pending: alloc::vec::Vec<(&SubmittedBio, BioWaiter)> = alloc::vec::Vec::new(); + let mut pending: alloc::vec::Vec<(SubmittedBio, BioWaiter)> = alloc::vec::Vec::new(); - for parent in request.bios() { - let member = self.selection_policy.select_block_device(parent).unwrap(); + for mut parent in request.into_bios() { + let member = self.selection_policy.select_block_device(&mut parent).unwrap(); let child = Bio::new( // Child BIO mirrors the parent’s type, range, and buffers. BioType::Read, parent.sid_range().start, - Self::clone_segments(parent), + Self::clone_segments(&parent), None, ); match child.submit(&*member) { Ok(waiter) => pending.push((parent, waiter)), - // Err(_) => parent.complete(BioStatus::IoError), - Err(_) => todo!("Failed to submit child BIO, Don't know what to do"), + Err(_) => todo!("Failed to submit child BIO, Don’t know what to do"), } } @@ -245,9 +244,9 @@ impl Raid1Device { /// member by the selection policy (device 0 if asterinas baseline) and submitted with `Bio::submit` to overlap device /// I/O. Completion of the parent is reported after the child finishes. 
fn process_read_async(&self, request: BioRequest) { - for parent in request.into_bios() { + for mut parent in request.into_bios() { #[cfg(not(baseline_asterinas))] - let member = self.selection_policy.select_block_device(&parent).unwrap(); + let member = self.selection_policy.select_block_device(&mut parent).unwrap(); #[cfg(baseline_asterinas)] let member = self.members[0].clone(); diff --git a/kernel/comps/raid/src/selection_policies.rs b/kernel/comps/raid/src/selection_policies.rs index 277806396..57b44053c 100644 --- a/kernel/comps/raid/src/selection_policies.rs +++ b/kernel/comps/raid/src/selection_policies.rs @@ -36,7 +36,7 @@ impl Dummy0Policy { impl SelectionPolicy for Dummy0Policy { fn select_block_device( &self, - _submitted: &SubmittedBio, + _submitted: &mut SubmittedBio, ) -> Result, Error> { Ok(self.members[0].clone()) } @@ -63,7 +63,7 @@ impl RoundRobinPolicy { impl SelectionPolicy for RoundRobinPolicy { fn select_block_device( &self, - _submitted: &SubmittedBio, + _submitted: &mut SubmittedBio, ) -> Result, Error> { let idx = self.read_cursor.fetch_add(1, Ordering::Relaxed); Ok(self.members[idx % self.members.len()].clone()) @@ -141,9 +141,10 @@ impl LinnOSPolicy { } impl SelectionPolicy for LinnOSPolicy { - fn select_block_device(&self, submitted: &SubmittedBio) -> Result, Error> { + fn select_block_device(&self, submitted: &mut SubmittedBio) -> Result, Error> { let num_devices = self.members.len(); let mut fail_cnt = 0; + let num_pages = submitted.num_pages(); loop { let idx = self.read_cursor.fetch_add(1, Ordering::Relaxed); @@ -161,18 +162,14 @@ impl SelectionPolicy for LinnOSPolicy { let mut input = [0.0f32; 31]; // Current outstanding pages: use most recent trace entry, decompose into 3 digits - let current_outstanding = completion_trace - .iter() - .flatten() - .next() - .map(|t| t.outstanding_pages as usize) - .unwrap_or(0); + let current_outstanding = num_pages as usize + self.members[device_idx].num_outstanding_pages() as usize; input[0] = 
((current_outstanding / 100) % 10) as f32; input[1] = ((current_outstanding / 10) % 10) as f32; input[2] = (current_outstanding % 10) as f32; // Feature Engineering in LinnOS: Decompose numbers into digits. // Historical features: 4 steps, each with 3 digits outstanding + 4 digits latency + let mut observed: [(usize, usize); 4] = [(0, 0); 4]; for (i, trace_entry) in completion_trace.iter().enumerate().take(4) { let Some(trace_entry) = trace_entry else { continue; @@ -181,6 +178,8 @@ impl SelectionPolicy for LinnOSPolicy { let latency_us = trace_entry.latency_us as usize; let base = 3 + i * 7; + observed[i] = (outstanding, latency_us); + // Outstanding pages -> 3 digits (hundreds, tens, ones) input[base] = ((outstanding / 100) % 10) as f32; input[base + 1] = ((outstanding / 10) % 10) as f32; @@ -193,6 +192,13 @@ impl SelectionPolicy for LinnOSPolicy { input[base + 6] = (latency_us % 10) as f32; } + // log::info!( + // "LinnOS dev={} cur_outstanding={} outstanding=[{},{},{},{}] latency_us=[{},{},{},{}]", + // device_idx, current_outstanding, + // observed[0].0, observed[1].0, observed[2].0, observed[3].0, + // observed[0].1, observed[1].1, observed[2].1, observed[3].1, + // ); + // Hidden layer: input (31) x hidden_weights (31x256) + bias (256) -> hidden_out (256) let hidden_weights = &self.hidden_layers[device_idx]; let hidden_bias = &self.hidden_biases[device_idx]; @@ -216,8 +222,9 @@ impl SelectionPolicy for LinnOSPolicy { } } - // Argmax: output[0] > output[1] means fast, otherwise slow - if output[0] > output[1] { + // Argmax: output[0] < output[1] means fast, otherwise slow + if output[0] < output[1] { + log::info!("Submitting to device {} predicted FAST. 
output=[{:.4},{:.4}]", device_idx, output[0], output[1]); return Ok(self.members[device_idx].clone()); } @@ -225,6 +232,7 @@ impl SelectionPolicy for LinnOSPolicy { // All devices predicted slow -- fall back to round-robin if fail_cnt >= num_devices { let fallback_idx = self.read_cursor.fetch_add(1, Ordering::Relaxed) % num_devices; + log::info!("Submitting to device {} as all devices are busy. output=[{:.4},{:.4}]", fallback_idx, output[0], output[1]); return Ok(self.members[fallback_idx].clone()); } } diff --git a/kernel/comps/raid/src/server_traits.rs b/kernel/comps/raid/src/server_traits.rs index f390cd927..7515d5b96 100644 --- a/kernel/comps/raid/src/server_traits.rs +++ b/kernel/comps/raid/src/server_traits.rs @@ -26,5 +26,5 @@ pub trait SelectionPolicy: Debug { /// Get the block device to read from. The policy cannot decide, for whatever reason, this should /// return an error. The caller will use some fallback. If the returned block device does not /// exist, then the caller will also fallback. 
- fn select_block_device(&self, submitted: &SubmittedBio) -> Result, Error>; + fn select_block_device(&self, submitted: &mut SubmittedBio) -> Result, Error>; } diff --git a/kernel/comps/virtio/src/device/block/device.rs b/kernel/comps/virtio/src/device/block/device.rs index 16f70268d..d420034ce 100644 --- a/kernel/comps/virtio/src/device/block/device.rs +++ b/kernel/comps/virtio/src/device/block/device.rs @@ -184,7 +184,7 @@ impl aster_block::BlockDevice for BlockDevice { let mut bio = bio; let device_index = self.device.device_index.load(Ordering::Relaxed); - bio.prepare_enqueue(reply_handle, device_index); + bio.prepare_enqueue(reply_handle, device_index, self.device.num_outstanding_pages.load(Ordering::Relaxed)); self.device.inc_page_counter(bio.num_pages()); // log::info!("\x1b[32mIncremented\x1b[0m Page Counter by {}, new value: {}, device_index: {}, type: {:?}", bio.num_pages(), self.device.num_outstanding_pages.load(Ordering::Relaxed), device_index, bio.type_()); let producer = self.bio_submission_oqueue().attach_value_producer()?; @@ -198,6 +198,10 @@ impl aster_block::BlockDevice for BlockDevice { nr_sectors: self.device.config_manager.capacity_sectors(), } } + + fn num_outstanding_pages(&self) -> u64 { + self.device.num_outstanding_pages.load(Ordering::Relaxed) + } } #[derive(Debug)] @@ -335,10 +339,10 @@ impl DeviceInner { bio.complete(BioStatus::Complete); #[cfg(not(baseline_asterinas))] { - let pages = bio.num_pages(); - let outstanding = self.num_outstanding_pages.fetch_sub(pages, Ordering::Relaxed) - pages; + let pages = bio.get_num_pages(); + let outstanding = self.num_outstanding_pages.fetch_sub(pages, Ordering::Relaxed); // log::info!("\x1b[31mDecremented\x1b[0m Page Counter by {}, new value: {}, device_index: {}, type: {:?}", pages, outstanding, self.device_index.load(Ordering::Relaxed), req_type); - bio.report_statistics(outstanding); + bio.report_statistics(); } }); } From c6ccfaa60af6d5d8f544a6767efc8baacfccdc9f Mon Sep 17 00:00:00 2001 From: 
Yingqi Cao Date: Sat, 11 Apr 2026 20:47:08 +0000 Subject: [PATCH 16/22] Added LinnOS Plus --- .../raid/src/generate_linnos_plus_weights.py | 147 ++++++++++++++++ kernel/comps/raid/src/lib.rs | 2 + .../comps/raid/src/linnos_plus_weights.rs.j2 | 97 ++++++++++ kernel/comps/raid/src/selection_policies.rs | 165 ++++++++++++++++++ 4 files changed, 411 insertions(+) create mode 100644 kernel/comps/raid/src/generate_linnos_plus_weights.py create mode 100644 kernel/comps/raid/src/linnos_plus_weights.rs.j2 diff --git a/kernel/comps/raid/src/generate_linnos_plus_weights.py b/kernel/comps/raid/src/generate_linnos_plus_weights.py new file mode 100644 index 000000000..2a6da1987 --- /dev/null +++ b/kernel/comps/raid/src/generate_linnos_plus_weights.py @@ -0,0 +1,147 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: MPL-2.0 + +""" +Load trained PyTorch LinnOSPlus models and generate the Rust weights file +using the Jinja2 template. + +The LinnOSPlus architecture has three linear layers: + Linear(31, 8) -> ReLU -> Linear(8, 8) -> ReLU -> Linear(8, 2) + +PyTorch state dict keys: + net.0.weight [8, 31] net.0.bias [8] + net.2.weight [8, 8] net.2.bias [8] + net.4.weight [2, 8] net.4.bias [2] + +Usage: + python generate_linnos_plus_weights.py \ + --models models/linnos_plus_device0.pt \ + models/linnos_plus_device1.pt \ + models/linnos_plus_device2.pt \ + --template kernel/comps/raid/src/linnos_plus_weights.rs.j2 \ + --output kernel/comps/raid/src/linnos_plus_weights.rs + +Run from the repository root. 
+""" + +import argparse +from pathlib import Path + +import torch +from jinja2 import Environment, FileSystemLoader + + +def load_model(path: str) -> dict: + """Load a model checkpoint and return its state dict.""" + state = torch.load(path, map_location="cpu", weights_only=False) + return state + + +def print_architecture(state: dict, device_idx: int) -> None: + """Print model architecture for sanity check.""" + print(f" Device {device_idx}:") + for name, tensor in state.items(): + print(f" {name:20s} shape={str(list(tensor.shape)):16s} dtype={tensor.dtype}") + + +def tensor_to_list(tensor: torch.Tensor) -> list: + """Convert a tensor to a nested Python list of floats.""" + return tensor.tolist() + + +def main(): + parser = argparse.ArgumentParser( + description="Generate LinnOSPlus Rust weight file from PyTorch models" + ) + parser.add_argument( + "--models", nargs="+", required=True, + help="Paths to .pt model files, one per device in order", + ) + parser.add_argument( + "--template", required=True, + help="Path to the Jinja2 template (.rs.j2)", + ) + parser.add_argument( + "--output", required=True, + help="Path for the generated Rust file (.rs)", + ) + args = parser.parse_args() + + # Load all models + models = [] + for path in args.models: + models.append(load_model(path)) + + num_devices = len(models) + + # Sanity check: print architecture + print(f"Loaded {num_devices} model(s).\n") + print("Model architecture:") + for i, state in enumerate(models): + print_architecture(state, i) + print() + + # Extract dimensions from the first model + # net.0: Linear(31, hidden1_size) + # net.2: Linear(hidden1_size, hidden2_size) + # net.4: Linear(hidden2_size, 2) + hidden1_size = models[0]["net.0.weight"].shape[0] + input_size = models[0]["net.0.weight"].shape[1] + hidden2_size = models[0]["net.2.weight"].shape[0] + output_size = models[0]["net.4.weight"].shape[0] + + print(f"Network: {input_size} -> {hidden1_size} (ReLU) -> {hidden2_size} (ReLU) -> {output_size}") + 
print() + + # Extract weights and biases for each device + # PyTorch stores weights as [out_features, in_features]. + # In Rust we index as weights[input][output], so we transpose. + hidden1_weights = [] + hidden1_biases = [] + hidden2_weights = [] + hidden2_biases = [] + output_weights = [] + output_biases = [] + + for i, state in enumerate(models): + # Hidden layer 1: [hidden1_size, 31] -> [31, hidden1_size] + hw1 = state["net.0.weight"].T + hidden1_weights.append(tensor_to_list(hw1)) + hidden1_biases.append(tensor_to_list(state["net.0.bias"])) + + # Hidden layer 2: [hidden2_size, hidden1_size] -> [hidden1_size, hidden2_size] + hw2 = state["net.2.weight"].T + hidden2_weights.append(tensor_to_list(hw2)) + hidden2_biases.append(tensor_to_list(state["net.2.bias"])) + + # Output layer: [2, hidden2_size] -> [hidden2_size, 2] + ow = state["net.4.weight"].T + output_weights.append(tensor_to_list(ow)) + output_biases.append(tensor_to_list(state["net.4.bias"])) + + # Render template + template_path = Path(args.template) + env = Environment( + loader=FileSystemLoader(str(template_path.parent)), + keep_trailing_newline=True, + ) + template = env.get_template(template_path.name) + + rendered = template.render( + num_devices=num_devices, + hidden1_size=hidden1_size, + hidden2_size=hidden2_size, + hidden1_weights=hidden1_weights, + hidden1_biases=hidden1_biases, + hidden2_weights=hidden2_weights, + hidden2_biases=hidden2_biases, + output_weights=output_weights, + output_biases=output_biases, + ) + + Path(args.output).write_text(rendered) + print(f"Generated {args.output} ({len(rendered)} bytes)") + + +if __name__ == "__main__": + main() diff --git a/kernel/comps/raid/src/lib.rs b/kernel/comps/raid/src/lib.rs index 4c2c80ab0..9511cb5b3 100644 --- a/kernel/comps/raid/src/lib.rs +++ b/kernel/comps/raid/src/lib.rs @@ -23,6 +23,8 @@ extern crate alloc; #[cfg(not(baseline_asterinas))] pub mod linnos_weights; #[cfg(not(baseline_asterinas))] +pub mod linnos_plus_weights; 
+#[cfg(not(baseline_asterinas))] pub mod selection_policies; #[cfg(not(baseline_asterinas))] pub mod server_traits; diff --git a/kernel/comps/raid/src/linnos_plus_weights.rs.j2 b/kernel/comps/raid/src/linnos_plus_weights.rs.j2 new file mode 100644 index 000000000..ccde1b011 --- /dev/null +++ b/kernel/comps/raid/src/linnos_plus_weights.rs.j2 @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: MPL-2.0 + +// LinnOSPlus neural network weights hardcoded for {{ num_devices }} devices. +// Each device has: +// - hidden layer 1: 31 x {{ hidden1_size }} matrix + {{ hidden1_size }} bias +// - hidden layer 2: {{ hidden1_size }} x {{ hidden2_size }} matrix + {{ hidden2_size }} bias +// - output layer: {{ hidden2_size }} x 2 matrix + 2 bias +// +// AUTO-GENERATED by generate_linnos_plus_weights.py using Jinja2. +// Do not edit this file manually. + +/// Number of devices with hardcoded weights. +pub const NUM_DEVICES: usize = {{ num_devices }}; + +/// Hidden layer 1 size (number of neurons). +pub const HIDDEN1_SIZE: usize = {{ hidden1_size }}; + +/// Hidden layer 2 size (number of neurons). 
+pub const HIDDEN2_SIZE: usize = {{ hidden2_size }}; + +{% for dev in range(num_devices) %} +/// Hidden layer 1 weights for device {{ dev }}: 31 inputs -> {{ hidden1_size }} neurons +pub static HIDDEN1_WEIGHTS_{{ dev }}: [[f32; {{ hidden1_size }}]; 31] = [ +{% for row in hidden1_weights[dev] %} + [{{ row | join(', ') }}], +{% endfor %} +]; + +/// Hidden layer 1 bias for device {{ dev }} +pub static HIDDEN1_BIAS_{{ dev }}: [f32; {{ hidden1_size }}] = [{{ hidden1_biases[dev] | join(', ') }}]; + +{% endfor %} +{% for dev in range(num_devices) %} +/// Hidden layer 2 weights for device {{ dev }}: {{ hidden1_size }} -> {{ hidden2_size }} neurons +pub static HIDDEN2_WEIGHTS_{{ dev }}: [[f32; {{ hidden2_size }}]; {{ hidden1_size }}] = [ +{% for row in hidden2_weights[dev] %} + [{{ row | join(', ') }}], +{% endfor %} +]; + +/// Hidden layer 2 bias for device {{ dev }} +pub static HIDDEN2_BIAS_{{ dev }}: [f32; {{ hidden2_size }}] = [{{ hidden2_biases[dev] | join(', ') }}]; + +{% endfor %} +{% for dev in range(num_devices) %} +/// Output layer weights for device {{ dev }}: {{ hidden2_size }} neurons -> 2 classes +pub static OUTPUT_WEIGHTS_{{ dev }}: [[f32; 2]; {{ hidden2_size }}] = [ +{% for row in output_weights[dev] %} + [{{ row | join(', ') }}], +{% endfor %} +]; + +/// Output layer bias for device {{ dev }} +pub static OUTPUT_BIAS_{{ dev }}: [f32; 2] = [{{ output_biases[dev] | join(', ') }}]; + +{% endfor %} +/// All hidden layer 1 weights indexed by device. +pub static HIDDEN1_WEIGHTS: [&[[f32; {{ hidden1_size }}]; 31]; NUM_DEVICES] = [ +{% for dev in range(num_devices) %} + &HIDDEN1_WEIGHTS_{{ dev }}, +{% endfor %} +]; + +/// All hidden layer 1 biases indexed by device. +pub static HIDDEN1_BIASES: [&[f32; {{ hidden1_size }}]; NUM_DEVICES] = [ +{% for dev in range(num_devices) %} + &HIDDEN1_BIAS_{{ dev }}, +{% endfor %} +]; + +/// All hidden layer 2 weights indexed by device. 
+pub static HIDDEN2_WEIGHTS: [&[[f32; {{ hidden2_size }}]; {{ hidden1_size }}]; NUM_DEVICES] = [ +{% for dev in range(num_devices) %} + &HIDDEN2_WEIGHTS_{{ dev }}, +{% endfor %} +]; + +/// All hidden layer 2 biases indexed by device. +pub static HIDDEN2_BIASES: [&[f32; {{ hidden2_size }}]; NUM_DEVICES] = [ +{% for dev in range(num_devices) %} + &HIDDEN2_BIAS_{{ dev }}, +{% endfor %} +]; + +/// All output layer weights indexed by device. +pub static OUTPUT_WEIGHTS: [&[[f32; 2]; {{ hidden2_size }}]; NUM_DEVICES] = [ +{% for dev in range(num_devices) %} + &OUTPUT_WEIGHTS_{{ dev }}, +{% endfor %} +]; + +/// All output layer biases indexed by device. +pub static OUTPUT_BIASES: [&[f32; 2]; NUM_DEVICES] = [ +{% for dev in range(num_devices) %} + &OUTPUT_BIAS_{{ dev }}, +{% endfor %} +]; diff --git a/kernel/comps/raid/src/selection_policies.rs b/kernel/comps/raid/src/selection_policies.rs index 57b44053c..0b07bfc76 100644 --- a/kernel/comps/raid/src/selection_policies.rs +++ b/kernel/comps/raid/src/selection_policies.rs @@ -238,3 +238,168 @@ impl SelectionPolicy for LinnOSPolicy { } } } + +/// LinnOSPlus: a deeper variant of the LinnOS neural-network selection policy. +/// +/// Architecture (per device): +/// Linear(31, 8) -> ReLU -> Linear(8, 8) -> ReLU -> Linear(8, 2) +/// +/// The input feature vector is identical to LinnOS (31 elements). +/// Weights are loaded from `linnos_plus_weights`. 
+#[orpc_server] +pub struct LinnOSPlusPolicy { + read_cursor: AtomicUsize, + members: Vec>, + observers: Vec>>, + hidden1_weights: Vec<[[f32; 8]; 31]>, + hidden1_biases: Vec<[f32; 8]>, + hidden2_weights: Vec<[[f32; 8]; 8]>, + hidden2_biases: Vec<[f32; 8]>, + output_weights: Vec<[[f32; 2]; 8]>, + output_biases: Vec<[f32; 2]>, +} + +impl core::fmt::Debug for LinnOSPlusPolicy { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.debug_struct("LinnOSPlusPolicy") + .field("read_cursor", &self.read_cursor) + .field("members", &self.members) + .field( + "observers", + &format_args!("[{} observers]", self.observers.len()), + ) + .finish() + } +} + +impl LinnOSPlusPolicy { + pub fn new( + members: Vec>, + observers: Vec>>, + ) -> Result, Error> { + use crate::linnos_plus_weights::{ + HIDDEN1_BIASES, HIDDEN1_WEIGHTS, HIDDEN2_BIASES, HIDDEN2_WEIGHTS, OUTPUT_BIASES, + OUTPUT_WEIGHTS, + }; + + let num_devices = members.len(); + + let hidden1_weights: Vec<[[f32; 8]; 31]> = + (0..num_devices).map(|i| *HIDDEN1_WEIGHTS[i]).collect(); + let hidden1_biases: Vec<[f32; 8]> = + (0..num_devices).map(|i| *HIDDEN1_BIASES[i]).collect(); + let hidden2_weights: Vec<[[f32; 8]; 8]> = + (0..num_devices).map(|i| *HIDDEN2_WEIGHTS[i]).collect(); + let hidden2_biases: Vec<[f32; 8]> = + (0..num_devices).map(|i| *HIDDEN2_BIASES[i]).collect(); + let output_weights: Vec<[[f32; 2]; 8]> = + (0..num_devices).map(|i| *OUTPUT_WEIGHTS[i]).collect(); + let output_biases: Vec<[f32; 2]> = + (0..num_devices).map(|i| *OUTPUT_BIASES[i]).collect(); + + let server = Self::new_with(|orpc_internal, _| Self { + orpc_internal, + read_cursor: AtomicUsize::new(0), + members, + observers, + hidden1_weights, + hidden1_biases, + hidden2_weights, + hidden2_biases, + output_weights, + output_biases, + }); + + Ok(server) + } +} + +impl SelectionPolicy for LinnOSPlusPolicy { + fn select_block_device(&self, submitted: &mut SubmittedBio) -> Result, Error> { + let num_devices = self.members.len(); + let mut 
fail_cnt = 0; + let num_pages = submitted.num_pages(); + + loop { + let idx = self.read_cursor.fetch_add(1, Ordering::Relaxed); + let device_idx = idx % num_devices; + let observer = self.observers[device_idx].lock(); + let completion_trace = observer + .weak_observe_recent(4) + .expect("Failed to observe completion trace"); + + // Build the 31-element input feature vector (same as LinnOS) + let mut input = [0.0f32; 31]; + + let current_outstanding = num_pages as usize + self.members[device_idx].num_outstanding_pages() as usize; + input[0] = ((current_outstanding / 100) % 10) as f32; + input[1] = ((current_outstanding / 10) % 10) as f32; + input[2] = (current_outstanding % 10) as f32; + + for (i, trace_entry) in completion_trace.iter().enumerate().take(4) { + let Some(trace_entry) = trace_entry else { + continue; + }; + let outstanding = trace_entry.outstanding_pages as usize; + let latency_us = trace_entry.latency_us as usize; + let base = 3 + i * 7; + + input[base] = ((outstanding / 100) % 10) as f32; + input[base + 1] = ((outstanding / 10) % 10) as f32; + input[base + 2] = (outstanding % 10) as f32; + + input[base + 3] = ((latency_us / 1000) % 10) as f32; + input[base + 4] = ((latency_us / 100) % 10) as f32; + input[base + 5] = ((latency_us / 10) % 10) as f32; + input[base + 6] = (latency_us % 10) as f32; + } + + // Hidden layer 1: input (31) x hidden1_weights (31x8) + bias (8) -> hidden1_out (8) + let h1_weights = &self.hidden1_weights[device_idx]; + let h1_bias = &self.hidden1_biases[device_idx]; + let mut hidden1_out = [0.0f32; 8]; + for j in 0..8 { + let mut sum = h1_bias[j]; + for i in 0..31 { + sum += input[i] * h1_weights[i][j]; + } + hidden1_out[j] = if sum > 0.0 { sum } else { 0.0 }; + } + + // Hidden layer 2: hidden1_out (8) x hidden2_weights (8x8) + bias (8) -> hidden2_out (8) + let h2_weights = &self.hidden2_weights[device_idx]; + let h2_bias = &self.hidden2_biases[device_idx]; + let mut hidden2_out = [0.0f32; 8]; + for j in 0..8 { + let mut sum = 
h2_bias[j]; + for i in 0..8 { + sum += hidden1_out[i] * h2_weights[i][j]; + } + hidden2_out[j] = if sum > 0.0 { sum } else { 0.0 }; + } + + // Output layer: hidden2_out (8) x output_weights (8x2) + bias (2) -> output (2) + let out_weights = &self.output_weights[device_idx]; + let out_bias = &self.output_biases[device_idx]; + let mut output = [out_bias[0], out_bias[1]]; + for k in 0..2 { + for j in 0..8 { + output[k] += hidden2_out[j] * out_weights[j][k]; + } + } + + // Argmax: output[0] < output[1] means fast, otherwise slow + if output[0] < output[1] { + log::info!("LinnOSPlus: device {} predicted FAST. output=[{:.4},{:.4}]", device_idx, output[0], output[1]); + return Ok(self.members[device_idx].clone()); + } + + fail_cnt += 1; + if fail_cnt >= num_devices { + let fallback_idx = self.read_cursor.fetch_add(1, Ordering::Relaxed) % num_devices; + log::info!("LinnOSPlus: device {} fallback (all busy). output=[{:.4},{:.4}]", fallback_idx, output[0], output[1]); + return Ok(self.members[fallback_idx].clone()); + } + } + } +} From f0d3654b9c0806f9e725d157cd5671610e9561e3 Mon Sep 17 00:00:00 2001 From: Yingqi Cao Date: Mon, 13 Apr 2026 03:42:38 +0000 Subject: [PATCH 17/22] Decision Tree policy, and using kernel build parameters to select the policy to build into the kernel. 
--- Cargo.toml | 2 +- Makefile | 8 + .../raid/src/decision_tree_predictions.rs | 803 ++++++++++++++++++ kernel/comps/raid/src/lib.rs | 2 + kernel/comps/raid/src/selection_policies.rs | 115 ++- kernel/src/fs/mod.rs | 62 +- 6 files changed, 965 insertions(+), 27 deletions(-) create mode 100644 kernel/comps/raid/src/decision_tree_predictions.rs diff --git a/Cargo.toml b/Cargo.toml index 4750a0f90..a547c2c4e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -54,7 +54,7 @@ exclude = [ function_casts_as_integer = "allow" mismatched_lifetime_syntaxes = "allow" missing_crate_level_docs = "warn" -unexpected_cfgs = { level = "deny", check-cfg = ['cfg(baseline_asterinas)', 'cfg(ktest)'] } +unexpected_cfgs = { level = "deny", check-cfg = ['cfg(baseline_asterinas)', 'cfg(ktest)', 'cfg(capture_data)', 'cfg(raid_selection, values("roundrobin", "linnos", "linnos_plus", "decision_tree"))'] } unpredictable-function-pointer-comparisons = "allow" unsafe_op_in_unsafe_fn = "deny" unused_parens = "allow" diff --git a/Makefile b/Makefile index 1c8b23828..e439b4e6a 100644 --- a/Makefile +++ b/Makefile @@ -168,6 +168,14 @@ RUSTFLAGS += --cfg=baseline_asterinas CLIPPY_COMMON_ARGS += --cfg=baseline_asterinas -A unused-imports -A dead-code -A unfulfilled-lint-expectations endif +ifeq ($(CAPTURE_DATA), 1) +RUSTFLAGS += --cfg=capture_data +endif + +ifdef RAID_SELECTION +RUSTFLAGS += --cfg=raid_selection="$(RAID_SELECTION)" +endif + # To test the linux-efi-handover64 boot protocol, we need to use Debian's # GRUB release, which is installed in /usr/bin in our Docker image. ifeq ($(BOOT_PROTOCOL), linux-efi-handover64) diff --git a/kernel/comps/raid/src/decision_tree_predictions.rs b/kernel/comps/raid/src/decision_tree_predictions.rs new file mode 100644 index 000000000..0846d795c --- /dev/null +++ b/kernel/comps/raid/src/decision_tree_predictions.rs @@ -0,0 +1,803 @@ +// SPDX-License-Identifier: MPL-2.0 + +// Per-device decision tree prediction functions. 
+// +// Each function is generated by export_dt.py and pasted here: +// +// python export_dt.py \ +// --model results/.pkl \ +// --format rust \ +// --fn_name predict_device \ +// --out /tmp/dt_device.rs +// +// Input: &[u8; 31] — one byte per feature digit (0–9), same layout as the LinnOS +// feature vector: +// x[0..3] current outstanding pages (3 digits: hundreds, tens, ones) +// x[3..10] history step 0 (3 outstanding digits + 4 latency-us digits) +// x[10..17] history step 1 +// x[17..24] history step 2 +// x[24..31] history step 3 +// +// Returns: 0 (slow) or 1 (fast) +// +// ── DEVICE 0 ───────────────────────────────────────────────────────────────── +// PASTE the output of `export_dt.py --fn_name predict_device0` below, +// replacing this placeholder function. + +/// Predict fast (1) or slow (0) for device 0. +#[inline] +pub fn predict_device0(x: &[u8; 31]) -> u8 { + if x[30] <= 1 { // cur_out_2 <= 1.5000 + if x[29] <= 0 { // cur_out_1 <= 0.5000 + if x[27] <= 1 { // prev1_out_2 <= 1.5000 + if x[26] <= 0 { // prev1_out_1 <= 0.5000 + if x[18] <= 5 { // prev4_out_2 <= 5.5000 + if x[6] <= 0 { // prev3_lat_2 <= 0.5000 + 1 // fast (counts: [0, 0]) + } else { + 1 // fast (counts: [0, 0]) + } + } else { + if x[1] <= 0 { // prev4_lat_1 <= 0.5000 + 1 // fast (counts: [0, 0]) + } else { + 1 // fast (counts: [0, 0]) + } + } + } else { + if x[21] <= 2 { // prev3_out_2 <= 2.5000 + if x[3] <= 8 { // prev4_lat_3 <= 8.5000 + 1 // fast (counts: [0, 0]) + } else { + 1 // fast (counts: [0, 0]) + } + } else { + if x[17] <= 0 { // prev4_out_1 <= 0.5000 + 1 // fast (counts: [0, 0]) + } else { + 1 // fast (counts: [0, 0]) + } + } + } + } else { + if x[0] <= 3 { // prev4_lat_0 <= 3.5000 + if x[8] <= 0 { // prev2_lat_0 <= 0.5000 + if x[13] <= 1 { // prev1_lat_1 <= 1.5000 + 1 // fast (counts: [0, 0]) + } else { + 1 // fast (counts: [0, 0]) + } + } else { + if x[21] <= 1 { // prev3_out_2 <= 1.5000 + 1 // fast (counts: [0, 0]) + } else { + 1 // fast (counts: [0, 0]) + } + } + } else { 
+ if x[8] <= 3 { // prev2_lat_0 <= 3.5000 + if x[18] <= 0 { // prev4_out_2 <= 0.5000 + 1 // fast (counts: [0, 0]) + } else { + 1 // fast (counts: [0, 0]) + } + } else { + if x[4] <= 3 { // prev3_lat_0 <= 3.5000 + 1 // fast (counts: [0, 1]) + } else { + 0 // slow (counts: [0, 0]) + } + } + } + } + } else { + if x[8] <= 0 { // prev2_lat_0 <= 0.5000 + if x[21] <= 1 { // prev3_out_2 <= 1.5000 + if x[20] <= 0 { // prev3_out_1 <= 0.5000 + if x[5] <= 3 { // prev3_lat_1 <= 3.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } else { + if x[7] <= 5 { // prev3_lat_3 <= 5.5000 + 0 // slow (counts: [1, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } + } else { + if x[24] <= 1 { // prev2_out_2 <= 1.5000 + if x[9] <= 4 { // prev2_lat_1 <= 4.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } else { + if x[18] <= 1 { // prev4_out_2 <= 1.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } + } + } else { + if x[12] <= 0 { // prev1_lat_0 <= 0.5000 + if x[27] <= 1 { // prev1_out_2 <= 1.5000 + if x[4] <= 0 { // prev3_lat_0 <= 0.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } else { + if x[15] <= 0 { // prev1_lat_3 <= 0.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } + } else { + if x[24] <= 2 { // prev2_out_2 <= 2.5000 + if x[1] <= 1 { // prev4_lat_1 <= 1.5000 + 0 // slow (counts: [0, 0]) + } else { + 1 // fast (counts: [0, 0]) + } + } else { + if x[15] <= 7 { // prev1_lat_3 <= 7.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } + } + } + } + } else { + if x[30] <= 5 { // cur_out_2 <= 5.5000 + if x[30] <= 4 { // cur_out_2 <= 4.5000 + if x[29] <= 0 { // cur_out_1 <= 0.5000 + if x[30] <= 3 { // cur_out_2 <= 3.5000 + if x[8] <= 1 { // prev2_lat_0 <= 1.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } else { + if x[8] <= 0 { // prev2_lat_0 <= 0.5000 + 0 // slow 
(counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } + } else { + if x[24] <= 1 { // prev2_out_2 <= 1.5000 + if x[8] <= 0 { // prev2_lat_0 <= 0.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } else { + if x[12] <= 1 { // prev1_lat_0 <= 1.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } + } + } else { + if x[24] <= 1 { // prev2_out_2 <= 1.5000 + if x[23] <= 0 { // prev2_out_1 <= 0.5000 + if x[8] <= 1 { // prev2_lat_0 <= 1.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } else { + if x[12] <= 1 { // prev1_lat_0 <= 1.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } + } else { + if x[12] <= 1 { // prev1_lat_0 <= 1.5000 + if x[27] <= 2 { // prev1_out_2 <= 2.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } else { + if x[8] <= 0 { // prev2_lat_0 <= 0.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } + } + } + } else { + if x[8] <= 0 { // prev2_lat_0 <= 0.5000 + if x[21] <= 1 { // prev3_out_2 <= 1.5000 + if x[20] <= 0 { // prev3_out_1 <= 0.5000 + if x[13] <= 2 { // prev1_lat_1 <= 2.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } else { + if x[24] <= 3 { // prev2_out_2 <= 3.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } + } else { + if x[24] <= 1 { // prev2_out_2 <= 1.5000 + if x[30] <= 7 { // cur_out_2 <= 7.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } else { + if x[0] <= 1 { // prev4_lat_0 <= 1.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } + } + } else { + if x[12] <= 0 { // prev1_lat_0 <= 0.5000 + if x[27] <= 2 { // prev1_out_2 <= 2.5000 + if x[30] <= 6 { // cur_out_2 <= 6.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } else { + if x[13] <= 4 { // prev1_lat_1 <= 4.5000 + 0 // slow (counts: [0, 
0]) + } else { + 0 // slow (counts: [0, 0]) + } + } + } else { + if x[30] <= 6 { // cur_out_2 <= 6.5000 + if x[5] <= 0 { // prev3_lat_1 <= 0.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } else { + if x[30] <= 7 { // cur_out_2 <= 7.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } + } + } + } + } +} + +// ── DEVICE 1 ───────────────────────────────────────────────────────────────── +// PASTE the output of `export_dt.py --fn_name predict_device1` below, +// replacing this placeholder function. + +/// Predict fast (1) or slow (0) for device 1. +#[inline] +pub fn predict_device1(x: &[u8; 31]) -> u8 { + if x[30] <= 1 { // cur_out_2 <= 1.5000 + if x[29] <= 0 { // cur_out_1 <= 0.5000 + if x[0] <= 2 { // prev4_lat_0 <= 2.5000 + if x[27] <= 1 { // prev1_out_2 <= 1.5000 + if x[26] <= 0 { // prev1_out_1 <= 0.5000 + if x[12] <= 0 { // prev1_lat_0 <= 0.5000 + 1 // fast (counts: [0, 0]) + } else { + 1 // fast (counts: [0, 1]) + } + } else { + if x[9] <= 3 { // prev2_lat_1 <= 3.5000 + 1 // fast (counts: [0, 0]) + } else { + 1 // fast (counts: [0, 0]) + } + } + } else { + if x[8] <= 0 { // prev2_lat_0 <= 0.5000 + if x[9] <= 8 { // prev2_lat_1 <= 8.5000 + 1 // fast (counts: [0, 0]) + } else { + 1 // fast (counts: [0, 0]) + } + } else { + if x[21] <= 1 { // prev3_out_2 <= 1.5000 + 1 // fast (counts: [0, 0]) + } else { + 1 // fast (counts: [0, 0]) + } + } + } + } else { + if x[8] <= 2 { // prev2_lat_0 <= 2.5000 + if x[12] <= 0 { // prev1_lat_0 <= 0.5000 + if x[0] <= 4 { // prev4_lat_0 <= 4.5000 + 0 // slow (counts: [1, 0]) + } else { + 1 // fast (counts: [0, 1]) + } + } else { + if x[6] <= 0 { // prev3_lat_2 <= 0.5000 + 1 // fast (counts: [0, 0]) + } else { + 1 // fast (counts: [0, 0]) + } + } + } else { + if x[4] <= 1 { // prev3_lat_0 <= 1.5000 + 1 // fast (counts: [0, 1]) + } else { + if x[12] <= 2 { // prev1_lat_0 <= 2.5000 + 1 // fast (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } + } + } + } 
else { + if x[8] <= 0 { // prev2_lat_0 <= 0.5000 + if x[24] <= 1 { // prev2_out_2 <= 1.5000 + if x[21] <= 2 { // prev3_out_2 <= 2.5000 + if x[15] <= 0 { // prev1_lat_3 <= 0.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } else { + if x[3] <= 8 { // prev4_lat_3 <= 8.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } + } else { + if x[21] <= 1 { // prev3_out_2 <= 1.5000 + if x[20] <= 0 { // prev3_out_1 <= 0.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [1, 0]) + } + } else { + if x[18] <= 2 { // prev4_out_2 <= 2.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } + } + } else { + if x[12] <= 0 { // prev1_lat_0 <= 0.5000 + if x[27] <= 5 { // prev1_out_2 <= 5.5000 + if x[4] <= 0 { // prev3_lat_0 <= 0.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } else { + if x[13] <= 3 { // prev1_lat_1 <= 3.5000 + 1 // fast (counts: [0, 1]) + } else { + 0 // slow (counts: [0, 0]) + } + } + } else { + if x[11] <= 4 { // prev2_lat_3 <= 4.5000 + if x[26] <= 0 { // prev1_out_1 <= 0.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } else { + if x[9] <= 8 { // prev2_lat_1 <= 8.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } + } + } + } + } else { + if x[30] <= 5 { // cur_out_2 <= 5.5000 + if x[30] <= 4 { // cur_out_2 <= 4.5000 + if x[29] <= 0 { // cur_out_1 <= 0.5000 + if x[30] <= 3 { // cur_out_2 <= 3.5000 + if x[8] <= 0 { // prev2_lat_0 <= 0.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } else { + if x[8] <= 0 { // prev2_lat_0 <= 0.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } + } else { + if x[8] <= 0 { // prev2_lat_0 <= 0.5000 + if x[21] <= 1 { // prev3_out_2 <= 1.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } else { + if x[12] <= 0 { // prev1_lat_0 <= 0.5000 + 0 // slow 
(counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } + } + } else { + if x[8] <= 0 { // prev2_lat_0 <= 0.5000 + if x[21] <= 1 { // prev3_out_2 <= 1.5000 + if x[24] <= 1 { // prev2_out_2 <= 1.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } else { + if x[24] <= 3 { // prev2_out_2 <= 3.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } + } else { + if x[12] <= 0 { // prev1_lat_0 <= 0.5000 + if x[27] <= 2 { // prev1_out_2 <= 2.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } else { + if x[13] <= 8 { // prev1_lat_1 <= 8.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } + } + } + } else { + if x[8] <= 0 { // prev2_lat_0 <= 0.5000 + if x[21] <= 1 { // prev3_out_2 <= 1.5000 + if x[20] <= 0 { // prev3_out_1 <= 0.5000 + if x[12] <= 0 { // prev1_lat_0 <= 0.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } else { + if x[5] <= 4 { // prev3_lat_1 <= 4.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } + } else { + if x[24] <= 1 { // prev2_out_2 <= 1.5000 + if x[30] <= 6 { // cur_out_2 <= 6.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } else { + if x[0] <= 0 { // prev4_lat_0 <= 0.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } + } + } else { + if x[12] <= 0 { // prev1_lat_0 <= 0.5000 + if x[27] <= 3 { // prev1_out_2 <= 3.5000 + if x[4] <= 0 { // prev3_lat_0 <= 0.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } else { + if x[13] <= 4 { // prev1_lat_1 <= 4.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } + } else { + if x[30] <= 6 { // cur_out_2 <= 6.5000 + if x[12] <= 2 { // prev1_lat_0 <= 2.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } else { + if x[30] <= 7 { // cur_out_2 <= 7.5000 + 0 // slow (counts: [0, 0]) 
+ } else { + 0 // slow (counts: [0, 0]) + } + } + } + } + } + } +} + +// ── DEVICE 2 ───────────────────────────────────────────────────────────────── +// PASTE the output of `export_dt.py --fn_name predict_device2` below, +// replacing this placeholder function. + +/// Predict fast (1) or slow (0) for device 2. +#[inline] +pub fn predict_device2(x: &[u8; 31]) -> u8 { + if x[30] <= 1 { // cur_out_2 <= 1.5000 + if x[29] <= 0 { // cur_out_1 <= 0.5000 + if x[4] <= 2 { // prev3_lat_0 <= 2.5000 + if x[27] <= 1 { // prev1_out_2 <= 1.5000 + if x[26] <= 0 { // prev1_out_1 <= 0.5000 + if x[12] <= 0 { // prev1_lat_0 <= 0.5000 + 1 // fast (counts: [0, 0]) + } else { + 1 // fast (counts: [0, 1]) + } + } else { + if x[12] <= 3 { // prev1_lat_0 <= 3.0000 + 1 // fast (counts: [0, 0]) + } else { + 1 // fast (counts: [0, 0]) + } + } + } else { + if x[8] <= 0 { // prev2_lat_0 <= 0.5000 + if x[1] <= 0 { // prev4_lat_1 <= 0.5000 + 1 // fast (counts: [0, 0]) + } else { + 1 // fast (counts: [0, 0]) + } + } else { + if x[13] <= 7 { // prev1_lat_1 <= 7.5000 + 1 // fast (counts: [0, 0]) + } else { + 1 // fast (counts: [0, 0]) + } + } + } + } else { + if x[0] <= 1 { // prev4_lat_0 <= 1.5000 + if x[24] <= 8 { // prev2_out_2 <= 8.5000 + if x[4] <= 3 { // prev3_lat_0 <= 3.5000 + 1 // fast (counts: [0, 0]) + } else { + 1 // fast (counts: [0, 0]) + } + } else { + if x[1] <= 3 { // prev4_lat_1 <= 3.5000 + 0 // slow (counts: [1, 0]) + } else { + 1 // fast (counts: [0, 1]) + } + } + } else { + if x[8] <= 1 { // prev2_lat_0 <= 1.5000 + if x[11] <= 2 { // prev2_lat_3 <= 2.5000 + 1 // fast (counts: [0, 0]) + } else { + 1 // fast (counts: [0, 0]) + } + } else { + if x[12] <= 2 { // prev1_lat_0 <= 2.5000 + 1 // fast (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } + } + } + } else { + if x[24] <= 1 { // prev2_out_2 <= 1.5000 + if x[23] <= 0 { // prev2_out_1 <= 0.5000 + if x[9] <= 4 { // prev2_lat_1 <= 4.5000 + if x[9] <= 2 { // prev2_lat_1 <= 2.5000 + 0 // slow (counts: [0, 0]) + } else 
{ + 0 // slow (counts: [0, 0]) + } + } else { + if x[6] <= 4 { // prev3_lat_2 <= 4.5000 + 0 // slow (counts: [1, 0]) + } else { + 1 // fast (counts: [0, 0]) + } + } + } else { + if x[12] <= 0 { // prev1_lat_0 <= 0.5000 + if x[2] <= 6 { // prev4_lat_2 <= 6.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } else { + if x[15] <= 8 { // prev1_lat_3 <= 8.5000 + 0 // slow (counts: [0, 0]) + } else { + 1 // fast (counts: [0, 0]) + } + } + } + } else { + if x[12] <= 0 { // prev1_lat_0 <= 0.5000 + if x[27] <= 1 { // prev1_out_2 <= 1.5000 + if x[5] <= 4 { // prev3_lat_1 <= 4.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } else { + if x[8] <= 3 { // prev2_lat_0 <= 3.0000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } + } else { + if x[8] <= 0 { // prev2_lat_0 <= 0.5000 + if x[4] <= 0 { // prev3_lat_0 <= 0.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } else { + if x[27] <= 6 { // prev1_out_2 <= 6.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } + } + } + } + } else { + if x[30] <= 5 { // cur_out_2 <= 5.5000 + if x[30] <= 4 { // cur_out_2 <= 4.5000 + if x[29] <= 0 { // cur_out_1 <= 0.5000 + if x[30] <= 3 { // cur_out_2 <= 3.5000 + if x[24] <= 2 { // prev2_out_2 <= 2.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } else { + if x[8] <= 0 { // prev2_lat_0 <= 0.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } + } else { + if x[8] <= 0 { // prev2_lat_0 <= 0.5000 + if x[21] <= 1 { // prev3_out_2 <= 1.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } else { + if x[12] <= 0 { // prev1_lat_0 <= 0.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } + } + } else { + if x[8] <= 0 { // prev2_lat_0 <= 0.5000 + if x[21] <= 1 { // prev3_out_2 <= 1.5000 + if x[20] <= 0 { // prev3_out_1 <= 0.5000 + 0 // slow 
(counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } else { + if x[24] <= 2 { // prev2_out_2 <= 2.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } + } else { + if x[12] <= 0 { // prev1_lat_0 <= 0.5000 + if x[27] <= 3 { // prev1_out_2 <= 3.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } else { + if x[29] <= 0 { // cur_out_1 <= 0.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } + } + } + } else { + if x[8] <= 0 { // prev2_lat_0 <= 0.5000 + if x[21] <= 1 { // prev3_out_2 <= 1.5000 + if x[20] <= 0 { // prev3_out_1 <= 0.5000 + if x[30] <= 7 { // cur_out_2 <= 7.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } else { + if x[24] <= 1 { // prev2_out_2 <= 1.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } + } else { + if x[24] <= 1 { // prev2_out_2 <= 1.5000 + if x[30] <= 6 { // cur_out_2 <= 6.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } else { + if x[18] <= 1 { // prev4_out_2 <= 1.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } + } + } else { + if x[12] <= 0 { // prev1_lat_0 <= 0.5000 + if x[27] <= 2 { // prev1_out_2 <= 2.5000 + if x[21] <= 1 { // prev3_out_2 <= 1.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } else { + if x[13] <= 4 { // prev1_lat_1 <= 4.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } + } else { + if x[30] <= 6 { // cur_out_2 <= 6.5000 + if x[14] <= 3 { // prev1_lat_2 <= 3.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } else { + if x[30] <= 7 { // cur_out_2 <= 7.5000 + 0 // slow (counts: [0, 0]) + } else { + 0 // slow (counts: [0, 0]) + } + } + } + } + } + } +} diff --git a/kernel/comps/raid/src/lib.rs b/kernel/comps/raid/src/lib.rs index 9511cb5b3..25e603dfc 100644 --- a/kernel/comps/raid/src/lib.rs +++ 
b/kernel/comps/raid/src/lib.rs @@ -25,6 +25,8 @@ pub mod linnos_weights; #[cfg(not(baseline_asterinas))] pub mod linnos_plus_weights; #[cfg(not(baseline_asterinas))] +pub mod decision_tree_predictions; +#[cfg(not(baseline_asterinas))] pub mod selection_policies; #[cfg(not(baseline_asterinas))] pub mod server_traits; diff --git a/kernel/comps/raid/src/selection_policies.rs b/kernel/comps/raid/src/selection_policies.rs index 0b07bfc76..264b7a43f 100644 --- a/kernel/comps/raid/src/selection_policies.rs +++ b/kernel/comps/raid/src/selection_policies.rs @@ -224,7 +224,7 @@ impl SelectionPolicy for LinnOSPolicy { // Argmax: output[0] < output[1] means fast, otherwise slow if output[0] < output[1] { - log::info!("Submitting to device {} predicted FAST. output=[{:.4},{:.4}]", device_idx, output[0], output[1]); + // log::info!("Submitting to device {} predicted FAST. output=[{:.4},{:.4}]", device_idx, output[0], output[1]); return Ok(self.members[device_idx].clone()); } @@ -232,7 +232,114 @@ impl SelectionPolicy for LinnOSPolicy { // All devices predicted slow -- fall back to round-robin if fail_cnt >= num_devices { let fallback_idx = self.read_cursor.fetch_add(1, Ordering::Relaxed) % num_devices; - log::info!("Submitting to device {} as all devices are busy. output=[{:.4},{:.4}]", fallback_idx, output[0], output[1]); + // log::info!("Submitting to device {} as all devices are busy. output=[{:.4},{:.4}]", fallback_idx, output[0], output[1]); + return Ok(self.members[fallback_idx].clone()); + } + } + } +} + +/// Decision tree selection policy. +/// +/// Uses a per-device binary decision tree trained on the same 31-element LinnOS +/// feature vector (3 digits current outstanding + 4 history steps × 7 digits). +/// The prediction functions are generated by `export_dt.py --format rust` and +/// live in `decision_tree_predictions`. Each function takes `&[u8; 31]` (one +/// digit per feature, 0–9) and returns 0 (slow) or 1 (fast). 
+/// +/// Looping and fallback logic mirrors LinnOS exactly. +#[orpc_server] +pub struct DecisionTreePolicy { + read_cursor: AtomicUsize, + members: Vec>, + observers: Vec>>, +} + +impl core::fmt::Debug for DecisionTreePolicy { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.debug_struct("DecisionTreePolicy") + .field("read_cursor", &self.read_cursor) + .field("members", &self.members) + .field( + "observers", + &format_args!("[{} observers]", self.observers.len()), + ) + .finish() + } +} + +impl DecisionTreePolicy { + pub fn new( + members: Vec>, + observers: Vec>>, + ) -> Result, Error> { + let server = Self::new_with(|orpc_internal, _| Self { + orpc_internal, + read_cursor: AtomicUsize::new(0), + members, + observers, + }); + Ok(server) + } +} + +impl SelectionPolicy for DecisionTreePolicy { + fn select_block_device(&self, submitted: &mut SubmittedBio) -> Result, Error> { + use crate::decision_tree_predictions::{predict_device0, predict_device1, predict_device2}; + + let num_devices = self.members.len(); + let mut fail_cnt = 0; + let num_pages = submitted.num_pages(); + + loop { + let idx = self.read_cursor.fetch_add(1, Ordering::Relaxed); + let device_idx = idx % num_devices; + let observer = self.observers[device_idx].lock(); + let completion_trace = observer + .weak_observe_recent(4) + .expect("Failed to observe completion trace"); + + // Build the 31-element input feature vector as u8 digits (0–9) + let mut input = [0u8; 31]; + + let current_outstanding = num_pages as usize + + self.members[device_idx].num_outstanding_pages() as usize; + input[0] = ((current_outstanding / 100) % 10) as u8; + input[1] = ((current_outstanding / 10) % 10) as u8; + input[2] = (current_outstanding % 10) as u8; + + for (i, trace_entry) in completion_trace.iter().enumerate().take(4) { + let Some(trace_entry) = trace_entry else { + continue; + }; + let outstanding = trace_entry.outstanding_pages as usize; + let latency_us = trace_entry.latency_us as 
usize; + let base = 3 + i * 7; + + input[base] = ((outstanding / 100) % 10) as u8; + input[base + 1] = ((outstanding / 10) % 10) as u8; + input[base + 2] = (outstanding % 10) as u8; + + input[base + 3] = ((latency_us / 1000) % 10) as u8; + input[base + 4] = ((latency_us / 100) % 10) as u8; + input[base + 5] = ((latency_us / 10) % 10) as u8; + input[base + 6] = (latency_us % 10) as u8; + } + + let prediction = match device_idx { + 0 => predict_device0(&input), + 1 => predict_device1(&input), + 2 => predict_device2(&input), + _ => 1, // unknown device: predict fast + }; + + if prediction == 1 { + return Ok(self.members[device_idx].clone()); + } + + fail_cnt += 1; + if fail_cnt >= num_devices { + let fallback_idx = self.read_cursor.fetch_add(1, Ordering::Relaxed) % num_devices; return Ok(self.members[fallback_idx].clone()); } } @@ -390,14 +497,14 @@ impl SelectionPolicy for LinnOSPlusPolicy { // Argmax: output[0] < output[1] means fast, otherwise slow if output[0] < output[1] { - log::info!("LinnOSPlus: device {} predicted FAST. output=[{:.4},{:.4}]", device_idx, output[0], output[1]); + // log::info!("LinnOSPlus: device {} predicted FAST. output=[{:.4},{:.4}]", device_idx, output[0], output[1]); return Ok(self.members[device_idx].clone()); } fail_cnt += 1; if fail_cnt >= num_devices { let fallback_idx = self.read_cursor.fetch_add(1, Ordering::Relaxed) % num_devices; - log::info!("LinnOSPlus: device {} fallback (all busy). output=[{:.4},{:.4}]", fallback_idx, output[0], output[1]); + // log::info!("LinnOSPlus: device {} fallback (all busy). 
output=[{:.4},{:.4}]", fallback_idx, output[0], output[1]); return Ok(self.members[fallback_idx].clone()); } } diff --git a/kernel/src/fs/mod.rs b/kernel/src/fs/mod.rs index 6f90778c7..188066b36 100644 --- a/kernel/src/fs/mod.rs +++ b/kernel/src/fs/mod.rs @@ -25,7 +25,7 @@ pub mod utils; use aster_block::BlockDevice; #[cfg(not(baseline_asterinas))] #[expect(unused_imports)] -use aster_raid::selection_policies::{Dummy0Policy, LinnOSPolicy, RoundRobinPolicy}; +use aster_raid::selection_policies::{DecisionTreePolicy, Dummy0Policy, LinnOSPolicy, LinnOSPlusPolicy, RoundRobinPolicy}; use aster_raid::{Raid1Device, Raid1DeviceError}; use aster_virtio::device::block::device::BlockDevice as VirtIoBlockDevice; @@ -177,32 +177,50 @@ fn setup_raid1_device(raid_device_name: &str) -> Result<()> { } } - // #[cfg(not(baseline_asterinas))] - // setup_data_capture(&members, RAID_MEMBER_NAMES); + #[cfg(all(not(baseline_asterinas), capture_data))] + setup_data_capture(&members, RAID_MEMBER_NAMES); #[cfg(not(baseline_asterinas))] info!("[raid] creating selection policy"); - // #[cfg(not(baseline_asterinas))] + + // Round Robin Policy + #[cfg(all(not(baseline_asterinas), raid_selection = "roundrobin"))] let selection_policy = RoundRobinPolicy::new(members.clone()).unwrap(); - #[cfg(not(baseline_asterinas))] - let observers = members - .iter() - .map(|dev| { - use aster_virtio::device::block::server_traits::BlockIOObservable; - use ostd::orpc::oqueue::{OQueueBase, ObservationQuery}; - let virtio_dev = dev - .downcast_ref::() - .expect("RAID member must be a VirtIoBlockDevice for LinnOS"); - ostd::sync::Mutex::new( - virtio_dev - .bio_completion_oqueue() - .attach_weak_observer(4, ObservationQuery::identity()) - .expect("Failed to attach weak observer to bio_completion_oqueue"), - ) - }) - .collect(); - #[cfg(not(baseline_asterinas))] + + // Shared weak observer setup for all observer-based policies (LinnOS, LinnOS Plus, Decision Tree) + #[cfg(all(not(baseline_asterinas), 
any(raid_selection = "linnos", raid_selection = "linnos_plus", raid_selection = "decision_tree")))] + let observers = { + use aster_virtio::device::block::server_traits::BlockIOObservable; + use ostd::orpc::oqueue::{OQueueBase, ObservationQuery}; + members + .iter() + .map(|dev| { + let virtio_dev = dev + .downcast_ref::() + .expect("RAID member must be a VirtIoBlockDevice"); + ostd::sync::Mutex::new( + virtio_dev + .bio_completion_oqueue() + .attach_weak_observer(4, ObservationQuery::identity()) + .expect("Failed to attach weak observer to bio_completion_oqueue"), + ) + }) + .collect() + }; + + // LinnOS Policy + #[cfg(all(not(baseline_asterinas), raid_selection = "linnos"))] let selection_policy = LinnOSPolicy::new(members.clone(), observers).unwrap(); + + // LinnOS Plus Policy + #[cfg(all(not(baseline_asterinas), raid_selection = "linnos_plus"))] + let selection_policy = LinnOSPlusPolicy::new(members.clone(), observers).unwrap(); + + // Decision Tree Policy + #[cfg(all(not(baseline_asterinas), raid_selection = "decision_tree"))] + let selection_policy = DecisionTreePolicy::new(members.clone(), observers).unwrap(); + + // Initialize and Register RAID-1 device #[cfg(not(baseline_asterinas))] let raid1device = Raid1Device::init(raid_device_name, members, selection_policy); #[cfg(baseline_asterinas)] From 2fec8671a0a0499ada2d7bf5a50086adc1577f36 Mon Sep 17 00:00:00 2001 From: Yingqi Cao Date: Wed, 15 Apr 2026 04:23:18 +0000 Subject: [PATCH 18/22] Created Heimdall Module (Not wired with Submission Policy Yet) --- kernel/comps/block/src/bio.rs | 37 +- kernel/comps/block/src/lib.rs | 7 +- .../raid/src/generate_heimdall_weights.py | 150 ++++++++ kernel/comps/raid/src/heimdall.rs | 320 ++++++++++++++++++ kernel/comps/raid/src/heimdall_weights.rs.j2 | 96 ++++++ kernel/comps/raid/src/lib.rs | 4 + .../comps/virtio/src/device/block/device.rs | 30 +- kernel/src/fs/mod.rs | 56 ++- 8 files changed, 673 insertions(+), 27 deletions(-) create mode 100644 
kernel/comps/raid/src/generate_heimdall_weights.py create mode 100644 kernel/comps/raid/src/heimdall.rs create mode 100644 kernel/comps/raid/src/heimdall_weights.rs.j2 diff --git a/kernel/comps/block/src/bio.rs b/kernel/comps/block/src/bio.rs index 04d3ecef7..72b5c7ad3 100644 --- a/kernel/comps/block/src/bio.rs +++ b/kernel/comps/block/src/bio.rs @@ -32,9 +32,13 @@ pub struct BlockDeviceCompletionStats { /// The latency of the I/O request in microseconds. pub latency_us: u64, /// The number of outstanding 4KB pages at completion time. - pub outstanding_pages: u64, + pub outstanding_pages: u32, + /// Length of the IO queue at the time the IO arrives, which is num_outstanding_request of a block device. + pub queue_len: u32, + /// Size of the IO request, which is num_pages of a bio request. + pub request_size_pages: u32, /// The index of the device that produced this stat. - pub device_index: u64, + pub device_index: u32, } /// The unit for block I/O. @@ -158,6 +162,8 @@ impl Bio { num_pages: None, #[cfg(not(baseline_asterinas))] outstanding_pages: None, + #[cfg(not(baseline_asterinas))] + outstanding_requests: None, }) { // Fail to submit, revert the status. let result = self.0.status.compare_exchange( @@ -341,13 +347,16 @@ pub struct SubmittedBio { submission_time_us: Option, #[cfg(not(baseline_asterinas))] - device_index: Option, + device_index: Option, + + #[cfg(not(baseline_asterinas))] + num_pages: Option, #[cfg(not(baseline_asterinas))] - num_pages: Option, + outstanding_pages: Option, #[cfg(not(baseline_asterinas))] - outstanding_pages: Option, + outstanding_requests: Option, } impl core::fmt::Debug for SubmittedBio { @@ -379,16 +388,16 @@ impl SubmittedBio { } /// an immutable version of the num_pages function. Panic if the num_pages field is not set yet. - pub fn get_num_pages(&self) -> u64 { + pub fn get_num_pages(&self) -> u32 { self.num_pages.expect("num_pages is not set yet") } /// Returns the number of 4KB pages covered by this bio's sector range. 
/// Note the field num_pages is only available when calling this function, but accessing it directly is not available. - pub fn num_pages(&mut self) -> u64 { + pub fn num_pages(&mut self) -> u32 { *self.num_pages.get_or_insert_with(|| { let sectors = self.bio_inner.sid_range().end.to_raw() - self.bio_inner.sid_range().start.to_raw(); - (sectors + 7) / 8 + ((sectors + 7) / 8) as u32 // each page has 8 sectors }) } @@ -434,8 +443,9 @@ impl SubmittedBio { pub fn prepare_enqueue( &mut self, reply_handle: RefProducer, - device_index: u64, - outstanding_pages: u64 + device_index: u32, + outstanding_pages: u32, + outstanding_requests: u32, ) { self.reply_handle = Some(reply_handle); @@ -443,6 +453,7 @@ impl SubmittedBio { self.device_index = Some(device_index); self.num_pages(); // set the num_pages field self.outstanding_pages = Some(outstanding_pages + self.num_pages.unwrap()); // accumulate the number of outstanding pages + self.outstanding_requests = Some(outstanding_requests); } #[cfg(not(baseline_asterinas))] @@ -453,8 +464,10 @@ impl SubmittedBio { .try_produce_ref(&BlockDeviceCompletionStats { latency_us: read_monotonic_time().as_micros() as u64 - self.submission_time_us.unwrap(), - outstanding_pages: self.outstanding_pages.unwrap_or(u64::MAX), - device_index: self.device_index.unwrap_or(u64::MAX), + outstanding_pages: self.outstanding_pages.unwrap_or(u32::MAX), + queue_len: self.outstanding_requests.unwrap_or(u32::MAX), + request_size_pages: self.num_pages.unwrap_or(u32::MAX), + device_index: self.device_index.unwrap_or(u32::MAX), }); } } diff --git a/kernel/comps/block/src/lib.rs b/kernel/comps/block/src/lib.rs index 9d85f674f..810f34fbc 100644 --- a/kernel/comps/block/src/lib.rs +++ b/kernel/comps/block/src/lib.rs @@ -60,7 +60,12 @@ pub trait BlockDevice: Send + Sync + Any + Debug { fn metadata(&self) -> BlockDeviceMeta; /// Returns the number of outstanding pages for this device. 
- fn num_outstanding_pages(&self) -> u64 { + fn num_outstanding_pages(&self) -> u32 { + 0 + } + + /// Returns the number of outstanding requests for this device. + fn num_outstanding_requests(&self) -> u32 { 0 } } diff --git a/kernel/comps/raid/src/generate_heimdall_weights.py b/kernel/comps/raid/src/generate_heimdall_weights.py new file mode 100644 index 000000000..5098bd4ad --- /dev/null +++ b/kernel/comps/raid/src/generate_heimdall_weights.py @@ -0,0 +1,150 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: MPL-2.0 + +""" +Load trained PyTorch Heimdall models and generate the Rust weights file +using the Jinja2 template. + +The HeimdallNet architecture has three linear layers (one model per device): + Linear(input_dim, 128) -> ReLU -> Linear(128, 16) -> ReLU -> Linear(16, 1) -> Sigmoid + +PyTorch state dict keys: + fc1.weight [128, input_dim] fc1.bias [128] + fc2.weight [16, 128] fc2.bias [16] + fc3.weight [1, 16] fc3.bias [1] + +Usage: + python generate_heimdall_weights.py \\ + --models models/heimdall_device0.pt \\ + models/heimdall_device1.pt \\ + models/heimdall_device2.pt \\ + --template kernel/comps/raid/src/heimdall_weights.rs.j2 \\ + --output kernel/comps/raid/src/heimdall_weights.rs + +Run from the repository root. 
+""" + +import argparse +from pathlib import Path + +import torch +from jinja2 import Environment, FileSystemLoader + + +def load_model(path: str) -> dict: + """Load a model checkpoint and return its state dict.""" + state = torch.load(path, map_location="cpu", weights_only=False) + return state + + +def print_architecture(state: dict, device_idx: int) -> None: + """Print model architecture for sanity check.""" + print(f" Device {device_idx}:") + for name, tensor in state.items(): + print(f" {name:20s} shape={str(list(tensor.shape)):16s} dtype={tensor.dtype}") + + +def tensor_to_list(tensor: torch.Tensor) -> list: + """Convert a tensor to a nested Python list of floats.""" + return tensor.tolist() + + +def main(): + parser = argparse.ArgumentParser( + description="Generate Heimdall Rust weight file from PyTorch models" + ) + parser.add_argument( + "--models", nargs="+", required=True, + help="Paths to .pt model files, one per device in order", + ) + parser.add_argument( + "--template", required=True, + help="Path to the Jinja2 template (.rs.j2)", + ) + parser.add_argument( + "--output", required=True, + help="Path for the generated Rust file (.rs)", + ) + args = parser.parse_args() + + # Load all models + models = [] + for path in args.models: + models.append(load_model(path)) + + num_devices = len(models) + + # Sanity check: print architecture + print(f"Loaded {num_devices} model(s).\n") + print("Model architecture:") + for i, state in enumerate(models): + print_architecture(state, i) + print() + + # Extract dimensions from the first model + # fc1: Linear(input_dim, hidden1_size) + # fc2: Linear(hidden1_size, hidden2_size) + # fc3: Linear(hidden2_size, 1) + input_dim = models[0]["fc1.weight"].shape[1] + hidden1_size = models[0]["fc1.weight"].shape[0] + hidden2_size = models[0]["fc2.weight"].shape[0] + output_size = models[0]["fc3.weight"].shape[0] + + assert output_size == 1, f"Expected output size 1 (sigmoid), got {output_size}" + + print(f"Network: {input_dim} 
-> {hidden1_size} (ReLU) -> {hidden2_size} (ReLU) -> {output_size} (Sigmoid)") + print() + + # Extract weights and biases for each device + # PyTorch stores weights as [out_features, in_features]. + # In Rust we index as weights[input][output], so we transpose. + fc1_weights = [] + fc1_biases = [] + fc2_weights = [] + fc2_biases = [] + fc3_weights = [] + fc3_biases = [] + + for i, state in enumerate(models): + # fc1: [hidden1_size, input_dim] -> [input_dim, hidden1_size] + w1 = state["fc1.weight"].T + fc1_weights.append(tensor_to_list(w1)) + fc1_biases.append(tensor_to_list(state["fc1.bias"])) + + # fc2: [hidden2_size, hidden1_size] -> [hidden1_size, hidden2_size] + w2 = state["fc2.weight"].T + fc2_weights.append(tensor_to_list(w2)) + fc2_biases.append(tensor_to_list(state["fc2.bias"])) + + # fc3: [1, hidden2_size] -> [hidden2_size] (squeeze since output is scalar) + w3 = state["fc3.weight"].squeeze(0) + fc3_weights.append(tensor_to_list(w3)) + fc3_biases.append(state["fc3.bias"].item()) + + # Render template + template_path = Path(args.template) + env = Environment( + loader=FileSystemLoader(str(template_path.parent)), + keep_trailing_newline=True, + ) + template = env.get_template(template_path.name) + + rendered = template.render( + num_devices=num_devices, + input_dim=input_dim, + hidden1_size=hidden1_size, + hidden2_size=hidden2_size, + fc1_weights=fc1_weights, + fc1_biases=fc1_biases, + fc2_weights=fc2_weights, + fc2_biases=fc2_biases, + fc3_weights=fc3_weights, + fc3_biases=fc3_biases, + ) + + Path(args.output).write_text(rendered) + print(f"Generated {args.output} ({len(rendered)} bytes)") + + +if __name__ == "__main__": + main() diff --git a/kernel/comps/raid/src/heimdall.rs b/kernel/comps/raid/src/heimdall.rs new file mode 100644 index 000000000..14f72a8fe --- /dev/null +++ b/kernel/comps/raid/src/heimdall.rs @@ -0,0 +1,320 @@ +// SPDX-License-Identifier: MPL-2.0 + +#![cfg(not(baseline_asterinas))] + +use alloc::{sync::Arc, vec::Vec}; +use 
core::sync::atomic::{AtomicBool, Ordering}; + +use aster_block::{ + BlockDevice, + bio::BlockDeviceCompletionStats, +}; +use ostd::{ + Error, + orpc::oqueue::{OQueueError, StrongObserver}, + sync::Mutex, +}; + +/// Heimdall: an asynchronous device performance monitor for RAID-1 arrays. +/// +/// Heimdall maintains one ML model and one strong observer per member device. +/// A dedicated background thread continuously drains completion stats from each +/// device's OQueue. Every `BATCH_SIZE` (16) completions, it runs an ML inference +/// to update that device's fast/slow indicator. +/// +/// Model architecture (per device): +/// Linear(INPUT_DIM, 128) -> ReLU -> Linear(128, 16) -> ReLU -> Linear(16, 1) -> Sigmoid +/// +/// Selection policies can query `is_device_fast(idx)` to incorporate Heimdall's +/// classification into their scheduling decisions. +pub struct Heimdall { + members: Vec>, + observers: Vec>>, + /// Per-device fast/slow indicator. `true` means the device is predicted fast. 
+ fast_indicators: Vec, + /// Per-device fc1 weights: [INPUT_DIM][HIDDEN1_SIZE] + fc1_weights: Vec<[[f32; HIDDEN1_SIZE]; INPUT_DIM]>, + /// Per-device fc1 biases: [HIDDEN1_SIZE] + fc1_biases: Vec<[f32; HIDDEN1_SIZE]>, + /// Per-device fc2 weights: [HIDDEN1_SIZE][HIDDEN2_SIZE] + fc2_weights: Vec<[[f32; HIDDEN2_SIZE]; HIDDEN1_SIZE]>, + /// Per-device fc2 biases: [HIDDEN2_SIZE] + fc2_biases: Vec<[f32; HIDDEN2_SIZE]>, + /// Per-device fc3 weights: [HIDDEN2_SIZE] + fc3_weights: Vec<[f32; HIDDEN2_SIZE]>, + /// Per-device fc3 biases: scalar + fc3_biases: Vec, +} + +impl core::fmt::Debug for Heimdall { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.debug_struct("Heimdall") + .field("members", &self.members) + .field( + "observers", + &format_args!("[{} observers]", self.observers.len()), + ) + .field("fast_indicators", &self.fast_indicators) + .finish() + } +} + +use crate::heimdall_weights::{HIDDEN1_SIZE, HIDDEN2_SIZE, INPUT_DIM}; + +/// Number of completion records to drain before running an inference. +const BATCH_SIZE: usize = 16; + +/// Inference timeout in milliseconds. If this duration elapses since the last +/// inference for a device, inference is triggered even if fewer than `BATCH_SIZE` +/// records have been observed. +const INFERENCE_TIMEOUT_MS: u64 = 5; + +impl Heimdall { + /// Creates a new Heimdall monitor. + /// + /// `members` — the RAID-1 member devices to monitor. + /// `observers` — one strong observer per member, attached to its bio completion OQueue. 
+ pub fn new( + members: Vec>, + observers: Vec>, + ) -> Result, Error> { + use crate::heimdall_weights::{ + FC1_BIASES, FC1_WEIGHTS, FC2_BIASES, FC2_WEIGHTS, FC3_BIASES, FC3_WEIGHTS, + }; + + let num_devices = members.len(); + + let fast_indicators: Vec = (0..num_devices) + .map(|_| AtomicBool::new(true)) // optimistic: assume fast initially + .collect(); + + let observers: Vec>> = observers + .into_iter() + .map(Mutex::new) + .collect(); + + let fc1_weights: Vec<_> = (0..num_devices).map(|i| *FC1_WEIGHTS[i]).collect(); + let fc1_biases: Vec<_> = (0..num_devices).map(|i| *FC1_BIASES[i]).collect(); + let fc2_weights: Vec<_> = (0..num_devices).map(|i| *FC2_WEIGHTS[i]).collect(); + let fc2_biases: Vec<_> = (0..num_devices).map(|i| *FC2_BIASES[i]).collect(); + let fc3_weights: Vec<_> = (0..num_devices).map(|i| *FC3_WEIGHTS[i]).collect(); + let fc3_biases: Vec<_> = (0..num_devices).map(|i| FC3_BIASES[i]).collect(); + + log::info!( + "Heimdall created with {} devices", + fast_indicators.len() + ); + + Ok(Arc::new(Self { + members, + observers, + fast_indicators, + fc1_weights, + fc1_biases, + fc2_weights, + fc2_biases, + fc3_weights, + fc3_biases, + })) + } + + /// Returns whether device `idx` is currently classified as fast. + pub fn is_device_fast(&self, idx: usize) -> bool { + self.fast_indicators[idx].load(Ordering::Relaxed) + } + + /// The number of member devices being monitored. + pub fn num_devices(&self) -> usize { + self.members.len() + } + + /// Main monitoring loop. This should be spawned on a dedicated thread. + /// + /// For each device, drains completion records from its strong observer. + /// Inference is triggered for a device when either condition is met first: + /// 1. `BATCH_SIZE` (16) records have been drained, or + /// 2. `INFERENCE_TIMEOUT_MS` (5 ms) have elapsed since the last inference. + pub fn run(&self) { + use ostd::timer::Jiffies; + + let num_devices = self.members.len(); + // TIMER_FREQ is 1000 Hz, so 1 jiffy = 1 ms. 5 ms = 5 jiffies. 
+ let timeout_jiffies = INFERENCE_TIMEOUT_MS * ostd::arch::timer::TIMER_FREQ / 1000; + + // Per-device batch buffers for accumulating stats between inferences. + let mut batch_buffers: Vec> = (0..num_devices) + .map(|_| Vec::with_capacity(BATCH_SIZE)) + .collect(); + + // Per-device jiffies timestamp of the last inference (or loop start). + let now = Jiffies::elapsed().as_u64(); + let mut last_inference_jiffies = alloc::vec![now; num_devices]; + + loop { + for device_idx in 0..num_devices { + let observer = self.observers[device_idx].lock(); + + // Drain all available records (non-blocking). + loop { + match observer.try_strong_observe() { + Ok(Some(stats)) => { + batch_buffers[device_idx].push(stats); + + // Condition 1: batch is full. + // Do device inference, then break to give other devices a turn. + if batch_buffers[device_idx].len() >= BATCH_SIZE { + self.run_inference(device_idx, &mut batch_buffers[device_idx]); + last_inference_jiffies[device_idx] = Jiffies::elapsed().as_u64(); + break; + } + } + Ok(None) => break, // queue is empty, move on to timeout check + Err(OQueueError::Detached { .. }) => { + log::warn!( + "Heimdall: observer for device {} detached", + device_idx + ); + break; + } + Err(e) => { + log::warn!( + "Heimdall: error observing device {}: {:?}", + device_idx, e + ); + break; + } + } + } + + drop(observer); + + // Condition 2: timeout elapsed and there is at least some data + // (or even no data — we still re-evaluate so the device can + // transition back to fast when IO pressure drops). + let elapsed = Jiffies::elapsed().as_u64().wrapping_sub(last_inference_jiffies[device_idx]); + if elapsed >= timeout_jiffies && !batch_buffers[device_idx].is_empty() { + self.run_inference(device_idx, &mut batch_buffers[device_idx]); + last_inference_jiffies[device_idx] = Jiffies::elapsed().as_u64(); + } + } + + // Yield to avoid busy-spinning when all queues are empty. 
+ ostd::task::Task::yield_now(); + } + } + + /// Run inference for a single device and update its fast indicator. + fn run_inference( + &self, + device_idx: usize, + batch: &mut Vec, + ) { + // Model output: 1 → slow (reject IO), 0 → fast (accept IO). + let is_slow = self.infer_device_speed(device_idx, batch); + self.fast_indicators[device_idx].store(!is_slow, Ordering::Relaxed); + log::info!( + "Heimdall: labeling device {} to {} (by {} records)", + device_idx, + if is_slow { "slow" } else { "fast" }, + batch.len() + ); + batch.clear(); + } + + /// Build the 11-element input feature vector from a batch of completion stats. + /// + /// Features (matching HeimdallNet training input): + /// [0] queue_len_now — queue_len of the most recent record + /// [1] size_now — request_size_pages of the most recent record + /// [2] hist_que_len_t-1 — queue_len of the 2nd-most-recent record + /// [3] hist_que_len_t-2 — queue_len of the 3rd-most-recent record + /// [4] hist_que_len_t-3 — queue_len of the 4th-most-recent record + /// [5] hist_lat_t-1 — latency_us of the 2nd-most-recent record + /// [6] hist_lat_t-2 — latency_us of the 3rd-most-recent record + /// [7] hist_lat_t-3 — latency_us of the 4th-most-recent record + /// [8] hist_thpt_t-1 — request_size_pages / latency_us (2nd-most-recent) + /// [9] hist_thpt_t-2 — request_size_pages / latency_us (3rd-most-recent) + /// [10] hist_thpt_t-3 — request_size_pages / latency_us (4th-most-recent) + fn build_features(&self, batch: &[BlockDeviceCompletionStats]) -> [f32; INPUT_DIM] { + let mut input = [0.0f32; INPUT_DIM]; + let n = batch.len(); + if n == 0 { + return input; + } + + // Most recent record: "now" features. + let now = &batch[n - 1]; + input[0] = now.queue_len as f32; + input[1] = now.request_size_pages as f32; + + // Historical records: t-1 = batch[n-2], t-2 = batch[n-3], t-3 = batch[n-4]. + // Missing history (batch too small) stays 0.0. 
+ for hist in 0..3usize { + let idx = n.wrapping_sub(hist + 2); + if idx < n { + let rec = &batch[idx]; + input[2 + hist] = rec.queue_len as f32; + input[5 + hist] = rec.latency_us as f32; + input[8 + hist] = if rec.latency_us > 0 { + rec.request_size_pages as f32 / rec.latency_us as f32 + } else { + 0.0 + }; + } + } + + input + } + + /// Run the HeimdallNet forward pass for `device_idx` on the given batch. + /// + /// Each device has its own model weights. + /// Architecture: + /// Linear(INPUT_DIM, 128) -> ReLU -> Linear(128, 16) -> ReLU -> Linear(16, 1) -> Sigmoid + /// + /// Returns `true` if the device is predicted fast (sigmoid output >= 0.5). + fn infer_device_speed( + &self, + device_idx: usize, + batch: &[BlockDeviceCompletionStats], + ) -> bool { + let input = self.build_features(batch); + + // fc1: input (INPUT_DIM) x fc1_weights (INPUT_DIM x HIDDEN1_SIZE) + bias -> ReLU + let w1 = &self.fc1_weights[device_idx]; + let b1 = &self.fc1_biases[device_idx]; + let mut h1 = [0.0f32; HIDDEN1_SIZE]; + for j in 0..HIDDEN1_SIZE { + let mut sum = b1[j]; + for i in 0..INPUT_DIM { + sum += input[i] * w1[i][j]; + } + h1[j] = if sum > 0.0 { sum } else { 0.0 }; // ReLU + } + + // fc2: h1 (HIDDEN1_SIZE) x fc2_weights (HIDDEN1_SIZE x HIDDEN2_SIZE) + bias -> ReLU + let w2 = &self.fc2_weights[device_idx]; + let b2 = &self.fc2_biases[device_idx]; + let mut h2 = [0.0f32; HIDDEN2_SIZE]; + for j in 0..HIDDEN2_SIZE { + let mut sum = b2[j]; + for i in 0..HIDDEN1_SIZE { + sum += h1[i] * w2[i][j]; + } + h2[j] = if sum > 0.0 { sum } else { 0.0 }; // ReLU + } + + // fc3: h2 (HIDDEN2_SIZE) x fc3_weights (HIDDEN2_SIZE) + bias -> Sigmoid + let w3 = &self.fc3_weights[device_idx]; + let b3 = self.fc3_biases[device_idx]; + let mut logit = b3; + for j in 0..HIDDEN2_SIZE { + logit += h2[j] * w3[j]; + } + + // Sigmoid: 1 / (1 + exp(-x)). Equivalent to: logit >= 0. + // We skip the actual sigmoid computation since we only need the + // threshold comparison at 0.5. 
+ logit >= 0.0 + } +} diff --git a/kernel/comps/raid/src/heimdall_weights.rs.j2 b/kernel/comps/raid/src/heimdall_weights.rs.j2 new file mode 100644 index 000000000..51f13aea3 --- /dev/null +++ b/kernel/comps/raid/src/heimdall_weights.rs.j2 @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: MPL-2.0 + +// Heimdall neural network weights hardcoded for {{ num_devices }} devices. +// Each device has: +// - fc1: {{ input_dim }} x {{ hidden1_size }} matrix + {{ hidden1_size }} bias (ReLU) +// - fc2: {{ hidden1_size }} x {{ hidden2_size }} matrix + {{ hidden2_size }} bias (ReLU) +// - fc3: {{ hidden2_size }} x 1 matrix + 1 bias (Sigmoid) +// +// AUTO-GENERATED by generate_heimdall_weights.py using Jinja2. +// Do not edit this file manually. + +/// Number of devices with hardcoded weights. +pub const NUM_DEVICES: usize = {{ num_devices }}; + +/// Input dimension. +pub const INPUT_DIM: usize = {{ input_dim }}; + +/// First hidden layer size. +pub const HIDDEN1_SIZE: usize = {{ hidden1_size }}; + +/// Second hidden layer size. 
+pub const HIDDEN2_SIZE: usize = {{ hidden2_size }}; + +{% for dev in range(num_devices) %} +/// fc1 weights for device {{ dev }}: {{ input_dim }} inputs -> {{ hidden1_size }} neurons +pub static FC1_WEIGHTS_{{ dev }}: [[f32; {{ hidden1_size }}]; {{ input_dim }}] = [ +{% for row in fc1_weights[dev] %} + [{{ row | join(', ') }}], +{% endfor %} +]; + +/// fc1 bias for device {{ dev }} +pub static FC1_BIAS_{{ dev }}: [f32; {{ hidden1_size }}] = [{{ fc1_biases[dev] | join(', ') }}]; + +{% endfor %} +{% for dev in range(num_devices) %} +/// fc2 weights for device {{ dev }}: {{ hidden1_size }} -> {{ hidden2_size }} neurons +pub static FC2_WEIGHTS_{{ dev }}: [[f32; {{ hidden2_size }}]; {{ hidden1_size }}] = [ +{% for row in fc2_weights[dev] %} + [{{ row | join(', ') }}], +{% endfor %} +]; + +/// fc2 bias for device {{ dev }} +pub static FC2_BIAS_{{ dev }}: [f32; {{ hidden2_size }}] = [{{ fc2_biases[dev] | join(', ') }}]; + +{% endfor %} +{% for dev in range(num_devices) %} +/// fc3 weights for device {{ dev }}: {{ hidden2_size }} -> 1 output +pub static FC3_WEIGHTS_{{ dev }}: [f32; {{ hidden2_size }}] = [{{ fc3_weights[dev] | join(', ') }}]; + +/// fc3 bias for device {{ dev }} +pub static FC3_BIAS_{{ dev }}: f32 = {{ fc3_biases[dev] }}; + +{% endfor %} +/// All fc1 weights indexed by device. +pub static FC1_WEIGHTS: [&[[f32; {{ hidden1_size }}]; {{ input_dim }}]; NUM_DEVICES] = [ +{% for dev in range(num_devices) %} + &FC1_WEIGHTS_{{ dev }}, +{% endfor %} +]; + +/// All fc1 biases indexed by device. +pub static FC1_BIASES: [&[f32; {{ hidden1_size }}]; NUM_DEVICES] = [ +{% for dev in range(num_devices) %} + &FC1_BIAS_{{ dev }}, +{% endfor %} +]; + +/// All fc2 weights indexed by device. +pub static FC2_WEIGHTS: [&[[f32; {{ hidden2_size }}]; {{ hidden1_size }}]; NUM_DEVICES] = [ +{% for dev in range(num_devices) %} + &FC2_WEIGHTS_{{ dev }}, +{% endfor %} +]; + +/// All fc2 biases indexed by device. 
+pub static FC2_BIASES: [&[f32; {{ hidden2_size }}]; NUM_DEVICES] = [ +{% for dev in range(num_devices) %} + &FC2_BIAS_{{ dev }}, +{% endfor %} +]; + +/// All fc3 weights indexed by device. +pub static FC3_WEIGHTS: [&[f32; {{ hidden2_size }}]; NUM_DEVICES] = [ +{% for dev in range(num_devices) %} + &FC3_WEIGHTS_{{ dev }}, +{% endfor %} +]; + +/// All fc3 biases indexed by device. +pub static FC3_BIASES: [f32; NUM_DEVICES] = [ +{% for dev in range(num_devices) %} + FC3_BIAS_{{ dev }}, +{% endfor %} +]; diff --git a/kernel/comps/raid/src/lib.rs b/kernel/comps/raid/src/lib.rs index 25e603dfc..dbb56ac4c 100644 --- a/kernel/comps/raid/src/lib.rs +++ b/kernel/comps/raid/src/lib.rs @@ -27,6 +27,10 @@ pub mod linnos_plus_weights; #[cfg(not(baseline_asterinas))] pub mod decision_tree_predictions; #[cfg(not(baseline_asterinas))] +pub mod heimdall; +#[cfg(not(baseline_asterinas))] +pub mod heimdall_weights; +#[cfg(not(baseline_asterinas))] pub mod selection_policies; #[cfg(not(baseline_asterinas))] pub mod server_traits; diff --git a/kernel/comps/virtio/src/device/block/device.rs b/kernel/comps/virtio/src/device/block/device.rs index d420034ce..fb9ed8721 100644 --- a/kernel/comps/virtio/src/device/block/device.rs +++ b/kernel/comps/virtio/src/device/block/device.rs @@ -12,7 +12,7 @@ use core::{ fmt::Debug, hint::spin_loop, mem::size_of, - sync::atomic::{AtomicU64, Ordering}, + sync::atomic::{AtomicU32, AtomicU64, Ordering}, }; use aster_block::{ @@ -158,7 +158,7 @@ impl BlockDevice { } /// Sets the logical index for this device, used to tag I/O completion stats. 
- pub fn set_device_index(&self, index: u64) { + pub fn set_device_index(&self, index: u32) { self.device.device_index.store(index, Ordering::Relaxed); } } @@ -184,8 +184,9 @@ impl aster_block::BlockDevice for BlockDevice { let mut bio = bio; let device_index = self.device.device_index.load(Ordering::Relaxed); - bio.prepare_enqueue(reply_handle, device_index, self.device.num_outstanding_pages.load(Ordering::Relaxed)); + bio.prepare_enqueue(reply_handle, device_index as u32, self.device.num_outstanding_pages.load(Ordering::Relaxed) as u32, self.device.num_outstanding_requests.load(Ordering::Relaxed) as u32); self.device.inc_page_counter(bio.num_pages()); + self.device.inc_request_counter(); // log::info!("\x1b[32mIncremented\x1b[0m Page Counter by {}, new value: {}, device_index: {}, type: {:?}", bio.num_pages(), self.device.num_outstanding_pages.load(Ordering::Relaxed), device_index, bio.type_()); let producer = self.bio_submission_oqueue().attach_value_producer()?; producer.produce(bio); @@ -199,9 +200,13 @@ impl aster_block::BlockDevice for BlockDevice { } } - fn num_outstanding_pages(&self) -> u64 { + fn num_outstanding_pages(&self) -> u32 { self.device.num_outstanding_pages.load(Ordering::Relaxed) } + + fn num_outstanding_requests(&self) -> u32 { + self.device.num_outstanding_requests.load(Ordering::Relaxed) + } } #[derive(Debug)] @@ -214,8 +219,9 @@ struct DeviceInner { block_responses: DmaStream, id_allocator: SpinLock, submitted_requests: SpinLock>, - device_index: AtomicU64, - num_outstanding_pages: AtomicU64 + device_index: AtomicU32, + num_outstanding_pages: AtomicU32, + num_outstanding_requests: AtomicU32, } impl DeviceInner { @@ -264,8 +270,9 @@ impl DeviceInner { block_responses, id_allocator: SpinLock::new(IdAlloc::with_capacity(Self::QUEUE_SIZE as usize)), submitted_requests: SpinLock::new(BTreeMap::new()), - device_index: AtomicU64::new(u64::MAX), - num_outstanding_pages: AtomicU64::new(0) + num_outstanding_pages: AtomicU32::new(0), + 
num_outstanding_requests: AtomicU32::new(0), + device_index: AtomicU32::new(u32::MAX-1), }); let cloned_device = device.clone(); @@ -341,6 +348,7 @@ impl DeviceInner { { let pages = bio.get_num_pages(); let outstanding = self.num_outstanding_pages.fetch_sub(pages, Ordering::Relaxed); + self.num_outstanding_requests.fetch_sub(1, Ordering::Relaxed); // log::info!("\x1b[31mDecremented\x1b[0m Page Counter by {}, new value: {}, device_index: {}, type: {:?}", pages, outstanding, self.device_index.load(Ordering::Relaxed), req_type); bio.report_statistics(); } @@ -597,9 +605,13 @@ impl DeviceInner { } } - fn inc_page_counter(&self, n_pages: u64) { + fn inc_page_counter(&self, n_pages: u32) { self.num_outstanding_pages.fetch_add(n_pages, Ordering::Relaxed); } + + fn inc_request_counter(&self) { + self.num_outstanding_requests.fetch_add(1, Ordering::Relaxed); + } } /// A submitted bio request for callback. diff --git a/kernel/src/fs/mod.rs b/kernel/src/fs/mod.rs index 188066b36..fc570a04f 100644 --- a/kernel/src/fs/mod.rs +++ b/kernel/src/fs/mod.rs @@ -163,7 +163,7 @@ fn setup_raid1_device(raid_device_name: &str) -> Result<()> { Ok(device) => { info!("[raid] member '{}' online", name); if let Some(virtio_dev) = device.downcast_ref::() { - virtio_dev.set_device_index(index as u64); + virtio_dev.set_device_index((index) as u32); } members.push(device); } @@ -180,12 +180,12 @@ fn setup_raid1_device(raid_device_name: &str) -> Result<()> { #[cfg(all(not(baseline_asterinas), capture_data))] setup_data_capture(&members, RAID_MEMBER_NAMES); + // Clone members for Heimdall before they are consumed by the selection policy / RAID init. 
#[cfg(not(baseline_asterinas))] - info!("[raid] creating selection policy"); + let members_for_heimdall = members.clone(); - // Round Robin Policy - #[cfg(all(not(baseline_asterinas), raid_selection = "roundrobin"))] - let selection_policy = RoundRobinPolicy::new(members.clone()).unwrap(); + #[cfg(not(baseline_asterinas))] + info!("[raid] creating selection policy"); // Shared weak observer setup for all observer-based policies (LinnOS, LinnOS Plus, Decision Tree) #[cfg(all(not(baseline_asterinas), any(raid_selection = "linnos", raid_selection = "linnos_plus", raid_selection = "decision_tree")))] @@ -220,6 +220,10 @@ fn setup_raid1_device(raid_device_name: &str) -> Result<()> { #[cfg(all(not(baseline_asterinas), raid_selection = "decision_tree"))] let selection_policy = DecisionTreePolicy::new(members.clone(), observers).unwrap(); + // Round Robin Policy (explicit or default when no raid_selection is specified) + #[cfg(all(not(baseline_asterinas), any(raid_selection = "roundrobin", not(any(raid_selection = "linnos", raid_selection = "linnos_plus", raid_selection = "decision_tree")))))] + let selection_policy = RoundRobinPolicy::new(members.clone()).unwrap(); + // Initialize and Register RAID-1 device #[cfg(not(baseline_asterinas))] let raid1device = Raid1Device::init(raid_device_name, members, selection_policy); @@ -232,6 +236,48 @@ fn setup_raid1_device(raid_device_name: &str) -> Result<()> { })?; info!("[raid] RAID-1 device created"); + // Initialize Heimdall device performance monitor + #[cfg(not(baseline_asterinas))] + { + use aster_virtio::device::block::server_traits::BlockIOObservable; + use ostd::orpc::oqueue::{OQueueBase, ObservationQuery}; + + let heimdall_observers: Vec<_> = members_for_heimdall + .iter() + .map(|dev| { + let virtio_dev = dev + .downcast_ref::() + .expect("RAID member must be a VirtIoBlockDevice"); + virtio_dev + .bio_completion_oqueue() + .attach_strong_observer(ObservationQuery::identity()) + .expect("Failed to attach strong observer 
for Heimdall") + }) + .collect(); + + let heimdall = aster_raid::heimdall::Heimdall::new( + members_for_heimdall, + heimdall_observers, + ) + .expect("Failed to create Heimdall monitor"); + + let heimdall_task = move || { + info!("[heimdall] Heimdall monitor thread started"); + heimdall.run(); + }; + + crate::ThreadOptions::new(heimdall_task) + .sched_policy(crate::sched::SchedPolicy::RealTime { + rt_prio: 50.try_into().unwrap(), + rt_policy: crate::sched::RealTimePolicy::RoundRobin { + base_slice_factor: None, + }, + }) + .spawn(); + + info!("[heimdall] Heimdall monitor initialized and thread spawned"); + } + let worker = aster_block::get_device(raid_device_name).unwrap(); // The registry stores `Arc`. Use `downcast_ref` on the captured Arc each // iteration to call the RAID-specific helper without needing ownership of `Raid1Device`. From 07190f6f42f7d36a6ab22a7a6173990923f9ebc7 Mon Sep 17 00:00:00 2001 From: Yingqi Cao Date: Wed, 15 Apr 2026 05:17:08 +0000 Subject: [PATCH 19/22] Changed Admission and Submission Policy's initialization order --- kernel/src/fs/mod.rs | 86 ++++++++++++++++++++++---------------------- 1 file changed, 44 insertions(+), 42 deletions(-) diff --git a/kernel/src/fs/mod.rs b/kernel/src/fs/mod.rs index fc570a04f..52fb6e8a6 100644 --- a/kernel/src/fs/mod.rs +++ b/kernel/src/fs/mod.rs @@ -184,6 +184,50 @@ fn setup_raid1_device(raid_device_name: &str) -> Result<()> { #[cfg(not(baseline_asterinas))] let members_for_heimdall = members.clone(); + // Initialize Heimdall device performance monitor + #[cfg(not(baseline_asterinas))] + { + use aster_virtio::device::block::server_traits::BlockIOObservable; + use ostd::orpc::oqueue::{OQueueBase, ObservationQuery}; + + let heimdall_observers: Vec<_> = members_for_heimdall + .iter() + .map(|dev| { + let virtio_dev = dev + .downcast_ref::() + .expect("RAID member must be a VirtIoBlockDevice"); + virtio_dev + .bio_completion_oqueue() + .attach_strong_observer(ObservationQuery::identity()) + .expect("Failed 
to attach strong observer for Heimdall") + }) + .collect(); + + let heimdall = aster_raid::heimdall::Heimdall::new( + members_for_heimdall, + heimdall_observers, + ) + .expect("Failed to create Heimdall monitor"); + + let heimdall_task = move || { + info!("[heimdall] Heimdall monitor thread started"); + heimdall.run(); + }; + + crate::ThreadOptions::new(heimdall_task) + .sched_policy(crate::sched::SchedPolicy::RealTime { + rt_prio: 50.try_into().unwrap(), + rt_policy: crate::sched::RealTimePolicy::RoundRobin { + base_slice_factor: None, + }, + }) + .spawn(); + + info!("[heimdall] Heimdall monitor initialized and thread spawned"); + } + + + #[cfg(not(baseline_asterinas))] info!("[raid] creating selection policy"); @@ -236,48 +280,6 @@ fn setup_raid1_device(raid_device_name: &str) -> Result<()> { })?; info!("[raid] RAID-1 device created"); - // Initialize Heimdall device performance monitor - #[cfg(not(baseline_asterinas))] - { - use aster_virtio::device::block::server_traits::BlockIOObservable; - use ostd::orpc::oqueue::{OQueueBase, ObservationQuery}; - - let heimdall_observers: Vec<_> = members_for_heimdall - .iter() - .map(|dev| { - let virtio_dev = dev - .downcast_ref::() - .expect("RAID member must be a VirtIoBlockDevice"); - virtio_dev - .bio_completion_oqueue() - .attach_strong_observer(ObservationQuery::identity()) - .expect("Failed to attach strong observer for Heimdall") - }) - .collect(); - - let heimdall = aster_raid::heimdall::Heimdall::new( - members_for_heimdall, - heimdall_observers, - ) - .expect("Failed to create Heimdall monitor"); - - let heimdall_task = move || { - info!("[heimdall] Heimdall monitor thread started"); - heimdall.run(); - }; - - crate::ThreadOptions::new(heimdall_task) - .sched_policy(crate::sched::SchedPolicy::RealTime { - rt_prio: 50.try_into().unwrap(), - rt_policy: crate::sched::RealTimePolicy::RoundRobin { - base_slice_factor: None, - }, - }) - .spawn(); - - info!("[heimdall] Heimdall monitor initialized and thread spawned"); 
- } - let worker = aster_block::get_device(raid_device_name).unwrap(); // The registry stores `Arc`. Use `downcast_ref` on the captured Arc each // iteration to call the RAID-specific helper without needing ownership of `Raid1Device`. From d65fb13664a8f165d2ef6fbb5c97fa4147d1aa97 Mon Sep 17 00:00:00 2001 From: Yingqi Cao Date: Wed, 15 Apr 2026 05:17:25 +0000 Subject: [PATCH 20/22] Tuned Heimdall's parameter and added extra functionalities. --- kernel/comps/raid/src/heimdall.rs | 35 ++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/kernel/comps/raid/src/heimdall.rs b/kernel/comps/raid/src/heimdall.rs index 14f72a8fe..422c0bf72 100644 --- a/kernel/comps/raid/src/heimdall.rs +++ b/kernel/comps/raid/src/heimdall.rs @@ -62,12 +62,12 @@ impl core::fmt::Debug for Heimdall { use crate::heimdall_weights::{HIDDEN1_SIZE, HIDDEN2_SIZE, INPUT_DIM}; /// Number of completion records to drain before running an inference. -const BATCH_SIZE: usize = 16; +const BATCH_SIZE: usize = 8; /// Inference timeout in milliseconds. If this duration elapses since the last /// inference for a device, inference is triggered even if fewer than `BATCH_SIZE` /// records have been observed. -const INFERENCE_TIMEOUT_MS: u64 = 5; +const INFERENCE_TIMEOUT_MS: u64 = 28; impl Heimdall { /// Creates a new Heimdall monitor. @@ -123,6 +123,13 @@ impl Heimdall { self.fast_indicators[idx].load(Ordering::Relaxed) } + /// Returns the fast indicator for device `idx`. + /// + /// `true` means the device is currently predicted fast; `false` means slow. + pub fn check_device(&self, idx: usize) -> bool { + self.fast_indicators[idx].load(Ordering::Relaxed) + } + /// The number of member devices being monitored. pub fn num_devices(&self) -> usize { self.members.len() @@ -163,6 +170,11 @@ impl Heimdall { // Condition 1: batch is full. // Do device inference, then break to give other devices a turn. 
if batch_buffers[device_idx].len() >= BATCH_SIZE { + // log::info!( + // "Heimdall: triggered by batch for device {} ({} records)", + // device_idx, + // batch_buffers[device_idx].len() + // ); self.run_inference(device_idx, &mut batch_buffers[device_idx]); last_inference_jiffies[device_idx] = Jiffies::elapsed().as_u64(); break; @@ -193,6 +205,11 @@ impl Heimdall { // transition back to fast when IO pressure drops). let elapsed = Jiffies::elapsed().as_u64().wrapping_sub(last_inference_jiffies[device_idx]); if elapsed >= timeout_jiffies && !batch_buffers[device_idx].is_empty() { + // log::info!( + // "Heimdall: triggered by timeout for device {} ({} ms since last inference)", + // device_idx, + // elapsed * 1000 / ostd::arch::timer::TIMER_FREQ + // ); self.run_inference(device_idx, &mut batch_buffers[device_idx]); last_inference_jiffies[device_idx] = Jiffies::elapsed().as_u64(); } @@ -212,12 +229,12 @@ impl Heimdall { // Model output: 1 → slow (reject IO), 0 → fast (accept IO). let is_slow = self.infer_device_speed(device_idx, batch); self.fast_indicators[device_idx].store(!is_slow, Ordering::Relaxed); - log::info!( - "Heimdall: labeling device {} to {} (by {} records)", - device_idx, - if is_slow { "slow" } else { "fast" }, - batch.len() - ); + // log::info!( + // "Heimdall: labeling device {} to {} (by {} records)", + // device_idx, + // if is_slow { "slow" } else { "fast" }, + // batch.len() + // ); batch.clear(); } @@ -317,4 +334,6 @@ impl Heimdall { // threshold comparison at 0.5. 
logit >= 0.0 } + + } From ab22d7758e3643aba56c5b669080139d14a10bbc Mon Sep 17 00:00:00 2001 From: Yingqi Cao Date: Wed, 15 Apr 2026 18:09:11 +0000 Subject: [PATCH 21/22] Heimdall Round Robin Policy --- Cargo.toml | 2 +- .../raid/src/generate_heimdall_weights.py | 33 ++++------- kernel/comps/raid/src/heimdall.rs | 56 ++++++------------- kernel/comps/raid/src/heimdall_weights.rs.j2 | 54 ++++-------------- kernel/comps/raid/src/selection_policies.rs | 52 +++++++++++++++++ kernel/src/fs/mod.rs | 18 ++++-- 6 files changed, 105 insertions(+), 110 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index a547c2c4e..80dd2731e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -54,7 +54,7 @@ exclude = [ function_casts_as_integer = "allow" mismatched_lifetime_syntaxes = "allow" missing_crate_level_docs = "warn" -unexpected_cfgs = { level = "deny", check-cfg = ['cfg(baseline_asterinas)', 'cfg(ktest)', 'cfg(capture_data)', 'cfg(raid_selection, values("roundrobin", "linnos", "linnos_plus", "decision_tree"))'] } +unexpected_cfgs = { level = "deny", check-cfg = ['cfg(baseline_asterinas)', 'cfg(ktest)', 'cfg(capture_data)', 'cfg(raid_selection, values("roundrobin", "linnos", "linnos_plus", "decision_tree", "heimdall"))'] } unpredictable-function-pointer-comparisons = "allow" unsafe_op_in_unsafe_fn = "deny" unused_parens = "allow" diff --git a/kernel/comps/raid/src/generate_heimdall_weights.py b/kernel/comps/raid/src/generate_heimdall_weights.py index 5098bd4ad..857c1a241 100644 --- a/kernel/comps/raid/src/generate_heimdall_weights.py +++ b/kernel/comps/raid/src/generate_heimdall_weights.py @@ -5,12 +5,11 @@ Load trained PyTorch Heimdall models and generate the Rust weights file using the Jinja2 template. 
-The HeimdallNet architecture has three linear layers (one model per device): - Linear(input_dim, 128) -> ReLU -> Linear(128, 16) -> ReLU -> Linear(16, 1) -> Sigmoid +The HeimdallNet architecture has two linear layers (one model per device): + Linear(input_dim, 16) -> ReLU -> Linear(16, 1) -> Sigmoid PyTorch state dict keys: - fc1.weight [128, input_dim] fc1.bias [128] - fc2.weight [16, 128] fc2.bias [16] + fc1.weight [16, input_dim] fc1.bias [16] fc3.weight [1, 16] fc3.bias [1] Usage: @@ -82,17 +81,15 @@ def main(): print() # Extract dimensions from the first model - # fc1: Linear(input_dim, hidden1_size) - # fc2: Linear(hidden1_size, hidden2_size) - # fc3: Linear(hidden2_size, 1) + # fc1: Linear(input_dim, hidden_size) + # fc3: Linear(hidden_size, 1) input_dim = models[0]["fc1.weight"].shape[1] - hidden1_size = models[0]["fc1.weight"].shape[0] - hidden2_size = models[0]["fc2.weight"].shape[0] + hidden_size = models[0]["fc1.weight"].shape[0] output_size = models[0]["fc3.weight"].shape[0] assert output_size == 1, f"Expected output size 1 (sigmoid), got {output_size}" - print(f"Network: {input_dim} -> {hidden1_size} (ReLU) -> {hidden2_size} (ReLU) -> {output_size} (Sigmoid)") + print(f"Network: {input_dim} -> {hidden_size} (ReLU) -> {output_size} (Sigmoid)") print() # Extract weights and biases for each device @@ -100,23 +97,16 @@ def main(): # In Rust we index as weights[input][output], so we transpose. 
fc1_weights = [] fc1_biases = [] - fc2_weights = [] - fc2_biases = [] fc3_weights = [] fc3_biases = [] for i, state in enumerate(models): - # fc1: [hidden1_size, input_dim] -> [input_dim, hidden1_size] + # fc1: [hidden_size, input_dim] -> [input_dim, hidden_size] w1 = state["fc1.weight"].T fc1_weights.append(tensor_to_list(w1)) fc1_biases.append(tensor_to_list(state["fc1.bias"])) - # fc2: [hidden2_size, hidden1_size] -> [hidden1_size, hidden2_size] - w2 = state["fc2.weight"].T - fc2_weights.append(tensor_to_list(w2)) - fc2_biases.append(tensor_to_list(state["fc2.bias"])) - - # fc3: [1, hidden2_size] -> [hidden2_size] (squeeze since output is scalar) + # fc3: [1, hidden_size] -> [hidden_size] (squeeze since output is scalar) w3 = state["fc3.weight"].squeeze(0) fc3_weights.append(tensor_to_list(w3)) fc3_biases.append(state["fc3.bias"].item()) @@ -132,12 +122,9 @@ def main(): rendered = template.render( num_devices=num_devices, input_dim=input_dim, - hidden1_size=hidden1_size, - hidden2_size=hidden2_size, + hidden_size=hidden_size, fc1_weights=fc1_weights, fc1_biases=fc1_biases, - fc2_weights=fc2_weights, - fc2_biases=fc2_biases, fc3_weights=fc3_weights, fc3_biases=fc3_biases, ) diff --git a/kernel/comps/raid/src/heimdall.rs b/kernel/comps/raid/src/heimdall.rs index 422c0bf72..2096b440d 100644 --- a/kernel/comps/raid/src/heimdall.rs +++ b/kernel/comps/raid/src/heimdall.rs @@ -23,7 +23,7 @@ use ostd::{ /// to update that device's fast/slow indicator. /// /// Model architecture (per device): -/// Linear(INPUT_DIM, 128) -> ReLU -> Linear(128, 16) -> ReLU -> Linear(16, 1) -> Sigmoid +/// Linear(INPUT_DIM, 16) -> ReLU -> Linear(16, 1) -> Sigmoid /// /// Selection policies can query `is_device_fast(idx)` to incorporate Heimdall's /// classification into their scheduling decisions. @@ -32,16 +32,12 @@ pub struct Heimdall { observers: Vec>>, /// Per-device fast/slow indicator. `true` means the device is predicted fast. 
fast_indicators: Vec, - /// Per-device fc1 weights: [INPUT_DIM][HIDDEN1_SIZE] - fc1_weights: Vec<[[f32; HIDDEN1_SIZE]; INPUT_DIM]>, - /// Per-device fc1 biases: [HIDDEN1_SIZE] - fc1_biases: Vec<[f32; HIDDEN1_SIZE]>, - /// Per-device fc2 weights: [HIDDEN1_SIZE][HIDDEN2_SIZE] - fc2_weights: Vec<[[f32; HIDDEN2_SIZE]; HIDDEN1_SIZE]>, - /// Per-device fc2 biases: [HIDDEN2_SIZE] - fc2_biases: Vec<[f32; HIDDEN2_SIZE]>, - /// Per-device fc3 weights: [HIDDEN2_SIZE] - fc3_weights: Vec<[f32; HIDDEN2_SIZE]>, + /// Per-device fc1 weights: [INPUT_DIM][HIDDEN_SIZE] + fc1_weights: Vec<[[f32; HIDDEN_SIZE]; INPUT_DIM]>, + /// Per-device fc1 biases: [HIDDEN_SIZE] + fc1_biases: Vec<[f32; HIDDEN_SIZE]>, + /// Per-device fc3 weights: [HIDDEN_SIZE] + fc3_weights: Vec<[f32; HIDDEN_SIZE]>, /// Per-device fc3 biases: scalar fc3_biases: Vec, } @@ -59,10 +55,10 @@ impl core::fmt::Debug for Heimdall { } } -use crate::heimdall_weights::{HIDDEN1_SIZE, HIDDEN2_SIZE, INPUT_DIM}; +use crate::heimdall_weights::{HIDDEN_SIZE, INPUT_DIM}; /// Number of completion records to drain before running an inference. -const BATCH_SIZE: usize = 8; +const BATCH_SIZE: usize = 6; /// Inference timeout in milliseconds. 
If this duration elapses since the last /// inference for a device, inference is triggered even if fewer than `BATCH_SIZE` @@ -79,7 +75,7 @@ impl Heimdall { observers: Vec>, ) -> Result, Error> { use crate::heimdall_weights::{ - FC1_BIASES, FC1_WEIGHTS, FC2_BIASES, FC2_WEIGHTS, FC3_BIASES, FC3_WEIGHTS, + FC1_BIASES, FC1_WEIGHTS, FC3_BIASES, FC3_WEIGHTS, }; let num_devices = members.len(); @@ -95,8 +91,6 @@ impl Heimdall { let fc1_weights: Vec<_> = (0..num_devices).map(|i| *FC1_WEIGHTS[i]).collect(); let fc1_biases: Vec<_> = (0..num_devices).map(|i| *FC1_BIASES[i]).collect(); - let fc2_weights: Vec<_> = (0..num_devices).map(|i| *FC2_WEIGHTS[i]).collect(); - let fc2_biases: Vec<_> = (0..num_devices).map(|i| *FC2_BIASES[i]).collect(); let fc3_weights: Vec<_> = (0..num_devices).map(|i| *FC3_WEIGHTS[i]).collect(); let fc3_biases: Vec<_> = (0..num_devices).map(|i| FC3_BIASES[i]).collect(); @@ -111,8 +105,6 @@ impl Heimdall { fast_indicators, fc1_weights, fc1_biases, - fc2_weights, - fc2_biases, fc3_weights, fc3_biases, })) @@ -287,9 +279,9 @@ impl Heimdall { /// /// Each device has its own model weights. /// Architecture: - /// Linear(INPUT_DIM, 128) -> ReLU -> Linear(128, 16) -> ReLU -> Linear(16, 1) -> Sigmoid + /// Linear(INPUT_DIM, 16) -> ReLU -> Linear(16, 1) -> Sigmoid /// - /// Returns `true` if the device is predicted fast (sigmoid output >= 0.5). + /// Returns `true` if the model output >= 0.5 (sigmoid(logit) >= 0.5 ⟺ logit >= 0). 
fn infer_device_speed( &self, device_idx: usize, @@ -297,11 +289,11 @@ impl Heimdall { ) -> bool { let input = self.build_features(batch); - // fc1: input (INPUT_DIM) x fc1_weights (INPUT_DIM x HIDDEN1_SIZE) + bias -> ReLU + // fc1: input (INPUT_DIM) x fc1_weights (INPUT_DIM x HIDDEN_SIZE) + bias -> ReLU let w1 = &self.fc1_weights[device_idx]; let b1 = &self.fc1_biases[device_idx]; - let mut h1 = [0.0f32; HIDDEN1_SIZE]; - for j in 0..HIDDEN1_SIZE { + let mut h1 = [0.0f32; HIDDEN_SIZE]; + for j in 0..HIDDEN_SIZE { let mut sum = b1[j]; for i in 0..INPUT_DIM { sum += input[i] * w1[i][j]; @@ -309,24 +301,12 @@ impl Heimdall { h1[j] = if sum > 0.0 { sum } else { 0.0 }; // ReLU } - // fc2: h1 (HIDDEN1_SIZE) x fc2_weights (HIDDEN1_SIZE x HIDDEN2_SIZE) + bias -> ReLU - let w2 = &self.fc2_weights[device_idx]; - let b2 = &self.fc2_biases[device_idx]; - let mut h2 = [0.0f32; HIDDEN2_SIZE]; - for j in 0..HIDDEN2_SIZE { - let mut sum = b2[j]; - for i in 0..HIDDEN1_SIZE { - sum += h1[i] * w2[i][j]; - } - h2[j] = if sum > 0.0 { sum } else { 0.0 }; // ReLU - } - - // fc3: h2 (HIDDEN2_SIZE) x fc3_weights (HIDDEN2_SIZE) + bias -> Sigmoid + // fc3: h1 (HIDDEN_SIZE) x fc3_weights (HIDDEN_SIZE) + bias -> Sigmoid let w3 = &self.fc3_weights[device_idx]; let b3 = self.fc3_biases[device_idx]; let mut logit = b3; - for j in 0..HIDDEN2_SIZE { - logit += h2[j] * w3[j]; + for j in 0..HIDDEN_SIZE { + logit += h1[j] * w3[j]; } // Sigmoid: 1 / (1 + exp(-x)). Equivalent to: logit >= 0. diff --git a/kernel/comps/raid/src/heimdall_weights.rs.j2 b/kernel/comps/raid/src/heimdall_weights.rs.j2 index 51f13aea3..80114bf9b 100644 --- a/kernel/comps/raid/src/heimdall_weights.rs.j2 +++ b/kernel/comps/raid/src/heimdall_weights.rs.j2 @@ -2,9 +2,8 @@ // Heimdall neural network weights hardcoded for {{ num_devices }} devices. 
// Each device has: -// - fc1: {{ input_dim }} x {{ hidden1_size }} matrix + {{ hidden1_size }} bias (ReLU) -// - fc2: {{ hidden1_size }} x {{ hidden2_size }} matrix + {{ hidden2_size }} bias (ReLU) -// - fc3: {{ hidden2_size }} x 1 matrix + 1 bias (Sigmoid) +// - fc1: {{ input_dim }} x {{ hidden_size }} matrix + {{ hidden_size }} bias (ReLU) +// - fc3: {{ hidden_size }} x 1 matrix + 1 bias (Sigmoid) // // AUTO-GENERATED by generate_heimdall_weights.py using Jinja2. // Do not edit this file manually. @@ -15,74 +14,45 @@ pub const NUM_DEVICES: usize = {{ num_devices }}; /// Input dimension. pub const INPUT_DIM: usize = {{ input_dim }}; -/// First hidden layer size. -pub const HIDDEN1_SIZE: usize = {{ hidden1_size }}; - -/// Second hidden layer size. -pub const HIDDEN2_SIZE: usize = {{ hidden2_size }}; +/// Hidden layer size. +pub const HIDDEN_SIZE: usize = {{ hidden_size }}; {% for dev in range(num_devices) %} -/// fc1 weights for device {{ dev }}: {{ input_dim }} inputs -> {{ hidden1_size }} neurons -pub static FC1_WEIGHTS_{{ dev }}: [[f32; {{ hidden1_size }}]; {{ input_dim }}] = [ +/// fc1 weights for device {{ dev }}: {{ input_dim }} inputs -> {{ hidden_size }} neurons +pub static FC1_WEIGHTS_{{ dev }}: [[f32; {{ hidden_size }}]; {{ input_dim }}] = [ {% for row in fc1_weights[dev] %} [{{ row | join(', ') }}], {% endfor %} ]; /// fc1 bias for device {{ dev }} -pub static FC1_BIAS_{{ dev }}: [f32; {{ hidden1_size }}] = [{{ fc1_biases[dev] | join(', ') }}]; - -{% endfor %} -{% for dev in range(num_devices) %} -/// fc2 weights for device {{ dev }}: {{ hidden1_size }} -> {{ hidden2_size }} neurons -pub static FC2_WEIGHTS_{{ dev }}: [[f32; {{ hidden2_size }}]; {{ hidden1_size }}] = [ -{% for row in fc2_weights[dev] %} - [{{ row | join(', ') }}], -{% endfor %} -]; - -/// fc2 bias for device {{ dev }} -pub static FC2_BIAS_{{ dev }}: [f32; {{ hidden2_size }}] = [{{ fc2_biases[dev] | join(', ') }}]; +pub static FC1_BIAS_{{ dev }}: [f32; {{ hidden_size }}] = [{{ 
fc1_biases[dev] | join(', ') }}]; {% endfor %} {% for dev in range(num_devices) %} -/// fc3 weights for device {{ dev }}: {{ hidden2_size }} -> 1 output -pub static FC3_WEIGHTS_{{ dev }}: [f32; {{ hidden2_size }}] = [{{ fc3_weights[dev] | join(', ') }}]; +/// fc3 weights for device {{ dev }}: {{ hidden_size }} -> 1 output +pub static FC3_WEIGHTS_{{ dev }}: [f32; {{ hidden_size }}] = [{{ fc3_weights[dev] | join(', ') }}]; /// fc3 bias for device {{ dev }} pub static FC3_BIAS_{{ dev }}: f32 = {{ fc3_biases[dev] }}; {% endfor %} /// All fc1 weights indexed by device. -pub static FC1_WEIGHTS: [&[[f32; {{ hidden1_size }}]; {{ input_dim }}]; NUM_DEVICES] = [ +pub static FC1_WEIGHTS: [&[[f32; {{ hidden_size }}]; {{ input_dim }}]; NUM_DEVICES] = [ {% for dev in range(num_devices) %} &FC1_WEIGHTS_{{ dev }}, {% endfor %} ]; /// All fc1 biases indexed by device. -pub static FC1_BIASES: [&[f32; {{ hidden1_size }}]; NUM_DEVICES] = [ +pub static FC1_BIASES: [&[f32; {{ hidden_size }}]; NUM_DEVICES] = [ {% for dev in range(num_devices) %} &FC1_BIAS_{{ dev }}, {% endfor %} ]; -/// All fc2 weights indexed by device. -pub static FC2_WEIGHTS: [&[[f32; {{ hidden2_size }}]; {{ hidden1_size }}]; NUM_DEVICES] = [ -{% for dev in range(num_devices) %} - &FC2_WEIGHTS_{{ dev }}, -{% endfor %} -]; - -/// All fc2 biases indexed by device. -pub static FC2_BIASES: [&[f32; {{ hidden2_size }}]; NUM_DEVICES] = [ -{% for dev in range(num_devices) %} - &FC2_BIAS_{{ dev }}, -{% endfor %} -]; - /// All fc3 weights indexed by device. 
-pub static FC3_WEIGHTS: [&[f32; {{ hidden2_size }}]; NUM_DEVICES] = [ +pub static FC3_WEIGHTS: [&[f32; {{ hidden_size }}]; NUM_DEVICES] = [ {% for dev in range(num_devices) %} &FC3_WEIGHTS_{{ dev }}, {% endfor %} diff --git a/kernel/comps/raid/src/selection_policies.rs b/kernel/comps/raid/src/selection_policies.rs index 264b7a43f..f23a46722 100644 --- a/kernel/comps/raid/src/selection_policies.rs +++ b/kernel/comps/raid/src/selection_policies.rs @@ -15,6 +15,7 @@ use ostd::{ sync::Mutex, }; +use crate::heimdall::Heimdall; use crate::server_traits::SelectionPolicy; #[derive(Debug)] @@ -346,6 +347,57 @@ impl SelectionPolicy for DecisionTreePolicy { } } +/// Heimdall-guided round-robin selection policy. +/// +/// Uses the Heimdall asynchronous monitor to skip devices predicted slow. +/// If all devices are slow, falls back to plain round-robin so IO is never +/// stalled. The Heimdall monitor must be spawned on a separate thread via +/// `Heimdall::run()` before any IO arrives. +#[derive(Debug)] +#[orpc_server] +pub struct HeimdallRoundRobinPolicy { + read_cursor: AtomicUsize, + members: Vec>, + heimdall: Arc, +} + +impl HeimdallRoundRobinPolicy { + pub fn new( + members: Vec>, + heimdall: Arc, + ) -> Result, Error> { + let server = Self::new_with(|orpc_internal, _| Self { + orpc_internal, + read_cursor: AtomicUsize::new(0), + members, + heimdall, + }); + Ok(server) + } +} + +impl SelectionPolicy for HeimdallRoundRobinPolicy { + fn select_block_device( + &self, + _submitted: &mut SubmittedBio, + ) -> Result, Error> { + let num_devices = self.members.len(); + let start_idx = self.read_cursor.fetch_add(1, Ordering::Relaxed); + + // Try each device once, starting from the round-robin cursor. + for offset in 0..num_devices { + let device_idx = (start_idx + offset) % num_devices; + if self.heimdall.is_device_fast(device_idx) { + return Ok(self.members[device_idx].clone()); + } + } + + // All devices are slow — fall back to round-robin. 
+ let fallback_idx = start_idx % num_devices; + Ok(self.members[fallback_idx].clone()) + } +} + /// LinnOSPlus: a deeper variant of the LinnOS neural-network selection policy. /// /// Architecture (per device): diff --git a/kernel/src/fs/mod.rs b/kernel/src/fs/mod.rs index 52fb6e8a6..45a2e0491 100644 --- a/kernel/src/fs/mod.rs +++ b/kernel/src/fs/mod.rs @@ -25,7 +25,7 @@ pub mod utils; use aster_block::BlockDevice; #[cfg(not(baseline_asterinas))] #[expect(unused_imports)] -use aster_raid::selection_policies::{DecisionTreePolicy, Dummy0Policy, LinnOSPolicy, LinnOSPlusPolicy, RoundRobinPolicy}; +use aster_raid::selection_policies::{DecisionTreePolicy, Dummy0Policy, HeimdallRoundRobinPolicy, LinnOSPolicy, LinnOSPlusPolicy, RoundRobinPolicy}; use aster_raid::{Raid1Device, Raid1DeviceError}; use aster_virtio::device::block::device::BlockDevice as VirtIoBlockDevice; @@ -186,7 +186,7 @@ fn setup_raid1_device(raid_device_name: &str) -> Result<()> { // Initialize Heimdall device performance monitor #[cfg(not(baseline_asterinas))] - { + let heimdall = { use aster_virtio::device::block::server_traits::BlockIOObservable; use ostd::orpc::oqueue::{OQueueBase, ObservationQuery}; @@ -209,9 +209,10 @@ fn setup_raid1_device(raid_device_name: &str) -> Result<()> { ) .expect("Failed to create Heimdall monitor"); + let heimdall_clone = heimdall.clone(); let heimdall_task = move || { info!("[heimdall] Heimdall monitor thread started"); - heimdall.run(); + heimdall_clone.run(); }; crate::ThreadOptions::new(heimdall_task) @@ -223,8 +224,9 @@ fn setup_raid1_device(raid_device_name: &str) -> Result<()> { }) .spawn(); - info!("[heimdall] Heimdall monitor initialized and thread spawned"); - } + info!("[heimdall] is Online"); + heimdall + }; @@ -264,8 +266,12 @@ fn setup_raid1_device(raid_device_name: &str) -> Result<()> { #[cfg(all(not(baseline_asterinas), raid_selection = "decision_tree"))] let selection_policy = DecisionTreePolicy::new(members.clone(), observers).unwrap(); + // Heimdall 
Round Robin Policy + #[cfg(all(not(baseline_asterinas), raid_selection = "heimdall"))] + let selection_policy = HeimdallRoundRobinPolicy::new(members.clone(), heimdall).unwrap(); + // Round Robin Policy (explicit or default when no raid_selection is specified) - #[cfg(all(not(baseline_asterinas), any(raid_selection = "roundrobin", not(any(raid_selection = "linnos", raid_selection = "linnos_plus", raid_selection = "decision_tree")))))] + #[cfg(all(not(baseline_asterinas), any(raid_selection = "roundrobin", not(any(raid_selection = "linnos", raid_selection = "linnos_plus", raid_selection = "decision_tree", raid_selection = "heimdall")))))] let selection_policy = RoundRobinPolicy::new(members.clone()).unwrap(); // Initialize and Register RAID-1 device From 8395cdc5892be2e63d3f7558a50b6574d64a375c Mon Sep 17 00:00:00 2001 From: Yingqi Cao Date: Mon, 4 May 2026 18:05:34 -0500 Subject: [PATCH 22/22] Heimdall LinnOS Plus Policy --- Cargo.toml | 2 +- kernel/comps/raid/src/selection_policies.rs | 171 ++++++++++++++++++++ kernel/src/fs/mod.rs | 14 +- 3 files changed, 181 insertions(+), 6 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 80dd2731e..faa9a892a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -54,7 +54,7 @@ exclude = [ function_casts_as_integer = "allow" mismatched_lifetime_syntaxes = "allow" missing_crate_level_docs = "warn" -unexpected_cfgs = { level = "deny", check-cfg = ['cfg(baseline_asterinas)', 'cfg(ktest)', 'cfg(capture_data)', 'cfg(raid_selection, values("roundrobin", "linnos", "linnos_plus", "decision_tree", "heimdall"))'] } +unexpected_cfgs = { level = "deny", check-cfg = ['cfg(baseline_asterinas)', 'cfg(ktest)', 'cfg(capture_data)', 'cfg(raid_selection, values("roundrobin", "linnos", "linnos_plus", "decision_tree", "heimdall", "heimdalllinnosplus"))'] } unpredictable-function-pointer-comparisons = "allow" unsafe_op_in_unsafe_fn = "deny" unused_parens = "allow" diff --git a/kernel/comps/raid/src/selection_policies.rs 
b/kernel/comps/raid/src/selection_policies.rs index f23a46722..869855cf0 100644 --- a/kernel/comps/raid/src/selection_policies.rs +++ b/kernel/comps/raid/src/selection_policies.rs @@ -562,3 +562,174 @@ impl SelectionPolicy for LinnOSPlusPolicy { } } } + +/// Heimdall + LinnOS Plus combined selection policy. +/// +/// For each candidate device (round-robin order), first checks Heimdall's +/// asynchronous prediction: a "fast" verdict selects the device outright. +/// Only when Heimdall says "slow" is a LinnOS Plus inference consulted as +/// a second opinion, and a "fast" verdict there also selects the device. +/// If no device passes either check, falls back to round-robin. +#[orpc_server] +pub struct HeimdallLinnOSPlusPolicy { + read_cursor: AtomicUsize, + members: Vec>, + heimdall: Arc, + observers: Vec>>, + hidden1_weights: Vec<[[f32; 8]; 31]>, + hidden1_biases: Vec<[f32; 8]>, + hidden2_weights: Vec<[[f32; 8]; 8]>, + hidden2_biases: Vec<[f32; 8]>, + output_weights: Vec<[[f32; 2]; 8]>, + output_biases: Vec<[f32; 2]>, +} + +impl core::fmt::Debug for HeimdallLinnOSPlusPolicy { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.debug_struct("HeimdallLinnOSPlusPolicy") + .field("read_cursor", &self.read_cursor) + .field("members", &self.members) + .field( + "observers", + &format_args!("[{} observers]", self.observers.len()), + ) + .finish() + } +} + +impl HeimdallLinnOSPlusPolicy { + pub fn new( + members: Vec>, + heimdall: Arc, + observers: Vec>>, + ) -> Result, Error> { + use crate::linnos_plus_weights::{ + HIDDEN1_BIASES, HIDDEN1_WEIGHTS, HIDDEN2_BIASES, HIDDEN2_WEIGHTS, OUTPUT_BIASES, + OUTPUT_WEIGHTS, + }; + + let num_devices = members.len(); + + let hidden1_weights: Vec<[[f32; 8]; 31]> = + (0..num_devices).map(|i| *HIDDEN1_WEIGHTS[i]).collect(); + let hidden1_biases: Vec<[f32; 8]> = + (0..num_devices).map(|i| *HIDDEN1_BIASES[i]).collect(); + let hidden2_weights: Vec<[[f32; 8]; 8]> = + (0..num_devices).map(|i| *HIDDEN2_WEIGHTS[i]).collect(); + let hidden2_biases: Vec<[f32; 
8]> = + (0..num_devices).map(|i| *HIDDEN2_BIASES[i]).collect(); + let output_weights: Vec<[[f32; 2]; 8]> = + (0..num_devices).map(|i| *OUTPUT_WEIGHTS[i]).collect(); + let output_biases: Vec<[f32; 2]> = + (0..num_devices).map(|i| *OUTPUT_BIASES[i]).collect(); + + let server = Self::new_with(|orpc_internal, _| Self { + orpc_internal, + read_cursor: AtomicUsize::new(0), + members, + heimdall, + observers, + hidden1_weights, + hidden1_biases, + hidden2_weights, + hidden2_biases, + output_weights, + output_biases, + }); + + Ok(server) + } +} + +impl SelectionPolicy for HeimdallLinnOSPlusPolicy { + fn select_block_device(&self, submitted: &mut SubmittedBio) -> Result, Error> { + let num_devices = self.members.len(); + let start_idx = self.read_cursor.fetch_add(1, Ordering::Relaxed); + let num_pages = submitted.num_pages(); + + // Try each device once, starting from the round-robin cursor. + for offset in 0..num_devices { + let device_idx = (start_idx + offset) % num_devices; + + // Heimdall prediction: a "fast" verdict selects this device immediately. + if self.heimdall.is_device_fast(device_idx) { + return Ok(self.members[device_idx].clone()); + } + + // Heimdall said "slow": fall through to a LinnOS Plus inference as a second opinion.
+ let observer = self.observers[device_idx].lock(); + let completion_trace = observer + .weak_observe_recent(4) + .expect("Failed to observe completion trace"); + + // Build the 31-element input feature vector (same as LinnOS Plus) + let mut input = [0.0f32; 31]; + + let current_outstanding = num_pages as usize + self.members[device_idx].num_outstanding_pages() as usize; + input[0] = ((current_outstanding / 100) % 10) as f32; + input[1] = ((current_outstanding / 10) % 10) as f32; + input[2] = (current_outstanding % 10) as f32; + + for (i, trace_entry) in completion_trace.iter().enumerate().take(4) { + let Some(trace_entry) = trace_entry else { + continue; + }; + let outstanding = trace_entry.outstanding_pages as usize; + let latency_us = trace_entry.latency_us as usize; + let base = 3 + i * 7; + + input[base] = ((outstanding / 100) % 10) as f32; + input[base + 1] = ((outstanding / 10) % 10) as f32; + input[base + 2] = (outstanding % 10) as f32; + + input[base + 3] = ((latency_us / 1000) % 10) as f32; + input[base + 4] = ((latency_us / 100) % 10) as f32; + input[base + 5] = ((latency_us / 10) % 10) as f32; + input[base + 6] = (latency_us % 10) as f32; + } + + // Hidden layer 1: input (31) x hidden1_weights (31x8) + bias (8) -> hidden1_out (8) + let h1_weights = &self.hidden1_weights[device_idx]; + let h1_bias = &self.hidden1_biases[device_idx]; + let mut hidden1_out = [0.0f32; 8]; + for j in 0..8 { + let mut sum = h1_bias[j]; + for i in 0..31 { + sum += input[i] * h1_weights[i][j]; + } + hidden1_out[j] = if sum > 0.0 { sum } else { 0.0 }; + } + + // Hidden layer 2: hidden1_out (8) x hidden2_weights (8x8) + bias (8) -> hidden2_out (8) + let h2_weights = &self.hidden2_weights[device_idx]; + let h2_bias = &self.hidden2_biases[device_idx]; + let mut hidden2_out = [0.0f32; 8]; + for j in 0..8 { + let mut sum = h2_bias[j]; + for i in 0..8 { + sum += hidden1_out[i] * h2_weights[i][j]; + } + hidden2_out[j] = if sum > 0.0 { sum } else { 0.0 }; + } + + // Output layer: 
hidden2_out (8) x output_weights (8x2) + bias (2) -> output (2) + let out_weights = &self.output_weights[device_idx]; + let out_bias = &self.output_biases[device_idx]; + let mut output = [out_bias[0], out_bias[1]]; + for k in 0..2 { + for j in 0..8 { + output[k] += hidden2_out[j] * out_weights[j][k]; + } + } + + // Argmax: output[0] < output[1] means fast, otherwise slow + if output[0] < output[1] { + return Ok(self.members[device_idx].clone()); + } + } + + // All devices failed both checks — fall back to round-robin. + let fallback_idx = start_idx % num_devices; + Ok(self.members[fallback_idx].clone()) + } +} diff --git a/kernel/src/fs/mod.rs b/kernel/src/fs/mod.rs index 45a2e0491..81bc903b8 100644 --- a/kernel/src/fs/mod.rs +++ b/kernel/src/fs/mod.rs @@ -25,7 +25,7 @@ pub mod utils; use aster_block::BlockDevice; #[cfg(not(baseline_asterinas))] #[expect(unused_imports)] -use aster_raid::selection_policies::{DecisionTreePolicy, Dummy0Policy, HeimdallRoundRobinPolicy, LinnOSPolicy, LinnOSPlusPolicy, RoundRobinPolicy}; +use aster_raid::selection_policies::{DecisionTreePolicy, Dummy0Policy, HeimdallLinnOSPlusPolicy, HeimdallRoundRobinPolicy, LinnOSPolicy, LinnOSPlusPolicy, RoundRobinPolicy}; use aster_raid::{Raid1Device, Raid1DeviceError}; use aster_virtio::device::block::device::BlockDevice as VirtIoBlockDevice; @@ -181,11 +181,11 @@ fn setup_raid1_device(raid_device_name: &str) -> Result<()> { setup_data_capture(&members, RAID_MEMBER_NAMES); // Clone members for Heimdall before they are consumed by the selection policy / RAID init. 
- #[cfg(not(baseline_asterinas))] + #[cfg(all(not(baseline_asterinas), any(raid_selection = "heimdall", raid_selection = "heimdalllinnosplus")))] let members_for_heimdall = members.clone(); // Initialize Heimdall device performance monitor - #[cfg(not(baseline_asterinas))] + #[cfg(all(not(baseline_asterinas), any(raid_selection = "heimdall", raid_selection = "heimdalllinnosplus")))] let heimdall = { use aster_virtio::device::block::server_traits::BlockIOObservable; use ostd::orpc::oqueue::{OQueueBase, ObservationQuery}; @@ -234,7 +234,7 @@ fn setup_raid1_device(raid_device_name: &str) -> Result<()> { info!("[raid] creating selection policy"); // Shared weak observer setup for all observer-based policies (LinnOS, LinnOS Plus, Decision Tree) - #[cfg(all(not(baseline_asterinas), any(raid_selection = "linnos", raid_selection = "linnos_plus", raid_selection = "decision_tree")))] + #[cfg(all(not(baseline_asterinas), any(raid_selection = "linnos", raid_selection = "linnos_plus", raid_selection = "decision_tree", raid_selection = "heimdalllinnosplus")))] let observers = { use aster_virtio::device::block::server_traits::BlockIOObservable; use ostd::orpc::oqueue::{OQueueBase, ObservationQuery}; @@ -270,8 +270,12 @@ fn setup_raid1_device(raid_device_name: &str) -> Result<()> { #[cfg(all(not(baseline_asterinas), raid_selection = "heimdall"))] let selection_policy = HeimdallRoundRobinPolicy::new(members.clone(), heimdall).unwrap(); + // Heimdall + LinnOS Plus Policy + #[cfg(all(not(baseline_asterinas), raid_selection = "heimdalllinnosplus"))] + let selection_policy = HeimdallLinnOSPlusPolicy::new(members.clone(), heimdall, observers).unwrap(); + // Round Robin Policy (explicit or default when no raid_selection is specified) - #[cfg(all(not(baseline_asterinas), any(raid_selection = "roundrobin", not(any(raid_selection = "linnos", raid_selection = "linnos_plus", raid_selection = "decision_tree", raid_selection = "heimdall")))))] + #[cfg(all(not(baseline_asterinas), 
any(raid_selection = "roundrobin", not(any(raid_selection = "linnos", raid_selection = "linnos_plus", raid_selection = "decision_tree", raid_selection = "heimdall", raid_selection = "heimdalllinnosplus")))))] let selection_policy = RoundRobinPolicy::new(members.clone()).unwrap(); // Initialize and Register RAID-1 device