From 126c8a9281ffa5eb3d94ff137dcc3f09866df268 Mon Sep 17 00:00:00 2001 From: Sangho Lee Date: Fri, 27 Feb 2026 22:48:27 +0000 Subject: [PATCH 1/3] Refactor PerCpuVariables --- dev_tests/src/ratchet.rs | 2 +- litebox_platform_lvbs/src/arch/x86/gdt.rs | 14 +- litebox_platform_lvbs/src/host/lvbs_impl.rs | 10 +- .../src/host/per_cpu_variables.rs | 462 +++++++----------- litebox_platform_lvbs/src/lib.rs | 4 +- litebox_platform_lvbs/src/mshv/hvcall.rs | 14 +- litebox_platform_lvbs/src/mshv/hvcall_mm.rs | 192 ++++---- litebox_platform_lvbs/src/mshv/hvcall_vp.rs | 101 ++-- litebox_platform_lvbs/src/mshv/mod.rs | 4 +- litebox_platform_lvbs/src/mshv/vsm.rs | 23 +- .../src/mshv/vsm_intercept.rs | 29 +- litebox_platform_lvbs/src/mshv/vtl_switch.rs | 27 +- litebox_runner_lvbs/src/lib.rs | 271 +++++----- litebox_runner_lvbs/src/main.rs | 54 +- 14 files changed, 568 insertions(+), 639 deletions(-) diff --git a/dev_tests/src/ratchet.rs b/dev_tests/src/ratchet.rs index 916ee857a..f9882a793 100644 --- a/dev_tests/src/ratchet.rs +++ b/dev_tests/src/ratchet.rs @@ -37,7 +37,7 @@ fn ratchet_globals() -> Result<()> { ("litebox/", 9), ("litebox_platform_linux_kernel/", 6), ("litebox_platform_linux_userland/", 5), - ("litebox_platform_lvbs/", 26), + ("litebox_platform_lvbs/", 23), ("litebox_platform_multiplex/", 1), ("litebox_platform_windows_userland/", 7), ("litebox_runner_linux_userland/", 1), diff --git a/litebox_platform_lvbs/src/arch/x86/gdt.rs b/litebox_platform_lvbs/src/arch/x86/gdt.rs index e86fcbe9f..ef449c58c 100644 --- a/litebox_platform_lvbs/src/arch/x86/gdt.rs +++ b/litebox_platform_lvbs/src/arch/x86/gdt.rs @@ -3,9 +3,7 @@ //! 
Global Descriptor Table (GDT) and Task State Segment (TSS) -use crate::host::per_cpu_variables::{ - PerCpuVariablesAsm, with_per_cpu_variables_asm, with_per_cpu_variables_mut, -}; +use crate::host::per_cpu_variables::with_per_cpu_variables; use alloc::boxed::Box; use x86_64::{ PrivilegeLevel, VirtAddr, @@ -82,10 +80,8 @@ impl Default for GdtWrapper { } fn setup_gdt_tss() { - let double_fault_stack_top = - with_per_cpu_variables_asm(PerCpuVariablesAsm::get_double_fault_stack_ptr); - let exception_stack_top = - with_per_cpu_variables_asm(PerCpuVariablesAsm::get_exception_stack_ptr); + let double_fault_stack_top = with_per_cpu_variables(|pcv| pcv.asm.get_double_fault_stack_ptr()); + let exception_stack_top = with_per_cpu_variables(|pcv| pcv.asm.get_exception_stack_ptr()); let mut tss = Box::new(AlignedTss(TaskStateSegment::new())); // TSS.IST1: dedicated stack for double faults @@ -123,8 +119,8 @@ fn setup_gdt_tss() { load_tss(gdt.selectors.tss); } - with_per_cpu_variables_mut(|per_cpu_variables| { - per_cpu_variables.gdt = Some(gdt); + with_per_cpu_variables(|per_cpu_variables| { + per_cpu_variables.gdt.set(Some(gdt)); }); } diff --git a/litebox_platform_lvbs/src/host/lvbs_impl.rs b/litebox_platform_lvbs/src/host/lvbs_impl.rs index 36d4b04c3..8bb1f415b 100644 --- a/litebox_platform_lvbs/src/host/lvbs_impl.rs +++ b/litebox_platform_lvbs/src/host/lvbs_impl.rs @@ -5,7 +5,7 @@ use crate::{ Errno, HostInterface, arch::ioport::serial_print_string, - host::per_cpu_variables::with_per_cpu_variables_mut, + host::per_cpu_variables::with_per_cpu_variables, }; pub type LvbsLinuxKernel = crate::LinuxKernel; @@ -87,14 +87,14 @@ impl LvbsLinuxKernel { unsafe impl litebox::platform::ThreadLocalStorageProvider for LvbsLinuxKernel { fn get_thread_local_storage() -> *mut () { - let tls = with_per_cpu_variables_mut(|pcv| pcv.tls); + let tls = with_per_cpu_variables(|pcv| pcv.tls.get()); tls.as_mut_ptr::<()>() } unsafe fn replace_thread_local_storage(value: *mut ()) -> *mut () { - 
with_per_cpu_variables_mut(|pcv| { - let old = pcv.tls; - pcv.tls = x86_64::VirtAddr::new(value as u64); + with_per_cpu_variables(|pcv| { + let old = pcv.tls.get(); + pcv.tls.set(x86_64::VirtAddr::new(value as u64)); old.as_u64() as *mut () }) } diff --git a/litebox_platform_lvbs/src/host/per_cpu_variables.rs b/litebox_platform_lvbs/src/host/per_cpu_variables.rs index e9d873dec..b093623cb 100644 --- a/litebox_platform_lvbs/src/host/per_cpu_variables.rs +++ b/litebox_platform_lvbs/src/host/per_cpu_variables.rs @@ -4,18 +4,15 @@ //! Per-CPU VTL1 kernel variables use crate::{ - arch::{MAX_CORES, gdt, get_core_id, instrs::rdmsr}, - host::bootparam::get_num_possible_cpus, + arch::{MAX_CORES, gdt, instrs::rdmsr}, mshv::{ - HV_REGISTER_VP_INDEX, HvMessagePage, HvVpAssistPage, - vsm::{ControlRegMap, NUM_CONTROL_REGS}, - vtl_switch::VtlState, - vtl1_mem_layout::PAGE_SIZE, + HV_REGISTER_VP_INDEX, HvMessage, HvMessagePage, HvVpAssistPage, vsm::ControlRegMap, + vtl_switch::VtlState, vtl1_mem_layout::PAGE_SIZE, }, }; use aligned_vec::avec; use alloc::boxed::Box; -use core::cell::{Cell, RefCell}; +use core::cell::{Cell, UnsafeCell}; use core::mem::offset_of; use litebox::utils::TruncateExt; use litebox_common_linux::{rdgsbase, wrgsbase}; @@ -26,25 +23,47 @@ pub const EXCEPTION_STACK_SIZE: usize = PAGE_SIZE; pub const KERNEL_STACK_SIZE: usize = 10 * PAGE_SIZE; /// Per-CPU VTL1 kernel variables -#[repr(align(4096))] -#[derive(Clone, Copy)] +#[repr(C, align(4096))] pub struct PerCpuVariables { - hv_vp_assist_page: [u8; PAGE_SIZE], - hv_simp_page: [u8; PAGE_SIZE], + /// Assembly-accessible fields at GS offset 0 (`gs:[offset]` in inline asm). + /// + /// All fields use `Cell` for interior mutability, so they can be accessed + /// through `&PerCpuVariables` without requiring `&mut`. 
+ pub(crate) asm: PerCpuVariablesAsm, double_fault_stack: [u8; DOUBLE_FAULT_STACK_SIZE], _guard_page_0: [u8; PAGE_SIZE], exception_stack: [u8; EXCEPTION_STACK_SIZE], kernel_stack: [u8; KERNEL_STACK_SIZE], _guard_page_1: [u8; PAGE_SIZE], - hvcall_input: [u8; PAGE_SIZE], - hvcall_output: [u8; PAGE_SIZE], - pub vtl0_state: VtlState, - pub vtl0_locked_regs: ControlRegMap, - pub gdt: Option<&'static gdt::GdtWrapper>, - pub tls: VirtAddr, - pub vp_index: u32, + /// The below four pages are used for communication with the hypervisor and + /// must be page-aligned. `UnsafeCell` is used for interior mutability since + /// the hypervisor can write to or read from them with loose Rust guarantees. + hv_vp_assist_page: UnsafeCell<[u8; PAGE_SIZE]>, + hv_simp_page: UnsafeCell<[u8; PAGE_SIZE]>, + hvcall_input: UnsafeCell<[u8; PAGE_SIZE]>, + hvcall_output: UnsafeCell<[u8; PAGE_SIZE]>, + /// VTL0 general-purpose register state, saved/restored by assembly + /// (`SAVE_VTL_STATE_ASM`/`LOAD_VTL_STATE_ASM`) via raw pushes/pops to + /// the address cached in `PerCpuVariablesAsm::vtl0_state_top_addr`. + /// Rust code accesses it only between save and load (i.e., while VTL1 + /// is executing), so there is no data race with the assembly. + pub(crate) vtl0_state: Cell, + pub(crate) vtl0_locked_regs: Cell, + pub(crate) gdt: Cell>, + pub(crate) tls: Cell, + /// Cached VP index from the hypervisor. Lazily initialized on first access + /// via `rdmsr(HV_REGISTER_VP_INDEX)` and immutable thereafter. + /// Uses `u32::MAX` as the "uninitialized" sentinel. + vp_index: Cell, } +// These Hyper-V pages must be page-aligned. +// These compile-time assertions guard against layout regressions. 
+const _: () = assert!(offset_of!(PerCpuVariables, hv_vp_assist_page) % PAGE_SIZE == 0); +const _: () = assert!(offset_of!(PerCpuVariables, hv_simp_page) % PAGE_SIZE == 0); +const _: () = assert!(offset_of!(PerCpuVariables, hvcall_input) % PAGE_SIZE == 0); +const _: () = assert!(offset_of!(PerCpuVariables, hvcall_output) % PAGE_SIZE == 0); + impl PerCpuVariables { const XSAVE_ALIGNMENT: usize = 64; // XSAVE and XRSTORE require a 64-byte aligned buffer pub const VTL1_XSAVE_MASK: u64 = 0b11; // let XSAVE and XRSTORE deal with x87 and SSE states @@ -63,32 +82,77 @@ impl PerCpuVariables { &raw const self.exception_stack as u64 + (self.exception_stack.len() - 1) as u64 } - pub fn hv_vp_assist_page_as_ptr(&self) -> *const HvVpAssistPage { - (&raw const self.hv_vp_assist_page).cast::() - } - pub(crate) fn hv_vp_assist_page_as_u64(&self) -> u64 { - &raw const self.hv_vp_assist_page as u64 - } - - pub(crate) fn hv_simp_page_as_mut_ptr(&mut self) -> *mut HvMessagePage { - (&raw mut self.hv_simp_page).cast::() + self.hv_vp_assist_page.get() as u64 } pub(crate) fn hv_simp_page_as_u64(&self) -> u64 { - &raw const self.hv_simp_page as u64 + self.hv_simp_page.get() as u64 } - pub(crate) fn hv_hypercall_input_page_as_mut_ptr(&mut self) -> *mut [u8; PAGE_SIZE] { - &raw mut self.hvcall_input + /// Take the pending SynIC message from SIMP slot `sint_index`. + /// + /// Returns a copy of the message and clears the slot's `message_type` + /// to `HvMessageTypeNone`, signaling the hypervisor that the slot is + /// free for reuse. + /// + /// This is safe because the SynIC protocol guarantees the hypervisor + /// will not overwrite a slot whose `message_type` is non-zero. By + /// reading first and clearing last, no concurrent write is possible. + pub(crate) fn take_sint_message(&self, sint_index: usize) -> HvMessage { + // SAFETY: interior mutability via `UnsafeCell`. 
The SynIC protocol + // ensures the hypervisor does not concurrently write to this slot + // while `message_type != HvMessageTypeNone`. + let simp_page = unsafe { &mut *self.hv_simp_page.get().cast::() }; + let msg = simp_page.sint_message[sint_index]; + simp_page.sint_message[sint_index].header.message_type = 0; // HvMessageTypeNone + msg + } + + /// Run a closure with a shared reference to the VP assist page. + /// + /// The hypervisor writes to this page *before* entering VTL1 (e.g., + /// `vtl_entry_reason`). No concurrent modification. + pub(crate) fn with_vp_assist_page(&self, f: impl FnOnce(&HvVpAssistPage) -> R) -> R { + // SAFETY: interior mutability via `UnsafeCell`. The hypervisor + // finishes writing before VTL1 entry, so no concurrent write is + // possible while this reference exists. + f(unsafe { &*self.hv_vp_assist_page.get().cast::() }) } - pub(crate) fn hv_hypercall_output_page_as_mut_ptr(&mut self) -> *mut [u8; PAGE_SIZE] { - &raw mut self.hvcall_output + /// Run a closure with a mutable reference to the hypercall input page, + /// reinterpreted as `T`. + /// + /// **Not re-entrant**: the closure must not call back into this method, + /// as that would create aliasing mutable references to the same page. + pub(crate) fn with_hvcall_input(&self, f: impl FnOnce(&mut T) -> R) -> R { + const { assert!(core::mem::size_of::() <= PAGE_SIZE) }; + const { assert!(core::mem::align_of::() <= PAGE_SIZE) }; + // SAFETY: interior mutability via `UnsafeCell`; the `&mut T` is + // confined to this closure. The page is page-aligned (4096), which + // satisfies any T with align_of::() <= PAGE_SIZE. + f(unsafe { &mut *self.hvcall_input.get().cast::() }) + } + + /// Run a closure with a mutable reference to the hypercall output page, + /// reinterpreted as `T`. + /// + /// **Not re-entrant**: the closure must not call back into this method, + /// as that would create aliasing mutable references to the same page. 
+ pub(crate) fn with_hvcall_output(&self, f: impl FnOnce(&mut T) -> R) -> R { + const { assert!(core::mem::size_of::() <= PAGE_SIZE) }; + const { assert!(core::mem::align_of::() <= PAGE_SIZE) }; + // SAFETY: interior mutability via `UnsafeCell`; the `&mut T` is + // confined to this closure. The page is page-aligned (4096), which + // satisfies any T with align_of::() <= PAGE_SIZE. + // The hypervisor synchronously writes to this page during the hypercall. + f(unsafe { &mut *self.hvcall_output.get().cast::() }) } - pub fn set_vtl_return_value(&mut self, value: u64) { - self.vtl0_state.r8 = value; // LVBS uses R8 to return a value from VTL1 to VTL0 + pub fn set_vtl_return_value(&self, value: u64) { + let mut state = self.vtl0_state.get(); + state.r8 = value; // LVBS uses R8 to return a value from VTL1 to VTL0 + self.vtl0_state.set(state); } /// Return the cached Hyper-V VP index for this core (which never changes during @@ -96,21 +160,24 @@ impl PerCpuVariables { /// /// # Panics /// Panics if the VP index returned by the hypervisor is ≥ `MAX_CORES`. - pub fn vp_index(&mut self) -> u32 { - if self.vp_index == u32::MAX { + pub fn vp_index(&self) -> u32 { + let idx = self.vp_index.get(); + if idx == u32::MAX { let vp_index: u32 = rdmsr(HV_REGISTER_VP_INDEX).truncate(); assert!( vp_index < u32::try_from(MAX_CORES).unwrap(), "VP index {vp_index} exceeds the configured processor mask" ); - self.vp_index = vp_index; + self.vp_index.set(vp_index); + vp_index + } else { + idx } - self.vp_index } /// Return kernel code, user code, and user data segment selectors pub(crate) fn get_segment_selectors(&self) -> Option<(u16, u16, u16)> { - self.gdt.map(gdt::GdtWrapper::get_segment_selectors) + self.gdt.get().map(gdt::GdtWrapper::get_segment_selectors) } /// Allocate XSAVE areas for saving/restoring the extended states of each core. @@ -166,45 +233,9 @@ impl PerCpuVariables { } } -/// per-CPU variables for core 0 (or BSP). 
This must use static memory because kernel heap is not ready. -static mut BSP_VARIABLES: PerCpuVariables = PerCpuVariables { - hv_vp_assist_page: [0u8; PAGE_SIZE], - hv_simp_page: [0u8; PAGE_SIZE], - double_fault_stack: [0u8; DOUBLE_FAULT_STACK_SIZE], - _guard_page_0: [0u8; PAGE_SIZE], - exception_stack: [0u8; EXCEPTION_STACK_SIZE], - kernel_stack: [0u8; KERNEL_STACK_SIZE], - _guard_page_1: [0u8; PAGE_SIZE], - hvcall_input: [0u8; PAGE_SIZE], - hvcall_output: [0u8; PAGE_SIZE], - vtl0_state: VtlState { - rbp: 0, - rax: 0, - rbx: 0, - rcx: 0, - rdx: 0, - rsi: 0, - rdi: 0, - r8: 0, - r9: 0, - r10: 0, - r11: 0, - r12: 0, - r13: 0, - r14: 0, - r15: 0, - }, - vtl0_locked_regs: ControlRegMap { - entries: [(0, 0); NUM_CONTROL_REGS], - }, - gdt: const { None }, - tls: VirtAddr::zero(), - vp_index: u32::MAX, -}; - -/// Specify the layout of PerCpuVariables for Assembly area. +/// Assembly-accessible per-CPU fields at the start of [`PerCpuVariables`]. /// -/// Unlike `litebox_platform_linux_userland`, this kernel platform does't rely on +/// Unlike `litebox_platform_linux_userland`, this kernel platform does not rely on /// the `tbss` section to specify FS/GS offsets for per CPU variables because /// there is no ELF loader that will set up it. /// @@ -213,10 +244,11 @@ static mut BSP_VARIABLES: PerCpuVariables = PerCpuVariables { /// mode transitions (i.e., ring transitions through iretq/syscall) unlike userland /// platforms. /// -/// TODO: Consider unifying with `PerCpuVariables` if possible. +/// Page-aligned (`align(4096)`) so that the following fields in +/// [`PerCpuVariables`] (HV pages, stacks, etc.) remain page-aligned. #[non_exhaustive] #[cfg(target_arch = "x86_64")] -#[repr(C)] +#[repr(C, align(4096))] #[derive(Clone)] pub struct PerCpuVariablesAsm { /// Initial kernel stack pointer to reset the kernel stack on VTL switch @@ -380,219 +412,89 @@ impl PerCpuVariablesAsm { } } -/// Wrapper struct to maintain `RefCell` along with `PerCpuVariablesAsm`. 
-/// This struct allows assembly code to read/write some PerCpuVariables area via the GS register (e.g., to -/// save/restore RIP/RSP). Currently, `PerCpuVariables` is protected by `RefCell` such that -/// assembly code cannot easily access it. -/// -/// TODO: Let's consider whether we should maintain these two types of Per CPU variable areas (for Rust and -/// assembly, respectively). This design secures Rust-side access to `PerCpuVariables` with `RefCell`, -/// but it might be unnecessarily complex. Instead, we could use assembly code in all cases, but -/// this might be unsafe. -#[repr(C)] -pub struct RefCellWrapper { - /// Make some PerCpuVariablesAsm area be accessible via the GS register. This is mainly for assembly code - pcv_asm: PerCpuVariablesAsm, - /// RefCell which will be stored in the GS register - inner: RefCell, -} -impl RefCellWrapper { - pub const fn new(value: T) -> Self { - Self { - pcv_asm: PerCpuVariablesAsm { - kernel_stack_ptr: Cell::new(0), - double_fault_stack_ptr: Cell::new(0), - exception_stack_ptr: Cell::new(0), - vtl_return_addr: Cell::new(0), - scratch: Cell::new(0), - vtl0_state_top_addr: Cell::new(0), - cur_kernel_stack_ptr: Cell::new(0), - cur_kernel_base_ptr: Cell::new(0), - user_context_top_addr: Cell::new(0), - vtl0_xsave_area_addr: Cell::new(0), - vtl0_xsave_mask_lo: Cell::new(0), - vtl0_xsave_mask_hi: Cell::new(0), - vtl1_kernel_xsave_area_addr: Cell::new(0), - vtl1_user_xsave_area_addr: Cell::new(0), - vtl1_xsave_mask_lo: Cell::new(0), - vtl1_xsave_mask_hi: Cell::new(0), - vtl1_kernel_xsaved: Cell::new(0), - vtl1_user_xsaved: Cell::new(0), - exception_trapno: Cell::new(0), - }, - inner: RefCell::new(value), - } - } - pub fn get_refcell(&self) -> &RefCell { - &self.inner - } -} - -/// Store the addresses of per-CPU variables. The kernel threads are expected to access -/// the corresponding per-CPU variables via the GS registers which will store the addresses later. 
-/// Instead of maintaining this map, we might be able to use a hypercall to directly program each core's GS register. -static mut PER_CPU_VARIABLE_ADDRESSES: [RefCellWrapper<*mut PerCpuVariables>; MAX_CORES] = - [const { RefCellWrapper::new(core::ptr::null_mut()) }; MAX_CORES]; -static mut PER_CPU_VARIABLE_ADDRESSES_IDX: usize = 0; - -/// Execute a closure with a reference to the current core's per-CPU variables. +/// Execute a closure with a shared reference to the current core's per-CPU variables. /// /// # Safety -/// This function assumes the following: -/// - The GSBASE register values of individual cores must be properly set (i.e., they must be different). -/// - `get_core_id()` must return distinct APIC IDs for different cores. -/// -/// If we cannot guarantee these assumptions, this function may result in unsafe or undefined behaviors. +/// The GSBASE register must point to a valid, heap-allocated `PerCpuVariables` +/// (set by [`allocate_per_cpu_variables`]). Each core must have a distinct +/// GSBASE value. /// /// # Panics -/// Panics if GSBASE is not set, it contains a non-canonical address, or no per-CPU variables are allocated. -/// Panics if this function is recursively called (`BorrowMutError`). +/// Panics if GSBASE is not set or contains a non-canonical address. pub fn with_per_cpu_variables<F, R>(f: F) -> R where F: FnOnce(&PerCpuVariables) -> R, R: Sized + 'static, { - let Some(refcell) = get_or_init_refcell_of_per_cpu_variables() else { - panic!("No per-CPU variables are allocated"); - }; - let borrow = refcell.borrow(); - let per_cpu_variables = unsafe { &**borrow }; - - f(per_cpu_variables) + let ptr = get_per_cpu_variables_ptr(); + // Safety: per-CPU data is exclusive to this core; no other core can + // access it. + let pcv = unsafe { &*ptr }; + f(pcv) } -/// Execute a closure with a mutable reference to the current core's per-CPU variables.
-/// -/// # Safety -/// This function assumes the following: -/// - The GSBASE register values of individual cores must be properly set (i.e., they must be different). -/// - `get_core_id()` must return distinct APIC IDs for different cores. -/// -/// If we cannot guarantee these assumptions, this function may result in unsafe or undefined behaviors. +/// Get a raw pointer to the current core's `PerCpuVariables` from GSBASE. /// /// # Panics -/// Panics if GSBASE is not set, it contains a non-canonical address, or no per-CPU variables are allocated. -/// Panics if this function is recursively called (`BorrowMutError`). -pub fn with_per_cpu_variables_mut<F, R>(f: F) -> R -where - F: FnOnce(&mut PerCpuVariables) -> R, - R: Sized + 'static, -{ - let Some(refcell) = get_or_init_refcell_of_per_cpu_variables() else { - panic!("No per-CPU variables are allocated"); - }; - let mut borrow = refcell.borrow_mut(); - let per_cpu_variables = unsafe { &mut **borrow }; - - f(per_cpu_variables) +/// Panics if GSBASE is zero or non-canonical. +fn get_per_cpu_variables_ptr() -> *mut PerCpuVariables { + let gsbase = unsafe { rdgsbase() }; + assert!( + gsbase != 0, + "GSBASE not set. Call allocate_per_cpu_variables() first" + ); + let _ = VirtAddr::try_new(gsbase as u64).expect("GS contains a non-canonical address"); + gsbase as *mut PerCpuVariables } -/// Execute a closure with a reference to the current PerCpuVariablesAsm. +/// Heap-allocate this core's per-CPU variables and set GSBASE to point at them. +/// +/// Every core (BSP and AP) calls this exactly once during its boot path, +/// **before** [`init_per_cpu_variables`]. +/// +/// GSBASE will point directly at the `PerCpuVariables` struct, so assembly +/// code can access the `asm` field at GS offset 0 (guaranteed by `#[repr(C)]`). +/// +/// The caller must have already: +/// 1. Enabled FSGSBASE (`enable_fsgsbase()`). +/// 2. Enabled extended CPU states (`enable_extended_states()`). +/// 3.
(BSP only) Seeded the global heap (`seed_initial_heap()`). /// /// # Panics -/// Panics if GSBASE is not set or it contains a non-canonical address. -pub fn with_per_cpu_variables_asm<F, R>(f: F) -> R -where - F: FnOnce(&PerCpuVariablesAsm) -> R, - R: Sized + 'static, -{ - let pcv_asm_addr = unsafe { - let gsbase = rdgsbase(); - let addr = VirtAddr::try_new(gsbase as u64).expect("GS contains a non-canonical address"); - addr.as_ptr::<RefCellWrapper<*mut PerCpuVariables>>() - .cast::<PerCpuVariablesAsm>() +/// Panics if the heap allocation fails. +pub fn allocate_per_cpu_variables() { + let mut per_cpu_variables = Box::<PerCpuVariables>::new_uninit(); + // Safety: `PerCpuVariables` is too large for the stack, so we zero-init + // via `write_bytes` then fix up the `vp_index` sentinel. Zero is valid + // for all other field types: + // - `[u8; N]`, `VtlState`, `ControlRegMap`: all-zeroes is their default. + // - `Cell<T>` / `UnsafeCell<T>`: `#[repr(transparent)]`, same as inner T. + let per_cpu_variables = unsafe { + let ptr = per_cpu_variables.as_mut_ptr(); + ptr.write_bytes(0, 1); + // Set the "uninitialized" sentinel for vp_index (0 is a valid VP index). + core::ptr::addr_of_mut!((*ptr).vp_index).write(Cell::new(u32::MAX)); + per_cpu_variables.assume_init() }; - let pcv_asm = unsafe { &*pcv_asm_addr }; - - f(pcv_asm) -} -/// Get or initialize a `RefCell` that contains a pointer to the current core's per-CPU variables. -/// This `RefCell` is expected to be stored in the GS register.
-fn get_or_init_refcell_of_per_cpu_variables() -> Option<&'static RefCell<*mut PerCpuVariables>> { - let gsbase = unsafe { rdgsbase() }; - if gsbase == 0 { - let core_id = get_core_id(); - let refcell_wrapper = if core_id == 0 { - let addr = &raw mut BSP_VARIABLES; - unsafe { - PER_CPU_VARIABLE_ADDRESSES[0] = RefCellWrapper::new(addr); - &PER_CPU_VARIABLE_ADDRESSES[0] - } - } else { - assert!( - unsafe { PER_CPU_VARIABLE_ADDRESSES_IDX < MAX_CORES }, - "PER_CPU_VARIABLE_ADDRESSES_IDX exceeds MAX_CORES", - ); - unsafe { &PER_CPU_VARIABLE_ADDRESSES[PER_CPU_VARIABLE_ADDRESSES_IDX] } - }; - unsafe { - PER_CPU_VARIABLE_ADDRESSES_IDX += 1; - } - let refcell = refcell_wrapper.get_refcell(); - if refcell.borrow().is_null() { - None - } else { - let addr = x86_64::VirtAddr::new(&raw const *refcell_wrapper as u64); - unsafe { - wrgsbase(addr.as_u64().truncate()); - } - Some(refcell) - } - } else { - let addr = - x86_64::VirtAddr::try_new(gsbase as u64).expect("GS contains a non-canonical address"); - let refcell_wrapper = unsafe { &*addr.as_ptr::<RefCellWrapper<*mut PerCpuVariables>>() }; - let refcell = refcell_wrapper.get_refcell(); - if refcell.borrow().is_null() { - None - } else { - Some(refcell) - } + // Leak the box so it lives for the core's lifetime. + let pcv = Box::leak(per_cpu_variables); + let addr = &raw const *pcv as u64; + unsafe { + wrgsbase(addr.truncate()); } } -/// Allocate per-CPU variables in heap for all possible cores. We expect that the BSP will call -/// this function to allocate per-CPU variables for other APs because our per-CPU variables are -/// huge such that each AP without a proper stack cannot allocate its own per-CPU variables.
-/// # Panics -/// Panics if the number of possible CPUs exceeds `MAX_CORES` -pub fn allocate_per_cpu_variables() { - let num_cores = - usize::try_from(get_num_possible_cpus().expect("Failed to get number of possible CPUs")) - .unwrap(); - assert!( - num_cores <= MAX_CORES, - "# of possible CPUs ({num_cores}) exceeds MAX_CORES", - ); - - // Allocate xsave area for BSP (core 0) - with_per_cpu_variables_asm(|pcv_asm| { - PerCpuVariables::allocate_xsave_area(pcv_asm); +/// Allocate XSAVE areas for the current core. +/// +/// Must be called **after** [`allocate_per_cpu_variables`] (so GSBASE is +/// set) and **after** switching to the kernel stack. The CPUID queries and +/// `avec!` allocations inside `PerCpuVariables::allocate_xsave_area` use +/// significant stack space that exceeds the 4 KiB boot stack. +pub fn allocate_xsave_area() { + with_per_cpu_variables(|pcv| { + PerCpuVariables::allocate_xsave_area(&pcv.asm); }); - - // TODO: use `cpu_online_mask` to selectively allocate per-CPU variables only for online CPUs. - // Note. `PER_CPU_VARIABLE_ADDRESSES[0]` is expected to be already initialized to point to - // `BSP_VARIABLES` before calling this function by `get_or_init_refcell_of_per_cpu_variables()`. - #[allow(clippy::needless_range_loop)] - for i in 1..num_cores { - let mut per_cpu_variables = Box::<PerCpuVariables>::new_uninit(); - // Safety: `PerCpuVariables` is too large for the stack, so we zero-init - // via `write_bytes` then fix up `vp_index` to the `u32::MAX` sentinel - // before calling `assume_init`.
- let per_cpu_variables = unsafe { - let ptr = per_cpu_variables.as_mut_ptr(); - ptr.write_bytes(0, 1); - (*ptr).vp_index = u32::MAX; - per_cpu_variables.assume_init() - }; - unsafe { - PER_CPU_VARIABLE_ADDRESSES[i] = RefCellWrapper::new(Box::into_raw(per_cpu_variables)); - // Allocate xsave area for this core, writing directly to its PerCpuVariablesAsm - let pcv_asm = &PER_CPU_VARIABLE_ADDRESSES[i].pcv_asm; - PerCpuVariables::allocate_xsave_area(pcv_asm); - } - } } /// Initialize PerCpuVariable and PerCpuVariableAsm for the current core. @@ -604,7 +506,7 @@ pub fn allocate_per_cpu_variables() { /// Panics if the per-CPU variables are not properly initialized. pub fn init_per_cpu_variables() { const STACK_ALIGNMENT: usize = 16; - with_per_cpu_variables_mut(|per_cpu_variables| { + with_per_cpu_variables(|per_cpu_variables| { let kernel_sp = TruncateExt::<usize>::truncate(per_cpu_variables.kernel_stack_top()) & !(STACK_ALIGNMENT - 1); let double_fault_sp = @@ -612,15 +514,23 @@ pub fn init_per_cpu_variables() { & !(STACK_ALIGNMENT - 1); let exception_sp = TruncateExt::<usize>::truncate(per_cpu_variables.exception_stack_top()) & !(STACK_ALIGNMENT - 1); + // `Cell<VtlState>` is `#[repr(transparent)]`, so its address equals + // the inner `VtlState`'s address. Assembly code (`SAVE_VTL_STATE_ASM` + // / `LOAD_VTL_STATE_ASM`) pushes/pops registers directly to/from this + // address. This is sound because the assembly executes outside any + // Rust reference scope and the Cell is only accessed in Rust between + // the save and load points (i.e., while VTL1 is executing).
let vtl0_state_top_addr = TruncateExt::::truncate(&raw const per_cpu_variables.vtl0_state as u64) + core::mem::size_of::(); - with_per_cpu_variables_asm(|pcv_asm| { - pcv_asm.set_kernel_stack_ptr(kernel_sp); - pcv_asm.set_double_fault_stack_ptr(double_fault_sp); - pcv_asm.set_exception_stack_ptr(exception_sp); - pcv_asm.set_vtl0_state_top_addr(vtl0_state_top_addr); - }); + per_cpu_variables.asm.set_kernel_stack_ptr(kernel_sp); + per_cpu_variables + .asm + .set_double_fault_stack_ptr(double_fault_sp); + per_cpu_variables.asm.set_exception_stack_ptr(exception_sp); + per_cpu_variables + .asm + .set_vtl0_state_top_addr(vtl0_state_top_addr); }); } diff --git a/litebox_platform_lvbs/src/lib.rs b/litebox_platform_lvbs/src/lib.rs index 043e50a3d..b06073400 100644 --- a/litebox_platform_lvbs/src/lib.rs +++ b/litebox_platform_lvbs/src/lib.rs @@ -2063,10 +2063,10 @@ unsafe extern "C" fn exception_handler( kernel_mode: true, } } else { - use crate::host::per_cpu_variables::{PerCpuVariablesAsm, with_per_cpu_variables_asm}; + use crate::host::per_cpu_variables::with_per_cpu_variables; use litebox::utils::TruncateExt as _; litebox::shim::ExceptionInfo { - exception: with_per_cpu_variables_asm(PerCpuVariablesAsm::get_exception), + exception: with_per_cpu_variables(|pcv| pcv.asm.get_exception()), error_code: thread_ctx.ctx.orig_rax.truncate(), cr2, kernel_mode: false, diff --git a/litebox_platform_lvbs/src/mshv/hvcall.rs b/litebox_platform_lvbs/src/mshv/hvcall.rs index bbe14f9a4..7e328a676 100644 --- a/litebox_platform_lvbs/src/mshv/hvcall.rs +++ b/litebox_platform_lvbs/src/mshv/hvcall.rs @@ -4,10 +4,7 @@ //! 
Hyper-V Hypercall functions use crate::{ - arch::{ - get_core_id, - instrs::{rdmsr, wrmsr}, - }, + arch::instrs::{rdmsr, wrmsr}, debug_serial_println, host::{LvbsLinuxKernel, hv_hypercall_page_address, per_cpu_variables::with_per_cpu_variables}, mm::MemoryProvider, @@ -75,10 +72,11 @@ fn check_hyperv() -> Result<(), HypervError> { } /// Enable Hyper-V Hypercalls by initializing MSR and VP registers (for a core) +/// /// # Panics /// Panics if the underlying hardware/platform is not Hyper-V /// Panics if the MSR/VP registers writes fail -pub fn init() -> Result<(), HypervError> { +pub fn init(is_bsp: bool) -> Result<(), HypervError> { check_hyperv()?; debug_serial_println!("HV_REGISTER_VP_INDEX: {:#x}", rdmsr(HV_REGISTER_VP_INDEX)); @@ -113,7 +111,7 @@ pub fn init() -> Result<(), HypervError> { if guest_id != rdmsr(HV_X64_MSR_GUEST_OS_ID) { return Err(HypervError::InvalidGuestOSID); } - if get_core_id() == 0 { + if is_bsp { debug_serial_println!( "HV_X64_MSR_GUEST_OS_ID: {:#x}", rdmsr(HV_X64_MSR_GUEST_OS_ID) @@ -156,13 +154,13 @@ pub fn init() -> Result<(), HypervError> { sint.set_auto_eoi(true); wrmsr(HV_X64_MSR_SINT0, sint.as_uint64()); - if get_core_id() == 0 { + if is_bsp { debug_serial_println!("HV_X64_MSR_SINT0: {:#x}", rdmsr(HV_X64_MSR_SINT0)); } wrmsr(HV_X64_MSR_SCONTROL, u64::from(HV_X64_MSR_SCONTROL_ENABLE)); - vsm::init(); + vsm::init(is_bsp); Ok(()) } diff --git a/litebox_platform_lvbs/src/mshv/hvcall_mm.rs b/litebox_platform_lvbs/src/mshv/hvcall_mm.rs index 65746ed05..c02344109 100644 --- a/litebox_platform_lvbs/src/mshv/hvcall_mm.rs +++ b/litebox_platform_lvbs/src/mshv/hvcall_mm.rs @@ -12,7 +12,7 @@ use crate::mshv::{ vtl_switch::{is_only_vp_in_vtl1, vtl1_vp_mask}, }; use crate::{ - host::per_cpu_variables::with_per_cpu_variables_mut, + host::per_cpu_variables::with_per_cpu_variables, mshv::{ HV_PARTITION_ID_SELF, HVCALL_MODIFY_VTL_PROTECTION_MASK, HvInputModifyVtlProtectionMask, HvInputVtl, HvPageProtFlags, @@ -20,6 +20,7 @@ use crate::{ 
vtl1_mem_layout::PAGE_SHIFT, }, }; +use litebox::utils::TruncateExt; /// Compute the valid-bank bitmask for a sparse VP set /// (). @@ -49,41 +50,39 @@ pub fn hv_modify_vtl_protection_mask( num_pages: u64, page_access: HvPageProtFlags, ) -> Result { - let hvin = with_per_cpu_variables_mut(|per_cpu_variables| unsafe { - &mut *per_cpu_variables - .hv_hypercall_input_page_as_mut_ptr() - .cast::() - }); - *hvin = HvInputModifyVtlProtectionMask::new(); - - hvin.partition_id = HV_PARTITION_ID_SELF; - hvin.target_vtl = HvInputVtl::current(); - hvin.map_flags = u32::from(page_access.bits()); - - let mut total_protected: u64 = 0; - while total_protected < num_pages { - let mut pages_to_protect: u16 = 0; - for i in 0..HvInputModifyVtlProtectionMask::MAX_PAGES_PER_REQUEST { - if total_protected + i as u64 >= num_pages { - break; - } else { - hvin.gpa_page_list[i] = (start >> PAGE_SHIFT) + (total_protected + i as u64); - pages_to_protect += 1; + with_per_cpu_variables(|pcv| { + pcv.with_hvcall_input::(|hvin| { + *hvin = HvInputModifyVtlProtectionMask::new(); + + hvin.partition_id = HV_PARTITION_ID_SELF; + hvin.target_vtl = HvInputVtl::current(); + hvin.map_flags = u32::from(page_access.bits()); + + let mut total_protected: u64 = 0; + while total_protected < num_pages { + let remaining: usize = (num_pages - total_protected) + .min(HvInputModifyVtlProtectionMask::MAX_PAGES_PER_REQUEST as u64) + .truncate(); + let pages_to_protect: u16 = remaining.truncate(); + + for i in 0..remaining { + hvin.gpa_page_list[i] = (start >> PAGE_SHIFT) + (total_protected + i as u64); + } + + let result = hv_do_rep_hypercall( + HVCALL_MODIFY_VTL_PROTECTION_MASK, + pages_to_protect, + 0, + (&raw const *hvin).cast::(), + core::ptr::null_mut(), + ); + + total_protected += result?; } - } - - let result = hv_do_rep_hypercall( - HVCALL_MODIFY_VTL_PROTECTION_MASK, - pages_to_protect, - 0, - (&raw const *hvin).cast::(), - core::ptr::null_mut(), - ); - total_protected += result?; - } - - Ok(total_protected) 
+ Ok(total_protected) + }) + }) } /// Flush the entire virtual address space on VPs currently in VTL1. @@ -110,27 +109,25 @@ pub(crate) fn hv_flush_virtual_address_space() -> Result<(), HypervCallError> { "caller is in VTL1 but VP mask is empty" ); - let input = with_per_cpu_variables_mut(|pcv| unsafe { - &mut *pcv - .hv_hypercall_input_page_as_mut_ptr() - .cast::() - }); - - *input = HvInputFlushVirtualAddressSpaceEx { - address_space: 0, - flags: HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES, - vp_set_format: HV_GENERIC_SET_SPARSE_4K, - vp_set_valid_bank_mask: valid_bank_mask, - vp_set_bank_contents: vp_mask, - }; - - hv_do_hypercall( - u64::from(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX), - (&raw const *input).cast::(), - core::ptr::null_mut(), - )?; - - Ok(()) + with_per_cpu_variables(|pcv| { + pcv.with_hvcall_input::(|input| { + *input = HvInputFlushVirtualAddressSpaceEx { + address_space: 0, + flags: HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES, + vp_set_format: HV_GENERIC_SET_SPARSE_4K, + vp_set_valid_bank_mask: valid_bank_mask, + vp_set_bank_contents: vp_mask, + }; + + hv_do_hypercall( + u64::from(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX), + (&raw const *input).cast::(), + core::ptr::null_mut(), + )?; + + Ok(()) + }) + }) } /// Flush specific virtual addresses on VPs currently in VTL1. 
@@ -168,50 +165,49 @@ pub(crate) fn hv_flush_virtual_address_list( "caller is in VTL1 but VP mask is empty" ); - let input = with_per_cpu_variables_mut(|pcv| unsafe { - &mut *pcv - .hv_hypercall_input_page_as_mut_ptr() - .cast::() - }); - - input.address_space = 0; - input.flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; - input.vp_set_format = HV_GENERIC_SET_SPARSE_4K; - input.vp_set_valid_bank_mask = valid_bank_mask; - input.vp_set_bank_contents = vp_mask; - - let mut remaining = page_count; - let mut current_va = start_va; - - while remaining > 0 { - let mut gva_count: u16 = 0; - - while remaining > 0 - && (gva_count as usize) < HvInputFlushVirtualAddressListEx::MAX_GVAS_PER_REQUEST - { - // Each entry can cover up to `MAX_ADDITIONAL_PAGES + 1` pages. - let additional = remaining.saturating_sub(1).min(MAX_ADDITIONAL_PAGES); - let pages_in_entry = additional + 1; - - // GVA range entry: bits 63:12 = page number, bits 11:0 = additional_pages - let page_number = current_va >> 12; - input.gva_range_list[gva_count as usize] = (page_number << 12) | additional as u64; - - current_va += (pages_in_entry as u64) << 12; - remaining -= pages_in_entry; - gva_count += 1; - } - - hv_do_rep_hypercall( - HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX, - gva_count, - HvInputFlushVirtualAddressListEx::VP_SET_QWORD_COUNT, - (&raw const *input).cast::(), - core::ptr::null_mut(), - )?; - } + with_per_cpu_variables(|pcv| { + pcv.with_hvcall_input::(|input| { + input.address_space = 0; + input.flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + input.vp_set_format = HV_GENERIC_SET_SPARSE_4K; + input.vp_set_valid_bank_mask = valid_bank_mask; + input.vp_set_bank_contents = vp_mask; + + let mut remaining = page_count; + let mut current_va = start_va; + + while remaining > 0 { + let mut gva_count: u16 = 0; + + while remaining > 0 + && (gva_count as usize) < HvInputFlushVirtualAddressListEx::MAX_GVAS_PER_REQUEST + { + // Each entry can cover up to `MAX_ADDITIONAL_PAGES + 1` pages. 
+ let additional = remaining.saturating_sub(1).min(MAX_ADDITIONAL_PAGES); + let pages_in_entry = additional + 1; + + // GVA range entry: bits 63:12 = page number, bits 11:0 = additional_pages + let page_number = current_va >> 12; + input.gva_range_list[gva_count as usize] = + (page_number << 12) | additional as u64; + + current_va += (pages_in_entry as u64) << 12; + remaining -= pages_in_entry; + gva_count += 1; + } + + hv_do_rep_hypercall( + HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX, + gva_count, + HvInputFlushVirtualAddressListEx::VP_SET_QWORD_COUNT, + (&raw const *input).cast::(), + core::ptr::null_mut(), + )?; + } - Ok(()) + Ok(()) + }) + }) } /// Maximum number of additional pages encodable in bits 11:0 of a diff --git a/litebox_platform_lvbs/src/mshv/hvcall_vp.rs b/litebox_platform_lvbs/src/mshv/hvcall_vp.rs index ff5643ec3..6f3474b2f 100644 --- a/litebox_platform_lvbs/src/mshv/hvcall_vp.rs +++ b/litebox_platform_lvbs/src/mshv/hvcall_vp.rs @@ -9,7 +9,7 @@ use crate::{ msr::{MSR_EFER, MSR_IA32_CR_PAT}, }, debug_serial_println, - host::per_cpu_variables::with_per_cpu_variables_mut, + host::per_cpu_variables::with_per_cpu_variables, mshv::{ HV_PARTITION_ID_SELF, HV_VP_INDEX_SELF, HV_VTL_NORMAL, HV_VTL_SECURE, HVCALL_ENABLE_VP_VTL, HVCALL_GET_VP_REGISTERS, HVCALL_SET_VP_REGISTERS, HvEnableVpVtl, HvGetVpRegistersInput, @@ -31,26 +31,25 @@ fn hvcall_set_vp_registers_internal( value: u64, target_vtl: HvInputVtl, ) -> Result { - let hvin = with_per_cpu_variables_mut(|per_cpu_variables| unsafe { - &mut *per_cpu_variables - .hv_hypercall_input_page_as_mut_ptr() - .cast::() - }); - *hvin = HvSetVpRegistersInput::new(); - - hvin.header.partitionid = HV_PARTITION_ID_SELF; - hvin.header.vpindex = HV_VP_INDEX_SELF; - hvin.header.target_vtl = target_vtl; - hvin.element[0].name = reg_name; - hvin.element[0].valuelow = value; - - hv_do_rep_hypercall( - HVCALL_SET_VP_REGISTERS, - 1, - 0, - (&raw const *hvin).cast::(), - core::ptr::null_mut(), - ) + with_per_cpu_variables(|pcv| { + 
pcv.with_hvcall_input::(|hvin| { + *hvin = HvSetVpRegistersInput::new(); + + hvin.header.partitionid = HV_PARTITION_ID_SELF; + hvin.header.vpindex = HV_VP_INDEX_SELF; + hvin.header.target_vtl = target_vtl; + hvin.element[0].name = reg_name; + hvin.element[0].valuelow = value; + + hv_do_rep_hypercall( + HVCALL_SET_VP_REGISTERS, + 1, + 0, + (&raw const *hvin).cast::(), + core::ptr::null_mut(), + ) + }) + }) } /// Hyper-V Hypercall to set current VTL (i.e., VTL1)'s registers. It can program Hyper-V registers @@ -70,33 +69,30 @@ fn hvcall_get_vp_registers_internal( reg_name: u32, target_vtl: HvInputVtl, ) -> Result { - let hvin = with_per_cpu_variables_mut(|per_cpu_variables| unsafe { - &mut *per_cpu_variables - .hv_hypercall_input_page_as_mut_ptr() - .cast::() - }); - *hvin = HvGetVpRegistersInput::new(); - let hvout = with_per_cpu_variables_mut(|per_cpu_variables| unsafe { - &mut *per_cpu_variables - .hv_hypercall_output_page_as_mut_ptr() - .cast::() - }); - *hvout = HvGetVpRegistersOutput::new(); - - hvin.header.partitionid = HV_PARTITION_ID_SELF; - hvin.header.vpindex = HV_VP_INDEX_SELF; - hvin.header.target_vtl = target_vtl; - hvin.element[0].name0 = reg_name; - - hv_do_rep_hypercall( - HVCALL_GET_VP_REGISTERS, - 1, - 0, - (&raw const *hvin).cast::(), - (&raw mut *hvout).cast::(), - )?; - - Ok(hvout.as64().0) + with_per_cpu_variables(|pcv| { + pcv.with_hvcall_input::(|hvin| { + *hvin = HvGetVpRegistersInput::new(); + + hvin.header.partitionid = HV_PARTITION_ID_SELF; + hvin.header.vpindex = HV_VP_INDEX_SELF; + hvin.header.target_vtl = target_vtl; + hvin.element[0].name0 = reg_name; + + pcv.with_hvcall_output::(|hvout| { + *hvout = HvGetVpRegistersOutput::new(); + + hv_do_rep_hypercall( + HVCALL_GET_VP_REGISTERS, + 1, + 0, + (&raw const *hvin).cast::(), + (&raw mut *hvout).cast::(), + )?; + + Ok(hvout.as64().0) + }) + }) + }) } /// Hyper-V Hypercall to get current VTL (i.e., VTL1)'s registers. 
It can access Hyper-V registers @@ -233,7 +229,14 @@ pub fn init_vtl_ap(core: u32) -> Result { // has high-canonical mappings, so these are ready to use as-is for the // AP's initial VP context. let rip: u64 = get_entry(); - let rsp = get_address_of_special_page(VTL1_KERNEL_STACK_PAGE) + PAGE_SIZE as u64 - 1; + // SAFETY: We dont support concurrent AP/VTL initialization and thus share + // the same stack pointer. If we plan to support concurrent initialization, + // we should provide seperate stack pointers for each AP (which might not + // scale if there are several 100s of APs). + // + // This RSP is part of `HV_INITIAL_VP_CONTEXT` provided to the Hyper-V + // via `HvCallEnableVpVtl`. It is expected to be 16-byte aligned. + let rsp = get_address_of_special_page(VTL1_KERNEL_STACK_PAGE) + PAGE_SIZE as u64; let tss = get_address_of_special_page(VTL1_TSS_PAGE); let result = hvcall_enable_vp_vtl(core, HV_VTL_SECURE, tss, rip, rsp); diff --git a/litebox_platform_lvbs/src/mshv/mod.rs b/litebox_platform_lvbs/src/mshv/mod.rs index 23ba2139a..0da18875d 100644 --- a/litebox_platform_lvbs/src/mshv/mod.rs +++ b/litebox_platform_lvbs/src/mshv/mod.rs @@ -925,11 +925,11 @@ impl HvPendingExceptionEvent { #[cfg(not(test))] #[inline] pub(crate) fn is_hvcall_ready() -> bool { - use crate::host::per_cpu_variables::with_per_cpu_variables_asm; + use crate::host::per_cpu_variables::with_per_cpu_variables; // The VTL return address is configured only after the hypercall page // has been set up, so a non-zero value indicates that hypercalls are // available. 
- with_per_cpu_variables_asm(|pcv| pcv.get_vtl_return_addr() != 0) + with_per_cpu_variables(|pcv| pcv.asm.get_vtl_return_addr() != 0) } #[cfg(test)] diff --git a/litebox_platform_lvbs/src/mshv/vsm.rs b/litebox_platform_lvbs/src/mshv/vsm.rs index c338e8b1b..b6895167f 100644 --- a/litebox_platform_lvbs/src/mshv/vsm.rs +++ b/litebox_platform_lvbs/src/mshv/vsm.rs @@ -7,12 +7,11 @@ use crate::mshv::mem_integrity::parse_modinfo; use crate::mshv::ringbuffer::set_ringbuffer; use crate::{ - arch::get_core_id, debug_serial_print, debug_serial_println, host::{ bootparam::get_vtl1_memory_info, linux::{CpuMask, KEXEC_SEGMENT_MAX, Kimage}, - per_cpu_variables::with_per_cpu_variables_mut, + per_cpu_variables::with_per_cpu_variables, }, mshv::{ HV_REGISTER_CR_INTERCEPT_CONTROL, HV_REGISTER_CR_INTERCEPT_CR0_MASK, @@ -67,9 +66,9 @@ const MODULE_VALIDATION_MAX_SIZE: usize = 64 * 1024 * 1024; static CPU_ONLINE_MASK: Once> = Once::new(); -pub(crate) fn init() { +pub(crate) fn init(is_bsp: bool) { assert!( - !(get_core_id() == 0 && mshv_vsm_configure_partition().is_err()), + !(is_bsp && mshv_vsm_configure_partition().is_err()), "Failed to configure VSM partition" ); @@ -83,7 +82,7 @@ pub(crate) fn init() { "Failed to secure VTL0 configuration" ); - if get_core_id() == 0 { + if is_bsp { if let Ok((start, size)) = get_vtl1_memory_info() { debug_serial_println!("VSM: Protect GPAs from {:#x} to {:#x}", start, start + size); if protect_physical_memory_range( @@ -1007,14 +1006,18 @@ impl ControlRegMap { #[allow(clippy::unnecessary_wraps)] fn save_vtl0_locked_regs() -> Result { - let reg_names = with_per_cpu_variables_mut(|per_cpu_variables| { - per_cpu_variables.vtl0_locked_regs.init(); - per_cpu_variables.vtl0_locked_regs.reg_names() + let reg_names = with_per_cpu_variables(|per_cpu_variables| { + let mut regs = per_cpu_variables.vtl0_locked_regs.get(); + regs.init(); + per_cpu_variables.vtl0_locked_regs.set(regs); + regs.reg_names() }); for reg_name in reg_names { if let Ok(value) = 
hvcall_get_vp_vtl0_registers(reg_name) { - with_per_cpu_variables_mut(|per_cpu_variables| { - per_cpu_variables.vtl0_locked_regs.set(reg_name, value); + with_per_cpu_variables(|per_cpu_variables| { + let mut regs = per_cpu_variables.vtl0_locked_regs.get(); + regs.set(reg_name, value); + per_cpu_variables.vtl0_locked_regs.set(regs); }); } } diff --git a/litebox_platform_lvbs/src/mshv/vsm_intercept.rs b/litebox_platform_lvbs/src/mshv/vsm_intercept.rs index dc17aaad2..4c98f80b5 100644 --- a/litebox_platform_lvbs/src/mshv/vsm_intercept.rs +++ b/litebox_platform_lvbs/src/mshv/vsm_intercept.rs @@ -3,7 +3,7 @@ use crate::{ debug_serial_println, - host::per_cpu_variables::{with_per_cpu_variables, with_per_cpu_variables_mut}, + host::per_cpu_variables::with_per_cpu_variables, mshv::{ DEFAULT_REG_PIN_MASK, HV_REGISTER_PENDING_EVENT0, HV_X64_REGISTER_APIC_BASE, HV_X64_REGISTER_CR0, HV_X64_REGISTER_CR4, HV_X64_REGISTER_CSTAR, HV_X64_REGISTER_EFER, @@ -55,19 +55,15 @@ pub enum InterceptedRegisterName { /// - Failed to raise VTL0 GP fault /// - Intercepted write to unknown MSR/register pub fn vsm_handle_intercept() { - let simp_page = with_per_cpu_variables_mut(|per_cpu_variables| unsafe { - &mut *per_cpu_variables.hv_simp_page_as_mut_ptr() - }); - - let msg_type = simp_page.sint_message[0].header.message_type; - simp_page.sint_message[0].header.message_type = HvMessageType::None.into(); - let payload = simp_page.sint_message[0].payload; + // Extract the intercept message from the SIMP page and clear it, + // all within the `with_per_cpu_variables` scope. 
+ let msg = with_per_cpu_variables(|pcv| pcv.take_sint_message(0)); - match HvMessageType::try_from(msg_type).unwrap() { + match HvMessageType::try_from(msg.header.message_type).unwrap() { HvMessageType::GpaIntercept => { let int_msg = unsafe { - let ptr = payload.as_ptr().cast::(); - &(*ptr) as &HvMemInterceptMessage + let ptr = core::ptr::addr_of!(msg.payload).cast::(); + &*ptr }; let gpa = int_msg.gpa; @@ -76,8 +72,8 @@ pub fn vsm_handle_intercept() { } HvMessageType::MsrIntercept => { let int_msg = unsafe { - let ptr = payload.as_ptr().cast::(); - &(*ptr) as &HvMsrInterceptMessage + let ptr = core::ptr::addr_of!(msg.payload).cast::(); + &*ptr }; let msr_index = int_msg.msr; @@ -110,8 +106,8 @@ pub fn vsm_handle_intercept() { } HvMessageType::RegisterIntercept => { let int_msg = unsafe { - let ptr = payload.as_ptr().cast::(); - &(*ptr) as &HvInterceptMessage + let ptr = core::ptr::addr_of!(msg.payload).cast::(); + &*ptr }; let reg_name = int_msg.reg_name; @@ -141,6 +137,7 @@ pub fn vsm_handle_intercept() { } } _ => { + let msg_type = msg.header.message_type; debug_serial_println!( "VSM: Ignore unknown synthetic interrupt message type {msg_type:#x}" ); @@ -176,7 +173,7 @@ fn validate_and_continue_vtl0_register_write( int_msg_hdr: &HvInterceptMessageHeader, ) { let allowed_value = with_per_cpu_variables(|per_cpu_variables| { - per_cpu_variables.vtl0_locked_regs.get(reg_name) + per_cpu_variables.vtl0_locked_regs.get().get(reg_name) }); if let Some(allowed_value) = allowed_value { if value & mask == allowed_value { diff --git a/litebox_platform_lvbs/src/mshv/vtl_switch.rs b/litebox_platform_lvbs/src/mshv/vtl_switch.rs index cb437d12a..b707e1f64 100644 --- a/litebox_platform_lvbs/src/mshv/vtl_switch.rs +++ b/litebox_platform_lvbs/src/mshv/vtl_switch.rs @@ -5,10 +5,7 @@ use crate::host::{ hv_hypercall_page_address, - per_cpu_variables::{ - PerCpuVariables, PerCpuVariablesAsm, with_per_cpu_variables, with_per_cpu_variables_asm, - with_per_cpu_variables_mut, - }, + 
per_cpu_variables::{PerCpuVariables, PerCpuVariablesAsm, with_per_cpu_variables}, }; use crate::mshv::{ HV_FLUSH_EX_VP_SET_BANKS, HV_REGISTER_VSM_CODEPAGE_OFFSETS, HvRegisterVsmCodePageOffsets, @@ -99,13 +96,13 @@ static VTL1_VP_MASK: AtomicVpMask = AtomicVpMask::new(); /// Mark the current VP as executing in VTL1. #[inline] fn vtl1_vp_enter() { - VTL1_VP_MASK.set(with_per_cpu_variables_mut(PerCpuVariables::vp_index) as usize); + VTL1_VP_MASK.set(with_per_cpu_variables(PerCpuVariables::vp_index) as usize); } /// Remove the current VP from the VTL1 mask (it is returning to VTL0). #[inline] fn vtl1_vp_exit() { - VTL1_VP_MASK.clear(with_per_cpu_variables_mut(PerCpuVariables::vp_index) as usize); + VTL1_VP_MASK.clear(with_per_cpu_variables(PerCpuVariables::vp_index) as usize); } /// Return the current VTL1 VP mask for use in TLB flush hypercalls. @@ -128,7 +125,7 @@ pub(crate) fn vtl1_vp_mask() -> [u64; HV_FLUSH_EX_VP_SET_BANKS] { #[cfg(not(test))] #[inline] pub(crate) fn is_only_vp_in_vtl1() -> bool { - VTL1_VP_MASK.is_single_vp(with_per_cpu_variables_mut(PerCpuVariables::vp_index)) + VTL1_VP_MASK.is_single_vp(with_per_cpu_variables(PerCpuVariables::vp_index)) } // ============================================================================ @@ -365,8 +362,8 @@ fn handle_vtl_entry() -> Option<[u64; NUM_VTLCALL_PARAMS]> { /// Returns `None` if the entry reason is not a valid `VtlEntryReason`. #[inline] fn get_vtl_entry_reason() -> Option { - let reason = with_per_cpu_variables(|per_cpu_variables| unsafe { - (*per_cpu_variables.hv_vp_assist_page_as_ptr()).vtl_entry_reason + let reason = with_per_cpu_variables(|per_cpu_variables| { + per_cpu_variables.with_vp_assist_page(|page| page.vtl_entry_reason) }); VtlEntryReason::try_from(reason).ok() } @@ -374,13 +371,15 @@ fn get_vtl_entry_reason() -> Option { /// Get the VTL call parameters from the saved VTL0 state. 
#[inline] fn get_vtlcall_params() -> [u64; NUM_VTLCALL_PARAMS] { - with_per_cpu_variables(|per_cpu_variables| per_cpu_variables.vtl0_state.get_vtlcall_params()) + with_per_cpu_variables(|per_cpu_variables| { + per_cpu_variables.vtl0_state.get().get_vtlcall_params() + }) } /// Set the VTL return value that will be returned to VTL0. #[inline] fn set_vtl_return_value(value: i64) { - with_per_cpu_variables_mut(|per_cpu_variables| { + with_per_cpu_variables(|per_cpu_variables| { per_cpu_variables.set_vtl_return_value(value.reinterpret_as_unsigned()); }); } @@ -402,8 +401,8 @@ pub(crate) fn mshv_vsm_get_code_page_offsets() -> Result<(), VsmError> { let vtl_return_address = hvcall_page .checked_add(usize::from(code_page_offsets.vtl_return_offset())) .ok_or(VsmError::CodePageOffsetOverflow)?; - with_per_cpu_variables_asm(|pcv_asm| { - pcv_asm.set_vtl_return_addr(vtl_return_address); + with_per_cpu_variables(|pcv| { + pcv.asm.set_vtl_return_addr(vtl_return_address); }); Ok(()) } @@ -480,7 +479,7 @@ pub fn vtl_switch(return_value: Option) -> [u64; NUM_VTLCALL_PARAMS] { // one buffer at a time. At this point, the CPU's tracking might rely on VTL0's // buffer (if VTL0 called XRSTOR). Thus, we shouldn't use XSAVEOPT until XRSTOR // re-establishes tracking for VTL1's buffer. 
- with_per_cpu_variables_asm(PerCpuVariablesAsm::reset_vtl1_xsaved); + with_per_cpu_variables(|pcv| pcv.asm.reset_vtl1_xsaved()); return params; } diff --git a/litebox_runner_lvbs/src/lib.rs b/litebox_runner_lvbs/src/lib.rs index 3c9a08acc..ed6302601 100644 --- a/litebox_runner_lvbs/src/lib.rs +++ b/litebox_runner_lvbs/src/lib.rs @@ -20,9 +20,9 @@ use litebox_common_optee::{ OpteeSmcReturnCode, TeeOrigin, TeeResult, UteeEntryFunc, UteeParams, optee_msg_args_total_size, }; use litebox_platform_lvbs::{ - arch::{gdt, get_core_id, instrs::hlt_loop, interrupts}, + arch::{gdt, instrs::hlt_loop, interrupts}, debug_serial_println, - host::{bootparam::get_vtl1_memory_info, per_cpu_variables::allocate_per_cpu_variables}, + host::{bootparam::get_vtl1_memory_info, per_cpu_variables}, mm::MemoryProvider, mshv::{ NUM_VTLCALL_PARAMS, VsmFunction, hvcall, @@ -32,8 +32,9 @@ use litebox_platform_lvbs::{ vtl1_mem_layout::{ VSM_SK_PTE_PAGES_COUNT, VTL1_INIT_HEAP_SIZE, VTL1_INIT_HEAP_START_PAGE, VTL1_PML4E_PAGE, VTL1_PRE_POPULATED_MEMORY_SIZE, VTL1_PTE_0_PAGE, VTL1_REMAP_PDE_PAGE, - VTL1_REMAP_PDPT_PAGE, get_heap_start_address, get_rela_end_address, - get_rela_start_address, get_text_end_address, get_text_start_address, + VTL1_REMAP_PDPT_PAGE, get_heap_start_address, get_memory_base_address, + get_rela_end_address, get_rela_start_address, get_text_end_address, + get_text_start_address, }, }, serial_println, @@ -49,152 +50,162 @@ use litebox_shim_optee::{NormalWorldConstPtr, NormalWorldMutPtr, UserConstPtr}; use once_cell::race::OnceBox; use spin::mutex::SpinMutex; +/// Seed the initial heap regions so the global allocator has enough memory +/// for slab-backed allocations (the slab needs >= 2 MB backing pages). 
+pub fn seed_initial_heap() { + let vtl1_base_va = get_memory_base_address(); + let vtl1_start = Platform::va_to_pa(x86_64::VirtAddr::new(vtl1_base_va)); + + let mem_fill_start = + TruncateExt::::truncate(vtl1_base_va) + VTL1_INIT_HEAP_START_PAGE * PAGE_SIZE; + unsafe { + Platform::mem_fill_pages(mem_fill_start, VTL1_INIT_HEAP_SIZE); + } + debug_serial_println!( + "heap: seed init region (pages {}..+{:#x}): VA {:#x}, size {:#x}", + VTL1_INIT_HEAP_START_PAGE, + VTL1_INIT_HEAP_SIZE, + mem_fill_start, + VTL1_INIT_HEAP_SIZE + ); + + // Add pre-populated region (_heap_start .. end of Phase 1 mapping). + let heap_va = get_heap_start_address(); + let mem_fill_start: usize = heap_va.truncate(); + let heap_phys = Platform::va_to_pa(x86_64::VirtAddr::new(heap_va)).as_u64(); + let heap_offset: usize = TruncateExt::::truncate(heap_phys - vtl1_start.as_u64()); + let mem_fill_size = VTL1_PRE_POPULATED_MEMORY_SIZE - heap_offset; + unsafe { + Platform::mem_fill_pages(mem_fill_start, mem_fill_size); + } + debug_serial_println!( + "heap: add pre-populated region (_heap_start..Phase 1 end): VA {:#x}, size {:#x}", + mem_fill_start, + mem_fill_size + ); +} + +/// Initialize the current core. +/// +/// When `is_bsp` is `true`, creates the platform, sets up page tables, and +/// reclaims early memory. +/// All cores then initialize hypercalls, GDT, IDT, interrupts, and syscall +/// support. +/// /// # Panics /// -/// Panics if it failed to enable Hyper-V hypercall -pub fn init() -> Option<&'static Platform> { - let mut ret: Option<&'static Platform> = None; - - if get_core_id() == 0 { - if let Ok((start, size)) = get_vtl1_memory_info() { - let vtl1_start = x86_64::PhysAddr::new(start); - let vtl1_end = x86_64::PhysAddr::new(start + size); - - // Add a small range of mapped memory to the global allocator for populating the base page table. - // `VTL1_INIT_HEAP_START_PAGE` and `VTL1_INIT_HEP_SIZE` specify a physical address range which is - // not used by the VTL1 kernel. 
- let mem_fill_start = - TruncateExt::::truncate(Platform::pa_to_va(vtl1_start).as_u64()) - + VTL1_INIT_HEAP_START_PAGE * PAGE_SIZE; - let mem_fill_size = VTL1_INIT_HEAP_SIZE; +/// Panics if VTL1 memory info is unavailable (BSP) or if hypercall +/// initialization fails. +pub fn init(is_bsp: bool) -> Option<&'static Platform> { + let ret = if is_bsp { + let (start, size) = get_vtl1_memory_info().expect("Failed to get memory info"); + let vtl1_start = x86_64::PhysAddr::new(start); + let vtl1_end = x86_64::PhysAddr::new(start + size); + + // Re-compute the pre-populated region bounds needed for the + // remaining-memory add after `Platform::new()` below. + let heap_va = get_heap_start_address(); + let mem_fill_start: usize = heap_va.truncate(); + let heap_phys = Platform::va_to_pa(x86_64::VirtAddr::new(heap_va)).as_u64(); + let heap_offset: usize = TruncateExt::::truncate(heap_phys - start); + let mem_fill_size = VTL1_PRE_POPULATED_MEMORY_SIZE - heap_offset; + + // Text section boundaries. These are used by the platform to mark + // code pages executable and everything else NO_EXECUTE (DEP). + // After two-phase relocation, linker symbols return + // high-canonical VAs; convert to PA for the page table mapper. + let text_phys_start = Platform::va_to_pa(x86_64::VirtAddr::new(get_text_start_address())); + let text_phys_end = Platform::va_to_pa(x86_64::VirtAddr::new(get_text_end_address())); + + // Reclaim .rela.dyn section memory now that relocations have been applied + // and we are running at high-canonical addresses. + // After two-phase relocation, `get_rela_start/end_address()` return + // high-canonical VAs. Use directly for the allocator. 
+ let rela_va = get_rela_start_address(); + let rela_size: usize = (get_rela_end_address() - rela_va).truncate(); + if rela_size > 0 { + let rela_virt: usize = rela_va.truncate(); unsafe { - Platform::mem_fill_pages(mem_fill_start, mem_fill_size); + Platform::mem_fill_pages(rela_virt, rela_size); } debug_serial_println!( - "heap: seed init region (pages {}..+{:#x}): VA {:#x}, size {:#x}", - VTL1_INIT_HEAP_START_PAGE, - mem_fill_size, - mem_fill_start, - mem_fill_size + "heap: reclaim .rela.dyn section: VA {:#x}, size {:#x}", + rela_virt, + rela_size ); + } - // Add remaining mapped but non-used memory pages (between `get_heap_start_address()` and - // the end of the Phase 1 high-canonical mapping) to the global allocator. - // - // Phase 1 maps `VTL1_REMAP_PTE_COUNT * 2 MiB` = 16 MiB of high-canonical - // memory, which equals the full pre-populated region. We must NOT hand - // out addresses beyond that boundary because they are unmapped until - // `Platform::new()` builds the base page table covering all 128 MiB. - // The full VTL1 range is added after `Platform::new()` completes. - // - // After two-phase relocation, `get_heap_start_address()` returns a - // high-canonical VA. Use it directly for the allocator. - let heap_va = get_heap_start_address(); - let mem_fill_start: usize = heap_va.truncate(); - let heap_phys = Platform::va_to_pa(x86_64::VirtAddr::new(heap_va)).as_u64(); - let heap_offset: usize = TruncateExt::::truncate(heap_phys - start); - let mem_fill_size = VTL1_PRE_POPULATED_MEMORY_SIZE - heap_offset; + let platform = Platform::new(vtl1_start, vtl1_end, text_phys_start, text_phys_end); + litebox_platform_multiplex::set_platform(platform); + + // Reclaim Phase 1 / VTL0 page table frames now that Platform::new() + // has loaded a fresh base page table covering all VTL1 memory. + // These physical pages are no longer referenced by CR3. 
+ { + // Reclaim pages 2–12 (PML4, PDPT, PDE, 8 PTE pages) + let early_pt_pa = vtl1_start + (VTL1_PML4E_PAGE * PAGE_SIZE) as u64; + let early_pt_start: usize = + TruncateExt::::truncate(Platform::pa_to_va(early_pt_pa).as_u64()); + let early_pt_size: usize = + (VTL1_PTE_0_PAGE + VSM_SK_PTE_PAGES_COUNT - VTL1_PML4E_PAGE) * PAGE_SIZE; + // Safety: the early page table frames are no longer referenced + // (CR3 now points to the Phase 2 base page table). unsafe { - Platform::mem_fill_pages(mem_fill_start, mem_fill_size); + Platform::mem_fill_pages(early_pt_start, early_pt_size); } debug_serial_println!( - "heap: add pre-populated region (_heap_start..Phase 1 end): VA {:#x}, size {:#x}", - mem_fill_start, - mem_fill_size + "heap: reclaim early page table frames (pages {}..{}): VA {:#x}, size {:#x}", + VTL1_PML4E_PAGE, + VTL1_PML4E_PAGE + (early_pt_size / PAGE_SIZE), + early_pt_start, + early_pt_size ); - // Text section boundaries. These are used by the platform to mark - // code pages executable and everything else NO_EXECUTE (DEP). - // After two-phase relocation, linker symbols return - // high-canonical VAs; convert to PA for the page table mapper. - let text_phys_start = - Platform::va_to_pa(x86_64::VirtAddr::new(get_text_start_address())); - let text_phys_end = Platform::va_to_pa(x86_64::VirtAddr::new(get_text_end_address())); - - // Reclaim .rela.dyn section memory now that relocations have been applied - // and we're running at high-canonical addresses. - // After two-phase relocation, `get_rela_start/end_address()` return - // high-canonical VAs. Use directly for the allocator. 
- let rela_va = get_rela_start_address(); - let rela_size: usize = (get_rela_end_address() - rela_va).truncate(); - if rela_size > 0 { - let rela_virt: usize = rela_va.truncate(); - unsafe { - Platform::mem_fill_pages(rela_virt, rela_size); - } - debug_serial_println!( - "heap: reclaim .rela.dyn section: VA {:#x}, size {:#x}", - rela_virt, - rela_size - ); - } + // NOTE: The boot stack page (VTL1_KERNEL_STACK_PAGE) MUST NOT be + // reclaimed here. APs reuse it as their initial RSP when they + // enter VTL1 via `hvcall_enable_vp_vtl`. - let platform = Platform::new(vtl1_start, vtl1_end, text_phys_start, text_phys_end); - ret = Some(platform); - litebox_platform_multiplex::set_platform(platform); - - // Reclaim Phase 1 / VTL0 page table frames now that Platform::new() - // has loaded a fresh base page table covering all VTL1 memory. - // These physical pages are no longer referenced by CR3. - { - // Reclaim pages 2–12 (PML4, PDPT, PDE, 8 PTE pages) - let early_pt_pa = vtl1_start + (VTL1_PML4E_PAGE * PAGE_SIZE) as u64; - let early_pt_start: usize = - TruncateExt::::truncate(Platform::pa_to_va(early_pt_pa).as_u64()); - let early_pt_size: usize = - (VTL1_PTE_0_PAGE + VSM_SK_PTE_PAGES_COUNT - VTL1_PML4E_PAGE) * PAGE_SIZE; - // Safety: the early page table frames are no longer referenced - // (CR3 now points to the Phase 2 base page table). 
- unsafe { - Platform::mem_fill_pages(early_pt_start, early_pt_size); - } - debug_serial_println!( - "heap: reclaim early page table frames (pages {}..{}): VA {:#x}, size {:#x}", - VTL1_PML4E_PAGE, - VTL1_PML4E_PAGE + (early_pt_size / PAGE_SIZE), - early_pt_start, - early_pt_size - ); - - // Reclaim Phase 1 PDPT and PDE pages - let remap_pt_pa = vtl1_start + (VTL1_REMAP_PDPT_PAGE * PAGE_SIZE) as u64; - let remap_pt_start: usize = - TruncateExt::::truncate(Platform::pa_to_va(remap_pt_pa).as_u64()); - let remap_pt_size: usize = - (VTL1_REMAP_PDE_PAGE - VTL1_REMAP_PDPT_PAGE + 1) * PAGE_SIZE; - unsafe { - Platform::mem_fill_pages(remap_pt_start, remap_pt_size); - } - debug_serial_println!( - "heap: reclaim Phase 1 remap PT frames (pages {}..{}): VA {:#x}, size {:#x}", - VTL1_REMAP_PDPT_PAGE, - VTL1_REMAP_PDE_PAGE + 1, - remap_pt_start, - remap_pt_size - ); - } - - // Add the rest of the VTL1 memory to the global allocator once they are mapped to the base page table. - let mem_fill_start = mem_fill_start + mem_fill_size; - let mem_fill_size = TruncateExt::::truncate( - size - (mem_fill_start as u64 - Platform::pa_to_va(vtl1_start).as_u64()), - ); + // Reclaim Phase 1 PDPT and PDE pages + let remap_pt_pa = vtl1_start + (VTL1_REMAP_PDPT_PAGE * PAGE_SIZE) as u64; + let remap_pt_start: usize = + TruncateExt::::truncate(Platform::pa_to_va(remap_pt_pa).as_u64()); + let remap_pt_size: usize = (VTL1_REMAP_PDE_PAGE - VTL1_REMAP_PDPT_PAGE + 1) * PAGE_SIZE; unsafe { - Platform::mem_fill_pages(mem_fill_start, mem_fill_size); + Platform::mem_fill_pages(remap_pt_start, remap_pt_size); } debug_serial_println!( - "heap: add remaining VTL1 memory (post Phase 2): VA {:#x}, size {:#x}", - mem_fill_start, - mem_fill_size + "heap: reclaim Phase 1 remap PT frames (pages {}..{}): VA {:#x}, size {:#x}", + VTL1_REMAP_PDPT_PAGE, + VTL1_REMAP_PDE_PAGE + 1, + remap_pt_start, + remap_pt_size ); + } - allocate_per_cpu_variables(); - } else { - panic!("Failed to get memory info"); + // Add the rest 
of the VTL1 memory to the global allocator once they are mapped to the base page table. + let mem_fill_start = mem_fill_start + mem_fill_size; + let mem_fill_size = TruncateExt::::truncate( + size - (mem_fill_start as u64 - Platform::pa_to_va(vtl1_start).as_u64()), + ); + unsafe { + Platform::mem_fill_pages(mem_fill_start, mem_fill_size); } - } + debug_serial_println!( + "heap: add remaining VTL1 memory (post Phase 2): VA {:#x}, size {:#x}", + mem_fill_start, + mem_fill_size + ); + + Some(platform) + } else { + None + }; + + // Allocate XSAVE areas now that we are on the kernel stack (the CPUID + // queries and aligned-vec allocations need a lot of stack space). + per_cpu_variables::allocate_xsave_area(); - if let Err(e) = hvcall::init() { + if let Err(e) = hvcall::init(is_bsp) { panic!("Err: {:?}", e); } gdt::init(); diff --git a/litebox_runner_lvbs/src/main.rs b/litebox_runner_lvbs/src/main.rs index 6fdb9590b..ea13e4226 100644 --- a/litebox_runner_lvbs/src/main.rs +++ b/litebox_runner_lvbs/src/main.rs @@ -7,12 +7,12 @@ use core::arch::{asm, naked_asm}; use litebox_platform_lvbs::{ - arch::{ - enable_extended_states, enable_fsgsbase, enable_smep_smap, get_core_id, instrs::hlt_loop, - }, + arch::{enable_extended_states, enable_fsgsbase, enable_smep_smap, instrs::hlt_loop}, host::{ bootparam::parse_boot_info, - per_cpu_variables::{PerCpuVariablesAsm, init_per_cpu_variables}, + per_cpu_variables::{ + PerCpuVariablesAsm, allocate_per_cpu_variables, init_per_cpu_variables, + }, }, mshv::vtl1_mem_layout::{self, VTL1_REMAP_PDE_PAGE, VTL1_REMAP_PDPT_PAGE}, serial_println, @@ -296,44 +296,61 @@ unsafe fn remap_to_high_canonical() -> ! { /// Trampoline executed at the high-canonical address after Phase 1 remap. /// /// Adjusts RSP from low-canonical (PA-based) to high-canonical, re-applies -/// ELF relocations for the final link address, and tail-jumps to `_ap_start`. 
+/// ELF relocations for the final link address, and tail-jumps to +/// `common_start` with `is_bsp = true`. #[unsafe(naked)] unsafe extern "C" fn high_canonical_trampoline() -> ! { // 1. Adjust RSP from low-canonical (PA-based) to high-canonical. // 2. Phase 1b: Re-apply ELF relocations so every GOT slot now points to // high-canonical VAs (addend + memory_base + KERNEL_OFFSET). - // 3. Tail-jump to _ap_start (common BSP + AP entry point). + // 3. Set edi = 1 (is_bsp = true) and tail-jump to common_start. naked_asm!( "mov rax, {offset}", "add rsp, rax", "and rsp, -16", "call {apply_reloc}", - "jmp {ap_start}", + "mov edi, 1", + "jmp {common_start}", offset = const KERNEL_OFFSET, apply_reloc = sym apply_relocations, - ap_start = sym _ap_start, + common_start = sym common_start, ); } -/// Common entry point for all cores after high-canonical page table setup. -/// -/// - **BSP**: reached via `high_canonical_trampoline()` after Phase 1 remap + re-relocation -/// - **APs**: entered directly by Hyper-V via `hvcall_enable_vp_vtl` (the VP -/// context's RIP is set to this symbol). APs inherit the BSP's CR3 (Phase 2 -/// page table with full 128 MiB mapped), so they already run at high-canonical -/// VAs and need no remap. +/// AP entry point: Entered directly by Hyper-V via `hvcall_enable_vp_vtl` +/// (the VP context's RIP is set to this symbol). APs inherit the BSP's CR3, +/// so they already run at high-canonical VAs and need no remap. #[expect(clippy::missing_safety_doc)] #[unsafe(no_mangle)] pub unsafe extern "C" fn _ap_start() -> ! { + unsafe { common_start(false) } +} + +/// Shared boot path for BSP and AP cores. +/// +/// When `is_bsp` is `true`, seeds the initial heap. +unsafe extern "C" fn common_start(is_bsp: bool) -> ! { enable_fsgsbase(); enable_extended_states(); + + if is_bsp { + litebox_runner_lvbs::seed_initial_heap(); + } + + // Each core heap-allocates its own PerCpuVariables and sets GSBASE + // to point at it (assembly fields are at GS offset 0). 
+ allocate_per_cpu_variables(); + init_per_cpu_variables(); + // Switch to the kernel stack and tail-call kernel_main with is_bsp + let is_bsp_u32 = u32::from(is_bsp); unsafe { asm!( "mov rsp, gs:[{kernel_sp_off}]", "call {kernel_main}", kernel_sp_off = const { PerCpuVariablesAsm::kernel_stack_ptr_offset() }, + in("edi") is_bsp_u32, kernel_main = sym kernel_main ); } @@ -360,9 +377,8 @@ pub unsafe extern "C" fn _start() -> ! { } } -unsafe extern "C" fn kernel_main() -> ! { - let core_id = get_core_id(); - if core_id == 0 { +unsafe extern "C" fn kernel_main(is_bsp: bool) -> ! { + if is_bsp { serial_println!("=============================="); serial_println!(" Hello from LiteBox for LVBS! "); serial_println!("=============================="); @@ -370,7 +386,7 @@ unsafe extern "C" fn kernel_main() -> ! { parse_boot_info(); } - let platform = litebox_runner_lvbs::init(); + let platform = litebox_runner_lvbs::init(is_bsp); enable_smep_smap(); From c42157c5bcbe4733edf2d12f413bf94cbaf50540 Mon Sep 17 00:00:00 2001 From: Sangho Lee Date: Mon, 2 Mar 2026 20:53:22 +0000 Subject: [PATCH 2/3] spin lock AP boot stack --- dev_tests/src/ratchet.rs | 2 +- litebox_platform_lvbs/src/mshv/hvcall_vp.rs | 9 ++-- litebox_runner_lvbs/src/main.rs | 52 +++++++++++++++++++-- 3 files changed, 54 insertions(+), 9 deletions(-) diff --git a/dev_tests/src/ratchet.rs b/dev_tests/src/ratchet.rs index f9882a793..6d7788be3 100644 --- a/dev_tests/src/ratchet.rs +++ b/dev_tests/src/ratchet.rs @@ -41,7 +41,7 @@ fn ratchet_globals() -> Result<()> { ("litebox_platform_multiplex/", 1), ("litebox_platform_windows_userland/", 7), ("litebox_runner_linux_userland/", 1), - ("litebox_runner_lvbs/", 4), + ("litebox_runner_lvbs/", 5), ("litebox_runner_snp/", 1), ("litebox_shim_linux/", 1), ("litebox_shim_optee/", 3), diff --git a/litebox_platform_lvbs/src/mshv/hvcall_vp.rs b/litebox_platform_lvbs/src/mshv/hvcall_vp.rs index 6f3474b2f..4dc03aadc 100644 --- a/litebox_platform_lvbs/src/mshv/hvcall_vp.rs +++ 
b/litebox_platform_lvbs/src/mshv/hvcall_vp.rs @@ -229,12 +229,11 @@ pub fn init_vtl_ap(core: u32) -> Result { // has high-canonical mappings, so these are ready to use as-is for the // AP's initial VP context. let rip: u64 = get_entry(); - // SAFETY: We dont support concurrent AP/VTL initialization and thus share - // the same stack pointer. If we plan to support concurrent initialization, - // we should provide seperate stack pointers for each AP (which might not - // scale if there are several 100s of APs). + // All APs share this single boot stack. `_ap_start` spin-acquires + // `AP_BOOT_STACK_LOCK` before touching the stack and releases it after + // switching to a per-CPU kernel stack, so concurrent AP entry is safe. // - // This RSP is part of `HV_INITIAL_VP_CONTEXT` provided to the Hyper-V + // This RSP is part of `HV_INITIAL_VP_CONTEXT` provided to Hyper-V // via `HvCallEnableVpVtl`. It is expected to be 16-byte aligned. let rsp = get_address_of_special_page(VTL1_KERNEL_STACK_PAGE) + PAGE_SIZE as u64; let tss = get_address_of_special_page(VTL1_TSS_PAGE); diff --git a/litebox_runner_lvbs/src/main.rs b/litebox_runner_lvbs/src/main.rs index ea13e4226..d6a393e68 100644 --- a/litebox_runner_lvbs/src/main.rs +++ b/litebox_runner_lvbs/src/main.rs @@ -6,6 +6,7 @@ #![no_main] use core::arch::{asm, naked_asm}; +use core::sync::atomic::{AtomicBool, Ordering}; use litebox_platform_lvbs::{ arch::{enable_extended_states, enable_fsgsbase, enable_smep_smap, instrs::hlt_loop}, host::{ @@ -20,6 +21,21 @@ use litebox_platform_lvbs::{ use x86_64::VirtAddr; use x86_64::structures::paging::PageTableFlags; +/// Spinlock protecting the shared AP boot stack (`VTL1_KERNEL_STACK_PAGE`). +/// +/// All APs receive the same initial RSP via `hvcall_enable_vp_vtl`. VTL0 +/// controls when APs enter VTL1, so multiple APs may start concurrently. 
+/// Each AP spin-acquires this lock before touching the boot stack, and +/// releases it after switching to its own heap-allocated per-CPU kernel stack. +static AP_BOOT_STACK_LOCK: AtomicBool = AtomicBool::new(false); + +/// Release the AP boot stack spinlock. +/// +/// Called after the current core has switched RSP to its per-CPU kernel stack. +extern "C" fn release_boot_stack_lock() { + AP_BOOT_STACK_LOCK.store(false, Ordering::Release); +} + /// ELF64 relocation entry #[repr(C)] struct Elf64Rela { @@ -320,10 +336,32 @@ unsafe extern "C" fn high_canonical_trampoline() -> ! { /// AP entry point: Entered directly by Hyper-V via `hvcall_enable_vp_vtl` /// (the VP context's RIP is set to this symbol). APs inherit the BSP's CR3, /// so they already run at high-canonical VAs and need no remap. -#[expect(clippy::missing_safety_doc)] +/// +/// # Safety +/// +/// Must only be used as the initial RIP for an AP's VP context. +#[unsafe(naked)] #[unsafe(no_mangle)] pub unsafe extern "C" fn _ap_start() -> ! { - unsafe { common_start(false) } + naked_asm!( + // Spin-acquire the AP boot stack lock entirely in registers. + // No stack usage is permitted until the lock is held, because + // another AP may still be running on this same stack. + "lea rcx, [rip + {lock}]", + "2:", + "mov al, 1", + "xchg byte ptr [rcx], al", + "test al, al", + "jz 3f", + "pause", + "jmp 2b", + "3:", + // This AP has acquired the lock and exclusively owns the boot stack. + "xor edi, edi", // is_bsp = false + "jmp {common_start}", + lock = sym AP_BOOT_STACK_LOCK, + common_start = sym common_start, + ); } /// Shared boot path for BSP and AP cores. @@ -347,11 +385,19 @@ unsafe extern "C" fn common_start(is_bsp: bool) -> ! { let is_bsp_u32 = u32::from(is_bsp); unsafe { asm!( + // Now use this core's heap-allocated kernel stack. "mov rsp, gs:[{kernel_sp_off}]", + // The boot stack is no longer in use. Release the AP boot stack + // spinlock so the next AP can proceed. 
For the BSP this is a + // harmless no-op (the lock was never held). + "push rdi", + "call {release_lock}", + "pop rdi", "call {kernel_main}", kernel_sp_off = const { PerCpuVariablesAsm::kernel_stack_ptr_offset() }, in("edi") is_bsp_u32, - kernel_main = sym kernel_main + release_lock = sym release_boot_stack_lock, + kernel_main = sym kernel_main, ); } From 320aa44239a4403486944be09ed6f96e541e2022 Mon Sep 17 00:00:00 2001 From: Sangho Lee Date: Mon, 2 Mar 2026 21:42:24 +0000 Subject: [PATCH 3/3] Revert "spin lock AP boot stack" This reverts commit c42157c5bcbe4733edf2d12f413bf94cbaf50540. --- dev_tests/src/ratchet.rs | 2 +- litebox_platform_lvbs/src/mshv/hvcall_vp.rs | 9 ++-- litebox_runner_lvbs/src/main.rs | 52 ++------------------- 3 files changed, 9 insertions(+), 54 deletions(-) diff --git a/dev_tests/src/ratchet.rs b/dev_tests/src/ratchet.rs index 6d7788be3..f9882a793 100644 --- a/dev_tests/src/ratchet.rs +++ b/dev_tests/src/ratchet.rs @@ -41,7 +41,7 @@ fn ratchet_globals() -> Result<()> { ("litebox_platform_multiplex/", 1), ("litebox_platform_windows_userland/", 7), ("litebox_runner_linux_userland/", 1), - ("litebox_runner_lvbs/", 5), + ("litebox_runner_lvbs/", 4), ("litebox_runner_snp/", 1), ("litebox_shim_linux/", 1), ("litebox_shim_optee/", 3), diff --git a/litebox_platform_lvbs/src/mshv/hvcall_vp.rs b/litebox_platform_lvbs/src/mshv/hvcall_vp.rs index 4dc03aadc..6f3474b2f 100644 --- a/litebox_platform_lvbs/src/mshv/hvcall_vp.rs +++ b/litebox_platform_lvbs/src/mshv/hvcall_vp.rs @@ -229,11 +229,12 @@ pub fn init_vtl_ap(core: u32) -> Result { // has high-canonical mappings, so these are ready to use as-is for the // AP's initial VP context. let rip: u64 = get_entry(); - // All APs share this single boot stack. `_ap_start` spin-acquires - // `AP_BOOT_STACK_LOCK` before touching the stack and releases it after - // switching to a per-CPU kernel stack, so concurrent AP entry is safe. 
+ // SAFETY: We dont support concurrent AP/VTL initialization and thus share + // the same stack pointer. If we plan to support concurrent initialization, + // we should provide seperate stack pointers for each AP (which might not + // scale if there are several 100s of APs). // - // This RSP is part of `HV_INITIAL_VP_CONTEXT` provided to Hyper-V + // This RSP is part of `HV_INITIAL_VP_CONTEXT` provided to the Hyper-V // via `HvCallEnableVpVtl`. It is expected to be 16-byte aligned. let rsp = get_address_of_special_page(VTL1_KERNEL_STACK_PAGE) + PAGE_SIZE as u64; let tss = get_address_of_special_page(VTL1_TSS_PAGE); diff --git a/litebox_runner_lvbs/src/main.rs b/litebox_runner_lvbs/src/main.rs index d6a393e68..ea13e4226 100644 --- a/litebox_runner_lvbs/src/main.rs +++ b/litebox_runner_lvbs/src/main.rs @@ -6,7 +6,6 @@ #![no_main] use core::arch::{asm, naked_asm}; -use core::sync::atomic::{AtomicBool, Ordering}; use litebox_platform_lvbs::{ arch::{enable_extended_states, enable_fsgsbase, enable_smep_smap, instrs::hlt_loop}, host::{ @@ -21,21 +20,6 @@ use litebox_platform_lvbs::{ use x86_64::VirtAddr; use x86_64::structures::paging::PageTableFlags; -/// Spinlock protecting the shared AP boot stack (`VTL1_KERNEL_STACK_PAGE`). -/// -/// All APs receive the same initial RSP via `hvcall_enable_vp_vtl`. VTL0 -/// controls when APs enter VTL1, so multiple APs may start concurrently. -/// Each AP spin-acquires this lock before touching the boot stack, and -/// releases it after switching to its own heap-allocated per-CPU kernel stack. -static AP_BOOT_STACK_LOCK: AtomicBool = AtomicBool::new(false); - -/// Release the AP boot stack spinlock. -/// -/// Called after the current core has switched RSP to its per-CPU kernel stack. -extern "C" fn release_boot_stack_lock() { - AP_BOOT_STACK_LOCK.store(false, Ordering::Release); -} - /// ELF64 relocation entry #[repr(C)] struct Elf64Rela { @@ -336,32 +320,10 @@ unsafe extern "C" fn high_canonical_trampoline() -> ! 
{ /// AP entry point: Entered directly by Hyper-V via `hvcall_enable_vp_vtl` /// (the VP context's RIP is set to this symbol). APs inherit the BSP's CR3, /// so they already run at high-canonical VAs and need no remap. -/// -/// # Safety -/// -/// Must only be used as the initial RIP for an AP's VP context. -#[unsafe(naked)] +#[expect(clippy::missing_safety_doc)] #[unsafe(no_mangle)] pub unsafe extern "C" fn _ap_start() -> ! { - naked_asm!( - // Spin-acquire the AP boot stack lock entirely in registers. - // No stack usage is permitted until the lock is held, because - // another AP may still be running on this same stack. - "lea rcx, [rip + {lock}]", - "2:", - "mov al, 1", - "xchg byte ptr [rcx], al", - "test al, al", - "jz 3f", - "pause", - "jmp 2b", - "3:", - // This AP has acquired the lock and exclusively owns the boot stack. - "xor edi, edi", // is_bsp = false - "jmp {common_start}", - lock = sym AP_BOOT_STACK_LOCK, - common_start = sym common_start, - ); + unsafe { common_start(false) } } /// Shared boot path for BSP and AP cores. @@ -385,19 +347,11 @@ unsafe extern "C" fn common_start(is_bsp: bool) -> ! { let is_bsp_u32 = u32::from(is_bsp); unsafe { asm!( - // Now use this core's heap-allocated kernel stack. "mov rsp, gs:[{kernel_sp_off}]", - // The boot stack is no longer in use. Release the AP boot stack - // spinlock so the next AP can proceed. For the BSP this is a - // harmless no-op (the lock was never held). - "push rdi", - "call {release_lock}", - "pop rdi", "call {kernel_main}", kernel_sp_off = const { PerCpuVariablesAsm::kernel_stack_ptr_offset() }, in("edi") is_bsp_u32, - release_lock = sym release_boot_stack_lock, - kernel_main = sym kernel_main, + kernel_main = sym kernel_main ); }