From 126c8a9281ffa5eb3d94ff137dcc3f09866df268 Mon Sep 17 00:00:00 2001
From: Sangho Lee <sanghle@microsoft.com>
Date: Fri, 27 Feb 2026 22:48:27 +0000
Subject: [PATCH 1/3] Refactor PerCpuVariables

---
 dev_tests/src/ratchet.rs                      |   2 +-
 litebox_platform_lvbs/src/arch/x86/gdt.rs     |  14 +-
 litebox_platform_lvbs/src/host/lvbs_impl.rs   |  10 +-
 .../src/host/per_cpu_variables.rs             | 462 +++++++-----------
 litebox_platform_lvbs/src/lib.rs              |   4 +-
 litebox_platform_lvbs/src/mshv/hvcall.rs      |  14 +-
 litebox_platform_lvbs/src/mshv/hvcall_mm.rs   | 192 ++++----
 litebox_platform_lvbs/src/mshv/hvcall_vp.rs   | 101 ++--
 litebox_platform_lvbs/src/mshv/mod.rs         |   4 +-
 litebox_platform_lvbs/src/mshv/vsm.rs         |  23 +-
 .../src/mshv/vsm_intercept.rs                 |  29 +-
 litebox_platform_lvbs/src/mshv/vtl_switch.rs  |  27 +-
 litebox_runner_lvbs/src/lib.rs                | 271 +++++-----
 litebox_runner_lvbs/src/main.rs               |  54 +-
 14 files changed, 568 insertions(+), 639 deletions(-)

diff --git a/dev_tests/src/ratchet.rs b/dev_tests/src/ratchet.rs
index 916ee857a..f9882a793 100644
--- a/dev_tests/src/ratchet.rs
+++ b/dev_tests/src/ratchet.rs
@@ -37,7 +37,7 @@ fn ratchet_globals() -> Result<()> {
             ("litebox/", 9),
             ("litebox_platform_linux_kernel/", 6),
             ("litebox_platform_linux_userland/", 5),
-            ("litebox_platform_lvbs/", 26),
+            ("litebox_platform_lvbs/", 23),
             ("litebox_platform_multiplex/", 1),
             ("litebox_platform_windows_userland/", 7),
             ("litebox_runner_linux_userland/", 1),
diff --git a/litebox_platform_lvbs/src/arch/x86/gdt.rs b/litebox_platform_lvbs/src/arch/x86/gdt.rs
index e86fcbe9f..ef449c58c 100644
--- a/litebox_platform_lvbs/src/arch/x86/gdt.rs
+++ b/litebox_platform_lvbs/src/arch/x86/gdt.rs
@@ -3,9 +3,7 @@
 
 //! Global Descriptor Table (GDT) and Task State Segment (TSS)
 
-use crate::host::per_cpu_variables::{
-    PerCpuVariablesAsm, with_per_cpu_variables_asm, with_per_cpu_variables_mut,
-};
+use crate::host::per_cpu_variables::with_per_cpu_variables;
 use alloc::boxed::Box;
 use x86_64::{
     PrivilegeLevel, VirtAddr,
@@ -82,10 +80,8 @@ impl Default for GdtWrapper {
 }
 
 fn setup_gdt_tss() {
-    let double_fault_stack_top =
-        with_per_cpu_variables_asm(PerCpuVariablesAsm::get_double_fault_stack_ptr);
-    let exception_stack_top =
-        with_per_cpu_variables_asm(PerCpuVariablesAsm::get_exception_stack_ptr);
+    let double_fault_stack_top = with_per_cpu_variables(|pcv| pcv.asm.get_double_fault_stack_ptr());
+    let exception_stack_top = with_per_cpu_variables(|pcv| pcv.asm.get_exception_stack_ptr());
 
     let mut tss = Box::new(AlignedTss(TaskStateSegment::new()));
     // TSS.IST1: dedicated stack for double faults
@@ -123,8 +119,8 @@ fn setup_gdt_tss() {
         load_tss(gdt.selectors.tss);
     }
 
-    with_per_cpu_variables_mut(|per_cpu_variables| {
-        per_cpu_variables.gdt = Some(gdt);
+    with_per_cpu_variables(|per_cpu_variables| {
+        per_cpu_variables.gdt.set(Some(gdt));
     });
 }
 
diff --git a/litebox_platform_lvbs/src/host/lvbs_impl.rs b/litebox_platform_lvbs/src/host/lvbs_impl.rs
index 36d4b04c3..8bb1f415b 100644
--- a/litebox_platform_lvbs/src/host/lvbs_impl.rs
+++ b/litebox_platform_lvbs/src/host/lvbs_impl.rs
@@ -5,7 +5,7 @@
 
 use crate::{
     Errno, HostInterface, arch::ioport::serial_print_string,
-    host::per_cpu_variables::with_per_cpu_variables_mut,
+    host::per_cpu_variables::with_per_cpu_variables,
 };
 
 pub type LvbsLinuxKernel = crate::LinuxKernel<HostLvbsInterface>;
@@ -87,14 +87,14 @@ impl LvbsLinuxKernel {
 
 unsafe impl litebox::platform::ThreadLocalStorageProvider for LvbsLinuxKernel {
     fn get_thread_local_storage() -> *mut () {
-        let tls = with_per_cpu_variables_mut(|pcv| pcv.tls);
+        let tls = with_per_cpu_variables(|pcv| pcv.tls.get());
         tls.as_mut_ptr::<()>()
     }
 
     unsafe fn replace_thread_local_storage(value: *mut ()) -> *mut () {
-        with_per_cpu_variables_mut(|pcv| {
-            let old = pcv.tls;
-            pcv.tls = x86_64::VirtAddr::new(value as u64);
+        with_per_cpu_variables(|pcv| {
+            let old = pcv.tls.get();
+            pcv.tls.set(x86_64::VirtAddr::new(value as u64));
             old.as_u64() as *mut ()
         })
     }
diff --git a/litebox_platform_lvbs/src/host/per_cpu_variables.rs b/litebox_platform_lvbs/src/host/per_cpu_variables.rs
index e9d873dec..b093623cb 100644
--- a/litebox_platform_lvbs/src/host/per_cpu_variables.rs
+++ b/litebox_platform_lvbs/src/host/per_cpu_variables.rs
@@ -4,18 +4,15 @@
 //! Per-CPU VTL1 kernel variables
 
 use crate::{
-    arch::{MAX_CORES, gdt, get_core_id, instrs::rdmsr},
-    host::bootparam::get_num_possible_cpus,
+    arch::{MAX_CORES, gdt, instrs::rdmsr},
     mshv::{
-        HV_REGISTER_VP_INDEX, HvMessagePage, HvVpAssistPage,
-        vsm::{ControlRegMap, NUM_CONTROL_REGS},
-        vtl_switch::VtlState,
-        vtl1_mem_layout::PAGE_SIZE,
+        HV_REGISTER_VP_INDEX, HvMessage, HvMessagePage, HvVpAssistPage, vsm::ControlRegMap,
+        vtl_switch::VtlState, vtl1_mem_layout::PAGE_SIZE,
     },
 };
 use aligned_vec::avec;
 use alloc::boxed::Box;
-use core::cell::{Cell, RefCell};
+use core::cell::{Cell, UnsafeCell};
 use core::mem::offset_of;
 use litebox::utils::TruncateExt;
 use litebox_common_linux::{rdgsbase, wrgsbase};
@@ -26,25 +23,47 @@ pub const EXCEPTION_STACK_SIZE: usize = PAGE_SIZE;
 pub const KERNEL_STACK_SIZE: usize = 10 * PAGE_SIZE;
 
 /// Per-CPU VTL1 kernel variables
-#[repr(align(4096))]
-#[derive(Clone, Copy)]
+#[repr(C, align(4096))]
 pub struct PerCpuVariables {
-    hv_vp_assist_page: [u8; PAGE_SIZE],
-    hv_simp_page: [u8; PAGE_SIZE],
+    /// Assembly-accessible fields at GS offset 0 (`gs:[offset]` in inline asm).
+    ///
+    /// All fields use `Cell<T>` for interior mutability, so they can be accessed
+    /// through `&PerCpuVariables` without requiring `&mut`.
+    pub(crate) asm: PerCpuVariablesAsm,
     double_fault_stack: [u8; DOUBLE_FAULT_STACK_SIZE],
     _guard_page_0: [u8; PAGE_SIZE],
     exception_stack: [u8; EXCEPTION_STACK_SIZE],
     kernel_stack: [u8; KERNEL_STACK_SIZE],
     _guard_page_1: [u8; PAGE_SIZE],
-    hvcall_input: [u8; PAGE_SIZE],
-    hvcall_output: [u8; PAGE_SIZE],
-    pub vtl0_state: VtlState,
-    pub vtl0_locked_regs: ControlRegMap,
-    pub gdt: Option<&'static gdt::GdtWrapper>,
-    pub tls: VirtAddr,
-    pub vp_index: u32,
+    /// The four pages below are shared with the hypervisor and must be
+    /// page-aligned. `UnsafeCell` provides interior mutability because the
+    /// hypervisor reads and writes them outside of Rust's aliasing guarantees.
+    hv_vp_assist_page: UnsafeCell<[u8; PAGE_SIZE]>,
+    hv_simp_page: UnsafeCell<[u8; PAGE_SIZE]>,
+    hvcall_input: UnsafeCell<[u8; PAGE_SIZE]>,
+    hvcall_output: UnsafeCell<[u8; PAGE_SIZE]>,
+    /// VTL0 general-purpose register state, saved/restored by assembly
+    /// (`SAVE_VTL_STATE_ASM`/`LOAD_VTL_STATE_ASM`) via raw pushes/pops to
+    /// the address cached in `PerCpuVariablesAsm::vtl0_state_top_addr`.
+    /// Rust code accesses it only between save and load (i.e., while VTL1
+    /// is executing), so there is no data race with the assembly.
+    pub(crate) vtl0_state: Cell<VtlState>,
+    pub(crate) vtl0_locked_regs: Cell<ControlRegMap>,
+    pub(crate) gdt: Cell<Option<&'static gdt::GdtWrapper>>,
+    pub(crate) tls: Cell<VirtAddr>,
+    /// Cached VP index from the hypervisor. Lazily initialized on first access
+    /// via `rdmsr(HV_REGISTER_VP_INDEX)` and immutable thereafter.
+    /// Uses `u32::MAX` as the "uninitialized" sentinel.
+    vp_index: Cell<u32>,
 }
 
+// These Hyper-V pages must be page-aligned.
+// These compile-time assertions guard against layout regressions.
+const _: () = assert!(offset_of!(PerCpuVariables, hv_vp_assist_page) % PAGE_SIZE == 0);
+const _: () = assert!(offset_of!(PerCpuVariables, hv_simp_page) % PAGE_SIZE == 0);
+const _: () = assert!(offset_of!(PerCpuVariables, hvcall_input) % PAGE_SIZE == 0);
+const _: () = assert!(offset_of!(PerCpuVariables, hvcall_output) % PAGE_SIZE == 0);
+
 impl PerCpuVariables {
     const XSAVE_ALIGNMENT: usize = 64; // XSAVE and XRSTORE require a 64-byte aligned buffer
     pub const VTL1_XSAVE_MASK: u64 = 0b11; // let XSAVE and XRSTORE deal with x87 and SSE states
@@ -63,32 +82,77 @@ impl PerCpuVariables {
         &raw const self.exception_stack as u64 + (self.exception_stack.len() - 1) as u64
     }
 
-    pub fn hv_vp_assist_page_as_ptr(&self) -> *const HvVpAssistPage {
-        (&raw const self.hv_vp_assist_page).cast::<HvVpAssistPage>()
-    }
-
     pub(crate) fn hv_vp_assist_page_as_u64(&self) -> u64 {
-        &raw const self.hv_vp_assist_page as u64
-    }
-
-    pub(crate) fn hv_simp_page_as_mut_ptr(&mut self) -> *mut HvMessagePage {
-        (&raw mut self.hv_simp_page).cast::<HvMessagePage>()
+        self.hv_vp_assist_page.get() as u64
     }
 
     pub(crate) fn hv_simp_page_as_u64(&self) -> u64 {
-        &raw const self.hv_simp_page as u64
+        self.hv_simp_page.get() as u64
     }
 
-    pub(crate) fn hv_hypercall_input_page_as_mut_ptr(&mut self) -> *mut [u8; PAGE_SIZE] {
-        &raw mut self.hvcall_input
+    /// Take the pending SynIC message from SIMP slot `sint_index`.
+    ///
+    /// Returns a copy of the message and clears the slot's `message_type`
+    /// to `HvMessageTypeNone`, signaling the hypervisor that the slot is
+    /// free for reuse.
+    ///
+    /// This is safe because the SynIC protocol guarantees the hypervisor
+    /// will not overwrite a slot whose `message_type` is non-zero. By
+    /// reading first and clearing last, no concurrent write is possible.
+    pub(crate) fn take_sint_message(&self, sint_index: usize) -> HvMessage {
+        // SAFETY: interior mutability via `UnsafeCell`. The SynIC protocol
+        // ensures the hypervisor does not concurrently write to this slot
+        // while `message_type != HvMessageTypeNone`.
+        let simp_page = unsafe { &mut *self.hv_simp_page.get().cast::<HvMessagePage>() };
+        let msg = simp_page.sint_message[sint_index];
+        simp_page.sint_message[sint_index].header.message_type = 0; // HvMessageTypeNone
+        msg
+    }
+
+    /// Run a closure with a shared reference to the VP assist page.
+    ///
+    /// The hypervisor writes to this page *before* entering VTL1 (e.g.,
+    /// `vtl_entry_reason`), so nothing modifies it while the closure runs.
+    pub(crate) fn with_vp_assist_page<R>(&self, f: impl FnOnce(&HvVpAssistPage) -> R) -> R {
+        // SAFETY: interior mutability via `UnsafeCell`. The hypervisor
+        // finishes writing before VTL1 entry, so no concurrent write is
+        // possible while this reference exists.
+        f(unsafe { &*self.hv_vp_assist_page.get().cast::<HvVpAssistPage>() })
     }
 
-    pub(crate) fn hv_hypercall_output_page_as_mut_ptr(&mut self) -> *mut [u8; PAGE_SIZE] {
-        &raw mut self.hvcall_output
+    /// Run a closure with a mutable reference to the hypercall input page,
+    /// reinterpreted as `T`.
+    ///
+    /// **Not re-entrant**: the closure must not call back into this method,
+    /// as that would create aliasing mutable references to the same page.
+    pub(crate) fn with_hvcall_input<T, R>(&self, f: impl FnOnce(&mut T) -> R) -> R {
+        const { assert!(core::mem::size_of::<T>() <= PAGE_SIZE) };
+        const { assert!(core::mem::align_of::<T>() <= PAGE_SIZE) };
+        // SAFETY: interior mutability via `UnsafeCell`; the `&mut T` is
+        // confined to this closure. The page is page-aligned (4096), which
+        // satisfies any T with align_of::<T>() <= PAGE_SIZE.
+        f(unsafe { &mut *self.hvcall_input.get().cast::<T>() })
+    }
+
+    /// Run a closure with a mutable reference to the hypercall output page,
+    /// reinterpreted as `T`.
+    ///
+    /// **Not re-entrant**: the closure must not call back into this method,
+    /// as that would create aliasing mutable references to the same page.
+    pub(crate) fn with_hvcall_output<T, R>(&self, f: impl FnOnce(&mut T) -> R) -> R {
+        const { assert!(core::mem::size_of::<T>() <= PAGE_SIZE) };
+        const { assert!(core::mem::align_of::<T>() <= PAGE_SIZE) };
+        // SAFETY: interior mutability via `UnsafeCell`; the `&mut T` is
+        // confined to this closure. The page is page-aligned (4096), which
+        // satisfies any T with align_of::<T>() <= PAGE_SIZE.
+        // The hypervisor synchronously writes to this page during the hypercall.
+        f(unsafe { &mut *self.hvcall_output.get().cast::<T>() })
     }
 
-    pub fn set_vtl_return_value(&mut self, value: u64) {
-        self.vtl0_state.r8 = value; // LVBS uses R8 to return a value from VTL1 to VTL0
+    pub fn set_vtl_return_value(&self, value: u64) {
+        let mut state = self.vtl0_state.get();
+        state.r8 = value; // LVBS uses R8 to return a value from VTL1 to VTL0
+        self.vtl0_state.set(state);
     }
 
     /// Return the cached Hyper-V VP index for this core (which never changes during
@@ -96,21 +160,24 @@ impl PerCpuVariables {
     ///
     /// # Panics
     /// Panics if the VP index returned by the hypervisor is ≥ `MAX_CORES`.
-    pub fn vp_index(&mut self) -> u32 {
-        if self.vp_index == u32::MAX {
+    pub fn vp_index(&self) -> u32 {
+        let idx = self.vp_index.get();
+        if idx == u32::MAX {
             let vp_index: u32 = rdmsr(HV_REGISTER_VP_INDEX).truncate();
             assert!(
                 vp_index < u32::try_from(MAX_CORES).unwrap(),
                 "VP index {vp_index} exceeds the configured processor mask"
             );
-            self.vp_index = vp_index;
+            self.vp_index.set(vp_index);
+            vp_index
+        } else {
+            idx
         }
-        self.vp_index
     }
 
     /// Return kernel code, user code, and user data segment selectors
     pub(crate) fn get_segment_selectors(&self) -> Option<(u16, u16, u16)> {
-        self.gdt.map(gdt::GdtWrapper::get_segment_selectors)
+        self.gdt.get().map(gdt::GdtWrapper::get_segment_selectors)
     }
 
     /// Allocate XSAVE areas for saving/restoring the extended states of each core.
@@ -166,45 +233,9 @@ impl PerCpuVariables {
     }
 }
 
-/// per-CPU variables for core 0 (or BSP). This must use static memory because kernel heap is not ready.
-static mut BSP_VARIABLES: PerCpuVariables = PerCpuVariables {
-    hv_vp_assist_page: [0u8; PAGE_SIZE],
-    hv_simp_page: [0u8; PAGE_SIZE],
-    double_fault_stack: [0u8; DOUBLE_FAULT_STACK_SIZE],
-    _guard_page_0: [0u8; PAGE_SIZE],
-    exception_stack: [0u8; EXCEPTION_STACK_SIZE],
-    kernel_stack: [0u8; KERNEL_STACK_SIZE],
-    _guard_page_1: [0u8; PAGE_SIZE],
-    hvcall_input: [0u8; PAGE_SIZE],
-    hvcall_output: [0u8; PAGE_SIZE],
-    vtl0_state: VtlState {
-        rbp: 0,
-        rax: 0,
-        rbx: 0,
-        rcx: 0,
-        rdx: 0,
-        rsi: 0,
-        rdi: 0,
-        r8: 0,
-        r9: 0,
-        r10: 0,
-        r11: 0,
-        r12: 0,
-        r13: 0,
-        r14: 0,
-        r15: 0,
-    },
-    vtl0_locked_regs: ControlRegMap {
-        entries: [(0, 0); NUM_CONTROL_REGS],
-    },
-    gdt: const { None },
-    tls: VirtAddr::zero(),
-    vp_index: u32::MAX,
-};
-
-/// Specify the layout of PerCpuVariables for Assembly area.
+/// Assembly-accessible per-CPU fields at the start of [`PerCpuVariables`].
 ///
-/// Unlike `litebox_platform_linux_userland`, this kernel platform does't rely on
+/// Unlike `litebox_platform_linux_userland`, this kernel platform does not rely on
 /// the `tbss` section to specify FS/GS offsets for per CPU variables because
 /// there is no ELF loader that will set up it.
 ///
@@ -213,10 +244,11 @@ static mut BSP_VARIABLES: PerCpuVariables = PerCpuVariables {
 /// mode transitions (i.e., ring transitions through iretq/syscall) unlike userland
 /// platforms.
 ///
-/// TODO: Consider unifying with `PerCpuVariables` if possible.
+/// Page-aligned (`align(4096)`) so that the following fields in
+/// [`PerCpuVariables`] (HV pages, stacks, etc.) remain page-aligned.
 #[non_exhaustive]
 #[cfg(target_arch = "x86_64")]
-#[repr(C)]
+#[repr(C, align(4096))]
 #[derive(Clone)]
 pub struct PerCpuVariablesAsm {
     /// Initial kernel stack pointer to reset the kernel stack on VTL switch
@@ -380,219 +412,89 @@ impl PerCpuVariablesAsm {
     }
 }
 
-/// Wrapper struct to maintain `RefCell` along with `PerCpuVariablesAsm`.
-/// This struct allows assembly code to read/write some PerCpuVariables area via the GS register (e.g., to
-/// save/restore RIP/RSP). Currently, `PerCpuVariables` is protected by `RefCell` such that
-/// assembly code cannot easily access it.
-///
-/// TODO: Let's consider whether we should maintain these two types of Per CPU variable areas (for Rust and
-/// assembly, respectively). This design secures Rust-side access to `PerCpuVariables` with `RefCell`,
-/// but it might be unnecessarily complex. Instead, we could use assembly code in all cases, but
-/// this might be unsafe.
-#[repr(C)]
-pub struct RefCellWrapper<T> {
-    /// Make some PerCpuVariablesAsm area be accessible via the GS register. This is mainly for assembly code
-    pcv_asm: PerCpuVariablesAsm,
-    /// RefCell which will be stored in the GS register
-    inner: RefCell<T>,
-}
-impl<T> RefCellWrapper<T> {
-    pub const fn new(value: T) -> Self {
-        Self {
-            pcv_asm: PerCpuVariablesAsm {
-                kernel_stack_ptr: Cell::new(0),
-                double_fault_stack_ptr: Cell::new(0),
-                exception_stack_ptr: Cell::new(0),
-                vtl_return_addr: Cell::new(0),
-                scratch: Cell::new(0),
-                vtl0_state_top_addr: Cell::new(0),
-                cur_kernel_stack_ptr: Cell::new(0),
-                cur_kernel_base_ptr: Cell::new(0),
-                user_context_top_addr: Cell::new(0),
-                vtl0_xsave_area_addr: Cell::new(0),
-                vtl0_xsave_mask_lo: Cell::new(0),
-                vtl0_xsave_mask_hi: Cell::new(0),
-                vtl1_kernel_xsave_area_addr: Cell::new(0),
-                vtl1_user_xsave_area_addr: Cell::new(0),
-                vtl1_xsave_mask_lo: Cell::new(0),
-                vtl1_xsave_mask_hi: Cell::new(0),
-                vtl1_kernel_xsaved: Cell::new(0),
-                vtl1_user_xsaved: Cell::new(0),
-                exception_trapno: Cell::new(0),
-            },
-            inner: RefCell::new(value),
-        }
-    }
-    pub fn get_refcell(&self) -> &RefCell<T> {
-        &self.inner
-    }
-}
-
-/// Store the addresses of per-CPU variables. The kernel threads are expected to access
-/// the corresponding per-CPU variables via the GS registers which will store the addresses later.
-/// Instead of maintaining this map, we might be able to use a hypercall to directly program each core's GS register.
-static mut PER_CPU_VARIABLE_ADDRESSES: [RefCellWrapper<*mut PerCpuVariables>; MAX_CORES] =
-    [const { RefCellWrapper::new(core::ptr::null_mut()) }; MAX_CORES];
-static mut PER_CPU_VARIABLE_ADDRESSES_IDX: usize = 0;
-
-/// Execute a closure with a reference to the current core's per-CPU variables.
+/// Execute a closure with a shared reference to the current core's per-CPU variables.
 ///
 /// # Safety
-/// This function assumes the following:
-/// - The GSBASE register values of individual cores must be properly set (i.e., they must be different).
-/// - `get_core_id()` must return distinct APIC IDs for different cores.
-///
-/// If we cannot guarantee these assumptions, this function may result in unsafe or undefined behaviors.
+/// The GSBASE register must point to a valid, heap-allocated `PerCpuVariables`
+/// (set by [`allocate_per_cpu_variables`]). Each core must have a distinct
+/// GSBASE value.
 ///
 /// # Panics
-/// Panics if GSBASE is not set, it contains a non-canonical address, or no per-CPU variables are allocated.
-/// Panics if this function is recursively called (`BorrowMutError`).
+/// Panics if GSBASE is not set or contains a non-canonical address.
 pub fn with_per_cpu_variables<F, R>(f: F) -> R
 where
     F: FnOnce(&PerCpuVariables) -> R,
     R: Sized + 'static,
 {
-    let Some(refcell) = get_or_init_refcell_of_per_cpu_variables() else {
-        panic!("No per-CPU variables are allocated");
-    };
-    let borrow = refcell.borrow();
-    let per_cpu_variables = unsafe { &**borrow };
-
-    f(per_cpu_variables)
+    let ptr = get_per_cpu_variables_ptr();
+    // Safety: GSBASE points at this core's leaked `PerCpuVariables` (see the
+    // `# Safety` contract above), and no other core can access it.
+    let pcv = unsafe { &*ptr };
+    f(pcv)
 }
 
-/// Execute a closure with a mutable reference to the current core's per-CPU variables.
-///
-/// # Safety
-/// This function assumes the following:
-/// - The GSBASE register values of individual cores must be properly set (i.e., they must be different).
-/// - `get_core_id()` must return distinct APIC IDs for different cores.
-///
-/// If we cannot guarantee these assumptions, this function may result in unsafe or undefined behaviors.
+/// Get a raw pointer to the current core's `PerCpuVariables` from GSBASE.
 ///
 /// # Panics
-/// Panics if GSBASE is not set, it contains a non-canonical address, or no per-CPU variables are allocated.
-/// Panics if this function is recursively called (`BorrowMutError`).
-pub fn with_per_cpu_variables_mut<F, R>(f: F) -> R
-where
-    F: FnOnce(&mut PerCpuVariables) -> R,
-    R: Sized + 'static,
-{
-    let Some(refcell) = get_or_init_refcell_of_per_cpu_variables() else {
-        panic!("No per-CPU variables are allocated");
-    };
-    let mut borrow = refcell.borrow_mut();
-    let per_cpu_variables = unsafe { &mut **borrow };
-
-    f(per_cpu_variables)
+/// Panics if GSBASE is zero or non-canonical.
+fn get_per_cpu_variables_ptr() -> *mut PerCpuVariables {
+    let gsbase = unsafe { rdgsbase() };
+    assert!(
+        gsbase != 0,
+        "GSBASE not set. Call allocate_per_cpu_variables() first"
+    );
+    let _ = VirtAddr::try_new(gsbase as u64).expect("GS contains a non-canonical address");
+    gsbase as *mut PerCpuVariables
 }
 
-/// Execute a closure with a reference to the current PerCpuVariablesAsm.
+/// Heap-allocate this core's per-CPU variables and set GSBASE to point at them.
+///
+/// Every core (BSP and AP) calls this exactly once during its boot path,
+/// **before** [`init_per_cpu_variables`].
+///
+/// GSBASE will point directly at the `PerCpuVariables` struct, so assembly
+/// code can access the `asm` field at GS offset 0 (guaranteed by `#[repr(C)]`).
+///
+/// The caller must have already:
+///   1. Enabled FSGSBASE (`enable_fsgsbase()`).
+///   2. Enabled extended CPU states (`enable_extended_states()`).
+///   3. (BSP only) Seeded the global heap (`seed_initial_heap()`).
 ///
 /// # Panics
-/// Panics if GSBASE is not set or it contains a non-canonical address.
-pub fn with_per_cpu_variables_asm<F, R>(f: F) -> R
-where
-    F: FnOnce(&PerCpuVariablesAsm) -> R,
-    R: Sized + 'static,
-{
-    let pcv_asm_addr = unsafe {
-        let gsbase = rdgsbase();
-        let addr = VirtAddr::try_new(gsbase as u64).expect("GS contains a non-canonical address");
-        addr.as_ptr::<RefCellWrapper<*mut PerCpuVariables>>()
-            .cast::<PerCpuVariablesAsm>()
+/// Panics if the heap allocation fails.
+pub fn allocate_per_cpu_variables() {
+    let mut per_cpu_variables = Box::<PerCpuVariables>::new_uninit();
+    // Safety: `PerCpuVariables` is too large for the stack, so we zero-init
+    // via `write_bytes` then fix up the `vp_index` sentinel. Zero is valid
+    // for all other field types:
+    // - `[u8; N]`, `VtlState`, `ControlRegMap`, `VirtAddr`: zero is their default.
+    // - `Option<&T>`: null is `None`. `Cell`/`UnsafeCell`: `#[repr(transparent)]`.
+    let per_cpu_variables = unsafe {
+        let ptr = per_cpu_variables.as_mut_ptr();
+        ptr.write_bytes(0, 1);
+        // Set the "uninitialized" sentinel for vp_index (0 is a valid VP index).
+        core::ptr::addr_of_mut!((*ptr).vp_index).write(Cell::new(u32::MAX));
+        per_cpu_variables.assume_init()
     };
-    let pcv_asm = unsafe { &*pcv_asm_addr };
-
-    f(pcv_asm)
-}
 
-/// Get or initialize a `RefCell` that contains a pointer to the current core's per-CPU variables.
-/// This `RefCell` is expected to be stored in the GS register.
-fn get_or_init_refcell_of_per_cpu_variables() -> Option<&'static RefCell<*mut PerCpuVariables>> {
-    let gsbase = unsafe { rdgsbase() };
-    if gsbase == 0 {
-        let core_id = get_core_id();
-        let refcell_wrapper = if core_id == 0 {
-            let addr = &raw mut BSP_VARIABLES;
-            unsafe {
-                PER_CPU_VARIABLE_ADDRESSES[0] = RefCellWrapper::new(addr);
-                &PER_CPU_VARIABLE_ADDRESSES[0]
-            }
-        } else {
-            assert!(
-                unsafe { PER_CPU_VARIABLE_ADDRESSES_IDX < MAX_CORES },
-                "PER_CPU_VARIABLE_ADDRESSES_IDX exceeds MAX_CORES",
-            );
-            unsafe { &PER_CPU_VARIABLE_ADDRESSES[PER_CPU_VARIABLE_ADDRESSES_IDX] }
-        };
-        unsafe {
-            PER_CPU_VARIABLE_ADDRESSES_IDX += 1;
-        }
-        let refcell = refcell_wrapper.get_refcell();
-        if refcell.borrow().is_null() {
-            None
-        } else {
-            let addr = x86_64::VirtAddr::new(&raw const *refcell_wrapper as u64);
-            unsafe {
-                wrgsbase(addr.as_u64().truncate());
-            }
-            Some(refcell)
-        }
-    } else {
-        let addr =
-            x86_64::VirtAddr::try_new(gsbase as u64).expect("GS contains a non-canonical address");
-        let refcell_wrapper = unsafe { &*addr.as_ptr::<RefCellWrapper<*mut PerCpuVariables>>() };
-        let refcell = refcell_wrapper.get_refcell();
-        if refcell.borrow().is_null() {
-            None
-        } else {
-            Some(refcell)
-        }
+    // Leak the box so it lives for the core's lifetime.
+    let pcv = Box::leak(per_cpu_variables);
+    let addr = &raw const *pcv as u64;
+    unsafe {
+        wrgsbase(addr.truncate());
     }
 }
 
-/// Allocate per-CPU variables in heap for all possible cores. We expect that the BSP will call
-/// this function to allocate per-CPU variables for other APs because our per-CPU variables are
-/// huge such that each AP without a proper stack cannot allocate its own per-CPU variables.
-/// # Panics
-/// Panics if the number of possible CPUs exceeds `MAX_CORES`
-pub fn allocate_per_cpu_variables() {
-    let num_cores =
-        usize::try_from(get_num_possible_cpus().expect("Failed to get number of possible CPUs"))
-            .unwrap();
-    assert!(
-        num_cores <= MAX_CORES,
-        "# of possible CPUs ({num_cores}) exceeds MAX_CORES",
-    );
-
-    // Allocate xsave area for BSP (core 0)
-    with_per_cpu_variables_asm(|pcv_asm| {
-        PerCpuVariables::allocate_xsave_area(pcv_asm);
+/// Allocate XSAVE areas for the current core.
+///
+/// Must be called **after** [`allocate_per_cpu_variables`] (so GSBASE is
+/// set) and **after** switching to the kernel stack. The CPUID queries and
+/// `avec!` allocations inside `PerCpuVariables::allocate_xsave_area` use
+/// significant stack space that exceeds the 4 KiB boot stack.
+pub fn allocate_xsave_area() {
+    with_per_cpu_variables(|pcv| {
+        PerCpuVariables::allocate_xsave_area(&pcv.asm);
     });
-
-    // TODO: use `cpu_online_mask` to selectively allocate per-CPU variables only for online CPUs.
-    // Note. `PER_CPU_VARIABLE_ADDRESSES[0]` is expected to be already initialized to point to
-    // `BSP_VARIABLES` before calling this function by `get_or_init_refcell_of_per_cpu_variables()`.
-    #[allow(clippy::needless_range_loop)]
-    for i in 1..num_cores {
-        let mut per_cpu_variables = Box::<PerCpuVariables>::new_uninit();
-        // Safety: `PerCpuVariables` is too large for the stack, so we zero-init
-        // via `write_bytes` then fix up `vp_index` to the `u32::MAX` sentinel
-        // before calling `assume_init`.
-        let per_cpu_variables = unsafe {
-            let ptr = per_cpu_variables.as_mut_ptr();
-            ptr.write_bytes(0, 1);
-            (*ptr).vp_index = u32::MAX;
-            per_cpu_variables.assume_init()
-        };
-        unsafe {
-            PER_CPU_VARIABLE_ADDRESSES[i] = RefCellWrapper::new(Box::into_raw(per_cpu_variables));
-            // Allocate xsave area for this core, writing directly to its PerCpuVariablesAsm
-            let pcv_asm = &PER_CPU_VARIABLE_ADDRESSES[i].pcv_asm;
-            PerCpuVariables::allocate_xsave_area(pcv_asm);
-        }
-    }
 }
 
 /// Initialize PerCpuVariable and PerCpuVariableAsm for the current core.
@@ -604,7 +506,7 @@ pub fn allocate_per_cpu_variables() {
 /// Panics if the per-CPU variables are not properly initialized.
 pub fn init_per_cpu_variables() {
     const STACK_ALIGNMENT: usize = 16;
-    with_per_cpu_variables_mut(|per_cpu_variables| {
+    with_per_cpu_variables(|per_cpu_variables| {
         let kernel_sp = TruncateExt::<usize>::truncate(per_cpu_variables.kernel_stack_top())
             & !(STACK_ALIGNMENT - 1);
         let double_fault_sp =
@@ -612,15 +514,23 @@ pub fn init_per_cpu_variables() {
                 & !(STACK_ALIGNMENT - 1);
         let exception_sp = TruncateExt::<usize>::truncate(per_cpu_variables.exception_stack_top())
             & !(STACK_ALIGNMENT - 1);
+        // `Cell<VtlState>` is `#[repr(transparent)]`, so its address equals
+        // the inner `VtlState`'s address. Assembly code (`SAVE_VTL_STATE_ASM`
+        // / `LOAD_VTL_STATE_ASM`) pushes/pops registers directly to/from this
+        // address. This is sound because the assembly executes outside any
+        // Rust reference scope and the Cell is only accessed in Rust between
+        // the save and load points (i.e., while VTL1 is executing).
         let vtl0_state_top_addr =
             TruncateExt::<usize>::truncate(&raw const per_cpu_variables.vtl0_state as u64)
                 + core::mem::size_of::<VtlState>();
-        with_per_cpu_variables_asm(|pcv_asm| {
-            pcv_asm.set_kernel_stack_ptr(kernel_sp);
-            pcv_asm.set_double_fault_stack_ptr(double_fault_sp);
-            pcv_asm.set_exception_stack_ptr(exception_sp);
-            pcv_asm.set_vtl0_state_top_addr(vtl0_state_top_addr);
-        });
+        per_cpu_variables.asm.set_kernel_stack_ptr(kernel_sp);
+        per_cpu_variables
+            .asm
+            .set_double_fault_stack_ptr(double_fault_sp);
+        per_cpu_variables.asm.set_exception_stack_ptr(exception_sp);
+        per_cpu_variables
+            .asm
+            .set_vtl0_state_top_addr(vtl0_state_top_addr);
     });
 }
 
diff --git a/litebox_platform_lvbs/src/lib.rs b/litebox_platform_lvbs/src/lib.rs
index 043e50a3d..b06073400 100644
--- a/litebox_platform_lvbs/src/lib.rs
+++ b/litebox_platform_lvbs/src/lib.rs
@@ -2063,10 +2063,10 @@ unsafe extern "C" fn exception_handler(
             kernel_mode: true,
         }
     } else {
-        use crate::host::per_cpu_variables::{PerCpuVariablesAsm, with_per_cpu_variables_asm};
+        use crate::host::per_cpu_variables::with_per_cpu_variables;
         use litebox::utils::TruncateExt as _;
         litebox::shim::ExceptionInfo {
-            exception: with_per_cpu_variables_asm(PerCpuVariablesAsm::get_exception),
+            exception: with_per_cpu_variables(|pcv| pcv.asm.get_exception()),
             error_code: thread_ctx.ctx.orig_rax.truncate(),
             cr2,
             kernel_mode: false,
diff --git a/litebox_platform_lvbs/src/mshv/hvcall.rs b/litebox_platform_lvbs/src/mshv/hvcall.rs
index bbe14f9a4..7e328a676 100644
--- a/litebox_platform_lvbs/src/mshv/hvcall.rs
+++ b/litebox_platform_lvbs/src/mshv/hvcall.rs
@@ -4,10 +4,7 @@
 //! Hyper-V Hypercall functions
 
 use crate::{
-    arch::{
-        get_core_id,
-        instrs::{rdmsr, wrmsr},
-    },
+    arch::instrs::{rdmsr, wrmsr},
     debug_serial_println,
     host::{LvbsLinuxKernel, hv_hypercall_page_address, per_cpu_variables::with_per_cpu_variables},
     mm::MemoryProvider,
@@ -75,10 +72,11 @@ fn check_hyperv() -> Result<(), HypervError> {
 }
 
 /// Enable Hyper-V Hypercalls by initializing MSR and VP registers (for a core)
+///
 /// # Panics
 /// Panics if the underlying hardware/platform is not Hyper-V
 /// Panics if the MSR/VP registers writes fail
-pub fn init() -> Result<(), HypervError> {
+pub fn init(is_bsp: bool) -> Result<(), HypervError> {
     check_hyperv()?;
 
     debug_serial_println!("HV_REGISTER_VP_INDEX: {:#x}", rdmsr(HV_REGISTER_VP_INDEX));
@@ -113,7 +111,7 @@ pub fn init() -> Result<(), HypervError> {
     if guest_id != rdmsr(HV_X64_MSR_GUEST_OS_ID) {
         return Err(HypervError::InvalidGuestOSID);
     }
-    if get_core_id() == 0 {
+    if is_bsp {
         debug_serial_println!(
             "HV_X64_MSR_GUEST_OS_ID: {:#x}",
             rdmsr(HV_X64_MSR_GUEST_OS_ID)
@@ -156,13 +154,13 @@ pub fn init() -> Result<(), HypervError> {
     sint.set_auto_eoi(true);
 
     wrmsr(HV_X64_MSR_SINT0, sint.as_uint64());
-    if get_core_id() == 0 {
+    if is_bsp {
         debug_serial_println!("HV_X64_MSR_SINT0: {:#x}", rdmsr(HV_X64_MSR_SINT0));
     }
 
     wrmsr(HV_X64_MSR_SCONTROL, u64::from(HV_X64_MSR_SCONTROL_ENABLE));
 
-    vsm::init();
+    vsm::init(is_bsp);
 
     Ok(())
 }
diff --git a/litebox_platform_lvbs/src/mshv/hvcall_mm.rs b/litebox_platform_lvbs/src/mshv/hvcall_mm.rs
index 65746ed05..c02344109 100644
--- a/litebox_platform_lvbs/src/mshv/hvcall_mm.rs
+++ b/litebox_platform_lvbs/src/mshv/hvcall_mm.rs
@@ -12,7 +12,7 @@ use crate::mshv::{
     vtl_switch::{is_only_vp_in_vtl1, vtl1_vp_mask},
 };
 use crate::{
-    host::per_cpu_variables::with_per_cpu_variables_mut,
+    host::per_cpu_variables::with_per_cpu_variables,
     mshv::{
         HV_PARTITION_ID_SELF, HVCALL_MODIFY_VTL_PROTECTION_MASK, HvInputModifyVtlProtectionMask,
         HvInputVtl, HvPageProtFlags,
@@ -20,6 +20,7 @@ use crate::{
         vtl1_mem_layout::PAGE_SHIFT,
     },
 };
+use litebox::utils::TruncateExt;
 
 /// Compute the valid-bank bitmask for a sparse VP set
 /// (<https://learn.microsoft.com/en-us/virtualization/hyper-v-on-windows/tlfs/datatypes/hv_vp_set>).
@@ -49,41 +50,39 @@ pub fn hv_modify_vtl_protection_mask(
     num_pages: u64,
     page_access: HvPageProtFlags,
 ) -> Result<u64, HypervCallError> {
-    let hvin = with_per_cpu_variables_mut(|per_cpu_variables| unsafe {
-        &mut *per_cpu_variables
-            .hv_hypercall_input_page_as_mut_ptr()
-            .cast::<HvInputModifyVtlProtectionMask>()
-    });
-    *hvin = HvInputModifyVtlProtectionMask::new();
-
-    hvin.partition_id = HV_PARTITION_ID_SELF;
-    hvin.target_vtl = HvInputVtl::current();
-    hvin.map_flags = u32::from(page_access.bits());
-
-    let mut total_protected: u64 = 0;
-    while total_protected < num_pages {
-        let mut pages_to_protect: u16 = 0;
-        for i in 0..HvInputModifyVtlProtectionMask::MAX_PAGES_PER_REQUEST {
-            if total_protected + i as u64 >= num_pages {
-                break;
-            } else {
-                hvin.gpa_page_list[i] = (start >> PAGE_SHIFT) + (total_protected + i as u64);
-                pages_to_protect += 1;
+    with_per_cpu_variables(|pcv| {
+        pcv.with_hvcall_input::<HvInputModifyVtlProtectionMask, _>(|hvin| {
+            *hvin = HvInputModifyVtlProtectionMask::new();
+
+            hvin.partition_id = HV_PARTITION_ID_SELF;
+            hvin.target_vtl = HvInputVtl::current();
+            hvin.map_flags = u32::from(page_access.bits());
+
+            let mut total_protected: u64 = 0;
+            while total_protected < num_pages {
+                let remaining: usize = (num_pages - total_protected)
+                    .min(HvInputModifyVtlProtectionMask::MAX_PAGES_PER_REQUEST as u64)
+                    .truncate();
+                let pages_to_protect: u16 = remaining.truncate();
+
+                for i in 0..remaining {
+                    hvin.gpa_page_list[i] = (start >> PAGE_SHIFT) + (total_protected + i as u64);
+                }
+
+                let result = hv_do_rep_hypercall(
+                    HVCALL_MODIFY_VTL_PROTECTION_MASK,
+                    pages_to_protect,
+                    0,
+                    (&raw const *hvin).cast::<core::ffi::c_void>(),
+                    core::ptr::null_mut(),
+                );
+
+                total_protected += result?;
             }
-        }
-
-        let result = hv_do_rep_hypercall(
-            HVCALL_MODIFY_VTL_PROTECTION_MASK,
-            pages_to_protect,
-            0,
-            (&raw const *hvin).cast::<core::ffi::c_void>(),
-            core::ptr::null_mut(),
-        );
 
-        total_protected += result?;
-    }
-
-    Ok(total_protected)
+            Ok(total_protected)
+        })
+    })
 }
 
 /// Flush the entire virtual address space on VPs currently in VTL1.
@@ -110,27 +109,25 @@ pub(crate) fn hv_flush_virtual_address_space() -> Result<(), HypervCallError> {
         "caller is in VTL1 but VP mask is empty"
     );
 
-    let input = with_per_cpu_variables_mut(|pcv| unsafe {
-        &mut *pcv
-            .hv_hypercall_input_page_as_mut_ptr()
-            .cast::<HvInputFlushVirtualAddressSpaceEx>()
-    });
-
-    *input = HvInputFlushVirtualAddressSpaceEx {
-        address_space: 0,
-        flags: HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES,
-        vp_set_format: HV_GENERIC_SET_SPARSE_4K,
-        vp_set_valid_bank_mask: valid_bank_mask,
-        vp_set_bank_contents: vp_mask,
-    };
-
-    hv_do_hypercall(
-        u64::from(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX),
-        (&raw const *input).cast::<core::ffi::c_void>(),
-        core::ptr::null_mut(),
-    )?;
-
-    Ok(())
+    with_per_cpu_variables(|pcv| {
+        pcv.with_hvcall_input::<HvInputFlushVirtualAddressSpaceEx, _>(|input| {
+            *input = HvInputFlushVirtualAddressSpaceEx {
+                address_space: 0,
+                flags: HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES,
+                vp_set_format: HV_GENERIC_SET_SPARSE_4K,
+                vp_set_valid_bank_mask: valid_bank_mask,
+                vp_set_bank_contents: vp_mask,
+            };
+
+            hv_do_hypercall(
+                u64::from(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX),
+                (&raw const *input).cast::<core::ffi::c_void>(),
+                core::ptr::null_mut(),
+            )?;
+
+            Ok(())
+        })
+    })
 }
 
 /// Flush specific virtual addresses on VPs currently in VTL1.
@@ -168,50 +165,49 @@ pub(crate) fn hv_flush_virtual_address_list(
         "caller is in VTL1 but VP mask is empty"
     );
 
-    let input = with_per_cpu_variables_mut(|pcv| unsafe {
-        &mut *pcv
-            .hv_hypercall_input_page_as_mut_ptr()
-            .cast::<HvInputFlushVirtualAddressListEx>()
-    });
-
-    input.address_space = 0;
-    input.flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES;
-    input.vp_set_format = HV_GENERIC_SET_SPARSE_4K;
-    input.vp_set_valid_bank_mask = valid_bank_mask;
-    input.vp_set_bank_contents = vp_mask;
-
-    let mut remaining = page_count;
-    let mut current_va = start_va;
-
-    while remaining > 0 {
-        let mut gva_count: u16 = 0;
-
-        while remaining > 0
-            && (gva_count as usize) < HvInputFlushVirtualAddressListEx::MAX_GVAS_PER_REQUEST
-        {
-            // Each entry can cover up to `MAX_ADDITIONAL_PAGES + 1` pages.
-            let additional = remaining.saturating_sub(1).min(MAX_ADDITIONAL_PAGES);
-            let pages_in_entry = additional + 1;
-
-            // GVA range entry: bits 63:12 = page number, bits 11:0 = additional_pages
-            let page_number = current_va >> 12;
-            input.gva_range_list[gva_count as usize] = (page_number << 12) | additional as u64;
-
-            current_va += (pages_in_entry as u64) << 12;
-            remaining -= pages_in_entry;
-            gva_count += 1;
-        }
-
-        hv_do_rep_hypercall(
-            HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX,
-            gva_count,
-            HvInputFlushVirtualAddressListEx::VP_SET_QWORD_COUNT,
-            (&raw const *input).cast::<core::ffi::c_void>(),
-            core::ptr::null_mut(),
-        )?;
-    }
+    with_per_cpu_variables(|pcv| {
+        pcv.with_hvcall_input::<HvInputFlushVirtualAddressListEx, _>(|input| {
+            input.address_space = 0;
+            input.flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES;
+            input.vp_set_format = HV_GENERIC_SET_SPARSE_4K;
+            input.vp_set_valid_bank_mask = valid_bank_mask;
+            input.vp_set_bank_contents = vp_mask;
+
+            let mut remaining = page_count;
+            let mut current_va = start_va;
+
+            while remaining > 0 {
+                let mut gva_count: u16 = 0;
+
+                while remaining > 0
+                    && (gva_count as usize) < HvInputFlushVirtualAddressListEx::MAX_GVAS_PER_REQUEST
+                {
+                    // Each entry can cover up to `MAX_ADDITIONAL_PAGES + 1` pages.
+                    let additional = remaining.saturating_sub(1).min(MAX_ADDITIONAL_PAGES);
+                    let pages_in_entry = additional + 1;
+
+                    // GVA range entry: bits 63:12 = page number, bits 11:0 = additional_pages
+                    let page_number = current_va >> 12;
+                    input.gva_range_list[gva_count as usize] =
+                        (page_number << 12) | additional as u64;
+
+                    current_va += (pages_in_entry as u64) << 12;
+                    remaining -= pages_in_entry;
+                    gva_count += 1;
+                }
+
+                hv_do_rep_hypercall(
+                    HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX,
+                    gva_count,
+                    HvInputFlushVirtualAddressListEx::VP_SET_QWORD_COUNT,
+                    (&raw const *input).cast::<core::ffi::c_void>(),
+                    core::ptr::null_mut(),
+                )?;
+            }
 
-    Ok(())
+            Ok(())
+        })
+    })
 }
 
 /// Maximum number of additional pages encodable in bits 11:0 of a
diff --git a/litebox_platform_lvbs/src/mshv/hvcall_vp.rs b/litebox_platform_lvbs/src/mshv/hvcall_vp.rs
index ff5643ec3..6f3474b2f 100644
--- a/litebox_platform_lvbs/src/mshv/hvcall_vp.rs
+++ b/litebox_platform_lvbs/src/mshv/hvcall_vp.rs
@@ -9,7 +9,7 @@ use crate::{
         msr::{MSR_EFER, MSR_IA32_CR_PAT},
     },
     debug_serial_println,
-    host::per_cpu_variables::with_per_cpu_variables_mut,
+    host::per_cpu_variables::with_per_cpu_variables,
     mshv::{
         HV_PARTITION_ID_SELF, HV_VP_INDEX_SELF, HV_VTL_NORMAL, HV_VTL_SECURE, HVCALL_ENABLE_VP_VTL,
         HVCALL_GET_VP_REGISTERS, HVCALL_SET_VP_REGISTERS, HvEnableVpVtl, HvGetVpRegistersInput,
@@ -31,26 +31,25 @@ fn hvcall_set_vp_registers_internal(
     value: u64,
     target_vtl: HvInputVtl,
 ) -> Result<u64, HypervCallError> {
-    let hvin = with_per_cpu_variables_mut(|per_cpu_variables| unsafe {
-        &mut *per_cpu_variables
-            .hv_hypercall_input_page_as_mut_ptr()
-            .cast::<HvSetVpRegistersInput>()
-    });
-    *hvin = HvSetVpRegistersInput::new();
-
-    hvin.header.partitionid = HV_PARTITION_ID_SELF;
-    hvin.header.vpindex = HV_VP_INDEX_SELF;
-    hvin.header.target_vtl = target_vtl;
-    hvin.element[0].name = reg_name;
-    hvin.element[0].valuelow = value;
-
-    hv_do_rep_hypercall(
-        HVCALL_SET_VP_REGISTERS,
-        1,
-        0,
-        (&raw const *hvin).cast::<core::ffi::c_void>(),
-        core::ptr::null_mut(),
-    )
+    with_per_cpu_variables(|pcv| {
+        pcv.with_hvcall_input::<HvSetVpRegistersInput, _>(|hvin| {
+            *hvin = HvSetVpRegistersInput::new();
+
+            hvin.header.partitionid = HV_PARTITION_ID_SELF;
+            hvin.header.vpindex = HV_VP_INDEX_SELF;
+            hvin.header.target_vtl = target_vtl;
+            hvin.element[0].name = reg_name;
+            hvin.element[0].valuelow = value;
+
+            hv_do_rep_hypercall(
+                HVCALL_SET_VP_REGISTERS,
+                1,
+                0,
+                (&raw const *hvin).cast::<core::ffi::c_void>(),
+                core::ptr::null_mut(),
+            )
+        })
+    })
 }
 
 /// Hyper-V Hypercall to set current VTL (i.e., VTL1)'s registers. It can program Hyper-V registers
@@ -70,33 +69,30 @@ fn hvcall_get_vp_registers_internal(
     reg_name: u32,
     target_vtl: HvInputVtl,
 ) -> Result<u64, HypervCallError> {
-    let hvin = with_per_cpu_variables_mut(|per_cpu_variables| unsafe {
-        &mut *per_cpu_variables
-            .hv_hypercall_input_page_as_mut_ptr()
-            .cast::<HvGetVpRegistersInput>()
-    });
-    *hvin = HvGetVpRegistersInput::new();
-    let hvout = with_per_cpu_variables_mut(|per_cpu_variables| unsafe {
-        &mut *per_cpu_variables
-            .hv_hypercall_output_page_as_mut_ptr()
-            .cast::<HvGetVpRegistersOutput>()
-    });
-    *hvout = HvGetVpRegistersOutput::new();
-
-    hvin.header.partitionid = HV_PARTITION_ID_SELF;
-    hvin.header.vpindex = HV_VP_INDEX_SELF;
-    hvin.header.target_vtl = target_vtl;
-    hvin.element[0].name0 = reg_name;
-
-    hv_do_rep_hypercall(
-        HVCALL_GET_VP_REGISTERS,
-        1,
-        0,
-        (&raw const *hvin).cast::<core::ffi::c_void>(),
-        (&raw mut *hvout).cast::<core::ffi::c_void>(),
-    )?;
-
-    Ok(hvout.as64().0)
+    with_per_cpu_variables(|pcv| {
+        pcv.with_hvcall_input::<HvGetVpRegistersInput, _>(|hvin| {
+            *hvin = HvGetVpRegistersInput::new();
+
+            hvin.header.partitionid = HV_PARTITION_ID_SELF;
+            hvin.header.vpindex = HV_VP_INDEX_SELF;
+            hvin.header.target_vtl = target_vtl;
+            hvin.element[0].name0 = reg_name;
+
+            pcv.with_hvcall_output::<HvGetVpRegistersOutput, _>(|hvout| {
+                *hvout = HvGetVpRegistersOutput::new();
+
+                hv_do_rep_hypercall(
+                    HVCALL_GET_VP_REGISTERS,
+                    1,
+                    0,
+                    (&raw const *hvin).cast::<core::ffi::c_void>(),
+                    (&raw mut *hvout).cast::<core::ffi::c_void>(),
+                )?;
+
+                Ok(hvout.as64().0)
+            })
+        })
+    })
 }
 
 /// Hyper-V Hypercall to get current VTL (i.e., VTL1)'s registers. It can access Hyper-V registers
@@ -233,7 +229,14 @@ pub fn init_vtl_ap(core: u32) -> Result<u64, HypervCallError> {
     // has high-canonical mappings, so these are ready to use as-is for the
     // AP's initial VP context.
     let rip: u64 = get_entry();
-    let rsp = get_address_of_special_page(VTL1_KERNEL_STACK_PAGE) + PAGE_SIZE as u64 - 1;
+    // SAFETY: We don't support concurrent AP/VTL initialization, so all APs
+    // share the same initial stack pointer. If we plan to support concurrent
+    // initialization, we should provide separate stack pointers for each AP
+    // (which might not scale if there are several hundred APs).
+    //
+    // This RSP is part of the `HV_INITIAL_VP_CONTEXT` provided to Hyper-V
+    // via `HvCallEnableVpVtl`. It is expected to be 16-byte aligned.
+    let rsp = get_address_of_special_page(VTL1_KERNEL_STACK_PAGE) + PAGE_SIZE as u64;
     let tss = get_address_of_special_page(VTL1_TSS_PAGE);
 
     let result = hvcall_enable_vp_vtl(core, HV_VTL_SECURE, tss, rip, rsp);
diff --git a/litebox_platform_lvbs/src/mshv/mod.rs b/litebox_platform_lvbs/src/mshv/mod.rs
index 23ba2139a..0da18875d 100644
--- a/litebox_platform_lvbs/src/mshv/mod.rs
+++ b/litebox_platform_lvbs/src/mshv/mod.rs
@@ -925,11 +925,11 @@ impl HvPendingExceptionEvent {
 #[cfg(not(test))]
 #[inline]
 pub(crate) fn is_hvcall_ready() -> bool {
-    use crate::host::per_cpu_variables::with_per_cpu_variables_asm;
+    use crate::host::per_cpu_variables::with_per_cpu_variables;
     // The VTL return address is configured only after the hypercall page
     // has been set up, so a non-zero value indicates that hypercalls are
     // available.
-    with_per_cpu_variables_asm(|pcv| pcv.get_vtl_return_addr() != 0)
+    with_per_cpu_variables(|pcv| pcv.asm.get_vtl_return_addr() != 0)
 }
 
 #[cfg(test)]
diff --git a/litebox_platform_lvbs/src/mshv/vsm.rs b/litebox_platform_lvbs/src/mshv/vsm.rs
index c338e8b1b..b6895167f 100644
--- a/litebox_platform_lvbs/src/mshv/vsm.rs
+++ b/litebox_platform_lvbs/src/mshv/vsm.rs
@@ -7,12 +7,11 @@
 use crate::mshv::mem_integrity::parse_modinfo;
 use crate::mshv::ringbuffer::set_ringbuffer;
 use crate::{
-    arch::get_core_id,
     debug_serial_print, debug_serial_println,
     host::{
         bootparam::get_vtl1_memory_info,
         linux::{CpuMask, KEXEC_SEGMENT_MAX, Kimage},
-        per_cpu_variables::with_per_cpu_variables_mut,
+        per_cpu_variables::with_per_cpu_variables,
     },
     mshv::{
         HV_REGISTER_CR_INTERCEPT_CONTROL, HV_REGISTER_CR_INTERCEPT_CR0_MASK,
@@ -67,9 +66,9 @@ const MODULE_VALIDATION_MAX_SIZE: usize = 64 * 1024 * 1024;
 
 static CPU_ONLINE_MASK: Once<Box<CpuMask>> = Once::new();
 
-pub(crate) fn init() {
+pub(crate) fn init(is_bsp: bool) {
     assert!(
-        !(get_core_id() == 0 && mshv_vsm_configure_partition().is_err()),
+        !(is_bsp && mshv_vsm_configure_partition().is_err()),
         "Failed to configure VSM partition"
     );
 
@@ -83,7 +82,7 @@ pub(crate) fn init() {
         "Failed to secure VTL0 configuration"
     );
 
-    if get_core_id() == 0 {
+    if is_bsp {
         if let Ok((start, size)) = get_vtl1_memory_info() {
             debug_serial_println!("VSM: Protect GPAs from {:#x} to {:#x}", start, start + size);
             if protect_physical_memory_range(
@@ -1007,14 +1006,18 @@ impl ControlRegMap {
 
 #[allow(clippy::unnecessary_wraps)]
 fn save_vtl0_locked_regs() -> Result<u64, HypervCallError> {
-    let reg_names = with_per_cpu_variables_mut(|per_cpu_variables| {
-        per_cpu_variables.vtl0_locked_regs.init();
-        per_cpu_variables.vtl0_locked_regs.reg_names()
+    let reg_names = with_per_cpu_variables(|per_cpu_variables| {
+        let mut regs = per_cpu_variables.vtl0_locked_regs.get();
+        regs.init();
+        per_cpu_variables.vtl0_locked_regs.set(regs);
+        regs.reg_names()
     });
     for reg_name in reg_names {
         if let Ok(value) = hvcall_get_vp_vtl0_registers(reg_name) {
-            with_per_cpu_variables_mut(|per_cpu_variables| {
-                per_cpu_variables.vtl0_locked_regs.set(reg_name, value);
+            with_per_cpu_variables(|per_cpu_variables| {
+                let mut regs = per_cpu_variables.vtl0_locked_regs.get();
+                regs.set(reg_name, value);
+                per_cpu_variables.vtl0_locked_regs.set(regs);
             });
         }
     }
diff --git a/litebox_platform_lvbs/src/mshv/vsm_intercept.rs b/litebox_platform_lvbs/src/mshv/vsm_intercept.rs
index dc17aaad2..4c98f80b5 100644
--- a/litebox_platform_lvbs/src/mshv/vsm_intercept.rs
+++ b/litebox_platform_lvbs/src/mshv/vsm_intercept.rs
@@ -3,7 +3,7 @@
 
 use crate::{
     debug_serial_println,
-    host::per_cpu_variables::{with_per_cpu_variables, with_per_cpu_variables_mut},
+    host::per_cpu_variables::with_per_cpu_variables,
     mshv::{
         DEFAULT_REG_PIN_MASK, HV_REGISTER_PENDING_EVENT0, HV_X64_REGISTER_APIC_BASE,
         HV_X64_REGISTER_CR0, HV_X64_REGISTER_CR4, HV_X64_REGISTER_CSTAR, HV_X64_REGISTER_EFER,
@@ -55,19 +55,15 @@ pub enum InterceptedRegisterName {
 /// - Failed to raise VTL0 GP fault
 /// - Intercepted write to unknown MSR/register
 pub fn vsm_handle_intercept() {
-    let simp_page = with_per_cpu_variables_mut(|per_cpu_variables| unsafe {
-        &mut *per_cpu_variables.hv_simp_page_as_mut_ptr()
-    });
-
-    let msg_type = simp_page.sint_message[0].header.message_type;
-    simp_page.sint_message[0].header.message_type = HvMessageType::None.into();
-    let payload = simp_page.sint_message[0].payload;
+    // Extract the intercept message from the SIMP page and clear it,
+    // all within the `with_per_cpu_variables` scope.
+    let msg = with_per_cpu_variables(|pcv| pcv.take_sint_message(0));
 
-    match HvMessageType::try_from(msg_type).unwrap() {
+    match HvMessageType::try_from(msg.header.message_type).unwrap() {
         HvMessageType::GpaIntercept => {
             let int_msg = unsafe {
-                let ptr = payload.as_ptr().cast::<HvMemInterceptMessage>();
-                &(*ptr) as &HvMemInterceptMessage
+                let ptr = core::ptr::addr_of!(msg.payload).cast::<HvMemInterceptMessage>();
+                &*ptr
             };
 
             let gpa = int_msg.gpa;
@@ -76,8 +72,8 @@ pub fn vsm_handle_intercept() {
         }
         HvMessageType::MsrIntercept => {
             let int_msg = unsafe {
-                let ptr = payload.as_ptr().cast::<HvMsrInterceptMessage>();
-                &(*ptr) as &HvMsrInterceptMessage
+                let ptr = core::ptr::addr_of!(msg.payload).cast::<HvMsrInterceptMessage>();
+                &*ptr
             };
 
             let msr_index = int_msg.msr;
@@ -110,8 +106,8 @@ pub fn vsm_handle_intercept() {
         }
         HvMessageType::RegisterIntercept => {
             let int_msg = unsafe {
-                let ptr = payload.as_ptr().cast::<HvInterceptMessage>();
-                &(*ptr) as &HvInterceptMessage
+                let ptr = core::ptr::addr_of!(msg.payload).cast::<HvInterceptMessage>();
+                &*ptr
             };
 
             let reg_name = int_msg.reg_name;
@@ -141,6 +137,7 @@ pub fn vsm_handle_intercept() {
             }
         }
         _ => {
+            let msg_type = msg.header.message_type;
             debug_serial_println!(
                 "VSM: Ignore unknown synthetic interrupt message type {msg_type:#x}"
             );
@@ -176,7 +173,7 @@ fn validate_and_continue_vtl0_register_write(
     int_msg_hdr: &HvInterceptMessageHeader,
 ) {
     let allowed_value = with_per_cpu_variables(|per_cpu_variables| {
-        per_cpu_variables.vtl0_locked_regs.get(reg_name)
+        per_cpu_variables.vtl0_locked_regs.get().get(reg_name)
     });
     if let Some(allowed_value) = allowed_value {
         if value & mask == allowed_value {
diff --git a/litebox_platform_lvbs/src/mshv/vtl_switch.rs b/litebox_platform_lvbs/src/mshv/vtl_switch.rs
index cb437d12a..b707e1f64 100644
--- a/litebox_platform_lvbs/src/mshv/vtl_switch.rs
+++ b/litebox_platform_lvbs/src/mshv/vtl_switch.rs
@@ -5,10 +5,7 @@
 
 use crate::host::{
     hv_hypercall_page_address,
-    per_cpu_variables::{
-        PerCpuVariables, PerCpuVariablesAsm, with_per_cpu_variables, with_per_cpu_variables_asm,
-        with_per_cpu_variables_mut,
-    },
+    per_cpu_variables::{PerCpuVariables, PerCpuVariablesAsm, with_per_cpu_variables},
 };
 use crate::mshv::{
     HV_FLUSH_EX_VP_SET_BANKS, HV_REGISTER_VSM_CODEPAGE_OFFSETS, HvRegisterVsmCodePageOffsets,
@@ -99,13 +96,13 @@ static VTL1_VP_MASK: AtomicVpMask = AtomicVpMask::new();
 /// Mark the current VP as executing in VTL1.
 #[inline]
 fn vtl1_vp_enter() {
-    VTL1_VP_MASK.set(with_per_cpu_variables_mut(PerCpuVariables::vp_index) as usize);
+    VTL1_VP_MASK.set(with_per_cpu_variables(PerCpuVariables::vp_index) as usize);
 }
 
 /// Remove the current VP from the VTL1 mask (it is returning to VTL0).
 #[inline]
 fn vtl1_vp_exit() {
-    VTL1_VP_MASK.clear(with_per_cpu_variables_mut(PerCpuVariables::vp_index) as usize);
+    VTL1_VP_MASK.clear(with_per_cpu_variables(PerCpuVariables::vp_index) as usize);
 }
 
 /// Return the current VTL1 VP mask for use in TLB flush hypercalls.
@@ -128,7 +125,7 @@ pub(crate) fn vtl1_vp_mask() -> [u64; HV_FLUSH_EX_VP_SET_BANKS] {
 #[cfg(not(test))]
 #[inline]
 pub(crate) fn is_only_vp_in_vtl1() -> bool {
-    VTL1_VP_MASK.is_single_vp(with_per_cpu_variables_mut(PerCpuVariables::vp_index))
+    VTL1_VP_MASK.is_single_vp(with_per_cpu_variables(PerCpuVariables::vp_index))
 }
 
 // ============================================================================
@@ -365,8 +362,8 @@ fn handle_vtl_entry() -> Option<[u64; NUM_VTLCALL_PARAMS]> {
 /// Returns `None` if the entry reason is not a valid `VtlEntryReason`.
 #[inline]
 fn get_vtl_entry_reason() -> Option<VtlEntryReason> {
-    let reason = with_per_cpu_variables(|per_cpu_variables| unsafe {
-        (*per_cpu_variables.hv_vp_assist_page_as_ptr()).vtl_entry_reason
+    let reason = with_per_cpu_variables(|per_cpu_variables| {
+        per_cpu_variables.with_vp_assist_page(|page| page.vtl_entry_reason)
     });
     VtlEntryReason::try_from(reason).ok()
 }
@@ -374,13 +371,15 @@ fn get_vtl_entry_reason() -> Option<VtlEntryReason> {
 /// Get the VTL call parameters from the saved VTL0 state.
 #[inline]
 fn get_vtlcall_params() -> [u64; NUM_VTLCALL_PARAMS] {
-    with_per_cpu_variables(|per_cpu_variables| per_cpu_variables.vtl0_state.get_vtlcall_params())
+    with_per_cpu_variables(|per_cpu_variables| {
+        per_cpu_variables.vtl0_state.get().get_vtlcall_params()
+    })
 }
 
 /// Set the VTL return value that will be returned to VTL0.
 #[inline]
 fn set_vtl_return_value(value: i64) {
-    with_per_cpu_variables_mut(|per_cpu_variables| {
+    with_per_cpu_variables(|per_cpu_variables| {
         per_cpu_variables.set_vtl_return_value(value.reinterpret_as_unsigned());
     });
 }
@@ -402,8 +401,8 @@ pub(crate) fn mshv_vsm_get_code_page_offsets() -> Result<(), VsmError> {
     let vtl_return_address = hvcall_page
         .checked_add(usize::from(code_page_offsets.vtl_return_offset()))
         .ok_or(VsmError::CodePageOffsetOverflow)?;
-    with_per_cpu_variables_asm(|pcv_asm| {
-        pcv_asm.set_vtl_return_addr(vtl_return_address);
+    with_per_cpu_variables(|pcv| {
+        pcv.asm.set_vtl_return_addr(vtl_return_address);
     });
     Ok(())
 }
@@ -480,7 +479,7 @@ pub fn vtl_switch(return_value: Option<i64>) -> [u64; NUM_VTLCALL_PARAMS] {
             // one buffer at a time. At this point, the CPU's tracking might rely on VTL0's
             // buffer (if VTL0 called XRSTOR). Thus, we shouldn't use XSAVEOPT until XRSTOR
             // re-establishes tracking for VTL1's buffer.
-            with_per_cpu_variables_asm(PerCpuVariablesAsm::reset_vtl1_xsaved);
+            with_per_cpu_variables(|pcv| pcv.asm.reset_vtl1_xsaved());
 
             return params;
         }
diff --git a/litebox_runner_lvbs/src/lib.rs b/litebox_runner_lvbs/src/lib.rs
index 3c9a08acc..ed6302601 100644
--- a/litebox_runner_lvbs/src/lib.rs
+++ b/litebox_runner_lvbs/src/lib.rs
@@ -20,9 +20,9 @@ use litebox_common_optee::{
     OpteeSmcReturnCode, TeeOrigin, TeeResult, UteeEntryFunc, UteeParams, optee_msg_args_total_size,
 };
 use litebox_platform_lvbs::{
-    arch::{gdt, get_core_id, instrs::hlt_loop, interrupts},
+    arch::{gdt, instrs::hlt_loop, interrupts},
     debug_serial_println,
-    host::{bootparam::get_vtl1_memory_info, per_cpu_variables::allocate_per_cpu_variables},
+    host::{bootparam::get_vtl1_memory_info, per_cpu_variables},
     mm::MemoryProvider,
     mshv::{
         NUM_VTLCALL_PARAMS, VsmFunction, hvcall,
@@ -32,8 +32,9 @@ use litebox_platform_lvbs::{
         vtl1_mem_layout::{
             VSM_SK_PTE_PAGES_COUNT, VTL1_INIT_HEAP_SIZE, VTL1_INIT_HEAP_START_PAGE,
             VTL1_PML4E_PAGE, VTL1_PRE_POPULATED_MEMORY_SIZE, VTL1_PTE_0_PAGE, VTL1_REMAP_PDE_PAGE,
-            VTL1_REMAP_PDPT_PAGE, get_heap_start_address, get_rela_end_address,
-            get_rela_start_address, get_text_end_address, get_text_start_address,
+            VTL1_REMAP_PDPT_PAGE, get_heap_start_address, get_memory_base_address,
+            get_rela_end_address, get_rela_start_address, get_text_end_address,
+            get_text_start_address,
         },
     },
     serial_println,
@@ -49,152 +50,162 @@ use litebox_shim_optee::{NormalWorldConstPtr, NormalWorldMutPtr, UserConstPtr};
 use once_cell::race::OnceBox;
 use spin::mutex::SpinMutex;
 
+/// Seed the initial heap regions so the global allocator has enough memory
+/// for slab-backed allocations (the slab needs >= 2 MB backing pages).
+pub fn seed_initial_heap() {
+    let vtl1_base_va = get_memory_base_address();
+    let vtl1_start = Platform::va_to_pa(x86_64::VirtAddr::new(vtl1_base_va));
+
+    let mem_fill_start =
+        TruncateExt::<usize>::truncate(vtl1_base_va) + VTL1_INIT_HEAP_START_PAGE * PAGE_SIZE;
+    unsafe {
+        Platform::mem_fill_pages(mem_fill_start, VTL1_INIT_HEAP_SIZE);
+    }
+    debug_serial_println!(
+        "heap: seed init region (pages {}..+{:#x}): VA {:#x}, size {:#x}",
+        VTL1_INIT_HEAP_START_PAGE,
+        VTL1_INIT_HEAP_SIZE,
+        mem_fill_start,
+        VTL1_INIT_HEAP_SIZE
+    );
+
+    // Add pre-populated region (_heap_start .. end of Phase 1 mapping).
+    let heap_va = get_heap_start_address();
+    let mem_fill_start: usize = heap_va.truncate();
+    let heap_phys = Platform::va_to_pa(x86_64::VirtAddr::new(heap_va)).as_u64();
+    let heap_offset: usize = TruncateExt::<usize>::truncate(heap_phys - vtl1_start.as_u64());
+    let mem_fill_size = VTL1_PRE_POPULATED_MEMORY_SIZE - heap_offset;
+    unsafe {
+        Platform::mem_fill_pages(mem_fill_start, mem_fill_size);
+    }
+    debug_serial_println!(
+        "heap: add pre-populated region (_heap_start..Phase 1 end): VA {:#x}, size {:#x}",
+        mem_fill_start,
+        mem_fill_size
+    );
+}
+
+/// Initialize the current core.
+///
+/// When `is_bsp` is `true`, creates the platform, sets up page tables, and
+/// reclaims early memory.
+/// All cores then initialize hypercalls, GDT, IDT, interrupts, and syscall
+/// support.
+///
 /// # Panics
 ///
-/// Panics if it failed to enable Hyper-V hypercall
-pub fn init() -> Option<&'static Platform> {
-    let mut ret: Option<&'static Platform> = None;
-
-    if get_core_id() == 0 {
-        if let Ok((start, size)) = get_vtl1_memory_info() {
-            let vtl1_start = x86_64::PhysAddr::new(start);
-            let vtl1_end = x86_64::PhysAddr::new(start + size);
-
-            // Add a small range of mapped memory to the global allocator for populating the base page table.
-            // `VTL1_INIT_HEAP_START_PAGE` and `VTL1_INIT_HEP_SIZE` specify a physical address range which is
-            // not used by the VTL1 kernel.
-            let mem_fill_start =
-                TruncateExt::<usize>::truncate(Platform::pa_to_va(vtl1_start).as_u64())
-                    + VTL1_INIT_HEAP_START_PAGE * PAGE_SIZE;
-            let mem_fill_size = VTL1_INIT_HEAP_SIZE;
+/// Panics if VTL1 memory info is unavailable (BSP) or if hypercall
+/// initialization fails.
+pub fn init(is_bsp: bool) -> Option<&'static Platform> {
+    let ret = if is_bsp {
+        let (start, size) = get_vtl1_memory_info().expect("Failed to get memory info");
+        let vtl1_start = x86_64::PhysAddr::new(start);
+        let vtl1_end = x86_64::PhysAddr::new(start + size);
+
+        // Re-compute the bounds of the pre-populated region (already seeded by
+        // `seed_initial_heap`); the remaining VTL1 memory is added after it below.
+        let heap_va = get_heap_start_address();
+        let mem_fill_start: usize = heap_va.truncate();
+        let heap_phys = Platform::va_to_pa(x86_64::VirtAddr::new(heap_va)).as_u64();
+        let heap_offset: usize = TruncateExt::<usize>::truncate(heap_phys - start);
+        let mem_fill_size = VTL1_PRE_POPULATED_MEMORY_SIZE - heap_offset;
+
+        // Text section boundaries. These are used by the platform to mark
+        // code pages executable and everything else NO_EXECUTE (DEP).
+        // After two-phase relocation, linker symbols return
+        // high-canonical VAs; convert to PA for the page table mapper.
+        let text_phys_start = Platform::va_to_pa(x86_64::VirtAddr::new(get_text_start_address()));
+        let text_phys_end = Platform::va_to_pa(x86_64::VirtAddr::new(get_text_end_address()));
+
+        // Reclaim .rela.dyn section memory now that relocations have been applied
+        // and we are running at high-canonical addresses.
+        // After two-phase relocation, `get_rela_start/end_address()` return
+        // high-canonical VAs. Use directly for the allocator.
+        let rela_va = get_rela_start_address();
+        let rela_size: usize = (get_rela_end_address() - rela_va).truncate();
+        if rela_size > 0 {
+            let rela_virt: usize = rela_va.truncate();
             unsafe {
-                Platform::mem_fill_pages(mem_fill_start, mem_fill_size);
+                Platform::mem_fill_pages(rela_virt, rela_size);
             }
             debug_serial_println!(
-                "heap: seed init region (pages {}..+{:#x}): VA {:#x}, size {:#x}",
-                VTL1_INIT_HEAP_START_PAGE,
-                mem_fill_size,
-                mem_fill_start,
-                mem_fill_size
+                "heap: reclaim .rela.dyn section: VA {:#x}, size {:#x}",
+                rela_virt,
+                rela_size
             );
+        }
 
-            // Add remaining mapped but non-used memory pages (between `get_heap_start_address()` and
-            // the end of the Phase 1 high-canonical mapping) to the global allocator.
-            //
-            // Phase 1 maps `VTL1_REMAP_PTE_COUNT * 2 MiB` = 16 MiB of high-canonical
-            // memory, which equals the full pre-populated region. We must NOT hand
-            // out addresses beyond that boundary because they are unmapped until
-            // `Platform::new()` builds the base page table covering all 128 MiB.
-            // The full VTL1 range is added after `Platform::new()` completes.
-            //
-            // After two-phase relocation, `get_heap_start_address()` returns a
-            // high-canonical VA. Use it directly for the allocator.
-            let heap_va = get_heap_start_address();
-            let mem_fill_start: usize = heap_va.truncate();
-            let heap_phys = Platform::va_to_pa(x86_64::VirtAddr::new(heap_va)).as_u64();
-            let heap_offset: usize = TruncateExt::<usize>::truncate(heap_phys - start);
-            let mem_fill_size = VTL1_PRE_POPULATED_MEMORY_SIZE - heap_offset;
+        let platform = Platform::new(vtl1_start, vtl1_end, text_phys_start, text_phys_end);
+        litebox_platform_multiplex::set_platform(platform);
+
+        // Reclaim Phase 1 / VTL0 page table frames now that Platform::new()
+        // has loaded a fresh base page table covering all VTL1 memory.
+        // These physical pages are no longer referenced by CR3.
+        {
+            // Reclaim pages 2–12 (PML4, PDPT, PDE, 8 PTE pages)
+            let early_pt_pa = vtl1_start + (VTL1_PML4E_PAGE * PAGE_SIZE) as u64;
+            let early_pt_start: usize =
+                TruncateExt::<usize>::truncate(Platform::pa_to_va(early_pt_pa).as_u64());
+            let early_pt_size: usize =
+                (VTL1_PTE_0_PAGE + VSM_SK_PTE_PAGES_COUNT - VTL1_PML4E_PAGE) * PAGE_SIZE;
+            // Safety: the early page table frames are no longer referenced
+            // (CR3 now points to the Phase 2 base page table).
             unsafe {
-                Platform::mem_fill_pages(mem_fill_start, mem_fill_size);
+                Platform::mem_fill_pages(early_pt_start, early_pt_size);
             }
             debug_serial_println!(
-                "heap: add pre-populated region (_heap_start..Phase 1 end): VA {:#x}, size {:#x}",
-                mem_fill_start,
-                mem_fill_size
+                "heap: reclaim early page table frames (pages {}..{}): VA {:#x}, size {:#x}",
+                VTL1_PML4E_PAGE,
+                VTL1_PML4E_PAGE + (early_pt_size / PAGE_SIZE),
+                early_pt_start,
+                early_pt_size
             );
 
-            // Text section boundaries. These are used by the platform to mark
-            // code pages executable and everything else NO_EXECUTE (DEP).
-            // After two-phase relocation, linker symbols return
-            // high-canonical VAs; convert to PA for the page table mapper.
-            let text_phys_start =
-                Platform::va_to_pa(x86_64::VirtAddr::new(get_text_start_address()));
-            let text_phys_end = Platform::va_to_pa(x86_64::VirtAddr::new(get_text_end_address()));
-
-            // Reclaim .rela.dyn section memory now that relocations have been applied
-            // and we're running at high-canonical addresses.
-            // After two-phase relocation, `get_rela_start/end_address()` return
-            // high-canonical VAs. Use directly for the allocator.
-            let rela_va = get_rela_start_address();
-            let rela_size: usize = (get_rela_end_address() - rela_va).truncate();
-            if rela_size > 0 {
-                let rela_virt: usize = rela_va.truncate();
-                unsafe {
-                    Platform::mem_fill_pages(rela_virt, rela_size);
-                }
-                debug_serial_println!(
-                    "heap: reclaim .rela.dyn section: VA {:#x}, size {:#x}",
-                    rela_virt,
-                    rela_size
-                );
-            }
+            // NOTE: The boot stack page (VTL1_KERNEL_STACK_PAGE) MUST NOT be
+            // reclaimed here. APs reuse it as their initial RSP when they
+            // enter VTL1 via `hvcall_enable_vp_vtl`.
 
-            let platform = Platform::new(vtl1_start, vtl1_end, text_phys_start, text_phys_end);
-            ret = Some(platform);
-            litebox_platform_multiplex::set_platform(platform);
-
-            // Reclaim Phase 1 / VTL0 page table frames now that Platform::new()
-            // has loaded a fresh base page table covering all VTL1 memory.
-            // These physical pages are no longer referenced by CR3.
-            {
-                // Reclaim pages 2–12 (PML4, PDPT, PDE, 8 PTE pages)
-                let early_pt_pa = vtl1_start + (VTL1_PML4E_PAGE * PAGE_SIZE) as u64;
-                let early_pt_start: usize =
-                    TruncateExt::<usize>::truncate(Platform::pa_to_va(early_pt_pa).as_u64());
-                let early_pt_size: usize =
-                    (VTL1_PTE_0_PAGE + VSM_SK_PTE_PAGES_COUNT - VTL1_PML4E_PAGE) * PAGE_SIZE;
-                // Safety: the early page table frames are no longer referenced
-                // (CR3 now points to the Phase 2 base page table).
-                unsafe {
-                    Platform::mem_fill_pages(early_pt_start, early_pt_size);
-                }
-                debug_serial_println!(
-                    "heap: reclaim early page table frames (pages {}..{}): VA {:#x}, size {:#x}",
-                    VTL1_PML4E_PAGE,
-                    VTL1_PML4E_PAGE + (early_pt_size / PAGE_SIZE),
-                    early_pt_start,
-                    early_pt_size
-                );
-
-                // Reclaim Phase 1 PDPT and PDE pages
-                let remap_pt_pa = vtl1_start + (VTL1_REMAP_PDPT_PAGE * PAGE_SIZE) as u64;
-                let remap_pt_start: usize =
-                    TruncateExt::<usize>::truncate(Platform::pa_to_va(remap_pt_pa).as_u64());
-                let remap_pt_size: usize =
-                    (VTL1_REMAP_PDE_PAGE - VTL1_REMAP_PDPT_PAGE + 1) * PAGE_SIZE;
-                unsafe {
-                    Platform::mem_fill_pages(remap_pt_start, remap_pt_size);
-                }
-                debug_serial_println!(
-                    "heap: reclaim Phase 1 remap PT frames (pages {}..{}): VA {:#x}, size {:#x}",
-                    VTL1_REMAP_PDPT_PAGE,
-                    VTL1_REMAP_PDE_PAGE + 1,
-                    remap_pt_start,
-                    remap_pt_size
-                );
-            }
-
-            // Add the rest of the VTL1 memory to the global allocator once they are mapped to the base page table.
-            let mem_fill_start = mem_fill_start + mem_fill_size;
-            let mem_fill_size = TruncateExt::<usize>::truncate(
-                size - (mem_fill_start as u64 - Platform::pa_to_va(vtl1_start).as_u64()),
-            );
+            // Reclaim Phase 1 PDPT and PDE pages
+            let remap_pt_pa = vtl1_start + (VTL1_REMAP_PDPT_PAGE * PAGE_SIZE) as u64;
+            let remap_pt_start: usize =
+                TruncateExt::<usize>::truncate(Platform::pa_to_va(remap_pt_pa).as_u64());
+            let remap_pt_size: usize = (VTL1_REMAP_PDE_PAGE - VTL1_REMAP_PDPT_PAGE + 1) * PAGE_SIZE;
             unsafe {
-                Platform::mem_fill_pages(mem_fill_start, mem_fill_size);
+                Platform::mem_fill_pages(remap_pt_start, remap_pt_size);
             }
             debug_serial_println!(
-                "heap: add remaining VTL1 memory (post Phase 2): VA {:#x}, size {:#x}",
-                mem_fill_start,
-                mem_fill_size
+                "heap: reclaim Phase 1 remap PT frames (pages {}..{}): VA {:#x}, size {:#x}",
+                VTL1_REMAP_PDPT_PAGE,
+                VTL1_REMAP_PDE_PAGE + 1,
+                remap_pt_start,
+                remap_pt_size
             );
+        }
 
-            allocate_per_cpu_variables();
-        } else {
-            panic!("Failed to get memory info");
+        // Add the rest of the VTL1 memory to the global allocator once it is mapped in the base page table.
+        let mem_fill_start = mem_fill_start + mem_fill_size;
+        let mem_fill_size = TruncateExt::<usize>::truncate(
+            size - (mem_fill_start as u64 - Platform::pa_to_va(vtl1_start).as_u64()),
+        );
+        unsafe {
+            Platform::mem_fill_pages(mem_fill_start, mem_fill_size);
         }
-    }
+        debug_serial_println!(
+            "heap: add remaining VTL1 memory (post Phase 2): VA {:#x}, size {:#x}",
+            mem_fill_start,
+            mem_fill_size
+        );
+
+        Some(platform)
+    } else {
+        None
+    };
+
+    // Allocate XSAVE areas now that we are on the kernel stack (the CPUID
+    // queries and aligned-vec allocations need a lot of stack space).
+    per_cpu_variables::allocate_xsave_area();
 
-    if let Err(e) = hvcall::init() {
+    if let Err(e) = hvcall::init(is_bsp) {
         panic!("Err: {:?}", e);
     }
     gdt::init();
diff --git a/litebox_runner_lvbs/src/main.rs b/litebox_runner_lvbs/src/main.rs
index 6fdb9590b..ea13e4226 100644
--- a/litebox_runner_lvbs/src/main.rs
+++ b/litebox_runner_lvbs/src/main.rs
@@ -7,12 +7,12 @@
 
 use core::arch::{asm, naked_asm};
 use litebox_platform_lvbs::{
-    arch::{
-        enable_extended_states, enable_fsgsbase, enable_smep_smap, get_core_id, instrs::hlt_loop,
-    },
+    arch::{enable_extended_states, enable_fsgsbase, enable_smep_smap, instrs::hlt_loop},
     host::{
         bootparam::parse_boot_info,
-        per_cpu_variables::{PerCpuVariablesAsm, init_per_cpu_variables},
+        per_cpu_variables::{
+            PerCpuVariablesAsm, allocate_per_cpu_variables, init_per_cpu_variables,
+        },
     },
     mshv::vtl1_mem_layout::{self, VTL1_REMAP_PDE_PAGE, VTL1_REMAP_PDPT_PAGE},
     serial_println,
@@ -296,44 +296,61 @@ unsafe fn remap_to_high_canonical() -> ! {
 /// Trampoline executed at the high-canonical address after Phase 1 remap.
 ///
 /// Adjusts RSP from low-canonical (PA-based) to high-canonical, re-applies
-/// ELF relocations for the final link address, and tail-jumps to `_ap_start`.
+/// ELF relocations for the final link address, and tail-jumps to
+/// `common_start` with `is_bsp = true`.
 #[unsafe(naked)]
 unsafe extern "C" fn high_canonical_trampoline() -> ! {
     // 1. Adjust RSP from low-canonical (PA-based) to high-canonical.
     // 2. Phase 1b: Re-apply ELF relocations so every GOT slot now points to
     //    high-canonical VAs (addend + memory_base + KERNEL_OFFSET).
-    // 3. Tail-jump to _ap_start (common BSP + AP entry point).
+    // 3. Set edi = 1 (is_bsp = true) and tail-jump to common_start.
     naked_asm!(
         "mov rax, {offset}",
         "add rsp, rax",
         "and rsp, -16",
         "call {apply_reloc}",
-        "jmp {ap_start}",
+        "mov edi, 1",
+        "jmp {common_start}",
         offset = const KERNEL_OFFSET,
         apply_reloc = sym apply_relocations,
-        ap_start = sym _ap_start,
+        common_start = sym common_start,
     );
 }
 
-/// Common entry point for all cores after high-canonical page table setup.
-///
-/// - **BSP**: reached via `high_canonical_trampoline()` after Phase 1 remap + re-relocation
-/// - **APs**: entered directly by Hyper-V via `hvcall_enable_vp_vtl` (the VP
-///   context's RIP is set to this symbol). APs inherit the BSP's CR3 (Phase 2
-///   page table with full 128 MiB mapped), so they already run at high-canonical
-///   VAs and need no remap.
+/// AP entry point: Entered directly by Hyper-V via `hvcall_enable_vp_vtl`
+/// (the VP context's RIP is set to this symbol). APs inherit the BSP's CR3,
+/// so they already run at high-canonical VAs and need no remap.
 #[expect(clippy::missing_safety_doc)]
 #[unsafe(no_mangle)]
 pub unsafe extern "C" fn _ap_start() -> ! {
+    unsafe { common_start(false) }
+}
+
+/// Shared boot path for BSP and AP cores.
+///
+/// When `is_bsp` is `true`, seeds the initial heap.
+unsafe extern "C" fn common_start(is_bsp: bool) -> ! {
     enable_fsgsbase();
     enable_extended_states();
+
+    if is_bsp {
+        litebox_runner_lvbs::seed_initial_heap();
+    }
+
+    // Each core heap-allocates its own PerCpuVariables and sets GSBASE
+    // to point at it (assembly fields are at GS offset 0).
+    allocate_per_cpu_variables();
+
     init_per_cpu_variables();
 
+    // Switch to the kernel stack and call kernel_main with is_bsp in edi
+    let is_bsp_u32 = u32::from(is_bsp);
     unsafe {
         asm!(
             "mov rsp, gs:[{kernel_sp_off}]",
             "call {kernel_main}",
             kernel_sp_off = const { PerCpuVariablesAsm::kernel_stack_ptr_offset() },
+            in("edi") is_bsp_u32,
             kernel_main = sym kernel_main
         );
     }
@@ -360,9 +377,8 @@ pub unsafe extern "C" fn _start() -> ! {
     }
 }
 
-unsafe extern "C" fn kernel_main() -> ! {
-    let core_id = get_core_id();
-    if core_id == 0 {
+unsafe extern "C" fn kernel_main(is_bsp: bool) -> ! {
+    if is_bsp {
         serial_println!("==============================");
         serial_println!(" Hello from LiteBox for LVBS! ");
         serial_println!("==============================");
@@ -370,7 +386,7 @@ unsafe extern "C" fn kernel_main() -> ! {
         parse_boot_info();
     }
 
-    let platform = litebox_runner_lvbs::init();
+    let platform = litebox_runner_lvbs::init(is_bsp);
 
     enable_smep_smap();
 

From c42157c5bcbe4733edf2d12f413bf94cbaf50540 Mon Sep 17 00:00:00 2001
From: Sangho Lee <sanghle@microsoft.com>
Date: Mon, 2 Mar 2026 20:53:22 +0000
Subject: [PATCH 2/3] spin lock AP boot stack

---
 dev_tests/src/ratchet.rs                    |  2 +-
 litebox_platform_lvbs/src/mshv/hvcall_vp.rs |  9 ++--
 litebox_runner_lvbs/src/main.rs             | 52 +++++++++++++++++++--
 3 files changed, 54 insertions(+), 9 deletions(-)

diff --git a/dev_tests/src/ratchet.rs b/dev_tests/src/ratchet.rs
index f9882a793..6d7788be3 100644
--- a/dev_tests/src/ratchet.rs
+++ b/dev_tests/src/ratchet.rs
@@ -41,7 +41,7 @@ fn ratchet_globals() -> Result<()> {
             ("litebox_platform_multiplex/", 1),
             ("litebox_platform_windows_userland/", 7),
             ("litebox_runner_linux_userland/", 1),
-            ("litebox_runner_lvbs/", 4),
+            ("litebox_runner_lvbs/", 5),
             ("litebox_runner_snp/", 1),
             ("litebox_shim_linux/", 1),
             ("litebox_shim_optee/", 3),
diff --git a/litebox_platform_lvbs/src/mshv/hvcall_vp.rs b/litebox_platform_lvbs/src/mshv/hvcall_vp.rs
index 6f3474b2f..4dc03aadc 100644
--- a/litebox_platform_lvbs/src/mshv/hvcall_vp.rs
+++ b/litebox_platform_lvbs/src/mshv/hvcall_vp.rs
@@ -229,12 +229,11 @@ pub fn init_vtl_ap(core: u32) -> Result<u64, HypervCallError> {
     // has high-canonical mappings, so these are ready to use as-is for the
     // AP's initial VP context.
     let rip: u64 = get_entry();
-    // SAFETY: We dont support concurrent AP/VTL initialization and thus share
-    // the same stack pointer. If we plan to support concurrent initialization,
-    // we should provide seperate stack pointers for each AP (which might not
-    // scale if there are several 100s of APs).
+    // All APs share this single boot stack. `_ap_start` spin-acquires
+    // `AP_BOOT_STACK_LOCK` before touching the stack and releases it after
+    // switching to a per-CPU kernel stack, so concurrent AP entry is safe.
     //
-    // This RSP is part of `HV_INITIAL_VP_CONTEXT` provided to the Hyper-V
+    // This RSP is part of `HV_INITIAL_VP_CONTEXT` provided to Hyper-V
     // via `HvCallEnableVpVtl`. It is expected to be 16-byte aligned.
     let rsp = get_address_of_special_page(VTL1_KERNEL_STACK_PAGE) + PAGE_SIZE as u64;
     let tss = get_address_of_special_page(VTL1_TSS_PAGE);
diff --git a/litebox_runner_lvbs/src/main.rs b/litebox_runner_lvbs/src/main.rs
index ea13e4226..d6a393e68 100644
--- a/litebox_runner_lvbs/src/main.rs
+++ b/litebox_runner_lvbs/src/main.rs
@@ -6,6 +6,7 @@
 #![no_main]
 
 use core::arch::{asm, naked_asm};
+use core::sync::atomic::{AtomicBool, Ordering};
 use litebox_platform_lvbs::{
     arch::{enable_extended_states, enable_fsgsbase, enable_smep_smap, instrs::hlt_loop},
     host::{
@@ -20,6 +21,21 @@ use litebox_platform_lvbs::{
 use x86_64::VirtAddr;
 use x86_64::structures::paging::PageTableFlags;
 
+/// Spinlock protecting the shared AP boot stack (`VTL1_KERNEL_STACK_PAGE`).
+///
+/// All APs receive the same initial RSP via `hvcall_enable_vp_vtl`. VTL0
+/// controls when APs enter VTL1, so multiple APs may start concurrently.
+/// Each AP spin-acquires this lock before touching the boot stack, and
+/// releases it after switching to its own heap-allocated per-CPU kernel stack.
+static AP_BOOT_STACK_LOCK: AtomicBool = AtomicBool::new(false);
+
+/// Release the AP boot stack spinlock.
+///
+/// Called after the current core has switched RSP to its per-CPU kernel stack.
+extern "C" fn release_boot_stack_lock() {
+    AP_BOOT_STACK_LOCK.store(false, Ordering::Release);
+}
+
 /// ELF64 relocation entry
 #[repr(C)]
 struct Elf64Rela {
@@ -320,10 +336,32 @@ unsafe extern "C" fn high_canonical_trampoline() -> ! {
 /// AP entry point: Entered directly by Hyper-V via `hvcall_enable_vp_vtl`
 /// (the VP context's RIP is set to this symbol). APs inherit the BSP's CR3,
 /// so they already run at high-canonical VAs and need no remap.
-#[expect(clippy::missing_safety_doc)]
+///
+/// # Safety
+///
+/// Must only be used as the initial RIP for an AP's VP context.
+#[unsafe(naked)]
 #[unsafe(no_mangle)]
 pub unsafe extern "C" fn _ap_start() -> ! {
-    unsafe { common_start(false) }
+    naked_asm!(
+        // Spin-acquire the AP boot stack lock entirely in registers.
+        // No stack usage is permitted until the lock is held, because
+        // another AP may still be running on this same stack.
+        "lea rcx, [rip + {lock}]",
+        "2:",
+        "mov al, 1",
+        "xchg byte ptr [rcx], al",
+        "test al, al",
+        "jz 3f",
+        "pause",
+        "jmp 2b",
+        "3:",
+        // This AP has acquired the lock and exclusively owns the boot stack.
+        "xor edi, edi", // is_bsp = false
+        "jmp {common_start}",
+        lock = sym AP_BOOT_STACK_LOCK,
+        common_start = sym common_start,
+    );
 }
 
 /// Shared boot path for BSP and AP cores.
@@ -347,11 +385,19 @@ unsafe extern "C" fn common_start(is_bsp: bool) -> ! {
     let is_bsp_u32 = u32::from(is_bsp);
     unsafe {
         asm!(
+            // Now use this core's heap-allocated kernel stack.
             "mov rsp, gs:[{kernel_sp_off}]",
+            // The boot stack is no longer in use. Release the AP boot stack
+            // spinlock so the next AP can proceed. For the BSP this is a
+            // harmless no-op (the lock was never held).
+            "push rdi",
+            "call {release_lock}",
+            "pop rdi",
             "call {kernel_main}",
             kernel_sp_off = const { PerCpuVariablesAsm::kernel_stack_ptr_offset() },
             in("edi") is_bsp_u32,
-            kernel_main = sym kernel_main
+            release_lock = sym release_boot_stack_lock,
+            kernel_main = sym kernel_main,
         );
     }
 

From 320aa44239a4403486944be09ed6f96e541e2022 Mon Sep 17 00:00:00 2001
From: Sangho Lee <sanghle@microsoft.com>
Date: Mon, 2 Mar 2026 21:42:24 +0000
Subject: [PATCH 3/3] Revert "spin lock AP boot stack"

This reverts commit c42157c5bcbe4733edf2d12f413bf94cbaf50540.
---
 dev_tests/src/ratchet.rs                    |  2 +-
 litebox_platform_lvbs/src/mshv/hvcall_vp.rs |  9 ++--
 litebox_runner_lvbs/src/main.rs             | 52 ++-------------------
 3 files changed, 9 insertions(+), 54 deletions(-)

diff --git a/dev_tests/src/ratchet.rs b/dev_tests/src/ratchet.rs
index 6d7788be3..f9882a793 100644
--- a/dev_tests/src/ratchet.rs
+++ b/dev_tests/src/ratchet.rs
@@ -41,7 +41,7 @@ fn ratchet_globals() -> Result<()> {
             ("litebox_platform_multiplex/", 1),
             ("litebox_platform_windows_userland/", 7),
             ("litebox_runner_linux_userland/", 1),
-            ("litebox_runner_lvbs/", 5),
+            ("litebox_runner_lvbs/", 4),
             ("litebox_runner_snp/", 1),
             ("litebox_shim_linux/", 1),
             ("litebox_shim_optee/", 3),
diff --git a/litebox_platform_lvbs/src/mshv/hvcall_vp.rs b/litebox_platform_lvbs/src/mshv/hvcall_vp.rs
index 4dc03aadc..6f3474b2f 100644
--- a/litebox_platform_lvbs/src/mshv/hvcall_vp.rs
+++ b/litebox_platform_lvbs/src/mshv/hvcall_vp.rs
@@ -229,11 +229,12 @@ pub fn init_vtl_ap(core: u32) -> Result<u64, HypervCallError> {
     // has high-canonical mappings, so these are ready to use as-is for the
     // AP's initial VP context.
     let rip: u64 = get_entry();
-    // All APs share this single boot stack. `_ap_start` spin-acquires
-    // `AP_BOOT_STACK_LOCK` before touching the stack and releases it after
-    // switching to a per-CPU kernel stack, so concurrent AP entry is safe.
+    // SAFETY: We don't support concurrent AP/VTL initialization and thus share
+    // the same stack pointer. If we plan to support concurrent initialization,
+    // we should provide separate stack pointers for each AP (which might not
+    // scale if there are several 100s of APs).
     //
-    // This RSP is part of `HV_INITIAL_VP_CONTEXT` provided to Hyper-V
+    // This RSP is part of `HV_INITIAL_VP_CONTEXT` provided to the Hyper-V
     // via `HvCallEnableVpVtl`. It is expected to be 16-byte aligned.
     let rsp = get_address_of_special_page(VTL1_KERNEL_STACK_PAGE) + PAGE_SIZE as u64;
     let tss = get_address_of_special_page(VTL1_TSS_PAGE);
diff --git a/litebox_runner_lvbs/src/main.rs b/litebox_runner_lvbs/src/main.rs
index d6a393e68..ea13e4226 100644
--- a/litebox_runner_lvbs/src/main.rs
+++ b/litebox_runner_lvbs/src/main.rs
@@ -6,7 +6,6 @@
 #![no_main]
 
 use core::arch::{asm, naked_asm};
-use core::sync::atomic::{AtomicBool, Ordering};
 use litebox_platform_lvbs::{
     arch::{enable_extended_states, enable_fsgsbase, enable_smep_smap, instrs::hlt_loop},
     host::{
@@ -21,21 +20,6 @@ use litebox_platform_lvbs::{
 use x86_64::VirtAddr;
 use x86_64::structures::paging::PageTableFlags;
 
-/// Spinlock protecting the shared AP boot stack (`VTL1_KERNEL_STACK_PAGE`).
-///
-/// All APs receive the same initial RSP via `hvcall_enable_vp_vtl`. VTL0
-/// controls when APs enter VTL1, so multiple APs may start concurrently.
-/// Each AP spin-acquires this lock before touching the boot stack, and
-/// releases it after switching to its own heap-allocated per-CPU kernel stack.
-static AP_BOOT_STACK_LOCK: AtomicBool = AtomicBool::new(false);
-
-/// Release the AP boot stack spinlock.
-///
-/// Called after the current core has switched RSP to its per-CPU kernel stack.
-extern "C" fn release_boot_stack_lock() {
-    AP_BOOT_STACK_LOCK.store(false, Ordering::Release);
-}
-
 /// ELF64 relocation entry
 #[repr(C)]
 struct Elf64Rela {
@@ -336,32 +320,10 @@ unsafe extern "C" fn high_canonical_trampoline() -> ! {
 /// AP entry point: Entered directly by Hyper-V via `hvcall_enable_vp_vtl`
 /// (the VP context's RIP is set to this symbol). APs inherit the BSP's CR3,
 /// so they already run at high-canonical VAs and need no remap.
-///
-/// # Safety
-///
-/// Must only be used as the initial RIP for an AP's VP context.
-#[unsafe(naked)]
+#[expect(clippy::missing_safety_doc)]
 #[unsafe(no_mangle)]
 pub unsafe extern "C" fn _ap_start() -> ! {
-    naked_asm!(
-        // Spin-acquire the AP boot stack lock entirely in registers.
-        // No stack usage is permitted until the lock is held, because
-        // another AP may still be running on this same stack.
-        "lea rcx, [rip + {lock}]",
-        "2:",
-        "mov al, 1",
-        "xchg byte ptr [rcx], al",
-        "test al, al",
-        "jz 3f",
-        "pause",
-        "jmp 2b",
-        "3:",
-        // This AP has acquired the lock and exclusively owns the boot stack.
-        "xor edi, edi", // is_bsp = false
-        "jmp {common_start}",
-        lock = sym AP_BOOT_STACK_LOCK,
-        common_start = sym common_start,
-    );
+    unsafe { common_start(false) }
 }
 
 /// Shared boot path for BSP and AP cores.
@@ -385,19 +347,11 @@ unsafe extern "C" fn common_start(is_bsp: bool) -> ! {
     let is_bsp_u32 = u32::from(is_bsp);
     unsafe {
         asm!(
-            // Now use this core's heap-allocated kernel stack.
             "mov rsp, gs:[{kernel_sp_off}]",
-            // The boot stack is no longer in use. Release the AP boot stack
-            // spinlock so the next AP can proceed. For the BSP this is a
-            // harmless no-op (the lock was never held).
-            "push rdi",
-            "call {release_lock}",
-            "pop rdi",
             "call {kernel_main}",
             kernel_sp_off = const { PerCpuVariablesAsm::kernel_stack_ptr_offset() },
             in("edi") is_bsp_u32,
-            release_lock = sym release_boot_stack_lock,
-            kernel_main = sym kernel_main,
+            kernel_main = sym kernel_main
         );
     }