diff --git a/litebox/src/fs/in_mem.rs b/litebox/src/fs/in_mem.rs index 11683a455..483982304 100644 --- a/litebox/src/fs/in_mem.rs +++ b/litebox/src/fs/in_mem.rs @@ -785,6 +785,21 @@ impl super::FileSystem for FileSystem blksize: BLOCK_SIZE, }) } + + fn get_static_backing_data(&self, fd: &FileFd) -> Option<&'static [u8]> { + let descriptor_table = self.litebox.descriptor_table(); + let entry = descriptor_table.get_entry(fd)?; + match &entry.entry { + Descriptor::File { file, .. } => { + let file = file.read(); + match &file.data { + alloc::borrow::Cow::Borrowed(slice) => Some(*slice), + alloc::borrow::Cow::Owned(_) => None, + } + } + Descriptor::Dir { .. } => None, + } + } } struct RootDir { diff --git a/litebox/src/fs/layered.rs b/litebox/src/fs/layered.rs index 674f6b4c3..d9fda57b4 100644 --- a/litebox/src/fs/layered.rs +++ b/litebox/src/fs/layered.rs @@ -1336,6 +1336,21 @@ impl< blksize, }) } + + fn get_static_backing_data( + &self, + fd: &FileFd, + ) -> Option<&'static [u8]> { + let entry = self + .litebox + .descriptor_table() + .with_entry(fd, |descriptor| Arc::clone(&descriptor.entry.entry))?; + match entry.as_ref() { + EntryX::Upper { fd } => self.upper.get_static_backing_data(fd), + EntryX::Lower { fd } => self.lower.get_static_backing_data(fd), + EntryX::Tombstone => unreachable!(), + } + } } struct Descriptor { diff --git a/litebox/src/fs/mod.rs b/litebox/src/fs/mod.rs index f66a32e26..d8b3d6a2e 100644 --- a/litebox/src/fs/mod.rs +++ b/litebox/src/fs/mod.rs @@ -136,6 +136,17 @@ pub trait FileSystem: private::Sealed + FdEnabledSubsystem { /// Equivalent to [`Self::file_status`], but open an open `fd` instead. fn fd_file_status(&self, fd: &TypedFd) -> Result; + + /// Get static backing data for a file, if available and supported. + /// + /// This method returns the (entire) underlying static byte slice if the file's contents are + /// backed by borrowed static data (e.g., loaded via `initialize_primarily_read_heavy_file`). 
+ /// + /// Returns `None` if no static backing data is available or supported. + #[expect(unused_variables, reason = "default body, non-underscored param names")] + fn get_static_backing_data(&self, fd: &TypedFd) -> Option<&'static [u8]> { + None + } } bitflags! { diff --git a/litebox/src/mm/linux.rs b/litebox/src/mm/linux.rs index 13262c78c..e7ed436c0 100644 --- a/litebox/src/mm/linux.rs +++ b/litebox/src/mm/linux.rs @@ -315,6 +315,19 @@ impl + 'static, const ALIGN: usize> Vmem self.vmas.iter() } + /// Insert an already-allocated region (e.g., via CoW) without calling the platform allocator. + /// + /// Any existing tracked mappings that overlap `range` are silently removed from tracking + /// (without calling the platform deallocator) before inserting. Use [`Self::overlapping`] to + /// check for overlap before running this if needed. + pub(super) fn register_existing_mapping_overwrite( + &mut self, + range: PageRange, + vma: VmArea, + ) { + self.vmas.insert(range.into(), vma); + } + /// Gets an iterator over all the stored ranges that are /// either partially or completely overlapped by the given range. pub(super) fn overlapping( diff --git a/litebox/src/mm/mod.rs b/litebox/src/mm/mod.rs index bc4b1781f..8f945bdd1 100644 --- a/litebox/src/mm/mod.rs +++ b/litebox/src/mm/mod.rs @@ -14,8 +14,8 @@ use core::ops::Range; use alloc::vec::Vec; use linux::{ - CreatePagesFlags, MappingError, PageFaultError, PageRange, VmFlags, Vmem, VmemPageFaultHandler, - VmemProtectError, VmemUnmapError, + CreatePagesFlags, MappingError, PageFaultError, PageRange, VmArea, VmFlags, Vmem, + VmemPageFaultHandler, VmemProtectError, VmemUnmapError, }; use crate::{ @@ -591,6 +591,39 @@ where ) } + /// Register an already-allocated memory region in the VMA tracker. 
+ /// + /// This is used when memory has been allocated by some means other than the normal + /// `create_*_pages` path (e.g., CoW mappings created directly by the platform), so that the + /// page manager tracks the region for future `mprotect`, `munmap`, etc. + /// + /// If `replace` is `true`, any overlapping tracked mappings are evicted from the tracker + /// (without calling the platform deallocator) before inserting. Otherwise, returns `None` + /// without registering if the provided `range` overlaps with any existing mapping. + /// + /// # Safety + /// + /// The `range` must be an already-mapped region with the given `permissions`. + #[must_use] + pub unsafe fn register_existing_mapping( + &self, + range: PageRange, + permissions: MemoryRegionPermissions, + is_file_backed: bool, + replace: bool, + ) -> Option<()> { + let vma = VmArea::new( + VmFlags::from(permissions) | VmFlags::VM_MAY_ACCESS_FLAGS, + is_file_backed, + ); + let mut vmem = self.vmem.write(); + if !replace && vmem.overlapping(range.into()).next().is_some() { + return None; + } + vmem.register_existing_mapping_overwrite(range, vma); + Some(()) + } + /// Returns all mappings in a vector. pub fn mappings(&self) -> Vec<(Range, VmFlags)> { self.vmem diff --git a/litebox/src/platform/page_mgmt.rs b/litebox/src/platform/page_mgmt.rs index d53f5bc3c..c4fca057a 100644 --- a/litebox/src/platform/page_mgmt.rs +++ b/litebox/src/platform/page_mgmt.rs @@ -172,6 +172,25 @@ pub trait PageManagementProvider: RawPointerProvider { /// /// Note that the returned ranges should be `ALIGN`-aligned. fn reserved_pages(&self) -> impl Iterator>; + + /// Attempt to allocate pages with copy-on-write semantics backed by static data. + /// + /// This method allows platforms that support it to create CoW mappings instead of performing + /// expensive page-by-page memory copies. This is particularly useful when mapping pre-loaded + /// file data that was mmap'd by the host. 
+ /// + /// The default implementation reports CoW as unsupported. Platforms that DO support CoW should + /// override this method to unlock better performance. + #[expect(unused_variables, reason = "default body, non-underscored param names")] + fn try_allocate_cow_pages( + &self, + suggested_start: usize, + source_data: &'static [u8], + permissions: MemoryRegionPermissions, + fixed_address_behavior: FixedAddressBehavior, + ) -> Result, CowAllocationError> { + Err(CowAllocationError::UnsupportedByPlatform) + } } /// Behavior when allocating pages at a fixed address. @@ -243,3 +262,29 @@ pub enum PermissionUpdateError { #[error("provided range contains unallocated pages")] Unallocated, } + +/// Possible errors for [`PageManagementProvider::try_allocate_cow_pages`] +/// +/// ```text +/// ____________________ +/// ( Maybe the grass is ) +/// ( greener on the ) +/// ( other side? ) +/// -------------------- +/// o ^__^ +/// o (oo)\_______ +/// (__)\ )\/\ +/// ||----w | +/// || || +/// ``` +#[derive(Error, Debug)] +pub enum CowAllocationError { + #[error("copy-on-write page allocation is not supported for this particular platform")] + UnsupportedByPlatform, + #[error("source region is not copy-on-writable")] + UnsupportedSourceRegion, + #[error("unaligned request")] + Unaligned, + #[error("internal failure in creating CoW pages")] + InternalFailure, +} diff --git a/litebox_platform_linux_userland/src/lib.rs b/litebox_platform_linux_userland/src/lib.rs index 2b2c2a3d3..2c8039e2d 100644 --- a/litebox_platform_linux_userland/src/lib.rs +++ b/litebox_platform_linux_userland/src/lib.rs @@ -9,12 +9,15 @@ use std::cell::Cell; use std::os::fd::{AsRawFd as _, FromRawFd as _}; +use std::path::PathBuf; use std::sync::atomic::{AtomicI32, AtomicU32, Ordering}; use std::time::Duration; use litebox::fs::OFlags; use litebox::platform::UnblockedOrTimedOut; -use litebox::platform::page_mgmt::{FixedAddressBehavior, MemoryRegionPermissions}; +use litebox::platform::page_mgmt::{ 
+ CowAllocationError, FixedAddressBehavior, MemoryRegionPermissions, +}; use litebox::platform::{ImmediatelyWokenUp, RawConstPointer as _}; use litebox::shim::ContinueOperation; use litebox::utils::{ReinterpretSignedExt, ReinterpretUnsignedExt as _, TruncateExt}; @@ -40,6 +43,9 @@ pub struct LinuxUserland { reserved_pages: Vec>, /// The base address of the VDSO. vdso_address: Option, + /// CoW-eligible memory regions. Maps the start address of each static slice to the info needed + /// to re-mmap the file. + cow_regions: std::sync::RwLock>, } impl core::fmt::Debug for LinuxUserland { @@ -48,6 +54,15 @@ impl core::fmt::Debug for LinuxUserland { } } +/// Information about a CoW-eligible memory region backed by a file. +#[derive(Debug, Clone)] +struct CowRegionInfo { + /// The path to the backing file on the host filesystem. + file_path: PathBuf, + /// Length of the backing file. + file_length: usize, +} + const IF_NAMESIZE: usize = 16; /// Use TUN device const IFF_TUN: i32 = 0x0001; @@ -163,10 +178,53 @@ impl LinuxUserland { seccomp_interception_enabled: std::sync::atomic::AtomicBool::new(false), reserved_pages, vdso_address, + cow_regions: std::sync::RwLock::new(std::collections::BTreeMap::new()), }; Box::leak(Box::new(platform)) } + /// Register a CoW-eligible memory region backed by a file. + /// + /// # Panics + /// + /// Panics if an already-registered region starts within the address range of `data`. + pub fn register_cow_region(&self, data: &'static [u8], file_path: impl Into) { + let start = data.as_ptr() as usize; + let info = CowRegionInfo { + file_path: file_path.into(), + file_length: data.len(), + }; + + let mut regions = self.cow_regions.write().unwrap(); + assert!( + regions.range(start..start + data.len()).next().is_none(), + "Attempting to register an overlapping region" + ); + let old = regions.insert(start, info); + assert!(old.is_none()); + } + + /// Look up the file backing a static slice for CoW mapping. 
+ /// + /// Returns `Some((file_path, offset_in_file))` if the slice is backed by a registered + /// CoW region, `None` otherwise. + fn lookup_cow_region(&self, source_data: &'static [u8]) -> Option<(PathBuf, usize)> { + let slice_start = source_data.as_ptr() as usize; + let slice_len = source_data.len(); + + let regions = self.cow_regions.read().unwrap(); + + if let Some((®ion_start, info)) = regions.range(..=slice_start).next_back() { + let region_end = region_start.checked_add(info.file_length).unwrap(); + let slice_end = slice_start.checked_add(slice_len).unwrap(); + + if slice_start >= region_start && slice_end <= region_end { + return Some((info.file_path.clone(), slice_start - region_start)); + } + } + None + } + /// Enable seccomp syscall interception on the platform. /// /// # Panics @@ -1490,6 +1548,89 @@ impl litebox::platform::PageManagementProvider for Li fn reserved_pages(&self) -> impl Iterator> { self.reserved_pages.iter() } + + fn try_allocate_cow_pages( + &self, + suggested_start: usize, + source_data: &'static [u8], + permissions: MemoryRegionPermissions, + fixed_address_behavior: FixedAddressBehavior, + ) -> Result, CowAllocationError> { + let Some((file_path, file_offset)) = self.lookup_cow_region(source_data) else { + return Err(CowAllocationError::UnsupportedSourceRegion); + }; + if !file_offset.is_multiple_of(ALIGN) { + return Err(CowAllocationError::Unaligned); + } + + let file_path_cstr = + std::ffi::CString::new(file_path.as_os_str().as_encoded_bytes()).unwrap(); + // TODO(jb): We should likely be storing pre-opened FDs, right? + let fd = unsafe { + syscalls::syscall4( + syscalls::Sysno::open, + file_path_cstr.as_ptr() as usize, + OFlags::RDONLY.bits() as usize, + 0, + // Unused by the syscall but would be checked by Seccomp filter if enabled. 
+ syscall_intercept::SYSCALL_ARG_MAGIC, + ) + }; + let fd = fd.expect("file should remain unchanged on host"); + + let mut flags = MapFlags::MAP_PRIVATE; + match fixed_address_behavior { + FixedAddressBehavior::Hint => {} + FixedAddressBehavior::Replace => flags |= MapFlags::MAP_FIXED, + FixedAddressBehavior::NoReplace => flags |= MapFlags::MAP_FIXED_NOREPLACE, + } + + let result = unsafe { + syscalls::syscall6( + { + #[cfg(target_arch = "x86_64")] + { + syscalls::Sysno::mmap + } + #[cfg(target_arch = "x86")] + { + syscalls::Sysno::mmap2 + } + }, + suggested_start, + source_data.len(), + prot_flags(permissions).bits().reinterpret_as_unsigned() as usize, + (flags.bits().reinterpret_as_unsigned() + // This is to ensure it won't be intercepted by Seccomp if enabled. + | syscall_intercept::MMAP_FLAG_MAGIC) as usize, + fd, + { + #[cfg(target_arch = "x86_64")] + { + file_offset + } + #[cfg(target_arch = "x86")] + { + // mmap2 takes offset in pages, not bytes + file_offset / ALIGN + } + }, + ) + }; + + let _ = unsafe { + syscalls::syscall2( + syscalls::Sysno::close, + fd, // This is to ensure it won't be intercepted by Seccomp if enabled. 
+ syscall_intercept::SYSCALL_ARG_MAGIC, + ) + }; + + match result { + Ok(ptr) => Ok(UserMutPtr::from_usize(ptr)), + Err(_) => Err(CowAllocationError::InternalFailure), + } + } } impl litebox::platform::StdioProvider for LinuxUserland { diff --git a/litebox_runner_linux_userland/src/lib.rs b/litebox_runner_linux_userland/src/lib.rs index 6fa68a5c3..b702decec 100644 --- a/litebox_runner_linux_userland/src/lib.rs +++ b/litebox_runner_linux_userland/src/lib.rs @@ -76,18 +76,28 @@ pub enum InterceptionBackend { static REQUIRE_RTLD_AUDIT: core::sync::atomic::AtomicBool = core::sync::atomic::AtomicBool::new(false); -fn mmapped_file_data(path: impl AsRef) -> Result<&'static [u8]> { +struct MmappedFile { + data: &'static [u8], + abs_path: PathBuf, +} + +fn mmapped_file(path: impl AsRef) -> Result { let path = path.as_ref(); - let file = std::fs::File::open(path)?; - // SAFETY: We assume that the file given to us is not going to change _externally_ while in - // the middle of execution. Since we are mapping it as read-only and mapping it only once, - // we are not planning to change it either. With both these in mind, this call is safe. - // - // We need to leak the `Mmap` object, so that it stays alive until the end of the program, - // rather than being unmapped at function finish (i.e., to get the `'static` lifetime). - Ok(Box::leak(Box::new(unsafe { Mmap::map(&file) }.map_err( - |e| anyhow!("Could not read tar file at {}: {}", path.display(), e), - )?))) + let abs_path = std::path::absolute(path) + .map_err(|e| anyhow!("Could not get absolute path for {}: {}", path.display(), e))?; + let file = std::fs::File::open(&abs_path)?; + let data = { + // SAFETY: We assume that the file given to us is not going to change _externally_ while in + // the middle of execution. Since we are mapping it as read-only and mapping it only once, + // we are not planning to change it either. With both these in mind, this call is safe. 
+ // + // We need to leak the `Mmap` object, so that it stays alive until the end of the program, + // rather than being unmapped at function finish (i.e., to get the `'static` lifetime). + Box::leak(Box::new(unsafe { Mmap::map(&file) }.map_err(|e| { + anyhow!("Could not read tar file at {}: {}", path.display(), e) + })?)) + }; + Ok(MmappedFile { data, abs_path }) } /// Run Linux programs with LiteBox on unmodified Linux @@ -104,6 +114,8 @@ pub fn run(cli_args: CliArgs) -> Result<()> { ) } + let mut cow_eligible_regions: Vec = Vec::new(); + let (ancestor_modes_and_users, prog_data): ( Vec<(litebox::fs::Mode, u32)>, alloc::borrow::Cow<'static, [u8]>, @@ -122,13 +134,15 @@ pub fn run(cli_args: CliArgs) -> Result<()> { ) }) .collect(); - let data = mmapped_file_data(prog)?; + let file = mmapped_file(&prog)?; let data = if cli_args.rewrite_syscalls { - litebox_syscall_rewriter::hook_syscalls_in_elf(data, None) + litebox_syscall_rewriter::hook_syscalls_in_elf(file.data, None) .unwrap() .into() } else { - data.into() + let data = file.data.into(); + cow_eligible_regions.push(file); + data }; (modes, data) }; @@ -136,7 +150,7 @@ pub fn run(cli_args: CliArgs) -> Result<()> { if tar_file.extension().and_then(|x| x.to_str()) != Some("tar") { anyhow::bail!("Expected a .tar file, found {}", tar_file.display()); } - mmapped_file_data(tar_file)? + mmapped_file(tar_file)?.data } else { litebox::fs::tar_ro::EMPTY_TAR_FILE }; @@ -147,6 +161,11 @@ pub fn run(cli_args: CliArgs) -> Result<()> { // systrap/sigsys interception, or binary rewriting interception. Currently // `litebox_platform_linux_userland` does not provide a way to pick between the two. 
let platform = Platform::new(cli_args.tun_device_name.as_deref()); + + for file in cow_eligible_regions { + platform.register_cow_region(file.data, file.abs_path); + } + litebox_platform_multiplex::set_platform(platform); let mut shim_builder = litebox_shim_linux::LinuxShimBuilder::new(); let litebox = shim_builder.litebox(); diff --git a/litebox_shim_linux/src/syscalls/mm.rs b/litebox_shim_linux/src/syscalls/mm.rs index d0ba581f4..429932328 100644 --- a/litebox_shim_linux/src/syscalls/mm.rs +++ b/litebox_shim_linux/src/syscalls/mm.rs @@ -5,8 +5,11 @@ //! Most of these syscalls which are not backed by files are implemented in [`litebox_common_linux::mm`]. use litebox::{ - mm::linux::{MappingError, PAGE_SIZE}, - platform::RawMutPointer, + mm::linux::{MappingError, PAGE_SIZE, PageRange}, + platform::{ + PageManagementProvider, RawConstPointer, RawMutPointer, + page_mgmt::{FixedAddressBehavior, MemoryRegionPermissions}, + }, }; use litebox_common_linux::{MRemapFlags, MapFlags, ProtFlags, errno::Errno}; @@ -72,6 +75,130 @@ impl Task { flags: MapFlags, fd: i32, offset: usize, + ) -> Result, MappingError> { + if let Some(cow_result) = + self.try_cow_mmap_file(suggested_addr, len, &prot, &flags, fd, offset) + { + return cow_result; + } + self.do_mmap_file_memcpy(suggested_addr, len, prot, flags, fd, offset) + } + + /// Attempt to create a CoW mapping for a file with static backing data. + /// + /// Returns `Some(Ok(ptr))` if a CoW mapping was created and registered, + /// `None` if CoW is not applicable or the platform attempt failed (fall back to memcpy). + // TODO(jb): does this need to be Option-Result or can it just be Option? 
+ fn try_cow_mmap_file( + &self, + suggested_addr: Option, + len: usize, + prot: &ProtFlags, + flags: &MapFlags, + fd: i32, + offset: usize, + ) -> Option, MappingError>> { + if !len.is_multiple_of(PAGE_SIZE) { + return None; + } + + let Ok(fd) = u32::try_from(fd) else { + return None; + }; + + let files = self.files.borrow(); + let raw_fd = match files.file_descriptors.read().get_fd(fd)? { + crate::Descriptor::LiteBoxRawFd(raw_fd) => *raw_fd, + _ => return None, + }; + + let static_data = files + .run_on_raw_fd( + raw_fd, + |typed_fd| self.global.fs.get_static_backing_data(typed_fd), + |_| None, + |_| None, + ) + .ok()??; + + if offset > static_data.len() { + return None; + } + + let available_len = static_data.len().saturating_sub(offset); + if available_len < len { + // Not enough static data past `offset` to back the whole requested length + return None; + } + + let fixed_behavior = if flags.contains(MapFlags::MAP_FIXED_NOREPLACE) { + FixedAddressBehavior::NoReplace + } else if flags.contains(MapFlags::MAP_FIXED) { + FixedAddressBehavior::Replace + } else { + FixedAddressBehavior::Hint + }; + + let permissions = { + let mut perms = MemoryRegionPermissions::empty(); + perms.set( + MemoryRegionPermissions::READ, + prot.contains(ProtFlags::PROT_READ), + ); + perms.set( + MemoryRegionPermissions::WRITE, + prot.contains(ProtFlags::PROT_WRITE), + ); + perms.set( + MemoryRegionPermissions::EXEC, + prot.contains(ProtFlags::PROT_EXEC), + ); + perms + }; + + // XXX: `try_allocate_cow_pages` and `register_existing_mapping` are not called under a + // unified lock, so there is a theoretical race if two threads concurrently attempt a + // fixed-address mapping with replacement at the same address. In practice this is benign: + // if a program races like this both threads will register the same mapping anyway. Updating + // to a begin/attempt/commit scheme could close this race window entirely. 
+ match <_ as PageManagementProvider<{ PAGE_SIZE }>>::try_allocate_cow_pages( + litebox_platform_multiplex::platform(), + suggested_addr.unwrap_or(0), + &static_data[offset..offset + len], + permissions, + fixed_behavior, + ) { + Ok(ptr) => { + let range = + PageRange::new(ptr.as_usize(), ptr.as_usize().checked_add(len).unwrap()) + .unwrap(); + // SAFETY: ptr is the freshly CoW-mapped region of exactly `len` bytes with + // `permissions`. + unsafe { + self.global.pm.register_existing_mapping( + range, + permissions, + true, + fixed_behavior == FixedAddressBehavior::Replace, + ) + } + .unwrap(); + Some(Ok(ptr)) + } + Err(_cow_not_supported) => None, + } + } + + /// Fallback mmap implementation using page-by-page memcpy, for files where the CoW attempt + /// fails (either due to lack of support on platform, or non-static-backed data, etc.) + fn do_mmap_file_memcpy( + &self, + suggested_addr: Option, + len: usize, + prot: ProtFlags, + flags: MapFlags, + fd: i32, + offset: usize, ) -> Result, MappingError> { let op = |ptr: MutPtr| -> Result { // Note a malicious user may unmap ptr while we are reading.