Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 16 additions & 21 deletions crates/sandlock-core/src/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use std::os::fd::{AsRawFd, FromRawFd, OwnedFd, RawFd};
use crate::policy::{FsIsolation, Policy};
use crate::seccomp::bpf::{self, stmt, jump};
use crate::sys::structs::{
AF_INET, AF_INET6, AF_NETLINK,
AF_INET, AF_INET6,
BPF_ABS, BPF_ALU, BPF_AND, BPF_JEQ, BPF_JSET, BPF_JMP, BPF_K, BPF_LD, BPF_RET, BPF_W,
CLONE_NS_FLAGS, DEFAULT_DENY_SYSCALLS, EPERM,
SECCOMP_RET_ALLOW, SECCOMP_RET_ERRNO,
Expand Down Expand Up @@ -280,6 +280,21 @@ pub fn notif_syscalls(policy: &Policy) -> Vec<u32> {
libc::SYS_getdents64 as u32,
libc::SYS_getdents as u32,
]);

// Netlink virtualization (always on):
// socket, bind, getsockname — swap in a unix socketpair for AF_NETLINK
// recvfrom, recvmsg — zero msg_name so glibc accepts the reply
// (kernel only writes sun_family on unix
// recvmsg, leaving nl_pid uninitialized)
// close — unregister (pid, fd) so reuse doesn't
// collide with the cookie set
// Send traffic flows through the real socketpair untouched.
nrs.push(libc::SYS_socket as u32);
nrs.push(libc::SYS_bind as u32);
nrs.push(libc::SYS_getsockname as u32);
nrs.push(libc::SYS_recvfrom as u32);
nrs.push(libc::SYS_recvmsg as u32);
nrs.push(libc::SYS_close as u32);
// Virtualize sched_getaffinity so nproc/sysconf agree with /proc/cpuinfo
if policy.num_cpus.is_some() {
nrs.push(libc::SYS_sched_getaffinity as u32);
Expand Down Expand Up @@ -447,7 +462,6 @@ pub fn deny_syscall_numbers(policy: &Policy) -> Vec<u32> {
/// - clone: block namespace creation flags
/// - ioctl: block TIOCSTI, TIOCLINUX
/// - prctl: block PR_SET_DUMPABLE, PR_SET_SECUREBITS, PR_SET_PTRACER
/// - socket: block all AF_NETLINK sockets (network topology enumeration)
/// - socket: block SOCK_RAW/SOCK_DGRAM on AF_INET/AF_INET6 (with type mask)
pub fn arg_filters(policy: &Policy) -> Vec<SockFilter> {
let ret_errno = SECCOMP_RET_ERRNO | EPERM as u32;
Expand Down Expand Up @@ -497,22 +511,6 @@ pub fn arg_filters(policy: &Policy) -> Vec<SockFilter> {
insns.push(stmt(BPF_RET | BPF_K, ret_errno));
}

// --- socket: block all AF_NETLINK sockets ---
// Netlink sockets allow network topology enumeration (interfaces, routes,
// ARP, etc.) which leaks host network configuration. Block the entire
// AF_NETLINK family, not just NETLINK_SOCK_DIAG.
// 5 instructions:
// LD NR
// JEQ socket → +0, skip 3
// LD arg0 (domain)
// JEQ AF_NETLINK → +0, skip 1
// RET ERRNO
insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_NR));
insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, nr_socket, 0, 3));
insns.push(stmt(BPF_LD | BPF_W | BPF_ABS, OFFSET_ARGS0_LO));
insns.push(jump(BPF_JMP | BPF_JEQ | BPF_K, AF_NETLINK, 0, 1));
insns.push(stmt(BPF_RET | BPF_K, ret_errno));

// --- socket: block SOCK_RAW and/or SOCK_DGRAM on AF_INET/AF_INET6 ---
let mut blocked_types: Vec<u32> = Vec::new();
if policy.no_raw_sockets {
Expand Down Expand Up @@ -1114,9 +1112,6 @@ mod tests {
// Should contain JEQ for PR_SET_DUMPABLE
assert!(filters.iter().any(|f| f.code == (BPF_JMP | BPF_JEQ | BPF_K)
&& f.k == PR_SET_DUMPABLE));
// Should contain JEQ for socket + AF_NETLINK (all netlink blocked)
assert!(filters.iter().any(|f| f.code == (BPF_JMP | BPF_JEQ | BPF_K)
&& f.k == AF_NETLINK));
}

#[test]
Expand Down
1 change: 1 addition & 0 deletions crates/sandlock-core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ pub(crate) mod random;
pub(crate) mod time;
pub(crate) mod cow;
pub(crate) mod checkpoint;
pub mod netlink;
pub(crate) mod procfs;
pub(crate) mod port_remap;
pub mod pipeline;
Expand Down
232 changes: 232 additions & 0 deletions crates/sandlock-core/src/netlink/handlers.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,232 @@
use std::os::unix::io::{FromRawFd, OwnedFd, RawFd};
use std::sync::Arc;

use crate::netlink::{proxy, state::NetlinkState};
use crate::seccomp::notif::{read_child_mem, write_child_mem, NotifAction, OnInjectSuccess};
use crate::sys::structs::SeccompNotif;

const AF_NETLINK: u64 = 16;
const NETLINK_ROUTE: u64 = 0;

/// Map a thread id to the id of the thread group (process) that owns it.
///
/// The id delivered in a seccomp notification identifies the calling
/// thread, but file descriptors belong to the whole process. Cookie
/// entries are therefore keyed by TGID so that an fd registered by one
/// thread is recognised when a sibling thread uses it.
///
/// The value is taken from the first parseable `Tgid:` line of
/// `/proc/<tid>/status`. If the file cannot be read, or no such line
/// parses as an `i32`, `tid` itself is returned.
fn tgid_of(tid: i32) -> i32 {
    std::fs::read_to_string(format!("/proc/{}/status", tid))
        .ok()
        .and_then(|status| {
            status
                .lines()
                .filter_map(|entry| entry.strip_prefix("Tgid:"))
                .find_map(|value| value.trim().parse::<i32>().ok())
        })
        // Fallback: without a readable status file, treat the tid as the tgid.
        .unwrap_or(tid)
}

/// Read a plain-old-data value of type `T` from the child's memory at `addr`.
///
/// The bytes are fetched through the shared `notif::read_child_mem` helper
/// (which, per the notes that accompany it, validates the notification id
/// around the read — confirm against that helper). `notif_fd`, `id` and
/// `pid` are forwarded to it unchanged.
///
/// Returns `None` when the helper reports an error or hands back fewer
/// than `size_of::<T>()` bytes; a short buffer must never be reinterpreted
/// as a full `T`.
fn read_struct<T: Copy>(
    notif_fd: RawFd,
    id: u64,
    pid: u32,
    addr: usize,
) -> Option<T> {
    let wanted = std::mem::size_of::<T>();
    let bytes = read_child_mem(notif_fd, id, pid, addr as u64, wanted).ok()?;
    if bytes.len() < wanted {
        return None;
    }
    // SAFETY: `bytes` holds at least `size_of::<T>()` initialised bytes
    // (checked just above) and `read_unaligned` places no alignment
    // requirement on the source pointer. `T: Copy` means duplicating the
    // bit pattern cannot double-drop anything. Callers must only
    // instantiate `T` with types for which every bit pattern is valid.
    Some(unsafe { std::ptr::read_unaligned(bytes.as_ptr() as *const T) })
}

/// Intercept `socket(AF_NETLINK, *, NETLINK_ROUTE)` and hand the child one
/// end of a `socketpair(AF_UNIX, SOCK_SEQPACKET)` instead of a real netlink
/// socket. The other end is given to `proxy::spawn_responder`, which
/// answers with synthesized NETLINK_ROUTE replies.
///
/// Outcomes:
/// - domain is not `AF_NETLINK`: `NotifAction::Continue` (kernel runs the
///   syscall as issued);
/// - `AF_NETLINK` with any protocol other than `NETLINK_ROUTE`:
///   `Errno(EAFNOSUPPORT)`;
/// - socketpair creation or the non-blocking setup fails: `Errno(ENOMEM)`;
/// - otherwise `InjectFdSendTracked`, whose success callback records
///   `(tgid, child_fd)` in `state`.
pub async fn handle_socket(
    notif: &SeccompNotif,
    state: &Arc<NetlinkState>,
) -> NotifAction {
    // `socket(2)` declares its arguments as `int`, so the kernel acts on
    // the low 32 bits of each register only. Compare the same 32 bits
    // here: judging the full 64-bit value would let a raw syscall with
    // junk in the upper half (e.g. 0x1_0000_0010) be waved through as
    // "not netlink" and then open a real AF_NETLINK socket.
    let domain = u64::from(notif.data.args[0] as u32);
    let protocol = u64::from(notif.data.args[2] as u32);

    if domain != AF_NETLINK {
        return NotifAction::Continue;
    }
    if protocol != NETLINK_ROUTE {
        return NotifAction::Errno(libc::EAFNOSUPPORT);
    }

    let mut fds = [0i32; 2];
    // SAFETY: `fds` is a valid, writable array of two C ints, which is
    // exactly what socketpair(2) fills in on success.
    let rc = unsafe {
        libc::socketpair(
            libc::AF_UNIX,
            libc::SOCK_SEQPACKET | libc::SOCK_CLOEXEC,
            0,
            fds.as_mut_ptr(),
        )
    };
    if rc != 0 {
        return NotifAction::Errno(libc::ENOMEM);
    }
    // fds[0] → supervisor side (responder owns)
    // fds[1] → child side (injected)
    //
    // The supervisor end is driven by a tokio task via AsyncFd, so it
    // must be non-blocking. The child end stays blocking (glibc's
    // netlink code expects blocking semantics).
    //
    // SAFETY: fds[0] was just returned by socketpair and is still open.
    let flags = unsafe { libc::fcntl(fds[0], libc::F_GETFL) };
    if flags < 0
        // SAFETY: as above; F_SETFL only alters the file status flags.
        || unsafe { libc::fcntl(fds[0], libc::F_SETFL, flags | libc::O_NONBLOCK) } < 0
    {
        // SAFETY: both descriptors are open and not yet owned by any
        // `OwnedFd`, so closing them here cannot double-close.
        unsafe {
            libc::close(fds[0]);
            libc::close(fds[1]);
        }
        return NotifAction::Errno(libc::ENOMEM);
    }
    // SAFETY: each descriptor is open and exclusively ours; ownership is
    // transferred to exactly one `OwnedFd` apiece.
    let responder_fd = unsafe { OwnedFd::from_raw_fd(fds[0]) };
    let child_fd = unsafe { OwnedFd::from_raw_fd(fds[1]) };

    // tgid, not tid: fds are process-scoped, so the cookie set must be
    // keyed per-process to be visible across threads of the same app.
    // The responder receives the same tgid so the `nlmsg_pid` in its
    // replies agrees with what `handle_getsockname` reports for this
    // process (glibc compares the two values).
    let tgid = tgid_of(notif.pid as i32);
    proxy::spawn_responder(responder_fd, tgid as u32);

    // Register (tgid, fd) from the on-success callback, i.e. with the
    // exact fd number the ADDFD step produced, rather than guessing the
    // slot afterwards. This way the entry is in the state map before the
    // child's syscall unblocks.
    let state = Arc::clone(state);
    NotifAction::InjectFdSendTracked {
        srcfd: child_fd,
        newfd_flags: libc::O_CLOEXEC as u32,
        on_success: OnInjectSuccess::new(move |child_fd_num| {
            state.register(tgid, child_fd_num);
        }),
    }
}

/// Pre-zero the caller's source-address buffer before a `recvmsg` /
/// `recvfrom` on a virtualized netlink fd is allowed to run.
///
/// The fd is really one end of a unix socketpair, so the kernel does not
/// fill in a full `sockaddr_nl`; whatever it leaves untouched keeps the
/// bytes that were there before the call. glibc's netlink receive loop
/// silently skips messages whose `source_addr.nl_pid != 0` (it treats
/// them as coming from a non-kernel peer), so stale stack bytes in that
/// field make replies disappear intermittently. Zeroing first makes
/// `nl_pid` read as 0.
///
/// Fds not registered in `state` are left alone. The zeroing is
/// best-effort: read/write failures are ignored, and the result is always
/// `NotifAction::Continue` so the kernel performs the actual receive.
pub async fn handle_netlink_recvmsg(
    notif: &SeccompNotif,
    state: &Arc<NetlinkState>,
    notif_fd: RawFd,
) -> NotifAction {
    // sizeof(struct sockaddr_nl)
    const SOCKADDR_NL_LEN: usize = 12;

    let fd = notif.data.args[0] as i32;
    let tgid = tgid_of(notif.pid as i32);
    if !state.is_cookie(tgid, fd) {
        return NotifAction::Continue;
    }

    let nr = notif.data.nr as i64;
    let pid = notif.pid;
    let id = notif.id;

    // Work out where the caller's address buffer lives and how many bytes
    // the caller says it can hold.
    let target: Option<(u64, usize)> = if nr == libc::SYS_recvmsg {
        // args: (fd, msghdr*, flags)
        let msghdr_ptr = notif.data.args[1] as usize;
        read_struct::<libc::msghdr>(notif_fd, id, pid, msghdr_ptr)
            .filter(|hdr| !hdr.msg_name.is_null())
            .map(|hdr| (hdr.msg_name as u64, hdr.msg_namelen as usize))
    } else if nr == libc::SYS_recvfrom {
        // args: (fd, buf, len, flags, src_addr*, addrlen_ptr)
        let src_addr = notif.data.args[4] as u64;
        let addrlen_ptr = notif.data.args[5] as u64;
        if src_addr != 0 && addrlen_ptr != 0 {
            read_child_mem(notif_fd, id, pid, addrlen_ptr, 4)
                .ok()
                .map(|b| {
                    let cap = u32::from_ne_bytes(b.try_into().unwrap_or([0; 4])) as usize;
                    (src_addr, cap)
                })
        } else {
            None
        }
    } else {
        None
    };

    if let Some((addr, cap)) = target {
        // Never write past what the caller declared, but do clear
        // whatever part of the sockaddr_nl fits: a buffer of 8..=11 bytes
        // still contains all of `nl_pid` (bytes 4..8).
        let len = cap.min(SOCKADDR_NL_LEN);
        if len > 0 {
            let zeros = [0u8; SOCKADDR_NL_LEN];
            let _ = write_child_mem(notif_fd, id, pid, addr, &zeros[..len]);
        }
    }

    NotifAction::Continue
}

/// Answer `bind` on a virtualized netlink fd without involving the kernel.
///
/// When `(tgid, fd)` is registered in `state`, the syscall is reported as
/// successful (`ReturnValue(0)`) and never executed. Any other fd is passed
/// through with `Continue`.
pub async fn handle_bind(
    notif: &SeccompNotif,
    state: &Arc<NetlinkState>,
) -> NotifAction {
    let owner = tgid_of(notif.pid as i32);
    let sock = notif.data.args[0] as i32;

    if !state.is_cookie(owner, sock) {
        return NotifAction::Continue;
    }
    NotifAction::ReturnValue(0)
}

/// Drop `(tgid, fd)` from the cookie set when the child closes a tracked
/// netlink fd.
///
/// Always returns `Continue`, so the kernel still performs the real
/// `close` regardless of whether the fd was tracked.
pub async fn handle_close(
    notif: &SeccompNotif,
    state: &Arc<NetlinkState>,
) -> NotifAction {
    let owner = tgid_of(notif.pid as i32);
    let closing = notif.data.args[0] as i32;

    // Untracked fds need no bookkeeping.
    if !state.is_cookie(owner, closing) {
        return NotifAction::Continue;
    }

    state.unregister(owner, closing);
    NotifAction::Continue
}

/// Emulate `getsockname` for a virtualized netlink fd by writing a
/// synthesized `sockaddr_nl` into the caller's buffer.
///
/// Fds not registered in `state` are passed through with `Continue`.
/// For tracked fds the result follows the kernel's own copy-out rules:
/// - `EFAULT` if `addrlen` cannot be read, the address cannot be written,
///   or the updated length cannot be written back;
/// - `EINVAL` if the caller-supplied length is negative;
/// - otherwise at most `*addrlen` bytes of the address are written,
///   `*addrlen` is set to the full (untruncated) size, and 0 is returned.
pub async fn handle_getsockname(
    notif: &SeccompNotif,
    state: &Arc<NetlinkState>,
    notif_fd: RawFd,
) -> NotifAction {
    // sizeof(struct sockaddr_nl)
    const SOCKADDR_NL_LEN: usize = 12;

    let fd = notif.data.args[0] as i32;
    let tgid = tgid_of(notif.pid as i32);
    if !state.is_cookie(tgid, fd) {
        return NotifAction::Continue;
    }

    // struct sockaddr_nl { u16 nl_family; u16 _pad; u32 nl_pid; u32 nl_groups; }
    //
    // The tgid serves as the synthesized nl_pid: it is stable across the
    // threads of one process and is the same value `handle_socket` gives
    // the responder for use in its replies.
    let mut addr = [0u8; SOCKADDR_NL_LEN];
    let nl_family = libc::AF_NETLINK as u16;
    addr[0..2].copy_from_slice(&nl_family.to_ne_bytes());
    addr[4..8].copy_from_slice(&(tgid as u32).to_ne_bytes());

    let addr_ptr = notif.data.args[1] as u64;
    let addrlen_ptr = notif.data.args[2] as u64;
    let pid = notif.pid;
    let id = notif.id;

    // `socklen_t` is handled as a signed int on the kernel side; read it
    // the same way so a negative length is rejected instead of being
    // mistaken for a huge capacity.
    let requested = match read_child_mem(notif_fd, id, pid, addrlen_ptr, 4) {
        Ok(b) => i32::from_ne_bytes(b.try_into().unwrap_or([0; 4])),
        Err(_) => return NotifAction::Errno(libc::EFAULT),
    };
    if requested < 0 {
        return NotifAction::Errno(libc::EINVAL);
    }

    // Truncate to the caller's buffer; with a zero-length buffer there is
    // nothing to copy, so the address pointer is not touched at all.
    let to_write = (requested as usize).min(addr.len());
    if to_write > 0
        && write_child_mem(notif_fd, id, pid, addr_ptr, &addr[..to_write]).is_err()
    {
        return NotifAction::Errno(libc::EFAULT);
    }

    // Report the size before truncation. A failure here must surface as
    // EFAULT: returning success with a stale length would mislead the
    // caller about how much of the buffer is valid.
    let actual = (addr.len() as u32).to_ne_bytes();
    if write_child_mem(notif_fd, id, pid, addrlen_ptr, &actual).is_err() {
        return NotifAction::Errno(libc::EFAULT);
    }
    NotifAction::ReturnValue(0)
}
13 changes: 13 additions & 0 deletions crates/sandlock-core/src/netlink/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
//! NETLINK_ROUTE virtualization for sandboxed processes.
//!
//! Presents a synthetic network view (one loopback interface) without
//! exposing real host netlink. See `state.rs` for the fd registry and
//! `handlers.rs` for seccomp-notify integration.

pub mod handlers;
pub mod proto;
pub mod proxy;
pub mod state;
pub mod synth;

pub use state::NetlinkState;
Loading
Loading