⚠️ VeridianOS Kernel Documentation - This is low-level kernel code running in a `no_std` environment. Functions are safe Rust unless explicitly marked `unsafe`.

veridian_kernel/virt/containers/
seccomp.rs

1//! Seccomp BPF - filter instructions, syscall filtering, arg inspection,
2//! inheritance.
3
4#[cfg(feature = "alloc")]
5use alloc::vec::Vec;
6use core::sync::atomic::{AtomicU64, Ordering};
7
8use crate::error::KernelError;
9
/// BPF instruction opcodes for seccomp filters.
///
/// Discriminants are classic-BPF opcode encodings (instruction class |
/// size/mode | source), e.g. `LdAbsW` = BPF_LD | BPF_ABS | BPF_W = 0x20.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u16)]
pub enum BpfOpcode {
    /// Load the 32-bit word at an absolute offset into the accumulator.
    LdAbsW = 0x20,
    /// Load the 16-bit half-word at an absolute offset into the accumulator.
    LdAbsH = 0x28,
    /// Load the byte at an absolute offset into the accumulator.
    LdAbsB = 0x30,
    /// Jump if the accumulator equals the immediate `k`.
    JmpJeqK = 0x15,
    /// Jump if the accumulator is greater or equal to the immediate `k`
    /// (unsigned comparison).
    JmpJgeK = 0x35,
    /// Jump if `accumulator & k` is non-zero (bitwise AND test, immediate).
    JmpJsetK = 0x45,
    /// Unconditional forward jump by `k` instructions.
    JmpJa = 0x05,
    /// Return `k` as the filter's action and terminate the program.
    Ret = 0x06,
    /// ALU AND: `accumulator &= k` (immediate).
    AluAndK = 0x54,
}
33
/// Seccomp return action values.
///
/// Discriminants occupy the high 16 bits of a filter's return value;
/// the low 16 bits carry per-action data (e.g. the errno for `Errno`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u32)]
pub enum SeccompAction {
    /// Allow the syscall.
    Allow = 0x7fff_0000,
    /// Kill the calling thread.
    KillThread = 0x0000_0000,
    /// Kill the whole process.
    KillProcess = 0x8000_0000,
    /// Trigger a SIGSYS and deliver a signal.
    Trap = 0x0003_0000,
    /// Return an errno value (carried in the low 16 bits).
    Errno = 0x0005_0000,
    /// Notify a tracing process.
    Trace = 0x7ff0_0000,
    /// Log the syscall and allow it.
    Log = 0x7ffc_0000,
}

impl SeccompAction {
    /// Build a full `Errno` return value carrying `errno` in the low
    /// 16 bits (the action's data field).
    pub fn errno(errno: u16) -> u32 {
        let data = u32::from(errno);
        (Self::Errno as u32) | data
    }
}
60
/// A single BPF instruction, mirroring the classic `struct sock_filter`
/// layout: opcode, two 8-bit branch offsets, and a 32-bit immediate.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct BpfInstruction {
    /// Opcode (a `BpfOpcode` discriminant).
    pub code: u16,
    /// Jump offset (relative to the next instruction) taken when the
    /// condition is true.
    pub jt: u8,
    /// Jump offset taken when the condition is false.
    pub jf: u8,
    /// Immediate value: load offset, comparison operand, mask, jump
    /// distance, or return action depending on `code`.
    pub k: u32,
}
73
74impl BpfInstruction {
75    /// Create a load-word instruction at the given offset.
76    pub fn load_word(offset: u32) -> Self {
77        Self {
78            code: BpfOpcode::LdAbsW as u16,
79            jt: 0,
80            jf: 0,
81            k: offset,
82        }
83    }
84
85    /// Create a jump-if-equal instruction.
86    pub fn jump_eq(value: u32, jt: u8, jf: u8) -> Self {
87        Self {
88            code: BpfOpcode::JmpJeqK as u16,
89            jt,
90            jf,
91            k: value,
92        }
93    }
94
95    /// Create a jump-if-greater-or-equal instruction.
96    pub fn jump_ge(value: u32, jt: u8, jf: u8) -> Self {
97        Self {
98            code: BpfOpcode::JmpJgeK as u16,
99            jt,
100            jf,
101            k: value,
102        }
103    }
104
105    /// Create a bitwise AND test (jump if set) instruction.
106    pub fn jump_set(mask: u32, jt: u8, jf: u8) -> Self {
107        Self {
108            code: BpfOpcode::JmpJsetK as u16,
109            jt,
110            jf,
111            k: mask,
112        }
113    }
114
115    /// Create an unconditional jump.
116    pub fn jump(offset: u32) -> Self {
117        Self {
118            code: BpfOpcode::JmpJa as u16,
119            jt: 0,
120            jf: 0,
121            k: offset,
122        }
123    }
124
125    /// Create a return instruction.
126    pub fn ret(action: u32) -> Self {
127        Self {
128            code: BpfOpcode::Ret as u16,
129            jt: 0,
130            jf: 0,
131            k: action,
132        }
133    }
134
135    /// Create an ALU AND instruction.
136    pub fn alu_and(mask: u32) -> Self {
137        Self {
138            code: BpfOpcode::AluAndK as u16,
139            jt: 0,
140            jf: 0,
141            k: mask,
142        }
143    }
144}
145
/// Seccomp data offsets (for x86_64 struct seccomp_data layout).
///
/// Byte offsets into the flat buffer produced by `SeccompData::as_bytes`.
/// 64-bit fields are split into two 32-bit halves because classic BPF
/// loads are at most 32 bits wide.
///
/// NOTE(review): the `_LO`/`_HI` names assume the low half sits at the
/// lower address; `as_bytes` serializes with native endianness, so on a
/// big-endian target the halves would be swapped — confirm whether
/// big-endian targets are supported.
pub mod seccomp_offsets {
    /// Offset of syscall number (nr field).
    pub const NR: u32 = 0;
    /// Offset of architecture (arch field).
    pub const ARCH: u32 = 4;
    /// Offset of instruction pointer (instruction_pointer field).
    pub const IP_LO: u32 = 8;
    /// High half of the instruction pointer.
    pub const IP_HI: u32 = 12;
    /// Offset of syscall arguments (args[0..5]).
    pub const ARG0_LO: u32 = 16;
    /// High half of argument 0.
    pub const ARG0_HI: u32 = 20;
    /// Low half of argument 1.
    pub const ARG1_LO: u32 = 24;
    /// High half of argument 1.
    pub const ARG1_HI: u32 = 28;
    /// Low half of argument 2.
    pub const ARG2_LO: u32 = 32;
    /// High half of argument 2.
    pub const ARG2_HI: u32 = 36;
    /// Low half of argument 3.
    pub const ARG3_LO: u32 = 40;
    /// High half of argument 3.
    pub const ARG3_HI: u32 = 44;
    /// Low half of argument 4.
    pub const ARG4_LO: u32 = 48;
    /// High half of argument 4.
    pub const ARG4_HI: u32 = 52;
    /// Low half of argument 5.
    pub const ARG5_LO: u32 = 56;
    /// High half of argument 5.
    pub const ARG5_HI: u32 = 60;
}
169
/// Audit architecture values.
///
/// These match Linux's `AUDIT_ARCH_*` constants: the ELF `e_machine`
/// value OR-ed with the 64-bit (0x8000_0000) and little-endian
/// (0x4000_0000) flag bits.
pub mod audit_arch {
    /// AUDIT_ARCH_X86_64 (EM_X86_64 = 62).
    pub const X86_64: u32 = 0xC000_003E;
    /// AUDIT_ARCH_AARCH64 (EM_AARCH64 = 183).
    pub const AARCH64: u32 = 0xC000_00B7;
    /// AUDIT_ARCH_RISCV64 (EM_RISCV = 243).
    pub const RISCV64: u32 = 0xC000_00F3;
}
176
/// Seccomp operating modes, analogous to Linux's SECCOMP_MODE_*.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SeccompMode {
    /// No filtering (disabled); every syscall is allowed.
    Disabled,
    /// Strict mode: only read, write, exit, sigreturn allowed
    /// (checked against x86_64 syscall numbers in
    /// `SeccompState::evaluate`).
    Strict,
    /// Filter mode: installed BPF programs decide per syscall.
    Filter,
}
187
/// A seccomp BPF filter program.
#[cfg(feature = "alloc")]
#[derive(Debug, Clone)]
pub struct SeccompFilter {
    /// BPF instructions, executed in order by `evaluate`.
    pub instructions: Vec<BpfInstruction>,
    /// Whether this filter should be inherited on fork
    /// (consumed by `SeccompState::fork_inherit`; defaults to `true`).
    pub inherit_on_fork: bool,
    /// Unique, monotonically assigned filter ID for tracking.
    pub filter_id: u64,
}
199
/// Monotonic source of unique `SeccompFilter::filter_id` values.
/// Gated on `alloc` because its only consumer (`SeccompFilter::new`) is;
/// without the gate it is dead code in no-alloc builds.
#[cfg(feature = "alloc")]
static NEXT_FILTER_ID: AtomicU64 = AtomicU64::new(1);
201
#[cfg(feature = "alloc")]
impl SeccompFilter {
    /// Create a new empty filter with a fresh unique ID.
    ///
    /// Filters default to being inherited across fork.
    pub fn new() -> Self {
        Self {
            instructions: Vec::new(),
            inherit_on_fork: true,
            // Relaxed suffices: we only need uniqueness, not ordering.
            filter_id: NEXT_FILTER_ID.fetch_add(1, Ordering::Relaxed),
        }
    }

    /// Append an instruction to the filter program.
    pub fn push(&mut self, insn: BpfInstruction) {
        self.instructions.push(insn);
    }

    /// Get the number of instructions.
    pub fn len(&self) -> usize {
        self.instructions.len()
    }

    /// Check if the filter is empty.
    pub fn is_empty(&self) -> bool {
        self.instructions.is_empty()
    }

    /// Validate the filter program.
    ///
    /// Rejects programs that are empty, exceed the Linux limit of 4096
    /// instructions, do not end with a RET, jump past the end of the
    /// program, or load outside the 64-byte `seccomp_data` buffer
    /// (Linux likewise rejects out-of-bounds loads at attach time).
    ///
    /// # Errors
    /// Returns `KernelError::InvalidArgument` describing the first
    /// defect found.
    pub fn validate(&self) -> Result<(), KernelError> {
        /// Size of the flat buffer produced by `SeccompData::as_bytes`.
        const SECCOMP_DATA_SIZE: usize = 64;

        if self.instructions.is_empty() {
            return Err(KernelError::InvalidArgument {
                name: "seccomp filter",
                value: "empty program",
            });
        }
        // Max 4096 instructions (Linux BPF_MAXINSNS limit)
        if self.instructions.len() > 4096 {
            return Err(KernelError::InvalidArgument {
                name: "seccomp filter",
                value: "exceeds 4096 instructions",
            });
        }
        // Last instruction must be a return, so the fall-through path
        // always terminates with an explicit action.
        if let Some(last) = self.instructions.last() {
            if last.code != BpfOpcode::Ret as u16 {
                return Err(KernelError::InvalidArgument {
                    name: "seccomp filter",
                    value: "must end with RET",
                });
            }
        }
        let len = self.instructions.len();
        for (i, insn) in self.instructions.iter().enumerate() {
            let code = insn.code;
            // Conditional jumps: both branch targets are relative to the
            // *next* instruction and must land on a valid instruction.
            if code == BpfOpcode::JmpJeqK as u16
                || code == BpfOpcode::JmpJgeK as u16
                || code == BpfOpcode::JmpJsetK as u16
            {
                let jt_target = i + 1 + insn.jt as usize;
                let jf_target = i + 1 + insn.jf as usize;
                if jt_target >= len || jf_target >= len {
                    return Err(KernelError::InvalidArgument {
                        name: "seccomp filter",
                        value: "jump target out of bounds",
                    });
                }
            }
            // Unconditional jump: `k` is caller-controlled and 32 bits
            // wide, so use checked math to avoid wrapping on 32-bit
            // targets.
            if code == BpfOpcode::JmpJa as u16 {
                let in_bounds = (i + 1)
                    .checked_add(insn.k as usize)
                    .map_or(false, |target| target < len);
                if !in_bounds {
                    return Err(KernelError::InvalidArgument {
                        name: "seccomp filter",
                        value: "jump target out of bounds",
                    });
                }
            }
            // Absolute loads must stay within struct seccomp_data;
            // previously unchecked, which let "validated" programs read
            // nothing at runtime and evaluate with a stale accumulator.
            let load_width = match code {
                c if c == BpfOpcode::LdAbsW as u16 => Some(4usize),
                c if c == BpfOpcode::LdAbsH as u16 => Some(2),
                c if c == BpfOpcode::LdAbsB as u16 => Some(1),
                _ => None,
            };
            if let Some(width) = load_width {
                let in_bounds = (insn.k as usize)
                    .checked_add(width)
                    .map_or(false, |end| end <= SECCOMP_DATA_SIZE);
                if !in_bounds {
                    return Err(KernelError::InvalidArgument {
                        name: "seccomp filter",
                        value: "load offset out of bounds",
                    });
                }
            }
        }
        Ok(())
    }

    /// Execute the filter against a seccomp_data structure.
    /// Returns the action (SeccompAction value | errno data).
    ///
    /// Termination is guaranteed: every instruction advances `pc` by at
    /// least one (classic BPF jumps are forward-only). Malformed
    /// programs — which `validate` would have rejected — fail closed:
    /// unknown opcodes, out-of-bounds loads, and running off the end of
    /// the program all return `KillThread`.
    pub fn evaluate(&self, data: &SeccompData) -> u32 {
        let mut accumulator: u32 = 0;
        let mut pc: usize = 0;
        let data_bytes = data.as_bytes();

        while pc < self.instructions.len() {
            let insn = &self.instructions[pc];
            match insn.code {
                c if c == BpfOpcode::LdAbsW as u16 => {
                    let off = insn.k as usize;
                    // Checked add + `get` reject out-of-range loads
                    // instead of silently keeping a stale accumulator.
                    match off.checked_add(4).and_then(|end| data_bytes.get(off..end)) {
                        Some(b) => accumulator = u32::from_ne_bytes([b[0], b[1], b[2], b[3]]),
                        None => return SeccompAction::KillThread as u32,
                    }
                    pc += 1;
                }
                c if c == BpfOpcode::LdAbsH as u16 => {
                    let off = insn.k as usize;
                    match off.checked_add(2).and_then(|end| data_bytes.get(off..end)) {
                        Some(b) => accumulator = u16::from_ne_bytes([b[0], b[1]]) as u32,
                        None => return SeccompAction::KillThread as u32,
                    }
                    pc += 1;
                }
                c if c == BpfOpcode::LdAbsB as u16 => {
                    match data_bytes.get(insn.k as usize) {
                        Some(&b) => accumulator = b as u32,
                        None => return SeccompAction::KillThread as u32,
                    }
                    pc += 1;
                }
                c if c == BpfOpcode::JmpJeqK as u16 => {
                    if accumulator == insn.k {
                        pc += 1 + insn.jt as usize;
                    } else {
                        pc += 1 + insn.jf as usize;
                    }
                }
                c if c == BpfOpcode::JmpJgeK as u16 => {
                    if accumulator >= insn.k {
                        pc += 1 + insn.jt as usize;
                    } else {
                        pc += 1 + insn.jf as usize;
                    }
                }
                c if c == BpfOpcode::JmpJsetK as u16 => {
                    if accumulator & insn.k != 0 {
                        pc += 1 + insn.jt as usize;
                    } else {
                        pc += 1 + insn.jf as usize;
                    }
                }
                c if c == BpfOpcode::JmpJa as u16 => {
                    // Saturating: an absurd `k` lands past the end and is
                    // caught by the loop condition (fail closed) instead
                    // of wrapping on 32-bit targets.
                    pc = pc.saturating_add(insn.k as usize).saturating_add(1);
                }
                c if c == BpfOpcode::Ret as u16 => {
                    return insn.k;
                }
                c if c == BpfOpcode::AluAndK as u16 => {
                    accumulator &= insn.k;
                    pc += 1;
                }
                _ => {
                    // Unknown opcode: kill
                    return SeccompAction::KillThread as u32;
                }
            }
        }

        // Ran off the end of the program (jump past the end or missing
        // final RET): fail closed.
        SeccompAction::KillThread as u32
    }

    /// Build a filter that checks architecture and denies a set of
    /// syscall numbers with `errno_val`; every other syscall is allowed.
    ///
    /// `denied` must contain at most 255 entries: BPF branch offsets are
    /// 8 bits, so the first check must be able to reach the final errno
    /// return in one jump. Larger inputs would previously produce a
    /// silently wrong program via `saturating_add`.
    pub fn deny_syscalls(arch: u32, denied: &[u32], errno_val: u16) -> Self {
        debug_assert!(
            denied.len() <= 255,
            "too many denied syscalls for 8-bit jump offsets"
        );

        let mut filter = Self::new();
        let num_denied = denied.len();

        // Pin the architecture first: syscall numbers are meaningless
        // without it. If arch doesn't match, kill the process.
        filter.push(BpfInstruction::load_word(seccomp_offsets::ARCH));
        filter.push(BpfInstruction::jump_eq(arch, 1, 0));
        filter.push(BpfInstruction::ret(SeccompAction::KillProcess as u32));

        // Load syscall number
        filter.push(BpfInstruction::load_word(seccomp_offsets::NR));

        // For each denied syscall, check and return errno
        for (i, &nr) in denied.iter().enumerate() {
            let remaining = num_denied - i - 1;
            // On a match, skip the remaining deny checks *and* the ALLOW
            // return to land on the errno return.
            let jt = (remaining as u8).saturating_add(1);
            filter.push(BpfInstruction::jump_eq(nr, jt, 0));
        }

        // Default: allow
        filter.push(BpfInstruction::ret(SeccompAction::Allow as u32));

        // Errno return
        filter.push(BpfInstruction::ret(SeccompAction::errno(errno_val)));

        filter
    }

    /// Build a filter that only allows a whitelist of syscalls; every
    /// other syscall kills the process.
    ///
    /// `allowed` must contain at most 255 entries (8-bit jump offsets,
    /// same constraint as `deny_syscalls`).
    pub fn allow_syscalls(arch: u32, allowed: &[u32]) -> Self {
        debug_assert!(
            allowed.len() <= 255,
            "too many allowed syscalls for 8-bit jump offsets"
        );

        let mut filter = Self::new();
        let num_allowed = allowed.len();

        // Architecture pinning, as in `deny_syscalls`.
        filter.push(BpfInstruction::load_word(seccomp_offsets::ARCH));
        filter.push(BpfInstruction::jump_eq(arch, 1, 0));
        filter.push(BpfInstruction::ret(SeccompAction::KillProcess as u32));

        // Load syscall number
        filter.push(BpfInstruction::load_word(seccomp_offsets::NR));

        // For each allowed syscall, jump over the remaining checks and
        // the kill return to reach the ALLOW return.
        for (i, &nr) in allowed.iter().enumerate() {
            let remaining = num_allowed - i - 1;
            let jt = (remaining as u8).saturating_add(1);
            filter.push(BpfInstruction::jump_eq(nr, jt, 0));
        }

        // Default: kill
        filter.push(BpfInstruction::ret(SeccompAction::KillProcess as u32));

        // Allow return
        filter.push(BpfInstruction::ret(SeccompAction::Allow as u32));

        filter
    }
}
429
#[cfg(feature = "alloc")]
impl Default for SeccompFilter {
    /// Equivalent to `SeccompFilter::new`: empty program, inheritable,
    /// fresh unique ID.
    fn default() -> Self {
        Self::new()
    }
}
436
/// Seccomp data structure matching the kernel's struct seccomp_data.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(C)]
pub struct SeccompData {
    /// Syscall number.
    pub nr: u32,
    /// Architecture (AUDIT_ARCH_*).
    pub arch: u32,
    /// Instruction pointer at the time of the syscall.
    pub instruction_pointer: u64,
    /// Syscall arguments (up to 6).
    pub args: [u64; 6],
}

impl SeccompData {
    /// Build a `SeccompData` for syscall `nr` on architecture `arch`
    /// with the given argument registers; the instruction pointer is
    /// zeroed.
    pub fn new(nr: u32, arch: u32, args: [u64; 6]) -> Self {
        Self {
            instruction_pointer: 0,
            nr,
            arch,
            args,
        }
    }

    /// Serialize into the flat 64-byte layout that BPF loads index into
    /// (native endianness; offsets match `seccomp_offsets`).
    pub fn as_bytes(&self) -> [u8; 64] {
        let mut buf = [0u8; 64];
        // Fixed header: nr (0..4), arch (4..8), instruction pointer (8..16).
        buf[0..4].copy_from_slice(&self.nr.to_ne_bytes());
        buf[4..8].copy_from_slice(&self.arch.to_ne_bytes());
        buf[8..16].copy_from_slice(&self.instruction_pointer.to_ne_bytes());
        // Six 8-byte argument slots starting at offset 16.
        let arg_area = &mut buf[16..64];
        for (slot, arg) in arg_area.chunks_exact_mut(8).zip(self.args.iter()) {
            slot.copy_from_slice(&arg.to_ne_bytes());
        }
        buf
    }
}
478
/// Per-process seccomp state.
#[cfg(feature = "alloc")]
#[derive(Debug, Clone)]
pub struct SeccompState {
    /// Current operating mode.
    pub mode: SeccompMode,
    /// Stack of filters (all evaluated, most restrictive wins),
    /// pushed in installation order.
    pub filters: Vec<SeccompFilter>,
}
488
#[cfg(feature = "alloc")]
impl SeccompState {
    /// Create a fresh state: filtering disabled, no filters installed.
    pub fn new() -> Self {
        Self {
            mode: SeccompMode::Disabled,
            filters: Vec::new(),
        }
    }

    /// Install a new filter. Mode transitions to Filter.
    ///
    /// # Errors
    /// Propagates validation errors; on failure the state is unchanged.
    pub fn install_filter(&mut self, filter: SeccompFilter) -> Result<(), KernelError> {
        filter.validate()?;
        self.mode = SeccompMode::Filter;
        self.filters.push(filter);
        Ok(())
    }

    /// Evaluate all filters against the given syscall data and return
    /// the most restrictive action.
    ///
    /// Restrictiveness follows Linux's ACTION_ONLY rule: the action part
    /// (high 16 bits) is compared as a *signed* 32-bit value, so
    /// `KillProcess` (0x8000_0000, negative as i32) outranks everything,
    /// and among the rest a numerically lower action is more
    /// restrictive. A plain unsigned "lowest value wins" comparison —
    /// the previous behavior — wrongly let any other action, including
    /// `Allow`, override `KillProcess`.
    pub fn evaluate(&self, data: &SeccompData) -> u32 {
        // High 16 bits carry the action; low 16 bits are data (errno etc.).
        const ACTION_FULL_MASK: u32 = 0xffff_0000;

        match self.mode {
            SeccompMode::Disabled => SeccompAction::Allow as u32,
            SeccompMode::Strict => {
                // Only allow read(0), write(1), rt_sigreturn(15), exit(60)
                // (x86_64 syscall numbers).
                match data.nr {
                    0 | 1 | 15 | 60 => SeccompAction::Allow as u32,
                    _ => SeccompAction::KillThread as u32,
                }
            }
            SeccompMode::Filter => {
                let mut result = SeccompAction::Allow as u32;
                // Newest filter first with a strict comparison, so on a
                // precedence tie the most recently installed filter's
                // value (e.g. its errno data) wins, as on Linux.
                for filter in self.filters.iter().rev() {
                    let action = filter.evaluate(data);
                    // Signed comparison of the action part: KillProcess
                    // is the only action with the top bit set, so it
                    // compares below (more restrictive than) all others.
                    if ((action & ACTION_FULL_MASK) as i32)
                        < ((result & ACTION_FULL_MASK) as i32)
                    {
                        result = action;
                    }
                }
                result
            }
        }
    }

    /// Create a copy for a forked process, keeping only filters marked
    /// `inherit_on_fork`.
    ///
    /// NOTE(review): the mode is preserved even if no filters survive;
    /// a `Filter`-mode state with zero filters allows everything —
    /// confirm this is the intended fork semantics.
    pub fn fork_inherit(&self) -> Self {
        Self {
            mode: self.mode,
            filters: self
                .filters
                .iter()
                .filter(|f| f.inherit_on_fork)
                .cloned()
                .collect(),
        }
    }

    /// Number of installed filters.
    pub fn filter_count(&self) -> usize {
        self.filters.len()
    }
}
551
#[cfg(feature = "alloc")]
impl Default for SeccompState {
    /// Equivalent to `SeccompState::new`: disabled mode, no filters.
    fn default() -> Self {
        Self::new()
    }
}