// veridian_kernel/virt/containers/seccomp.rs
#[cfg(feature = "alloc")]
5use alloc::vec::Vec;
6use core::sync::atomic::{AtomicU64, Ordering};
7
8use crate::error::KernelError;
9
/// Classic BPF (cBPF) opcode encodings used by seccomp filter programs.
///
/// Values match the Linux classic-BPF encoding, e.g. `LdAbsW` is
/// `BPF_LD | BPF_W | BPF_ABS` (0x20) and `Ret` is `BPF_RET | BPF_K` (0x06).
/// Only the subset interpreted by `SeccompFilter::evaluate` is listed.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u16)]
pub enum BpfOpcode {
    /// Load a 32-bit word at absolute offset `k` into the accumulator.
    LdAbsW = 0x20,
    /// Load a 16-bit halfword at absolute offset `k`.
    LdAbsH = 0x28,
    /// Load a single byte at absolute offset `k`.
    LdAbsB = 0x30,
    /// Jump `jt`/`jf` instructions depending on accumulator == `k`.
    JmpJeqK = 0x15,
    /// Jump `jt`/`jf` instructions depending on accumulator >= `k`.
    JmpJgeK = 0x35,
    /// Jump `jt`/`jf` instructions depending on (accumulator & `k`) != 0.
    JmpJsetK = 0x45,
    /// Unconditional forward jump of `k` instructions.
    JmpJa = 0x05,
    /// Terminate the program, returning `k` as the action value.
    Ret = 0x06,
    /// Accumulator &= `k`.
    AluAndK = 0x54,
}
33
/// Seccomp return-action encodings, matching the Linux `SECCOMP_RET_*`
/// constants. The action lives in the high 16 bits of a filter's return
/// value; the low 16 bits carry per-action data (e.g. the errno for
/// [`SeccompAction::Errno`]).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u32)]
pub enum SeccompAction {
    /// Permit the syscall.
    Allow = 0x7fff_0000,
    /// Kill the calling thread.
    KillThread = 0x0000_0000,
    /// Kill the entire process.
    KillProcess = 0x8000_0000,
    /// Deliver a trap (SIGSYS-style) to the caller.
    Trap = 0x0003_0000,
    /// Fail the syscall, returning the errno carried in the data bits.
    Errno = 0x0005_0000,
    /// Notify an attached tracer.
    Trace = 0x7ff0_0000,
    /// Log the syscall and allow it.
    Log = 0x7ffc_0000,
}

impl SeccompAction {
    /// Builds a raw `Errno` action value with `errno` in the low 16 data bits.
    pub fn errno(errno: u16) -> u32 {
        u32::from(errno) | Self::Errno as u32
    }
}
60
/// One classic-BPF instruction: opcode, two 8-bit relative jump offsets,
/// and a 32-bit immediate — the same field layout as the kernel's
/// `sock_filter`.
///
/// NOTE(review): not `#[repr(C)]`; add it if instances are ever copied
/// to/from a user-space `sock_filter` buffer — confirm whether that path
/// exists.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct BpfInstruction {
    // Opcode (see `BpfOpcode`).
    pub code: u16,
    // Instructions to skip (relative to the next one) when the condition holds.
    pub jt: u8,
    // Instructions to skip when the condition does not hold.
    pub jf: u8,
    // Immediate operand: load offset, comparison constant, mask, or action.
    pub k: u32,
}
73
74impl BpfInstruction {
75 pub fn load_word(offset: u32) -> Self {
77 Self {
78 code: BpfOpcode::LdAbsW as u16,
79 jt: 0,
80 jf: 0,
81 k: offset,
82 }
83 }
84
85 pub fn jump_eq(value: u32, jt: u8, jf: u8) -> Self {
87 Self {
88 code: BpfOpcode::JmpJeqK as u16,
89 jt,
90 jf,
91 k: value,
92 }
93 }
94
95 pub fn jump_ge(value: u32, jt: u8, jf: u8) -> Self {
97 Self {
98 code: BpfOpcode::JmpJgeK as u16,
99 jt,
100 jf,
101 k: value,
102 }
103 }
104
105 pub fn jump_set(mask: u32, jt: u8, jf: u8) -> Self {
107 Self {
108 code: BpfOpcode::JmpJsetK as u16,
109 jt,
110 jf,
111 k: mask,
112 }
113 }
114
115 pub fn jump(offset: u32) -> Self {
117 Self {
118 code: BpfOpcode::JmpJa as u16,
119 jt: 0,
120 jf: 0,
121 k: offset,
122 }
123 }
124
125 pub fn ret(action: u32) -> Self {
127 Self {
128 code: BpfOpcode::Ret as u16,
129 jt: 0,
130 jf: 0,
131 k: action,
132 }
133 }
134
135 pub fn alu_and(mask: u32) -> Self {
137 Self {
138 code: BpfOpcode::AluAndK as u16,
139 jt: 0,
140 jf: 0,
141 k: mask,
142 }
143 }
144}
145
/// Byte offsets of each field inside the 64-byte `seccomp_data` image that
/// BPF absolute loads index into (see `SeccompData::as_bytes`).
///
/// 64-bit fields are split into two 32-bit halves because classic BPF can
/// load at most a word. NOTE(review): the LO/HI naming assumes the
/// native-endian serialization is little-endian; on a big-endian target the
/// halves would swap — confirm supported targets.
pub mod seccomp_offsets {
    /// Syscall number.
    pub const NR: u32 = 0;
    /// `AUDIT_ARCH_*` value of the calling ABI.
    pub const ARCH: u32 = 4;
    /// Instruction pointer at syscall entry, low half.
    pub const IP_LO: u32 = 8;
    /// Instruction pointer, high half.
    pub const IP_HI: u32 = 12;
    // The six raw syscall arguments, each split into LO/HI halves.
    pub const ARG0_LO: u32 = 16;
    pub const ARG0_HI: u32 = 20;
    pub const ARG1_LO: u32 = 24;
    pub const ARG1_HI: u32 = 28;
    pub const ARG2_LO: u32 = 32;
    pub const ARG2_HI: u32 = 36;
    pub const ARG3_LO: u32 = 40;
    pub const ARG3_HI: u32 = 44;
    pub const ARG4_LO: u32 = 48;
    pub const ARG4_HI: u32 = 52;
    pub const ARG5_LO: u32 = 56;
    pub const ARG5_HI: u32 = 60;
}
169
/// `AUDIT_ARCH_*` identifiers (Linux audit UAPI values) used to pin a
/// filter to a single syscall ABI before interpreting syscall numbers.
pub mod audit_arch {
    /// x86-64.
    pub const X86_64: u32 = 0xC000_003E;
    /// 64-bit ARM.
    pub const AARCH64: u32 = 0xC000_00B7;
    /// 64-bit RISC-V.
    pub const RISCV64: u32 = 0xC000_00F3;
}
176
/// Per-task seccomp operating mode, mirroring Linux's three states.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SeccompMode {
    /// No syscall filtering.
    Disabled,
    /// Legacy strict mode: only a small fixed allow-list of syscalls.
    Strict,
    /// BPF filter mode: installed filter programs decide each syscall.
    Filter,
}
187
/// A classic-BPF seccomp program, validated at install time.
#[cfg(feature = "alloc")]
#[derive(Debug, Clone)]
pub struct SeccompFilter {
    /// The BPF program, executed by `SeccompFilter::evaluate`.
    pub instructions: Vec<BpfInstruction>,
    /// Whether forked children keep this filter (see
    /// `SeccompState::fork_inherit`).
    pub inherit_on_fork: bool,
    /// Unique id assigned from a global counter at construction.
    pub filter_id: u64,
}
199
/// Global monotonically increasing source of `SeccompFilter::filter_id`
/// values. `Relaxed` ordering is used at the increment site because ids
/// only need to be unique, not ordered across threads.
static NEXT_FILTER_ID: AtomicU64 = AtomicU64::new(1);
201
#[cfg(feature = "alloc")]
impl SeccompFilter {
    /// Creates an empty, fork-inheritable filter with a fresh unique id.
    pub fn new() -> Self {
        Self {
            instructions: Vec::new(),
            inherit_on_fork: true,
            // Relaxed suffices: ids only need uniqueness, not ordering.
            filter_id: NEXT_FILTER_ID.fetch_add(1, Ordering::Relaxed),
        }
    }

    /// Appends one BPF instruction to the end of the program.
    pub fn push(&mut self, insn: BpfInstruction) {
        self.instructions.push(insn);
    }

    /// Number of instructions in the program.
    pub fn len(&self) -> usize {
        self.instructions.len()
    }

    /// Whether the program contains no instructions.
    pub fn is_empty(&self) -> bool {
        self.instructions.is_empty()
    }

    /// Validates the program before installation.
    ///
    /// Rules enforced:
    /// * non-empty,
    /// * at most 4096 instructions (the classic-BPF program limit),
    /// * the final instruction is `RET`, so execution cannot fall off the end,
    /// * every conditional and unconditional jump lands inside the program.
    ///
    /// # Errors
    /// Returns `KernelError::InvalidArgument` naming the first violated rule.
    pub fn validate(&self) -> Result<(), KernelError> {
        let fail = |value: &'static str| {
            Err(KernelError::InvalidArgument {
                name: "seccomp filter",
                value,
            })
        };
        let len = self.instructions.len();
        if len == 0 {
            return fail("empty program");
        }
        if len > 4096 {
            return fail("exceeds 4096 instructions");
        }
        // Indexing is safe: len > 0 was checked above.
        if self.instructions[len - 1].code != BpfOpcode::Ret as u16 {
            return fail("must end with RET");
        }
        for (i, insn) in self.instructions.iter().enumerate() {
            let code = insn.code;
            let is_cond_jump = code == BpfOpcode::JmpJeqK as u16
                || code == BpfOpcode::JmpJgeK as u16
                || code == BpfOpcode::JmpJsetK as u16;
            if is_cond_jump {
                // Both targets are relative to the *next* instruction; the
                // larger of the two offsets bounds both.
                if i + 1 + insn.jt.max(insn.jf) as usize >= len {
                    return fail("jump target out of bounds");
                }
            } else if code == BpfOpcode::JmpJa as u16 {
                if i + 1 + insn.k as usize >= len {
                    return fail("jump target out of bounds");
                }
            }
        }
        Ok(())
    }

    /// Interprets the program against `data`, returning the raw action
    /// value (action field in the high 16 bits, per-action data below).
    ///
    /// The VM mirrors classic BPF: one 32-bit accumulator, absolute loads
    /// from the 64-byte `seccomp_data` image, forward jumps, AND-immediate,
    /// and `RET`. Defensive properties (independent of `validate`):
    /// * an out-of-range load leaves the accumulator unchanged,
    /// * an unknown opcode or running past the end returns `KillThread`.
    pub fn evaluate(&self, data: &SeccompData) -> u32 {
        let mut accumulator: u32 = 0;
        let mut pc: usize = 0;
        let data_bytes = data.as_bytes();

        while pc < self.instructions.len() {
            let insn = &self.instructions[pc];
            match insn.code {
                c if c == BpfOpcode::LdAbsW as u16 => {
                    let off = insn.k as usize;
                    if off + 4 <= data_bytes.len() {
                        accumulator = u32::from_ne_bytes([
                            data_bytes[off],
                            data_bytes[off + 1],
                            data_bytes[off + 2],
                            data_bytes[off + 3],
                        ]);
                    }
                    pc += 1;
                }
                c if c == BpfOpcode::LdAbsH as u16 => {
                    let off = insn.k as usize;
                    if off + 2 <= data_bytes.len() {
                        accumulator =
                            u16::from_ne_bytes([data_bytes[off], data_bytes[off + 1]]) as u32;
                    }
                    pc += 1;
                }
                c if c == BpfOpcode::LdAbsB as u16 => {
                    let off = insn.k as usize;
                    if off < data_bytes.len() {
                        accumulator = data_bytes[off] as u32;
                    }
                    pc += 1;
                }
                c if c == BpfOpcode::JmpJeqK as u16 => {
                    if accumulator == insn.k {
                        pc += 1 + insn.jt as usize;
                    } else {
                        pc += 1 + insn.jf as usize;
                    }
                }
                c if c == BpfOpcode::JmpJgeK as u16 => {
                    if accumulator >= insn.k {
                        pc += 1 + insn.jt as usize;
                    } else {
                        pc += 1 + insn.jf as usize;
                    }
                }
                c if c == BpfOpcode::JmpJsetK as u16 => {
                    if accumulator & insn.k != 0 {
                        pc += 1 + insn.jt as usize;
                    } else {
                        pc += 1 + insn.jf as usize;
                    }
                }
                c if c == BpfOpcode::JmpJa as u16 => {
                    pc += 1 + insn.k as usize;
                }
                c if c == BpfOpcode::Ret as u16 => {
                    return insn.k;
                }
                c if c == BpfOpcode::AluAndK as u16 => {
                    accumulator &= insn.k;
                    pc += 1;
                }
                _ => {
                    // Unknown opcode: fail closed.
                    return SeccompAction::KillThread as u32;
                }
            }

            // Defensive: a jump past the end (possible on unvalidated
            // programs) fails closed instead of silently looping out.
            if pc >= self.instructions.len() {
                return SeccompAction::KillThread as u32;
            }
        }

        SeccompAction::KillThread as u32
    }

    /// Builds a default-allow filter: the listed syscalls fail with
    /// `errno_val`, everything else is permitted. A call made under a
    /// different architecture kills the process so foreign-ABI syscall
    /// numbers are never misinterpreted.
    pub fn deny_syscalls(arch: u32, denied: &[u32], errno_val: u16) -> Self {
        let mut filter = Self::new();

        // Architecture guard: skip the kill only when `arch` matches.
        filter.push(BpfInstruction::load_word(seccomp_offsets::ARCH));
        filter.push(BpfInstruction::jump_eq(arch, 1, 0));
        filter.push(BpfInstruction::ret(SeccompAction::KillProcess as u32));

        filter.push(BpfInstruction::load_word(seccomp_offsets::NR));

        // One (JEQ, RET) pair per denied syscall. Jump offsets never exceed
        // 1, so the encoding stays correct for deny lists of any length; the
        // previous single-landing-pad scheme computed the distance with
        // `remaining as u8`, which silently truncates beyond 255 entries and
        // produced wrong jump targets (denied syscalls could slip through).
        for &nr in denied {
            filter.push(BpfInstruction::jump_eq(nr, 0, 1));
            filter.push(BpfInstruction::ret(SeccompAction::errno(errno_val)));
        }

        // No match: allow.
        filter.push(BpfInstruction::ret(SeccompAction::Allow as u32));

        filter
    }

    /// Builds a default-deny filter: the listed syscalls are allowed,
    /// everything else — including any foreign-architecture call — kills
    /// the process.
    pub fn allow_syscalls(arch: u32, allowed: &[u32]) -> Self {
        let mut filter = Self::new();

        // Architecture guard: skip the kill only when `arch` matches.
        filter.push(BpfInstruction::load_word(seccomp_offsets::ARCH));
        filter.push(BpfInstruction::jump_eq(arch, 1, 0));
        filter.push(BpfInstruction::ret(SeccompAction::KillProcess as u32));

        filter.push(BpfInstruction::load_word(seccomp_offsets::NR));

        // One (JEQ, RET) pair per allowed syscall; see `deny_syscalls` above
        // for why fixed offsets replace computed jump distances.
        for &nr in allowed {
            filter.push(BpfInstruction::jump_eq(nr, 0, 1));
            filter.push(BpfInstruction::ret(SeccompAction::Allow as u32));
        }

        // No match: fail closed.
        filter.push(BpfInstruction::ret(SeccompAction::KillProcess as u32));

        filter
    }
}
429
#[cfg(feature = "alloc")]
impl Default for SeccompFilter {
    /// Equivalent to `SeccompFilter::new`: empty program, inheritable,
    /// fresh unique id.
    fn default() -> Self {
        Self::new()
    }
}
436
/// Mirror of the kernel's `seccomp_data` structure: the read-only record a
/// seccomp BPF program inspects for each syscall.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(C)]
pub struct SeccompData {
    /// Syscall number.
    pub nr: u32,
    /// `AUDIT_ARCH_*` value of the calling ABI.
    pub arch: u32,
    /// Userspace instruction pointer at syscall entry.
    pub instruction_pointer: u64,
    /// The six raw syscall arguments.
    pub args: [u64; 6],
}

impl SeccompData {
    /// Builds a record for syscall `nr` under `arch`; the instruction
    /// pointer is left at zero.
    pub fn new(nr: u32, arch: u32, args: [u64; 6]) -> Self {
        Self {
            nr,
            arch,
            instruction_pointer: 0,
            args,
        }
    }

    /// Serializes the record into the 64-byte native-endian image that BPF
    /// absolute loads index into (field offsets in `seccomp_offsets`).
    pub fn as_bytes(&self) -> [u8; 64] {
        let mut out = [0u8; 64];
        out[..4].copy_from_slice(&self.nr.to_ne_bytes());
        out[4..8].copy_from_slice(&self.arch.to_ne_bytes());
        out[8..16].copy_from_slice(&self.instruction_pointer.to_ne_bytes());
        // Arguments occupy eight bytes each, starting at offset 16.
        for (slot, arg) in out[16..].chunks_exact_mut(8).zip(&self.args) {
            slot.copy_from_slice(&arg.to_ne_bytes());
        }
        out
    }
}
478
/// Per-task seccomp state: the active mode plus all installed filters
/// (every filter runs on every syscall while in filter mode).
#[cfg(feature = "alloc")]
#[derive(Debug, Clone)]
pub struct SeccompState {
    /// Current operating mode.
    pub mode: SeccompMode,
    /// Installed filters, in installation order.
    pub filters: Vec<SeccompFilter>,
}
488
#[cfg(feature = "alloc")]
impl SeccompState {
    /// Creates a state with filtering disabled and no filters installed.
    pub fn new() -> Self {
        Self {
            mode: SeccompMode::Disabled,
            filters: Vec::new(),
        }
    }

    /// Validates and installs `filter`, switching the task to filter mode.
    ///
    /// # Errors
    /// Propagates the filter's validation error; the state is unchanged on
    /// failure.
    pub fn install_filter(&mut self, filter: SeccompFilter) -> Result<(), KernelError> {
        filter.validate()?;
        self.mode = SeccompMode::Filter;
        self.filters.push(filter);
        Ok(())
    }

    /// Precedence rank of a raw action value: lower rank = stronger action.
    ///
    /// Linux orders actions KillProcess > KillThread > Trap > Errno >
    /// Trace > Log > Allow. The numeric action field already follows that
    /// order *except* `KillProcess` (0x8000_0000), which has the largest
    /// encoding yet the highest precedence, so it is special-cased to
    /// rank 0.
    fn action_rank(action: u32) -> u64 {
        // SECCOMP_RET_ACTION_FULL: strip the 16 data bits before comparing,
        // so e.g. two Errno results with different errnos rank equally.
        const ACTION_MASK: u32 = 0xffff_0000;
        let field = action & ACTION_MASK;
        if field == SeccompAction::KillProcess as u32 {
            0
        } else {
            u64::from(field) + 1
        }
    }

    /// Decides the action for one syscall under the current mode.
    pub fn evaluate(&self, data: &SeccompData) -> u32 {
        match self.mode {
            SeccompMode::Disabled => SeccompAction::Allow as u32,
            SeccompMode::Strict => {
                // Strict mode permits only read (0), write (1),
                // rt_sigreturn (15) and exit (60).
                // NOTE(review): these are x86_64 numbers — confirm for
                // other supported architectures.
                match data.nr {
                    0 | 1 | 15 | 60 => SeccompAction::Allow as u32,
                    _ => SeccompAction::KillThread as u32,
                }
            }
            SeccompMode::Filter => {
                // Every installed filter runs; the highest-precedence
                // action wins. A plain numeric `<` on the raw value was
                // wrong here: KillProcess has the largest encoding but the
                // highest precedence, so it could never win the comparison.
                let mut result = SeccompAction::Allow as u32;
                for filter in &self.filters {
                    let action = filter.evaluate(data);
                    if Self::action_rank(action) < Self::action_rank(result) {
                        result = action;
                    }
                }
                result
            }
        }
    }

    /// Builds the seccomp state a forked child starts with: same mode,
    /// keeping only filters flagged `inherit_on_fork`.
    ///
    /// NOTE(review): Linux itself always inherits every filter across
    /// fork; the opt-out flag is a local extension — confirm intended
    /// semantics.
    pub fn fork_inherit(&self) -> Self {
        Self {
            mode: self.mode,
            filters: self
                .filters
                .iter()
                .filter(|f| f.inherit_on_fork)
                .cloned()
                .collect(),
        }
    }

    /// Number of installed filters.
    pub fn filter_count(&self) -> usize {
        self.filters.len()
    }
}
551
#[cfg(feature = "alloc")]
impl Default for SeccompState {
    /// Equivalent to `SeccompState::new`: disabled mode, no filters.
    fn default() -> Self {
        Self::new()
    }
}