⚠️ VeridianOS Kernel Documentation - This is low-level `no_std` kernel code. All functions are unsafe unless explicitly marked otherwise.

veridian_kernel/arch/x86_64/
syscall.rs

1//! x86_64 system call entry point and SYSCALL/SYSRET MSR configuration.
2//!
3//! This module configures the CPU's SYSCALL/SYSRET mechanism for user-kernel
4//! transitions. The key components are:
5//! - `syscall_entry`: naked assembly handler invoked by the SYSCALL instruction
6//! - `PerCpuData`: per-CPU storage for kernel/user RSP, accessed via GS segment
7//! - `init_syscall`: MSR configuration (EFER, STAR, LSTAR, SFMASK,
8//!   KernelGsBase)
9
10#![allow(function_casts_as_integer)]
11
12use core::{
13    cell::UnsafeCell,
14    sync::atomic::{AtomicU64, Ordering},
15};
16
17use crate::syscall::syscall_handler;
18
/// Saved user register frame from SYSCALL entry.
///
/// Mirrors, field for field, the registers that `syscall_entry` pushes onto
/// the kernel stack. The stack grows downwards, so the register pushed *last*
/// (`r9`) ends up at the lowest address and is the first field, while the
/// register pushed *first* (`rcx`) is the last field. After all pushes, RSP
/// points at this layout.
///
/// `fork_process()` uses this frame to capture the live register state of the
/// parent at the moment of the fork() syscall, so the child gets a copy of the
/// parent's actual CPU registers rather than the stale ThreadContext from
/// process creation time.
///
/// Two pieces of user state are intentionally *not* part of the frame:
/// - `rax` is never pushed; it carries the syscall number in and the return
///   value out.
/// - The user RSP is stored in `PerCpuData::user_rsp`, not on the stack.
#[repr(C)]
#[derive(Debug, Clone, Copy)]
pub struct SyscallFrame {
    /// arg6 (pushed last, lowest address)
    pub r9: u64,
    /// arg5
    pub r8: u64,
    /// arg4
    pub r10: u64,
    /// arg3
    pub rdx: u64,
    /// arg2
    pub rsi: u64,
    /// arg1
    pub rdi: u64,
    pub r15: u64,
    pub r14: u64,
    pub r13: u64,
    pub r12: u64,
    pub rbx: u64,
    pub rbp: u64,
    /// User RFLAGS (SYSCALL stores RFLAGS in r11)
    pub r11: u64,
    /// User RIP (SYSCALL stores the return address in rcx; pushed first)
    pub rcx: u64,
}

// `syscall_entry` pushes exactly 14 eight-byte registers and then publishes
// RSP as a `*const SyscallFrame`. Fail the build if the struct ever stops
// matching that size, so a field added or removed here without touching the
// assembly cannot silently corrupt fork() register capture.
const _: () = {
    assert!(
        core::mem::size_of::<SyscallFrame>() == 14 * 8,
        "SyscallFrame must be exactly the 14 registers pushed by syscall_entry"
    );
    assert!(
        core::mem::offset_of!(SyscallFrame, r9) == 0,
        "r9 is pushed last and must be the first field"
    );
    assert!(
        core::mem::offset_of!(SyscallFrame, rcx) == 13 * 8,
        "rcx is pushed first and must be the last field"
    );
};
45
/// Address of the `SyscallFrame` belonging to the syscall being handled.
///
/// `syscall_entry` stores the kernel RSP here right after pushing all user
/// registers, and writes 0 back as soon as the handler returns. The value is
/// therefore non-zero only while a syscall handler is executing; 0 means
/// "not in syscall context".
///
/// This is one global slot, not per-CPU storage.
/// NOTE(review): presumably this relies on the kernel running on a single CPU
/// -- confirm before bringing up additional cores.
static SYSCALL_FRAME_PTR: AtomicU64 = AtomicU64::new(0);
50
51/// Get a reference to the saved syscall register frame.
52///
53/// Only valid during syscall handler execution. Returns `None` if called
54/// outside of a syscall context.
55///
56/// # Safety
57/// The returned reference points to the kernel stack. It is valid only while
58/// the syscall handler is executing (before registers are popped on return).
59pub fn get_syscall_frame() -> Option<&'static SyscallFrame> {
60    let ptr = SYSCALL_FRAME_PTR.load(Ordering::Acquire);
61    if ptr == 0 {
62        return None;
63    }
64    // SAFETY: SYSCALL_FRAME_PTR is set by syscall_entry to point to the
65    // kernel stack after all registers are pushed. The pointer is valid
66    // for the duration of the syscall handler. The SyscallFrame layout
67    // matches the exact push order in the assembly.
68    Some(unsafe { &*(ptr as *const SyscallFrame) })
69}
70
71/// Get the user RSP saved by syscall_entry into per-CPU data.
72///
73/// Only valid during syscall handler execution.
74pub fn get_saved_user_rsp() -> u64 {
75    // SAFETY: PER_CPU_AREA.user_rsp is set by syscall_entry (mov gs:[0x8], rsp)
76    // before switching to the kernel stack. It is valid during syscall handling.
77    unsafe { (*PER_CPU_AREA.0.get()).user_rsp }
78}
79
/// Per-CPU data accessed via GS segment register during syscall entry/exit.
///
/// The `syscall_entry` naked asm reads `kernel_rsp` from `gs:[0x0]` and saves
/// `user_rsp` to `gs:[0x8]`. Those offsets are hard-coded in the assembly, so
/// this struct must be `#[repr(C)]` and its fields must stay in this order;
/// the assertions below turn any drift into a build failure.
#[repr(C)]
pub struct PerCpuData {
    /// Kernel stack pointer (offset 0x0) -- loaded into RSP on syscall entry
    pub kernel_rsp: u64,
    /// User stack pointer (offset 0x8) -- saved from RSP on syscall entry
    pub user_rsp: u64,
}

// Keep the Rust layout and the `gs:[...]` displacements in syscall_entry in
// lock-step: reordering or inserting fields ahead of these two would make the
// assembly load a bogus kernel stack pointer.
const _: () = {
    assert!(
        core::mem::offset_of!(PerCpuData, kernel_rsp) == 0x0,
        "syscall_entry loads kernel_rsp from gs:[0x0]"
    );
    assert!(
        core::mem::offset_of!(PerCpuData, user_rsp) == 0x8,
        "syscall_entry saves user_rsp to gs:[0x8]"
    );
};
92
/// Interior-mutable wrapper that allows a `PerCpuData` to live in a `static`.
///
/// `UnsafeCell` is what permits writes through a shared `static`; the manual
/// `Sync` impl below is what permits the `static` itself. `#[repr(transparent)]`
/// keeps the wrapper's layout identical to the wrapped cell, so the field
/// offsets the assembly uses against GS apply to the static directly.
#[repr(transparent)]
pub(super) struct PerCpuDataCell(UnsafeCell<PerCpuData>);

// SAFETY: Per-CPU data is only accessed via GS register from the current CPU
// during syscall entry/exit. On a single-CPU system (our current QEMU config),
// there are no concurrent accesses. The naked asm in syscall_entry uses
// `mov gs:[offset]` which does not go through Rust's aliasing rules.
// NOTE(review): this argument depends on the single-CPU configuration; it must
// be revisited if more than one core ever shares this static.
unsafe impl Sync for PerCpuDataCell {}
101
102pub(super) static PER_CPU_AREA: PerCpuDataCell = PerCpuDataCell(UnsafeCell::new(PerCpuData {
103    kernel_rsp: 0,
104    user_rsp: 0,
105}));
106
107// CR3 switching removed: Process page tables now contain complete kernel
108// mapping (L4 entries 256-511 copied from boot tables), so syscalls run
109// with user CR3 active. This eliminates the GP fault on CR3 restore that
110// occurred when switching back to incompatible user page tables.
111
112/// Get a mutable pointer to the per-CPU data.
113///
114/// Used to update `kernel_rsp` on context switch and to set up KernelGsBase
115/// during init. The returned pointer is valid for the lifetime of the kernel.
116pub fn per_cpu_data_ptr() -> *mut PerCpuData {
117    PER_CPU_AREA.0.get()
118}
119
/// x86_64 SYSCALL instruction entry point
///
/// This function handles the transition from user mode to kernel mode
/// when a SYSCALL instruction is executed. In order, it:
/// 1. swaps in the kernel GS base and switches from the user stack to the
///    kernel stack stored in the per-CPU area,
/// 2. pushes the user registers in `SyscallFrame` order and publishes the
///    frame address in `SYSCALL_FRAME_PTR`,
/// 3. shuffles the syscall number and arguments into the C calling convention
///    and calls `syscall_handler`,
/// 4. clears the frame pointer, restores the user registers and user stack,
///    swaps GS back and returns with `sysretq`.
///
/// `rax` is not saved or restored: it carries the syscall number in and the
/// handler's return value out.
///
/// # Safety
/// This function must only be called by the CPU's SYSCALL instruction.
/// It expects specific register states as defined by the x86_64 ABI, and it
/// expects KernelGsBase to point at a `PerCpuData` whose `kernel_rsp` is a
/// usable kernel stack (see `init_syscall`).
#[no_mangle]
#[unsafe(naked)]
pub unsafe extern "C" fn syscall_entry() {
    core::arch::naked_asm!(
        // Switch to kernel GS and kernel stack. Until the second `mov`
        // completes we are still running on the *user* stack pointer, so
        // nothing may be pushed before this point.
        "swapgs",                    // Switch to kernel GS
        "mov gs:[0x8], rsp",        // Save user RSP in per-CPU data (offset 0x8)
        "mov rsp, gs:[0x0]",        // Load kernel RSP from per-CPU data (offset 0x0)

        // CR3 switching removed: Process page tables contain complete kernel
        // mapping, so we can access kernel data structures directly without
        // switching to boot page tables.

        // Save all user registers.
        // rcx and r11 are clobbered by SYSCALL (RIP / RFLAGS), saved first.
        // Callee-saved: rbp, rbx, r12-r15. Caller-saved / args: rdi, rsi,
        // rdx, r10, r8, r9. All must be preserved so the user sees correct
        // values after SYSRET (except rax which holds the return value).
        //
        // The push order is the reverse of the `SyscallFrame` field order;
        // the two must be changed together.
        //
        // 14 pushes = 112 bytes, a multiple of 16, so RSP at the `call` below
        // has the same 16-byte alignment as `kernel_rsp`.
        // NOTE(review): assumes kernel_rsp is stored 16-byte aligned, as the
        // C ABI requires at a call site -- confirm where kernel_rsp is set.
        "push rcx",                  // User RIP
        "push r11",                  // User RFLAGS
        "push rbp",
        "push rbx",
        "push r12",
        "push r13",
        "push r14",
        "push r15",
        "push rdi",                  // arg1 (will be clobbered by ABI shuffle)
        "push rsi",                  // arg2
        "push rdx",                  // arg3
        "push r10",                  // arg4
        "push r8",                   // arg5
        "push r9",                   // arg6

        // Save frame pointer for fork() register capture.
        // RSP now points to the complete SyscallFrame on the kernel stack.
        // fork_process() reads this to give the child a copy of the parent's
        // live registers instead of the stale ThreadContext from exec/load.
        // The slot is a single global, not a per-CPU field.
        "mov [{frame_ptr}], rsp",

        // Rearrange registers from SYSCALL ABI to C calling convention.
        //
        // SYSCALL ABI:  rax=number, rdi=arg1, rsi=arg2, rdx=arg3, r10=arg4, r8=arg5, r9=arg6
        // C convention: rdi=param1, rsi=param2, rdx=param3, rcx=param4, r8=param5, r9=param6
        //
        // We need: rdi=rax, rsi=rdi, rdx=rsi, rcx=rdx, r8=r10, r9=r8
        // Use xchg chain through rax as accumulator to rotate the values.
        //
        // The C convention only has six register parameters and the first is
        // taken by the syscall number, so arg6 (user r9) is not forwarded to
        // the handler; it remains available through the saved SyscallFrame.
        "xchg rdi, rax",             // rdi = syscall_num (rax), rax = arg1 (old rdi)
        "xchg rsi, rax",             // rsi = arg1 (rax), rax = arg2 (old rsi)
        "xchg rdx, rax",             // rdx = arg2 (rax), rax = arg3 (old rdx)
        "mov rcx, rax",              // rcx = arg3 (old rdx); user RIP is already on the stack
        "mov r9, r8",                // r9 = arg5 (must precede r8 overwrite)
        "mov r8, r10",               // r8 = arg4

        "call {handler}",

        // Clear frame pointer now that handler has returned.
        // This prevents stale pointer use outside syscall context.
        "mov qword ptr [{frame_ptr}], 0",

        // Restore user registers (reverse order of saves).
        // rax holds the syscall return value and is NOT restored.
        "pop r9",
        "pop r8",
        "pop r10",
        "pop rdx",
        "pop rsi",
        "pop rdi",
        "pop r15",
        "pop r14",
        "pop r13",
        "pop r12",
        "pop rbx",
        "pop rbp",
        "pop r11",                   // User RFLAGS
        "pop rcx",                   // User RIP

        // Restore user stack and return (no CR3 switching).
        // The user RSP is re-read from the per-CPU slot rather than from the
        // stack frame.
        // NOTE(review): this presumes nothing overwrote gs:[0x8] while the
        // handler ran (e.g. another task entering a syscall after a context
        // switch) -- confirm against the scheduler.
        "mov rsp, gs:[0x8]",        // Restore user RSP
        "swapgs",                    // Switch back to user GS
        "sysretq",

        handler = sym syscall_handler,
        frame_ptr = sym SYSCALL_FRAME_PTR,
    );
}
214
215/// Initialize SYSCALL/SYSRET support.
216///
217/// Configures the following MSRs:
218/// - **EFER**: Enable SYSCALL/SYSRET extensions
219/// - **LSTAR**: Set syscall entry point to `syscall_entry`
220/// - **STAR**: Set segment selectors for SYSCALL (kernel) and SYSRET (user)
221/// - **SFMASK**: Mask IF flag so syscall entry runs with interrupts disabled
222/// - **KernelGsBase**: Point to `PerCpuData` for swapgs in syscall_entry
223///
224/// Must be called after `gdt::init()` and before any user-mode transitions.
225pub fn init_syscall() {
226    use x86_64::registers::{
227        model_specific::{Efer, EferFlags, KernelGsBase, LStar, SFMask, Star},
228        rflags::RFlags,
229    };
230
231    use super::gdt;
232
233    let sels = gdt::selectors();
234
235    // SAFETY: Writing MSRs to configure SYSCALL/SYSRET is required during
236    // kernel init for system call support. EFER, LSTAR, STAR, SFMASK, and
237    // KernelGsBase are x86_64 model-specific registers that control the
238    // SYSCALL instruction behavior. This is called with interrupts disabled
239    // during single-threaded init.
240    unsafe {
241        // Enable SYSCALL/SYSRET
242        Efer::update(|flags| {
243            flags.insert(EferFlags::SYSTEM_CALL_EXTENSIONS);
244        });
245    }
246
247    // Set up SYSCALL entry point
248    LStar::write(x86_64::VirtAddr::new(syscall_entry as usize as u64));
249
250    // Set up segment selectors for SYSCALL/SYSRET transitions.
251    //
252    // GDT layout after gdt::init():
253    //   0x08: Kernel CS (Ring 0)
254    //   0x10: Kernel DS (Ring 0)
255    //   0x18: TSS (occupies 2 entries)
256    //   0x28: User Data (Ring 3, selector 0x2B with RPL)
257    //   0x30: User Code (Ring 3, selector 0x33 with RPL)
258    //
259    // Star::write validates:
260    //   cs_sysret(0x33) - 16 = 0x23 == ss_sysret(0x2B) - 8 = 0x23  (match)
261    //   cs_syscall(0x08) == ss_syscall(0x10) - 8 = 0x08              (match)
262    //   ss_sysret RPL = 3 (Ring3)                                     (correct)
263    //   ss_syscall RPL = 0 (Ring0)                                    (correct)
264    //
265    // Internally writes STAR[63:48] = ss_sysret - 8 = 0x23, which means:
266    //   SYSRET: CS = 0x23+16 = 0x33 (user code), SS = 0x23+8 = 0x2B (user data)
267    Star::write(
268        sels.user_code_selector, // User CS for SYSRET (0x33)
269        sels.user_data_selector, // User SS for SYSRET (0x2B)
270        sels.code_selector,      // Kernel CS for SYSCALL (0x08)
271        sels.data_selector,      // Kernel SS for SYSCALL (0x10)
272    )
273    .expect("failed to configure STAR MSR segment selectors");
274
275    // SFMASK: mask the IF flag during SYSCALL so we enter with interrupts
276    // disabled. This prevents interrupt handlers from firing before we have
277    // switched to the kernel stack.
278    SFMask::write(RFlags::INTERRUPT_FLAG);
279
280    // Set up per-CPU data for swapgs.
281    // KernelGsBase is swapped with GsBase on the `swapgs` instruction.
282    // After swapgs in syscall_entry, GS points to our PerCpuData so the
283    // assembly can read kernel_rsp from gs:[0x0] and save user_rsp to gs:[0x8].
284    //
285    // CR3 initialization removed: Process page tables now contain complete
286    // kernel mappings (L4 entries 256-511), so syscalls run with user CR3
287    // and can directly access kernel data structures.
288
289    let per_cpu_addr = per_cpu_data_ptr() as u64;
290    KernelGsBase::write(x86_64::VirtAddr::new(per_cpu_addr));
291}