⚠️ VeridianOS Kernel Documentation. This is low-level `no_std` kernel code; all functions are unsafe unless explicitly marked otherwise.

veridian_kernel/arch/x86_64/usermode.rs

//! User-mode entry point for x86_64
//!
//! Provides `enter_usermode()` which pushes the iretq frame and transitions
//! the CPU from Ring 0 to Ring 3. Also provides `map_user_page()` for
//! creating user-accessible page table entries through the bootloader's
//! physical memory mapping.
//!
//! `enter_usermode_returnable()` is a variant that saves the boot context
//! (callee-saved registers, RSP, CR3) so that `sys_exit` can restore it
//! and effectively "return" to the caller, allowing sequential user-mode
//! program execution during bootstrap.
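//!
//! # Bootstrap flow (sketch)
//!
//! A minimal sketch of the returnable path, assuming the Ring 3 selectors
//! 0x33/0x2B used elsewhere in this module and a valid process CR3:
//!
//! ```ignore
//! unsafe {
//!     // Runs one program; control "returns" here when it calls sys_exit
//!     // and boot_return_to_kernel() restores the saved boot context.
//!     enter_usermode_returnable(entry, stack, 0x33, 0x2B, cr3, kernel_rsp_ptr);
//! }
//! ```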

use core::{
    arch::asm,
    sync::atomic::{AtomicU64, Ordering},
};

/// Saved bootstrap RSP for returning after a user process exits.
/// Set by `enter_usermode_returnable()`, consumed by `boot_return_to_kernel()`.
pub(crate) static BOOT_RETURN_RSP: AtomicU64 = AtomicU64::new(0);

/// Saved bootstrap CR3 for returning after a user process exits.
pub(crate) static BOOT_RETURN_CR3: AtomicU64 = AtomicU64::new(0);

/// Stack canary for detecting corruption of the boot context.
/// Set to a known value when the boot context is saved, verified before
/// restore. A mismatch indicates stack corruption (buffer overflow,
/// use-after-free, etc.).
pub(crate) static BOOT_STACK_CANARY: AtomicU64 = AtomicU64::new(0);

/// Magic value for the boot stack canary.
/// Chosen to be unlikely to appear naturally in memory.
const BOOT_CANARY_MAGIC: u64 = 0xDEAD_BEEF_CAFE_BABE;

/// Enter user mode for the first time via iretq.
///
/// The iretq instruction pops RIP, CS, RFLAGS, RSP, and SS (in that order)
/// from the stack and transitions the CPU to the privilege level specified
/// in the CS selector's RPL field.
///
/// # Arguments
/// - `entry_point`: User-space RIP (entry point of the user program)
/// - `user_stack`: User-space RSP (top of user stack)
/// - `user_cs`: User code segment selector with RPL=3 (0x33)
/// - `user_ss`: User data segment selector with RPL=3 (0x2B)
///
/// # Safety
/// - `entry_point` must be a valid user-space address with executable code
///   mapped
/// - `user_stack` must be a valid user-space stack address, 16-byte aligned
/// - The correct page tables must be loaded in CR3 with USER-accessible
///   mappings
/// - Per-CPU data (`kernel_rsp`) must be set before calling this, otherwise
///   the first syscall or interrupt will crash due to an invalid kernel stack
/// - The GDT must contain valid Ring 3 segments at the specified selectors
pub unsafe fn enter_usermode(entry_point: u64, user_stack: u64, user_cs: u64, user_ss: u64) -> ! {
    // SAFETY: We build the iretq frame on the current kernel stack.
    // iretq expects (from top of stack): RIP, CS, RFLAGS, RSP, SS.
    // We set DS and ES to the user data selector and clear FS/GS.
    // RFLAGS = 0x202: bit 1 (reserved, always 1) + bit 9 (IF = interrupts enabled).
    // The caller guarantees all arguments point to valid mapped memory and
    // the GDT/TSS/per-CPU data are properly configured.
    //
    // If the process has TLS (fs_base != 0), set FS_BASE via wrmsr AFTER
    // clearing the FS selector. Writing 0 to FS zeros FS_BASE, so the
    // wrmsr must come after to re-establish TLS.
    let fs_base = crate::process::current_process()
        .map(|p| p.tls_fs_base.load(core::sync::atomic::Ordering::Acquire))
        .unwrap_or(0);

    let fs_lo = fs_base as u32;
    let fs_hi = (fs_base >> 32) as u32;

    asm!(
        // Set data segment registers to the user data selector
        "mov ds, {ss:r}",
        "mov es, {ss:r}",
        // Clear FS and GS. Writing 0 to FS zeros FS_BASE (MSR 0xC0000100).
        "mov fs, {zero:x}",
        "mov gs, {zero:x}",
        // Build the iretq frame on the current kernel stack FIRST, before wrmsr.
        // wrmsr uses ECX (MSR number) and EDX:EAX (value) as IMPLICIT operands.
        // All iretq frame values must be pushed BEFORE the wrmsr sequence,
        // because it overwrites ECX and consumes EAX/EDX.
        //
        // CRITICAL: fs_lo and fs_hi are bound to EAX and EDX via explicit
        // register constraints (`in("eax")` / `in("edx")`). This prevents
        // the compiler from allocating them to ECX, which gets overwritten
        // by `mov ecx, 0xC0000100`. Without this, the release optimizer can
        // allocate fs_hi to ECX, causing `mov edx, ecx` to load 0xC0000100
        // instead of the actual fs_hi value, which makes wrmsr write a
        // non-canonical address to FS_BASE and triggers a GP fault.
        //
        //   [RSP+0]  RIP    - user entry point
        //   [RSP+8]  CS     - user code segment (Ring 3)
        //   [RSP+16] RFLAGS - IF set (0x202)
        //   [RSP+24] RSP    - user stack pointer
        //   [RSP+32] SS     - user stack segment (Ring 3)
        "push {ss}",       // SS
        "push {rsp}",      // RSP (user stack)
        "push {rflags}",   // RFLAGS (IF enabled)
        "push {cs}",       // CS
        "push {rip}",      // RIP (entry point)
        // Now restore FS_BASE for TLS if non-zero. All operand values are safely
        // on the stack. fs_lo is already in EAX, fs_hi is already in EDX.
        // Only ECX needs to be loaded with the MSR number.
        "test edx, edx",
        "jnz 2f",
        "test eax, eax",
        "jz 3f",
        "2:",
        "mov ecx, 0xC0000100",
        "wrmsr",
        "3:",
        "iretq",
        ss = in(reg) user_ss,
        rsp = in(reg) user_stack,
        rflags = in(reg) 0x202u64,
        cs = in(reg) user_cs,
        rip = in(reg) entry_point,
        zero = in(reg) 0u64,
        in("eax") fs_lo,
        in("edx") fs_hi,
        options(noreturn)
    );
}
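
// Documentation-only sketch (this type is not used by the assembly above):
// the iretq frame as a struct, matching the push sequence in `enter_usermode`.
// RIP sits at the lowest address (top of stack) when iretq executes.
#[repr(C)]
#[allow(dead_code)]
struct IretqFrame {
    rip: u64,    // [RSP+0]  user entry point
    cs: u64,     // [RSP+8]  user code segment (Ring 3)
    rflags: u64, // [RSP+16] 0x202 (IF set)
    rsp: u64,    // [RSP+24] user stack pointer
    ss: u64,     // [RSP+32] user stack segment (Ring 3)
}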

/// Enter user mode with the ability to return when the process exits.
///
/// Saves callee-saved registers and the current RSP/CR3 to globals before
/// performing iretq. When the user process calls `sys_exit`, the
/// `boot_return_to_kernel()` function restores the saved context, making
/// this function appear to return normally.
///
/// # Arguments
/// - `entry_point`: User-space RIP
/// - `user_stack`: User-space RSP
/// - `user_cs`: User CS selector (Ring 3)
/// - `user_ss`: User SS selector (Ring 3)
/// - `process_cr3`: Physical address of the process's L4 page table
/// - `kernel_rsp_ptr`: Pointer to per-CPU kernel_rsp (written after context
///   save)
///
/// # Safety
/// Same requirements as `enter_usermode`, plus:
/// - `process_cr3` must be a valid L4 page table with both user and kernel
///   mappings
/// - `kernel_rsp_ptr` must point to a valid u64 for storing the kernel RSP
#[unsafe(naked)]
pub unsafe extern "C" fn enter_usermode_returnable(
    _entry_point: u64,    // rdi
    _user_stack: u64,     // rsi
    _user_cs: u64,        // rdx
    _user_ss: u64,        // rcx
    _process_cr3: u64,    // r8
    _kernel_rsp_ptr: u64, // r9
) {
    core::arch::naked_asm!(
        // Save callee-saved registers (System V ABI)
        "push rbp",
        "push rbx",
        "push r12",
        "push r13",
        "push r14",
        "push r15",

        // Alignment padding: after 6 pushes from function entry (RSP was
        // 16n+8 after the CALL), RSP is now 16n+8 - 48 = 16m+8 (mod 16 = 8).
        // syscall_entry loads kernel_rsp, pushes 14 registers (112 bytes,
        // alignment-neutral), then does CALL handler. For the handler to get
        // the ABI-required RSP mod 16 = 8, the loaded kernel_rsp must be
        // mod 16 = 0. Adding 8 bytes of padding achieves this:
        //   16m+8 - 8 = 16m (mod 16 = 0).
        // boot_return_to_kernel must skip this padding when restoring RSP.
        "sub rsp, 8",

        // FIX 3: Set the stack canary BEFORE saving the boot context.
        // Load the canary magic value and store it to the global.
        "mov rax, {canary_magic}",
        "lea r12, [rip + {boot_canary}]",
        "mov [r12], rax",

        // Save boot CR3 to global
        "mov rax, cr3",
        "lea r12, [rip + {boot_cr3}]",
        "mov [r12], rax",

        // Save boot RSP to global (includes alignment padding)
        "lea r12, [rip + {boot_rsp}]",
        "mov [r12], rsp",

        // Update per-CPU kernel_rsp via the pointer passed in r9.
        // This value is 16-byte aligned, ensuring syscall handlers get
        // correct SSE alignment for movaps instructions.
        "mov [r9], rsp",

        // Switch to process page tables
        "mov cr3, r8",

        // Set segment registers for user mode
        "mov ds, ecx",
        "mov es, ecx",
        "xor eax, eax",
        "mov fs, ax",
        "mov gs, ax",

        // Build iretq frame on stack
        "push rcx",       // SS
        "push rsi",       // RSP (user stack)
        "push 0x202",     // RFLAGS (IF enabled)
        "push rdx",       // CS
        "push rdi",       // RIP (entry point)

        "iretq",

        boot_cr3 = sym BOOT_RETURN_CR3,
        boot_rsp = sym BOOT_RETURN_RSP,
        boot_canary = sym BOOT_STACK_CANARY,
        canary_magic = const BOOT_CANARY_MAGIC,
    );
}
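
// A minimal usage sketch (`run_user_program_once` is a hypothetical caller,
// assuming the standard Ring 3 selectors 0x33/0x2B used elsewhere in this
// module): run one user program and regain control when it calls sys_exit.
#[allow(dead_code)]
unsafe fn run_user_program_once(entry: u64, stack: u64, process_cr3: u64) {
    let per_cpu = crate::arch::x86_64::syscall::per_cpu_data_ptr();
    let kernel_rsp_ptr = core::ptr::addr_of_mut!((*per_cpu).kernel_rsp) as u64;
    // "Returns" here when the process exits and boot_return_to_kernel()
    // restores the boot context saved by this call.
    enter_usermode_returnable(entry, stack, 0x33, 0x2B, process_cr3, kernel_rsp_ptr);
}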

/// All GPRs needed to resume a forked child in user mode.
///
/// Built by `boot_run_forked_child` from the child's `X86_64Context`
/// (which captured the parent's live registers at fork time).
/// Passed to `enter_forked_child_returnable` so it can restore every
/// register before `iretq`, not just RAX/RIP/RSP.
#[repr(C)]
pub struct ForkChildRegs {
    pub rip: u64,    // offset  0
    pub rsp: u64,    // offset  8
    pub rflags: u64, // offset 16
    pub rax: u64,    // offset 24
    pub rbx: u64,    // offset 32
    pub rcx: u64,    // offset 40
    pub rdx: u64,    // offset 48
    pub rsi: u64,    // offset 56
    pub rdi: u64,    // offset 64
    pub rbp: u64,    // offset 72
    pub r8: u64,     // offset 80
    pub r9: u64,     // offset 88
    pub r10: u64,    // offset 96
    pub r11: u64,    // offset 104
    pub r12: u64,    // offset 112
    pub r13: u64,    // offset 120
    pub r14: u64,    // offset 128
    pub r15: u64,    // offset 136
}
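
// The assembly in `enter_forked_child_returnable` hard-codes these offsets.
// A compile-time cross-check (sketch; `core::mem::offset_of!` requires
// Rust 1.77+) keeps the struct layout and the asm in sync:
const _: () = {
    assert!(core::mem::offset_of!(ForkChildRegs, rip) == 0);
    assert!(core::mem::offset_of!(ForkChildRegs, rflags) == 16);
    assert!(core::mem::offset_of!(ForkChildRegs, rax) == 24);
    assert!(core::mem::offset_of!(ForkChildRegs, r15) == 136);
};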

/// Enter user mode for a forked child, restoring ALL GPRs from `regs`.
///
/// Used by `boot_run_forked_child` to dispatch a forked child inline
/// from the parent's wait loop. The child resumes at the instruction
/// after fork() with every register matching the parent's state at the
/// moment of the SYSCALL, except RAX, which is 0 (fork child return).
///
/// # Safety
/// Same preconditions as `enter_usermode_returnable`.
/// `regs` must point to a valid `ForkChildRegs` that remains accessible
/// after the CR3 switch (kernel mapping must cover it).
#[unsafe(naked)]
pub unsafe extern "C" fn enter_forked_child_returnable(
    _regs: *const ForkChildRegs, // rdi
    _process_cr3: u64,           // rsi
    _kernel_rsp_ptr: u64,        // rdx
) {
    core::arch::naked_asm!(
        // ---- save boot context (same layout as enter_usermode_returnable) ----
        "push rbp",
        "push rbx",
        "push r12",
        "push r13",
        "push r14",
        "push r15",
        "sub rsp, 8",          // alignment padding

        // Set stack canary
        "mov rax, {canary_magic}",
        "lea r12, [rip + {boot_canary}]",
        "mov [r12], rax",

        // Save boot CR3
        "mov rax, cr3",
        "lea r12, [rip + {boot_cr3}]",
        "mov [r12], rax",

        // Save boot RSP
        "lea r12, [rip + {boot_rsp}]",
        "mov [r12], rsp",

        // Stash regs pointer in r15 (already saved on boot stack)
        "mov r15, rdi",

        // Update per-CPU kernel_rsp
        "mov [rdx], rsp",

        // Switch to child's page tables
        "mov cr3, rsi",

        // Set segment registers for user mode
        "mov eax, 0x2B",
        "mov ds, ax",
        "mov es, ax",
        "xor eax, eax",
        "mov fs, ax",
        "mov gs, ax",

        // ---- build iretq frame from ForkChildRegs ----
        "push 0x2B",                   // SS  (user data segment)
        "push qword ptr [r15 + 8]",    // RSP (user stack)
        "push qword ptr [r15 + 16]",   // RFLAGS
        "push 0x33",                   // CS  (user code segment)
        "push qword ptr [r15 + 0]",    // RIP (entry point)

        // ---- restore ALL GPRs from the struct (r15 = pointer, loaded last) ----
        "mov rax, [r15 + 24]",
        "mov rbx, [r15 + 32]",
        "mov rcx, [r15 + 40]",
        "mov rdx, [r15 + 48]",
        "mov rsi, [r15 + 56]",
        "mov rdi, [r15 + 64]",
        "mov rbp, [r15 + 72]",
        "mov r8,  [r15 + 80]",
        "mov r9,  [r15 + 88]",
        "mov r10, [r15 + 96]",
        "mov r11, [r15 + 104]",
        "mov r12, [r15 + 112]",
        "mov r13, [r15 + 120]",
        "mov r14, [r15 + 128]",
        "mov r15, [r15 + 136]",        // pointer overwritten; must be last

        "iretq",

        boot_cr3 = sym BOOT_RETURN_CR3,
        boot_rsp = sym BOOT_RETURN_RSP,
        boot_canary = sym BOOT_STACK_CANARY,
        canary_magic = const BOOT_CANARY_MAGIC,
    );
}
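
// Sketch of a hypothetical helper (`fork_child_regs_from` is illustrative,
// not part of the kernel API): duplicate the parent's captured registers and
// zero RAX so the child observes fork() returning 0.
#[allow(dead_code)]
fn fork_child_regs_from(parent: &ForkChildRegs) -> ForkChildRegs {
    // SAFETY: ForkChildRegs is #[repr(C)] plain-old-data (all u64 fields),
    // so a bitwise copy is valid.
    let mut child = unsafe { core::ptr::read(parent) };
    child.rax = 0; // fork() returns 0 in the child
    child
}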

/// Restore the boot context saved by `enter_usermode_returnable` and return
/// to the bootstrap code.
///
/// Called from `sys_exit` after cleaning up the exiting process. This function:
/// 1. Restores the boot CR3 (switching back to boot page tables)
/// 2. Restores kernel segment registers (DS, ES reloaded with the kernel data
///    selector; FS, GS cleared)
/// 3. Does `swapgs` to balance the swapgs from `syscall_entry`
/// 4. Restores RSP to the saved value (past the callee-saved pushes)
/// 5. Pops callee-saved registers and returns to the caller of
///    `enter_usermode_returnable`
///
/// # Safety
/// - Must only be called when `BOOT_RETURN_RSP` and `BOOT_RETURN_CR3` are valid
/// - Must be called from kernel mode on the kernel stack set by syscall_entry
/// - The saved boot stack frame must still be intact
///
/// # Implementation Notes
/// - `#[inline(never)]` prevents aggressive optimization that could corrupt the
///   stack frame restoration in release builds
/// - `compiler_fence` ensures loads complete before subsequent operations
/// - `black_box` prevents constant propagation and reordering of critical
///   values
#[inline(never)]
pub unsafe fn boot_return_to_kernel() -> ! {
    // RAW SERIAL DIAGNOSTIC: Trace boot return entry
    crate::arch::x86_64::idt::raw_serial_str(b"[BOOT_RETURN ENTRY]\n");

    // FIX 2 & 6: Use black_box to force the compiler to treat these values as
    // opaque, preventing optimization assumptions. Follow with a compiler
    // fence to prevent instruction reordering across this boundary.
    //
    // CRITICAL FIX: The release optimizer was reusing RAX after `xor eax, eax`
    // (used to zero FS/GS) to load RSP, which set RSP=0 and caused a double
    // fault. We now use inline assembly with explicit register constraints
    // to force RSP into a register that won't be clobbered, and keep CR3
    // separate. The final asm! block below uses explicit `in("rcx")` /
    // `in("rdx")` constraints to prevent the compiler from reusing these
    // registers.
    let rsp: u64;
    let cr3: u64;
    let canary: u64;

    // Load values with explicit register assignments to prevent optimization
    asm!(
        "mov {rsp}, [{rsp_addr}]",
        "mov {cr3}, [{cr3_addr}]",
        "mov {canary}, [{canary_addr}]",
        rsp = out(reg) rsp,
        cr3 = out(reg) cr3,
        canary = out(reg) canary,
        rsp_addr = in(reg) &BOOT_RETURN_RSP,
        cr3_addr = in(reg) &BOOT_RETURN_CR3,
        canary_addr = in(reg) &BOOT_STACK_CANARY,
        options(nostack, preserves_flags)
    );

    // Apply black_box to prevent further optimization
    let rsp = core::hint::black_box(rsp);
    let cr3 = core::hint::black_box(cr3);
    let canary = core::hint::black_box(canary);
    core::sync::atomic::compiler_fence(Ordering::SeqCst);

    // FIX 3: Validate the stack canary before restoring context.
    // If the canary doesn't match, the boot stack has been corrupted.
    if canary != BOOT_CANARY_MAGIC {
        crate::arch::x86_64::idt::raw_serial_str(b"[BOOT_RETURN] FATAL: Stack canary mismatch!\n");
        crate::arch::x86_64::idt::raw_serial_str(b"Expected: 0x");
        crate::arch::x86_64::idt::raw_serial_hex(BOOT_CANARY_MAGIC);
        crate::arch::x86_64::idt::raw_serial_str(b"\nGot:      0x");
        crate::arch::x86_64::idt::raw_serial_hex(canary);
        crate::arch::x86_64::idt::raw_serial_str(b"\n");
        panic!("Stack canary mismatch - boot context corrupted");
    }

    // NOTE: Cannot use println! here - it would access locks/memory with the
    // wrong CR3.
    // crate::println!("[BOOT-RETURN] RSP={:#x} CR3={:#x}", rsp, cr3);

    // Clear the boot return context (one-shot)
    BOOT_RETURN_RSP.store(0, Ordering::SeqCst);
    BOOT_RETURN_CR3.store(0, Ordering::SeqCst);
    BOOT_STACK_CANARY.store(0, Ordering::SeqCst);

    // SAFETY: cr3 is the boot page table address saved before entering user
    // mode. rsp points to the stack with 8 bytes of alignment padding and
    // 6 callee-saved registers, with the return address below them. We
    // restore kernel segment registers and balance the swapgs from
    // syscall_entry. The swapgs must come BEFORE clearing GS so we don't
    // corrupt KERNEL_GS_BASE. After restoring RSP and popping registers,
    // ret returns to the caller of enter_usermode_returnable.
    //
    // CRITICAL FIX FOR OPT-LEVEL S/Z/3: The optimizer was allocating RSP
    // to RAX, which then got clobbered by the `xor eax, eax` used for zeroing
    // FS/GS. We now explicitly allocate RSP to RCX and CR3 to RDX, both
    // of which are preserved across the segment register operations. This
    // is the ONLY way to prevent the optimizer from reusing RAX.
    asm!(
        "mov cr3, rdx",       // Restore boot page tables (CR3 in RDX)
        "swapgs",             // Balance syscall_entry's swapgs (before touching GS!)
        "mov ax, 0x10",       // Kernel data segment (GDT index 2, RPL 0)
        "mov ds, ax",         // Restore kernel DS
        "mov es, ax",         // Restore kernel ES
        "xor eax, eax",       // Zero FS and GS (clobbers RAX but NOT RCX/RDX!)
        "mov fs, ax",
        "mov gs, ax",
        "mov rsp, rcx",       // Restore saved boot RSP (RSP in RCX, safe!)
        "add rsp, 8",         // Skip alignment padding from enter_usermode_returnable
        "pop r15",
        "pop r14",
        "pop r13",
        "pop r12",
        "pop rbx",
        "pop rbp",
        "ret",                // Return to caller of enter_usermode_returnable
        in("rcx") rsp,        // RSP MUST be in RCX (preserved across xor eax, eax)
        in("rdx") cr3,        // CR3 MUST be in RDX (preserved across xor eax, eax)
        options(noreturn)
    );
}

/// Check whether a boot return context is available.
///
/// Returns `true` if `enter_usermode_returnable` has saved a boot context
/// that `boot_return_to_kernel` can restore.
pub fn has_boot_return_context() -> bool {
    BOOT_RETURN_RSP.load(Ordering::SeqCst) != 0
}
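
// Sketch of how sys_exit's tail might use these (`sys_exit_tail_sketch` is a
// hypothetical shape; the real handler also tears down the exiting process
// first):
#[allow(dead_code)]
unsafe fn sys_exit_tail_sketch() -> ! {
    if has_boot_return_context() {
        // Restores the saved boot context and never returns here.
        boot_return_to_kernel();
    }
    // No saved context: nothing to return to during bootstrap.
    loop {
        core::hint::spin_loop();
    }
}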

/// Physical memory offset provided by the bootloader.
///
/// All physical memory is mapped at virtual address `phys_addr + PHYS_OFFSET`.
/// Initialized during `try_enter_usermode()` from BOOT_INFO.
static PHYS_OFFSET: core::sync::atomic::AtomicU64 = core::sync::atomic::AtomicU64::new(0);

/// Get the physical memory offset, or 0 if not yet initialized.
///
/// Used by kernel subsystems that need to convert physical addresses to
/// virtual addresses after the initial user-mode setup.
#[allow(dead_code)] // Helper for phys_to_virt below
fn phys_offset() -> u64 {
    PHYS_OFFSET.load(core::sync::atomic::Ordering::Relaxed)
}

/// Convert a physical address to a virtual address via the bootloader's
/// physical memory mapping.
///
/// Returns `None` if the physical memory offset has not been initialized.
/// Used by kernel subsystems that need to access physical memory after
/// the initial user-mode setup.
#[allow(dead_code)] // Physical-to-virtual conversion for page table manipulation
fn phys_to_virt(phys: u64) -> Option<u64> {
    let offset = phys_offset();
    if offset == 0 {
        return None;
    }
    Some(phys + offset)
}

/// Page table entry flags for x86_64 4-level paging.
const PTE_PRESENT: u64 = 1 << 0;
const PTE_WRITABLE: u64 = 1 << 1;
const PTE_USER: u64 = 1 << 2;

/// Extract the physical address of the next-level page table from a PTE.
///
/// The physical address is stored in bits 12..=51 of the entry.
fn pte_phys_addr(entry: u64) -> u64 {
    entry & 0x000F_FFFF_FFFF_F000
}
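
// Sketch of the index decomposition used by `map_user_page` below: each
// level consumes 9 bits of the virtual address, starting at bit 39.
// (`page_table_indices` is an illustrative helper, not kernel API.)
#[allow(dead_code)]
fn page_table_indices(virt: u64) -> [usize; 4] {
    // [PML4, PDPT, PD, PT]
    [39u32, 30, 21, 12].map(|shift| ((virt >> shift) & 0x1FF) as usize)
}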

/// Map a single 4KiB page in the current page tables with USER access.
///
/// Walks the 4-level page table hierarchy (PML4 -> PDPT -> PD -> PT),
/// allocating intermediate tables as needed from the frame allocator.
/// The leaf entry maps `virt_addr` to `phys_frame_addr` with the given flags.
///
/// # Safety
/// - `phys_offset_val` must be the correct bootloader physical memory offset
/// - `virt_addr` must be page-aligned (4KiB)
/// - `phys_frame_addr` must be a valid, page-aligned physical address
/// - The caller must ensure no conflicting mapping exists
unsafe fn map_user_page(
    phys_offset_val: u64,
    virt_addr: u64,
    phys_frame_addr: u64,
    flags: u64,
) -> Result<(), crate::error::KernelError> {
    // Read current CR3 to get the PML4 physical address
    let cr3: u64;
    // SAFETY: Reading CR3 is always valid in kernel mode.
    asm!("mov {}, cr3", out(reg) cr3);
    let pml4_phys = cr3 & 0x000F_FFFF_FFFF_F000;

    // Extract page table indices from the virtual address
    let pml4_idx = ((virt_addr >> 39) & 0x1FF) as usize;
    let pdpt_idx = ((virt_addr >> 30) & 0x1FF) as usize;
    let pd_idx = ((virt_addr >> 21) & 0x1FF) as usize;
    let pt_idx = ((virt_addr >> 12) & 0x1FF) as usize;

    // Walk PML4 -> PDPT
    let pml4_virt = (pml4_phys + phys_offset_val) as *mut u64;
    let pml4_entry = pml4_virt.add(pml4_idx);
    let pdpt_phys = ensure_table_present(pml4_entry, phys_offset_val)?;

    // Walk PDPT -> PD
    let pdpt_virt = (pdpt_phys + phys_offset_val) as *mut u64;
    let pdpt_entry = pdpt_virt.add(pdpt_idx);
    let pd_phys = ensure_table_present(pdpt_entry, phys_offset_val)?;

    // Walk PD -> PT
    let pd_virt = (pd_phys + phys_offset_val) as *mut u64;
    let pd_entry = pd_virt.add(pd_idx);
    let pt_phys = ensure_table_present(pd_entry, phys_offset_val)?;

    // Set the leaf PT entry
    let pt_virt = (pt_phys + phys_offset_val) as *mut u64;
    let pt_entry = pt_virt.add(pt_idx);
    // SAFETY: pt_entry points into a valid page table mapped via the physical
    // memory offset. We write the leaf mapping: physical frame + flags.
    pt_entry.write_volatile(phys_frame_addr | flags);

    // Flush the TLB entry for this address.
    // SAFETY: invlpg invalidates the TLB entry for virt_addr. No side effects.
    asm!("invlpg [{}]", in(reg) virt_addr);

    Ok(())
}

/// Ensure a page table entry at `entry_ptr` is present. If not, allocate
/// a new zeroed frame for the next-level table and write the entry.
///
/// Returns the physical address of the next-level table.
///
/// # Safety
/// - `entry_ptr` must point to a valid page table entry in mapped memory
/// - `phys_offset_val` must be the correct physical memory offset
unsafe fn ensure_table_present(
    entry_ptr: *mut u64,
    phys_offset_val: u64,
) -> Result<u64, crate::error::KernelError> {
    // SAFETY: entry_ptr was computed from a valid page table base + index,
    // both within the physical memory mapping provided by the bootloader.
    let entry = entry_ptr.read_volatile();

    if (entry & PTE_PRESENT) != 0 {
        // Table already exists. Ensure the USER bit is set on intermediate
        // entries so user-mode accesses can traverse the hierarchy.
        let updated = entry | PTE_USER | PTE_WRITABLE;
        if updated != entry {
            // SAFETY: Updating flags on an existing present entry is safe.
            // We only add USER and WRITABLE bits to intermediate tables.
            entry_ptr.write_volatile(updated);
        }
        Ok(pte_phys_addr(entry))
    } else {
        // Allocate a new frame for the next-level table
        let frame = crate::mm::FRAME_ALLOCATOR
            .lock()
            .allocate_frames(1, None)
            .map_err(|_| crate::error::KernelError::ResourceExhausted {
                resource: "physical frames",
            })?;
        let frame_phys = frame.as_u64() * crate::mm::FRAME_SIZE as u64;

        // Zero the new table
        let frame_virt = (frame_phys + phys_offset_val) as *mut u8;
        // SAFETY: frame_virt points to a freshly allocated 4KiB frame mapped
        // via the physical memory offset. write_bytes zeroes the entire page.
        core::ptr::write_bytes(frame_virt, 0, 4096);

        // Write the entry: physical address + PRESENT + WRITABLE + USER
        let new_entry = frame_phys | PTE_PRESENT | PTE_WRITABLE | PTE_USER;
        // SAFETY: entry_ptr points to a valid PTE slot. Writing a new entry
        // that points to our freshly zeroed frame is safe.
        entry_ptr.write_volatile(new_entry);

        Ok(frame_phys)
    }
}

/// Check if a physical address is used by the active page table hierarchy.
///
/// Walks PML4 -> PDPT -> PD -> PT and returns true if `phys` matches any
/// page-table frame's base address. This is O(n) in the number of page table
/// pages (~1000 for a typical bootloader mapping).
///
/// # Safety
/// - `phys_offset` must be the bootloader's physical memory offset
/// - `pml4_phys` must be a valid PML4 physical address (from CR3)
unsafe fn is_page_table_frame(phys_offset: u64, pml4_phys: u64, phys: u64) -> bool {
    if phys == pml4_phys {
        return true;
    }

    let pml4_virt = (pml4_phys + phys_offset) as *const u64;
    for i in 0..512 {
        // SAFETY: pml4_virt + i is within the PML4 page, mapped via phys_offset.
        let pml4_entry = pml4_virt.add(i).read_volatile();
        if (pml4_entry & PTE_PRESENT) == 0 {
            continue;
        }
        let pdpt_phys = pte_phys_addr(pml4_entry);
        if phys == pdpt_phys {
            return true;
        }

        let pdpt_virt = (pdpt_phys + phys_offset) as *const u64;
        for j in 0..512 {
            // SAFETY: pdpt_virt + j is within the PDPT page.
            let pdpt_entry = pdpt_virt.add(j).read_volatile();
            if (pdpt_entry & PTE_PRESENT) == 0 {
                continue;
            }
            if (pdpt_entry & (1 << 7)) != 0 {
                continue; // 1GiB huge page
            }
            let pd_phys = pte_phys_addr(pdpt_entry);
            if phys == pd_phys {
                return true;
            }

            let pd_virt = (pd_phys + phys_offset) as *const u64;
            for k in 0..512 {
                // SAFETY: pd_virt + k is within the PD page.
                let pd_entry = pd_virt.add(k).read_volatile();
                if (pd_entry & PTE_PRESENT) == 0 {
                    continue;
                }
                if (pd_entry & (1 << 7)) != 0 {
                    continue; // 2MiB huge page
                }
                let pt_phys = pte_phys_addr(pd_entry);
                if phys == pt_phys {
                    return true;
                }
            }
        }
    }

    false
}

/// Allocate a physical frame that does not overlap with any active page table
/// page. Frames that are page table pages are allocated (to consume them from
/// the free pool) but not returned.
///
/// # Safety
/// - `phys_offset` and `pml4_phys` must be valid (see `is_page_table_frame`)
unsafe fn allocate_safe_frame(
    phys_offset: u64,
    pml4_phys: u64,
    count: usize,
) -> Result<crate::mm::FrameNumber, crate::error::KernelError> {
    use crate::mm::{FRAME_ALLOCATOR, FRAME_SIZE};

    // Try up to 8192 times (enough to skip the ~1050 page table frames)
    for _ in 0..8192 {
        let frame = FRAME_ALLOCATOR
            .lock()
            .allocate_frames(count, None)
            .map_err(|_| crate::error::KernelError::ResourceExhausted {
                resource: "physical frames",
            })?;
        let phys = frame.as_u64() * FRAME_SIZE as u64;

        // Check all allocated frames in the range
        let mut overlaps = false;
        for f in 0..count as u64 {
            if is_page_table_frame(phys_offset, pml4_phys, phys + f * FRAME_SIZE as u64) {
                overlaps = true;
                break;
            }
        }

        if !overlaps {
            return Ok(frame);
        }
        // Frame overlaps a page table page -- leave it allocated (consumed)
        // so the allocator won't return it again, and try the next one.
    }

    Err(crate::error::KernelError::ResourceExhausted {
        resource: "non-page-table frames",
    })
}

/// Attempt to enter user mode with the embedded init binary.
///
/// This function:
/// 1. Retrieves the physical memory offset from BOOT_INFO
/// 2. Allocates physical frames for user code and stack
/// 3. Maps them at user-accessible virtual addresses in the current page tables
/// 4. Copies the embedded INIT_CODE machine code to the code page
/// 5. Sets up the per-CPU kernel_rsp for syscall/interrupt return
/// 6. Transitions to Ring 3 via iretq
///
/// On success, this function does not return (it enters user mode).
/// On failure, it returns a KernelError for the caller to log.
pub fn try_enter_usermode() -> Result<(), crate::error::KernelError> {
    use crate::{mm::FRAME_SIZE, userspace::embedded};

    // Step 1: Get the physical memory offset from BOOT_INFO
    // SAFETY: BOOT_INFO is a static mut written once during early boot
    // (in main.rs) and only read afterwards. At this point we are in
    // single-threaded Stage 6 bootstrap, so no data race is possible.
    // We use addr_of! to avoid creating a direct reference to the static mut.
    let phys_offset_val = unsafe {
        let boot_info_ptr = core::ptr::addr_of!(crate::arch::x86_64::boot::BOOT_INFO);
        let boot_info =
            (*boot_info_ptr)
                .as_ref()
                .ok_or(crate::error::KernelError::NotInitialized {
                    subsystem: "BOOT_INFO",
                })?;
        boot_info.physical_memory_offset.into_option().ok_or(
            crate::error::KernelError::NotInitialized {
                subsystem: "physical memory offset",
            },
        )?
    };

    PHYS_OFFSET.store(phys_offset_val, core::sync::atomic::Ordering::Relaxed);

    // Step 1b: Read CR3 to identify page table frames that must not be
    // allocated for user-space use (the bootloader doesn't mark them as
    // reserved in the memory map).
    let cr3_val: u64;
    // SAFETY: Reading CR3 is always valid in kernel mode.
    unsafe {
        asm!("mov {}, cr3", out(reg) cr3_val);
    }
    let pml4_phys = cr3_val & 0x000F_FFFF_FFFF_F000;

    // Step 2: Get the embedded init code
    let init_code = embedded::init_code_bytes();

    // Step 3: Allocate physical frames, skipping any that are page table pages.
    // One frame for code (mapped at 0x400000).
    // One frame for stack (mapped at 0x7FFFF000; the stack grows down from 0x80000000).
    // SAFETY: phys_offset_val and pml4_phys are valid (verified above).
    let code_frame = unsafe { allocate_safe_frame(phys_offset_val, pml4_phys, 1)? };
    let code_phys = code_frame.as_u64() * FRAME_SIZE as u64;

    let stack_frame = unsafe { allocate_safe_frame(phys_offset_val, pml4_phys, 1)? };
    let stack_phys = stack_frame.as_u64() * FRAME_SIZE as u64;

    // Step 4: Map pages in the current page tables.
    // Code page at 0x400000 (PRESENT + WRITABLE + USER, executable).
    let code_vaddr: u64 = 0x40_0000;
    let stack_vaddr: u64 = 0x7FFF_F000;

    // SAFETY: We have verified that phys_offset_val is the correct bootloader
    // mapping offset. The virtual addresses are in the user-space range (below
    // 0x0000_8000_0000_0000) and do not conflict with kernel mappings. The
    // physical frames were just allocated and are valid.
    unsafe {
        map_user_page(
            phys_offset_val,
            code_vaddr,
            code_phys,
            PTE_PRESENT | PTE_WRITABLE | PTE_USER,
        )?;

        map_user_page(
            phys_offset_val,
            stack_vaddr,
            stack_phys,
            PTE_PRESENT | PTE_WRITABLE | PTE_USER,
        )?;
    }

    // Step 5: Copy the init code to the code page.
    // Access the code frame through the physical memory mapping.
    let code_virt_via_phys = phys_offset_val + code_phys;
    // SAFETY: code_virt_via_phys points to a freshly allocated, zeroed frame
    // accessible through the bootloader's physical memory mapping. We copy
    // init_code.len() bytes (< 4096) into the frame.
    unsafe {
        let dest = code_virt_via_phys as *mut u8;
        core::ptr::copy_nonoverlapping(init_code.as_ptr(), dest, init_code.len());
    }

    // Step 6: Set up the per-CPU kernel_rsp.
    // Allocate a dedicated kernel stack for syscall/interrupt return.
    // SAFETY: phys_offset_val and pml4_phys are valid.
    let kernel_stack_frame = unsafe { allocate_safe_frame(phys_offset_val, pml4_phys, 4)? };
    let kernel_stack_phys = kernel_stack_frame.as_u64() * FRAME_SIZE as u64;
    let kernel_stack_top = phys_offset_val + kernel_stack_phys + (4 * FRAME_SIZE as u64);

    // Write kernel_rsp to per-CPU data so syscall_entry can find it.
    let per_cpu = crate::arch::x86_64::syscall::per_cpu_data_ptr();
    // SAFETY: per_cpu_data_ptr() returns a valid pointer to the static
    // PerCpuData. We are in single-threaded bootstrap context. Setting
    // kernel_rsp before entering user mode is required for syscall_entry
    // to have a valid kernel stack.
    unsafe {
        (*per_cpu).kernel_rsp = kernel_stack_top;
    }

    // Step 7: Enter user mode.
    // User entry point = start of code at 0x400000.
    // User stack pointer = top of the stack page at 0x80000000 (the stack
    // grows down from there).
    let user_entry = code_vaddr;
    let user_stack = stack_vaddr + FRAME_SIZE as u64; // Top of the stack page
    let user_cs: u64 = 0x33; // User code segment (GDT index 6, RPL 3)
    let user_ss: u64 = 0x2B; // User data segment (GDT index 5, RPL 3)
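    // Selector arithmetic (for reference): selector = (GDT index << 3) | RPL,
    // so 0x33 = (6 << 3) | 3 and 0x2B = (5 << 3) | 3.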

    crate::println!(
        "[USERMODE] Entering Ring 3: entry={:#x} stack={:#x}",
        user_entry,
        user_stack,
    );

    // SAFETY: All preconditions for enter_usermode are met:
    // - entry_point (0x400000) has executable code mapped with USER access
    // - user_stack (0x80000000) points to the top of a mapped user stack page
    // - CS/SS are valid Ring 3 selectors from the GDT
    // - CR3 contains page tables with USER-accessible mappings
    // - Per-CPU kernel_rsp is set for syscall/interrupt return
    unsafe {
        enter_usermode(user_entry, user_stack, user_cs, user_ss);
    }
}