// veridian_kernel/arch/x86_64/usermode.rs
1//! User-mode entry point for x86_64
2//!
3//! Provides `enter_usermode()` which pushes the iretq frame and transitions
4//! the CPU from Ring 0 to Ring 3. Also provides `map_user_page()` for
5//! creating user-accessible page table entries through the bootloader's
6//! physical memory mapping.
7//!
8//! `enter_usermode_returnable()` is a variant that saves the boot context
9//! (callee-saved registers, RSP, CR3) so that `sys_exit` can restore it
10//! and effectively "return" to the caller, allowing sequential user-mode
11//! program execution during bootstrap.
12
13use core::{
14 arch::asm,
15 sync::atomic::{AtomicU64, Ordering},
16};
17
/// Saved bootstrap RSP for returning after a user process exits.
/// Set by `enter_usermode_returnable()`, consumed by `boot_return_to_kernel()`.
/// A value of 0 means "no saved context" (see `has_boot_return_context`).
pub(crate) static BOOT_RETURN_RSP: AtomicU64 = AtomicU64::new(0);

/// Saved bootstrap CR3 for returning after a user process exits.
/// Written together with `BOOT_RETURN_RSP`; restored before switching stacks.
pub(crate) static BOOT_RETURN_CR3: AtomicU64 = AtomicU64::new(0);

/// Stack canary for detecting corruption of the boot context.
/// Set to a known value when the boot context is saved, verified before
/// restore. A mismatch indicates stack corruption (buffer overflow,
/// use-after-free, etc.).
pub(crate) static BOOT_STACK_CANARY: AtomicU64 = AtomicU64::new(0);

/// Magic value for the boot stack canary.
/// Chosen to be unlikely to appear naturally in memory.
const BOOT_CANARY_MAGIC: u64 = 0xDEAD_BEEF_CAFE_BABE;
34
/// Enter user mode for the first time via iretq.
///
/// The iretq instruction pops SS, RSP, RFLAGS, CS, RIP from the stack
/// and transitions the CPU to the privilege level specified in the CS
/// selector's RPL field.
///
/// # Arguments
/// - `entry_point`: User-space RIP (entry point of the user program)
/// - `user_stack`: User-space RSP (top of user stack)
/// - `user_cs`: User code segment selector with RPL=3 (0x33)
/// - `user_ss`: User data segment selector with RPL=3 (0x2B)
///
/// # Safety
/// - `entry_point` must be a valid user-space address with executable code
///   mapped
/// - `user_stack` must be a valid user-space stack address, 16-byte aligned
/// - The correct page tables must be loaded in CR3 with USER-accessible
///   mappings
/// - Per-CPU data (`kernel_rsp`) must be set before calling this, otherwise the
///   first syscall or interrupt will crash due to invalid kernel stack
/// - The GDT must contain valid Ring 3 segments at the specified selectors
pub unsafe fn enter_usermode(entry_point: u64, user_stack: u64, user_cs: u64, user_ss: u64) -> ! {
    // SAFETY: We build the iretq frame on the current kernel stack.
    // iretq expects (from top of stack): RIP, CS, RFLAGS, RSP, SS.
    // We set DS and ES to the user data selector and clear FS/GS.
    // RFLAGS = 0x202: bit 1 (reserved, always 1) + bit 9 (IF = interrupts enabled).
    // The caller guarantees all arguments point to valid mapped memory and
    // the GDT/TSS/per-CPU data are properly configured.
    //
    // If the process has TLS (fs_base != 0), set FS_BASE via wrmsr AFTER
    // clearing the FS selector. Writing 0 to FS zeros FS_BASE, so the
    // wrmsr must come after to re-establish TLS.
    //
    // NOTE(review): fs_base falls back to 0 when there is no current process
    // (bootstrap path) — the wrmsr is then skipped entirely by the
    // test/jz sequence below.
    let fs_base = crate::process::current_process()
        .map(|p| p.tls_fs_base.load(core::sync::atomic::Ordering::Acquire))
        .unwrap_or(0);

    // Split the 64-bit FS_BASE into the EDX:EAX pair wrmsr expects.
    let fs_lo = fs_base as u32;
    let fs_hi = (fs_base >> 32) as u32;

    asm!(
        // Set data segment registers to user data selector
        "mov ds, {ss:r}",
        "mov es, {ss:r}",
        // Clear FS and GS. Writing 0 to FS zeros FS_BASE (MSR 0xC0000100).
        "mov fs, {zero:x}",
        "mov gs, {zero:x}",
        // Build iretq frame on current kernel stack FIRST, before wrmsr.
        // wrmsr uses ECX (MSR number), EDX:EAX (value) as IMPLICIT operands.
        // All iretq frame values must be pushed BEFORE wrmsr, because wrmsr
        // clobbers EAX/ECX/EDX.
        //
        // CRITICAL: fs_lo and fs_hi are bound to EAX and EDX via explicit
        // register constraints (`in("eax")` / `in("edx")`). This prevents
        // the compiler from allocating them to ECX, which gets overwritten
        // by `mov ecx, 0xC0000100`. Without this, the release optimizer can
        // allocate fs_hi to ECX, causing `mov edx, ecx` to load 0xC0000100
        // instead of the actual fs_hi value, which makes wrmsr write a
        // non-canonical address to FS_BASE and triggers a GP fault.
        //
        // [RSP+0]  RIP    - user entry point
        // [RSP+8]  CS     - user code segment (Ring 3)
        // [RSP+16] RFLAGS - IF set (0x202)
        // [RSP+24] RSP    - user stack pointer
        // [RSP+32] SS     - user stack segment (Ring 3)
        "push {ss}",     // SS
        "push {rsp}",    // RSP (user stack)
        "push {rflags}", // RFLAGS (IF enabled)
        "push {cs}",     // CS
        "push {rip}",    // RIP (entry point)
        // Now restore FS_BASE for TLS if non-zero. All operand values are safely
        // on the stack. fs_lo is already in EAX, fs_hi is already in EDX.
        // Only ECX needs to be loaded with the MSR number.
        // Skip the wrmsr when EDX:EAX == 0 (no TLS configured).
        "test edx, edx",
        "jnz 2f",
        "test eax, eax",
        "jz 3f",
        "2:",
        "mov ecx, 0xC0000100",
        "wrmsr",
        "3:",
        "iretq",
        ss = in(reg) user_ss,
        rsp = in(reg) user_stack,
        rflags = in(reg) 0x202u64,
        cs = in(reg) user_cs,
        rip = in(reg) entry_point,
        zero = in(reg) 0u64,
        in("eax") fs_lo,
        in("edx") fs_hi,
        options(noreturn)
    );
}
127
/// Enter user mode with the ability to return when the process exits.
///
/// Saves callee-saved registers and the current RSP/CR3 to globals before
/// performing iretq. When the user process calls `sys_exit`, the
/// `boot_return_to_kernel()` function restores the saved context, making
/// this function appear to return normally.
///
/// # Arguments
/// - `entry_point`: User-space RIP
/// - `user_stack`: User-space RSP
/// - `user_cs`: User CS selector (Ring 3)
/// - `user_ss`: User SS selector (Ring 3)
/// - `process_cr3`: Physical address of the process's L4 page table
/// - `kernel_rsp_ptr`: Pointer to per-CPU kernel_rsp (written after context
///   save)
///
/// # Safety
/// Same requirements as `enter_usermode`, plus:
/// - `process_cr3` must be a valid L4 page table with both user and kernel
///   mappings
/// - `kernel_rsp_ptr` must point to a valid u64 for storing the kernel RSP
#[unsafe(naked)]
pub unsafe extern "C" fn enter_usermode_returnable(
    _entry_point: u64,    // rdi
    _user_stack: u64,     // rsi
    _user_cs: u64,        // rdx
    _user_ss: u64,        // rcx
    _process_cr3: u64,    // r8
    _kernel_rsp_ptr: u64, // r9
) {
    core::arch::naked_asm!(
        // Save callee-saved registers (System V ABI).
        // The push order here is mirrored (reversed) by the pops in
        // boot_return_to_kernel — the two must stay in sync.
        "push rbp",
        "push rbx",
        "push r12",
        "push r13",
        "push r14",
        "push r15",

        // Alignment padding: after 6 pushes from function entry (RSP was
        // 16n+8 after the CALL), RSP is now 16n+8 - 48 = 16m+8 (mod 16 = 8).
        // syscall_entry loads kernel_rsp, pushes 14 registers (112 bytes,
        // alignment-neutral), then does CALL handler. For the handler to get
        // the ABI-required RSP mod 16 = 8, the loaded kernel_rsp must be
        // mod 16 = 0. Adding 8 bytes of padding achieves this:
        // 16m+8 - 8 = 16m (mod 16 = 0).
        // boot_return_to_kernel must skip this padding when restoring RSP.
        "sub rsp, 8",

        // FIX 3: Set stack canary BEFORE saving boot context
        // Load canary magic value and store to global.
        // r12 is free to clobber here: its original value was pushed above.
        "mov rax, {canary_magic}",
        "lea r12, [rip + {boot_canary}]",
        "mov [r12], rax",

        // Save boot CR3 to global
        "mov rax, cr3",
        "lea r12, [rip + {boot_cr3}]",
        "mov [r12], rax",

        // Save boot RSP to global (includes alignment padding)
        "lea r12, [rip + {boot_rsp}]",
        "mov [r12], rsp",

        // Update per-CPU kernel_rsp via pointer passed in r9
        // This value is 16-byte aligned, ensuring syscall handlers get
        // correct SSE alignment for movaps instructions.
        "mov [r9], rsp",

        // Switch to process page tables.
        // NOTE(review): the globals above must remain mapped in the process
        // page tables (kernel half) — guaranteed by the # Safety contract.
        "mov cr3, r8",

        // Set segment registers for user mode
        "mov ds, ecx",
        "mov es, ecx",
        "xor eax, eax",
        "mov fs, ax",
        "mov gs, ax",

        // Build iretq frame on stack (hardware RFLAGS = 0x202: IF set)
        "push rcx",   // SS
        "push rsi",   // RSP (user stack)
        "push 0x202", // RFLAGS (IF enabled)
        "push rdx",   // CS
        "push rdi",   // RIP (entry point)

        "iretq",

        boot_cr3 = sym BOOT_RETURN_CR3,
        boot_rsp = sym BOOT_RETURN_RSP,
        boot_canary = sym BOOT_STACK_CANARY,
        canary_magic = const BOOT_CANARY_MAGIC,
    );
}
222
/// All GPRs needed to resume a forked child in user mode.
///
/// Built by `boot_run_forked_child` from the child's `X86_64Context`
/// (which captured the parent's live registers at fork time).
/// Passed to `enter_forked_child_returnable` so it can restore every
/// register before `iretq`, not just RAX/RIP/RSP.
///
/// The `#[repr(C)]` layout is an ABI contract: the byte offsets noted per
/// field are hard-coded in the assembly of `enter_forked_child_returnable`.
/// Do NOT reorder, insert, or resize fields without updating that asm.
#[repr(C)]
pub struct ForkChildRegs {
    pub rip: u64,    // offset 0
    pub rsp: u64,    // offset 8
    pub rflags: u64, // offset 16
    pub rax: u64,    // offset 24
    pub rbx: u64,    // offset 32
    pub rcx: u64,    // offset 40
    pub rdx: u64,    // offset 48
    pub rsi: u64,    // offset 56
    pub rdi: u64,    // offset 64
    pub rbp: u64,    // offset 72
    pub r8: u64,     // offset 80
    pub r9: u64,     // offset 88
    pub r10: u64,    // offset 96
    pub r11: u64,    // offset 104
    pub r12: u64,    // offset 112
    pub r13: u64,    // offset 120
    pub r14: u64,    // offset 128
    pub r15: u64,    // offset 136
}
250
/// Enter user mode for a forked child, restoring ALL GPRs from `regs`.
///
/// Used by `boot_run_forked_child` to dispatch a forked child inline
/// from the parent's wait loop. The child resumes at the instruction
/// after fork() with every register matching the parent's state at the
/// moment of the SYSCALL, except RAX which is 0 (fork child return).
///
/// # Safety
/// Same preconditions as `enter_usermode_returnable`.
/// `regs` must point to a valid `ForkChildRegs` that remains accessible
/// after the CR3 switch (kernel mapping must cover it).
#[unsafe(naked)]
pub unsafe extern "C" fn enter_forked_child_returnable(
    _regs: *const ForkChildRegs, // rdi
    _process_cr3: u64,           // rsi
    _kernel_rsp_ptr: u64,        // rdx
) {
    core::arch::naked_asm!(
        // ---- save boot context (same layout as enter_usermode_returnable) ----
        // Push order mirrors the pops in boot_return_to_kernel.
        "push rbp",
        "push rbx",
        "push r12",
        "push r13",
        "push r14",
        "push r15",
        "sub rsp, 8", // alignment padding

        // Set stack canary
        "mov rax, {canary_magic}",
        "lea r12, [rip + {boot_canary}]",
        "mov [r12], rax",

        // Save boot CR3
        "mov rax, cr3",
        "lea r12, [rip + {boot_cr3}]",
        "mov [r12], rax",

        // Save boot RSP
        "lea r12, [rip + {boot_rsp}]",
        "mov [r12], rsp",

        // Stash regs pointer in r15 (already saved on boot stack)
        "mov r15, rdi",

        // Update per-CPU kernel_rsp
        "mov [rdx], rsp",

        // Switch to child's page tables
        "mov cr3, rsi",

        // Set segment registers for user mode.
        // NOTE(review): selectors 0x2B (user data) and 0x33 (user code) are
        // hard-coded here, unlike enter_usermode_returnable which takes them
        // as arguments — assumes the fixed GDT layout; confirm against gdt.rs.
        "mov eax, 0x2B",
        "mov ds, ax",
        "mov es, ax",
        "xor eax, eax",
        "mov fs, ax",
        "mov gs, ax",

        // ---- build iretq frame from ForkChildRegs ----
        // Offsets 0/8/16 = rip/rsp/rflags (see ForkChildRegs layout).
        "push 0x2B",                 // SS (user data segment)
        "push qword ptr [r15 + 8]",  // RSP (user stack)
        "push qword ptr [r15 + 16]", // RFLAGS
        "push 0x33",                 // CS (user code segment)
        "push qword ptr [r15 + 0]",  // RIP (entry point)

        // ---- restore ALL GPRs from struct (r15 = pointer, loaded last) ----
        "mov rax, [r15 + 24]",
        "mov rbx, [r15 + 32]",
        "mov rcx, [r15 + 40]",
        "mov rdx, [r15 + 48]",
        "mov rsi, [r15 + 56]",
        "mov rdi, [r15 + 64]",
        "mov rbp, [r15 + 72]",
        "mov r8,  [r15 + 80]",
        "mov r9,  [r15 + 88]",
        "mov r10, [r15 + 96]",
        "mov r11, [r15 + 104]",
        "mov r12, [r15 + 112]",
        "mov r13, [r15 + 120]",
        "mov r14, [r15 + 128]",
        "mov r15, [r15 + 136]", // pointer gone — must be last

        "iretq",

        boot_cr3 = sym BOOT_RETURN_CR3,
        boot_rsp = sym BOOT_RETURN_RSP,
        boot_canary = sym BOOT_STACK_CANARY,
        canary_magic = const BOOT_CANARY_MAGIC,
    );
}
341
/// Restore the boot context saved by `enter_usermode_returnable` and return
/// to the bootstrap code.
///
/// Called from `sys_exit` after cleaning up the exiting process. This function:
/// 1. Restores the boot CR3 (switching back to boot page tables)
/// 2. Restores kernel segment registers (DS, ES, FS, GS cleared)
/// 3. Does `swapgs` to balance the swapgs from `syscall_entry`
/// 4. Restores RSP to the saved value (past the callee-saved pushes)
/// 5. Pops callee-saved registers and returns to the caller of
///    `enter_usermode_returnable`
///
/// # Safety
/// - Must only be called when `BOOT_RETURN_RSP` and `BOOT_RETURN_CR3` are valid
/// - Must be called from kernel mode on the kernel stack set by syscall_entry
/// - The saved boot stack frame must still be intact
///
/// # Implementation Notes
/// - `#[inline(never)]` prevents aggressive optimization that could corrupt the
///   stack frame restoration in release builds
/// - `compiler_fence` ensures loads complete before subsequent operations
/// - `black_box` prevents constant propagation and reordering of critical
///   values
#[inline(never)]
pub unsafe fn boot_return_to_kernel() -> ! {
    // RAW SERIAL DIAGNOSTIC: Trace boot return entry
    crate::arch::x86_64::idt::raw_serial_str(b"[BOOT_RETURN ENTRY]\n");

    // FIX 2 & 6: Use black_box to force compiler to treat values as opaque,
    // preventing optimization assumptions. Follow with compiler fence to
    // prevent instruction reordering across this boundary.
    //
    // CRITICAL FIX: The release optimizer was reusing RAX after `xor eax,eax`
    // (used to zero FS/GS) to load RSP, which set RSP=0 and caused a double
    // fault. We now use inline assembly with explicit register constraints
    // to force RSP into a register that won't be clobbered, and keep CR3
    // separate. The asm! block below uses `inout` constraints to prevent
    // the compiler from reusing these registers.
    let rsp: u64;
    let cr3: u64;
    let canary: u64;

    // Load values with explicit register assignments to prevent optimization
    asm!(
        "mov {rsp}, [{rsp_addr}]",
        "mov {cr3}, [{cr3_addr}]",
        "mov {canary}, [{canary_addr}]",
        rsp = out(reg) rsp,
        cr3 = out(reg) cr3,
        canary = out(reg) canary,
        rsp_addr = in(reg) &BOOT_RETURN_RSP,
        cr3_addr = in(reg) &BOOT_RETURN_CR3,
        canary_addr = in(reg) &BOOT_STACK_CANARY,
        options(nostack, preserves_flags)
    );

    // Apply black_box to prevent further optimization
    let rsp = core::hint::black_box(rsp);
    let cr3 = core::hint::black_box(cr3);
    let canary = core::hint::black_box(canary);
    core::sync::atomic::compiler_fence(Ordering::SeqCst);

    // FIX 3: Validate stack canary before restoring context
    // If the canary doesn't match, the boot stack has been corrupted
    if canary != BOOT_CANARY_MAGIC {
        crate::arch::x86_64::idt::raw_serial_str(b"[BOOT_RETURN] FATAL: Stack canary mismatch!\n");
        crate::arch::x86_64::idt::raw_serial_str(b"Expected: 0x");
        crate::arch::x86_64::idt::raw_serial_hex(BOOT_CANARY_MAGIC);
        crate::arch::x86_64::idt::raw_serial_str(b"\nGot: 0x");
        crate::arch::x86_64::idt::raw_serial_hex(canary);
        crate::arch::x86_64::idt::raw_serial_str(b"\n");
        panic!("Stack canary mismatch - boot context corrupted");
    }

    // NOTE: Cannot use println! here - would access locks/memory with wrong CR3
    // crate::println!("[BOOT-RETURN] RSP={:#x} CR3={:#x}", rsp, cr3);

    // Clear the boot return context (one-shot)
    BOOT_RETURN_RSP.store(0, Ordering::SeqCst);
    BOOT_RETURN_CR3.store(0, Ordering::SeqCst);
    BOOT_STACK_CANARY.store(0, Ordering::SeqCst);

    // SAFETY: cr3 is the boot page table address saved before entering user
    // mode. rsp points to the stack with 8 bytes of alignment padding and
    // 6 callee-saved registers, with the return address below them. We
    // restore kernel segment registers and
    // balance the swapgs from syscall_entry. The swapgs must come BEFORE
    // clearing GS so we don't corrupt KERNEL_GS_BASE. After restoring RSP
    // and popping registers, ret returns to the caller of
    // enter_usermode_returnable.
    //
    // CRITICAL FIX FOR OPT-LEVEL S/Z/3: The optimizer was allocating RSP
    // to RAX, which then got clobbered by `xor eax,eax` used for zeroing
    // FS/GS. We now explicitly allocate RSP to RCX and CR3 to RDX, both
    // of which are preserved across the segment register operations. This
    // is the ONLY way to prevent the optimizer from reusing RAX.
    //
    // The pop order below is the exact reverse of the push order in
    // enter_usermode_returnable / enter_forked_child_returnable.
    asm!(
        "mov cr3, rdx",  // Restore boot page tables (CR3 in RDX)
        "swapgs",        // Balance syscall_entry's swapgs (before touching GS!)
        "mov ax, 0x10",  // Kernel data segment (GDT index 2, RPL 0)
        "mov ds, ax",    // Restore kernel DS
        "mov es, ax",    // Restore kernel ES
        "xor eax, eax",  // Zero FS and GS (clobbers RAX but NOT RCX/RDX!)
        "mov fs, ax",
        "mov gs, ax",
        "mov rsp, rcx",  // Restore saved boot RSP (RSP in RCX, safe!)
        "add rsp, 8",    // Skip alignment padding from enter_usermode_returnable
        "pop r15",
        "pop r14",
        "pop r13",
        "pop r12",
        "pop rbx",
        "pop rbp",
        "ret",           // Return to caller of enter_usermode_returnable
        in("rcx") rsp,   // RSP MUST be in RCX (preserved across xor eax,eax)
        in("rdx") cr3,   // CR3 MUST be in RDX (preserved across xor eax,eax)
        options(noreturn)
    );
}
460
461/// Check whether a boot return context is available.
462///
463/// Returns `true` if `enter_usermode_returnable` has saved a boot context
464/// that `boot_return_to_kernel` can restore.
465pub fn has_boot_return_context() -> bool {
466 BOOT_RETURN_RSP.load(Ordering::SeqCst) != 0
467}
468
/// Physical memory offset provided by the bootloader.
///
/// All physical memory is mapped at virtual address `phys_addr + PHYS_OFFSET`.
/// Initialized during `try_enter_usermode()` from BOOT_INFO.
/// A value of 0 means "not yet initialized" (see `phys_to_virt`).
static PHYS_OFFSET: core::sync::atomic::AtomicU64 = core::sync::atomic::AtomicU64::new(0);
474
475/// Get the physical memory offset, or 0 if not yet initialized.
476///
477/// Used by kernel subsystems that need to convert physical addresses to
478/// virtual addresses after the initial user-mode setup.
479#[allow(dead_code)] // Helper for phys_to_virt below
480fn phys_offset() -> u64 {
481 PHYS_OFFSET.load(core::sync::atomic::Ordering::Relaxed)
482}
483
484/// Convert a physical address to a virtual address via the bootloader's
485/// physical memory mapping.
486///
487/// Returns `None` if the physical memory offset has not been initialized.
488/// Used by kernel subsystems that need to access physical memory after
489/// the initial user-mode setup.
490#[allow(dead_code)] // Physical-to-virtual conversion for page table manipulation
491fn phys_to_virt(phys: u64) -> Option<u64> {
492 let offset = phys_offset();
493 if offset == 0 {
494 return None;
495 }
496 Some(phys + offset)
497}
498
/// Page table entry flags for x86_64 4-level paging.
const PTE_PRESENT: u64 = 1 << 0; // Entry is valid / mapped
const PTE_WRITABLE: u64 = 1 << 1; // Writes allowed through this entry
const PTE_USER: u64 = 1 << 2; // Ring 3 may access through this entry
503
/// Extract the physical address of the next-level page table from a PTE.
///
/// The physical address is stored in bits 12..51 of the entry; all flag
/// bits (low 12) and the high NX/ignored bits are stripped.
fn pte_phys_addr(entry: u64) -> u64 {
    // Isolate the 40-bit frame number (bits 12..=51), then shift it back
    // into place — equivalent to masking with 0x000F_FFFF_FFFF_F000.
    const FRAME_BITS: u64 = (1 << 40) - 1;
    ((entry >> 12) & FRAME_BITS) << 12
}
510
/// Map a single 4KiB page in the current page tables with USER access.
///
/// Walks the 4-level page table hierarchy (PML4 -> PDPT -> PD -> PT),
/// allocating intermediate tables as needed from the frame allocator.
/// The leaf entry maps `virt_addr` to `phys_frame_addr` with the given flags.
///
/// # Safety
/// - `phys_offset_val` must be the correct bootloader physical memory offset
/// - `virt_addr` must be page-aligned (4KiB)
/// - `phys_frame_addr` must be a valid, page-aligned physical address
/// - The caller must ensure no conflicting mapping exists
/// - NOTE(review): the walk does not check the PS bit (huge pages) on
///   intermediate entries — if `virt_addr` falls inside an existing 1GiB/2MiB
///   mapping, the huge-page entry would be misread as a table pointer.
///   Confirm the target range is never huge-mapped by the bootloader.
unsafe fn map_user_page(
    phys_offset_val: u64,
    virt_addr: u64,
    phys_frame_addr: u64,
    flags: u64,
) -> Result<(), crate::error::KernelError> {
    // Read current CR3 to get the PML4 physical address
    let cr3: u64;
    // SAFETY: Reading CR3 is always valid in kernel mode.
    asm!("mov {}, cr3", out(reg) cr3);
    let pml4_phys = cr3 & 0x000F_FFFF_FFFF_F000;

    // Extract page table indices from the virtual address
    // (9 bits per level, from bit 39 down to bit 12).
    let pml4_idx = ((virt_addr >> 39) & 0x1FF) as usize;
    let pdpt_idx = ((virt_addr >> 30) & 0x1FF) as usize;
    let pd_idx = ((virt_addr >> 21) & 0x1FF) as usize;
    let pt_idx = ((virt_addr >> 12) & 0x1FF) as usize;

    // Walk PML4 -> PDPT
    let pml4_virt = (pml4_phys + phys_offset_val) as *mut u64;
    let pml4_entry = pml4_virt.add(pml4_idx);
    let pdpt_phys = ensure_table_present(pml4_entry, phys_offset_val)?;

    // Walk PDPT -> PD
    let pdpt_virt = (pdpt_phys + phys_offset_val) as *mut u64;
    let pdpt_entry = pdpt_virt.add(pdpt_idx);
    let pd_phys = ensure_table_present(pdpt_entry, phys_offset_val)?;

    // Walk PD -> PT
    let pd_virt = (pd_phys + phys_offset_val) as *mut u64;
    let pd_entry = pd_virt.add(pd_idx);
    let pt_phys = ensure_table_present(pd_entry, phys_offset_val)?;

    // Set the leaf PT entry
    let pt_virt = (pt_phys + phys_offset_val) as *mut u64;
    let pt_entry = pt_virt.add(pt_idx);
    // SAFETY: pt_entry points into a valid page table mapped via the physical
    // memory offset. We write the leaf mapping: physical frame + flags.
    pt_entry.write_volatile(phys_frame_addr | flags);

    // Flush TLB for this address
    // SAFETY: invlpg invalidates the TLB entry for virt_addr. No side effects.
    asm!("invlpg [{}]", in(reg) virt_addr);

    Ok(())
}
568
/// Ensure a page table entry at `entry_ptr` is present. If not, allocate
/// a new zeroed frame for the next-level table and write the entry.
///
/// Returns the physical address of the next-level table.
///
/// # Safety
/// - `entry_ptr` must point to a valid page table entry in mapped memory
/// - `phys_offset_val` must be the correct physical memory offset
/// - NOTE(review): on the present path this ORs USER|WRITABLE into the
///   existing entry. If the intermediate table is shared with kernel-only
///   mappings, this widens access on that whole subtree — acceptable only
///   because leaf entries still gate final permissions; confirm this is
///   intended for the bootstrap page tables.
unsafe fn ensure_table_present(
    entry_ptr: *mut u64,
    phys_offset_val: u64,
) -> Result<u64, crate::error::KernelError> {
    // SAFETY: entry_ptr was computed from a valid page table base + index,
    // both within the physical memory mapping provided by the bootloader.
    let entry = entry_ptr.read_volatile();

    if (entry & PTE_PRESENT) != 0 {
        // Table already exists. Ensure USER bit is set on intermediate entries
        // so user-mode accesses can traverse the hierarchy.
        let updated = entry | PTE_USER | PTE_WRITABLE;
        if updated != entry {
            // SAFETY: Updating flags on an existing present entry is safe.
            // We only add USER and WRITABLE bits to intermediate tables.
            entry_ptr.write_volatile(updated);
        }
        Ok(pte_phys_addr(entry))
    } else {
        // Allocate a new frame for the next-level table
        let frame = crate::mm::FRAME_ALLOCATOR
            .lock()
            .allocate_frames(1, None)
            .map_err(|_| crate::error::KernelError::ResourceExhausted {
                resource: "physical frames",
            })?;
        let frame_phys = frame.as_u64() * crate::mm::FRAME_SIZE as u64;

        // Zero the new table so every entry starts non-present.
        let frame_virt = (frame_phys + phys_offset_val) as *mut u8;
        // SAFETY: frame_virt points to a freshly allocated 4KiB frame mapped
        // via the physical memory offset. write_bytes zeroes the entire page.
        core::ptr::write_bytes(frame_virt, 0, 4096);

        // Write the entry: physical address + PRESENT + WRITABLE + USER
        let new_entry = frame_phys | PTE_PRESENT | PTE_WRITABLE | PTE_USER;
        // SAFETY: entry_ptr points to a valid PTE slot. Writing a new entry
        // that points to our freshly zeroed frame is safe.
        entry_ptr.write_volatile(new_entry);

        Ok(frame_phys)
    }
}
620
/// Check if a physical address is used by the active page table hierarchy.
///
/// Walks PML4 -> PDPT -> PD -> PT and returns true if `phys` matches any
/// page-table frame's base address. This is O(n) in the number of page table
/// pages (~1000 for a typical bootloader mapping).
///
/// # Safety
/// - `phys_offset` must be the bootloader's physical memory offset
/// - `pml4_phys` must be a valid PML4 physical address (from CR3)
unsafe fn is_page_table_frame(phys_offset: u64, pml4_phys: u64, phys: u64) -> bool {
    // The root itself counts as a page-table frame.
    if phys == pml4_phys {
        return true;
    }

    let pml4_virt = (pml4_phys + phys_offset) as *const u64;
    for i in 0..512 {
        // SAFETY: pml4_virt + i is within the PML4 page, mapped via phys_offset.
        let pml4_entry = pml4_virt.add(i).read_volatile();
        if (pml4_entry & PTE_PRESENT) == 0 {
            continue;
        }
        let pdpt_phys = pte_phys_addr(pml4_entry);
        if phys == pdpt_phys {
            return true;
        }

        let pdpt_virt = (pdpt_phys + phys_offset) as *const u64;
        for j in 0..512 {
            // SAFETY: pdpt_virt + j is within the PDPT page.
            let pdpt_entry = pdpt_virt.add(j).read_volatile();
            if (pdpt_entry & PTE_PRESENT) == 0 {
                continue;
            }
            // Bit 7 (PS) at PDPT level marks a huge page: no PD below it.
            if (pdpt_entry & (1 << 7)) != 0 {
                continue; // 1GiB huge page
            }
            let pd_phys = pte_phys_addr(pdpt_entry);
            if phys == pd_phys {
                return true;
            }

            let pd_virt = (pd_phys + phys_offset) as *const u64;
            for k in 0..512 {
                // SAFETY: pd_virt + k is within the PD page.
                let pd_entry = pd_virt.add(k).read_volatile();
                if (pd_entry & PTE_PRESENT) == 0 {
                    continue;
                }
                // Bit 7 (PS) at PD level marks a 2MiB page: no PT below it.
                if (pd_entry & (1 << 7)) != 0 {
                    continue; // 2MiB huge page
                }
                let pt_phys = pte_phys_addr(pd_entry);
                if phys == pt_phys {
                    return true;
                }
            }
        }
    }

    false
}
682
/// Allocate a physical frame that does not overlap with any active page table
/// page. Frames that are page table pages are allocated (to consume them from
/// the free pool) but not returned.
///
/// NOTE(review): consumed overlapping frames are leaked permanently — they
/// are never freed back to the allocator. Acceptable during one-shot
/// bootstrap; would be a leak if called repeatedly at runtime.
///
/// # Safety
/// - `phys_offset` and `pml4_phys` must be valid (see `is_page_table_frame`)
unsafe fn allocate_safe_frame(
    phys_offset: u64,
    pml4_phys: u64,
    count: usize,
) -> Result<crate::mm::FrameNumber, crate::error::KernelError> {
    use crate::mm::{FRAME_ALLOCATOR, FRAME_SIZE};

    // Try up to 8192 times (enough to skip the ~1050 page table frames)
    for _ in 0..8192 {
        let frame = FRAME_ALLOCATOR
            .lock()
            .allocate_frames(count, None)
            .map_err(|_| crate::error::KernelError::ResourceExhausted {
                resource: "physical frames",
            })?;
        let phys = frame.as_u64() * FRAME_SIZE as u64;

        // Check all allocated frames in the range
        let mut overlaps = false;
        for f in 0..count as u64 {
            if is_page_table_frame(phys_offset, pml4_phys, phys + f * FRAME_SIZE as u64) {
                overlaps = true;
                break;
            }
        }

        if !overlaps {
            return Ok(frame);
        }
        // Frame overlaps a page table page -- leave it allocated (consumed)
        // so the allocator won't return it again, and try the next one.
    }

    Err(crate::error::KernelError::ResourceExhausted {
        resource: "non-page-table frames",
    })
}
726
/// Attempt to enter user mode with the embedded init binary.
///
/// This function:
/// 1. Retrieves the physical memory offset from BOOT_INFO
/// 2. Allocates physical frames for user code and stack
/// 3. Maps them at user-accessible virtual addresses in the current page tables
/// 4. Copies the embedded INIT_CODE machine code to the code page
/// 5. Sets up the per-CPU kernel_rsp for syscall/interrupt return
/// 6. Transitions to Ring 3 via iretq
///
/// On success, this function does not return (enters user mode).
/// On failure, returns a KernelError for the caller to log.
pub fn try_enter_usermode() -> Result<(), crate::error::KernelError> {
    use crate::{mm::FRAME_SIZE, userspace::embedded};

    // Step 1: Get the physical memory offset from BOOT_INFO
    // SAFETY: BOOT_INFO is a static mut written once during early boot
    // (in main.rs) and only read afterwards. At this point we are in
    // single-threaded Stage 6 bootstrap, so no data race is possible.
    // We use addr_of! to avoid creating a direct reference to the static mut.
    let phys_offset_val = unsafe {
        let boot_info_ptr = core::ptr::addr_of!(crate::arch::x86_64::boot::BOOT_INFO);
        let boot_info =
            (*boot_info_ptr)
                .as_ref()
                .ok_or(crate::error::KernelError::NotInitialized {
                    subsystem: "BOOT_INFO",
                })?;
        boot_info.physical_memory_offset.into_option().ok_or(
            crate::error::KernelError::NotInitialized {
                subsystem: "physical memory offset",
            },
        )?
    };

    // Publish the offset for later use by phys_offset()/phys_to_virt().
    PHYS_OFFSET.store(phys_offset_val, core::sync::atomic::Ordering::Relaxed);

    // Step 1b: Read CR3 to identify page table frames that must not be
    // allocated for user-space use (the bootloader doesn't mark them as
    // reserved in the memory map).
    let cr3_val: u64;
    // SAFETY: Reading CR3 is always valid in kernel mode.
    unsafe {
        asm!("mov {}, cr3", out(reg) cr3_val);
    }
    let pml4_phys = cr3_val & 0x000F_FFFF_FFFF_F000;

    // Step 2: Get the embedded init code
    let init_code = embedded::init_code_bytes();

    // Step 3: Allocate physical frames, skipping any that are page table pages.
    // One frame for code (mapped at 0x400000)
    // One frame for stack (mapped at 0x7FFFF000, stack grows down from 0x80000000)
    // SAFETY: phys_offset_val and pml4_phys are valid (verified above).
    let code_frame = unsafe { allocate_safe_frame(phys_offset_val, pml4_phys, 1)? };
    let code_phys = code_frame.as_u64() * FRAME_SIZE as u64;

    let stack_frame = unsafe { allocate_safe_frame(phys_offset_val, pml4_phys, 1)? };
    let stack_phys = stack_frame.as_u64() * FRAME_SIZE as u64;

    // Step 4: Map pages in the current page tables
    // Code page at 0x400000 (PRESENT + WRITABLE + USER, executable)
    let code_vaddr: u64 = 0x40_0000;
    let stack_vaddr: u64 = 0x7FFF_F000;

    // SAFETY: We have verified that phys_offset_val is the correct bootloader
    // mapping offset. The virtual addresses are in the user-space range (below
    // 0x0000_8000_0000_0000) and do not conflict with kernel mappings. The
    // physical frames were just allocated and are valid.
    unsafe {
        map_user_page(
            phys_offset_val,
            code_vaddr,
            code_phys,
            PTE_PRESENT | PTE_WRITABLE | PTE_USER,
        )?;

        map_user_page(
            phys_offset_val,
            stack_vaddr,
            stack_phys,
            PTE_PRESENT | PTE_WRITABLE | PTE_USER,
        )?;
    }

    // Step 5: Copy init code to the code page
    // Access the code frame through the physical memory mapping
    let code_virt_via_phys = phys_offset_val + code_phys;
    // SAFETY: code_virt_via_phys points to a freshly allocated, zeroed frame
    // accessible through the bootloader's physical memory mapping. We copy
    // init_code.len() bytes (< 4096) into the frame.
    // NOTE(review): assumes init_code.len() <= 4096 — confirm the embedded
    // binary is bounded; a larger blob would overrun the single code frame.
    unsafe {
        let dest = code_virt_via_phys as *mut u8;
        core::ptr::copy_nonoverlapping(init_code.as_ptr(), dest, init_code.len());
    }

    // Step 6: Set up per-CPU kernel_rsp
    // Allocate a dedicated kernel stack for syscall/interrupt return
    // SAFETY: phys_offset_val and pml4_phys are valid.
    let kernel_stack_frame = unsafe { allocate_safe_frame(phys_offset_val, pml4_phys, 4)? };
    let kernel_stack_phys = kernel_stack_frame.as_u64() * FRAME_SIZE as u64;
    let kernel_stack_top = phys_offset_val + kernel_stack_phys + (4 * FRAME_SIZE as u64);

    // Write kernel_rsp to per-CPU data so syscall_entry can find it
    let per_cpu = crate::arch::x86_64::syscall::per_cpu_data_ptr();
    // SAFETY: per_cpu_data_ptr() returns a valid pointer to the static
    // PerCpuData. We are in single-threaded bootstrap context. Setting
    // kernel_rsp before entering user mode is required for syscall_entry
    // to have a valid kernel stack.
    unsafe {
        (*per_cpu).kernel_rsp = kernel_stack_top;
    }

    // Step 7: Enter user mode
    // User entry point = start of code at 0x400000
    // User stack pointer = top of stack page at 0x80000000 (grows down from top of
    // page)
    let user_entry = code_vaddr;
    let user_stack = stack_vaddr + FRAME_SIZE as u64; // Top of the stack page
    let user_cs: u64 = 0x33; // User code segment (GDT index 6, RPL 3)
    let user_ss: u64 = 0x2B; // User data segment (GDT index 5, RPL 3)

    crate::println!(
        "[USERMODE] Entering Ring 3: entry={:#x} stack={:#x}",
        user_entry,
        user_stack,
    );

    // SAFETY: All preconditions for enter_usermode are met:
    // - entry_point (0x400000) has executable code mapped with USER access
    // - user_stack (0x80000000) points to the top of a mapped user stack page
    // - CS/SS are valid Ring 3 selectors from the GDT
    // - CR3 contains page tables with USER-accessible mappings
    // - Per-CPU kernel_rsp is set for syscall/interrupt return
    unsafe {
        enter_usermode(user_entry, user_stack, user_cs, user_ss);
    }
}