// veridian_kernel/arch/x86_64/kpti.rs
1//! Kernel Page Table Isolation (KPTI) for x86_64
2//!
3//! Mitigates Meltdown (CVE-2017-5754) by maintaining separate page table
4//! hierarchies for user mode and kernel mode. When running in user mode,
5//! the shadow page table contains only the minimal kernel mappings needed
6//! for the syscall/interrupt trampoline. On kernel entry, CR3 is switched
7//! to the full kernel page table.
8//!
9//! ## Design
10//!
11//! - **Kernel page table**: The full L4 table with both user (L4[0..255]) and
12//! kernel (L4[256..511]) entries.
13//! - **Shadow page table**: A separate L4 with user entries copied from the
14//! kernel table, but only a single trampoline mapping in the kernel half
15//! (L4[511]) that maps the syscall entry/exit code.
16//! - **CR3 switching**: `switch_to_user()` loads the shadow CR3 before
17//! returning to Ring 3; `switch_to_kernel()` restores the full CR3 on entry
18//! to Ring 0.
19
20#![allow(dead_code)]
21
22use core::sync::atomic::{AtomicU64, Ordering};
23
24use spin::Mutex;
25
26use crate::mm::{phys_to_virt_addr, PhysicalAddress, FRAME_ALLOCATOR};
27
28// ===========================================================================
29// Constants
30// ===========================================================================
31
/// Virtual address of the syscall trampoline page.
/// Lies near the top of the canonical address space, inside the 512GB
/// region covered by L4[511] (0xFFFF_FF80_0000_0000 and up).
const TRAMPOLINE_VADDR: u64 = 0xFFFF_FFFF_FFFF_0000;

/// L4 index that separates user-space from kernel-space.
/// Entries 0..255 are user, 256..511 are kernel.
const USER_KERNEL_SPLIT: usize = 256;

/// Number of L4 entries (a 4KB table holds 512 eight-byte entries).
const L4_ENTRY_COUNT: usize = 512;

// Page table entry flags (raw x86_64 PTE bits)
const PTE_PRESENT: u64 = 1 << 0; // Entry maps a valid translation.
const PTE_WRITABLE: u64 = 1 << 1; // Writes permitted through this entry.
const PTE_USER: u64 = 1 << 2; // Accessible from CPL3 (user mode).
const PTE_NO_EXECUTE: u64 = 1 << 63; // Forbids instruction fetch (needs EFER.NXE).
48
49// ===========================================================================
50// KPTI State
51// ===========================================================================
52
/// Per-process KPTI page table pair.
///
/// Both fields are raw physical addresses suitable for loading into CR3.
#[derive(Debug)]
pub struct KptiPageTables {
    /// Physical address of the full kernel L4 table (loaded on kernel entry).
    pub kernel_cr3: u64,
    /// Physical address of the shadow (user-mode) L4 table: user entries
    /// copied from the kernel table plus only the trampoline entry in the
    /// kernel half (loaded before returning to Ring 3).
    pub shadow_cr3: u64,
}
61
/// Global KPTI state: the current page table pair.
struct KptiState {
    /// The kernel/shadow CR3 pair installed by `init()`.
    tables: KptiPageTables,
    /// Set to `true` by `init()` on success; checked before any CR3 switch
    /// or validation that reads `tables`.
    initialized: bool,
}

/// Global KPTI state; `None` until `init()` succeeds.
static KPTI_STATE: Mutex<Option<KptiState>> = Mutex::new(None);

/// Shadow CR3 for fast access without locking (set during init).
/// Zero is the "not initialized" sentinel (see `is_active`/`switch_to_user`).
static SHADOW_CR3: AtomicU64 = AtomicU64::new(0);
72
73// ===========================================================================
74// Initialization
75// ===========================================================================
76
77/// Initialize KPTI with shadow page tables derived from the current CR3.
78///
79/// Must be called after the kernel page tables are fully set up.
80pub fn init() {
81 let kernel_cr3 = super::mmu::read_cr3().as_u64();
82
83 match create_shadow_tables(kernel_cr3) {
84 Ok(shadow_cr3) => {
85 SHADOW_CR3.store(shadow_cr3, Ordering::Release);
86 *KPTI_STATE.lock() = Some(KptiState {
87 tables: KptiPageTables {
88 kernel_cr3,
89 shadow_cr3,
90 },
91 initialized: true,
92 });
93 crate::println!(
94 "[KPTI] Initialized: kernel CR3=0x{:x}, shadow CR3=0x{:x}",
95 kernel_cr3,
96 shadow_cr3
97 );
98 }
99 Err(e) => {
100 crate::println!(
101 "[KPTI] Initialization failed: {:?} -- running without KPTI",
102 e
103 );
104 }
105 }
106}
107
108/// Create shadow page tables from the kernel's L4 table.
109///
110/// Allocates a new L4 frame and:
111/// 1. Copies all user-space entries (L4[0..255]) from the kernel table.
112/// 2. Leaves kernel-space entries (L4[256..510]) empty (unmapped).
113/// 3. Maps a single trampoline page at L4[511] for syscall transitions.
114///
115/// Returns the physical address of the shadow L4 table.
116pub fn create_shadow_tables(kernel_cr3: u64) -> Result<u64, crate::error::KernelError> {
117 // Allocate a frame for the shadow L4 table
118 let shadow_frame = FRAME_ALLOCATOR
119 .lock()
120 .allocate_frames(1, None)
121 .map_err(|_| crate::error::KernelError::OutOfMemory {
122 requested: 4096,
123 available: 0,
124 })?;
125
126 let shadow_phys = shadow_frame.as_u64() * 4096;
127 let shadow_virt = phys_to_virt_addr(shadow_phys) as *mut u64;
128
129 // Zero the entire shadow L4 table
130 // SAFETY: shadow_virt points to a freshly allocated 4KB frame in the
131 // kernel physical memory window. We have exclusive access.
132 unsafe {
133 core::ptr::write_bytes(shadow_virt, 0, 512);
134 }
135
136 // Read the kernel L4 table
137 let kernel_l4_virt = phys_to_virt_addr(kernel_cr3) as *const u64;
138
139 // Copy user-space entries (L4[0..255])
140 // SAFETY: Both pointers are within the kernel physical memory window,
141 // referencing valid L4 page table frames.
142 unsafe {
143 for i in 0..USER_KERNEL_SPLIT {
144 let entry = core::ptr::read_volatile(kernel_l4_virt.add(i));
145 core::ptr::write_volatile(shadow_virt.add(i), entry);
146 }
147 }
148
149 // Map the trampoline page at L4[511]
150 // This provides the minimal kernel mapping needed for syscall entry/exit.
151 map_trampoline_in_l4(shadow_virt, kernel_l4_virt)?;
152
153 Ok(shadow_phys)
154}
155
/// Map the trampoline entry in L4[511] of the shadow table.
///
/// Copies only the L4[511] entry from the kernel table, which covers
/// the top 512GB of virtual memory including the trampoline page.
/// In a production implementation, this would create a minimal L3/L2/L1
/// chain mapping only the trampoline code page.
///
/// # Errors
///
/// Returns `KernelError::OutOfMemory` if L4[511] is absent in the kernel
/// table and a fresh L3 frame cannot be allocated.
fn map_trampoline_in_l4(
    shadow_l4: *mut u64,
    kernel_l4: *const u64,
) -> Result<(), crate::error::KernelError> {
    // Copy L4[511] from the kernel table.
    // This gives the shadow table access to the same L3 subtree as the
    // kernel for the top 512GB, which includes the trampoline address.
    //
    // For tighter isolation, a dedicated L3->L2->L1 chain mapping only
    // the trampoline page should be used (deferred to Phase 7.5).
    // SAFETY: Both pointers reference valid L4 page table frames within
    // the kernel physical memory window.
    unsafe {
        let kernel_entry = core::ptr::read_volatile(kernel_l4.add(511));
        if kernel_entry & PTE_PRESENT != 0 {
            // Keep the entry but mark it user-accessible for the trampoline
            //
            // NOTE(review): the trampoline executes at CPL0 after the
            // transition, so the USER bit on this L4 entry may not be needed
            // at all; whether user mode can actually reach memory under this
            // subtree still depends on USER bits at the lower levels, which
            // this file does not show. Confirm that only the trampoline page
            // is user-reachable down the chain before shipping this.
            let trampoline_entry = kernel_entry | PTE_USER;
            core::ptr::write_volatile(shadow_l4.add(511), trampoline_entry);
        } else {
            // L4[511] is not mapped in the kernel -- create a new entry
            let frame = FRAME_ALLOCATOR
                .lock()
                .allocate_frames(1, None)
                .map_err(|_| crate::error::KernelError::OutOfMemory {
                    requested: 4096,
                    available: 0,
                })?;
            // Frame number -> physical byte address (4KB frames).
            let frame_phys = frame.as_u64() * 4096;

            // Zero the L3 table
            let l3_virt = phys_to_virt_addr(frame_phys) as *mut u8;
            core::ptr::write_bytes(l3_virt, 0, 4096);

            // Create L4[511] entry pointing to the new L3
            // NOTE(review): this L3 is left entirely empty, so the branch
            // installs a present-but-unpopulated subtree -- the trampoline
            // page itself is NOT mapped here. Presumably a later phase fills
            // in the L3->L2->L1 chain; confirm before relying on this path.
            let entry = frame_phys | PTE_PRESENT | PTE_WRITABLE | PTE_USER;
            core::ptr::write_volatile(shadow_l4.add(511), entry);
        }
    }

    Ok(())
}
203
204// ===========================================================================
205// CR3 Switching
206// ===========================================================================
207
208/// Switch to the shadow (user-mode) page table.
209///
210/// Called just before returning to Ring 3 (e.g., after syscall completion
211/// or interrupt return). Loads the shadow CR3 which lacks kernel mappings.
212#[inline(always)]
213pub fn switch_to_user() {
214 let shadow = SHADOW_CR3.load(Ordering::Acquire);
215 if shadow != 0 {
216 let cr3_val = PhysicalAddress::new(shadow);
217 super::mmu::write_cr3(cr3_val);
218 }
219}
220
221/// Switch to the full kernel page table.
222///
223/// Called on kernel entry (syscall, interrupt, exception). Restores the
224/// full CR3 so the kernel has access to all its mappings.
225#[inline(always)]
226pub fn switch_to_kernel() {
227 let guard = KPTI_STATE.lock();
228 if let Some(state) = guard.as_ref() {
229 if state.initialized {
230 let cr3_val = PhysicalAddress::new(state.tables.kernel_cr3);
231 super::mmu::write_cr3(cr3_val);
232 }
233 }
234}
235
236// ===========================================================================
237// Syscall Hooks
238// ===========================================================================
239
/// Called at the start of every syscall handler.
///
/// Currently a no-op because the syscall entry assembly switches CR3
/// before reaching Rust code. This hook exists for future use (e.g.,
/// per-CPU KPTI state tracking, telemetry).
#[inline(always)]
pub fn on_syscall_entry() {
    // CR3 switch is handled in assembly (syscall_entry) for performance.
    // This Rust-level hook is reserved for bookkeeping/diagnostics.
    // NOTE(review): the shadow->kernel CR3 swap in `syscall_entry` is not
    // visible from this file -- confirm the assembly actually performs it.
}
250
/// Called at the end of every syscall handler, just before SYSRET.
///
/// Currently a no-op; the SYSRET path in assembly handles CR3 restore.
#[inline(always)]
pub fn on_syscall_exit() {
    // CR3 switch back to shadow is handled in assembly (syscall_return).
    // NOTE(review): the kernel->shadow CR3 swap in `syscall_return` is not
    // visible from this file -- confirm the assembly actually performs it.
}
258
259// ===========================================================================
260// Query / Diagnostics
261// ===========================================================================
262
263/// Check whether KPTI is initialized and active.
264pub fn is_active() -> bool {
265 SHADOW_CR3.load(Ordering::Acquire) != 0
266}
267
268/// Get the current KPTI page table pair (for diagnostics).
269pub fn get_page_tables() -> Option<(u64, u64)> {
270 let guard = KPTI_STATE.lock();
271 guard
272 .as_ref()
273 .map(|s| (s.tables.kernel_cr3, s.tables.shadow_cr3))
274}
275
276/// Validate shadow table integrity.
277///
278/// Checks that user-space entries in the shadow L4 match the kernel L4,
279/// and that kernel-space entries (except L4[511]) are empty.
280pub fn validate_shadow_tables() -> bool {
281 let guard = KPTI_STATE.lock();
282 let state = match guard.as_ref() {
283 Some(s) if s.initialized => s,
284 _ => return false,
285 };
286
287 let kernel_l4 = phys_to_virt_addr(state.tables.kernel_cr3) as *const u64;
288 let shadow_l4 = phys_to_virt_addr(state.tables.shadow_cr3) as *const u64;
289
290 // SAFETY: Both pointers reference valid L4 page table frames.
291 unsafe {
292 // User entries should match
293 for i in 0..USER_KERNEL_SPLIT {
294 let k = core::ptr::read_volatile(kernel_l4.add(i));
295 let s = core::ptr::read_volatile(shadow_l4.add(i));
296 if k != s {
297 crate::println!(
298 "[KPTI] Mismatch at L4[{}]: kernel=0x{:x}, shadow=0x{:x}",
299 i,
300 k,
301 s
302 );
303 return false;
304 }
305 }
306
307 // Kernel entries [256..510] should be empty in shadow
308 for i in USER_KERNEL_SPLIT..511 {
309 let s = core::ptr::read_volatile(shadow_l4.add(i));
310 if s & PTE_PRESENT != 0 {
311 crate::println!("[KPTI] Shadow L4[{}] unexpectedly present: 0x{:x}", i, s);
312 return false;
313 }
314 }
315
316 // L4[511] should be present (trampoline)
317 let trampoline = core::ptr::read_volatile(shadow_l4.add(511));
318 if trampoline & PTE_PRESENT == 0 {
319 crate::println!("[KPTI] Shadow L4[511] (trampoline) not present");
320 return false;
321 }
322 }
323
324 true
325}
326
327// ===========================================================================
328// Tests
329// ===========================================================================
330
#[cfg(test)]
mod tests {
    use super::*;

    /// The L4 layout constants must match the x86_64 4-level paging
    /// scheme: a 256-entry user/kernel split over 512 total entries.
    #[test]
    fn test_constants() {
        assert_eq!(USER_KERNEL_SPLIT, 256);
        assert_eq!(L4_ENTRY_COUNT, 512);
        assert_eq!(TRAMPOLINE_VADDR, 0xFFFF_FFFF_FFFF_0000);
    }

    /// Raw PTE flag constants must sit at their architectural bit
    /// positions (present=bit 0, writable=bit 1, user=bit 2, NX=bit 63).
    #[test]
    fn test_pte_flags() {
        assert_eq!(PTE_PRESENT, 1);
        assert_eq!(PTE_WRITABLE, 2);
        assert_eq!(PTE_USER, 4);
        assert_eq!(PTE_NO_EXECUTE, 1u64 << 63);
    }

    #[test]
    fn test_kpti_not_active_initially() {
        // KPTI requires actual page tables, so it should not be active
        // in a test environment without hardware initialization.
        // Just verify the atomic loads don't panic.
        let _ = is_active();
    }

    /// Plain-struct smoke test: fields round-trip the values they are
    /// constructed with.
    #[test]
    fn test_kpti_page_tables_struct() {
        let tables = KptiPageTables {
            kernel_cr3: 0x1000,
            shadow_cr3: 0x2000,
        };
        assert_eq!(tables.kernel_cr3, 0x1000);
        assert_eq!(tables.shadow_cr3, 0x2000);
    }
}
367}