⚠️ VeridianOS Kernel Documentation - This is low-level kernel code (no_std). Unsafe operations are confined to annotated `unsafe` blocks; safe functions are marked as such.

veridian_kernel/arch/x86_64/
kpti.rs

1//! Kernel Page Table Isolation (KPTI) for x86_64
2//!
3//! Mitigates Meltdown (CVE-2017-5754) by maintaining separate page table
4//! hierarchies for user mode and kernel mode. When running in user mode,
5//! the shadow page table contains only the minimal kernel mappings needed
6//! for the syscall/interrupt trampoline. On kernel entry, CR3 is switched
7//! to the full kernel page table.
8//!
9//! ## Design
10//!
11//! - **Kernel page table**: The full L4 table with both user (L4[0..255]) and
12//!   kernel (L4[256..511]) entries.
//! - **Shadow page table**: A separate L4 with user entries copied from the
//!   kernel table; the kernel half is left unmapped except for L4[511], which
//!   carries the trampoline mapping used for syscall/interrupt entry and exit.
16//! - **CR3 switching**: `switch_to_user()` loads the shadow CR3 before
17//!   returning to Ring 3; `switch_to_kernel()` restores the full CR3 on entry
18//!   to Ring 0.
19
20#![allow(dead_code)]
21
22use core::sync::atomic::{AtomicU64, Ordering};
23
24use spin::Mutex;
25
26use crate::mm::{phys_to_virt_addr, PhysicalAddress, FRAME_ALLOCATOR};
27
// ===========================================================================
// Constants
// ===========================================================================

/// Virtual address of the syscall trampoline page.
/// Placed at the top of the address space (last page of L4[511]).
/// NOTE(review): currently unused in this module — the whole L4[511]
/// subtree is shared instead of mapping this single page; kept for the
/// planned minimal trampoline mapping.
const TRAMPOLINE_VADDR: u64 = 0xFFFF_FFFF_FFFF_0000;

/// L4 index that separates user-space from kernel-space.
/// Entries 0..255 are user, 256..511 are kernel.
const USER_KERNEL_SPLIT: usize = 256;

/// Number of L4 entries (512 x 8-byte entries = one 4 KiB table frame)
const L4_ENTRY_COUNT: usize = 512;

// Page table entry flags (raw x86_64 PTE bits)
const PTE_PRESENT: u64 = 1 << 0; // bit 0: entry is valid
const PTE_WRITABLE: u64 = 1 << 1; // bit 1: writes allowed
const PTE_USER: u64 = 1 << 2; // bit 2: accessible from Ring 3
const PTE_NO_EXECUTE: u64 = 1 << 63; // bit 63: NX (requires EFER.NXE)
48
49// ===========================================================================
50// KPTI State
51// ===========================================================================
52
/// Per-process KPTI page table pair.
///
/// Holds the two CR3 values (physical addresses of L4 tables) that a
/// process alternates between: the full kernel table while in Ring 0 and
/// the reduced shadow table while in Ring 3.
///
/// Both fields are plain physical addresses, so the pair is trivially
/// copyable and comparable.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct KptiPageTables {
    /// Physical address of the full kernel L4 table.
    pub kernel_cr3: u64,
    /// Physical address of the shadow (user-mode) L4 table.
    pub shadow_cr3: u64,
}
61
/// Global KPTI state: the current page table pair.
struct KptiState {
    // The kernel/shadow CR3 pair created by `init()`.
    tables: KptiPageTables,
    // Set to `true` once the shadow tables were built successfully.
    initialized: bool,
}

// `None` until `init()` succeeds; guarded by a spinlock. Hot paths should
// prefer the lock-free `SHADOW_CR3` below.
static KPTI_STATE: Mutex<Option<KptiState>> = Mutex::new(None);

/// Shadow CR3 for fast access without locking (set during init).
/// Zero means "KPTI inactive"; published with Release, read with Acquire.
static SHADOW_CR3: AtomicU64 = AtomicU64::new(0);
72
73// ===========================================================================
74// Initialization
75// ===========================================================================
76
77/// Initialize KPTI with shadow page tables derived from the current CR3.
78///
79/// Must be called after the kernel page tables are fully set up.
80pub fn init() {
81    let kernel_cr3 = super::mmu::read_cr3().as_u64();
82
83    match create_shadow_tables(kernel_cr3) {
84        Ok(shadow_cr3) => {
85            SHADOW_CR3.store(shadow_cr3, Ordering::Release);
86            *KPTI_STATE.lock() = Some(KptiState {
87                tables: KptiPageTables {
88                    kernel_cr3,
89                    shadow_cr3,
90                },
91                initialized: true,
92            });
93            crate::println!(
94                "[KPTI] Initialized: kernel CR3=0x{:x}, shadow CR3=0x{:x}",
95                kernel_cr3,
96                shadow_cr3
97            );
98        }
99        Err(e) => {
100            crate::println!(
101                "[KPTI] Initialization failed: {:?} -- running without KPTI",
102                e
103            );
104        }
105    }
106}
107
108/// Create shadow page tables from the kernel's L4 table.
109///
110/// Allocates a new L4 frame and:
111/// 1. Copies all user-space entries (L4[0..255]) from the kernel table.
112/// 2. Leaves kernel-space entries (L4[256..510]) empty (unmapped).
113/// 3. Maps a single trampoline page at L4[511] for syscall transitions.
114///
115/// Returns the physical address of the shadow L4 table.
116pub fn create_shadow_tables(kernel_cr3: u64) -> Result<u64, crate::error::KernelError> {
117    // Allocate a frame for the shadow L4 table
118    let shadow_frame = FRAME_ALLOCATOR
119        .lock()
120        .allocate_frames(1, None)
121        .map_err(|_| crate::error::KernelError::OutOfMemory {
122            requested: 4096,
123            available: 0,
124        })?;
125
126    let shadow_phys = shadow_frame.as_u64() * 4096;
127    let shadow_virt = phys_to_virt_addr(shadow_phys) as *mut u64;
128
129    // Zero the entire shadow L4 table
130    // SAFETY: shadow_virt points to a freshly allocated 4KB frame in the
131    // kernel physical memory window. We have exclusive access.
132    unsafe {
133        core::ptr::write_bytes(shadow_virt, 0, 512);
134    }
135
136    // Read the kernel L4 table
137    let kernel_l4_virt = phys_to_virt_addr(kernel_cr3) as *const u64;
138
139    // Copy user-space entries (L4[0..255])
140    // SAFETY: Both pointers are within the kernel physical memory window,
141    // referencing valid L4 page table frames.
142    unsafe {
143        for i in 0..USER_KERNEL_SPLIT {
144            let entry = core::ptr::read_volatile(kernel_l4_virt.add(i));
145            core::ptr::write_volatile(shadow_virt.add(i), entry);
146        }
147    }
148
149    // Map the trampoline page at L4[511]
150    // This provides the minimal kernel mapping needed for syscall entry/exit.
151    map_trampoline_in_l4(shadow_virt, kernel_l4_virt)?;
152
153    Ok(shadow_phys)
154}
155
156/// Map the trampoline entry in L4[511] of the shadow table.
157///
158/// Copies only the L4[511] entry from the kernel table, which covers
159/// the top 512GB of virtual memory including the trampoline page.
160/// In a production implementation, this would create a minimal L3/L2/L1
161/// chain mapping only the trampoline code page.
162fn map_trampoline_in_l4(
163    shadow_l4: *mut u64,
164    kernel_l4: *const u64,
165) -> Result<(), crate::error::KernelError> {
166    // Copy L4[511] from the kernel table.
167    // This gives the shadow table access to the same L3 subtree as the
168    // kernel for the top 512GB, which includes the trampoline address.
169    //
170    // For tighter isolation, a dedicated L3->L2->L1 chain mapping only
171    // the trampoline page should be used (deferred to Phase 7.5).
172    // SAFETY: Both pointers reference valid L4 page table frames within
173    // the kernel physical memory window.
174    unsafe {
175        let kernel_entry = core::ptr::read_volatile(kernel_l4.add(511));
176        if kernel_entry & PTE_PRESENT != 0 {
177            // Keep the entry but mark it user-accessible for the trampoline
178            let trampoline_entry = kernel_entry | PTE_USER;
179            core::ptr::write_volatile(shadow_l4.add(511), trampoline_entry);
180        } else {
181            // L4[511] is not mapped in the kernel -- create a new entry
182            let frame = FRAME_ALLOCATOR
183                .lock()
184                .allocate_frames(1, None)
185                .map_err(|_| crate::error::KernelError::OutOfMemory {
186                    requested: 4096,
187                    available: 0,
188                })?;
189            let frame_phys = frame.as_u64() * 4096;
190
191            // Zero the L3 table
192            let l3_virt = phys_to_virt_addr(frame_phys) as *mut u8;
193            core::ptr::write_bytes(l3_virt, 0, 4096);
194
195            // Create L4[511] entry pointing to the new L3
196            let entry = frame_phys | PTE_PRESENT | PTE_WRITABLE | PTE_USER;
197            core::ptr::write_volatile(shadow_l4.add(511), entry);
198        }
199    }
200
201    Ok(())
202}
203
204// ===========================================================================
205// CR3 Switching
206// ===========================================================================
207
208/// Switch to the shadow (user-mode) page table.
209///
210/// Called just before returning to Ring 3 (e.g., after syscall completion
211/// or interrupt return). Loads the shadow CR3 which lacks kernel mappings.
212#[inline(always)]
213pub fn switch_to_user() {
214    let shadow = SHADOW_CR3.load(Ordering::Acquire);
215    if shadow != 0 {
216        let cr3_val = PhysicalAddress::new(shadow);
217        super::mmu::write_cr3(cr3_val);
218    }
219}
220
/// Switch to the full kernel page table.
///
/// Called on kernel entry (syscall, interrupt, exception). Restores the
/// full CR3 so the kernel has access to all its mappings.
///
/// NOTE(review): this takes the `KPTI_STATE` spinlock on every kernel
/// entry. If an interrupt fires on a CPU that already holds the lock
/// (e.g. inside `init()` or `validate_shadow_tables()`), this deadlocks;
/// it is also slower than the lock-free `switch_to_user()` path. Consider
/// publishing the kernel CR3 in an atomic (mirroring `SHADOW_CR3`) --
/// confirm against the interrupt-entry assembly before changing.
#[inline(always)]
pub fn switch_to_kernel() {
    let guard = KPTI_STATE.lock();
    if let Some(state) = guard.as_ref() {
        if state.initialized {
            let cr3_val = PhysicalAddress::new(state.tables.kernel_cr3);
            super::mmu::write_cr3(cr3_val);
        }
    }
}
235
236// ===========================================================================
237// Syscall Hooks
238// ===========================================================================
239
/// Called at the start of every syscall handler.
///
/// Currently a no-op because the syscall entry assembly switches CR3
/// before reaching Rust code. This hook exists for future use (e.g.,
/// per-CPU KPTI state tracking, telemetry).
#[inline(always)]
pub fn on_syscall_entry() {
    // CR3 switch is handled in assembly (syscall_entry) for performance.
    // This Rust-level hook is reserved for bookkeeping/diagnostics.
    // Intentionally empty: adding work here adds latency to every syscall.
}
250
/// Called at the end of every syscall handler, just before SYSRET.
///
/// Currently a no-op; the SYSRET path in assembly handles CR3 restore.
/// Counterpart of `on_syscall_entry`; reserved for future bookkeeping.
#[inline(always)]
pub fn on_syscall_exit() {
    // CR3 switch back to shadow is handled in assembly (syscall_return).
}
258
259// ===========================================================================
260// Query / Diagnostics
261// ===========================================================================
262
263/// Check whether KPTI is initialized and active.
264pub fn is_active() -> bool {
265    SHADOW_CR3.load(Ordering::Acquire) != 0
266}
267
268/// Get the current KPTI page table pair (for diagnostics).
269pub fn get_page_tables() -> Option<(u64, u64)> {
270    let guard = KPTI_STATE.lock();
271    guard
272        .as_ref()
273        .map(|s| (s.tables.kernel_cr3, s.tables.shadow_cr3))
274}
275
276/// Validate shadow table integrity.
277///
278/// Checks that user-space entries in the shadow L4 match the kernel L4,
279/// and that kernel-space entries (except L4[511]) are empty.
280pub fn validate_shadow_tables() -> bool {
281    let guard = KPTI_STATE.lock();
282    let state = match guard.as_ref() {
283        Some(s) if s.initialized => s,
284        _ => return false,
285    };
286
287    let kernel_l4 = phys_to_virt_addr(state.tables.kernel_cr3) as *const u64;
288    let shadow_l4 = phys_to_virt_addr(state.tables.shadow_cr3) as *const u64;
289
290    // SAFETY: Both pointers reference valid L4 page table frames.
291    unsafe {
292        // User entries should match
293        for i in 0..USER_KERNEL_SPLIT {
294            let k = core::ptr::read_volatile(kernel_l4.add(i));
295            let s = core::ptr::read_volatile(shadow_l4.add(i));
296            if k != s {
297                crate::println!(
298                    "[KPTI] Mismatch at L4[{}]: kernel=0x{:x}, shadow=0x{:x}",
299                    i,
300                    k,
301                    s
302                );
303                return false;
304            }
305        }
306
307        // Kernel entries [256..510] should be empty in shadow
308        for i in USER_KERNEL_SPLIT..511 {
309            let s = core::ptr::read_volatile(shadow_l4.add(i));
310            if s & PTE_PRESENT != 0 {
311                crate::println!("[KPTI] Shadow L4[{}] unexpectedly present: 0x{:x}", i, s);
312                return false;
313            }
314        }
315
316        // L4[511] should be present (trampoline)
317        let trampoline = core::ptr::read_volatile(shadow_l4.add(511));
318        if trampoline & PTE_PRESENT == 0 {
319            crate::println!("[KPTI] Shadow L4[511] (trampoline) not present");
320            return false;
321        }
322    }
323
324    true
325}
326
327// ===========================================================================
328// Tests
329// ===========================================================================
330
#[cfg(test)]
mod tests {
    use super::*;

    /// Address-space geometry constants must match x86_64 4-level paging.
    #[test]
    fn constants_match_x86_64_layout() {
        assert_eq!(USER_KERNEL_SPLIT, 256);
        assert_eq!(L4_ENTRY_COUNT, 512);
        assert_eq!(TRAMPOLINE_VADDR, 0xFFFF_FFFF_FFFF_0000);
    }

    /// Raw PTE flag constants must line up with the hardware bit layout.
    #[test]
    fn pte_flag_bits() {
        assert_eq!(PTE_PRESENT, 1);
        assert_eq!(PTE_WRITABLE, 2);
        assert_eq!(PTE_USER, 4);
        assert_eq!(PTE_NO_EXECUTE, 1u64 << 63);
    }

    /// KPTI requires real page tables, so it cannot be active in the test
    /// environment; just verify the atomic read does not panic.
    #[test]
    fn inactive_without_hardware_init() {
        let _ = is_active();
    }

    /// The page-table pair struct stores both CR3 values verbatim.
    #[test]
    fn page_table_pair_fields() {
        let pair = KptiPageTables {
            kernel_cr3: 0x1000,
            shadow_cr3: 0x2000,
        };
        assert_eq!(pair.kernel_cr3, 0x1000);
        assert_eq!(pair.shadow_cr3, 0x2000);
    }
}