⚠️ VeridianOS Kernel Documentation - This is low-level `no_std` kernel code. Treat all functions as unsafe unless explicitly documented otherwise.

veridian_kernel/sched/smp.rs

//! Symmetric multiprocessing (SMP) support

#![allow(
    clippy::fn_to_numeric_cast,
    clippy::needless_return,
    function_casts_as_integer
)]

#[cfg(feature = "alloc")]
extern crate alloc;
#[cfg(feature = "alloc")]
use alloc::{string::String, vec::Vec};
use core::sync::atomic::{AtomicBool, AtomicU32, AtomicU64, AtomicU8, Ordering};

use spin::Mutex;

use super::{queue::ReadyQueue, scheduler::Scheduler, task::Task};
use crate::error::KernelError;

/// CPU information
pub struct CpuInfo {
    /// CPU ID
    pub id: u8,
    /// CPU online status
    pub online: AtomicBool,
    /// CPU idle status
    pub idle: AtomicBool,
    /// Current task on this CPU
    pub current_task: AtomicU64,
    /// Load average (0-100)
    pub load: AtomicU8,
    /// Number of tasks in run queue
    pub nr_running: AtomicU32,
    /// Per-CPU scheduler
    pub scheduler: Mutex<Scheduler>,
    /// Per-CPU ready queue
    pub ready_queue: Mutex<ReadyQueue>,
    /// CPU vendor string
    #[cfg(feature = "alloc")]
    pub vendor: String,
    /// CPU model string
    #[cfg(feature = "alloc")]
    pub model: String,
    /// CPU features
    pub features: CpuFeatures,
    /// Per-CPU page frame cache index (matches PER_CPU_PAGE_CACHES slot)
    pub page_cache_id: u8,
}

/// CPU features
#[derive(Debug, Default)]
pub struct CpuFeatures {
    /// Supports FPU
    pub fpu: bool,
    /// Supports SIMD
    pub simd: bool,
    /// Supports virtualization
    pub virtualization: bool,
    /// Supports hardware security features
    pub security: bool,
    /// Maximum physical address bits
    pub phys_addr_bits: u8,
    /// Maximum virtual address bits
    pub virt_addr_bits: u8,
}

impl CpuInfo {
    /// Create new CPU info
    pub const fn new(id: u8) -> Self {
        Self {
            id,
            online: AtomicBool::new(false),
            idle: AtomicBool::new(true),
            current_task: AtomicU64::new(0),
            load: AtomicU8::new(0),
            nr_running: AtomicU32::new(0),
            scheduler: Mutex::new(Scheduler::new()),
            ready_queue: Mutex::new(ReadyQueue::new()),
            #[cfg(feature = "alloc")]
            vendor: String::new(),
            #[cfg(feature = "alloc")]
            model: String::new(),
            features: CpuFeatures {
                fpu: false,
                simd: false,
                virtualization: false,
                security: false,
                phys_addr_bits: 0,
                virt_addr_bits: 0,
            },
            page_cache_id: id,
        }
    }

    /// Mark CPU as online
    pub fn bring_online(&self) {
        self.online.store(true, Ordering::Release);
        self.idle.store(true, Ordering::Release);
    }

    /// Mark CPU as offline
    pub fn bring_offline(&self) {
        self.online.store(false, Ordering::Release);
    }

    /// Check if CPU is online
    pub fn is_online(&self) -> bool {
        self.online.load(Ordering::Acquire)
    }

    /// Check if CPU is idle
    pub fn is_idle(&self) -> bool {
        self.idle.load(Ordering::Acquire)
    }

    /// Update load average
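    ///
    /// With `MAX_LOAD_FACTOR = 10`, each runnable task contributes 10 load
    /// points, saturating at 100. Illustrative sketch (`cpu` is assumed to
    /// be a `CpuInfo` for an online CPU):
    ///
    /// ```ignore
    /// cpu.nr_running.store(5, Ordering::Relaxed);
    /// cpu.update_load();
    /// assert_eq!(cpu.load.load(Ordering::Relaxed), 50);
    /// ```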
    pub fn update_load(&self) {
        let nr_running = self.nr_running.load(Ordering::Relaxed);
        let load = (nr_running * 100 / MAX_LOAD_FACTOR).min(100) as u8;
        self.load.store(load, Ordering::Relaxed);
    }
}

/// CPU topology information
#[derive(Debug)]
pub struct CpuTopology {
    /// Total number of CPUs
    pub total_cpus: u8,
    /// Number of online CPUs
    pub online_cpus: AtomicU8,
    /// Number of CPU sockets
    pub sockets: u8,
    /// Number of cores per socket
    pub cores_per_socket: u8,
    /// Number of threads per core
    pub threads_per_core: u8,
    /// NUMA nodes
    #[cfg(feature = "alloc")]
    pub numa_nodes: Vec<NumaNode>,
}

/// NUMA node information
#[cfg(feature = "alloc")]
#[derive(Debug)]
pub struct NumaNode {
    /// Node ID
    pub id: u8,
    /// CPUs in this node
    pub cpus: Vec<u8>,
    /// Memory ranges
    pub memory_ranges: Vec<(usize, usize)>,
    /// Distance to other nodes
    pub distances: Vec<u8>,
}

impl CpuTopology {
    /// Create new topology
    pub fn new() -> Self {
        Self {
            total_cpus: 1,
            online_cpus: AtomicU8::new(1),
            sockets: 1,
            cores_per_socket: 1,
            threads_per_core: 1,
            #[cfg(feature = "alloc")]
            numa_nodes: Vec::new(),
        }
    }

    /// Detect CPU topology
    pub fn detect(&mut self) {
        #[cfg(target_arch = "x86_64")]
        self.detect_x86_64();

        #[cfg(target_arch = "aarch64")]
        self.detect_aarch64();

        #[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))]
        self.detect_riscv();
    }

    #[cfg(target_arch = "x86_64")]
    fn detect_x86_64(&mut self) {
        use core::arch::x86_64::{__cpuid, __cpuid_count};

        // SAFETY: CPUID is an unprivileged instruction that queries CPU
        // feature information. Leaf 0x1 returns basic processor info and
        // leaf 0xB returns extended topology. Both are read-only operations
        // with no side effects. max_cpuid() verifies leaf 0xB is supported
        // before accessing it.
        unsafe {
            // Get basic CPU info
            let cpuid = __cpuid(0x1);
            let logical_cpus = ((cpuid.ebx >> 16) & 0xFF) as u8;

            // Get extended topology
            if max_cpuid() >= 0xB {
                // Intel topology enumeration: leaf 0xB sub-leaf 0 reports
                // logical processors per core, sub-leaf 1 per package
                let cpuid = __cpuid_count(0xB, 0);
                self.threads_per_core = ((cpuid.ebx & 0xFFFF) as u8).max(1);

                let cpuid = __cpuid_count(0xB, 1);
                self.cores_per_socket =
                    ((cpuid.ebx & 0xFFFF) / self.threads_per_core as u32).max(1) as u8;

                self.total_cpus = logical_cpus;
                self.sockets =
                    self.total_cpus / (self.cores_per_socket * self.threads_per_core).max(1);
            } else {
                // Fallback
                self.total_cpus = logical_cpus;
            }
        }
    }

    #[cfg(target_arch = "aarch64")]
    fn detect_aarch64(&mut self) {
        // SAFETY: MPIDR_EL1 is a read-only system register accessible from
        // EL1 (kernel mode) that provides the CPU's affinity information
        // (thread, core, cluster, socket). Reading it has no side effects.
        unsafe {
            let mpidr: u64;
            core::arch::asm!("mrs {}, MPIDR_EL1", out(reg) mpidr);

            // Extract affinity levels
            let _aff0 = (mpidr & 0xFF) as u8; // Thread
            let _aff1 = ((mpidr >> 8) & 0xFF) as u8; // Core
            let _aff2 = ((mpidr >> 16) & 0xFF) as u8; // Cluster
            let _aff3 = ((mpidr >> 32) & 0xFF) as u8; // Socket

            // This is simplified - real detection would probe all CPUs
            self.threads_per_core = 1; // SMT not common on ARM
            self.cores_per_socket = 4; // Common configuration
            self.sockets = 1;
            self.total_cpus = self.sockets * self.cores_per_socket * self.threads_per_core;
        }
    }

    #[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))]
    fn detect_riscv(&mut self) {
        // RISC-V detection through device tree or SBI
        // For now, assume single core
        self.total_cpus = 1;
        self.threads_per_core = 1;
        self.cores_per_socket = 1;
        self.sockets = 1;
    }
}
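
// Worked example (hypothetical values, not detected ones): a topology of
// 1 socket x 4 cores x 2 threads per core reports
// total_cpus = 1 * 4 * 2 = 8 logical CPUs; online_cpus tracks how many of
// those have actually been brought up.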

/// Per-CPU data
#[repr(C)]
pub struct PerCpuData {
    /// CPU information
    pub cpu_info: CpuInfo,
    /// Current privilege level
    pub privilege_level: u8,
    /// Interrupt nesting level
    pub irq_depth: u32,
    /// Preemption count
    pub preempt_count: u32,
    /// Kernel stack pointer
    pub kernel_stack: usize,
    /// Thread-local storage
    pub tls: usize,
}

impl Default for CpuTopology {
    fn default() -> Self {
        Self::new()
    }
}

/// Maximum number of CPUs
/// Reduced from 256 to 16 for bootloader 0.11 compatibility (reduces static
/// data size)
pub const MAX_CPUS: usize = 16;

/// Number of runnable tasks that counts as full (100%) load
const MAX_LOAD_FACTOR: u32 = 10;

/// Per-CPU data array
///
/// SAFETY JUSTIFICATION: This static mut is intentionally kept because:
/// 1. Each CPU slot is written exactly once during init_cpu() (single-writer
///    per index)
/// 2. After initialization, slots are only read (immutable access)
/// 3. Each CPU accesses its own slot via cpu_id (no cross-CPU aliasing)
/// 4. Using a Mutex here would cause deadlocks in scheduler hot paths
/// 5. This is a pre-heap, per-CPU data structure that cannot use OnceLock
#[allow(static_mut_refs)]
static mut PER_CPU_DATA: [Option<PerCpuData>; MAX_CPUS] = [const { None }; MAX_CPUS];

/// CPU topology
static CPU_TOPOLOGY: Mutex<CpuTopology> = Mutex::new(CpuTopology {
    total_cpus: 1,
    online_cpus: AtomicU8::new(1),
    sockets: 1,
    cores_per_socket: 1,
    threads_per_core: 1,
    #[cfg(feature = "alloc")]
    numa_nodes: Vec::new(),
});

/// Initialize SMP support
pub fn init() {
    kprintln!("[SMP] Initializing SMP support (BSP only)...");

    // All architectures currently use simplified BSP-only initialization.
    // Complex topology detection and AP wakeup deferred to Phase 3+.

    kprintln!("[SMP] SMP initialized (BSP only)");
}

/// Wake up all Application Processors
///
/// Not yet wired into `init()`; AP wakeup is deferred to Phase 3+.
#[allow(dead_code)]
fn wake_up_aps() {
    let topology = CPU_TOPOLOGY.lock();
    let num_cpus = topology.total_cpus;

    if num_cpus <= 1 {
        println!("[SMP] Single CPU system, no APs to wake");
        return;
    }

    println!("[SMP] Waking up {} Application Processors", num_cpus - 1);

    // Wake up each AP
    for cpu_id in 1..num_cpus {
        if let Err(e) = cpu_up(cpu_id) {
            println!("[SMP] Failed to wake CPU {}: {}", cpu_id, e);
        } else {
            println!("[SMP] Successfully woke CPU {}", cpu_id);
        }
    }
}

/// Initialize specific CPU
pub fn init_cpu(cpu_id: u8) {
    // SAFETY: Each CPU slot is written exactly once during initialization.
    // cpu_id is bounds-checked by callers (cpu_up checks cpu_id < MAX_CPUS).
    // No concurrent write to the same index occurs.
    unsafe {
        let cpu_info = CpuInfo::new(cpu_id);

        // Initialize per-CPU scheduler with CPU ID
        {
            let mut scheduler = cpu_info.scheduler.lock();
            scheduler.cpu_id = cpu_id;
        }

        let cpu_data = PerCpuData {
            cpu_info,
            privilege_level: 0,
            irq_depth: 0,
            preempt_count: 0,
            kernel_stack: 0,
            tls: 0,
        };

        PER_CPU_DATA[cpu_id as usize] = Some(cpu_data);

        if let Some(ref mut data) = PER_CPU_DATA[cpu_id as usize] {
            data.cpu_info.bring_online();

            // Initialize idle task for this CPU if not BSP
            #[cfg(feature = "alloc")]
            if cpu_id != 0 {
                // Create per-CPU idle task
                create_cpu_idle_task(cpu_id);
            }
        }
    }
}

/// Get per-CPU data for current CPU
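///
/// Usage sketch (illustrative, not taken from the kernel's call sites):
///
/// ```ignore
/// let nr = this_cpu().cpu_info.nr_running.load(Ordering::Relaxed);
/// ```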
pub fn this_cpu() -> &'static PerCpuData {
    let cpu_id = current_cpu_id();
    // SAFETY: PER_CPU_DATA is initialized during init_cpu() for each CPU
    // before any code calls this_cpu(). Each CPU reads only its own slot.
    // After init, the slot is never modified again.
    unsafe {
        PER_CPU_DATA[cpu_id as usize]
            .as_ref()
            .expect("Per-CPU data not initialized")
    }
}

/// Get per-CPU data for specific CPU
pub fn per_cpu(cpu_id: u8) -> Option<&'static PerCpuData> {
    // SAFETY: PER_CPU_DATA slots are written once during init_cpu() and
    // only read thereafter. The returned reference is valid for 'static
    // because the array lives for the kernel's lifetime. `get` bounds-checks
    // cpu_id against MAX_CPUS so out-of-range IDs return None instead of
    // panicking.
    unsafe { PER_CPU_DATA.get(cpu_id as usize).and_then(|slot| slot.as_ref()) }
}

/// Get current CPU ID
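///
/// Usage sketch (illustrative):
///
/// ```ignore
/// println!("[SMP] running on CPU {}", current_cpu_id());
/// ```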
pub fn current_cpu_id() -> u8 {
    #[cfg(target_arch = "x86_64")]
    {
        // SAFETY: CPUID leaf 0x1 is an unprivileged read-only instruction.
        // The initial APIC ID is in bits 31:24 of EBX. This is safe to call
        // at any time on x86_64.
        unsafe {
            use core::arch::x86_64::__cpuid;
            let cpuid = __cpuid(0x1);
            ((cpuid.ebx >> 24) & 0xFF) as u8
        }
    }

    #[cfg(target_arch = "aarch64")]
    {
        // SAFETY: MPIDR_EL1 is a read-only system register accessible from
        // EL1 (kernel mode). Bits [7:0] (Aff0) contain the CPU thread ID
        // within the core. Reading has no side effects.
        unsafe {
            let mpidr: u64;
            core::arch::asm!("mrs {}, MPIDR_EL1", out(reg) mpidr);
            (mpidr & 0xFF) as u8
        }
    }

    #[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))]
    {
        // SAFETY: mhartid is a read-only CSR that returns the hardware
        // thread (hart) ID. It is always readable from M-mode. This may
        // trap in S-mode if not delegated, but during bootstrap we run in
        // M-mode or the SBI provides this value.
        unsafe {
            let hartid: usize;
            core::arch::asm!("csrr {}, mhartid", out(reg) hartid);
            hartid as u8
        }
    }
}

/// Send inter-processor interrupt
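///
/// Usage sketch (illustrative; vector `0x20` is an assumed reschedule
/// vector, not a value defined in this module):
///
/// ```ignore
/// send_ipi(1, 0x20);
/// ```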
pub fn send_ipi(target_cpu: u8, vector: u8) {
    #[cfg(target_arch = "x86_64")]
    {
        // Send IPI via the Local APIC Interrupt Command Register.
        if let Err(e) = crate::arch::x86_64::apic::send_ipi(target_cpu, vector) {
            println!(
                "[SMP] IPI to CPU {} vector {:#x} failed: {}",
                target_cpu, vector, e
            );
        }
    }

    #[cfg(target_arch = "aarch64")]
    {
        // SAFETY: GICD_SGIR is a memory-mapped I/O register at a fixed address
        // on the QEMU virt machine GIC (Generic Interrupt Controller). Writing
        // to it triggers a Software Generated Interrupt (SGI) on the target
        // CPU(s). The address is always mapped and the write is a volatile
        // MMIO operation that does not alias any Rust memory.
        unsafe {
            // GIC distributor base (QEMU virt machine)
            const GICD_BASE: usize = 0x0800_0000;
            const GICD_SGIR: usize = GICD_BASE + 0xF00;

            // SGI target list (bit per CPU)
            let target_list = 1u32 << target_cpu;
            // SGI ID (0-15 are software generated)
            let sgi_id = (vector & 0xF) as u32;

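            // GICv2 GICD_SGIR layout (as on the QEMU virt GIC): bits [25:24]
            // target list filter (0b00 = use CPUTargetList), [23:16]
            // CPUTargetList, [3:0] SGIINTID.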
            // Write to GICD_SGIR to trigger SGI
            let sgir_value = (target_list << 16) | sgi_id;
            core::ptr::write_volatile(GICD_SGIR as *mut u32, sgir_value);
        }
    }

    #[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))]
    {
        // Use SBI IPI extension

        // Create hart mask for target CPU (usize, so the value fits a
        // register on both riscv32 and riscv64)
        let hart_mask = 1usize << target_cpu;
        let hart_mask_base = 0usize;

        // SBI call to send IPI
        // Extension ID 0x735049 ('sPI' in ASCII) with function ID 0 selects
        // sbi_send_ipi
        // SAFETY: This performs an SBI ecall to send an IPI to the target
        // hart. The calling convention uses a0 (hart_mask), a1 (base),
        // a7 (extension ID), a6 (function ID). The ecall is a supervisor-
        // level trap to the SBI firmware which handles IPI delivery. a0 and
        // a1 are clobbered by the SBI return values, so they are passed as
        // `inout` and the results discarded.
        unsafe {
            core::arch::asm!(
                "ecall",
                inout("a0") hart_mask => _,
                inout("a1") hart_mask_base => _,
                in("a7") 0x735049,
                in("a6") 0,
            )
        };

        // Note: vector parameter is not used in RISC-V as IPIs are fixed
        let _ = vector;
    }

    // Suppress unused-parameter warnings on architectures that ignore them
    #[allow(unused_variables)]
    let _ = (target_cpu, vector);
}

/// CPU hotplug: bring CPU online
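///
/// Usage sketch (illustrative):
///
/// ```ignore
/// if let Err(e) = cpu_up(1) {
///     println!("[SMP] failed to bring CPU 1 online: {}", e);
/// }
/// ```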
pub fn cpu_up(cpu_id: u8) -> Result<(), KernelError> {
    if cpu_id >= MAX_CPUS as u8 {
        return Err(KernelError::InvalidArgument {
            name: "cpu_id",
            value: "exceeds MAX_CPUS",
        });
    }

    if let Some(cpu_data) = per_cpu(cpu_id) {
        if cpu_data.cpu_info.is_online() {
            return Err(KernelError::AlreadyExists {
                resource: "online CPU",
                id: cpu_id as u64,
            });
        }
    } else {
        init_cpu(cpu_id);
    }

    // Send INIT-SIPI-SIPI sequence to wake up AP (Application Processor).
    // Per Intel SDM: INIT -> 10ms delay -> SIPI -> 200us delay -> SIPI (if needed).
    #[cfg(target_arch = "x86_64")]
    {
        // Send INIT IPI via the APIC ICR with INIT delivery mode.
        if let Err(e) = crate::arch::x86_64::apic::send_init_ipi(cpu_id) {
            println!("[SMP] INIT IPI to CPU {} failed: {}", cpu_id, e);
            return Err(KernelError::HardwareError {
                device: "APIC",
                code: cpu_id as u32,
            });
        }

        // ~10ms delay for INIT to be processed (uncalibrated spin-wait).
        for _ in 0..10_000_000 {
            core::hint::spin_loop();
        }

        // Send first Startup IPI (SIPI) with trampoline page vector.
        // Startup page 0x08 = physical address 0x8000 where AP trampoline
        // code would reside (not yet implemented -- requires 16-bit real mode code).
        let sipi_page = 0x08u8;
        let _ = crate::arch::x86_64::apic::send_startup_ipi(cpu_id, sipi_page);

        // ~200us delay (uncalibrated spin-wait).
        for _ in 0..200_000 {
            core::hint::spin_loop();
        }

        // Send second SIPI if AP has not come online yet (per Intel SDM
        // recommendation).
        if let Some(cpu_data) = per_cpu(cpu_id) {
            if !cpu_data.cpu_info.is_online() {
                let _ = crate::arch::x86_64::apic::send_startup_ipi(cpu_id, sipi_page);
            }
        }
    }

    #[cfg(target_arch = "aarch64")]
    {
        // On AArch64, use PSCI (Power State Coordination Interface)
        // For now, just send a wake-up SGI
        send_ipi(cpu_id, 0);
    }

    #[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))]
    {
        // On RISC-V, use SBI HSM (Hart State Management) extension
        // For now, just send IPI
        send_ipi(cpu_id, 0);
    }

    // Wait for CPU to come online
    let mut retries = 100;
    while retries > 0 {
        if let Some(cpu_data) = per_cpu(cpu_id) {
            if cpu_data.cpu_info.is_online() {
                println!("[SMP] CPU {} is now online", cpu_id);
                return Ok(());
            }
        }
        // Placeholder: no timer-backed delay yet, so this loop retries
        // without actually waiting
        retries -= 1;
    }

    Err(KernelError::Timeout {
        operation: "CPU online",
        duration_ms: 100,
    })
}

/// CPU hotplug: bring CPU offline
pub fn cpu_down(cpu_id: u8) -> Result<(), KernelError> {
    if cpu_id == 0 {
        return Err(KernelError::PermissionDenied {
            operation: "offline BSP (CPU 0)",
        });
    }

    if let Some(cpu_data) = per_cpu(cpu_id) {
        if !cpu_data.cpu_info.is_online() {
            return Err(KernelError::InvalidState {
                expected: "online",
                actual: "offline",
            });
        }

        // Migrate all tasks from this CPU
        let nr_tasks = cpu_data.cpu_info.nr_running.load(Ordering::Relaxed);
        if nr_tasks > 0 {
            println!("[SMP] Migrating {} tasks from CPU {}", nr_tasks, cpu_id);

            // Find target CPU with lowest load
            let target_cpu = find_least_loaded_cpu();
            if target_cpu == cpu_id {
                return Err(KernelError::ResourceExhausted {
                    resource: "available CPUs for migration",
                });
            }

            // Migrate all tasks
            let mut migrated = 0;
            loop {
                let task = {
                    let mut queue = cpu_data.cpu_info.ready_queue.lock();
                    queue.dequeue()
                };

                if let Some(task_ptr) = task {
                    if migrate_task(task_ptr, cpu_id, target_cpu).is_ok() {
                        migrated += 1;
                    }
                } else {
                    break;
                }
            }

            println!("[SMP] Migrated {} tasks to CPU {}", migrated, target_cpu);
        }

        // Send CPU offline notification
        send_ipi(cpu_id, 0xFF); // Special offline vector

        // Mark CPU as offline
        cpu_data.cpu_info.bring_offline();

        println!("[SMP] CPU {} is now offline", cpu_id);
        Ok(())
    } else {
        Err(KernelError::NotInitialized { subsystem: "CPU" })
    }
}

/// Load balancing: migrate task between CPUs
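///
/// Protocol: check eligibility (affinity, not running, not idle-class),
/// dequeue from the source CPU under its queue lock, enqueue on the
/// destination, update the task's `current_cpu`, then IPI the destination
/// if it is idle.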
pub fn migrate_task(
    task_ptr: core::ptr::NonNull<Task>,
    from_cpu: u8,
    to_cpu: u8,
) -> Result<(), KernelError> {
    // SAFETY: task_ptr is a valid NonNull<Task> passed by the caller
    // (cpu_down or load balancer). We only read task fields (cpu_affinity,
    // state, sched_class) for migration eligibility checks. The task is not
    // currently running on any CPU (verified by the Running state check).
    unsafe {
        let task = task_ptr.as_ref();

        // Check if migration is allowed
        if !task.cpu_affinity.contains(to_cpu) {
            return Err(KernelError::InvalidArgument {
                name: "cpu_affinity",
                value: "task affinity prevents migration",
            });
        }

        // Don't migrate running tasks
        if task.state == super::ProcessState::Running {
            return Err(KernelError::InvalidState {
                expected: "not running",
                actual: "running",
            });
        }

        // Don't migrate idle tasks
        if task.sched_class == super::task::SchedClass::Idle {
            return Err(KernelError::InvalidArgument {
                name: "sched_class",
                value: "cannot migrate idle task",
            });
        }
    }

    // Remove from source CPU queue
    let removed = if let Some(from_cpu_data) = per_cpu(from_cpu) {
        let mut queue = from_cpu_data.cpu_info.ready_queue.lock();
        if queue.remove(task_ptr) {
            from_cpu_data
                .cpu_info
                .nr_running
                .fetch_sub(1, Ordering::Relaxed);
            from_cpu_data.cpu_info.update_load();
            true
        } else {
            false
        }
    } else {
        false
    };

    if !removed {
        return Err(KernelError::NotFound {
            resource: "task in source CPU queue",
            id: from_cpu as u64,
        });
    }

    // Add to destination CPU queue
    if let Some(to_cpu_data) = per_cpu(to_cpu) {
        to_cpu_data.cpu_info.ready_queue.lock().enqueue(task_ptr);
        to_cpu_data
            .cpu_info
            .nr_running
            .fetch_add(1, Ordering::Relaxed);
        to_cpu_data.cpu_info.update_load();

        // SAFETY: task_ptr is a valid NonNull<Task> that was just removed
        // from the source CPU queue and enqueued on the destination. We
        // update current_cpu to reflect the new CPU assignment. No other
        // code is modifying this task concurrently because it was removed
        // from the source queue under lock.
        unsafe {
            let task_mut = task_ptr.as_ptr();
            (*task_mut).current_cpu = Some(to_cpu);
        }

        // Send IPI if destination CPU is idle
        if to_cpu_data.cpu_info.is_idle() {
            send_ipi(to_cpu, 0); // Wake up CPU
        }

        // Record migration metric
        super::metrics::SCHEDULER_METRICS.record_migration();

        Ok(())
    } else {
        Err(KernelError::NotInitialized {
            subsystem: "destination CPU",
        })
    }
}

/// Find least loaded CPU
pub fn find_least_loaded_cpu() -> u8 {
    let mut min_load = 100;
    let mut best_cpu = 0;

    for cpu_id in 0..MAX_CPUS as u8 {
        if let Some(cpu_data) = per_cpu(cpu_id) {
            if cpu_data.cpu_info.is_online() {
                let load = cpu_data.cpu_info.load.load(Ordering::Relaxed);
                if load < min_load {
                    min_load = load;
                    best_cpu = cpu_id;
                }
            }
        }
    }

    best_cpu
}

/// Find least loaded CPU that matches affinity mask
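///
/// Usage sketch (illustrative): restrict placement to CPUs 0-3.
///
/// ```ignore
/// let target = find_least_loaded_cpu_with_affinity(0b1111);
/// ```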
pub fn find_least_loaded_cpu_with_affinity(affinity_mask: u64) -> u8 {
    let mut best_cpu = 0;
    let mut min_load = 100;
    let mut found_any = false;

    // Check up to 64 CPUs (the width of the mask)
    for cpu_id in 0..64.min(MAX_CPUS as u8) {
        if (affinity_mask & (1u64 << cpu_id)) != 0 {
            if let Some(cpu_data) = per_cpu(cpu_id) {
                if cpu_data.cpu_info.is_online() {
                    let load = cpu_data.cpu_info.load.load(Ordering::Relaxed);
                    if load < min_load || !found_any {
                        min_load = load;
                        best_cpu = cpu_id;
                        found_any = true;
                    }
                }
            }
        }
    }

    // If no CPU matches affinity, fall back to least loaded
    if !found_any {
        find_least_loaded_cpu()
    } else {
        best_cpu
    }
}

#[cfg(target_arch = "x86_64")]
fn max_cpuid() -> u32 {
    // SAFETY: CPUID leaf 0 is always valid on x86_64 processors. It returns
    // the maximum supported standard CPUID leaf number in EAX. This is a
    // read-only instruction with no side effects.
    unsafe {
        use core::arch::x86_64::__cpuid;
        let cpuid = __cpuid(0);
        cpuid.eax
    }
}

/// Create idle task for specific CPU
#[cfg(feature = "alloc")]
fn create_cpu_idle_task(cpu_id: u8) {
    use alloc::{boxed::Box, format};
    use core::ptr::NonNull;

    use super::{
        idle_task_entry,
        task::{Priority, SchedClass, SchedPolicy, Task},
    };
    use crate::process::{ProcessId, ThreadId};

    // Allocate stack for idle task (8KB)
    const IDLE_STACK_SIZE: usize = 8192;
    let idle_stack = Box::leak(Box::new([0u8; IDLE_STACK_SIZE]));
    let idle_stack_top = idle_stack.as_ptr() as usize + IDLE_STACK_SIZE;
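
    // Note: the stack above is leaked deliberately (as is the Task box
    // below): idle tasks live for the kernel's lifetime and are never freed.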

    // Get kernel page table
    let kernel_page_table = crate::mm::get_kernel_page_table();

    // Create idle task
    let mut idle_task = Box::new(Task::new(
        ProcessId(0),            // PID 0 for idle
        ThreadId(cpu_id as u64), // TID = CPU ID for idle tasks
        format!("idle-cpu{}", cpu_id),
        idle_task_entry as usize,
        idle_stack_top,
        kernel_page_table,
    ));

    // Set as idle priority
    idle_task.priority = Priority::Idle;
    idle_task.sched_class = SchedClass::Idle;
    idle_task.sched_policy = SchedPolicy::Idle;

    // Set CPU affinity to only this CPU
    idle_task.cpu_affinity = super::task::CpuSet::single(cpu_id);

    // Get raw pointer to idle task
    // Box::leak always returns a non-null pointer
    let idle_ptr =
        NonNull::new(Box::leak(idle_task) as *mut _).expect("Box::leak returned null (impossible)");

    // Initialize per-CPU scheduler with idle task
    if let Some(cpu_data) = per_cpu(cpu_id) {
        let mut scheduler = cpu_data.cpu_info.scheduler.lock();
        scheduler.init(idle_ptr);
    }
}