⚠️ VeridianOS Kernel Documentation: low-level no_std kernel code. Functions are unsafe unless explicitly marked otherwise.

veridian_kernel/sched/scheduler.rs

//! Core scheduler implementation
//!
//! Contains the `Scheduler` struct which implements the scheduling algorithms
//! (round-robin, priority, CFS, hybrid), task enqueueing/dequeueing, context
//! switching, and the global scheduler instance.
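//!
//! A rough usage sketch (hypothetical call site; `task_ptr` is an
//! illustrative `NonNull<Task>`):
//!
//! ```ignore
//! // Make a task runnable on the current CPU's scheduler.
//! current_scheduler().lock().enqueue(task_ptr);
//! ```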

#[cfg(feature = "alloc")]
extern crate alloc;

use core::{
    ptr::NonNull,
    sync::atomic::{AtomicBool, AtomicU64, Ordering},
};

use spin::Mutex;

#[cfg(not(target_arch = "riscv64"))]
use super::queue::READY_QUEUE;
use super::{
    queue::CfsRunQueue,
    task::{Priority, SchedClass, Task},
    task_ptr::TaskPtr,
    ProcessState,
};

/// Scheduler state
pub struct Scheduler {
    /// Currently running task
    pub current: Option<TaskPtr>,
    /// Idle task
    pub idle_task: Option<TaskPtr>,
    /// Scheduling algorithm
    pub algorithm: SchedAlgorithm,
    /// Preemption enabled
    pub preemption_enabled: AtomicBool,
    /// Scheduler lock count
    pub lock_count: AtomicU64,
    /// CPU ID this scheduler is running on
    pub cpu_id: u8,
    /// CFS run queue (if using CFS)
    #[cfg(feature = "alloc")]
    pub cfs_queue: Option<Mutex<CfsRunQueue>>,
}

/// Scheduling algorithm
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SchedAlgorithm {
    /// Simple round-robin
    RoundRobin,
    /// Priority-based
    Priority,
    /// Completely Fair Scheduler
    Cfs,
    /// Real-time + CFS hybrid
    Hybrid,
}

impl Scheduler {
    /// Create new scheduler
    pub const fn new() -> Self {
        Self {
            current: None,
            idle_task: None,
            algorithm: SchedAlgorithm::Priority,
            preemption_enabled: AtomicBool::new(true),
            lock_count: AtomicU64::new(0),
            cpu_id: 0, // Default to CPU 0, will be set per-CPU
            #[cfg(feature = "alloc")]
            cfs_queue: None,
        }
    }

    /// Initialize scheduler with idle task
    pub fn init(&mut self, idle_task: NonNull<Task>) {
        let idle_ptr = TaskPtr::new(idle_task);
        self.idle_task = Some(idle_ptr);
        self.current = Some(idle_ptr);

        #[cfg(feature = "alloc")]
        if self.algorithm == SchedAlgorithm::Cfs || self.algorithm == SchedAlgorithm::Hybrid {
            self.cfs_queue = Some(Mutex::new(CfsRunQueue::new()));
        }
    }

    /// Get current task
    pub fn current(&self) -> Option<NonNull<Task>> {
        self.current.map(|t| t.as_ptr())
    }

    /// Disable preemption
    pub fn disable_preemption(&self) {
        self.lock_count.fetch_add(1, Ordering::Acquire);
        self.preemption_enabled.store(false, Ordering::Release);
    }

    /// Enable preemption
    pub fn enable_preemption(&self) {
        let count = self.lock_count.fetch_sub(1, Ordering::Release);
        if count == 1 {
            self.preemption_enabled.store(true, Ordering::Release);
        }
    }
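
    // NOTE: preemption control nests via `lock_count`; preemption is only
    // re-enabled once every disable has been matched by an enable.
    // Illustrative sequence (hypothetical call sites):
    //
    //     sched.disable_preemption(); // count 0 -> 1, preemption off
    //     sched.disable_preemption(); // count 1 -> 2, still off
    //     sched.enable_preemption();  // count 2 -> 1, still off
    //     sched.enable_preemption();  // count 1 -> 0, preemption on again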

    /// Check if preemption is enabled
    pub fn is_preemptible(&self) -> bool {
        self.preemption_enabled.load(Ordering::Acquire)
    }

    /// Add task to ready queue
    pub fn enqueue(&self, task: NonNull<Task>) {
        // SAFETY: `task` is a valid NonNull<Task> provided by the caller
        // (scheduler.schedule, wake_up_process, etc.). We read task fields
        // (cpu_affinity, sched_class) to determine the correct queue. The
        // caller ensures the task is not concurrently modified.
        let task_ref = unsafe { task.as_ref() };

        // Check if task can run on this CPU
        if !task_ref.can_run_on(self.cpu_id) {
            // Task can't run on this CPU, try to find a suitable CPU
            self.enqueue_on_suitable_cpu(task);
            return;
        }

        // SAFETY: `task` is a valid NonNull<Task>. We update its state to
        // Ready before placing it in a queue. We hold the scheduler lock
        // (callers acquire it before calling enqueue), ensuring exclusive
        // access to the task's state field.
        unsafe {
            let task_mut = task.as_ptr();
            (*task_mut).state = ProcessState::Ready;
        }

        // Use per-CPU queue if available
        if let Some(cpu_data) = super::smp::per_cpu(self.cpu_id) {
            match self.algorithm {
                SchedAlgorithm::RoundRobin | SchedAlgorithm::Priority => {
                    cpu_data.cpu_info.ready_queue.lock().enqueue(task);
                    cpu_data.cpu_info.nr_running.fetch_add(1, Ordering::Relaxed);
                    cpu_data.cpu_info.update_load();
                }
                #[cfg(feature = "alloc")]
                SchedAlgorithm::Cfs => {
                    if let Some(ref cfs) = self.cfs_queue {
                        cfs.lock().enqueue(task);
                        cpu_data.cpu_info.nr_running.fetch_add(1, Ordering::Relaxed);
                        cpu_data.cpu_info.update_load();
                    }
                }
                #[cfg(feature = "alloc")]
                SchedAlgorithm::Hybrid => {
                    if task_ref.sched_class == SchedClass::RealTime {
                        cpu_data.cpu_info.ready_queue.lock().enqueue(task);
                    } else if let Some(ref cfs) = self.cfs_queue {
                        cfs.lock().enqueue(task);
                    }
                    cpu_data.cpu_info.nr_running.fetch_add(1, Ordering::Relaxed);
                    cpu_data.cpu_info.update_load();
                }
            }
        } else {
            // Fallback to global queue
            match self.algorithm {
                SchedAlgorithm::RoundRobin | SchedAlgorithm::Priority => {
                    #[cfg(not(target_arch = "riscv64"))]
                    READY_QUEUE.lock().enqueue(task);
                    #[cfg(target_arch = "riscv64")]
                    super::queue::get_ready_queue().enqueue(task);
                }
                #[cfg(feature = "alloc")]
                SchedAlgorithm::Cfs => {
                    if let Some(ref cfs) = self.cfs_queue {
                        cfs.lock().enqueue(task);
                    }
                }
                #[cfg(feature = "alloc")]
                SchedAlgorithm::Hybrid => {
                    if task_ref.sched_class == SchedClass::RealTime {
                        #[cfg(not(target_arch = "riscv64"))]
                        READY_QUEUE.lock().enqueue(task);
                        #[cfg(target_arch = "riscv64")]
                        super::queue::get_ready_queue().enqueue(task);
                    } else if let Some(ref cfs) = self.cfs_queue {
                        cfs.lock().enqueue(task);
                    }
                }
            }
        }
    }

    /// Enqueue task on a suitable CPU that matches its affinity
    fn enqueue_on_suitable_cpu(&self, task: NonNull<Task>) {
        // SAFETY: `task` is a valid NonNull<Task>. We read cpu_affinity and
        // tid fields to find a suitable CPU and log warnings. The task is not
        // concurrently modified because the caller holds the scheduler lock.
        let task_ref = unsafe { task.as_ref() };

        // Find first CPU that matches task affinity
        for cpu in 0..super::smp::MAX_CPUS as u8 {
            if task_ref.can_run_on(cpu) {
                // Schedule on that CPU
                super::scheduler::schedule_on_cpu(cpu, task);
                return;
            }
        }

        // No suitable CPU found, this is an error
        println!(
            "[SCHED] Warning: Task {} has no valid CPU affinity!",
            task_ref.tid
        );
    }

    /// Select next task to run
    pub fn pick_next(&self) -> Option<NonNull<Task>> {
        match self.algorithm {
            SchedAlgorithm::RoundRobin => self.pick_next_rr(),
            SchedAlgorithm::Priority => self.pick_next_priority(),
            #[cfg(feature = "alloc")]
            SchedAlgorithm::Cfs => self.pick_next_cfs(),
            #[cfg(feature = "alloc")]
            SchedAlgorithm::Hybrid => self.pick_next_hybrid(),
        }
    }

    /// Round-robin task selection
    fn pick_next_rr(&self) -> Option<NonNull<Task>> {
        let current_cpu = self.cpu_id;

        // Try per-CPU queue first
        if let Some(cpu_data) = super::smp::per_cpu(self.cpu_id) {
            let mut queue = cpu_data.cpu_info.ready_queue.lock();

            // Find a task that can run on this CPU
            while let Some(task_ptr) = queue.dequeue() {
                // SAFETY: task_ptr was just dequeued from the ready queue
                // where it was stored as a valid NonNull<Task>. We read
                // cpu_affinity via can_run_on to check compatibility. Tasks
                // in the queue are not deallocated while enqueued.
                unsafe {
                    let task = task_ptr.as_ref();
                    if task.can_run_on(current_cpu) {
                        cpu_data.cpu_info.nr_running.fetch_sub(1, Ordering::Relaxed);
                        cpu_data.cpu_info.update_load();
                        return Some(task_ptr);
                    }
                    // Task can't run on this CPU, re-queue it
                    queue.enqueue(task_ptr);
                }
            }
        } else {
            // Fallback to global queue
            #[cfg(not(target_arch = "riscv64"))]
            {
                let mut queue = READY_QUEUE.lock();

                while let Some(task_ptr) = queue.dequeue() {
                    // SAFETY: Same as above -- task_ptr is valid from the
                    // global ready queue.
                    unsafe {
                        let task = task_ptr.as_ref();
                        if task.can_run_on(current_cpu) {
                            return Some(task_ptr);
                        }
                        // Task can't run on this CPU, re-queue it
                        queue.enqueue(task_ptr);
                    }
                }
            }
            #[cfg(target_arch = "riscv64")]
            {
                let queue = super::queue::get_ready_queue();

                while let Some(task_ptr) = queue.dequeue() {
                    // SAFETY: Same as above -- task_ptr is valid from the
                    // RISC-V global ready queue.
                    unsafe {
                        let task = task_ptr.as_ref();
                        if task.can_run_on(current_cpu) {
                            return Some(task_ptr);
                        }
                        // Task can't run on this CPU, re-queue it
                        queue.enqueue(task_ptr);
                    }
                }
            }
        }

        // Work-stealing: try to steal from the busiest neighbor CPU
        {
            let mut best_cpu = None;
            let mut best_load = 0u32;
            for cpu in 0..super::smp::MAX_CPUS as u8 {
                if cpu == current_cpu {
                    continue;
                }
                if let Some(data) = super::smp::per_cpu(cpu) {
                    let load = data.cpu_info.nr_running.load(Ordering::Relaxed);
                    if load > best_load && load >= 2 {
                        best_load = load;
                        best_cpu = Some(cpu);
                    }
                }
            }

            if let Some(victim_cpu) = best_cpu {
                if let Some(victim_data) = super::smp::per_cpu(victim_cpu) {
                    let mut queue = victim_data.cpu_info.ready_queue.lock();
                    if let Some(task_ptr) = queue.dequeue() {
                        // SAFETY: task_ptr is valid from the victim's ready queue.
                        unsafe {
                            if task_ptr.as_ref().can_run_on(current_cpu) {
                                victim_data
                                    .cpu_info
                                    .nr_running
                                    .fetch_sub(1, Ordering::Relaxed);
                                return Some(task_ptr);
                            }
                            // Can't run here, put it back
                            queue.enqueue(task_ptr);
                        }
                    }
                }
            }
        }

        // No runnable task found, use idle task
        self.idle_task.map(|t| t.as_ptr())
    }

    /// Priority-based task selection
    fn pick_next_priority(&self) -> Option<NonNull<Task>> {
        let current_cpu = self.cpu_id;

        // Try per-CPU queue first
        if let Some(cpu_data) = super::smp::per_cpu(self.cpu_id) {
            let mut queue = cpu_data.cpu_info.ready_queue.lock();

            // The ReadyQueue already maintains priority order with bitmaps
            // Just dequeue the highest priority task that can run on this CPU
            let mut requeue_count = 0;
            let max_attempts = 10; // Prevent infinite loop

            while requeue_count < max_attempts {
                match queue.dequeue() {
                    Some(task_ptr) => {
                        // SAFETY: task_ptr was just dequeued from the per-CPU
                        // ready queue where it was stored as a valid
                        // NonNull<Task>. We read cpu_affinity via can_run_on.
                        unsafe {
                            let task = task_ptr.as_ref();
                            if task.can_run_on(current_cpu) {
                                cpu_data.cpu_info.nr_running.fetch_sub(1, Ordering::Relaxed);
                                cpu_data.cpu_info.update_load();
                                return Some(task_ptr);
                            } else {
                                // Task can't run on this CPU, re-queue it
                                queue.enqueue(task_ptr);
                                requeue_count += 1;
                            }
                        }
                    }
                    None => break, // No more tasks
                }
            }
        } else {
            // Fallback to global queue
            #[cfg(not(target_arch = "riscv64"))]
            {
                let mut queue = READY_QUEUE.lock();

                let mut requeue_count = 0;
                let max_attempts = 10;

                while requeue_count < max_attempts {
                    match queue.dequeue() {
                        // SAFETY: task_ptr is valid from the global ready queue.
                        Some(task_ptr) => unsafe {
                            let task = task_ptr.as_ref();
                            if task.can_run_on(current_cpu) {
                                return Some(task_ptr);
                            } else {
                                queue.enqueue(task_ptr);
                                requeue_count += 1;
                            }
                        },
                        None => break,
                    }
                }
            }
            #[cfg(target_arch = "riscv64")]
            {
                let queue = super::queue::get_ready_queue();

                let mut requeue_count = 0;
                let max_attempts = 10;

                while requeue_count < max_attempts {
                    match queue.dequeue() {
                        // SAFETY: task_ptr is valid from RISC-V ready queue.
                        Some(task_ptr) => unsafe {
                            let task = task_ptr.as_ref();
                            if task.can_run_on(current_cpu) {
                                return Some(task_ptr);
                            } else {
                                queue.enqueue(task_ptr);
                                requeue_count += 1;
                            }
                        },
                        None => break,
                    }
                }
            }
        }

        // No runnable task found, use idle task
        self.idle_task.map(|t| t.as_ptr())
    }

    /// CFS task selection
    #[cfg(feature = "alloc")]
    fn pick_next_cfs(&self) -> Option<NonNull<Task>> {
        let current_cpu = self.cpu_id;

        if let Some(ref cfs) = self.cfs_queue {
            let mut queue = cfs.lock();

            // Find task with lowest vruntime that can run on this CPU
            while let Some(task_ptr) = queue.dequeue() {
                // SAFETY: task_ptr is valid from the CFS run queue. We read
                // cpu_affinity to check if the task can run on this CPU.
                unsafe {
                    let task = task_ptr.as_ref();
                    if task.can_run_on(current_cpu) {
                        return Some(task_ptr);
                    }
                    // Task can't run on this CPU, re-queue it
                    queue.enqueue(task_ptr);
                }
            }
        }

        self.idle_task.map(|t| t.as_ptr())
    }

    /// Hybrid scheduler task selection
    #[cfg(feature = "alloc")]
    fn pick_next_hybrid(&self) -> Option<NonNull<Task>> {
        let current_cpu = self.cpu_id;

        // Check real-time tasks first
        {
            #[cfg(not(target_arch = "riscv64"))]
            {
                let mut queue = READY_QUEUE.lock();
                while let Some(task_ptr) = queue.dequeue() {
                    // SAFETY: task_ptr is valid from the global ready queue.
                    // We check affinity before returning it.
                    unsafe {
                        let task = task_ptr.as_ref();
                        if task.can_run_on(current_cpu) {
                            return Some(task_ptr);
                        }
                        // Task can't run on this CPU, re-queue it
                        queue.enqueue(task_ptr);
                    }
                }
            }
            #[cfg(target_arch = "riscv64")]
            {
                let queue = super::queue::get_ready_queue();
                while let Some(task_ptr) = queue.dequeue() {
                    // SAFETY: Same as above for RISC-V ready queue.
                    unsafe {
                        let task = task_ptr.as_ref();
                        if task.can_run_on(current_cpu) {
                            return Some(task_ptr);
                        }
                        // Task can't run on this CPU, re-queue it
                        queue.enqueue(task_ptr);
                    }
                }
            }
        }

        // Then check CFS queue
        if let Some(ref cfs) = self.cfs_queue {
            let mut queue = cfs.lock();
            while let Some(task_ptr) = queue.dequeue() {
                // SAFETY: task_ptr is valid from the CFS queue.
                unsafe {
                    let task = task_ptr.as_ref();
                    if task.can_run_on(current_cpu) {
                        return Some(task_ptr);
                    }
                    // Task can't run on this CPU, re-queue it
                    queue.enqueue(task_ptr);
                }
            }
        }

        self.idle_task.map(|t| t.as_ptr())
    }

    /// Update task runtime statistics
    pub fn update_runtime(&self, task: NonNull<Task>, runtime: u64) {
        // SAFETY: `task` is a valid NonNull<Task> passed by the caller. We
        // update statistics (runtime via atomic operations) and vruntime for
        // CFS scheduling. The caller holds the scheduler lock ensuring no
        // concurrent modification of the vruntime field.
        unsafe {
            let task_ref = task.as_ref();
            task_ref.update_runtime(runtime);

            // Update vruntime for CFS
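            // Worked example using the weights from priority_to_weight()
            // below: 1_000 units of runtime add 1_000 * 1024 / 100 = 10_240
            // vruntime for a SystemNormal task (weight 100), but 51_200 for a
            // UserNormal task (weight 20), so higher-weight tasks accrue
            // vruntime more slowly and receive correspondingly more CPU time.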
            if self.algorithm == SchedAlgorithm::Cfs
                || (self.algorithm == SchedAlgorithm::Hybrid
                    && task_ref.sched_class != SchedClass::RealTime)
            {
                let task_mut = task.as_ptr();
                let weight = priority_to_weight(task_ref.priority);
                (*task_mut).vruntime += runtime * 1024 / weight;
            }
        }
    }

    /// Handle timer tick
    pub fn tick(&mut self) {
        if let Some(current) = self.current {
            // SAFETY: `current` is a TaskPtr stored in the scheduler which
            // points to a valid Task. We are called from the timer interrupt
            // handler with the scheduler lock held. We decrement the time
            // slice and potentially trigger a reschedule if it expires.
            unsafe {
                let task_mut = current.as_raw();

                // Decrement time slice
                if (*task_mut).time_slice > 0 {
                    (*task_mut).time_slice -= 1;
                }

                // Check if time slice expired
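                // (With DEFAULT_TIME_SLICE = 10, a slice expires after 10
                // timer ticks, roughly 10 ms at a hypothetical 1 kHz tick
                // rate, at which point schedule() is invoked below.)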
                if (*task_mut).time_slice == 0 && self.is_preemptible() {
                    (*task_mut).time_slice = DEFAULT_TIME_SLICE;
                    self.schedule();
                }
            }
        }
    }

    /// Perform scheduling decision
    pub fn schedule(&mut self) {
        let start_cycles = super::metrics::read_tsc();

        // Disable interrupts
        let _guard = crate::arch::disable_interrupts();

        // Check if we can schedule
        if !self.is_preemptible() {
            return;
        }

        // Get current task
        let current = match self.current {
            Some(task) => task,
            None => {
                // No current task, pick one
                if let Some(next) = self.pick_next() {
                    self.switch_to(next);
                }
                super::metrics::SCHEDULER_METRICS
                    .record_scheduler_overhead(super::metrics::read_tsc() - start_cycles);
                return;
            }
        };

        // Check if current task should continue
        // SAFETY: `current` is a TaskPtr from self.current which points to a
        // valid Task. We hold the scheduler lock and interrupts are disabled,
        // so the task is not concurrently modified. We read its state and
        // compare with the next candidate task to decide on preemption.
        unsafe {
            let current_ref = current.as_ptr().as_ref();
            if current_ref.state == ProcessState::Running {
                // Task is still runnable, check if we should preempt
                if let Some(next) = self.pick_next() {
                    let next_ref = next.as_ref();

                    // Preempt if next task has higher priority
                    if should_preempt(current_ref, next_ref) {
                        // Re-queue current task
                        self.enqueue(current.as_ptr());
                        self.switch_to(next);
                        super::metrics::SCHEDULER_METRICS
                            .record_scheduler_overhead(super::metrics::read_tsc() - start_cycles);
                        return;
                    } else if next_ref.sched_class != SchedClass::Idle {
                        // Not preempting: re-queue the picked task, since
                        // pick_next() dequeued it and it would otherwise be
                        // lost from the ready queues.
                        self.enqueue(next);
                    }
                }
                // Current task continues
                super::metrics::SCHEDULER_METRICS
                    .record_scheduler_overhead(super::metrics::read_tsc() - start_cycles);
                return;
            }
        }

        // Current task is not runnable, must switch
        if let Some(next) = self.pick_next() {
            self.switch_to(next);
        }

        super::metrics::SCHEDULER_METRICS
            .record_scheduler_overhead(super::metrics::read_tsc() - start_cycles);
    }

    /// Switch to new task
    fn switch_to(&mut self, next: NonNull<Task>) {
        let next_ptr = TaskPtr::new(next);
        if self.current == Some(next_ptr) {
            return; // Already running
        }

        crate::perf::count_context_switch();
        let start_cycles = super::metrics::read_tsc();

        // Trace: record context switch
        // SAFETY: next is a valid NonNull<Task>.
        unsafe {
            let next_ref = next.as_ref();
            if let Some(current) = self.current {
                let current_ref = current.as_ptr().as_ref();
                crate::trace!(
                    crate::perf::trace::TraceEventType::SchedSwitchOut,
                    current_ref.pid.0,
                    current_ref.tid.0
                );
            }
            crate::trace!(
                crate::perf::trace::TraceEventType::SchedSwitchIn,
                next_ref.pid.0,
                next_ref.tid.0
            );
        }
        // SAFETY: If self.current is Some, its TaskPtr points to a valid Task.
        // We read the state field to determine if the context switch is
        // voluntary (task blocked/sleeping) or involuntary (preemption).
        // The scheduler lock is held ensuring exclusive access.
        let is_voluntary = unsafe {
            if let Some(current) = self.current {
                let current_ref = current.as_ptr().as_ref();
                current_ref.state == ProcessState::Blocked
                    || current_ref.state == ProcessState::Sleeping
            } else {
                false
            }
        };

        // SAFETY: Both `self.current` (if Some) and `next` point to valid
        // Tasks. We hold the scheduler lock and interrupts are disabled
        // (from schedule()), ensuring exclusive access. We update:
        // - current task: state to Ready, clear current_cpu
        // - next task: state to Running, set current_cpu, mark_scheduled
        // - Load context for the first task if no current task exists
        // The context pointers passed to load_context are derived from the
        // task's context field which is valid for the task's lifetime.
        unsafe {
            // Update states
            if let Some(current) = self.current {
                let current_raw = current.as_raw();
                if (*current_raw).state == ProcessState::Running {
                    (*current_raw).state = ProcessState::Ready;
                }
                (*current_raw).current_cpu = None;
            }

            let next_mut = next.as_ptr();
            let next_ref = next.as_ref();

            // Check if scheduling idle task
            if next_ref.sched_class == SchedClass::Idle {
                super::metrics::SCHEDULER_METRICS.record_idle_scheduled();
            }

            (*next_mut).state = ProcessState::Running;
            (*next_mut).current_cpu = Some(current_cpu());
            (*next_mut).mark_scheduled(current_cpu(), is_voluntary);

            // Update TSS kernel stack for Ring 3 -> Ring 0 transitions
            #[cfg(target_arch = "x86_64")]
            {
                let kernel_sp = (*next.as_ptr()).kernel_stack;
                if kernel_sp != 0 {
                    crate::arch::x86_64::gdt::set_kernel_stack(kernel_sp as u64);
                }
            }

            // Lazy TLB: skip CR3 reload when switching to kernel threads
            // (has_user_mappings == false). Kernel threads share the same
            // kernel page table mappings, so CR3 reload is unnecessary.
            // This saves ~100-300 cycles per kernel-to-kernel switch.
            if (*next.as_ptr()).has_user_mappings {
                let next_pt = (*next.as_ptr()).page_table;
                if next_pt != 0 {
                    // Only reload CR3 if switching to a different address space
                    let current_pt = if let Some(current) = self.current {
                        (*current.as_raw()).page_table
                    } else {
                        0
                    };
                    if next_pt != current_pt {
                        #[cfg(target_arch = "x86_64")]
                        {
                            // SAFETY: next_pt is a valid page table physical address
                            // from the Task struct, set during process creation.
                            // Already inside an unsafe block from the parent scope.
                            core::arch::asm!(
                                "mov cr3, {}",
                                in(reg) next_pt,
                                options(nostack, preserves_flags)
                            );
                        }
                    }
                }
            }

            // Perform context switch
            if let Some(current) = self.current {
                // Update current pointer BEFORE context_switch, since
                // context_switch saves current registers and jumps to
                // next's saved RIP. When this task resumes later,
                // execution continues after this point with the correct
                // self.current already set.
                self.current = Some(next_ptr);

                // Save current context and restore next context.
                // This call does NOT return until this task is scheduled
                // again -- at that point, execution resumes here.
                let current_raw = current.as_raw();
                context_switch(&mut (*current_raw).context, &(*next.as_ptr()).context);

                // Resumed: Record metrics and report RCU quiescent state.
                // A context switch is a quiescent point: no RCU read-side
                // references from before the switch can be held.
                crate::sync::rcu::rcu_quiescent();
                let switch_cycles = super::metrics::read_tsc() - start_cycles;
                super::metrics::SCHEDULER_METRICS
                    .record_context_switch(switch_cycles, is_voluntary);
                return;
            } else {
                // First task, load its context directly
                // This happens when scheduler starts with bootstrap task
                #[cfg(target_arch = "x86_64")]
                {
                    crate::arch::x86_64::idt::raw_serial_str(b"[SCHED] DISPATCH_FIRST pid=0x");
                    crate::arch::x86_64::idt::raw_serial_hex((*next.as_ptr()).pid.0);
                    crate::arch::x86_64::idt::raw_serial_str(b"\n");
                }
                match &(*next.as_ptr()).context {
                    #[cfg(target_arch = "x86_64")]
                    crate::sched::task::TaskContext::X86_64(ctx) => {
                        // SAFETY: ctx is a reference to a valid X86_64Context
                        // within the task. load_context restores CPU registers
                        // from this context structure.
                        crate::arch::x86_64::context::load_context(ctx as *const _);
                    }
                    #[cfg(target_arch = "aarch64")]
                    crate::sched::task::TaskContext::AArch64(ctx) => {
                        // SAFETY: ctx is a reference to a valid AArch64Context.
                        // load_context restores registers from it.
                        crate::arch::aarch64::context::load_context(ctx as *const _);
                    }
                    #[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))]
                    crate::sched::task::TaskContext::RiscV(ctx) => {
                        // SAFETY: ctx is a reference to a valid RiscVContext.
                        // load_context restores registers from it.
                        crate::arch::riscv::context::load_context(ctx as *const _);
                    }
                }
            }

            // Update current task
            self.current = Some(next_ptr);
        }

        // Record context switch metrics and RCU quiescent state.
        crate::sync::rcu::rcu_quiescent();
        let switch_cycles = super::metrics::read_tsc() - start_cycles;
        super::metrics::SCHEDULER_METRICS.record_context_switch(switch_cycles, is_voluntary);
    }
}

/// Check whether `next` should preempt the currently running task
pub fn should_preempt(current: &Task, next: &Task) -> bool {
    // The idle task is always preempted by any real task
    if current.sched_class == SchedClass::Idle && next.sched_class != SchedClass::Idle {
        return true;
    }

    // Real-time tasks always preempt non-real-time
    if next.sched_class == SchedClass::RealTime && current.sched_class != SchedClass::RealTime {
        return true;
    }

    // Within same class, check effective priority
    if next.sched_class == current.sched_class {
        let next_prio = next.effective_priority();
        let curr_prio = current.effective_priority();

        // For real-time tasks, always preempt if higher priority
        if current.sched_class == SchedClass::RealTime {
            return next_prio < curr_prio;
        }

        // For normal tasks, preempt only on strictly higher priority, or at
        // equal priority once the current task has used up its time slice
        if current.sched_class == SchedClass::Normal {
            return next_prio < curr_prio || (next_prio == curr_prio && current.time_slice == 0);
        }
    }

    false
}
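
// Examples of the policy above: a RealTime `next` preempts a Normal or Idle
// `current`; two Normal tasks at equal effective priority only switch once
// the running task's time_slice reaches 0 (see `Scheduler::tick`).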

/// Convert priority to weight for CFS
fn priority_to_weight(priority: Priority) -> u64 {
    match priority {
        Priority::SystemHigh => 200,
        Priority::SystemNormal => 100,
        Priority::UserHigh => 50,
        Priority::UserNormal => 20,
        Priority::UserLow => 10,
        Priority::Idle => 1,
        _ => 20, // Default for real-time (not used in CFS)
    }
}

/// Architecture-specific context switch
#[cfg(target_arch = "x86_64")]
fn context_switch(current: &mut super::task::TaskContext, next: &super::task::TaskContext) {
    use super::task::TaskContext;
    match (current, next) {
        // SAFETY: Both curr and next point to valid context structures within
        // their respective Tasks. context_switch saves curr's registers and
        // restores next's. The scheduler lock and disabled interrupts ensure
        // no concurrent access to these contexts.
        (TaskContext::X86_64(curr), TaskContext::X86_64(next)) => unsafe {
            crate::arch::x86_64::context::context_switch(curr as *mut _, next as *const _);
        },
    }
}

#[cfg(target_arch = "aarch64")]
fn context_switch(current: &mut super::task::TaskContext, next: &super::task::TaskContext) {
    use super::task::TaskContext;
    match (current, next) {
        // SAFETY: Same as x86_64 -- valid context structures, scheduler lock
        // held, interrupts disabled.
        (TaskContext::AArch64(curr), TaskContext::AArch64(next)) => unsafe {
            crate::arch::aarch64::context::context_switch(curr as *mut _, next as *const _);
        },
    }
}

#[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))]
fn context_switch(current: &mut super::task::TaskContext, next: &super::task::TaskContext) {
    use super::task::TaskContext;
    match (current, next) {
        // SAFETY: Same as x86_64 -- valid context structures, scheduler lock
        // held, interrupts disabled.
        (TaskContext::RiscV(curr), TaskContext::RiscV(next)) => unsafe {
            crate::arch::riscv::context::context_switch(curr as *mut _, next as *const _);
        },
    }
}

/// Load context for first task
#[cfg(target_arch = "x86_64")]
fn load_context(context: &super::task::TaskContext) {
    use crate::arch::x86_64::context::X86_64Context;

    match context {
        // SAFETY: ctx is a valid reference to an X86_64Context within the
        // bootstrap task. load_context restores CPU registers from this
        // structure. This is called once during scheduler startup.
        super::task::TaskContext::X86_64(ctx) => unsafe {
            crate::arch::x86_64::context::load_context(ctx as *const X86_64Context);
        },
    }
}

#[cfg(target_arch = "aarch64")]
fn load_context(context: &super::task::TaskContext) {
    use crate::arch::aarch64::context::AArch64Context;

    match context {
        // SAFETY: Same as x86_64 -- valid context reference for the bootstrap
        // task. load_context restores CPU registers.
        super::task::TaskContext::AArch64(ctx) => unsafe {
            crate::arch::aarch64::context::load_context(ctx as *const AArch64Context);
        },
        #[allow(unreachable_patterns)]
        _ => unreachable!("Invalid context type for AArch64"),
    }
}

#[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))]
fn load_context(context: &super::task::TaskContext) {
    use crate::arch::riscv::context::RiscVContext;

    match context {
        // SAFETY: Same as x86_64 -- valid context reference for the bootstrap
        // task. load_context restores CPU registers.
        super::task::TaskContext::RiscV(ctx) => unsafe {
            crate::arch::riscv::context::load_context(ctx as *const RiscVContext);
        },
        #[allow(unreachable_patterns)]
        _ => unreachable!("Invalid context type for RISC-V"),
    }
}

/// Get current CPU ID
fn current_cpu() -> u8 {
    super::smp::current_cpu_id()
}

impl Default for Scheduler {
    fn default() -> Self {
        Self::new()
    }
}

/// Default time slice, in timer ticks (decremented once per `Scheduler::tick`)
const DEFAULT_TIME_SLICE: u32 = 10;

/// Global scheduler instance (for BSP/CPU0)
pub(crate) static SCHEDULER: Mutex<Scheduler> = Mutex::new(Scheduler::new());

// ---- Global PID-to-Task registry for O(log n) lookup ----

#[cfg(feature = "alloc")]
use alloc::collections::BTreeMap;

/// Wrapper for NonNull<Task> that implements Send+Sync.
///
/// SAFETY: Task pointers in the registry are only accessed under the
/// TASK_REGISTRY mutex. Tasks are pinned in memory (Box::leak) and
/// outlive their registry entries. Cross-CPU access is serialized
/// by the mutex.
#[derive(Clone, Copy)]
struct SendTaskPtr(NonNull<Task>);
unsafe impl Send for SendTaskPtr {}
unsafe impl Sync for SendTaskPtr {}

/// Global PID-to-Task pointer registry.
///
/// Enables O(log n) task lookup by PID without holding the scheduler lock.
/// Tasks are registered on creation and unregistered on exit.
#[cfg(feature = "alloc")]
static TASK_REGISTRY: Mutex<Option<BTreeMap<u64, SendTaskPtr>>> = Mutex::new(None);

/// Register a task in the global PID-to-Task registry.
#[cfg(feature = "alloc")]
pub fn register_task(pid: u64, task: NonNull<Task>) {
    let mut registry = TASK_REGISTRY.lock();
    let map = registry.get_or_insert_with(BTreeMap::new);
    map.insert(pid, SendTaskPtr(task));
}

/// Unregister a task from the global PID-to-Task registry.
#[cfg(feature = "alloc")]
pub fn unregister_task(pid: u64) {
    let mut registry = TASK_REGISTRY.lock();
    if let Some(map) = registry.as_mut() {
        map.remove(&pid);
    }
}

/// Look up a task pointer by PID. O(log n) via BTreeMap.
///
/// Returns the NonNull<Task> if the PID is registered. This is used by the
/// IPC fast path for direct task-to-task message transfer without iterating
/// the scheduler's run queues.
#[cfg(feature = "alloc")]
pub fn get_task_ptr(pid: u64) -> Option<NonNull<Task>> {
    let registry = TASK_REGISTRY.lock();
    registry.as_ref().and_then(|map| map.get(&pid).map(|p| p.0))
}
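
// Hypothetical usage sketch (illustration only; `dest_pid` is an example
// variable): look the destination task up once, then operate on it without
// touching the run queues.
//
//     if let Some(dest) = get_task_ptr(dest_pid) {
//         // SAFETY: the pointer stays valid while the PID remains registered
//         // (tasks are leaked allocations, unregistered on exit).
//     }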

/// Get scheduler for current CPU
pub fn current_scheduler() -> &'static Mutex<Scheduler> {
    let cpu_id = current_cpu();

    // Try to get per-CPU scheduler
    if let Some(cpu_data) = super::smp::per_cpu(cpu_id) {
        // Return reference to per-CPU scheduler
        &cpu_data.cpu_info.scheduler
    } else {
        // Fallback to global scheduler for BSP
        &SCHEDULER
    }
}

/// Schedule on specific CPU
pub fn schedule_on_cpu(cpu_id: u8, task: NonNull<Task>) {
    // SAFETY: `task` is a valid NonNull<Task> provided by the caller. We
    // read the task's cpu_affinity and tid fields to verify it can run on
    // the specified CPU. The task is not concurrently modified because the
    // caller manages its lifecycle.
    unsafe {
        let task_ref = task.as_ref();
        if !task_ref.can_run_on(cpu_id) {
            println!(
                "[SCHED] Warning: Task {} cannot run on CPU {} due to affinity mask",
                task_ref.tid, cpu_id
            );
            return;
        }
    }

    if let Some(cpu_data) = super::smp::per_cpu(cpu_id) {
        // Add to per-CPU ready queue
        cpu_data.cpu_info.ready_queue.lock().enqueue(task);
        cpu_data.cpu_info.nr_running.fetch_add(1, Ordering::Relaxed);
        cpu_data.cpu_info.update_load();

        // Send IPI if needed
        if cpu_data.cpu_info.is_idle() {
            super::smp::send_ipi(cpu_id, 0); // Wake up CPU
        }
    }
}
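
// Hypothetical usage sketch (illustration only; `task_ptr` is an example
// NonNull<Task>): hand a freshly created task to CPU 1. If that CPU is
// currently idle, the IPI sent above wakes it.
//
//     schedule_on_cpu(1, task_ptr);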