veridian_kernel/sched/scheduler.rs
//! Core scheduler implementation
//!
//! Contains the `Scheduler` struct which implements the scheduling algorithms
//! (round-robin, priority, CFS, hybrid), task enqueueing/dequeueing, context
//! switching, and the global scheduler instance.
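//!
//! A minimal sketch of how this module is typically driven (illustrative only;
//! the exact call sites live in the timer-interrupt and SMP bring-up paths,
//! which are assumptions here and not shown in this file):
//!
//! ```ignore
//! // On each timer interrupt, tick the scheduler for the current CPU.
//! // `tick()` decrements the running task's time slice and calls
//! // `schedule()` once it expires.
//! let sched = current_scheduler();
//! sched.lock().tick();
//! ```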

#[cfg(feature = "alloc")]
extern crate alloc;

use core::{
    ptr::NonNull,
    sync::atomic::{AtomicBool, AtomicU64, Ordering},
};

use spin::Mutex;

#[cfg(not(target_arch = "riscv64"))]
use super::queue::READY_QUEUE;
use super::{
    queue::CfsRunQueue,
    task::{Priority, SchedClass, Task},
    task_ptr::TaskPtr,
    ProcessState,
};

/// Scheduler state
pub struct Scheduler {
    /// Currently running task
    pub current: Option<TaskPtr>,
    /// Idle task
    pub idle_task: Option<TaskPtr>,
    /// Scheduling algorithm
    pub algorithm: SchedAlgorithm,
    /// Preemption enabled
    pub preemption_enabled: AtomicBool,
    /// Scheduler lock count
    pub lock_count: AtomicU64,
    /// CPU ID this scheduler is running on
    pub cpu_id: u8,
    /// CFS run queue (if using CFS)
    #[cfg(feature = "alloc")]
    pub cfs_queue: Option<Mutex<CfsRunQueue>>,
}

/// Scheduling algorithm
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SchedAlgorithm {
    /// Simple round-robin
    RoundRobin,
    /// Priority-based
    Priority,
    /// Completely Fair Scheduler
    Cfs,
    /// Real-time + CFS hybrid
    Hybrid,
}

impl Scheduler {
    /// Create new scheduler
    pub const fn new() -> Self {
        Self {
            current: None,
            idle_task: None,
            algorithm: SchedAlgorithm::Priority,
            preemption_enabled: AtomicBool::new(true),
            lock_count: AtomicU64::new(0),
            cpu_id: 0, // Default to CPU 0, will be set per-CPU
            #[cfg(feature = "alloc")]
            cfs_queue: None,
        }
    }

    /// Initialize scheduler with idle task
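    ///
    /// A minimal usage sketch (illustrative only; `idle_task_ptr` stands in
    /// for a `NonNull<Task>` obtained from idle-task creation, which happens
    /// elsewhere and is an assumption here):
    ///
    /// ```ignore
    /// let mut sched = Scheduler::new();
    /// // The idle task becomes both `idle_task` and the initial `current`.
    /// sched.init(idle_task_ptr);
    /// ```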
    pub fn init(&mut self, idle_task: NonNull<Task>) {
        let idle_ptr = TaskPtr::new(idle_task);
        self.idle_task = Some(idle_ptr);
        self.current = Some(idle_ptr);

        #[cfg(feature = "alloc")]
        if self.algorithm == SchedAlgorithm::Cfs || self.algorithm == SchedAlgorithm::Hybrid {
            self.cfs_queue = Some(Mutex::new(CfsRunQueue::new()));
        }
    }

    /// Get current task
    pub fn current(&self) -> Option<NonNull<Task>> {
        self.current.map(|t| t.as_ptr())
    }

    /// Disable preemption
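    ///
    /// Preemption disabling nests: each call increments `lock_count`, and
    /// preemption is re-enabled only when a matching number of
    /// `enable_preemption` calls brings the count back to zero. A minimal
    /// sketch (illustrative only):
    ///
    /// ```ignore
    /// sched.disable_preemption(); // count = 1, preemption off
    /// sched.disable_preemption(); // count = 2, still off
    /// sched.enable_preemption();  // count = 1, still off
    /// sched.enable_preemption();  // count = 0, preemption back on
    /// ```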
    pub fn disable_preemption(&self) {
        self.lock_count.fetch_add(1, Ordering::Acquire);
        self.preemption_enabled.store(false, Ordering::Release);
    }

    /// Enable preemption
    pub fn enable_preemption(&self) {
        let count = self.lock_count.fetch_sub(1, Ordering::Release);
        if count == 1 {
            self.preemption_enabled.store(true, Ordering::Release);
        }
    }

    /// Check if preemption is enabled
    pub fn is_preemptible(&self) -> bool {
        self.preemption_enabled.load(Ordering::Acquire)
    }

    /// Add task to ready queue
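    ///
    /// The task is marked `Ready` and placed on this CPU's per-CPU ready
    /// queue when per-CPU data is available, falling back to the global
    /// queue otherwise. Under the hybrid algorithm, real-time tasks go to
    /// the priority ready queue and all other tasks to the CFS queue. Tasks
    /// whose affinity mask excludes this CPU are forwarded to a suitable CPU
    /// via `enqueue_on_suitable_cpu`.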
    pub fn enqueue(&self, task: NonNull<Task>) {
        // SAFETY: `task` is a valid NonNull<Task> provided by the caller
        // (scheduler.schedule, wake_up_process, etc.). We read task fields
        // (cpu_affinity, sched_class) to determine the correct queue. The
        // caller ensures the task is not concurrently modified.
        let task_ref = unsafe { task.as_ref() };

        // Check if task can run on this CPU
        if !task_ref.can_run_on(self.cpu_id) {
            // Task can't run on this CPU, try to find a suitable CPU
            self.enqueue_on_suitable_cpu(task);
            return;
        }

        // SAFETY: `task` is a valid NonNull<Task>. We update its state to
        // Ready before placing it in a queue. We hold the scheduler lock
        // (callers acquire it before calling enqueue), ensuring exclusive
        // access to the task's state field.
        unsafe {
            let task_mut = task.as_ptr();
            (*task_mut).state = ProcessState::Ready;
        }

        // Use per-CPU queue if available
        if let Some(cpu_data) = super::smp::per_cpu(self.cpu_id) {
            match self.algorithm {
                SchedAlgorithm::RoundRobin | SchedAlgorithm::Priority => {
                    cpu_data.cpu_info.ready_queue.lock().enqueue(task);
                    cpu_data.cpu_info.nr_running.fetch_add(1, Ordering::Relaxed);
                    cpu_data.cpu_info.update_load();
                }
                #[cfg(feature = "alloc")]
                SchedAlgorithm::Cfs => {
                    if let Some(ref cfs) = self.cfs_queue {
                        cfs.lock().enqueue(task);
                        cpu_data.cpu_info.nr_running.fetch_add(1, Ordering::Relaxed);
                        cpu_data.cpu_info.update_load();
                    }
                }
                #[cfg(feature = "alloc")]
                SchedAlgorithm::Hybrid => {
                    if task_ref.sched_class == SchedClass::RealTime {
                        cpu_data.cpu_info.ready_queue.lock().enqueue(task);
                    } else if let Some(ref cfs) = self.cfs_queue {
                        cfs.lock().enqueue(task);
                    }
                    cpu_data.cpu_info.nr_running.fetch_add(1, Ordering::Relaxed);
                    cpu_data.cpu_info.update_load();
                }
            }
        } else {
            // Fallback to global queue
            match self.algorithm {
                SchedAlgorithm::RoundRobin | SchedAlgorithm::Priority => {
                    #[cfg(not(target_arch = "riscv64"))]
                    READY_QUEUE.lock().enqueue(task);
                    #[cfg(target_arch = "riscv64")]
                    super::queue::get_ready_queue().enqueue(task);
                }
                #[cfg(feature = "alloc")]
                SchedAlgorithm::Cfs => {
                    if let Some(ref cfs) = self.cfs_queue {
                        cfs.lock().enqueue(task);
                    }
                }
                #[cfg(feature = "alloc")]
                SchedAlgorithm::Hybrid => {
                    if task_ref.sched_class == SchedClass::RealTime {
                        #[cfg(not(target_arch = "riscv64"))]
                        READY_QUEUE.lock().enqueue(task);
                        #[cfg(target_arch = "riscv64")]
                        super::queue::get_ready_queue().enqueue(task);
                    } else if let Some(ref cfs) = self.cfs_queue {
                        cfs.lock().enqueue(task);
                    }
                }
            }
        }
    }

    /// Enqueue task on a suitable CPU that matches its affinity
    fn enqueue_on_suitable_cpu(&self, task: NonNull<Task>) {
        // SAFETY: `task` is a valid NonNull<Task>. We read cpu_affinity and
        // tid fields to find a suitable CPU and log warnings. The task is not
        // concurrently modified because the caller holds the scheduler lock.
        let task_ref = unsafe { task.as_ref() };

        // Find first CPU that matches task affinity
        for cpu in 0..super::smp::MAX_CPUS as u8 {
            if task_ref.can_run_on(cpu) {
                // Schedule on that CPU
                super::scheduler::schedule_on_cpu(cpu, task);
                return;
            }
        }

        // No suitable CPU matches the affinity mask; log a warning
        println!(
            "[SCHED] Warning: Task {} has no valid CPU affinity!",
            task_ref.tid
        );
    }

    /// Select next task to run
    pub fn pick_next(&self) -> Option<NonNull<Task>> {
        match self.algorithm {
            SchedAlgorithm::RoundRobin => self.pick_next_rr(),
            SchedAlgorithm::Priority => self.pick_next_priority(),
            #[cfg(feature = "alloc")]
            SchedAlgorithm::Cfs => self.pick_next_cfs(),
            #[cfg(feature = "alloc")]
            SchedAlgorithm::Hybrid => self.pick_next_hybrid(),
        }
    }

    /// Round-robin task selection
    fn pick_next_rr(&self) -> Option<NonNull<Task>> {
        let current_cpu = self.cpu_id;

        // Try per-CPU queue first
        if let Some(cpu_data) = super::smp::per_cpu(self.cpu_id) {
            let mut queue = cpu_data.cpu_info.ready_queue.lock();

            // Find a task that can run on this CPU
            while let Some(task_ptr) = queue.dequeue() {
                // SAFETY: task_ptr was just dequeued from the ready queue
                // where it was stored as a valid NonNull<Task>. We read
                // cpu_affinity via can_run_on to check compatibility. Tasks
                // in the queue are not deallocated while enqueued.
                unsafe {
                    let task = task_ptr.as_ref();
                    if task.can_run_on(current_cpu) {
                        cpu_data.cpu_info.nr_running.fetch_sub(1, Ordering::Relaxed);
                        cpu_data.cpu_info.update_load();
                        return Some(task_ptr);
                    }
                    // Task can't run on this CPU, re-queue it
                    queue.enqueue(task_ptr);
                }
            }
        } else {
            // Fallback to global queue
            #[cfg(not(target_arch = "riscv64"))]
            {
                let mut queue = READY_QUEUE.lock();

                while let Some(task_ptr) = queue.dequeue() {
                    // SAFETY: Same as above -- task_ptr is valid from the
                    // global ready queue.
                    unsafe {
                        let task = task_ptr.as_ref();
                        if task.can_run_on(current_cpu) {
                            return Some(task_ptr);
                        }
                        // Task can't run on this CPU, re-queue it
                        queue.enqueue(task_ptr);
                    }
                }
            }
            #[cfg(target_arch = "riscv64")]
            {
                let queue = super::queue::get_ready_queue();

                while let Some(task_ptr) = queue.dequeue() {
                    // SAFETY: Same as above -- task_ptr is valid from the
                    // RISC-V global ready queue.
                    unsafe {
                        let task = task_ptr.as_ref();
                        if task.can_run_on(current_cpu) {
                            return Some(task_ptr);
                        }
                        // Task can't run on this CPU, re-queue it
                        queue.enqueue(task_ptr);
                    }
                }
            }
        }

        // Work-stealing: try to steal from the busiest neighbor CPU
        {
            let mut best_cpu = None;
            let mut best_load = 0u32;
            for cpu in 0..super::smp::MAX_CPUS as u8 {
                if cpu == current_cpu {
                    continue;
                }
                if let Some(data) = super::smp::per_cpu(cpu) {
                    let load = data.cpu_info.nr_running.load(Ordering::Relaxed);
                    if load > best_load && load >= 2 {
                        best_load = load;
                        best_cpu = Some(cpu);
                    }
                }
            }

            if let Some(victim_cpu) = best_cpu {
                if let Some(victim_data) = super::smp::per_cpu(victim_cpu) {
                    let mut queue = victim_data.cpu_info.ready_queue.lock();
                    if let Some(task_ptr) = queue.dequeue() {
                        // SAFETY: task_ptr is valid from the victim's ready queue.
                        unsafe {
                            if task_ptr.as_ref().can_run_on(current_cpu) {
                                victim_data
                                    .cpu_info
                                    .nr_running
                                    .fetch_sub(1, Ordering::Relaxed);
                                return Some(task_ptr);
                            }
                            // Can't run here, put it back
                            queue.enqueue(task_ptr);
                        }
                    }
                }
            }
        }

        // No runnable task found, use idle task
        self.idle_task.map(|t| t.as_ptr())
    }

    /// Priority-based task selection
    fn pick_next_priority(&self) -> Option<NonNull<Task>> {
        let current_cpu = self.cpu_id;

        // Try per-CPU queue first
        if let Some(cpu_data) = super::smp::per_cpu(self.cpu_id) {
            let mut queue = cpu_data.cpu_info.ready_queue.lock();

            // The ReadyQueue already maintains priority order with bitmaps.
            // Just dequeue the highest priority task that can run on this CPU.
            let mut requeue_count = 0;
            let max_attempts = 10; // Prevent infinite loop

            while requeue_count < max_attempts {
                match queue.dequeue() {
                    Some(task_ptr) => {
                        // SAFETY: task_ptr was just dequeued from the per-CPU
                        // ready queue where it was stored as a valid
                        // NonNull<Task>. We read cpu_affinity via can_run_on.
                        unsafe {
                            let task = task_ptr.as_ref();
                            if task.can_run_on(current_cpu) {
                                cpu_data.cpu_info.nr_running.fetch_sub(1, Ordering::Relaxed);
                                cpu_data.cpu_info.update_load();
                                return Some(task_ptr);
                            } else {
                                // Task can't run on this CPU, re-queue it
                                queue.enqueue(task_ptr);
                                requeue_count += 1;
                            }
                        }
                    }
                    None => break, // No more tasks
                }
            }
        } else {
            // Fallback to global queue
            #[cfg(not(target_arch = "riscv64"))]
            {
                let mut queue = READY_QUEUE.lock();

                let mut requeue_count = 0;
                let max_attempts = 10;

                while requeue_count < max_attempts {
                    match queue.dequeue() {
                        // SAFETY: task_ptr is valid from the global ready queue.
                        Some(task_ptr) => unsafe {
                            let task = task_ptr.as_ref();
                            if task.can_run_on(current_cpu) {
                                return Some(task_ptr);
                            } else {
                                queue.enqueue(task_ptr);
                                requeue_count += 1;
                            }
                        },
                        None => break,
                    }
                }
            }
            #[cfg(target_arch = "riscv64")]
            {
                let queue = super::queue::get_ready_queue();

                let mut requeue_count = 0;
                let max_attempts = 10;

                while requeue_count < max_attempts {
                    match queue.dequeue() {
                        // SAFETY: task_ptr is valid from the RISC-V ready queue.
                        Some(task_ptr) => unsafe {
                            let task = task_ptr.as_ref();
                            if task.can_run_on(current_cpu) {
                                return Some(task_ptr);
                            } else {
                                queue.enqueue(task_ptr);
                                requeue_count += 1;
                            }
                        },
                        None => break,
                    }
                }
            }
        }

        // No runnable task found, use idle task
        self.idle_task.map(|t| t.as_ptr())
    }

    /// CFS task selection
    #[cfg(feature = "alloc")]
    fn pick_next_cfs(&self) -> Option<NonNull<Task>> {
        let current_cpu = self.cpu_id;

        if let Some(ref cfs) = self.cfs_queue {
            let mut queue = cfs.lock();

            // Find task with lowest vruntime that can run on this CPU
            while let Some(task_ptr) = queue.dequeue() {
                // SAFETY: task_ptr is valid from the CFS run queue. We read
                // cpu_affinity to check if the task can run on this CPU.
                unsafe {
                    let task = task_ptr.as_ref();
                    if task.can_run_on(current_cpu) {
                        return Some(task_ptr);
                    }
                    // Task can't run on this CPU, re-queue it
                    queue.enqueue(task_ptr);
                }
            }
        }

        self.idle_task.map(|t| t.as_ptr())
    }

    /// Hybrid scheduler task selection
    #[cfg(feature = "alloc")]
    fn pick_next_hybrid(&self) -> Option<NonNull<Task>> {
        let current_cpu = self.cpu_id;

        // Check real-time tasks first
        {
            #[cfg(not(target_arch = "riscv64"))]
            {
                let mut queue = READY_QUEUE.lock();
                while let Some(task_ptr) = queue.dequeue() {
                    // SAFETY: task_ptr is valid from the global ready queue.
                    // We check affinity before returning it.
                    unsafe {
                        let task = task_ptr.as_ref();
                        if task.can_run_on(current_cpu) {
                            return Some(task_ptr);
                        }
                        // Task can't run on this CPU, re-queue it
                        queue.enqueue(task_ptr);
                    }
                }
            }
            #[cfg(target_arch = "riscv64")]
            {
                let queue = super::queue::get_ready_queue();
                while let Some(task_ptr) = queue.dequeue() {
                    // SAFETY: Same as above for the RISC-V ready queue.
                    unsafe {
                        let task = task_ptr.as_ref();
                        if task.can_run_on(current_cpu) {
                            return Some(task_ptr);
                        }
                        // Task can't run on this CPU, re-queue it
                        queue.enqueue(task_ptr);
                    }
                }
            }
        }

        // Then check CFS queue
        if let Some(ref cfs) = self.cfs_queue {
            let mut queue = cfs.lock();
            while let Some(task_ptr) = queue.dequeue() {
                // SAFETY: task_ptr is valid from the CFS queue.
                unsafe {
                    let task = task_ptr.as_ref();
                    if task.can_run_on(current_cpu) {
                        return Some(task_ptr);
                    }
                    // Task can't run on this CPU, re-queue it
                    queue.enqueue(task_ptr);
                }
            }
        }

        self.idle_task.map(|t| t.as_ptr())
    }

    /// Update task runtime statistics
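    ///
    /// For CFS (and non-real-time tasks under the hybrid algorithm), virtual
    /// runtime grows inversely to the task's weight:
    /// `vruntime += runtime * 1024 / weight`. As a worked example (units are
    /// whatever the caller passes as `runtime`): with `runtime = 10_000` and
    /// a `UserNormal` weight of 20, vruntime advances by 512_000, while a
    /// `SystemNormal` task (weight 100) advances by only 102_400 for the same
    /// runtime, so it is favored by the lowest-vruntime pick.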
    pub fn update_runtime(&self, task: NonNull<Task>, runtime: u64) {
        // SAFETY: `task` is a valid NonNull<Task> passed by the caller. We
        // update statistics (runtime via atomic operations) and vruntime for
        // CFS scheduling. The caller holds the scheduler lock ensuring no
        // concurrent modification of the vruntime field.
        unsafe {
            let task_ref = task.as_ref();
            task_ref.update_runtime(runtime);

            // Update vruntime for CFS
            if self.algorithm == SchedAlgorithm::Cfs
                || (self.algorithm == SchedAlgorithm::Hybrid
                    && task_ref.sched_class != SchedClass::RealTime)
            {
                let task_mut = task.as_ptr();
                let weight = priority_to_weight(task_ref.priority);
                (*task_mut).vruntime += runtime * 1024 / weight;
            }
        }
    }

    /// Handle timer tick
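    ///
    /// Called from the timer interrupt path: decrements the current task's
    /// time slice and, once it reaches zero (and preemption is enabled),
    /// refills it with `DEFAULT_TIME_SLICE` and invokes `schedule()`.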
    pub fn tick(&mut self) {
        if let Some(current) = self.current {
            // SAFETY: `current` is a TaskPtr stored in the scheduler which
            // points to a valid Task. We are called from the timer interrupt
            // handler with the scheduler lock held. We decrement the time
            // slice and potentially trigger a reschedule if it expires.
            unsafe {
                let task_mut = current.as_raw();

                // Decrement time slice
                if (*task_mut).time_slice > 0 {
                    (*task_mut).time_slice -= 1;
                }

                // Check if time slice expired
                if (*task_mut).time_slice == 0 && self.is_preemptible() {
                    (*task_mut).time_slice = DEFAULT_TIME_SLICE;
                    self.schedule();
                }
            }
        }
    }

    /// Perform scheduling decision
    pub fn schedule(&mut self) {
        let start_cycles = super::metrics::read_tsc();

        // Disable interrupts
        let _guard = crate::arch::disable_interrupts();

        // Check if we can schedule
        if !self.is_preemptible() {
            return;
        }

        // Get current task
        let current = match self.current {
            Some(task) => task,
            None => {
                // No current task, pick one
                if let Some(next) = self.pick_next() {
                    self.switch_to(next);
                }
                super::metrics::SCHEDULER_METRICS
                    .record_scheduler_overhead(super::metrics::read_tsc() - start_cycles);
                return;
            }
        };

        // Check if current task should continue
        // SAFETY: `current` is a TaskPtr from self.current which points to a
        // valid Task. We hold the scheduler lock and interrupts are disabled,
        // so the task is not concurrently modified. We read its state and
        // compare with the next candidate task to decide on preemption.
        unsafe {
            let current_ref = current.as_ptr().as_ref();
            if current_ref.state == ProcessState::Running {
                // Task is still runnable, check if we should preempt
                if let Some(next) = self.pick_next() {
                    let next_ref = next.as_ref();

                    // Preempt if next task has higher priority
                    if should_preempt(current_ref, next_ref) {
                        // Re-queue current task
                        self.enqueue(current.as_ptr());
                        self.switch_to(next);
                        super::metrics::SCHEDULER_METRICS
                            .record_scheduler_overhead(super::metrics::read_tsc() - start_cycles);
                        return;
                    }
                }
                // Current task continues
                super::metrics::SCHEDULER_METRICS
                    .record_scheduler_overhead(super::metrics::read_tsc() - start_cycles);
                return;
            }
        }

        // Current task is not runnable, must switch
        if let Some(next) = self.pick_next() {
            self.switch_to(next);
        }

        super::metrics::SCHEDULER_METRICS
            .record_scheduler_overhead(super::metrics::read_tsc() - start_cycles);
    }

    /// Switch to new task
    fn switch_to(&mut self, next: NonNull<Task>) {
        let next_ptr = TaskPtr::new(next);
        if self.current == Some(next_ptr) {
            return; // Already running
        }

        crate::perf::count_context_switch();
        let start_cycles = super::metrics::read_tsc();

        // Trace: record context switch
        // SAFETY: next is a valid NonNull<Task>.
        unsafe {
            let next_ref = next.as_ref();
            if let Some(current) = self.current {
                let current_ref = current.as_ptr().as_ref();
                crate::trace!(
                    crate::perf::trace::TraceEventType::SchedSwitchOut,
                    current_ref.pid.0,
                    current_ref.tid.0
                );
            }
            crate::trace!(
                crate::perf::trace::TraceEventType::SchedSwitchIn,
                next_ref.pid.0,
                next_ref.tid.0
            );
        }
        // SAFETY: If self.current is Some, its TaskPtr points to a valid Task.
        // We read the state field to determine if the context switch is
        // voluntary (task blocked/sleeping) or involuntary (preemption).
        // The scheduler lock is held ensuring exclusive access.
        let is_voluntary = unsafe {
            if let Some(current) = self.current {
                let current_ref = current.as_ptr().as_ref();
                current_ref.state == ProcessState::Blocked
                    || current_ref.state == ProcessState::Sleeping
            } else {
                false
            }
        };

        // SAFETY: Both `self.current` (if Some) and `next` point to valid
        // Tasks. We hold the scheduler lock and interrupts are disabled
        // (from schedule()), ensuring exclusive access. We update:
        // - current task: state to Ready, clear current_cpu
        // - next task: state to Running, set current_cpu, mark_scheduled
        // - Load context for the first task if no current task exists
        // The context pointers passed to load_context are derived from the
        // task's context field which is valid for the task's lifetime.
        unsafe {
            // Update states
            if let Some(current) = self.current {
                let current_raw = current.as_raw();
                if (*current_raw).state == ProcessState::Running {
                    (*current_raw).state = ProcessState::Ready;
                }
                (*current_raw).current_cpu = None;
            }

            let next_mut = next.as_ptr();
            let next_ref = next.as_ref();

            // Check if scheduling idle task
            if next_ref.sched_class == SchedClass::Idle {
                super::metrics::SCHEDULER_METRICS.record_idle_scheduled();
            }

            (*next_mut).state = ProcessState::Running;
            (*next_mut).current_cpu = Some(current_cpu());
            (*next_mut).mark_scheduled(current_cpu(), is_voluntary);

            // Update TSS kernel stack for Ring 3 -> Ring 0 transitions
            #[cfg(target_arch = "x86_64")]
            {
                let kernel_sp = (*next.as_ptr()).kernel_stack;
                if kernel_sp != 0 {
                    crate::arch::x86_64::gdt::set_kernel_stack(kernel_sp as u64);
                }
            }

            // Lazy TLB: skip CR3 reload when switching to kernel threads
            // (has_user_mappings == false). Kernel threads share the same
            // kernel page table mappings, so CR3 reload is unnecessary.
            // This saves ~100-300 cycles per kernel-to-kernel switch.
            if (*next.as_ptr()).has_user_mappings {
                let next_pt = (*next.as_ptr()).page_table;
                if next_pt != 0 {
                    // Only reload CR3 if switching to a different address space
                    let current_pt = if let Some(current) = self.current {
                        (*current.as_raw()).page_table
                    } else {
                        0
                    };
                    if next_pt != current_pt {
                        #[cfg(target_arch = "x86_64")]
                        {
                            // SAFETY: next_pt is a valid page table physical address
                            // from the Task struct, set during process creation.
                            // Already inside an unsafe block from the parent scope.
                            core::arch::asm!(
                                "mov cr3, {}",
                                in(reg) next_pt,
                                options(nostack, preserves_flags)
                            );
                        }
                    }
                }
            }

            // Perform context switch
            if let Some(current) = self.current {
                // Update current pointer BEFORE context_switch, since
                // context_switch saves current registers and jumps to
                // next's saved RIP. When this task resumes later,
                // execution continues after this point with the correct
                // self.current already set.
                self.current = Some(next_ptr);

                // Save current context and restore next context.
                // This call does NOT return until this task is scheduled
                // again -- at that point, execution resumes here.
                let current_raw = current.as_raw();
                context_switch(&mut (*current_raw).context, &(*next.as_ptr()).context);

                // Resumed: Record metrics and report RCU quiescent state.
                // A context switch is a quiescent point: no RCU read-side
                // references from before the switch can be held.
                crate::sync::rcu::rcu_quiescent();
                let switch_cycles = super::metrics::read_tsc() - start_cycles;
                super::metrics::SCHEDULER_METRICS
                    .record_context_switch(switch_cycles, is_voluntary);
                return;
            } else {
                // First task, load its context directly
                // This happens when scheduler starts with bootstrap task
                #[cfg(target_arch = "x86_64")]
                {
                    crate::arch::x86_64::idt::raw_serial_str(b"[SCHED] DISPATCH_FIRST pid=0x");
                    crate::arch::x86_64::idt::raw_serial_hex((*next.as_ptr()).pid.0);
                    crate::arch::x86_64::idt::raw_serial_str(b"\n");
                }
                match &(*next.as_ptr()).context {
                    #[cfg(target_arch = "x86_64")]
                    crate::sched::task::TaskContext::X86_64(ctx) => {
                        // SAFETY: ctx is a reference to a valid X86_64Context
                        // within the task. load_context restores CPU registers
                        // from this context structure.
                        crate::arch::x86_64::context::load_context(ctx as *const _);
                    }
                    #[cfg(target_arch = "aarch64")]
                    crate::sched::task::TaskContext::AArch64(ctx) => {
                        // SAFETY: ctx is a reference to a valid AArch64Context.
                        // load_context restores registers from it.
                        crate::arch::aarch64::context::load_context(ctx as *const _);
                    }
                    #[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))]
                    crate::sched::task::TaskContext::RiscV(ctx) => {
                        // SAFETY: ctx is a reference to a valid RiscVContext.
                        // load_context restores registers from it.
                        crate::arch::riscv::context::load_context(ctx as *const _);
                    }
                }
            }

            // Update current task
            self.current = Some(next_ptr);
        }

        // Record context switch metrics and RCU quiescent state.
        crate::sync::rcu::rcu_quiescent();
        let switch_cycles = super::metrics::read_tsc() - start_cycles;
        super::metrics::SCHEDULER_METRICS.record_context_switch(switch_cycles, is_voluntary);
    }
}

/// Check whether `next` should preempt the current task
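///
/// Lower numeric values from `effective_priority()` mean higher priority
/// here. A few illustrative outcomes of the rules below (a sketch, not an
/// exhaustive table; `idle_task`, `normal_task`, `rt_task`, and `peer_task`
/// are placeholder `Task` values of the corresponding scheduling classes):
///
/// ```ignore
/// // Idle current task: any real task preempts it.
/// assert!(should_preempt(&idle_task, &normal_task));
/// // A real-time task preempts a normal task.
/// assert!(should_preempt(&normal_task, &rt_task));
/// // Equal-priority normal peers: preempt only once the time slice is spent.
/// assert_eq!(should_preempt(&normal_task, &peer_task), normal_task.time_slice == 0);
/// ```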
pub fn should_preempt(current: &Task, next: &Task) -> bool {
    // The idle task is always preempted by any real task
    if current.sched_class == SchedClass::Idle && next.sched_class != SchedClass::Idle {
        return true;
    }

    // Real-time tasks always preempt non-real-time
    if next.sched_class == SchedClass::RealTime && current.sched_class != SchedClass::RealTime {
        return true;
    }

    // Within same class, check effective priority
    if next.sched_class == current.sched_class {
        let next_prio = next.effective_priority();
        let curr_prio = current.effective_priority();

        // For real-time tasks, always preempt if higher priority
        if current.sched_class == SchedClass::RealTime {
            return next_prio < curr_prio;
        }

        // For normal tasks, preempt if strictly higher priority, or at equal
        // priority once the current task's time slice is exhausted
        if current.sched_class == SchedClass::Normal {
            return next_prio < curr_prio || (next_prio == curr_prio && current.time_slice == 0);
        }
    }

    false
}

/// Convert priority to weight for CFS
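///
/// Higher-weight tasks accumulate vruntime more slowly (see
/// `Scheduler::update_runtime`), so they receive a proportionally larger
/// share of CPU time; e.g. a `SystemNormal` task (weight 100) advances its
/// vruntime at one fifth the rate of a `UserNormal` task (weight 20).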
fn priority_to_weight(priority: Priority) -> u64 {
    match priority {
        Priority::SystemHigh => 200,
        Priority::SystemNormal => 100,
        Priority::UserHigh => 50,
        Priority::UserNormal => 20,
        Priority::UserLow => 10,
        Priority::Idle => 1,
        _ => 20, // Default for real-time (not used in CFS)
    }
}

/// Architecture-specific context switch
#[cfg(target_arch = "x86_64")]
fn context_switch(current: &mut super::task::TaskContext, next: &super::task::TaskContext) {
    use super::task::TaskContext;
    match (current, next) {
        // SAFETY: Both curr and next point to valid context structures within
        // their respective Tasks. context_switch saves curr's registers and
        // restores next's. The scheduler lock and disabled interrupts ensure
        // no concurrent access to these contexts.
        (TaskContext::X86_64(curr), TaskContext::X86_64(next)) => unsafe {
            crate::arch::x86_64::context::context_switch(curr as *mut _, next as *const _);
        },
    }
}

#[cfg(target_arch = "aarch64")]
fn context_switch(current: &mut super::task::TaskContext, next: &super::task::TaskContext) {
    use super::task::TaskContext;
    match (current, next) {
        // SAFETY: Same as x86_64 -- valid context structures, scheduler lock
        // held, interrupts disabled.
        (TaskContext::AArch64(curr), TaskContext::AArch64(next)) => unsafe {
            crate::arch::aarch64::context::context_switch(curr as *mut _, next as *const _);
        },
    }
}

#[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))]
fn context_switch(current: &mut super::task::TaskContext, next: &super::task::TaskContext) {
    use super::task::TaskContext;
    match (current, next) {
        // SAFETY: Same as x86_64 -- valid context structures, scheduler lock
        // held, interrupts disabled.
        (TaskContext::RiscV(curr), TaskContext::RiscV(next)) => unsafe {
            crate::arch::riscv::context::context_switch(curr as *mut _, next as *const _);
        },
    }
}

/// Load context for first task
#[cfg(target_arch = "x86_64")]
fn load_context(context: &super::task::TaskContext) {
    use crate::arch::x86_64::context::X86_64Context;

    match context {
        // SAFETY: ctx is a valid reference to an X86_64Context within the
        // bootstrap task. load_context restores CPU registers from this
        // structure. This is called once during scheduler startup.
        super::task::TaskContext::X86_64(ctx) => unsafe {
            crate::arch::x86_64::context::load_context(ctx as *const X86_64Context);
        },
    }
}

#[cfg(target_arch = "aarch64")]
fn load_context(context: &super::task::TaskContext) {
    use crate::arch::aarch64::context::AArch64Context;

    match context {
        // SAFETY: Same as x86_64 -- valid context reference for the bootstrap
        // task. load_context restores CPU registers.
        super::task::TaskContext::AArch64(ctx) => unsafe {
            crate::arch::aarch64::context::load_context(ctx as *const AArch64Context);
        },
        #[allow(unreachable_patterns)]
        _ => unreachable!("Invalid context type for AArch64"),
    }
}

#[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))]
fn load_context(context: &super::task::TaskContext) {
    use crate::arch::riscv::context::RiscVContext;

    match context {
        // SAFETY: Same as x86_64 -- valid context reference for the bootstrap
        // task. load_context restores CPU registers.
        super::task::TaskContext::RiscV(ctx) => unsafe {
            crate::arch::riscv::context::load_context(ctx as *const RiscVContext);
        },
        #[allow(unreachable_patterns)]
        _ => unreachable!("Invalid context type for RISC-V"),
    }
}

/// Get current CPU ID
fn current_cpu() -> u8 {
    super::smp::current_cpu_id()
}

impl Default for Scheduler {
    fn default() -> Self {
        Self::new()
    }
}

/// Default time slice, in timer ticks
const DEFAULT_TIME_SLICE: u32 = 10;

/// Global scheduler instance (for BSP/CPU0)
pub(crate) static SCHEDULER: Mutex<Scheduler> = Mutex::new(Scheduler::new());

// ---- Global PID-to-Task registry for O(log n) lookup ----

#[cfg(feature = "alloc")]
use alloc::collections::BTreeMap;

/// Wrapper for NonNull<Task> that implements Send+Sync.
///
/// SAFETY: Task pointers in the registry are only accessed under the
/// TASK_REGISTRY mutex. Tasks are pinned in memory (Box::leak) and
/// outlive their registry entries. Cross-CPU access is serialized
/// by the mutex.
#[derive(Clone, Copy)]
struct SendTaskPtr(NonNull<Task>);
unsafe impl Send for SendTaskPtr {}
unsafe impl Sync for SendTaskPtr {}

/// Global PID-to-Task pointer registry.
///
/// Enables O(log n) task lookup by PID without holding the scheduler lock.
/// Tasks are registered on creation and unregistered on exit.
#[cfg(feature = "alloc")]
static TASK_REGISTRY: Mutex<Option<BTreeMap<u64, SendTaskPtr>>> = Mutex::new(None);

/// Register a task in the global PID-to-Task registry.
#[cfg(feature = "alloc")]
pub fn register_task(pid: u64, task: NonNull<Task>) {
    let mut registry = TASK_REGISTRY.lock();
    let map = registry.get_or_insert_with(BTreeMap::new);
    map.insert(pid, SendTaskPtr(task));
}

/// Unregister a task from the global PID-to-Task registry.
#[cfg(feature = "alloc")]
pub fn unregister_task(pid: u64) {
    let mut registry = TASK_REGISTRY.lock();
    if let Some(map) = registry.as_mut() {
        map.remove(&pid);
    }
}

/// Look up a task pointer by PID. O(log n) via BTreeMap.
///
/// Returns the NonNull<Task> if the PID is registered. This is used by the
/// IPC fast path for direct task-to-task message transfer without iterating
/// the scheduler's run queues.
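///
/// A minimal lookup sketch (illustrative only; the PID value is arbitrary):
///
/// ```ignore
/// if let Some(task) = get_task_ptr(42) {
///     // SAFETY: the pointer remains valid while the task has not exited
///     // and been unregistered (tasks are pinned in memory; see SendTaskPtr).
///     let tid = unsafe { task.as_ref() }.tid;
/// }
/// ```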
#[cfg(feature = "alloc")]
pub fn get_task_ptr(pid: u64) -> Option<NonNull<Task>> {
    let registry = TASK_REGISTRY.lock();
    registry.as_ref().and_then(|map| map.get(&pid).map(|p| p.0))
}

/// Get scheduler for current CPU
pub fn current_scheduler() -> &'static Mutex<Scheduler> {
    let cpu_id = current_cpu();

    // Try to get per-CPU scheduler
    if let Some(cpu_data) = super::smp::per_cpu(cpu_id) {
        // Return reference to per-CPU scheduler
        &cpu_data.cpu_info.scheduler
    } else {
        // Fallback to global scheduler for BSP
        &SCHEDULER
    }
}

/// Schedule on specific CPU
pub fn schedule_on_cpu(cpu_id: u8, task: NonNull<Task>) {
    // SAFETY: `task` is a valid NonNull<Task> provided by the caller. We
    // read the task's cpu_affinity and tid fields to verify it can run on
    // the specified CPU. The task is not concurrently modified because the
    // caller manages its lifecycle.
    unsafe {
        let task_ref = task.as_ref();
        if !task_ref.can_run_on(cpu_id) {
            println!(
                "[SCHED] Warning: Task {} cannot run on CPU {} due to affinity mask",
                task_ref.tid, cpu_id
            );
            return;
        }
    }

    if let Some(cpu_data) = super::smp::per_cpu(cpu_id) {
        // Add to per-CPU ready queue
        cpu_data.cpu_info.ready_queue.lock().enqueue(task);
        cpu_data.cpu_info.nr_running.fetch_add(1, Ordering::Relaxed);
        cpu_data.cpu_info.update_load();

        // Send IPI if needed
        if cpu_data.cpu_info.is_idle() {
            super::smp::send_ipi(cpu_id, 0); // Wake up CPU
        }
    }
}