⚠️ VeridianOS Kernel Documentation — low-level `no_std` kernel code. Treat all functions as unsafe unless explicitly marked otherwise.

veridian_kernel/drivers/virtio/
queue.rs

1//! Split virtqueue implementation for virtio devices.
2//!
3//! Implements the split virtqueue data structure used by legacy and
4//! transitional virtio devices, as specified in the
5//! [virtio specification, section 2.7](https://docs.oasis-open.org/virtio/virtio/v1.2/virtio-v1.2.html).
6//!
7//! A virtqueue consists of three physically contiguous regions:
8//!
9//! 1. **Descriptor table** -- array of [`VirtqDesc`] entries describing data
10//!    buffers (16 bytes each)
11//! 2. **Available ring** ([`VirtqAvail`]) -- driver-to-device: ring of
12//!    descriptor chain heads
13//! 3. **Used ring** ([`VirtqUsed`]) -- device-to-driver: ring of completed
14//!    descriptor chain heads
15//!
16//! Memory layout follows the virtio specification: descriptors at the base,
17//! available ring immediately after, used ring page-aligned after that.
18//!
19//! # Thread Safety
20//!
21//! `VirtQueue` is **not** internally synchronized. It implements `Send` and
22//! `Sync` only because it is always held behind a `Mutex` in the virtio-blk
23//! driver (`VIRTIO_BLK: OnceLock<Mutex<VirtioBlkDevice>>`). Callers MUST NOT
24//! access a `VirtQueue` from multiple threads without external synchronization.
25
26// Split virtqueue -- shared data structure for all virtio device types
27
28use core::sync::atomic::{self, Ordering};
29
30use crate::mm::{FrameNumber, FRAME_ALLOCATOR, FRAME_SIZE};
31
/// Default queue size (power of 2, must match QEMU's virtio-blk queue size).
/// Legacy virtio requires the driver to use the exact queue size reported by
/// the device. QEMU's virtio-blk reports 256.
pub const DEFAULT_QUEUE_SIZE: u16 = 256;

/// Descriptor flag: buffer continues via the `next` field.
pub const VIRTQ_DESC_F_NEXT: u16 = 1;
/// Descriptor flag: buffer is device-writable (device writes, driver reads).
pub const VIRTQ_DESC_F_WRITE: u16 = 2;
/// Descriptor flag: buffer contains a list of buffer descriptors (indirect
/// descriptor table).
/// NOTE(review): nothing in this file sets this flag; it is exported for
/// drivers that negotiate VIRTIO_F_INDIRECT_DESC — confirm before relying
/// on indirect support.
pub const VIRTQ_DESC_F_INDIRECT: u16 = 4;
43
/// Virtqueue descriptor table entry.
///
/// Each descriptor points to a physically contiguous buffer in guest memory.
/// Descriptors can be chained via the `next` field when `VIRTQ_DESC_F_NEXT`
/// is set in `flags`. The layout (8 + 4 + 2 + 2 = 16 bytes, fields in this
/// exact order) is mandated by the virtio spec; `#[repr(C)]` prevents field
/// reordering.
#[repr(C)]
#[derive(Debug, Clone, Copy, Default)]
pub struct VirtqDesc {
    /// Physical (guest) address of the buffer — the device DMAs to/from this.
    pub addr: u64,
    /// Length of the guest buffer in bytes.
    pub len: u32,
    /// Descriptor flags (`VIRTQ_DESC_F_NEXT` / `_WRITE` / `_INDIRECT`).
    pub flags: u16,
    /// Index of the next descriptor in the chain (only meaningful when the
    /// NEXT flag is set). While the descriptor sits on the free list, this
    /// field is reused as the free-list link (see `VirtQueue::new` /
    /// `free_desc`).
    pub next: u16,
}
61
/// Available ring: driver writes descriptor chain heads here for the device
/// to consume.
///
/// NOTE(review): `ring` is statically sized for `DEFAULT_QUEUE_SIZE`, but the
/// backing allocation in `VirtQueue::new` is only `4 + 2 * size` bytes. When
/// the negotiated queue size is smaller than 256, entries at or past `size`
/// are NOT backed by the allocation; all ring indexing must stay (and does
/// stay) below the actual queue size.
#[repr(C)]
pub struct VirtqAvail {
    /// Flags (e.g., VIRTQ_AVAIL_F_NO_INTERRUPT to suppress used-buffer
    /// notifications).
    pub flags: u16,
    /// Free-running index of the next entry the driver will write to in
    /// `ring[]`; wraps at 65536 and is reduced modulo the queue size when
    /// used as an array index.
    pub idx: u16,
    /// Ring of descriptor chain head indices.
    pub ring: [u16; DEFAULT_QUEUE_SIZE as usize],
}
74
/// Element in the used ring, written by the device after processing a chain.
#[repr(C)]
#[derive(Debug, Clone, Copy, Default)]
pub struct VirtqUsedElem {
    /// Index of the head of the completed descriptor chain. A u32 per the
    /// spec, even though descriptor indices always fit in a u16.
    pub id: u32,
    /// Total bytes written into the chain's device-writable buffers by the
    /// device.
    pub len: u32,
}
84
/// Used ring: device writes completed descriptor chain heads here.
///
/// All fields of this struct are written by the device via DMA; the driver
/// only reads them (see `VirtQueue::poll_used`).
///
/// NOTE(review): as with [`VirtqAvail`], `ring` is statically sized for
/// `DEFAULT_QUEUE_SIZE` while the allocation only backs the actual queue
/// size; indexing must stay below the negotiated size.
#[repr(C)]
pub struct VirtqUsed {
    /// Flags (e.g., VIRTQ_USED_F_NO_NOTIFY to suppress available-buffer
    /// notifications).
    pub flags: u16,
    /// Free-running index of the next entry the device will write to in
    /// `ring[]`; the driver tracks its own `last_used_idx` to detect new
    /// completions.
    pub idx: u16,
    /// Ring of completed descriptor chain elements.
    pub ring: [VirtqUsedElem; DEFAULT_QUEUE_SIZE as usize],
}
96
/// A split virtqueue managing descriptors, available ring, and used ring.
///
/// Owns the physical memory backing all three structures (one contiguous
/// frame run; see `new`). The physical page frame number (PFN) is
/// communicated to the device so it can DMA directly.
pub struct VirtQueue {
    /// Number of entries (descriptors) in this queue; at most
    /// `DEFAULT_QUEUE_SIZE`.
    size: u16,

    /// Pointer to the descriptor table, at the base of the allocation
    /// (kernel virtual address).
    desc: *mut VirtqDesc,

    /// Pointer to the available ring, immediately after the descriptors.
    avail: *mut VirtqAvail,

    /// Pointer to the used ring, page-aligned after the available ring.
    used: *mut VirtqUsed,

    /// Head of the free descriptor list (linked through `VirtqDesc::next`).
    free_head: u16,

    /// Number of free descriptors remaining.
    num_free: u16,

    /// Last used-ring index we have consumed (for polling completion).
    last_used_idx: u16,

    /// Physical frame number of the queue memory (for the QUEUE_ADDRESS
    /// register of legacy PCI transports).
    queue_pfn: u32,

    /// Number of contiguous frames allocated for this queue.
    num_frames: usize,

    /// First frame allocated (needed to free the run in Drop).
    first_frame: FrameNumber,

    /// Physical base address of the queue allocation.
    phys_base: u64,

    /// Byte offsets of the three sub-structures from `phys_base`
    /// (used by the `phys_*` accessors for virtio-mmio transports).
    desc_offset: usize,
    avail_offset: usize,
    used_offset: usize,
}
140
141impl VirtQueue {
142    /// Allocate and initialize a new virtqueue.
143    ///
144    /// Allocates physically contiguous memory for the descriptor table,
145    /// available ring, and used ring. The memory is zeroed and the free
146    /// descriptor list is linked.
147    ///
148    /// `size` is typically read from the device via `QUEUE_SIZE` register
149    /// and should be a power of 2. If `size` is 0 or exceeds our compiled
150    /// maximum, we clamp to `DEFAULT_QUEUE_SIZE`.
151    pub fn new(size: u16) -> Result<Self, crate::error::KernelError> {
152        // Clamp to our compiled-in maximum
153        let size = if size == 0 || size > DEFAULT_QUEUE_SIZE {
154            DEFAULT_QUEUE_SIZE
155        } else {
156            size
157        };
158
159        // Calculate memory layout per virtio spec:
160        //   descriptors: 16 bytes * queue_size
161        //   available ring: 4 + 2 * queue_size bytes (+ padding)
162        //   used ring: 4 + 8 * queue_size bytes
163        //
164        // Used ring must be page-aligned.
165        let desc_size = 16 * size as usize;
166        let avail_size = 4 + 2 * size as usize;
167        let used_offset = align_up(desc_size + avail_size, FRAME_SIZE);
168        let used_size = 4 + 8 * size as usize;
169        let total_size = used_offset + used_size;
170        let num_frames = total_size.div_ceil(FRAME_SIZE);
171
172        // Allocate physically contiguous frames
173        let first_frame = FRAME_ALLOCATOR
174            .lock()
175            .allocate_frames(num_frames, None)
176            .map_err(|_| crate::error::KernelError::OutOfMemory {
177                requested: total_size,
178                available: 0,
179            })?;
180
181        let phys_base = first_frame.as_u64() * FRAME_SIZE as u64;
182        let virt_base = phys_to_kernel_virt(phys_base);
183
184        // Zero the entire allocation
185        // SAFETY: virt_base points to freshly allocated, identity-mapped memory
186        // of size `num_frames * FRAME_SIZE` bytes. No other references exist.
187        unsafe {
188            core::ptr::write_bytes(virt_base as *mut u8, 0, num_frames * FRAME_SIZE);
189        }
190
191        let desc_ptr = virt_base as *mut VirtqDesc;
192        let avail_ptr = (virt_base + desc_size) as *mut VirtqAvail;
193        let used_ptr = (virt_base + used_offset) as *mut VirtqUsed;
194
195        // Initialize the free descriptor chain: each descriptor's `next` field
196        // points to the subsequent descriptor, forming a singly-linked free list.
197        // SAFETY: desc_ptr points to zeroed memory of `size` VirtqDesc entries.
198        // No other references to this memory exist.
199        unsafe {
200            for i in 0..size {
201                let desc = &mut *desc_ptr.add(i as usize);
202                desc.next = if i + 1 < size { i + 1 } else { 0 };
203                desc.flags = 0;
204                desc.addr = 0;
205                desc.len = 0;
206            }
207        }
208
209        Ok(Self {
210            size,
211            desc: desc_ptr,
212            avail: avail_ptr,
213            used: used_ptr,
214            free_head: 0,
215            num_free: size,
216            last_used_idx: 0,
217            queue_pfn: (phys_base / FRAME_SIZE as u64) as u32,
218            num_frames,
219            first_frame,
220            phys_base,
221            desc_offset: 0,
222            avail_offset: desc_size,
223            used_offset,
224        })
225    }
226
227    /// Get the physical page frame number for the QUEUE_ADDRESS register.
228    pub fn pfn(&self) -> u32 {
229        self.queue_pfn
230    }
231
232    /// Physical addresses for mmio transports (virtio-mmio expects 64-bit phys)
233    pub fn phys_desc(&self) -> u64 {
234        self.phys_base + self.desc_offset as u64
235    }
236
237    pub fn phys_avail(&self) -> u64 {
238        self.phys_base + self.avail_offset as u64
239    }
240
241    pub fn phys_used(&self) -> u64 {
242        self.phys_base + self.used_offset as u64
243    }
244
245    /// Get the queue size.
246    pub fn size(&self) -> u16 {
247        self.size
248    }
249
250    /// Allocate a single free descriptor, returning its index.
251    ///
252    /// Returns `None` if all descriptors are in use.
253    pub fn alloc_desc(&mut self) -> Option<u16> {
254        if self.num_free == 0 {
255            return None;
256        }
257
258        let idx = self.free_head;
259        // SAFETY: `idx` is within [0, size) because the free list is initialized
260        // with valid indices and we only ever store indices < size.
261        let desc = unsafe { &*self.desc.add(idx as usize) };
262        self.free_head = desc.next;
263        self.num_free -= 1;
264
265        Some(idx)
266    }
267
268    /// Return a descriptor to the free list.
269    pub fn free_desc(&mut self, idx: u16) {
270        debug_assert!((idx as usize) < self.size as usize);
271
272        // SAFETY: `idx` is within bounds (asserted above). We relink it into
273        // the free list by updating its `next` field.
274        unsafe {
275            let desc = &mut *self.desc.add(idx as usize);
276            desc.next = self.free_head;
277            desc.flags = 0;
278            desc.addr = 0;
279            desc.len = 0;
280        }
281        self.free_head = idx;
282        self.num_free += 1;
283    }
284
285    /// Free a chain of descriptors linked via NEXT flags, starting at `head`.
286    pub fn free_chain(&mut self, head: u16) {
287        let mut idx = head;
288        loop {
289            debug_assert!((idx as usize) < self.size as usize);
290            // SAFETY: idx is in bounds (asserted). We read flags/next before freeing.
291            let (flags, next) = unsafe {
292                let desc = &*self.desc.add(idx as usize);
293                (desc.flags, desc.next)
294            };
295            self.free_desc(idx);
296            if flags & VIRTQ_DESC_F_NEXT == 0 {
297                break;
298            }
299            idx = next;
300        }
301    }
302
303    /// Write a descriptor's fields.
304    ///
305    /// # Safety
306    ///
307    /// `idx` must be a valid descriptor index (< queue size). `phys_addr` must
308    /// point to a valid guest physical buffer of at least `len` bytes that will
309    /// remain valid until the device returns the descriptor via the used ring.
310    pub unsafe fn write_desc(&mut self, idx: u16, phys_addr: u64, len: u32, flags: u16, next: u16) {
311        debug_assert!((idx as usize) < self.size as usize);
312        // SAFETY: idx is in bounds (asserted). The caller guarantees phys_addr
313        // and len are valid.
314        let desc = unsafe { &mut *self.desc.add(idx as usize) };
315        desc.addr = phys_addr;
316        desc.len = len;
317        desc.flags = flags;
318        desc.next = next;
319    }
320
321    /// Push a descriptor chain head onto the available ring and advance the
322    /// available index.
323    ///
324    /// The caller must call `kick()` (via the transport) after one or more
325    /// `push_avail()` calls to notify the device.
326    pub fn push_avail(&mut self, desc_head: u16) {
327        debug_assert!(
328            (desc_head as usize) < self.size as usize,
329            "push_avail: descriptor index {} out of bounds (queue size {})",
330            desc_head,
331            self.size
332        );
333        // SAFETY: self.avail points to valid VirtqAvail memory we own.
334        // ring_idx is reduced modulo self.size, so it is always in bounds.
335        // desc_head is asserted to be < self.size above.
336        unsafe {
337            let avail = &mut *self.avail;
338            let ring_idx = avail.idx as usize % self.size as usize;
339            avail.ring[ring_idx] = desc_head;
340
341            // Write barrier: ensure the descriptor table writes and ring entry
342            // write above are visible before we update the available index.
343            atomic::fence(Ordering::Release);
344
345            avail.idx = avail.idx.wrapping_add(1);
346        }
347    }
348
349    /// Poll the used ring for a completed buffer.
350    ///
351    /// Returns `Some((chain_head_index, bytes_written))` if the device has
352    /// returned a buffer, or `None` if no new completions are available.
353    ///
354    /// The caller should free the returned descriptor chain via `free_chain()`.
355    pub fn poll_used(&mut self) -> Option<(u16, u32)> {
356        // Read barrier: ensure we see the device's writes to the used ring
357        // before we read the index.
358        atomic::fence(Ordering::Acquire);
359
360        // SAFETY: self.used points to valid VirtqUsed memory we own.
361        let used_idx = unsafe { (*self.used).idx };
362
363        if self.last_used_idx == used_idx {
364            return None;
365        }
366
367        let ring_idx = self.last_used_idx as usize % self.size as usize;
368        // SAFETY: ring_idx is modular-reduced to within [0, size).
369        let elem = unsafe { (*self.used).ring[ring_idx] };
370
371        self.last_used_idx = self.last_used_idx.wrapping_add(1);
372
373        Some((elem.id as u16, elem.len))
374    }
375
376    /// Check if any completions are pending without consuming them.
377    pub fn has_used(&self) -> bool {
378        atomic::fence(Ordering::Acquire);
379        // SAFETY: self.used is valid.
380        let used_idx = unsafe { (*self.used).idx };
381        self.last_used_idx != used_idx
382    }
383}
384
impl Drop for VirtQueue {
    fn drop(&mut self) {
        // Return the physically contiguous frames backing the descriptor
        // table and rings to the global frame allocator. The result is
        // deliberately ignored: there is no meaningful recovery from a
        // failed free during drop.
        //
        // NOTE(review): the device should already be reset / the queue
        // deconfigured before this runs, or it may DMA into freed memory.
        // The transport driver is responsible for that ordering — confirm.
        let _ = FRAME_ALLOCATOR
            .lock()
            .free_frames(self.first_frame, self.num_frames);
    }
}
393
// SAFETY: VirtQueue manages raw pointers (`desc`, `avail`, `used`) to
// physically contiguous DMA memory that it owns exclusively. These pointers
// are not aliased by any other Rust object. Sending a VirtQueue to another
// thread is safe because the pointed-to memory is valid from allocation
// until Drop, and the virtio device accesses it via physical (not virtual)
// addresses.
//
// IMPORTANT: VirtQueue itself is NOT internally synchronized. Callers MUST
// hold it behind a Mutex (or equivalent) to prevent concurrent access.
// In VeridianOS, VirtioBlkDevice wraps VirtQueue in a spin::Mutex inside
// the global VIRTIO_BLK OnceLock.
unsafe impl Send for VirtQueue {}
// SAFETY: Shared references to VirtQueue are safe because the type is always
// accessed behind a Mutex (specifically, the global VIRTIO_BLK
// OnceLock<Mutex<VirtioBlkDevice>>). Only one thread can hold &mut VirtQueue
// at a time through the Mutex guard. The raw pointers themselves are stable
// (allocated once in new(), freed only in drop()) and never modified after
// construction.
//
// IMPORTANT: If VirtQueue is ever used outside a Mutex, this impl is UNSOUND:
// `has_used(&self)` reads device-shared memory through `&self`, so unguarded
// shared access would race with the device and with `&mut` users. Do not
// remove the Mutex wrapper without replacing these impls with proper
// internal synchronization.
unsafe impl Sync for VirtQueue {}
417
/// Align `value` up to the next multiple of `align`.
///
/// `align` must be a nonzero power of two: the bit-trick
/// `(value + align - 1) & !(align - 1)` is only correct under that
/// assumption (FRAME_SIZE, the only alignment used in this file, is a page
/// size and therefore a power of two). Debug builds assert it.
fn align_up(value: usize, align: usize) -> usize {
    debug_assert!(
        align.is_power_of_two(),
        "align_up: align must be a nonzero power of two"
    );
    (value + align - 1) & !(align - 1)
}
422
/// Convert a physical address to a kernel-accessible virtual address.
///
/// On x86_64, first asks the arch layer for the bootloader's physical-memory
/// mapping; if unavailable, falls back to adding the higher-half base
/// `0xFFFF_8000_0000_0000`.
///
/// NOTE(review): the fallback is an *offset* map (virt = phys + base), not a
/// true identity map — confirm the paging setup actually provides a direct
/// map at that base before relying on the fallback path.
fn phys_to_kernel_virt(phys: u64) -> usize {
    #[cfg(target_arch = "x86_64")]
    {
        // Preferred path: the bootloader's physical memory offset, exposed
        // through the arch layer.
        if let Some(virt) = crate::arch::x86_64::msr::phys_to_virt(phys as usize) {
            return virt;
        }
        // Fallback: offset into the higher-half direct-map window.
        (phys + 0xFFFF_8000_0000_0000) as usize
    }

    #[cfg(not(target_arch = "x86_64"))]
    {
        // AArch64 and RISC-V: physical addresses are identity-mapped in the
        // kernel's address space during early boot.
        phys as usize
    }
}