veridian_kernel/drivers/virtio/queue.rs
//! Split virtqueue implementation for virtio devices.
//!
//! Implements the split virtqueue data structure used by legacy and
//! transitional virtio devices, as specified in the
//! [virtio specification, section 2.7](https://docs.oasis-open.org/virtio/virtio/v1.2/virtio-v1.2.html).
//!
//! A virtqueue consists of three physically contiguous regions:
//!
//! 1. **Descriptor table** -- array of [`VirtqDesc`] entries describing data
//!    buffers (16 bytes each)
//! 2. **Available ring** ([`VirtqAvail`]) -- driver-to-device: ring of
//!    descriptor chain heads
//! 3. **Used ring** ([`VirtqUsed`]) -- device-to-driver: ring of completed
//!    descriptor chain heads
//!
//! Memory layout follows the virtio specification: descriptors at the base,
//! available ring immediately after, used ring page-aligned after that.
//!
//! # Thread Safety
//!
//! `VirtQueue` is **not** internally synchronized. It implements `Send` and
//! `Sync` only because it is always held behind a `Mutex` in the virtio-blk
//! driver (`VIRTIO_BLK: OnceLock<Mutex<VirtioBlkDevice>>`). Callers MUST NOT
//! access a `VirtQueue` from multiple threads without external synchronization.
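//!
//! # Usage sketch
//!
//! A minimal single-request round trip (illustrative only: `transport.kick()`,
//! `device_reported_size`, `header_phys`, and `data_phys` are placeholders for
//! things provided elsewhere, not APIs defined in this module):
//!
//! ```ignore
//! let mut queue = VirtQueue::new(device_reported_size)?;
//!
//! // Build a two-descriptor chain: device-readable header, device-writable data.
//! let head = queue.alloc_desc().expect("queue full");
//! let data = queue.alloc_desc().expect("queue full");
//! unsafe {
//!     queue.write_desc(head, header_phys, 16, VIRTQ_DESC_F_NEXT, data);
//!     queue.write_desc(data, data_phys, 512, VIRTQ_DESC_F_WRITE, 0);
//! }
//!
//! // Publish the chain, then notify the device through the transport.
//! queue.push_avail(head);
//! transport.kick(0);
//!
//! // Poll for completion; the device reports the chain head and bytes written.
//! let (done_head, _bytes) = loop {
//!     if let Some(completion) = queue.poll_used() {
//!         break completion;
//!     }
//! };
//! queue.free_chain(done_head);
//! ```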

// Split virtqueue -- shared data structure for all virtio device types

use core::sync::atomic::{self, Ordering};

use crate::mm::{FrameNumber, FRAME_ALLOCATOR, FRAME_SIZE};

/// Default queue size (power of 2, must match QEMU's virtio-blk queue size).
/// Legacy virtio requires the driver to use the exact queue size reported by
/// the device. QEMU's virtio-blk reports 256.
pub const DEFAULT_QUEUE_SIZE: u16 = 256;

/// Descriptor flag: buffer continues via the `next` field
pub const VIRTQ_DESC_F_NEXT: u16 = 1;
/// Descriptor flag: buffer is device-writable (device writes, driver reads)
pub const VIRTQ_DESC_F_WRITE: u16 = 2;
/// Descriptor flag: buffer contains a list of buffer descriptors (indirect)
pub const VIRTQ_DESC_F_INDIRECT: u16 = 4;

/// Virtqueue descriptor table entry.
///
/// Each descriptor points to a physically contiguous buffer in guest memory.
/// Descriptors can be chained via the `next` field when `VIRTQ_DESC_F_NEXT`
/// is set in `flags`.
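///
/// For example, a two-descriptor chain for a hypothetical request (a
/// device-readable header followed by a device-writable data buffer; the
/// addresses are placeholders, not real mappings):
///
/// ```ignore
/// let chain = [
///     VirtqDesc { addr: 0x1000, len: 16,  flags: VIRTQ_DESC_F_NEXT,  next: 1 },
///     VirtqDesc { addr: 0x2000, len: 512, flags: VIRTQ_DESC_F_WRITE, next: 0 },
/// ];
/// ```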
#[repr(C)]
#[derive(Debug, Clone, Copy, Default)]
pub struct VirtqDesc {
    /// Physical address of the guest buffer
    pub addr: u64,
    /// Length of the guest buffer in bytes
    pub len: u32,
    /// Descriptor flags (NEXT, WRITE, INDIRECT)
    pub flags: u16,
    /// Index of the next descriptor in the chain (valid if NEXT flag is set)
    pub next: u16,
}

/// Available ring: driver writes descriptor chain heads here for the device to
/// consume.
#[repr(C)]
pub struct VirtqAvail {
    /// Flags (e.g., VIRTQ_AVAIL_F_NO_INTERRUPT to suppress used-buffer
    /// notifications)
    pub flags: u16,
    /// Index of the next entry the driver will write to in `ring[]`
    pub idx: u16,
    /// Ring of descriptor chain head indices
    pub ring: [u16; DEFAULT_QUEUE_SIZE as usize],
}

/// Element in the used ring, returned by the device after processing.
#[repr(C)]
#[derive(Debug, Clone, Copy, Default)]
pub struct VirtqUsedElem {
    /// Index of the start of the used descriptor chain
    pub id: u32,
    /// Total bytes written into the descriptor chain buffers by the device
    pub len: u32,
}

/// Used ring: device writes completed descriptor chain heads here.
#[repr(C)]
pub struct VirtqUsed {
    /// Flags (e.g., VIRTQ_USED_F_NO_NOTIFY to suppress available-buffer
    /// notifications)
    pub flags: u16,
    /// Index of the next entry the device will write to in `ring[]`
    pub idx: u16,
    /// Ring of completed descriptor chain elements
    pub ring: [VirtqUsedElem; DEFAULT_QUEUE_SIZE as usize],
}

/// A split virtqueue managing descriptors, available ring, and used ring.
///
/// Owns the physical memory backing all three structures. The physical page
/// frame number (PFN) is communicated to the device so it can DMA directly.
pub struct VirtQueue {
    /// Number of entries (descriptors) in this queue
    size: u16,

    /// Pointer to the descriptor table in identity-mapped kernel memory
    desc: *mut VirtqDesc,

    /// Pointer to the available ring
    avail: *mut VirtqAvail,

    /// Pointer to the used ring
    used: *mut VirtqUsed,

    /// Head of the free descriptor list
    free_head: u16,

    /// Number of free descriptors remaining
    num_free: u16,

    /// Last seen used ring index (for polling completion)
    last_used_idx: u16,

    /// Physical frame number of the queue memory (for QUEUE_ADDRESS register)
    queue_pfn: u32,

    /// Number of contiguous frames allocated for this queue
    num_frames: usize,

    /// First frame allocated (for freeing)
    first_frame: FrameNumber,

    /// Physical base address of the queue allocation
    phys_base: u64,

    /// Offsets of sub-structures from phys_base
    desc_offset: usize,
    avail_offset: usize,
    used_offset: usize,
}

impl VirtQueue {
    /// Allocate and initialize a new virtqueue.
    ///
    /// Allocates physically contiguous memory for the descriptor table,
    /// available ring, and used ring. The memory is zeroed and the free
    /// descriptor list is linked.
    ///
    /// `size` is typically read from the device via `QUEUE_SIZE` register
    /// and should be a power of 2. If `size` is 0 or exceeds our compiled
    /// maximum, we clamp to `DEFAULT_QUEUE_SIZE`.
    pub fn new(size: u16) -> Result<Self, crate::error::KernelError> {
        // Clamp to our compiled-in maximum
        let size = if size == 0 || size > DEFAULT_QUEUE_SIZE {
            DEFAULT_QUEUE_SIZE
        } else {
            size
        };

        // Calculate memory layout per virtio spec:
        //   descriptors:    16 bytes * queue_size
        //   available ring: 4 + 2 * queue_size bytes (+ padding)
        //   used ring:      4 + 8 * queue_size bytes
        //
        // Used ring must be page-aligned.
        let desc_size = 16 * size as usize;
        let avail_size = 4 + 2 * size as usize;
        let used_offset = align_up(desc_size + avail_size, FRAME_SIZE);
        let used_size = 4 + 8 * size as usize;
        let total_size = used_offset + used_size;
        let num_frames = total_size.div_ceil(FRAME_SIZE);
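
        // Worked example (illustrative, assuming FRAME_SIZE == 4096) for the
        // default size of 256 entries:
        //   desc_size   = 16 * 256    = 4096
        //   avail_size  = 4 + 2 * 256 = 516
        //   used_offset = align_up(4096 + 516, 4096) = 8192
        //   used_size   = 4 + 8 * 256 = 2052
        //   total_size  = 8192 + 2052 = 10244  ->  num_frames = 3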

        // Allocate physically contiguous frames
        let first_frame = FRAME_ALLOCATOR
            .lock()
            .allocate_frames(num_frames, None)
            .map_err(|_| crate::error::KernelError::OutOfMemory {
                requested: total_size,
                available: 0,
            })?;

        let phys_base = first_frame.as_u64() * FRAME_SIZE as u64;
        let virt_base = phys_to_kernel_virt(phys_base);

        // Zero the entire allocation
        // SAFETY: virt_base points to freshly allocated, identity-mapped memory
        // of size `num_frames * FRAME_SIZE` bytes. No other references exist.
        unsafe {
            core::ptr::write_bytes(virt_base as *mut u8, 0, num_frames * FRAME_SIZE);
        }

        let desc_ptr = virt_base as *mut VirtqDesc;
        let avail_ptr = (virt_base + desc_size) as *mut VirtqAvail;
        let used_ptr = (virt_base + used_offset) as *mut VirtqUsed;

        // Initialize the free descriptor chain: each descriptor's `next` field
        // points to the subsequent descriptor, forming a singly-linked free list.
        // SAFETY: desc_ptr points to zeroed memory of `size` VirtqDesc entries.
        // No other references to this memory exist.
        unsafe {
            for i in 0..size {
                let desc = &mut *desc_ptr.add(i as usize);
                desc.next = if i + 1 < size { i + 1 } else { 0 };
                desc.flags = 0;
                desc.addr = 0;
                desc.len = 0;
            }
        }

        Ok(Self {
            size,
            desc: desc_ptr,
            avail: avail_ptr,
            used: used_ptr,
            free_head: 0,
            num_free: size,
            last_used_idx: 0,
            queue_pfn: (phys_base / FRAME_SIZE as u64) as u32,
            num_frames,
            first_frame,
            phys_base,
            desc_offset: 0,
            avail_offset: desc_size,
            used_offset,
        })
    }

    /// Get the physical page frame number for the QUEUE_ADDRESS register.
    pub fn pfn(&self) -> u32 {
        self.queue_pfn
    }

    /// Physical address of the descriptor table, for transports that program
    /// the rings individually (virtio-mmio expects 64-bit physical addresses).
    pub fn phys_desc(&self) -> u64 {
        self.phys_base + self.desc_offset as u64
    }

    /// Physical address of the available ring.
    pub fn phys_avail(&self) -> u64 {
        self.phys_base + self.avail_offset as u64
    }

    /// Physical address of the used ring.
    pub fn phys_used(&self) -> u64 {
        self.phys_base + self.used_offset as u64
    }

    /// Get the queue size.
    pub fn size(&self) -> u16 {
        self.size
    }

    /// Allocate a single free descriptor, returning its index.
    ///
    /// Returns `None` if all descriptors are in use.
    pub fn alloc_desc(&mut self) -> Option<u16> {
        if self.num_free == 0 {
            return None;
        }

        let idx = self.free_head;
        // SAFETY: `idx` is within [0, size) because the free list is initialized
        // with valid indices and we only ever store indices < size.
        let desc = unsafe { &*self.desc.add(idx as usize) };
        self.free_head = desc.next;
        self.num_free -= 1;

        Some(idx)
    }

    /// Return a descriptor to the free list.
    pub fn free_desc(&mut self, idx: u16) {
        debug_assert!((idx as usize) < self.size as usize);

        // SAFETY: `idx` is within bounds (asserted above). We relink it into
        // the free list by updating its `next` field.
        unsafe {
            let desc = &mut *self.desc.add(idx as usize);
            desc.next = self.free_head;
            desc.flags = 0;
            desc.addr = 0;
            desc.len = 0;
        }
        self.free_head = idx;
        self.num_free += 1;
    }

    /// Free a chain of descriptors linked via NEXT flags, starting at `head`.
    pub fn free_chain(&mut self, head: u16) {
        let mut idx = head;
        loop {
            debug_assert!((idx as usize) < self.size as usize);
            // SAFETY: idx is in bounds (asserted). We read flags/next before freeing.
            let (flags, next) = unsafe {
                let desc = &*self.desc.add(idx as usize);
                (desc.flags, desc.next)
            };
            self.free_desc(idx);
            if flags & VIRTQ_DESC_F_NEXT == 0 {
                break;
            }
            idx = next;
        }
    }

    /// Write a descriptor's fields.
    ///
    /// # Safety
    ///
    /// `idx` must be a valid descriptor index (< queue size). `phys_addr` must
    /// point to a valid guest physical buffer of at least `len` bytes that will
    /// remain valid until the device returns the descriptor via the used ring.
    pub unsafe fn write_desc(&mut self, idx: u16, phys_addr: u64, len: u32, flags: u16, next: u16) {
        debug_assert!((idx as usize) < self.size as usize);
        // SAFETY: idx is in bounds (asserted). The caller guarantees phys_addr
        // and len are valid.
        let desc = unsafe { &mut *self.desc.add(idx as usize) };
        desc.addr = phys_addr;
        desc.len = len;
        desc.flags = flags;
        desc.next = next;
    }

    /// Push a descriptor chain head onto the available ring and advance the
    /// available index.
    ///
    /// The caller must call `kick()` (via the transport) after one or more
    /// `push_avail()` calls to notify the device.
    pub fn push_avail(&mut self, desc_head: u16) {
        debug_assert!(
            (desc_head as usize) < self.size as usize,
            "push_avail: descriptor index {} out of bounds (queue size {})",
            desc_head,
            self.size
        );
        // SAFETY: self.avail points to valid VirtqAvail memory we own.
        // ring_idx is reduced modulo self.size, so it is always in bounds.
        // desc_head is asserted to be < self.size above.
        unsafe {
            let avail = &mut *self.avail;
            let ring_idx = avail.idx as usize % self.size as usize;
            avail.ring[ring_idx] = desc_head;

            // Write barrier: ensure the descriptor table writes and ring entry
            // write above are visible before we update the available index.
            atomic::fence(Ordering::Release);

            avail.idx = avail.idx.wrapping_add(1);
        }
    }

    /// Poll the used ring for a completed buffer.
    ///
    /// Returns `Some((chain_head_index, bytes_written))` if the device has
    /// returned a buffer, or `None` if no new completions are available.
    ///
    /// The caller should free the returned descriptor chain via `free_chain()`.
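    ///
    /// A typical completion check (sketch; assumes a request was previously
    /// submitted with `push_avail`):
    ///
    /// ```ignore
    /// if let Some((head, len)) = queue.poll_used() {
    ///     // The device wrote `len` bytes into the chain's writable buffers.
    ///     queue.free_chain(head);
    /// }
    /// ```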
    pub fn poll_used(&mut self) -> Option<(u16, u32)> {
        // Read barrier: ensure we see the device's writes to the used ring
        // before we read the index.
        atomic::fence(Ordering::Acquire);

        // SAFETY: self.used points to valid VirtqUsed memory we own.
        let used_idx = unsafe { (*self.used).idx };

        if self.last_used_idx == used_idx {
            return None;
        }

        let ring_idx = self.last_used_idx as usize % self.size as usize;
        // SAFETY: ring_idx is modular-reduced to within [0, size).
        let elem = unsafe { (*self.used).ring[ring_idx] };

        self.last_used_idx = self.last_used_idx.wrapping_add(1);

        Some((elem.id as u16, elem.len))
    }

    /// Check if any completions are pending without consuming them.
    pub fn has_used(&self) -> bool {
        atomic::fence(Ordering::Acquire);
        // SAFETY: self.used is valid.
        let used_idx = unsafe { (*self.used).idx };
        self.last_used_idx != used_idx
    }
}

impl Drop for VirtQueue {
    fn drop(&mut self) {
        // Return physical frames to the allocator
        let _ = FRAME_ALLOCATOR
            .lock()
            .free_frames(self.first_frame, self.num_frames);
    }
}

// SAFETY: VirtQueue manages raw pointers (`desc`, `avail`, `used`) to
// physically contiguous DMA memory that it owns exclusively. These pointers
// are not aliased by any other Rust object. Sending a VirtQueue to another
// thread is safe because the pointed-to memory is valid from allocation
// until Drop, and the virtio device accesses it via physical (not virtual)
// addresses.
//
// IMPORTANT: VirtQueue itself is NOT internally synchronized. Callers MUST
// hold it behind a Mutex (or equivalent) to prevent concurrent access.
// In VeridianOS, VirtioBlkDevice wraps VirtQueue in a spin::Mutex inside
// the global VIRTIO_BLK OnceLock.
unsafe impl Send for VirtQueue {}
// SAFETY: Shared references to VirtQueue are safe because the type is always
// accessed behind a Mutex (specifically, the global VIRTIO_BLK
// OnceLock<Mutex<VirtioBlkDevice>>). Only one thread can hold &mut VirtQueue
// at a time through the Mutex guard. The raw pointers themselves are stable
// (allocated once in new(), freed only in drop()) and never modified after
// construction.
//
// IMPORTANT: If VirtQueue is ever used outside a Mutex, this impl is UNSOUND.
// Do not remove the Mutex wrapper without replacing these impls with proper
// internal synchronization.
unsafe impl Sync for VirtQueue {}
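//
// For illustration, the intended wrapping looks roughly like this (the real
// definitions live in the virtio-blk driver; the `queue` field name here is
// hypothetical):
//
//     static VIRTIO_BLK: OnceLock<Mutex<VirtioBlkDevice>> = OnceLock::new();
//
//     let mut dev = VIRTIO_BLK.get().expect("virtio-blk not initialized").lock();
//     dev.queue.push_avail(head); // &mut VirtQueue obtained only via the guard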

/// Align `value` up to the next multiple of `align`.
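///
/// `align` must be a power of two for the mask arithmetic below to hold;
/// for example, `align_up(4612, 4096) == 8192`.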
fn align_up(value: usize, align: usize) -> usize {
    (value + align - 1) & !(align - 1)
}

/// Convert a physical address to a kernel-accessible virtual address.
///
/// On x86_64 with the bootloader's physical memory mapping, physical addresses
/// are accessible at `phys + physical_memory_offset`. On other architectures
/// or in early boot, low physical addresses may be identity-mapped.
fn phys_to_kernel_virt(phys: u64) -> usize {
    #[cfg(target_arch = "x86_64")]
    {
        // Try the bootloader's physical memory offset first.
        if let Some(virt) = crate::arch::x86_64::msr::phys_to_virt(phys as usize) {
            return virt;
        }
        // Fallback: assume physical memory is mapped at the fixed higher-half
        // offset.
        (phys + 0xFFFF_8000_0000_0000) as usize
    }

    #[cfg(not(target_arch = "x86_64"))]
    {
        // AArch64 and RISC-V: physical addresses are identity-mapped in the
        // kernel's address space during early boot.
        phys as usize
    }
}