⚠️ VeridianOS Kernel Documentation - This is low-level `no_std` kernel code. All functions are unsafe unless explicitly marked otherwise.

veridian_kernel/drivers/nvme.rs

//! NVMe (Non-Volatile Memory Express) Driver
//!
//! High-performance storage driver for NVMe SSDs using the BlockDevice trait.

// NVMe driver -- hardware register offsets per NVMe spec
#![allow(dead_code)]

use alloc::{vec, vec::Vec};
use core::sync::atomic::AtomicU16;

use crate::{error::KernelError, fs::blockdev::BlockDevice};

/// NVMe PCI vendor/device IDs
pub const NVME_VENDOR_INTEL: u16 = 0x8086;
pub const NVME_VENDOR_SAMSUNG: u16 = 0x144d;

/// NVMe register offsets
const REG_CAP: usize = 0x00; // Controller Capabilities
const REG_VS: usize = 0x08; // Version
const REG_CC: usize = 0x14; // Controller Configuration
const REG_CSTS: usize = 0x1C; // Controller Status
const REG_AQA: usize = 0x24; // Admin Queue Attributes
const REG_ASQ: usize = 0x28; // Admin Submission Queue
const REG_ACQ: usize = 0x30; // Admin Completion Queue

/// Controller Configuration bits
const CC_ENABLE: u32 = 1 << 0;
const CC_CSS_NVM: u32 = 0 << 4;
const CC_MPS_4K: u32 = 0 << 7;
const CC_AMS_RR: u32 = 0 << 11;
const CC_SHN_NONE: u32 = 0 << 14;
const CC_IOSQES: u32 = 6 << 16;
const CC_IOCQES: u32 = 4 << 20;

/// Controller Status bits
const CSTS_RDY: u32 = 1 << 0;
const CSTS_CFS: u32 = 1 << 1;

/// NVMe Admin Commands
const ADMIN_DELETE_SQ: u8 = 0x00;
const ADMIN_CREATE_SQ: u8 = 0x01;
const ADMIN_DELETE_CQ: u8 = 0x04;
const ADMIN_CREATE_CQ: u8 = 0x05;
const ADMIN_IDENTIFY: u8 = 0x06;
const ADMIN_SET_FEATURES: u8 = 0x09;

/// NVMe I/O Commands
const IO_READ: u8 = 0x02;
const IO_WRITE: u8 = 0x01;
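
/// Compute an NVMe doorbell register offset.
///
/// A small sketch, not called by the stub code below (which hard-codes the
/// DSTRD = 0 case): per the NVMe spec, submission queue `y`'s tail doorbell
/// lives at `0x1000 + (2y) * (4 << CAP.DSTRD)` and completion queue `y`'s
/// head doorbell at `0x1000 + (2y + 1) * (4 << CAP.DSTRD)`. The function
/// name is illustrative, not an existing kernel API.
#[inline]
fn doorbell_offset(queue_id: u16, is_completion: bool, dstrd: u8) -> usize {
    let stride = 4usize << dstrd;
    0x1000 + (2 * queue_id as usize + is_completion as usize) * stride
}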

/// Submission Queue Entry
#[repr(C)]
#[derive(Debug, Clone, Copy)]
struct SubmissionQueueEntry {
    opcode: u8,
    flags: u8,
    command_id: u16,
    nsid: u32,
    _reserved: u64,
    metadata: u64,
    prp1: u64,
    prp2: u64,
    cdw10: u32,
    cdw11: u32,
    cdw12: u32,
    cdw13: u32,
    cdw14: u32,
    cdw15: u32,
}

impl SubmissionQueueEntry {
    fn new() -> Self {
        Self {
            opcode: 0,
            flags: 0,
            command_id: 0,
            nsid: 0,
            _reserved: 0,
            metadata: 0,
            prp1: 0,
            prp2: 0,
            cdw10: 0,
            cdw11: 0,
            cdw12: 0,
            cdw13: 0,
            cdw14: 0,
            cdw15: 0,
        }
    }
}

/// Completion Queue Entry
#[repr(C)]
#[derive(Debug, Clone, Copy)]
struct CompletionQueueEntry {
    result: u32,
    _reserved: u32,
    sq_head: u16,
    sq_id: u16,
    command_id: u16,
    status: u16,
}

/// NVMe Queue Pair
struct QueuePair {
    /// Submission queue
    submission_queue: Vec<SubmissionQueueEntry>,

    /// Completion queue
    completion_queue: Vec<CompletionQueueEntry>,

    /// Submission queue tail (index of next free entry)
    sq_tail: AtomicU16,

    /// Completion queue head (index of next entry to process)
    cq_head: AtomicU16,

    /// Queue size
    queue_size: u16,
}

impl QueuePair {
    fn new(queue_size: u16) -> Self {
        Self {
            submission_queue: vec![SubmissionQueueEntry::new(); queue_size as usize],
            completion_queue: vec![
                CompletionQueueEntry {
                    result: 0,
                    _reserved: 0,
                    sq_head: 0,
                    sq_id: 0,
                    command_id: 0,
                    status: 0
                };
                queue_size as usize
            ],
            sq_tail: AtomicU16::new(0),
            cq_head: AtomicU16::new(0),
            queue_size,
        }
    }
}
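
/// Check whether a completion queue entry is newly posted.
///
/// A correctness sketch rather than wired-in logic: the stub polling below
/// tests `status & 1 != 0`, which is only valid before the queue's first
/// wrap. A real consumer keeps an expected-phase flag, toggles it each time
/// `cq_head` wraps to 0, and treats an entry as new only when the entry's
/// phase bit (bit 0 of `status`) matches that expectation.
fn cqe_is_new(entry: &CompletionQueueEntry, expected_phase: bool) -> bool {
    ((entry.status & 1) != 0) == expected_phase
}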

/// NVMe Controller
pub struct NvmeController {
    /// MMIO base address
    mmio_base: usize,

    /// Admin queue pair
    admin_queue: Option<QueuePair>,

    /// I/O queue pairs
    io_queues: Vec<QueuePair>,

    /// Number of namespaces
    num_namespaces: u32,

    /// Block size
    block_size: usize,

    /// Total blocks
    total_blocks: u64,
}

impl NvmeController {
    /// Create a new NVMe controller
    pub fn new(mmio_base: usize) -> Result<Self, KernelError> {
        let mut controller = Self {
            mmio_base,
            admin_queue: None,
            io_queues: Vec::new(),
            num_namespaces: 1,
            block_size: 512,
            total_blocks: 0,
        };

        controller.initialize()?;

        Ok(controller)
    }

    /// Read MMIO register
    fn read_reg(&self, offset: usize) -> u32 {
        // SAFETY: Reading an NVMe MMIO register at mmio_base + offset. The mmio_base
        // is the controller's BAR0 address from PCI configuration. read_volatile
        // ensures the compiler does not elide or reorder this hardware register
        // access.
        unsafe { core::ptr::read_volatile((self.mmio_base + offset) as *const u32) }
    }

    /// Write MMIO register
    fn write_reg(&self, offset: usize, value: u32) {
        // SAFETY: Writing an NVMe MMIO register. Same invariants as read_reg.
        unsafe { core::ptr::write_volatile((self.mmio_base + offset) as *mut u32, value) }
    }

    /// Read 64-bit MMIO register
    fn read_reg64(&self, offset: usize) -> u64 {
        // SAFETY: Reading a 64-bit NVMe MMIO register (e.g. CAP). Same invariants as
        // read_reg.
        unsafe { core::ptr::read_volatile((self.mmio_base + offset) as *const u64) }
    }

    /// Write 64-bit MMIO register
    fn write_reg64(&self, offset: usize, value: u64) {
        // SAFETY: Writing a 64-bit NVMe MMIO register. Same invariants as write_reg.
        unsafe { core::ptr::write_volatile((self.mmio_base + offset) as *mut u64, value) }
    }

    /// Initialize the NVMe controller
    fn initialize(&mut self) -> Result<(), KernelError> {
        println!(
            "[NVME] Initializing NVMe controller at 0x{:x}",
            self.mmio_base
        );

        // Read version
        let version = self.read_reg(REG_VS);
        let major = (version >> 16) & 0xFFFF;
        let minor = (version >> 8) & 0xFF;
        let tertiary = version & 0xFF;
        println!("[NVME] Version: {}.{}.{}", major, minor, tertiary);

        // Read capabilities.
        // CAP.MQES is 0-based; saturate so MQES = 0xFFFF does not wrap to 0.
        let cap = self.read_reg64(REG_CAP);
        let max_queue_entries = ((cap & 0xFFFF) + 1).min(u16::MAX as u64) as u16;
        println!("[NVME] Max queue entries: {}", max_queue_entries);

        // Disable controller
        self.write_reg(REG_CC, 0);

        // Wait for controller to be disabled
        let mut timeout = 1000;
        while (self.read_reg(REG_CSTS) & CSTS_RDY) != 0 && timeout > 0 {
            timeout -= 1;
        }

        if timeout == 0 {
            return Err(KernelError::HardwareError {
                device: "nvme",
                code: 1,
            });
        }

        // Create admin queue (stub - would need DMA allocation)
        let admin_queue_size = 64.min(max_queue_entries);
        self.admin_queue = Some(QueuePair::new(admin_queue_size));

        println!(
            "[NVME] Created admin queue with {} entries",
            admin_queue_size
        );

        // NOTE: Full initialization requires:
        // 1. DMA-capable memory allocation for queues
        // 2. Setting up admin queue physical addresses in ASQ/ACQ
        // 3. Configuring queue attributes in AQA
        // 4. Enabling the controller
        // 5. Creating I/O queues
        // 6. Identifying namespaces
        // (Steps 1-4 and 6 are implemented by initialize_nvme_controller below.)

        println!("[NVME] Controller initialized (stub - requires DMA)");

        Ok(())
    }

    /// Submit a command to the admin queue and poll for completion.
    ///
    /// Writes the command to the next available submission queue slot,
    /// rings the admin submission queue doorbell (offset 0x1000), and
    /// spins waiting for a matching completion queue entry.
    fn submit_admin_command(&mut self, cmd: SubmissionQueueEntry) -> Result<u32, KernelError> {
        let mmio = self.mmio_base;
        let queue = self
            .admin_queue
            .as_mut()
            .ok_or(KernelError::NotInitialized {
                subsystem: "NVMe admin queue",
            })?;

        let tail = queue.sq_tail.load(core::sync::atomic::Ordering::Relaxed);
        let idx = tail as usize % queue.queue_size as usize;

        // Write command to submission queue.
        queue.submission_queue[idx] = cmd;

        // Advance tail.
        let new_tail = (tail + 1) % queue.queue_size;
        queue
            .sq_tail
            .store(new_tail, core::sync::atomic::Ordering::Release);

        // Ring admin SQ doorbell (offset 0x1000 for queue 0).
        // SAFETY: MMIO write to NVMe doorbell register.
        unsafe {
            core::ptr::write_volatile((mmio + 0x1000) as *mut u32, new_tail as u32);
        }

        // Poll completion queue for response (admin queue = CQ 0).
        let cq_head = queue.cq_head.load(core::sync::atomic::Ordering::Relaxed);
        let cq_idx = cq_head as usize % queue.queue_size as usize;

        let mut timeout = 100_000u32;
        loop {
            let status = queue.completion_queue[cq_idx].status;
            // Phase bit check: completion entries toggle phase on wrap.
            if status & 1 != 0 || timeout == 0 {
                break;
            }
            timeout -= 1;
            core::hint::spin_loop();
        }

        if timeout == 0 {
            return Err(KernelError::Timeout {
                operation: "NVMe admin command",
                duration_ms: 100,
            });
        }

        let result = queue.completion_queue[cq_idx].result;
        let new_head = (cq_head + 1) % queue.queue_size;
        queue
            .cq_head
            .store(new_head, core::sync::atomic::Ordering::Release);

        // Ring admin CQ doorbell (offset 0x1000 + 1 * doorbell_stride for CQ 0).
        // SAFETY: MMIO write to NVMe doorbell register.
        unsafe {
            core::ptr::write_volatile((mmio + 0x1004) as *mut u32, new_head as u32);
        }

        Ok(result)
    }

    /// Create an I/O queue pair.
    ///
    /// Sends Create I/O Completion Queue and Create I/O Submission Queue
    /// admin commands to set up an I/O queue for block operations.
    fn create_io_queue(&mut self, queue_id: u16, queue_size: u16) -> Result<(), KernelError> {
        let qp = QueuePair::new(queue_size);

        // Create I/O Completion Queue (admin opcode 0x05).
        let mut cq_cmd = SubmissionQueueEntry::new();
        cq_cmd.opcode = ADMIN_CREATE_CQ;
        cq_cmd.cdw10 = ((queue_size as u32 - 1) << 16) | queue_id as u32;
        cq_cmd.cdw11 = 1; // physically contiguous (PC=1); IEN=0, interrupts not enabled
        let _ = self.submit_admin_command(cq_cmd)?;

        // Create I/O Submission Queue (admin opcode 0x01).
        let mut sq_cmd = SubmissionQueueEntry::new();
        sq_cmd.opcode = ADMIN_CREATE_SQ;
        sq_cmd.cdw10 = ((queue_size as u32 - 1) << 16) | queue_id as u32;
        sq_cmd.cdw11 = (queue_id as u32) << 16 | 1; // CQ ID + physically contiguous
        let _ = self.submit_admin_command(sq_cmd)?;

        self.io_queues.push(qp);
        println!(
            "[NVME] Created I/O queue pair {} (size={})",
            queue_id, queue_size
        );
        Ok(())
    }

    /// Submit an I/O read command to the specified queue.
    fn submit_io_read(
        &self,
        queue_idx: usize,
        start_lba: u64,
        num_blocks: u16,
        prp1: u64,
    ) -> Result<(), KernelError> {
        if queue_idx >= self.io_queues.len() {
            return Err(KernelError::InvalidArgument {
                name: "queue_idx",
                value: "exceeds number of I/O queues",
            });
        }

        let queue = &self.io_queues[queue_idx];
        let tail = queue.sq_tail.load(core::sync::atomic::Ordering::Relaxed);
        let idx = tail as usize % queue.queue_size as usize;

        let mut cmd = SubmissionQueueEntry::new();
        cmd.opcode = IO_READ;
        cmd.nsid = 1; // Namespace 1
        cmd.prp1 = prp1;
        cmd.cdw10 = (start_lba & 0xFFFF_FFFF) as u32;
        cmd.cdw11 = (start_lba >> 32) as u32;
        cmd.cdw12 = (num_blocks - 1) as u32; // 0-based count

        // SAFETY: We own this queue slot exclusively via the atomic tail index.
        // No other code writes to submission_queue[idx] until we advance the tail.
        unsafe {
            let sq_ptr = queue.submission_queue.as_ptr() as *mut SubmissionQueueEntry;
            core::ptr::write(sq_ptr.add(idx), cmd);
        }

        let new_tail = (tail + 1) % queue.queue_size;
        queue
            .sq_tail
            .store(new_tail, core::sync::atomic::Ordering::Release);

        // Ring I/O SQ doorbell: offset 0x1000 + (2 * queue_id) * doorbell_stride.
        let sq_doorbell = 0x1000 + (2 * (queue_idx + 1)) * 4;
        self.write_reg(sq_doorbell, new_tail as u32);

        Ok(())
    }

    /// Read blocks using the first I/O queue.
    fn read_blocks_internal(&self, start_block: u64, buffer: &mut [u8]) -> Result<(), KernelError> {
        if self.io_queues.is_empty() {
            // No I/O queues initialized -- return zeros (stub behavior).
            buffer.fill(0);
            return Ok(());
        }

        let num_blocks = (buffer.len() / self.block_size) as u16;

        // For actual DMA, we would allocate a DMA buffer, submit the command
        // with the DMA physical address as PRP1, wait for completion, then
        // copy from the DMA buffer to the user buffer. Since DMA buffer
        // allocation is done via iommu::alloc_dma_buffer(), we use a stub
        // PRP address of 0 which won't transfer real data.
        let _ = self.submit_io_read(0, start_block, num_blocks, 0);

        // Poll for completion on the first I/O queue.
        let queue = &self.io_queues[0];
        let mut timeout = 100_000u32;
        let cq_head = queue.cq_head.load(core::sync::atomic::Ordering::Relaxed);
        let cq_idx = cq_head as usize % queue.queue_size as usize;

        loop {
            if queue.completion_queue[cq_idx].status & 1 != 0 || timeout == 0 {
                break;
            }
            timeout -= 1;
            core::hint::spin_loop();
        }

        // Advance CQ head and ring doorbell.
        let new_head = (cq_head + 1) % queue.queue_size;
        queue
            .cq_head
            .store(new_head, core::sync::atomic::Ordering::Release);
        let cq_doorbell = 0x1000 + 3 * 4; // CQ 1 doorbell = offset 0x100C
        self.write_reg(cq_doorbell, new_head as u32);

        Ok(())
    }

    /// Write blocks using the first I/O queue.
    fn write_blocks_internal(
        &mut self,
        start_block: u64,
        buffer: &[u8],
    ) -> Result<(), KernelError> {
        if self.io_queues.is_empty() {
            return Ok(());
        }

        let num_blocks = (buffer.len() / self.block_size) as u16;
        let queue = &self.io_queues[0];
        let tail = queue.sq_tail.load(core::sync::atomic::Ordering::Relaxed);
        let idx = tail as usize % queue.queue_size as usize;

        // Build I/O Write command.
        let mut cmd = SubmissionQueueEntry::new();
        cmd.opcode = IO_WRITE;
        cmd.nsid = 1;
        cmd.prp1 = 0; // Would be DMA phys addr
        cmd.cdw10 = (start_block & 0xFFFF_FFFF) as u32;
        cmd.cdw11 = (start_block >> 32) as u32;
        cmd.cdw12 = (num_blocks - 1) as u32;

        // SAFETY: Exclusive access via atomic tail index.
        unsafe {
            let sq_ptr = queue.submission_queue.as_ptr() as *mut SubmissionQueueEntry;
            core::ptr::write(sq_ptr.add(idx), cmd);
        }

        let new_tail = (tail + 1) % queue.queue_size;
        queue
            .sq_tail
            .store(new_tail, core::sync::atomic::Ordering::Release);

        // Ring I/O SQ doorbell.
        let sq_doorbell = 0x1000 + 2 * 4; // SQ 1 doorbell = offset 0x1008
        self.write_reg(sq_doorbell, new_tail as u32);

        // Poll for completion.
        let cq_head = queue.cq_head.load(core::sync::atomic::Ordering::Relaxed);
        let cq_idx = cq_head as usize % queue.queue_size as usize;
        let mut timeout = 100_000u32;
        loop {
            if queue.completion_queue[cq_idx].status & 1 != 0 || timeout == 0 {
                break;
            }
            timeout -= 1;
            core::hint::spin_loop();
        }

        let new_head = (cq_head + 1) % queue.queue_size;
        queue
            .cq_head
            .store(new_head, core::sync::atomic::Ordering::Release);
        let cq_doorbell = 0x1000 + 3 * 4; // CQ 1 doorbell
        self.write_reg(cq_doorbell, new_head as u32);

        Ok(())
    }
}

impl BlockDevice for NvmeController {
    fn name(&self) -> &str {
        "nvme0"
    }

    fn block_size(&self) -> usize {
        self.block_size
    }

    fn block_count(&self) -> u64 {
        self.total_blocks
    }

    fn read_blocks(&self, start_block: u64, buffer: &mut [u8]) -> Result<(), KernelError> {
        if !buffer.len().is_multiple_of(self.block_size) {
            return Err(KernelError::InvalidArgument {
                name: "buffer_length",
                value: "not_multiple_of_block_size",
            });
        }

        self.read_blocks_internal(start_block, buffer)
    }

    fn write_blocks(&mut self, start_block: u64, buffer: &[u8]) -> Result<(), KernelError> {
        if !buffer.len().is_multiple_of(self.block_size) {
            return Err(KernelError::InvalidArgument {
                name: "buffer_length",
                value: "not_multiple_of_block_size",
            });
        }

        self.write_blocks_internal(start_block, buffer)
    }

    fn flush(&mut self) -> Result<(), KernelError> {
        // NVMe flush command would go here
        Ok(())
    }
}
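
// A minimal usage sketch, assuming a constructed controller: reads the first
// block through the BlockDevice trait. The buffer length must be a multiple
// of the device block size or read_blocks returns InvalidArgument; a
// one-block buffer trivially satisfies that. The function name is
// illustrative and nothing else calls it.
fn example_read_first_block(ctrl: &NvmeController) -> Result<(), KernelError> {
    let mut buf = vec![0u8; ctrl.block_size()];
    ctrl.read_blocks(0, &mut buf)
}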

/// NVMe PCI subclass code.
const NVME_SUBCLASS: u8 = 0x08;

/// Admin queue size (64 entries, clamped to the controller's CAP.MQES at
/// initialization).
const ADMIN_QUEUE_SIZE: u16 = 64;

/// Timeout iterations for controller ready polling.
const CONTROLLER_READY_TIMEOUT: u32 = 500_000;

/// NVMe Identify Controller data offsets.
const IDENT_SERIAL_OFFSET: usize = 4;
const IDENT_SERIAL_LEN: usize = 20;
const IDENT_MODEL_OFFSET: usize = 24;
const IDENT_MODEL_LEN: usize = 40;
const IDENT_FIRMWARE_OFFSET: usize = 64;
const IDENT_FIRMWARE_LEN: usize = 8;
const IDENT_MDTS_OFFSET: usize = 77;
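
/// Extract a space-padded ASCII field from Identify Controller data.
///
/// An illustrative helper, not called by the parsing code below (which reads
/// the fields inline through raw pointers): given the 4 KiB identify buffer
/// as a slice plus one of the offset/length constants above, returns the
/// trimmed field, or None if it is out of bounds or not valid UTF-8.
fn ident_ascii_field(data: &[u8], offset: usize, len: usize) -> Option<&str> {
    data.get(offset..offset + len)
        .and_then(|bytes| core::str::from_utf8(bytes).ok())
        .map(str::trim_end)
}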

/// Initialize an NVMe controller found at the given BAR0 physical address.
///
/// Performs the full NVMe initialization sequence:
/// 1. Map BAR0 into kernel virtual address space
/// 2. Reset the controller (CC.EN=0, wait CSTS.RDY=0)
/// 3. Allocate admin submission/completion queues via frame allocator
/// 4. Program AQA, ASQ, ACQ registers
/// 5. Enable controller (CC.EN=1), wait for CSTS.RDY=1
/// 6. Issue Identify Controller command to read device metadata
#[cfg(target_arch = "x86_64")]
fn initialize_nvme_controller(bar0_phys: u64) -> Result<(), KernelError> {
    use crate::mm::{phys_to_virt_addr, FRAME_ALLOCATOR, FRAME_SIZE};

    // Step 1: Map BAR0 MMIO region into kernel virtual space.
    let mmio_base = phys_to_virt_addr(bar0_phys) as usize;
    println!(
        "[NVME] MMIO base: phys={:#x} virt={:#x}",
        bar0_phys, mmio_base
    );

    // Helper: read 32-bit MMIO register.
    let read32 = |offset: usize| -> u32 {
        // SAFETY: Reading NVMe MMIO register. mmio_base is the BAR0 address
        // mapped through the kernel direct-map (phys + PHYS_MEM_OFFSET).
        // All offsets are within the NVMe register space (< 0x1000).
        unsafe { core::ptr::read_volatile((mmio_base + offset) as *const u32) }
    };

    // Helper: write 32-bit MMIO register.
    let write32 = |offset: usize, value: u32| {
        // SAFETY: Writing NVMe MMIO register. Same invariants as read32.
        unsafe { core::ptr::write_volatile((mmio_base + offset) as *mut u32, value) }
    };

    // Helper: write 64-bit MMIO register (used for ASQ/ACQ base addresses).
    let write64 = |offset: usize, value: u64| {
        // SAFETY: Writing 64-bit NVMe MMIO register (ASQ/ACQ base address).
        // The register pair is naturally aligned at offsets 0x28 and 0x30.
        unsafe { core::ptr::write_volatile((mmio_base + offset) as *mut u64, value) }
    };

    // Read controller version.
    let version = read32(REG_VS);
    let ver_major = (version >> 16) & 0xFFFF;
    let ver_minor = (version >> 8) & 0xFF;
    println!("[NVME] Controller version: {}.{}", ver_major, ver_minor);

    // Read capabilities to determine max queue entries supported.
    let cap_lo = read32(REG_CAP) as u64;
    let cap_hi = read32(REG_CAP + 4) as u64;
    let cap = cap_lo | (cap_hi << 32);
    // CAP.MQES is 0-based; saturate so MQES = 0xFFFF does not wrap to 0.
    let mqes = ((cap & 0xFFFF) + 1).min(u16::MAX as u64) as u16;
    let admin_qsize = ADMIN_QUEUE_SIZE.min(mqes);
    println!(
        "[NVME] CAP={:#018x}, MQES={}, using admin queue size={}",
        cap, mqes, admin_qsize
    );

    // Step 2: Disable controller (CC.EN=0).
    write32(REG_CC, 0);

    // Wait for CSTS.RDY to clear.
    let mut timeout = CONTROLLER_READY_TIMEOUT;
    while (read32(REG_CSTS) & CSTS_RDY) != 0 {
        if timeout == 0 {
            println!("[NVME] Timeout waiting for controller disable");
            return Err(KernelError::Timeout {
                operation: "NVMe controller disable",
                duration_ms: 500,
            });
        }
        timeout -= 1;
        core::hint::spin_loop();
    }
    println!("[NVME] Controller disabled");

    // Step 3: Allocate physically contiguous memory for admin queues.
    // Admin Submission Queue: 64 entries x 64 bytes = 4096 bytes = 1 frame.
    // Admin Completion Queue: 64 entries x 16 bytes = 1024 bytes, rounded up
    // to 1 frame.
    let asq_frame = FRAME_ALLOCATOR
        .lock()
        .allocate_frames(1, None)
        .map_err(|_| KernelError::OutOfMemory {
            requested: FRAME_SIZE,
            available: 0,
        })?;
    let acq_frame = FRAME_ALLOCATOR
        .lock()
        .allocate_frames(1, None)
        .map_err(|_| KernelError::OutOfMemory {
            requested: FRAME_SIZE,
            available: 0,
        })?;

    let asq_phys = asq_frame.as_u64() * FRAME_SIZE as u64;
    let acq_phys = acq_frame.as_u64() * FRAME_SIZE as u64;

    // Zero the queue memory.
    let asq_virt = phys_to_virt_addr(asq_phys) as *mut u8;
    let acq_virt = phys_to_virt_addr(acq_phys) as *mut u8;
    // SAFETY: Writing to freshly allocated frames mapped via kernel direct-map.
    // Each frame is FRAME_SIZE (4096) bytes. We zero the entire frame.
    unsafe {
        core::ptr::write_bytes(asq_virt, 0, FRAME_SIZE);
        core::ptr::write_bytes(acq_virt, 0, FRAME_SIZE);
    }

    println!(
        "[NVME] Admin queues allocated: ASQ phys={:#x}, ACQ phys={:#x}",
        asq_phys, acq_phys
    );

    // Step 4: Program admin queue registers.
    // AQA: Admin Queue Attributes -- ACQ size in bits [27:16], ASQ size in
    // bits [11:0]. Sizes are 0-based (value N means N+1 entries); both
    // queues use admin_qsize here, so the two fields carry the same value.
    let aqa = (((admin_qsize - 1) as u32) << 16) | ((admin_qsize - 1) as u32);
    write32(REG_AQA, aqa);

    // ASQ: Admin Submission Queue base address (physical, page-aligned).
    write64(REG_ASQ, asq_phys);

    // ACQ: Admin Completion Queue base address (physical, page-aligned).
    write64(REG_ACQ, acq_phys);

    // Step 5: Enable controller.
    // CC register: EN=1, CSS=0 (NVM), MPS=0 (4KB pages), IOSQES=6 (64B), IOCQES=4
    // (16B).
    let cc_value =
        CC_ENABLE | CC_CSS_NVM | CC_MPS_4K | CC_AMS_RR | CC_SHN_NONE | CC_IOSQES | CC_IOCQES;
    write32(REG_CC, cc_value);
    println!("[NVME] Controller enable: CC={:#010x}", cc_value);

    // Wait for CSTS.RDY to assert.
    timeout = CONTROLLER_READY_TIMEOUT;
    loop {
        let csts = read32(REG_CSTS);
        if (csts & CSTS_RDY) != 0 {
            break;
        }
        if (csts & CSTS_CFS) != 0 {
            println!("[NVME] Controller fatal status during enable");
            // Free allocated frames before returning error.
            let _ = FRAME_ALLOCATOR.lock().free_frames(asq_frame, 1);
            let _ = FRAME_ALLOCATOR.lock().free_frames(acq_frame, 1);
            return Err(KernelError::HardwareError {
                device: "nvme",
                code: 2,
            });
        }
        if timeout == 0 {
            println!("[NVME] Timeout waiting for controller ready");
            let _ = FRAME_ALLOCATOR.lock().free_frames(asq_frame, 1);
            let _ = FRAME_ALLOCATOR.lock().free_frames(acq_frame, 1);
            return Err(KernelError::Timeout {
                operation: "NVMe controller enable",
                duration_ms: 500,
            });
        }
        timeout -= 1;
        core::hint::spin_loop();
    }
    println!("[NVME] Controller ready");

    // Step 6: Issue Identify Controller command (opcode 0x06, CNS=1).
    // Allocate a frame for the 4KB identify data buffer.
    let ident_frame = FRAME_ALLOCATOR
        .lock()
        .allocate_frames(1, None)
        .map_err(|_| KernelError::OutOfMemory {
            requested: FRAME_SIZE,
            available: 0,
        })?;
    let ident_phys = ident_frame.as_u64() * FRAME_SIZE as u64;
    let ident_virt = phys_to_virt_addr(ident_phys) as *mut u8;

    // SAFETY: Zeroing freshly allocated identify data frame.
    unsafe {
        core::ptr::write_bytes(ident_virt, 0, FRAME_SIZE);
    }

    // Build Identify Controller submission queue entry.
    let asq_entries = asq_virt as *mut SubmissionQueueEntry;
    let identify_cmd = SubmissionQueueEntry {
        opcode: ADMIN_IDENTIFY,
        flags: 0,
        command_id: 1,
        nsid: 0,
        _reserved: 0,
        metadata: 0,
        prp1: ident_phys, // PRP1 points to identify data buffer
        prp2: 0,
        cdw10: 1, // CNS=1: Identify Controller
        cdw11: 0,
        cdw12: 0,
        cdw13: 0,
        cdw14: 0,
        cdw15: 0,
    };

    // Write command to ASQ slot 0.
    // SAFETY: asq_entries points to the zeroed admin submission queue frame.
    // Slot 0 is within bounds (admin_qsize >= 1). The queue memory is
    // 4KB-aligned and large enough for 64 entries of 64 bytes each.
    unsafe {
        core::ptr::write_volatile(asq_entries, identify_cmd);
    }

    // Ring admin SQ doorbell (queue 0 SQ doorbell is at offset 0x1000).
    write32(0x1000, 1); // Tail = 1 (we wrote entry at index 0)

    // Poll admin completion queue for response.
    let acq_entries = acq_virt as *const CompletionQueueEntry;
    timeout = CONTROLLER_READY_TIMEOUT;
    loop {
        // SAFETY: Reading completion queue entry 0 from the ACQ frame.
        let cqe = unsafe { core::ptr::read_volatile(acq_entries) };
        // Phase bit (bit 0 of status) toggles on each wrap. Initially 0,
        // so the first valid completion has phase bit = 1.
        if (cqe.status & 1) != 0 {
            // Check for error in status field (bits 1-15).
            let status_code = (cqe.status >> 1) & 0x7FFF;
            if status_code != 0 {
                println!(
                    "[NVME] Identify Controller failed: status={:#x}",
                    status_code
                );
            } else {
                // Parse identify data.
                // SAFETY: ident_virt points to a 4KB frame filled by the
                // controller via DMA. Offsets are within the 4KB page and
                // we only read byte slices, so alignment is not an issue.
                unsafe {
                    // Serial number (bytes 4-23, ASCII, space-padded)
                    let sn_ptr = ident_virt.add(IDENT_SERIAL_OFFSET);
                    let sn_slice = core::slice::from_raw_parts(sn_ptr, IDENT_SERIAL_LEN);
                    if let Ok(sn) = core::str::from_utf8(sn_slice) {
                        println!("[NVME] Serial:   {}", sn.trim_end());
                    }

                    // Model number (bytes 24-63, ASCII, space-padded)
                    let mn_ptr = ident_virt.add(IDENT_MODEL_OFFSET);
                    let mn_slice = core::slice::from_raw_parts(mn_ptr, IDENT_MODEL_LEN);
                    if let Ok(mn) = core::str::from_utf8(mn_slice) {
                        println!("[NVME] Model:    {}", mn.trim_end());
                    }

                    // Firmware revision (bytes 64-71, ASCII)
                    let fr_ptr = ident_virt.add(IDENT_FIRMWARE_OFFSET);
                    let fr_slice = core::slice::from_raw_parts(fr_ptr, IDENT_FIRMWARE_LEN);
                    if let Ok(fr) = core::str::from_utf8(fr_slice) {
                        println!("[NVME] Firmware: {}", fr.trim_end());
                    }

                    // MDTS: Maximum Data Transfer Size (byte 77).
                    // Value N means max transfer = 2^N * min memory page size.
                    // 0 means no limit reported.
                    let mdts = *ident_virt.add(IDENT_MDTS_OFFSET);
                    if mdts > 0 {
                        let max_transfer = 1u64 << (12 + mdts as u64); // 4KB * 2^MDTS
                        println!(
                            "[NVME] MDTS:     {} (max transfer {} bytes)",
                            mdts, max_transfer
                        );
                    } else {
                        println!("[NVME] MDTS:     0 (no limit)");
                    }
                }
            }
            break;
        }
        if timeout == 0 {
            println!("[NVME] Timeout waiting for Identify Controller completion");
            break;
        }
        timeout -= 1;
        core::hint::spin_loop();
    }

    // Ring admin CQ doorbell (offset 0x1004 for CQ 0).
    write32(0x1004, 1); // Head = 1

    // Free the identify data buffer.
    let _ = FRAME_ALLOCATOR.lock().free_frames(ident_frame, 1);

    println!("[NVME] Admin queue initialization complete");
    Ok(())
}

/// Detect and initialize NVMe devices via PCI bus enumeration.
///
/// Scans the PCI bus for Mass Storage controllers with NVMe subclass
/// (class 0x01, subclass 0x08). On QEMU without NVMe devices, this
/// will simply report no devices found.
pub fn init() -> Result<(), KernelError> {
    println!("[NVME] Scanning PCI bus for NVMe controllers...");

    #[cfg(target_arch = "x86_64")]
    {
        let pci_bus = crate::drivers::pci::get_pci_bus().lock();
        let storage_devices =
            pci_bus.find_devices_by_class(crate::drivers::pci::class_codes::MASS_STORAGE);

        let mut nvme_count = 0;
        for dev in &storage_devices {
            if dev.subclass == NVME_SUBCLASS {
                nvme_count += 1;
                println!(
                    "[NVME] Found NVMe controller: {:04x}:{:04x} at {}:{}.{}",
                    dev.vendor_id,
                    dev.device_id,
                    dev.location.bus,
                    dev.location.device,
                    dev.location.function,
                );

                // Report BAR0 (NVMe MMIO register space)
                if let Some(bar) = dev.bars.first() {
                    match bar {
                        crate::drivers::pci::PciBar::Memory { address, size, .. } => {
                            println!("[NVME]   BAR0: MMIO at {:#x}, size {:#x}", address, size);
                        }
                        crate::drivers::pci::PciBar::Io { address, size } => {
                            println!("[NVME]   BAR0: I/O at {:#x}, size {:#x}", address, size);
                        }
                        crate::drivers::pci::PciBar::None => {}
                    }
                }

                // Full NVMe initialization: map BAR0, set up admin queues,
                // enable controller, and identify.
                if let Some(bar0_phys) = dev.bars.first().and_then(|b| b.get_memory_address()) {
                    if let Err(e) = initialize_nvme_controller(bar0_phys) {
                        println!("[NVME] Controller init failed: {:?}", e);
                    }
                }
            }
        }

        if nvme_count == 0 {
            println!("[NVME] No NVMe controllers found on PCI bus");
        } else {
            println!("[NVME] Found {} NVMe controller(s)", nvme_count);
        }
    }

    #[cfg(not(target_arch = "x86_64"))]
    {
        println!("[NVME] NVMe PCI scanning not available on this architecture");
    }

    Ok(())
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_submission_queue_entry_size() {
        assert_eq!(core::mem::size_of::<SubmissionQueueEntry>(), 64);
    }

    #[test]
    fn test_completion_queue_entry_size() {
        assert_eq!(core::mem::size_of::<CompletionQueueEntry>(), 16);
    }
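
    // Sketch-level checks: verify the CC enable value composed in
    // initialize_nvme_controller and the illustrative doorbell_offset
    // helper against the offsets hard-coded elsewhere in this driver.

    #[test]
    fn test_cc_value_composition() {
        // EN=1, IOSQES=6 (64-byte SQ entries), IOCQES=4 (16-byte CQ entries);
        // every other field zero.
        let cc =
            CC_ENABLE | CC_CSS_NVM | CC_MPS_4K | CC_AMS_RR | CC_SHN_NONE | CC_IOSQES | CC_IOCQES;
        assert_eq!(cc, 0x0046_0001);
    }

    #[test]
    fn test_doorbell_offsets_dstrd0() {
        // With DSTRD = 0: SQ0 tail at 0x1000, CQ0 head at 0x1004,
        // SQ1 tail at 0x1008, CQ1 head at 0x100C.
        assert_eq!(doorbell_offset(0, false, 0), 0x1000);
        assert_eq!(doorbell_offset(0, true, 0), 0x1004);
        assert_eq!(doorbell_offset(1, false, 0), 0x1008);
        assert_eq!(doorbell_offset(1, true, 0), 0x100C);
    }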
}