When irqfd is enabled, we bypass QEMU's irq emulation and let KVM
assert the irq directly. However, KVM is not aware of the device's MSI-X
masking status. Add MSI-X mask bookkeeping to the NVMe emulation and
detach the corresponding irqfd when its vector is masked.

Signed-off-by: Jinhao Fan <fanjinhao...@ict.ac.cn>
---
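A rough sketch of the callback flow, for review convenience only (all names
below are the ones introduced by this patch in ctrl.c; the mask/unmask entry
points are invoked by the MSI-X core when the guest toggles a vector's mask
bit):

    /* Registered once from nvme_init_pci() when irqfd can be used: */
    msix_set_vector_notifiers(pci_dev,
                              nvme_vector_unmask, /* vector unmasked by guest */
                              nvme_vector_mask,   /* vector masked by guest   */
                              nvme_vector_poll);  /* sync pending bits while masked */

    /*
     * While a vector is masked, its irqfd is detached so KVM stops asserting
     * it directly. Completions still signal cq->assert_notifier;
     * nvme_vector_poll() folds any such signal into the MSI-X pending bit,
     * and nvme_vector_unmask() re-attaches the irqfd (updating the MSI route
     * first if the guest rewrote the vector's address/data).
     */
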
 hw/nvme/ctrl.c       | 82 ++++++++++++++++++++++++++++++++++++++++++++
 hw/nvme/nvme.h       |  2 ++
 hw/nvme/trace-events |  3 ++
 3 files changed, 87 insertions(+)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 74075f782f..30bbda7bb5 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -7493,10 +7493,84 @@ static int nvme_add_pm_capability(PCIDevice *pci_dev, uint8_t offset)
 
     return 0;
 }
+static int nvme_vector_unmask(PCIDevice *pci_dev, unsigned vector,
+                               MSIMessage msg)
+{
+    NvmeCtrl *n = NVME(pci_dev);
+    int ret;
+
+    trace_pci_nvme_irq_unmask(vector, msg.address, msg.data);
+
+    for (uint32_t i = 1; i <= n->params.max_ioqpairs; i++) {
+        NvmeCQueue *cq = n->cq[i];
+        /*
+         * If this function is called, then irqfd must be available. Therefore,
+         * irqfd must be in use if cq->assert_notifier.initialized is true.
+         */
+        if (cq && cq->vector == vector && cq->assert_notifier.initialized) {
+            if (cq->msg.data != msg.data || cq->msg.address != msg.address) {
+                ret = kvm_irqchip_update_msi_route(kvm_state, cq->virq, msg,
+                                                   pci_dev);
+                if (ret < 0) {
+                    return ret;
+                }
+                kvm_irqchip_commit_routes(kvm_state);
+                cq->msg = msg;
+            }
+
+            ret = kvm_irqchip_add_irqfd_notifier_gsi(kvm_state,
+                                                     &cq->assert_notifier,
+                                                     NULL, cq->virq);
+            if (ret < 0) {
+                return ret;
+            }
+        }
+    }
+
+    return 0;
+}
+
+static void nvme_vector_mask(PCIDevice *pci_dev, unsigned vector)
+{
+    NvmeCtrl *n = NVME(pci_dev);
+
+    trace_pci_nvme_irq_mask(vector);
+
+    for (uint32_t i = 1; i <= n->params.max_ioqpairs; i++) {
+        NvmeCQueue *cq = n->cq[i];
+        if (cq && cq->vector == vector && cq->assert_notifier.initialized) {
+            kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state,
+                                                  &cq->assert_notifier,
+                                                  cq->virq);
+        }
+    }
+}
+
+static void nvme_vector_poll(PCIDevice *pci_dev,
+                             unsigned int vector_start,
+                             unsigned int vector_end)
+{
+    NvmeCtrl *n = NVME(pci_dev);
+
+    trace_pci_nvme_irq_poll(vector_start, vector_end);
+
+    for (uint32_t i = 1; i <= n->params.max_ioqpairs; i++) {
+        NvmeCQueue *cq = n->cq[i];
+        if (cq && cq->vector >= vector_start && cq->vector <= vector_end &&
+            msix_is_masked(pci_dev, cq->vector) &&
+            cq->assert_notifier.initialized) {
+            if (event_notifier_test_and_clear(&cq->assert_notifier)) {
+                msix_set_pending(pci_dev, cq->vector);
+            }
+        }
+    }
+}
 
 static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
 {
     uint8_t *pci_conf = pci_dev->config;
+    bool with_irqfd = msix_enabled(&n->parent_obj) &&
+                      kvm_msi_via_irqfd_enabled();
     uint64_t bar_size;
     unsigned msix_table_offset, msix_pba_offset;
     int ret;
@@ -7549,6 +7623,13 @@ static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
         }
     }
 
+    if (with_irqfd) {
+        msix_set_vector_notifiers(pci_dev,
+                                  nvme_vector_unmask,
+                                  nvme_vector_mask,
+                                  nvme_vector_poll);
+    }
+
     nvme_update_msixcap_ts(pci_dev, n->conf_msix_qsize);
 
     if (n->params.cmb_size_mb) {
@@ -7796,6 +7877,7 @@ static void nvme_exit(PCIDevice *pci_dev)
         pcie_sriov_pf_exit(pci_dev);
     }
 
+    msix_unset_vector_notifiers(pci_dev);
     msix_uninit(pci_dev, &n->bar0, &n->bar0);
     memory_region_del_subregion(&n->bar0, &n->iomem);
 }
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 85fd9cd0e2..707a55ebfc 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -20,6 +20,7 @@
 
 #include "qemu/uuid.h"
 #include "hw/pci/pci.h"
+#include "hw/pci/msi.h"
 #include "hw/block/block.h"
 
 #include "block/nvme.h"
@@ -401,6 +402,7 @@ typedef struct NvmeCQueue {
     EventNotifier notifier;
     EventNotifier assert_notifier;
     EventNotifier deassert_notifier;
+    MSIMessage  msg;
     bool        first_io_cqe;
     bool        ioeventfd_enabled;
     QTAILQ_HEAD(, NvmeSQueue) sq_list;
diff --git a/hw/nvme/trace-events b/hw/nvme/trace-events
index fccb79f489..b11fcf4a65 100644
--- a/hw/nvme/trace-events
+++ b/hw/nvme/trace-events
@@ -2,6 +2,9 @@
 pci_nvme_irq_msix(uint32_t vector) "raising MSI-X IRQ vector %u"
 pci_nvme_irq_pin(void) "pulsing IRQ pin"
 pci_nvme_irq_masked(void) "IRQ is masked"
+pci_nvme_irq_mask(uint32_t vector) "IRQ %u gets masked"
+pci_nvme_irq_unmask(uint32_t vector, uint64_t addr, uint32_t data) "IRQ %u gets unmasked, addr=0x%"PRIx64" data=0x%"PRIx32""
+pci_nvme_irq_poll(uint32_t vector_start, uint32_t vector_end) "IRQ poll, start=%"PRIu32" end=%"PRIu32""
 pci_nvme_dma_read(uint64_t prp1, uint64_t prp2) "DMA read, prp1=0x%"PRIx64" prp2=0x%"PRIx64""
 pci_nvme_dbbuf_config(uint64_t dbs_addr, uint64_t eis_addr) "dbs_addr=0x%"PRIx64" eis_addr=0x%"PRIx64""
 pci_nvme_map_addr(uint64_t addr, uint64_t len) "addr 0x%"PRIx64" len %"PRIu64""
-- 
2.25.1

