When irqfd is enabled, we bypass QEMU's irq emulation and let KVM to directly assert the irq. However, KVM is not aware of the device's MSI-x masking status. Add MSI-x mask bookkeeping in NVMe emulation and detach the corresponding irqfd when the certain vector is masked.
Signed-off-by: Jinhao Fan <fanjinhao...@ict.ac.cn> --- hw/nvme/ctrl.c | 82 ++++++++++++++++++++++++++++++++++++++++++++ hw/nvme/nvme.h | 2 ++ hw/nvme/trace-events | 3 ++ 3 files changed, 87 insertions(+) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 74075f782f..30bbda7bb5 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -7493,10 +7493,84 @@ static int nvme_add_pm_capability(PCIDevice *pci_dev, uint8_t offset) return 0; } +static int nvme_vector_unmask(PCIDevice *pci_dev, unsigned vector, + MSIMessage msg) +{ + NvmeCtrl *n = NVME(pci_dev); + int ret; + + trace_pci_nvme_irq_unmask(vector, msg.address, msg.data); + + for (uint32_t i = 1; i <= n->params.max_ioqpairs; i++) { + NvmeCQueue *cq = n->cq[i]; + /* + * If this function is called, then irqfd must be available. Therefore, + * irqfd must be in use if cq->assert_notifier.initialized is true. + */ + if (cq && cq->vector == vector && cq->assert_notifier.initialized) { + if (cq->msg.data != msg.data || cq->msg.address != msg.address) { + ret = kvm_irqchip_update_msi_route(kvm_state, cq->virq, msg, + pci_dev); + if (ret < 0) { + return ret; + } + kvm_irqchip_commit_routes(kvm_state); + cq->msg = msg; + } + + ret = kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, + &cq->assert_notifier, + NULL, cq->virq); + if (ret < 0) { + return ret; + } + } + } + + return 0; +} + +static void nvme_vector_mask(PCIDevice *pci_dev, unsigned vector) +{ + NvmeCtrl *n = NVME(pci_dev); + + trace_pci_nvme_irq_mask(vector); + + for (uint32_t i = 1; i <= n->params.max_ioqpairs; i++) { + NvmeCQueue *cq = n->cq[i]; + if (cq && cq->vector == vector && cq->assert_notifier.initialized) { + kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, + &cq->assert_notifier, + cq->virq); + } + } +} + +static void nvme_vector_poll(PCIDevice *pci_dev, + unsigned int vector_start, + unsigned int vector_end) +{ + NvmeCtrl *n = NVME(pci_dev); + + trace_pci_nvme_irq_poll(vector_start, vector_end); + + for (uint32_t i = 1; i <= n->params.max_ioqpairs; i++) { + NvmeCQueue *cq = n->cq[i]; + if (cq && cq->vector >= vector_start && cq->vector <= vector_end + && msix_is_masked(pci_dev, cq->vector) + && cq->assert_notifier.initialized) { + if (event_notifier_test_and_clear(&cq->assert_notifier)) { + msix_set_pending(pci_dev, i); + } + } + } +} static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp) { uint8_t *pci_conf = pci_dev->config; + bool with_irqfd = msix_enabled(&n->parent_obj) && + kvm_msi_via_irqfd_enabled(); uint64_t bar_size; unsigned msix_table_offset, msix_pba_offset; int ret; @@ -7549,6 +7623,13 @@ static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp) } } + if (with_irqfd) { + msix_set_vector_notifiers(pci_dev, + nvme_vector_unmask, + nvme_vector_mask, + nvme_vector_poll); + } + nvme_update_msixcap_ts(pci_dev, n->conf_msix_qsize); if (n->params.cmb_size_mb) { @@ -7796,6 +7877,7 @@ static void nvme_exit(PCIDevice *pci_dev) pcie_sriov_pf_exit(pci_dev); } + msix_unset_vector_notifiers(pci_dev); msix_uninit(pci_dev, &n->bar0, &n->bar0); memory_region_del_subregion(&n->bar0, &n->iomem); } diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h index 85fd9cd0e2..707a55ebfc 100644 --- a/hw/nvme/nvme.h +++ b/hw/nvme/nvme.h @@ -20,6 +20,7 @@ #include "qemu/uuid.h" #include "hw/pci/pci.h" +#include "hw/pci/msi.h" #include "hw/block/block.h" #include "block/nvme.h" @@ -401,6 +402,7 @@ typedef struct NvmeCQueue { EventNotifier notifier; EventNotifier assert_notifier; EventNotifier deassert_notifier; + MSIMessage msg; bool first_io_cqe; bool ioeventfd_enabled; QTAILQ_HEAD(, NvmeSQueue) sq_list; diff --git a/hw/nvme/trace-events b/hw/nvme/trace-events index fccb79f489..b11fcf4a65 100644 --- a/hw/nvme/trace-events +++ b/hw/nvme/trace-events @@ -2,6 +2,9 @@ pci_nvme_irq_msix(uint32_t vector) "raising MSI-X IRQ vector %u" pci_nvme_irq_pin(void) "pulsing IRQ pin" pci_nvme_irq_masked(void) "IRQ is masked" +pci_nvme_irq_mask(uint32_t vector) "IRQ %u gets masked" +pci_nvme_irq_unmask(uint32_t vector, uint64_t addr, uint32_t data) "IRQ %u gets unmasked, addr=0x%"PRIx64" data=0x%"PRIu32"" +pci_nvme_irq_poll(uint32_t vector_start, uint32_t vector_end) "IRQ poll, start=0x%"PRIu32" end=0x%"PRIu32"" pci_nvme_dma_read(uint64_t prp1, uint64_t prp2) "DMA read, prp1=0x%"PRIx64" prp2=0x%"PRIx64"" pci_nvme_dbbuf_config(uint64_t dbs_addr, uint64_t eis_addr) "dbs_addr=0x%"PRIx64" eis_addr=0x%"PRIx64"" pci_nvme_map_addr(uint64_t addr, uint64_t len) "addr 0x%"PRIx64" len %"PRIu64"" -- 2.25.1