Re: [PATCH v4] hw/nvme: Use ioeventfd to handle doorbell updates
After digging through the source code, I found event_notifier_cleanup() only closes the eventfd, but does not de-register the event from QEMU’s event loop. event_notifier_set_handler() actually interacts with the event loop. It is a wrapper around aio_set_event_notifier(), which is again a wrapper of aio_set_fd_handler(). Passing in a handler of NULL means removing the fd handler from the event loop. Therefore, originally in nvme_free_sq/cq(), we closed the eventfd but did not delete its handler. I guess maybe sometime later the fd is reused and the corresponding file is somehow written (a POLLIN event); this will trigger the event loop to call the stale handler, which causes this bug. So the correct order is: Init: 1. event_notifier_init: Open the eventfd 2. event_notifier_set_handler: Register on the event loop 3. memory_region_add_eventfd: Associate with IO region Cleanup: 1. memory_region_del_eventfd: De-associate with IO region 2. event_notifier_set_handler(NULL): Remove from the event loop 3. event_notifier_cleanup: Close the eventfd
Re: [PATCH v4] hw/nvme: Use ioeventfd to handle doorbell updates
On Jul 27 16:16, Jinhao Fan wrote: > at 3:06 PM, Klaus Jensen wrote: > > > On Jul 26 14:08, Klaus Jensen wrote: > >> Alright. Forget about the iommu, that was just a coincidence. > >> > >> This patch seems to fix it. I guess it is the > >> event_notifier_set_handler(..., NULL) that does the trick, but I'd like > >> to understand why ;) > >> > >> > >> diff --git i/hw/nvme/ctrl.c w/hw/nvme/ctrl.c > >> index 533ad14e7a61..3bc3c6bfbe78 100644 > >> --- i/hw/nvme/ctrl.c > >> +++ w/hw/nvme/ctrl.c > >> @@ -4238,7 +4238,9 @@ static void nvme_cq_notifier(EventNotifier *e) > >> NvmeCQueue *cq = container_of(e, NvmeCQueue, notifier); > >> NvmeCtrl *n = cq->ctrl; > >> > >> -event_notifier_test_and_clear(&cq->notifier); > >> +if (!event_notifier_test_and_clear(e)) { > >> +return; > >> +} > >> > >> nvme_update_cq_head(cq); > >> > >> @@ -4275,7 +4277,9 @@ static void nvme_sq_notifier(EventNotifier *e) > >> { > >> NvmeSQueue *sq = container_of(e, NvmeSQueue, notifier); > >> > >> -event_notifier_test_and_clear(&sq->notifier); > >> +if (!event_notifier_test_and_clear(e)) { > >> +return; > >> +} > >> > > Yes, virtio also checks the return value of event_notifier_test_and_clear(). 
> > >> nvme_process_sq(sq); > >> } > >> @@ -4307,6 +4311,8 @@ static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n) > >> if (sq->ioeventfd_enabled) { > >> memory_region_del_eventfd(&n->iomem, > >> 0x1000 + offset, 4, false, 0, > >> &sq->notifier); > >> +event_notifier_set_handler(&sq->notifier, NULL); > >> +nvme_sq_notifier(&sq->notifier); > >> event_notifier_cleanup(&sq->notifier); > >> } > >> g_free(sq->io_req); > >> @@ -4697,6 +4703,8 @@ static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n) > >> if (cq->ioeventfd_enabled) { > >> memory_region_del_eventfd(&n->iomem, > >> 0x1000 + offset, 4, false, 0, > >> &cq->notifier); > >> +event_notifier_set_handler(&cq->notifier, NULL); > >> +nvme_cq_notifier(&cq->notifier); > >> event_notifier_cleanup(&cq->notifier); > >> } > >> if (msix_enabled(&n->parent_obj)) { > > > > I’m a bit messed up here. I will further check how virtio handles queue > deletion today. Yeah, the API documentation is not exactly exhaustive on the specifics here, so I kinda inferred this from the virtio code. If this is the way to do it, then I think I will follow up with a patch for the event notifier api with a note on how teardown should be done. Paolo - get_maintainer.pl spits you out for main-loop.h. Any chance you could shed some light on this? signature.asc Description: PGP signature
Re: [PATCH v4] hw/nvme: Use ioeventfd to handle doorbell updates
at 3:06 PM, Klaus Jensen wrote: > On Jul 26 14:08, Klaus Jensen wrote: >> Alright. Forget about the iommu, that was just a coincidence. >> >> This patch seems to fix it. I guess it is the >> event_notifier_set_handler(..., NULL) that does the trick, but I'd like >> to understand why ;) >> >> >> diff --git i/hw/nvme/ctrl.c w/hw/nvme/ctrl.c >> index 533ad14e7a61..3bc3c6bfbe78 100644 >> --- i/hw/nvme/ctrl.c >> +++ w/hw/nvme/ctrl.c >> @@ -4238,7 +4238,9 @@ static void nvme_cq_notifier(EventNotifier *e) >> NvmeCQueue *cq = container_of(e, NvmeCQueue, notifier); >> NvmeCtrl *n = cq->ctrl; >> >> -event_notifier_test_and_clear(&cq->notifier); >> +if (!event_notifier_test_and_clear(e)) { >> +return; >> +} >> >> nvme_update_cq_head(cq); >> >> @@ -4275,7 +4277,9 @@ static void nvme_sq_notifier(EventNotifier *e) >> { >> NvmeSQueue *sq = container_of(e, NvmeSQueue, notifier); >> >> -event_notifier_test_and_clear(&sq->notifier); >> +if (!event_notifier_test_and_clear(e)) { >> +return; >> +} >> Yes, virtio also checks the return value of event_notifier_test_and_clear(). >> nvme_process_sq(sq); >> } >> @@ -4307,6 +4311,8 @@ static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n) >> if (sq->ioeventfd_enabled) { >> memory_region_del_eventfd(&n->iomem, >> 0x1000 + offset, 4, false, 0, >> &sq->notifier); >> +event_notifier_set_handler(&sq->notifier, NULL); >> +nvme_sq_notifier(&sq->notifier); >> event_notifier_cleanup(&sq->notifier); >> } >> g_free(sq->io_req); >> @@ -4697,6 +4703,8 @@ static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n) >> if (cq->ioeventfd_enabled) { >> memory_region_del_eventfd(&n->iomem, >> 0x1000 + offset, 4, false, 0, >> &cq->notifier); >> +event_notifier_set_handler(&cq->notifier, NULL); >> +nvme_cq_notifier(&cq->notifier); >> event_notifier_cleanup(&cq->notifier); >> } >> if (msix_enabled(&n->parent_obj)) { > I’m a bit messed up here. I will further check how virtio handles queue deletion today.
Re: [PATCH v4] hw/nvme: Use ioeventfd to handle doorbell updates
On Jul 26 14:08, Klaus Jensen wrote: > > Alright. Forget about the iommu, that was just a coincidence. > > This patch seems to fix it. I guess it is the > event_notifier_set_handler(..., NULL) that does the trick, but I'd like > to understand why ;) > > > diff --git i/hw/nvme/ctrl.c w/hw/nvme/ctrl.c > index 533ad14e7a61..3bc3c6bfbe78 100644 > --- i/hw/nvme/ctrl.c > +++ w/hw/nvme/ctrl.c > @@ -4238,7 +4238,9 @@ static void nvme_cq_notifier(EventNotifier *e) > NvmeCQueue *cq = container_of(e, NvmeCQueue, notifier); > NvmeCtrl *n = cq->ctrl; > > -event_notifier_test_and_clear(&cq->notifier); > +if (!event_notifier_test_and_clear(e)) { > +return; > +} > > nvme_update_cq_head(cq); > > @@ -4275,7 +4277,9 @@ static void nvme_sq_notifier(EventNotifier *e) > { > NvmeSQueue *sq = container_of(e, NvmeSQueue, notifier); > > -event_notifier_test_and_clear(&sq->notifier); > +if (!event_notifier_test_and_clear(e)) { > +return; > +} > > nvme_process_sq(sq); > } > @@ -4307,6 +4311,8 @@ static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n) > if (sq->ioeventfd_enabled) { > memory_region_del_eventfd(&n->iomem, >0x1000 + offset, 4, false, 0, > &sq->notifier); > +event_notifier_set_handler(&sq->notifier, NULL); > +nvme_sq_notifier(&sq->notifier); > event_notifier_cleanup(&sq->notifier); > } > g_free(sq->io_req); > @@ -4697,6 +4703,8 @@ static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n) > if (cq->ioeventfd_enabled) { > memory_region_del_eventfd(&n->iomem, >0x1000 + offset, 4, false, 0, > &cq->notifier); > +event_notifier_set_handler(&cq->notifier, NULL); > +nvme_cq_notifier(&cq->notifier); > event_notifier_cleanup(&cq->notifier); > } > if (msix_enabled(&n->parent_obj)) { Jinhao, Do you have any comments on the above patch - does it make sense to you, considering the effort you've done into researching how virtio does this? signature.asc Description: PGP signature
Re: [PATCH v4] hw/nvme: Use ioeventfd to handle doorbell updates
On Tue, 5 Jul 2022 at 10:25, Jinhao Fan wrote: > > Add property "ioeventfd" which is enabled by default. When this is > enabled, updates on the doorbell registers will cause KVM to signal > an event to the QEMU main loop to handle the doorbell updates. > Therefore, instead of letting the vcpu thread run both guest VM and > IO emulation, we now use the main loop thread to do IO emulation and > thus the vcpu thread has more cycles for the guest VM. > > Since ioeventfd does not tell us the exact value that is written, it is > only useful when shadow doorbell buffer is enabled, where we check > for the value in the shadow doorbell buffer when we get the doorbell > update event. > > IOPS comparison on Linux 5.19-rc2: (Unit: KIOPS) > > qd 1 4 16 64 > qemu35 121 176 153 > ioeventfd 41 133 258 313 Nice > > Changes since v3: > - Do not deregister ioeventfd when it was not enabled on a SQ/CQ > > Signed-off-by: Jinhao Fan > --- > hw/nvme/ctrl.c | 114 - > hw/nvme/nvme.h | 5 +++ > 2 files changed, 118 insertions(+), 1 deletion(-) > > diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c > index c952c34f94..4b75c5f549 100644 > --- a/hw/nvme/ctrl.c > +++ b/hw/nvme/ctrl.c > @@ -1374,7 +1374,14 @@ static void nvme_enqueue_req_completion(NvmeCQueue > *cq, NvmeRequest *req) > > QTAILQ_REMOVE(&req->sq->out_req_list, req, entry); > QTAILQ_INSERT_TAIL(&cq->req_list, req, entry); > -timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500); > + > +if (req->sq->ioeventfd_enabled) { > +/* Post CQE directly since we are in main loop thread */ > +nvme_post_cqes(cq); > +} else { > +/* Schedule the timer to post CQE later since we are in vcpu thread > */ > +timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500); > +} This change is somewhat independent of ioeventfd. I wonder how much of the performance improvement is due to the sq ioeventfd and how much is due to bypassing the completion timer. In the qd=1 case I expect the completion timer to increase latency and hurt performance. 
In the qd=64 case the timer increases batching, reduces CPU consumption, and probably improves performance. It would be nice to measure this in isolation, independently of ioeventfd/IOThreads. > } > > static void nvme_process_aers(void *opaque) > @@ -4195,10 +4202,82 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest > *req) > return NVME_INVALID_OPCODE | NVME_DNR; > } > > +static void nvme_cq_notifier(EventNotifier *e) > +{ > +NvmeCQueue *cq = container_of(e, NvmeCQueue, notifier); > +NvmeCtrl *n = cq->ctrl; > + > +event_notifier_test_and_clear(&cq->notifier); > + > +nvme_update_cq_head(cq); > + > +if (cq->tail == cq->head) { > +if (cq->irq_enabled) { > +n->cq_pending--; > +} > + > +nvme_irq_deassert(n, cq); > +} > + > +nvme_post_cqes(cq); > +} > + > +static int nvme_init_cq_ioeventfd(NvmeCQueue *cq) > +{ > +NvmeCtrl *n = cq->ctrl; > +uint16_t offset = (cq->cqid << 3) + (1 << 2); Too many... > +int ret; > + > +ret = event_notifier_init(&cq->notifier, 0); > +if (ret < 0) { > +return ret; > +} > + > +event_notifier_set_handler(&cq->notifier, nvme_cq_notifier); > +memory_region_add_eventfd(&n->iomem, > + 0x1000 + offset, 4, false, 0, &cq->notifier); ...magic numbers. Please use constants so it's clear what << 3, (1 << 2), 0x1000, etc mean. 
> + > +return 0; > +} > + > +static void nvme_sq_notifier(EventNotifier *e) > +{ > +NvmeSQueue *sq = container_of(e, NvmeSQueue, notifier); > + > +event_notifier_test_and_clear(&sq->notifier); > + > +nvme_process_sq(sq); > +} > + > +static int nvme_init_sq_ioeventfd(NvmeSQueue *sq) > +{ > +NvmeCtrl *n = sq->ctrl; > +uint16_t offset = sq->sqid << 3; > +int ret; > + > +ret = event_notifier_init(&sq->notifier, 0); > +if (ret < 0) { > +return ret; > +} > + > +event_notifier_set_handler(&sq->notifier, nvme_sq_notifier); > +memory_region_add_eventfd(&n->iomem, > + 0x1000 + offset, 4, false, 0, &sq->notifier); > + > +return 0; > +} > + > static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n) > { > +uint16_t offset = sq->sqid << 3; > + > n->sq[sq->sqid] = NULL; > timer_free(sq->timer); > +if (sq->ioeventfd_enabled) { > +memory_region_del_eventfd(&n->iomem, > + 0x1000 + offset, 4, false, 0, > &sq->notifier); > +event_notifier_cleanup(&sq->notifier); > +} > g_free(sq->io_req); > if (sq->sqid) { > g_free(sq); > @@ -4271,6 +4350,12 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, > uint64_t dma_addr, > if (n->dbbuf_enabled) { > sq->db_addr = n->dbbuf_dbs + (sqid << 3); > sq->ei_addr = n->dbbuf_eis + (sqid << 3); > + > +
Re: [PATCH v4] hw/nvme: Use ioeventfd to handle doorbell updates
On Jul 26 13:32, Klaus Jensen wrote: > On Jul 26 13:24, Klaus Jensen wrote: > > On Jul 26 12:09, Klaus Jensen wrote: > > > On Jul 26 11:19, Klaus Jensen wrote: > > > > On Jul 26 15:55, Jinhao Fan wrote: > > > > > at 3:41 PM, Klaus Jensen wrote: > > > > > > > > > > > On Jul 26 15:35, Jinhao Fan wrote: > > > > > >> at 4:55 AM, Klaus Jensen wrote: > > > > > >> > > > > > >>> We have a regression following this patch that we need to address. > > > > > >>> > > > > > >>> With this patch, issuing a reset on the device (`nvme reset > > > > > >>> /dev/nvme0` > > > > > >>> will do the trick) causes QEMU to hog my host cpu at 100%. > > > > > >>> > > > > > >>> I'm still not sure what causes this. The trace output is a bit > > > > > >>> inconclusive still. > > > > > >>> > > > > > >>> I'll keep looking into it. > > > > > >> > > > > > >> I cannot reproduce this bug. I just start the VM and used `nvme > > > > > >> reset > > > > > >> /dev/nvme0`. Did you do anything before the reset? > > > > > > > > > > > > Interesting and thanks for checking! Looks like a kernel issue then! > > > > > > > > > > > > I remember that I'm using a dev branch (nvme-v5.20) of the kernel > > > > > > and > > > > > > reverting to a stock OS kernel did not produce the bug. > > > > > > > > > > I’m using 5.19-rc4 which I pulled from linux-next on Jul 1. It works > > > > > ok on > > > > > my machine. > > > > > > > > Interesting. I can reproduce on 5.19-rc4 from torvalds tree. Can you > > > > drop your qemu command line here? > > > > > > > > This is mine. 
> > > > > > > > /home/kbj/work/src/qemu/build/x86_64-softmmu/qemu-system-x86_64 \ > > > > -nodefaults \ > > > > -display "none" \ > > > > -machine "q35,accel=kvm,kernel-irqchip=split" \ > > > > -cpu "host" \ > > > > -smp "4" \ > > > > -m "8G" \ > > > > -device "intel-iommu" \ > > > > -netdev "user,id=net0,hostfwd=tcp::-:22" \ > > > > -device "virtio-net-pci,netdev=net0" \ > > > > -device "virtio-rng-pci" \ > > > > -drive > > > > "id=boot,file=/home/kbj/work/vol/machines/img/nvme.qcow2,format=qcow2,if=virtio,discard=unmap,media=disk,read-only=no" > > > > \ > > > > -device "pcie-root-port,id=pcie_root_port1,chassis=1,slot=0" \ > > > > -device "nvme,id=nvme0,serial=deadbeef,bus=pcie_root_port1,mdts=7" \ > > > > -drive > > > > "id=null,if=none,file=null-co://,file.read-zeroes=on,format=raw" \ > > > > -device > > > > "nvme-ns,id=nvm-1,drive=nvm-1,bus=nvme0,nsid=1,drive=null,logical_block_size=4096,physical_block_size=4096" > > > > \ > > > > -pidfile "/home/kbj/work/vol/machines/run/null/pidfile" \ > > > > -kernel "/home/kbj/work/src/kernel/linux/arch/x86_64/boot/bzImage" \ > > > > -append "root=/dev/vda1 console=ttyS0,115200 audit=0 intel_iommu=on" \ > > > > -virtfs > > > > "local,path=/home/kbj/work/src/kernel/linux,security_model=none,readonly=on,mount_tag=kernel_dir" > > > > \ > > > > -serial "mon:stdio" \ > > > > -d "guest_errors" \ > > > > -D "/home/kbj/work/vol/machines/log/null/qemu.log" \ > > > > -trace "pci_nvme*" > > > > > > Alright. It was *some* config issue with my kernel. Reverted to a > > > defconfig + requirements and the issue went away. > > > > > > > And it went away because I didn't include iommu support in that kernel (and > > its > > not enabled by default on the stock OS kernel). > > > > > I'll try to track down what happended, but doesnt look like qemu is at > > > fault here. > > > > OK. So. > > > > I can continue to reproduce this if the machine has a virtual intel iommu > > enabled. And it only happens when this commit is applied. 
> > > > I even backported this patch (and the shadow doorbell patch) to v7.0 and > > v6.2 > > (i.e. no SRIOV or CC logic changes that could be buggy) and it still > > exhibits > > this behavior. Sometimes QEMU coredumps on poweroff and I managed to grab > > one: > > > > Program terminated with signal SIGSEGV, Segmentation fault. > > #0 nvme_process_sq (opaque=0x556329708110) at ../hw/nvme/ctrl.c:5720 > > 5720 NvmeCQueue *cq = n->cq[sq->cqid]; > > [Current thread is 1 (Thread 0x7f7363553cc0 (LWP 2554896))] > > (gdb) bt > > #0 nvme_process_sq (opaque=0x556329708110) at ../hw/nvme/ctrl.c:5720 > > #1 0x556326e82e28 in nvme_sq_notifier (e=0x556329708148) at > > ../hw/nvme/ctrl.c:3993 > > #2 0x55632738396a in aio_dispatch_handler (ctx=0x5563291c3160, > > node=0x55632a228b60) at ../util/aio-posix.c:329 > > #3 0x556327383b22 in aio_dispatch_handlers (ctx=0x5563291c3160) at > > ../util/aio-posix.c:372 > > #4 0x556327383b78 in aio_dispatch (ctx=0x5563291c3160) at > > ../util/aio-posix.c:382 > > #5 0x55632739d748 in aio_ctx_dispatch (source=0x5563291c3160, > > callback=0x0, user_data=0x0) at ../util/async.c:311 > > #6 0x7f7369398163 in g_main_context_dispatch () at > > /usr/lib64/libglib-2.0.so.0 > > #7 0x5563273af279 in glib_pollfds_poll () at ../util/main-loop.c:232 > > #8 0x5563273af2f6 in os_host_main_loop_wait (timeout=0x1dbe22c0
Re: [PATCH v4] hw/nvme: Use ioeventfd to handle doorbell updates
On Jul 26 13:24, Klaus Jensen wrote: > On Jul 26 12:09, Klaus Jensen wrote: > > On Jul 26 11:19, Klaus Jensen wrote: > > > On Jul 26 15:55, Jinhao Fan wrote: > > > > at 3:41 PM, Klaus Jensen wrote: > > > > > > > > > On Jul 26 15:35, Jinhao Fan wrote: > > > > >> at 4:55 AM, Klaus Jensen wrote: > > > > >> > > > > >>> We have a regression following this patch that we need to address. > > > > >>> > > > > >>> With this patch, issuing a reset on the device (`nvme reset > > > > >>> /dev/nvme0` > > > > >>> will do the trick) causes QEMU to hog my host cpu at 100%. > > > > >>> > > > > >>> I'm still not sure what causes this. The trace output is a bit > > > > >>> inconclusive still. > > > > >>> > > > > >>> I'll keep looking into it. > > > > >> > > > > >> I cannot reproduce this bug. I just start the VM and used `nvme reset > > > > >> /dev/nvme0`. Did you do anything before the reset? > > > > > > > > > > Interesting and thanks for checking! Looks like a kernel issue then! > > > > > > > > > > I remember that I'm using a dev branch (nvme-v5.20) of the kernel and > > > > > reverting to a stock OS kernel did not produce the bug. > > > > > > > > I’m using 5.19-rc4 which I pulled from linux-next on Jul 1. It works ok > > > > on > > > > my machine. > > > > > > Interesting. I can reproduce on 5.19-rc4 from torvalds tree. Can you > > > drop your qemu command line here? > > > > > > This is mine. 
> > > > > > /home/kbj/work/src/qemu/build/x86_64-softmmu/qemu-system-x86_64 \ > > > -nodefaults \ > > > -display "none" \ > > > -machine "q35,accel=kvm,kernel-irqchip=split" \ > > > -cpu "host" \ > > > -smp "4" \ > > > -m "8G" \ > > > -device "intel-iommu" \ > > > -netdev "user,id=net0,hostfwd=tcp::-:22" \ > > > -device "virtio-net-pci,netdev=net0" \ > > > -device "virtio-rng-pci" \ > > > -drive > > > "id=boot,file=/home/kbj/work/vol/machines/img/nvme.qcow2,format=qcow2,if=virtio,discard=unmap,media=disk,read-only=no" > > > \ > > > -device "pcie-root-port,id=pcie_root_port1,chassis=1,slot=0" \ > > > -device "nvme,id=nvme0,serial=deadbeef,bus=pcie_root_port1,mdts=7" \ > > > -drive "id=null,if=none,file=null-co://,file.read-zeroes=on,format=raw" > > > \ > > > -device > > > "nvme-ns,id=nvm-1,drive=nvm-1,bus=nvme0,nsid=1,drive=null,logical_block_size=4096,physical_block_size=4096" > > > \ > > > -pidfile "/home/kbj/work/vol/machines/run/null/pidfile" \ > > > -kernel "/home/kbj/work/src/kernel/linux/arch/x86_64/boot/bzImage" \ > > > -append "root=/dev/vda1 console=ttyS0,115200 audit=0 intel_iommu=on" \ > > > -virtfs > > > "local,path=/home/kbj/work/src/kernel/linux,security_model=none,readonly=on,mount_tag=kernel_dir" > > > \ > > > -serial "mon:stdio" \ > > > -d "guest_errors" \ > > > -D "/home/kbj/work/vol/machines/log/null/qemu.log" \ > > > -trace "pci_nvme*" > > > > Alright. It was *some* config issue with my kernel. Reverted to a > > defconfig + requirements and the issue went away. > > > > And it went away because I didn't include iommu support in that kernel (and > its > not enabled by default on the stock OS kernel). > > > I'll try to track down what happended, but doesnt look like qemu is at > > fault here. > > OK. So. > > I can continue to reproduce this if the machine has a virtual intel iommu > enabled. And it only happens when this commit is applied. > > I even backported this patch (and the shadow doorbell patch) to v7.0 and v6.2 > (i.e. 
no SRIOV or CC logic changes that could be buggy) and it still exhibits > this behavior. Sometimes QEMU coredumps on poweroff and I managed to grab one: > > Program terminated with signal SIGSEGV, Segmentation fault. > #0 nvme_process_sq (opaque=0x556329708110) at ../hw/nvme/ctrl.c:5720 > 5720 NvmeCQueue *cq = n->cq[sq->cqid]; > [Current thread is 1 (Thread 0x7f7363553cc0 (LWP 2554896))] > (gdb) bt > #0 nvme_process_sq (opaque=0x556329708110) at ../hw/nvme/ctrl.c:5720 > #1 0x556326e82e28 in nvme_sq_notifier (e=0x556329708148) at > ../hw/nvme/ctrl.c:3993 > #2 0x55632738396a in aio_dispatch_handler (ctx=0x5563291c3160, > node=0x55632a228b60) at ../util/aio-posix.c:329 > #3 0x556327383b22 in aio_dispatch_handlers (ctx=0x5563291c3160) at > ../util/aio-posix.c:372 > #4 0x556327383b78 in aio_dispatch (ctx=0x5563291c3160) at > ../util/aio-posix.c:382 > #5 0x55632739d748 in aio_ctx_dispatch (source=0x5563291c3160, > callback=0x0, user_data=0x0) at ../util/async.c:311 > #6 0x7f7369398163 in g_main_context_dispatch () at > /usr/lib64/libglib-2.0.so.0 > #7 0x5563273af279 in glib_pollfds_poll () at ../util/main-loop.c:232 > #8 0x5563273af2f6 in os_host_main_loop_wait (timeout=0x1dbe22c0) at > ../util/main-loop.c:255 > #9 0x5563273af404 in main_loop_wait (nonblocking=0x0) at > ../util/main-loop.c:531 > #10 0x5563270714d9 in qemu_main_loop () at ../softmmu/runstate.c:726 > #11 0x556326c7ea46 in main (argc=0x2e, argv=0x7ffc6977f198, > envp=0x7ffc6977f310) at ../sof
Re: [PATCH v4] hw/nvme: Use ioeventfd to handle doorbell updates
On Jul 26 12:09, Klaus Jensen wrote: > On Jul 26 11:19, Klaus Jensen wrote: > > On Jul 26 15:55, Jinhao Fan wrote: > > > at 3:41 PM, Klaus Jensen wrote: > > > > > > > On Jul 26 15:35, Jinhao Fan wrote: > > > >> at 4:55 AM, Klaus Jensen wrote: > > > >> > > > >>> We have a regression following this patch that we need to address. > > > >>> > > > >>> With this patch, issuing a reset on the device (`nvme reset > > > >>> /dev/nvme0` > > > >>> will do the trick) causes QEMU to hog my host cpu at 100%. > > > >>> > > > >>> I'm still not sure what causes this. The trace output is a bit > > > >>> inconclusive still. > > > >>> > > > >>> I'll keep looking into it. > > > >> > > > >> I cannot reproduce this bug. I just start the VM and used `nvme reset > > > >> /dev/nvme0`. Did you do anything before the reset? > > > > > > > > Interesting and thanks for checking! Looks like a kernel issue then! > > > > > > > > I remember that I'm using a dev branch (nvme-v5.20) of the kernel and > > > > reverting to a stock OS kernel did not produce the bug. > > > > > > I’m using 5.19-rc4 which I pulled from linux-next on Jul 1. It works ok on > > > my machine. > > > > Interesting. I can reproduce on 5.19-rc4 from torvalds tree. Can you > > drop your qemu command line here? > > > > This is mine. 
> > > > /home/kbj/work/src/qemu/build/x86_64-softmmu/qemu-system-x86_64 \ > > -nodefaults \ > > -display "none" \ > > -machine "q35,accel=kvm,kernel-irqchip=split" \ > > -cpu "host" \ > > -smp "4" \ > > -m "8G" \ > > -device "intel-iommu" \ > > -netdev "user,id=net0,hostfwd=tcp::-:22" \ > > -device "virtio-net-pci,netdev=net0" \ > > -device "virtio-rng-pci" \ > > -drive > > "id=boot,file=/home/kbj/work/vol/machines/img/nvme.qcow2,format=qcow2,if=virtio,discard=unmap,media=disk,read-only=no" > > \ > > -device "pcie-root-port,id=pcie_root_port1,chassis=1,slot=0" \ > > -device "nvme,id=nvme0,serial=deadbeef,bus=pcie_root_port1,mdts=7" \ > > -drive "id=null,if=none,file=null-co://,file.read-zeroes=on,format=raw" \ > > -device > > "nvme-ns,id=nvm-1,drive=nvm-1,bus=nvme0,nsid=1,drive=null,logical_block_size=4096,physical_block_size=4096" > > \ > > -pidfile "/home/kbj/work/vol/machines/run/null/pidfile" \ > > -kernel "/home/kbj/work/src/kernel/linux/arch/x86_64/boot/bzImage" \ > > -append "root=/dev/vda1 console=ttyS0,115200 audit=0 intel_iommu=on" \ > > -virtfs > > "local,path=/home/kbj/work/src/kernel/linux,security_model=none,readonly=on,mount_tag=kernel_dir" > > \ > > -serial "mon:stdio" \ > > -d "guest_errors" \ > > -D "/home/kbj/work/vol/machines/log/null/qemu.log" \ > > -trace "pci_nvme*" > > Alright. It was *some* config issue with my kernel. Reverted to a > defconfig + requirements and the issue went away. > And it went away because I didn't include iommu support in that kernel (and its not enabled by default on the stock OS kernel). > I'll try to track down what happended, but doesnt look like qemu is at > fault here. OK. So. I can continue to reproduce this if the machine has a virtual intel iommu enabled. And it only happens when this commit is applied. I even backported this patch (and the shadow doorbell patch) to v7.0 and v6.2 (i.e. no SRIOV or CC logic changes that could be buggy) and it still exhibits this behavior. 
Sometimes QEMU coredumps on poweroff and I managed to grab one: Program terminated with signal SIGSEGV, Segmentation fault. #0 nvme_process_sq (opaque=0x556329708110) at ../hw/nvme/ctrl.c:5720 5720 NvmeCQueue *cq = n->cq[sq->cqid]; [Current thread is 1 (Thread 0x7f7363553cc0 (LWP 2554896))] (gdb) bt #0 nvme_process_sq (opaque=0x556329708110) at ../hw/nvme/ctrl.c:5720 #1 0x556326e82e28 in nvme_sq_notifier (e=0x556329708148) at ../hw/nvme/ctrl.c:3993 #2 0x55632738396a in aio_dispatch_handler (ctx=0x5563291c3160, node=0x55632a228b60) at ../util/aio-posix.c:329 #3 0x556327383b22 in aio_dispatch_handlers (ctx=0x5563291c3160) at ../util/aio-posix.c:372 #4 0x556327383b78 in aio_dispatch (ctx=0x5563291c3160) at ../util/aio-posix.c:382 #5 0x55632739d748 in aio_ctx_dispatch (source=0x5563291c3160, callback=0x0, user_data=0x0) at ../util/async.c:311 #6 0x7f7369398163 in g_main_context_dispatch () at /usr/lib64/libglib-2.0.so.0 #7 0x5563273af279 in glib_pollfds_poll () at ../util/main-loop.c:232 #8 0x5563273af2f6 in os_host_main_loop_wait (timeout=0x1dbe22c0) at ../util/main-loop.c:255 #9 0x5563273af404 in main_loop_wait (nonblocking=0x0) at ../util/main-loop.c:531 #10 0x5563270714d9 in qemu_main_loop () at ../softmmu/runstate.c:726 #11 0x556326c7ea46 in main (argc=0x2e, argv=0x7ffc6977f198, envp=0x7ffc6977f310) at ../softmmu/main.c:50 At this point, there should not be any CQ/SQs (I detached the device from the kernel driver which deletes all queues and bound it to vfio-pci instead), but somehow a stale notifier is called on poweroff and the queue is bogus, causing the segfault. (gdb) p cq->
Re: [PATCH v4] hw/nvme: Use ioeventfd to handle doorbell updates
On Jul 26 11:19, Klaus Jensen wrote: > On Jul 26 15:55, Jinhao Fan wrote: > > at 3:41 PM, Klaus Jensen wrote: > > > > > On Jul 26 15:35, Jinhao Fan wrote: > > >> at 4:55 AM, Klaus Jensen wrote: > > >> > > >>> We have a regression following this patch that we need to address. > > >>> > > >>> With this patch, issuing a reset on the device (`nvme reset /dev/nvme0` > > >>> will do the trick) causes QEMU to hog my host cpu at 100%. > > >>> > > >>> I'm still not sure what causes this. The trace output is a bit > > >>> inconclusive still. > > >>> > > >>> I'll keep looking into it. > > >> > > >> I cannot reproduce this bug. I just start the VM and used `nvme reset > > >> /dev/nvme0`. Did you do anything before the reset? > > > > > > Interesting and thanks for checking! Looks like a kernel issue then! > > > > > > I remember that I'm using a dev branch (nvme-v5.20) of the kernel and > > > reverting to a stock OS kernel did not produce the bug. > > > > I’m using 5.19-rc4 which I pulled from linux-next on Jul 1. It works ok on > > my machine. > > Interesting. I can reproduce on 5.19-rc4 from torvalds tree. Can you > drop your qemu command line here? > > This is mine. 
> > /home/kbj/work/src/qemu/build/x86_64-softmmu/qemu-system-x86_64 \ > -nodefaults \ > -display "none" \ > -machine "q35,accel=kvm,kernel-irqchip=split" \ > -cpu "host" \ > -smp "4" \ > -m "8G" \ > -device "intel-iommu" \ > -netdev "user,id=net0,hostfwd=tcp::-:22" \ > -device "virtio-net-pci,netdev=net0" \ > -device "virtio-rng-pci" \ > -drive > "id=boot,file=/home/kbj/work/vol/machines/img/nvme.qcow2,format=qcow2,if=virtio,discard=unmap,media=disk,read-only=no" > \ > -device "pcie-root-port,id=pcie_root_port1,chassis=1,slot=0" \ > -device "nvme,id=nvme0,serial=deadbeef,bus=pcie_root_port1,mdts=7" \ > -drive "id=null,if=none,file=null-co://,file.read-zeroes=on,format=raw" \ > -device > "nvme-ns,id=nvm-1,drive=nvm-1,bus=nvme0,nsid=1,drive=null,logical_block_size=4096,physical_block_size=4096" > \ > -pidfile "/home/kbj/work/vol/machines/run/null/pidfile" \ > -kernel "/home/kbj/work/src/kernel/linux/arch/x86_64/boot/bzImage" \ > -append "root=/dev/vda1 console=ttyS0,115200 audit=0 intel_iommu=on" \ > -virtfs > "local,path=/home/kbj/work/src/kernel/linux,security_model=none,readonly=on,mount_tag=kernel_dir" > \ > -serial "mon:stdio" \ > -d "guest_errors" \ > -D "/home/kbj/work/vol/machines/log/null/qemu.log" \ > -trace "pci_nvme*" Alright. It was *some* config issue with my kernel. Reverted to a defconfig + requirements and the issue went away. I'll try to track down what happened, but it doesn't look like qemu is at fault here. signature.asc Description: PGP signature
Re: [PATCH v4] hw/nvme: Use ioeventfd to handle doorbell updates
On Jul 26 15:55, Jinhao Fan wrote: > at 3:41 PM, Klaus Jensen wrote: > > > On Jul 26 15:35, Jinhao Fan wrote: > >> at 4:55 AM, Klaus Jensen wrote: > >> > >>> We have a regression following this patch that we need to address. > >>> > >>> With this patch, issuing a reset on the device (`nvme reset /dev/nvme0` > >>> will do the trick) causes QEMU to hog my host cpu at 100%. > >>> > >>> I'm still not sure what causes this. The trace output is a bit > >>> inconclusive still. > >>> > >>> I'll keep looking into it. > >> > >> I cannot reproduce this bug. I just start the VM and used `nvme reset > >> /dev/nvme0`. Did you do anything before the reset? > > > > Interesting and thanks for checking! Looks like a kernel issue then! > > > > I remember that I'm using a dev branch (nvme-v5.20) of the kernel and > > reverting to a stock OS kernel did not produce the bug. > > I’m using 5.19-rc4 which I pulled from linux-next on Jul 1. It works ok on > my machine. Interesting. I can reproduce on 5.19-rc4 from torvalds tree. Can you drop your qemu command line here? This is mine. 
/home/kbj/work/src/qemu/build/x86_64-softmmu/qemu-system-x86_64 \ -nodefaults \ -display "none" \ -machine "q35,accel=kvm,kernel-irqchip=split" \ -cpu "host" \ -smp "4" \ -m "8G" \ -device "intel-iommu" \ -netdev "user,id=net0,hostfwd=tcp::-:22" \ -device "virtio-net-pci,netdev=net0" \ -device "virtio-rng-pci" \ -drive "id=boot,file=/home/kbj/work/vol/machines/img/nvme.qcow2,format=qcow2,if=virtio,discard=unmap,media=disk,read-only=no" \ -device "pcie-root-port,id=pcie_root_port1,chassis=1,slot=0" \ -device "nvme,id=nvme0,serial=deadbeef,bus=pcie_root_port1,mdts=7" \ -drive "id=null,if=none,file=null-co://,file.read-zeroes=on,format=raw" \ -device "nvme-ns,id=nvm-1,drive=nvm-1,bus=nvme0,nsid=1,drive=null,logical_block_size=4096,physical_block_size=4096" \ -pidfile "/home/kbj/work/vol/machines/run/null/pidfile" \ -kernel "/home/kbj/work/src/kernel/linux/arch/x86_64/boot/bzImage" \ -append "root=/dev/vda1 console=ttyS0,115200 audit=0 intel_iommu=on" \ -virtfs "local,path=/home/kbj/work/src/kernel/linux,security_model=none,readonly=on,mount_tag=kernel_dir" \ -serial "mon:stdio" \ -d "guest_errors" \ -D "/home/kbj/work/vol/machines/log/null/qemu.log" \ -trace "pci_nvme*" signature.asc Description: PGP signature
Re: [PATCH v4] hw/nvme: Use ioeventfd to handle doorbell updates
at 3:41 PM, Klaus Jensen wrote: > On Jul 26 15:35, Jinhao Fan wrote: >> at 4:55 AM, Klaus Jensen wrote: >> >>> We have a regression following this patch that we need to address. >>> >>> With this patch, issuing a reset on the device (`nvme reset /dev/nvme0` >>> will do the trick) causes QEMU to hog my host cpu at 100%. >>> >>> I'm still not sure what causes this. The trace output is a bit >>> inconclusive still. >>> >>> I'll keep looking into it. >> >> I cannot reproduce this bug. I just start the VM and used `nvme reset >> /dev/nvme0`. Did you do anything before the reset? > > Interesting and thanks for checking! Looks like a kernel issue then! > > I remember that I'm using a dev branch (nvme-v5.20) of the kernel and > reverting to a stock OS kernel did not produce the bug. I’m using 5.19-rc4 which I pulled from linux-next on Jul 1. It works ok on my machine.
Re: [PATCH v4] hw/nvme: Use ioeventfd to handle doorbell updates
On Jul 26 15:35, Jinhao Fan wrote: > at 4:55 AM, Klaus Jensen wrote: > > > > > We have a regression following this patch that we need to address. > > > > With this patch, issuing a reset on the device (`nvme reset /dev/nvme0` > > will do the trick) causes QEMU to hog my host cpu at 100%. > > > > I'm still not sure what causes this. The trace output is a bit > > inconclusive still. > > > > I'll keep looking into it. > > I cannot reproduce this bug. I just start the VM and used `nvme reset > /dev/nvme0`. Did you do anything before the reset? > Interesting and thanks for checking! Looks like a kernel issue then! I remember that I'm using a dev branch (nvme-v5.20) of the kernel and reverting to a stock OS kernel did not produce the bug. signature.asc Description: PGP signature
Re: [PATCH v4] hw/nvme: Use ioeventfd to handle doorbell updates
at 4:55 AM, Klaus Jensen wrote: > > We have a regression following this patch that we need to address. > > With this patch, issuing a reset on the device (`nvme reset /dev/nvme0` > will do the trick) causes QEMU to hog my host cpu at 100%. > > I'm still not sure what causes this. The trace output is a bit > inconclusive still. > > I'll keep looking into it. I cannot reproduce this bug. I just started the VM and used `nvme reset /dev/nvme0`. Did you do anything before the reset?
Re: [PATCH v4] hw/nvme: Use ioeventfd to handle doorbell updates
On Jul 5 22:24, Jinhao Fan wrote: > Add property "ioeventfd" which is enabled by default. When this is > enabled, updates on the doorbell registers will cause KVM to signal > an event to the QEMU main loop to handle the doorbell updates. > Therefore, instead of letting the vcpu thread run both guest VM and > IO emulation, we now use the main loop thread to do IO emulation and > thus the vcpu thread has more cycles for the guest VM. > > Since ioeventfd does not tell us the exact value that is written, it is > only useful when shadow doorbell buffer is enabled, where we check > for the value in the shadow doorbell buffer when we get the doorbell > update event. > > IOPS comparison on Linux 5.19-rc2: (Unit: KIOPS) > > qd 1 4 16 64 > qemu35 121 176 153 > ioeventfd 41 133 258 313 > > Changes since v3: > - Do not deregister ioeventfd when it was not enabled on a SQ/CQ > > Signed-off-by: Jinhao Fan > --- > hw/nvme/ctrl.c | 114 - > hw/nvme/nvme.h | 5 +++ > 2 files changed, 118 insertions(+), 1 deletion(-) > We have a regression following this patch that we need to address. With this patch, issuing a reset on the device (`nvme reset /dev/nvme0` will do the trick) causes QEMU to hog my host cpu at 100%. I'm still not sure what causes this. The trace output is a bit inconclusive still. I'll keep looking into it. signature.asc Description: PGP signature
Re: [PATCH v4] hw/nvme: Use ioeventfd to handle doorbell updates
On Jul 12 14:23, Klaus Jensen wrote: > On Jul 9 11:06, Jinhao Fan wrote: > > at 10:24 PM, Jinhao Fan wrote: > > > > > @@ -5793,6 +5891,7 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, > > > const NvmeRequest *req) > > > uint64_t dbs_addr = le64_to_cpu(req->cmd.dptr.prp1); > > > uint64_t eis_addr = le64_to_cpu(req->cmd.dptr.prp2); > > > int i; > > > +int ret; > > > > > > > I just noticed this ret is unused. Could you help remove this line when > > applying the patch? > > Yes, I noticed it and hot-fixed it ;) Jinhao, Applied to nvme-next! signature.asc Description: PGP signature
Re: [PATCH v4] hw/nvme: Use ioeventfd to handle doorbell updates
On Jul 9 11:06, Jinhao Fan wrote: > at 10:24 PM, Jinhao Fan wrote: > > > @@ -5793,6 +5891,7 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const > > NvmeRequest *req) > > uint64_t dbs_addr = le64_to_cpu(req->cmd.dptr.prp1); > > uint64_t eis_addr = le64_to_cpu(req->cmd.dptr.prp2); > > int i; > > +int ret; > > > > I just noticed this ret is unused. Could you help remove this line when > applying the patch? Yes, I noticed it and hot-fixed it ;) signature.asc Description: PGP signature
Re: [PATCH v4] hw/nvme: Use ioeventfd to handle doorbell updates
at 10:24 PM, Jinhao Fan wrote: > @@ -5793,6 +5891,7 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const > NvmeRequest *req) > uint64_t dbs_addr = le64_to_cpu(req->cmd.dptr.prp1); > uint64_t eis_addr = le64_to_cpu(req->cmd.dptr.prp2); > int i; > +int ret; > I just noticed this ret is unused. Could you help remove this line when applying the patch?
Re: [PATCH v4] hw/nvme: Use ioeventfd to handle doorbell updates
at 1:51 PM, Klaus Jensen wrote: > On Jul 6 19:34, Jinhao Fan wrote: >> at 2:43 AM, Keith Busch wrote: >> >>> On Tue, Jul 05, 2022 at 07:11:36PM +0200, Klaus Jensen wrote: On Jul 5 22:24, Jinhao Fan wrote: > @@ -1374,7 +1374,14 @@ static void nvme_enqueue_req_completion(NvmeCQueue > *cq, NvmeRequest *req) > >QTAILQ_REMOVE(&req->sq->out_req_list, req, entry); >QTAILQ_INSERT_TAIL(&cq->req_list, req, entry); > -timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500); > + > +if (req->sq->ioeventfd_enabled) { > +/* Post CQE directly since we are in main loop thread */ > +nvme_post_cqes(cq); > +} else { > +/* Schedule the timer to post CQE later since we are in vcpu > thread */ > +timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + > 500); > +} Actually, we are only in the vcpu thread if we come here from nvme_process_db that in very rare circumstances may enqueue the completion of an AER due to an invalid doorbell write. In general, nvme_enqueue_req_completion is only ever called from the main iothread. Which actually causes me to wonder why we defer this work in the first place. It does have the benefit that we queue up several completions before posting them in one go and raising the interrupt. But I wonder if that could be handled better. >>> >>> I think the timer is used because of the cq_full condition. We need to >>> restart >>> completions when it becomes not full, which requires a doorbell write. >>> Having >>> everyone from the main iothread use the same timer as the doorbell handler >>> just >>> ensures single threaded list access. >> >> Could we let nvme_process_aers register another timer/BH to trigger >> nvme_enqueue_req_completion in the iothread? In this way we won’t need the >> timer_mod in nvme_enqueue_req_completion. > > Yes, we could have process_aers in a timer. Which would probably be > preferable in order to limit the amount of work the mmio handler is > doing in that rare case. 
However, its such a rare case (only misbehaving > drivers) that it's probably not worth optimizing for. I think putting nvme_process_aers in a timer can help make sure nvme_enqueue_req_completion is only called in an iothread context. Currently it can be called either in iothread or vcpu thread. So nvme_enqueue_req_completion has to infer its context, in my patch using the cq->ioeventfd_enabled flag. This approach is probably error-prone. Honestly I am not sure whether cq->ioeventfd_enabled is enough to guarantee we are in iothread. >> We can also avoid some potential currency problems because CQ is only >> modified in the iothread. > > There are currently no concurrency problems because of the Big QEMU > Lock. When the mmio handler is running, the vcpu holds the BQL (and > whenever the main iothread is running, it is holding the BQL). Will this still hold when we move on to iothreads? > >> BTW, are there any reason that we must use timers (not BH) here? Also why do >> we choose to delay for 500ns? > > No particular reason. do not see any reason why this could not be bottom > halfs. This will likely change into bhs when we add iothread support > anyway. I will try BH when I start the iothread work.
Re: [PATCH v4] hw/nvme: Use ioeventfd to handle doorbell updates
On Jul 6 19:34, Jinhao Fan wrote: > at 2:43 AM, Keith Busch wrote: > > > On Tue, Jul 05, 2022 at 07:11:36PM +0200, Klaus Jensen wrote: > >> On Jul 5 22:24, Jinhao Fan wrote: > >>> @@ -1374,7 +1374,14 @@ static void nvme_enqueue_req_completion(NvmeCQueue > >>> *cq, NvmeRequest *req) > >>> > >>> QTAILQ_REMOVE(&req->sq->out_req_list, req, entry); > >>> QTAILQ_INSERT_TAIL(&cq->req_list, req, entry); > >>> -timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500); > >>> + > >>> +if (req->sq->ioeventfd_enabled) { > >>> +/* Post CQE directly since we are in main loop thread */ > >>> +nvme_post_cqes(cq); > >>> +} else { > >>> +/* Schedule the timer to post CQE later since we are in vcpu > >>> thread */ > >>> +timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + > >>> 500); > >>> +} > >> > >> Actually, we are only in the vcpu thread if we come here from > >> nvme_process_db that in very rare circumstances may enqueue the > >> completion of an AER due to an invalid doorbell write. > >> > >> In general, nvme_enqueue_req_completion is only ever called from the > >> main iothread. Which actually causes me to wonder why we defer this work > >> in the first place. It does have the benefit that we queue up several > >> completions before posting them in one go and raising the interrupt. > >> But I wonder if that could be handled better. > > > > I think the timer is used because of the cq_full condition. We need to > > restart > > completions when it becomes not full, which requires a doorbell write. > > Having > > everyone from the main iothread use the same timer as the doorbell handler > > just > > ensures single threaded list access. > > Could we let nvme_process_aers register another timer/BH to trigger > nvme_enqueue_req_completion in the iothread? In this way we won’t need the > timer_mod in nvme_enqueue_req_completion. Yes, we could have process_aers in a timer. 
Which would probably be preferable in order to limit the amount of work the mmio handler is doing in that rare case. However, it's such a rare case (only misbehaving drivers) that it's probably not worth optimizing for. > We can also avoid some potential currency problems because CQ is only > modified in the iothread. > There are currently no concurrency problems because of the Big QEMU Lock. When the mmio handler is running, the vcpu holds the BQL (and whenever the main iothread is running, it is holding the BQL). > BTW, are there any reason that we must use timers (not BH) here? Also why do > we choose to delay for 500ns? No particular reason. I do not see any reason why this could not be bottom halves. This will likely change into bhs when we add iothread support anyway. signature.asc Description: PGP signature
Re: [PATCH v4] hw/nvme: Use ioeventfd to handle doorbell updates
at 2:43 AM, Keith Busch wrote: > On Tue, Jul 05, 2022 at 07:11:36PM +0200, Klaus Jensen wrote: >> On Jul 5 22:24, Jinhao Fan wrote: >>> @@ -1374,7 +1374,14 @@ static void nvme_enqueue_req_completion(NvmeCQueue >>> *cq, NvmeRequest *req) >>> >>> QTAILQ_REMOVE(&req->sq->out_req_list, req, entry); >>> QTAILQ_INSERT_TAIL(&cq->req_list, req, entry); >>> -timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500); >>> + >>> +if (req->sq->ioeventfd_enabled) { >>> +/* Post CQE directly since we are in main loop thread */ >>> +nvme_post_cqes(cq); >>> +} else { >>> +/* Schedule the timer to post CQE later since we are in vcpu >>> thread */ >>> +timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500); >>> +} >> >> Actually, we are only in the vcpu thread if we come here from >> nvme_process_db that in very rare circumstances may enqueue the >> completion of an AER due to an invalid doorbell write. >> >> In general, nvme_enqueue_req_completion is only ever called from the >> main iothread. Which actually causes me to wonder why we defer this work >> in the first place. It does have the benefit that we queue up several >> completions before posting them in one go and raising the interrupt. >> But I wonder if that could be handled better. > > I think the timer is used because of the cq_full condition. We need to restart > completions when it becomes not full, which requires a doorbell write. Having > everyone from the main iothread use the same timer as the doorbell handler > just > ensures single threaded list access. Could we let nvme_process_aers register another timer/BH to trigger nvme_enqueue_req_completion in the iothread? In this way we won’t need the timer_mod in nvme_enqueue_req_completion. We can also avoid some potential currency problems because CQ is only modified in the iothread. BTW, are there any reason that we must use timers (not BH) here? Also why do we choose to delay for 500ns?
Re: [PATCH v4] hw/nvme: Use ioeventfd to handle doorbell updates
at 1:11 AM, Klaus Jensen wrote: > On Jul 5 22:24, Jinhao Fan wrote: >> Add property "ioeventfd" which is enabled by default. When this is >> enabled, updates on the doorbell registers will cause KVM to signal >> an event to the QEMU main loop to handle the doorbell updates. >> Therefore, instead of letting the vcpu thread run both guest VM and >> IO emulation, we now use the main loop thread to do IO emulation and >> thus the vcpu thread has more cycles for the guest VM. > > This is not entirely accurate. > > Yes, the ioeventfd causes the doorbell write to be handled by the main > iothread, but previously we did not do any substantial device emulation > in the vcpu thread either. That is the reason why we only handle the > bare minimum of the doorbell write and then defer any work until the > timer fires on the main iothread. > > But with this patch we just go ahead and do the work (nvme_process_sq) > immediately since we are already in the main iothread. > Thanks for pointing this out. I previously thought the timers are fired in the vcpu threads. I misunderstood why this optimization works but accidentally got the code right.
Re: [PATCH v4] hw/nvme: Use ioeventfd to handle doorbell updates
On Tue, Jul 05, 2022 at 07:11:36PM +0200, Klaus Jensen wrote: > On Jul 5 22:24, Jinhao Fan wrote: > > @@ -1374,7 +1374,14 @@ static void nvme_enqueue_req_completion(NvmeCQueue > > *cq, NvmeRequest *req) > > > > QTAILQ_REMOVE(&req->sq->out_req_list, req, entry); > > QTAILQ_INSERT_TAIL(&cq->req_list, req, entry); > > -timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500); > > + > > +if (req->sq->ioeventfd_enabled) { > > +/* Post CQE directly since we are in main loop thread */ > > +nvme_post_cqes(cq); > > +} else { > > +/* Schedule the timer to post CQE later since we are in vcpu > > thread */ > > +timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500); > > +} > > Actually, we are only in the vcpu thread if we come here from > nvme_process_db that in very rare circumstances may enqueue the > completion of an AER due to an invalid doorbell write. > > In general, nvme_enqueue_req_completion is only ever called from the > main iothread. Which actually causes me to wonder why we defer this work > in the first place. It does have the benefit that we queue up several > completions before posting them in one go and raising the interrupt. > But I wonder if that could be handled better. I think the timer is used because of the cq_full condition. We need to restart completions when it becomes not full, which requires a doorbell write. Having everyone from the main iothread use the same timer as the doorbell handler just ensures single threaded list access.
Re: [PATCH v4] hw/nvme: Use ioeventfd to handle doorbell updates
On Jul 5 22:24, Jinhao Fan wrote: > Add property "ioeventfd" which is enabled by default. When this is > enabled, updates on the doorbell registers will cause KVM to signal > an event to the QEMU main loop to handle the doorbell updates. > Therefore, instead of letting the vcpu thread run both guest VM and > IO emulation, we now use the main loop thread to do IO emulation and > thus the vcpu thread has more cycles for the guest VM. > This is not entirely accurate. Yes, the ioeventfd causes the doorbell write to be handled by the main iothread, but previously we did not do any substantial device emulation in the vcpu thread either. That is the reason why we only handle the bare minimum of the doorbell write and then defer any work until the timer fires on the main iothread. But with this patch we just go ahead and do the work (nvme_process_sq) immediately since we are already in the main iothread. > diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c > index c952c34f94..4b75c5f549 100644 > --- a/hw/nvme/ctrl.c > +++ b/hw/nvme/ctrl.c > @@ -1374,7 +1374,14 @@ static void nvme_enqueue_req_completion(NvmeCQueue > *cq, NvmeRequest *req) > > QTAILQ_REMOVE(&req->sq->out_req_list, req, entry); > QTAILQ_INSERT_TAIL(&cq->req_list, req, entry); > -timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500); > + > +if (req->sq->ioeventfd_enabled) { > +/* Post CQE directly since we are in main loop thread */ > +nvme_post_cqes(cq); > +} else { > +/* Schedule the timer to post CQE later since we are in vcpu thread > */ > +timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500); > +} Actually, we are only in the vcpu thread if we come here from nvme_process_db that in very rare circumstances may enqueue the completion of an AER due to an invalid doorbell write. In general, nvme_enqueue_req_completion is only ever called from the main iothread. Which actually causes me to wonder why we defer this work in the first place. 
It does have the benefit that we queue up several completions before posting them in one go and raising the interrupt. But I wonder if that could be handled better. Anyway, it doesn't affect the correctness of your patch - just an observation that we should look into possibly. LGTM, Reviewed-by: Klaus Jensen signature.asc Description: PGP signature
[PATCH v4] hw/nvme: Use ioeventfd to handle doorbell updates
Add property "ioeventfd" which is enabled by default. When this is enabled, updates on the doorbell registers will cause KVM to signal an event to the QEMU main loop to handle the doorbell updates. Therefore, instead of letting the vcpu thread run both guest VM and IO emulation, we now use the main loop thread to do IO emulation and thus the vcpu thread has more cycles for the guest VM. Since ioeventfd does not tell us the exact value that is written, it is only useful when shadow doorbell buffer is enabled, where we check for the value in the shadow doorbell buffer when we get the doorbell update event. IOPS comparison on Linux 5.19-rc2: (Unit: KIOPS) qd 1 4 16 64 qemu35 121 176 153 ioeventfd 41 133 258 313 Changes since v3: - Do not deregister ioeventfd when it was not enabled on a SQ/CQ Signed-off-by: Jinhao Fan --- hw/nvme/ctrl.c | 114 - hw/nvme/nvme.h | 5 +++ 2 files changed, 118 insertions(+), 1 deletion(-) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index c952c34f94..4b75c5f549 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -1374,7 +1374,14 @@ static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req) QTAILQ_REMOVE(&req->sq->out_req_list, req, entry); QTAILQ_INSERT_TAIL(&cq->req_list, req, entry); -timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500); + +if (req->sq->ioeventfd_enabled) { +/* Post CQE directly since we are in main loop thread */ +nvme_post_cqes(cq); +} else { +/* Schedule the timer to post CQE later since we are in vcpu thread */ +timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500); +} } static void nvme_process_aers(void *opaque) @@ -4195,10 +4202,82 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req) return NVME_INVALID_OPCODE | NVME_DNR; } +static void nvme_cq_notifier(EventNotifier *e) +{ +NvmeCQueue *cq = container_of(e, NvmeCQueue, notifier); +NvmeCtrl *n = cq->ctrl; + +event_notifier_test_and_clear(&cq->notifier); + +nvme_update_cq_head(cq); + +if (cq->tail == cq->head) { +if 
(cq->irq_enabled) { +n->cq_pending--; +} + +nvme_irq_deassert(n, cq); +} + +nvme_post_cqes(cq); +} + +static int nvme_init_cq_ioeventfd(NvmeCQueue *cq) +{ +NvmeCtrl *n = cq->ctrl; +uint16_t offset = (cq->cqid << 3) + (1 << 2); +int ret; + +ret = event_notifier_init(&cq->notifier, 0); +if (ret < 0) { +return ret; +} + +event_notifier_set_handler(&cq->notifier, nvme_cq_notifier); +memory_region_add_eventfd(&n->iomem, + 0x1000 + offset, 4, false, 0, &cq->notifier); + +return 0; +} + +static void nvme_sq_notifier(EventNotifier *e) +{ +NvmeSQueue *sq = container_of(e, NvmeSQueue, notifier); + +event_notifier_test_and_clear(&sq->notifier); + +nvme_process_sq(sq); +} + +static int nvme_init_sq_ioeventfd(NvmeSQueue *sq) +{ +NvmeCtrl *n = sq->ctrl; +uint16_t offset = sq->sqid << 3; +int ret; + +ret = event_notifier_init(&sq->notifier, 0); +if (ret < 0) { +return ret; +} + +event_notifier_set_handler(&sq->notifier, nvme_sq_notifier); +memory_region_add_eventfd(&n->iomem, + 0x1000 + offset, 4, false, 0, &sq->notifier); + +return 0; +} + static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n) { +uint16_t offset = sq->sqid << 3; + n->sq[sq->sqid] = NULL; timer_free(sq->timer); +if (sq->ioeventfd_enabled) { +memory_region_del_eventfd(&n->iomem, + 0x1000 + offset, 4, false, 0, &sq->notifier); +event_notifier_cleanup(&sq->notifier); +} g_free(sq->io_req); if (sq->sqid) { g_free(sq); @@ -4271,6 +4350,12 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr, if (n->dbbuf_enabled) { sq->db_addr = n->dbbuf_dbs + (sqid << 3); sq->ei_addr = n->dbbuf_eis + (sqid << 3); + +if (n->params.ioeventfd && sq->sqid != 0) { +if (!nvme_init_sq_ioeventfd(sq)) { +sq->ioeventfd_enabled = true; +} +} } assert(n->cq[cqid]); @@ -4575,8 +4660,15 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req) static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n) { +uint16_t offset = (cq->cqid << 3) + (1 << 2); + n->cq[cq->cqid] = NULL; timer_free(cq->timer); +if (cq->ioeventfd_enabled) { 
+memory_region_del_eventfd(&n->iomem, + 0x1000 + offset, 4, false, 0, &cq->notifier); +event_notifier_cleanup(&cq->notifier); +} if (msix_enabled(&n->parent_obj)) { msix_vector_unuse(&n->parent_obj, cq->vector); } @@ -4635,6 +4727,12 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr, if (n->dbb