On Tuesday, 31 August 2021 12:57:49 CEST Greg Kurz wrote:
> On Mon, 30 Aug 2021 17:55:04 +0200
> Christian Schoenebeck <qemu_...@crudebyte.com> wrote:
> > Apparently commit 8d6cb100731c4d28535adbf2a3c2d1f29be3fef4 '9pfs: reduce
> > latency of Twalk' has introduced occasional crashes.
> >
> > My first impression after looking at the backtrace: it looks like the
> > patch itself is probably not causing this, but rather unmasked an
> > existing issue (i.e. increased the chance of triggering it).
> >
> > The crash happens because 'elem' is NULL in virtio_pdu_vunmarshal()
> > (frame 0).
>
> Ouch... this certainly isn't expected to happen :-\
>
> elem is popped out of the vq in handle_9p_output():
>
>         elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
>         if (!elem) {
>             goto out_free_pdu;
>         }
>         [...]
>         v->elems[pdu->idx] = elem;
>
> and cleared on PDU completion in virtio_9p_push_and_notify():
>
>     v->elems[pdu->idx] = NULL;
>
> I can't think of a way where push_and_notify() could collide with
> handle_output()... both are supposed to be run sequentially by the
> main event loop.
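The straightforward way to verify such an "everything runs on one thread"
assumption is to record whichever thread gets there first and assert on every
subsequent call. As a standalone toy sketch (plain pthreads, names invented
here, nothing QEMU-specific; the race on the first call is ignored since this
is only a debugging aid):

/* Toy "single thread only" self-check; a sketch, not QEMU code. */
#include <assert.h>
#include <pthread.h>
#include <stdbool.h>

static pthread_t expected_thread;
static bool recorded;

static void assert_single_thread(void)
{
    if (!recorded) {
        /* first caller wins and defines the expected thread */
        recorded = true;
        expected_thread = pthread_self();
        return;
    }
    /* any later call from a different thread aborts right here */
    assert(pthread_equal(expected_thread, pthread_self()));
}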
Ok, I made a quick "is same thread" assertion check along those lines:

diff --git a/hw/9pfs/virtio-9p-device.c b/hw/9pfs/virtio-9p-device.c
index 54ee93b71f..bb6ebd16aa 100644
--- a/hw/9pfs/virtio-9p-device.c
+++ b/hw/9pfs/virtio-9p-device.c
@@ -28,12 +28,25 @@
 #include "qemu/module.h"
 #include "sysemu/qtest.h"
 
+static void assert_thread(void) {
+    static QemuThread thread;
+    static bool calledBefore;
+    if (!calledBefore) {
+        calledBefore = 1;
+        qemu_thread_get_self(&thread);
+        return;
+    }
+    assert(qemu_thread_is_self(&thread));
+}
+
 static void virtio_9p_push_and_notify(V9fsPDU *pdu)
 {
     V9fsState *s = pdu->s;
     V9fsVirtioState *v = container_of(s, V9fsVirtioState, state);
     VirtQueueElement *elem = v->elems[pdu->idx];
 
+    assert_thread();
+
     /* push onto queue and notify */
     virtqueue_push(v->vq, elem, pdu->size);
     g_free(elem);
@@ -51,6 +64,8 @@ static void handle_9p_output(VirtIODevice *vdev, VirtQueue *vq)
     ssize_t len;
     VirtQueueElement *elem;
 
+    assert_thread();
+
     while ((pdu = pdu_alloc(s))) {
         P9MsgHeader out;
@@ -125,6 +140,8 @@ static ssize_t virtio_pdu_vmarshal(V9fsPDU *pdu, size_t offset,
     VirtQueueElement *elem = v->elems[pdu->idx];
     ssize_t ret;
 
+    assert_thread();
+
     ret = v9fs_iov_vmarshal(elem->in_sg, elem->in_num, offset, 1, fmt, ap);
     if (ret < 0) {
         VirtIODevice *vdev = VIRTIO_DEVICE(v);
@@ -143,6 +160,8 @@ static ssize_t virtio_pdu_vunmarshal(V9fsPDU *pdu, size_t offset,
     VirtQueueElement *elem = v->elems[pdu->idx];
     ssize_t ret;
 
+    assert_thread();
+
     ret = v9fs_iov_vunmarshal(elem->out_sg, elem->out_num, offset, 1, fmt, ap);
     if (ret < 0) {
         VirtIODevice *vdev = VIRTIO_DEVICE(v);
@@ -160,6 +179,8 @@ static void virtio_init_in_iov_from_pdu(V9fsPDU *pdu, struct iovec **piov,
     VirtQueueElement *elem = v->elems[pdu->idx];
     size_t buf_size = iov_size(elem->in_sg, elem->in_num);
 
+    assert_thread();
+
     if (buf_size < size) {
         VirtIODevice *vdev = VIRTIO_DEVICE(v);
@@ -180,6 +201,8 @@ static void virtio_init_out_iov_from_pdu(V9fsPDU *pdu, struct iovec **piov,
     VirtQueueElement *elem = v->elems[pdu->idx];
     size_t buf_size = iov_size(elem->out_sg, elem->out_num);
 
+    assert_thread();
+
     if (buf_size < size) {
         VirtIODevice *vdev = VIRTIO_DEVICE(v);

And it triggered. However, I am not sure whether all of the functions
instrumented above are in fact required to run on the main thread:

Program terminated with signal SIGABRT, Aborted.
#0  __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:50
50      ../sysdeps/unix/sysv/linux/raise.c: No such file or directory.
[Current thread is 1 (Thread 0x7fd0bcef1700 (LWP 6470))]
(gdb) bt
#0  0x00007fd3fe6db7bb in __GI_raise (sig=sig@entry=6)
    at ../sysdeps/unix/sysv/linux/raise.c:50
#1  0x00007fd3fe6c6535 in __GI_abort () at abort.c:79
#2  0x00007fd3fe6c640f in __assert_fail_base (fmt=0x7fd3fe828ee0
    "%s%s%s:%u: %s%sAssertion `%s' failed.\n%n",
    assertion=0x563569935704 "qemu_thread_is_self(&thread)",
    file=0x5635699356e6 2
#3  0x00007fd3fe6d4102 in __GI___assert_fail
    (assertion=assertion@entry=0x563569935704 "qemu_thread_is_self(&thread)",
    file=file@entry=0x5635699356e6 "../hw/9pfs/virtio-9p-device.c", line1
#4  0x00005635695c98aa in assert_thread () at ../hw/9pfs/virtio-9p-device.c:39
#5  0x00005635695c98aa in assert_thread () at ../hw/9pfs/virtio-9p-device.c:31
#6  0x00005635695c9a4b in virtio_9p_push_and_notify (pdu=0x56356bc38f38)
    at ../hw/9pfs/virtio-9p-device.c:48
#7  0x00005635693c2f28 in pdu_complete (pdu=pdu@entry=0x56356bc38f38,
    len=<optimized out>, len@entry=-4) at ../hw/9pfs/9p.c:1059
#8  0x00005635693c49e0 in v9fs_walk (opaque=0x56356bc38f38)
    at ../hw/9pfs/9p.c:1868
#9  0x00005635698256bb in coroutine_trampoline (i0=<optimized out>,
    i1=<optimized out>) at ../util/coroutine-ucontext.c:173
#10 0x00007fd3fe6f0b50 in __correctly_grouped_prefixwc (begin=0x2
    <error: Cannot access memory at address 0x2>, end=0x0, thousands=0 L'\000',
    grouping=0x7fd3fe6db7bb <__GI_raise+267> "H\213\214$\b\1
#11 0x0000000000000000 in  ()
(gdb)

Also, it seems there is no official qemu_main_thread() function yet. That
might be useful in general.
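Something like this is what I have in mind, purely a sketch with names
invented by me; only the qemu_thread_get_self() and qemu_thread_is_self()
helpers exist today:

/* Hypothetical helpers; these function names do not exist in QEMU (yet). */
#include "qemu/osdep.h"
#include "qemu/thread.h"

static QemuThread main_thread;

/* would have to be called once, early on the actual main thread */
void qemu_record_main_thread(void)
{
    qemu_thread_get_self(&main_thread);
}

bool qemu_is_main_thread(void)
{
    return qemu_thread_is_self(&main_thread);
}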
> Maybe activate some traces?

I need to read up on how to use traces first, then I'll check that as well.
But ensuring thread sanity along the lines proposed above probably makes
sense before anything else.

I also tried a little retry hack as a test, just in case this was some sort
of "not ready yet" issue, because I saw Philippe has been working on a
virtio "queue not ready" issue lately:

https://lists.gnu.org/archive/html/qemu-devel/2021-08/msg04460.html
https://lists.gnu.org/archive/html/qemu-devel/2021-08/msg04471.html

diff --git a/hw/9pfs/virtio-9p-device.c b/hw/9pfs/virtio-9p-device.c
index 54ee93b71f..ba332c27b6 100644
--- a/hw/9pfs/virtio-9p-device.c
+++ b/hw/9pfs/virtio-9p-device.c
@@ -141,6 +141,19 @@ static ssize_t virtio_pdu_vunmarshal(V9fsPDU *pdu, size_t offset,
     V9fsState *s = pdu->s;
     V9fsVirtioState *v = container_of(s, V9fsVirtioState, state);
     VirtQueueElement *elem = v->elems[pdu->idx];
+    for (int i = 0; !elem; ++i) {
+        bool in_coroutine = qemu_in_coroutine();
+        printf("TRIGGERED %d (in_coroutine=%d)!\n", i, in_coroutine);
+        fflush(stdout);
+        if (in_coroutine) {
+            qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 100000);
+        } else {
+            g_usleep(100);
+        }
+        s = pdu->s;
+        v = container_of(s, V9fsVirtioState, state);
+        elem = v->elems[pdu->idx];
+    }
     ssize_t ret;
 
     ret = v9fs_iov_vunmarshal(elem->out_sg, elem->out_num, offset, 1, fmt, ap);

But that did not yield much:

TRIGGERED 0 (in_coroutine=1)!
TRIGGERED 1 (in_coroutine=1)!
TRIGGERED 2 (in_coroutine=1)!
...
TRIGGERED 43 (in_coroutine=1)!
TRIGGERED 44 (in_coroutine=1)!
qemu_aio_coroutine_enter: Co-routine was already scheduled in 'qemu_co_sleep_ns'
Aborted (core dumped)

So the slot never becomes non-NULL, and on top of that the sleeping coroutine
apparently gets woken a second time while it is still scheduled, which again
points to two contexts touching the same request.

> > bt taken with HEAD being 8d6cb100731c4d28535adbf2a3c2d1f29be3fef4:
> >
> > Program terminated with signal SIGSEGV, Segmentation fault.
> > #0  virtio_pdu_vunmarshal (pdu=0x55a93717cde8, offset=7,
> >     fmt=0x55a9352766d1 "ddw", ap=0x7f38a9ad9cd0)
> >     at ../hw/9pfs/virtio-9p-device.c:146
> > 146         ret = v9fs_iov_vunmarshal(elem->out_sg, elem->out_num, offset, 1, fmt, ap);
> > [Current thread is 1 (Thread 0x7f3bddd2ac40 (LWP 7811))]
> > (gdb) bt full
> > #0  0x000055a934dfb9a7 in virtio_pdu_vunmarshal (pdu=0x55a93717cde8,
> >     offset=7, fmt=0x55a9352766d1 "ddw", ap=0x7f38a9ad9cd0)
> >     at ../hw/9pfs/virtio-9p-device.c:146
> >         s = 0x55a93717b4b8
> >         v = 0x55a93717aee0
> >         elem = 0x0
>
> So this is v->elems[pdu->idx]... what's the value of pdu->idx ?

In that originally posted core dump it was 113:

#0  virtio_pdu_vunmarshal (pdu=0x55a93717cde8, offset=7,
    fmt=0x55a9352766d1 "ddw", ap=0x7f38a9ad9cd0)
    at ../hw/9pfs/virtio-9p-device.c:146
146         ret = v9fs_iov_vunmarshal(elem->out_sg, elem->out_num, offset, 1, fmt, ap);
[Current thread is 1 (Thread 0x7f3bddd2ac40 (LWP 7811))]
(gdb) p pdu->idx
$1 = 113

> >         ret = <optimized out>
> >
> > #1  0x000055a934bf35e8 in pdu_unmarshal (pdu=pdu@entry=0x55a93717cde8,
> >     offset=offset@entry=7, fmt=fmt@entry=0x55a9352766d1 "ddw")
> >     at ../hw/9pfs/9p.c:71
> >         ret = <optimized out>
> >         ap = {{gp_offset = 24, fp_offset = 48,
> >           overflow_arg_area = 0x7f38a9ad9db0,
> >           reg_save_area = 0x7f38a9ad9cf0}}
> > #2  0x000055a934bf68db in v9fs_walk (opaque=0x55a93717cde8)
> >     at ../hw/9pfs/9p.c:1720
> >         name_idx = <optimized out>
> >         qids = 0x0
> >         i = <optimized out>
> >         err = 0
> >         dpath = {size = 0, data = 0x0}
> >         path = {size = 0, data = 0x0}
> >         pathes = 0x0
> >         nwnames = 1
> >         stbuf = {st_dev = 2050, st_ino = 1199848, st_nlink = 1,
> >           st_mode = 41471, st_uid = 0, st_gid = 0, __pad0 = 0,
> >           st_rdev = 0, st_size = 13, st_blksize = 4096, st_blocks = 16, s}
> >         fidst = {st_dev = 2050, st_ino = 1198183, st_nlink = 3,
> >           st_mode = 16877, st_uid = 0, st_gid = 0, __pad0 = 0,
> >           st_rdev = 0, st_size = 12288, st_blksize = 4096, st_blocks = 32}
> >         stbufs = 0x0
> >         offset = 7
> >         fid = 299
> >         newfid = 687
> >         wnames = 0x0
> >         fidp = <optimized out>
> >         newfidp = 0x0
> >         pdu = 0x55a93717cde8
> >         s = 0x55a93717b4b8
> >         qid = {type = 2 '\002', version = 1556732739, path = 2399697}
> > #3  0x000055a93505760b in coroutine_trampoline (i0=<optimized out>,
> >     i1=<optimized out>) at ../util/coroutine-ucontext.c:173
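Next time I catch this in gdb I'll also dump the whole elems ring from
frame 0, to see whether only this one slot is NULL or whether the table
looks generally bogus; something like the following should do (assuming
MAX_REQ is still 128):

(gdb) frame 0
(gdb) p pdu->idx
(gdb) p v->elems[0]@128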