On Dienstag, 31. August 2021 12:57:49 CEST Greg Kurz wrote:
> On Mon, 30 Aug 2021 17:55:04 +0200
> 
> Christian Schoenebeck <qemu_...@crudebyte.com> wrote:
> > Apparently commit 8d6cb100731c4d28535adbf2a3c2d1f29be3fef4 '9pfs: reduce
> > latency of Twalk' has introduced occasional crashes.
> > 
> > My first impression after looking at the backtrace: looks like the patch
> > itself is probably not causing this, but rather unmasked this issue (i.e.
> > increased the chance to be triggered).
> > 
> > The crash is because of 'elem' is NULL in virtio_pdu_vunmarshal() (frame
> > 0).
> Ouch... this certainly isn't expected to happen :-\
> 
> elem is popped out the vq in handle_9p_output():
> 
>         elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
>         if (!elem) {
>             goto out_free_pdu;
>         }
> [...]
>         v->elems[pdu->idx] = elem;
> 
> and cleared on PDU completion in virtio_9p_push_and_notify() :
> 
>     v->elems[pdu->idx] = NULL;
> 
> 
> I can't think of a way where push_and_notify() could collide
> with handle_output()... both are supposed to be run sequentially
> by the main event loop.

Ok, I made a quick "is same thread" assertion check:

diff --git a/hw/9pfs/virtio-9p-device.c b/hw/9pfs/virtio-9p-device.c
index 54ee93b71f..bb6ebd16aa 100644
--- a/hw/9pfs/virtio-9p-device.c
+++ b/hw/9pfs/virtio-9p-device.c
@@ -28,12 +28,25 @@
 #include "qemu/module.h"
 #include "sysemu/qtest.h"
 
+static void assert_thread(void) {
+    static QemuThread thread;
+    static bool calledBefore;
+    if (!calledBefore) {
+        calledBefore = 1;
+        qemu_thread_get_self(&thread);
+        return;
+    }
+    assert(qemu_thread_is_self(&thread));
+}
+
 static void virtio_9p_push_and_notify(V9fsPDU *pdu)
 {
     V9fsState *s = pdu->s;
     V9fsVirtioState *v = container_of(s, V9fsVirtioState, state);
     VirtQueueElement *elem = v->elems[pdu->idx];
 
+    assert_thread();
+
     /* push onto queue and notify */
     virtqueue_push(v->vq, elem, pdu->size);
     g_free(elem);
@@ -51,6 +64,8 @@ static void handle_9p_output(VirtIODevice *vdev, VirtQueue 
*vq)
     ssize_t len;
     VirtQueueElement *elem;
 
+    assert_thread();
+
     while ((pdu = pdu_alloc(s))) {
         P9MsgHeader out;
 
@@ -125,6 +140,8 @@ static ssize_t virtio_pdu_vmarshal(V9fsPDU *pdu, size_t 
offset,
     VirtQueueElement *elem = v->elems[pdu->idx];
     ssize_t ret;
 
+    assert_thread();
+
     ret = v9fs_iov_vmarshal(elem->in_sg, elem->in_num, offset, 1, fmt, ap);
     if (ret < 0) {
         VirtIODevice *vdev = VIRTIO_DEVICE(v);
@@ -143,6 +160,8 @@ static ssize_t virtio_pdu_vunmarshal(V9fsPDU *pdu, size_t 
offset,
     VirtQueueElement *elem = v->elems[pdu->idx];
     ssize_t ret;
 
+    assert_thread();
+
     ret = v9fs_iov_vunmarshal(elem->out_sg, elem->out_num, offset, 1, fmt, 
ap);
     if (ret < 0) {
         VirtIODevice *vdev = VIRTIO_DEVICE(v);
@@ -160,6 +179,8 @@ static void virtio_init_in_iov_from_pdu(V9fsPDU *pdu, 
struct iovec **piov,
     VirtQueueElement *elem = v->elems[pdu->idx];
     size_t buf_size = iov_size(elem->in_sg, elem->in_num);
 
+    assert_thread();
+
     if (buf_size < size) {
         VirtIODevice *vdev = VIRTIO_DEVICE(v);
 
@@ -180,6 +201,8 @@ static void virtio_init_out_iov_from_pdu(V9fsPDU *pdu, 
struct iovec **piov,
     VirtQueueElement *elem = v->elems[pdu->idx];
     size_t buf_size = iov_size(elem->out_sg, elem->out_num);
 
+    assert_thread();
+
     if (buf_size < size) {
         VirtIODevice *vdev = VIRTIO_DEVICE(v);
 
And it triggered, however I am not sure if some of those functions I asserted 
above are indeed allowed to be executed on a different thread than main 
thread:

Program terminated with signal SIGABRT, Aborted.
#0  __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:50
50      ../sysdeps/unix/sysv/linux/raise.c: No such file or directory.
[Current thread is 1 (Thread 0x7fd0bcef1700 (LWP 6470))]
(gdb) bt
#0  0x00007fd3fe6db7bb in __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/
sysv/linux/raise.c:50
#1  0x00007fd3fe6c6535 in __GI_abort () at abort.c:79
#2  0x00007fd3fe6c640f in __assert_fail_base
    (fmt=0x7fd3fe828ee0 "%s%s%s:%u: %s%sAssertion `%s' failed.\n%n", 
assertion=0x563569935704 "qemu_thread_is_self(&thread)", file=0x5635699356e6 2
#3  0x00007fd3fe6d4102 in __GI___assert_fail
    (assertion=assertion@entry=0x563569935704 "qemu_thread_is_self(&thread)", 
file=file@entry=0x5635699356e6 "../hw/9pfs/virtio-9p-device.c", line1
#4  0x00005635695c98aa in assert_thread () at ../hw/9pfs/virtio-9p-device.c:39
#5  0x00005635695c98aa in assert_thread () at ../hw/9pfs/virtio-9p-device.c:31
#6  0x00005635695c9a4b in virtio_9p_push_and_notify (pdu=0x56356bc38f38) at 
../hw/9pfs/virtio-9p-device.c:48
#7  0x00005635693c2f28 in pdu_complete (pdu=pdu@entry=0x56356bc38f38, 
len=<optimized out>, len@entry=-4) at ../hw/9pfs/9p.c:1059
#8  0x00005635693c49e0 in v9fs_walk (opaque=0x56356bc38f38) at ../hw/9pfs/
9p.c:1868
#9  0x00005635698256bb in coroutine_trampoline (i0=<optimized out>, 
i1=<optimized out>) at ../util/coroutine-ucontext.c:173
#10 0x00007fd3fe6f0b50 in __correctly_grouped_prefixwc
    (begin=0x2 <error: Cannot access memory at address 0x2>, end=0x0, 
thousands=0 L'\000', grouping=0x7fd3fe6db7bb <__GI_raise+267> "H\213\214$\b\1
#11 0x0000000000000000 in  ()
(gdb)

Also there is no official qemu_main_thread() function it seems yet. That might 
be useful in general.

> Maybe active some traces ?

I need to read up on how to use traces, then I'll check that as well. But 
probably ensuring thread sanity in a way proposed above would probably make 
sense first.

I also tried a little retry hack as a test, just in case this was some sort of 
"not ready yet" issue, because I saw Philippe was working on some virtio queue 
not ready issue lately:
https://lists.gnu.org/archive/html/qemu-devel/2021-08/msg04460.html
https://lists.gnu.org/archive/html/qemu-devel/2021-08/msg04471.html

diff --git a/hw/9pfs/virtio-9p-device.c b/hw/9pfs/virtio-9p-device.c
index 54ee93b71f..ba332c27b6 100644
--- a/hw/9pfs/virtio-9p-device.c
+++ b/hw/9pfs/virtio-9p-device.c
@@ -141,6 +141,19 @@ static ssize_t virtio_pdu_vunmarshal(V9fsPDU *pdu, size_t 
offset,
     V9fsState *s = pdu->s;
     V9fsVirtioState *v = container_of(s, V9fsVirtioState, state);
     VirtQueueElement *elem = v->elems[pdu->idx];
+    for (int i = 0; !elem; ++i) {
+        bool in_coroutine = qemu_in_coroutine();
+        printf("TRIGGERED %d (in_coroutine=%d)!\n", i, in_coroutine);
+        fflush(stdout);
+        if (in_coroutine) {
+            qemu_co_sleep_ns(QEMU_CLOCK_REALTIME, 100000);
+        } else {
+            g_usleep(100);
+        }
+        s = pdu->s;
+        v = container_of(s, V9fsVirtioState, state);
+        elem = v->elems[pdu->idx];
+    }
     ssize_t ret;
 
     ret = v9fs_iov_vunmarshal(elem->out_sg, elem->out_num, offset, 1, fmt, 
ap);

But that did not bring much:

TRIGGERED 0 (in_coroutine=1)!
TRIGGERED 1 (in_coroutine=1)!
TRIGGERED 2 (in_coroutine=1)!
...
TRIGGERED 43 (in_coroutine=1)!
TRIGGERED 44 (in_coroutine=1)!
qemu_aio_coroutine_enter: Co-routine was already scheduled in 
'qemu_co_sleep_ns'
Aborted (core dumped)

> > bt taken with HEAD being 8d6cb100731c4d28535adbf2a3c2d1f29be3fef4:
> > 
> > Program terminated with signal SIGSEGV, Segmentation fault.
> > #0  virtio_pdu_vunmarshal (pdu=0x55a93717cde8, offset=7,
> > fmt=0x55a9352766d1
> > "ddw", ap=0x7f38a9ad9cd0) at ../hw/9pfs/virtio-9p-device.c:146
> > 146         ret = v9fs_iov_vunmarshal(elem->out_sg, elem->out_num, offset,
> > 1, fmt, ap);
> > [Current thread is 1 (Thread 0x7f3bddd2ac40 (LWP 7811))]
> > (gdb) bt full
> > #0  0x000055a934dfb9a7 in virtio_pdu_vunmarshal (pdu=0x55a93717cde8,
> > offset=7, fmt=0x55a9352766d1 "ddw", ap=0x7f38a9ad9cd0) at
> > ../hw/9pfs/virtio-9p-device.c: 146
> > 
> >         s = 0x55a93717b4b8
> >         v = 0x55a93717aee0
> >         elem = 0x0
> 
> So this is v->elems[pdu->idx]... what's the value of pdu->idx ?

In that originally posted core dump it was 113:

#0  virtio_pdu_vunmarshal (pdu=0x55a93717cde8, offset=7, fmt=0x55a9352766d1 
"ddw", ap=0x7f38a9ad9cd0) at ../hw/9pfs/virtio-9p-device.c:146
146         ret = v9fs_iov_vunmarshal(elem->out_sg, elem->out_num, offset, 1, 
fmt, ap);
[Current thread is 1 (Thread 0x7f3bddd2ac40 (LWP 7811))]
(gdb) p pdu->idx
$1 = 113

> >         ret = <optimized out>
> > 
> > #1  0x000055a934bf35e8 in pdu_unmarshal (pdu=pdu@entry=0x55a93717cde8,
> > offset=offset@entry=7, fmt=fmt@entry=0x55a9352766d1 "ddw") at
> > ../hw/9pfs/9p.c: 71
> > 
> >         ret = <optimized out>
> >         ap = {{gp_offset = 24, fp_offset = 48, overflow_arg_area =
> > 
> > 0x7f38a9ad9db0, reg_save_area = 0x7f38a9ad9cf0}}
> > #2  0x000055a934bf68db in v9fs_walk (opaque=0x55a93717cde8) at ../hw/9pfs/
> > 9p.c:1720
> > 
> >         name_idx = <optimized out>
> >         qids = 0x0
> >         i = <optimized out>
> >         err = 0
> >         dpath = {size = 0, data = 0x0}
> >         path = {size = 0, data = 0x0}
> >         pathes = 0x0
> >         nwnames = 1
> >         stbuf =
> >         
> >             {st_dev = 2050, st_ino = 1199848, st_nlink = 1, st_mode =
> >             41471,
> > 
> > st_uid = 0, st_gid = 0, __pad0 = 0, st_rdev = 0, st_size = 13, st_blksize
> > =
> > 4096, st_blocks = 16, s}
> > 
> >         fidst =
> >         
> >             {st_dev = 2050, st_ino = 1198183, st_nlink = 3, st_mode =
> >             16877,
> > 
> > st_uid = 0, st_gid = 0, __pad0 = 0, st_rdev = 0, st_size = 12288,
> > st_blksize = 4096, st_blocks = 32}
> > 
> >         stbufs = 0x0
> >         offset = 7
> >         fid = 299
> >         newfid = 687
> >         wnames = 0x0
> >         fidp = <optimized out>
> >         newfidp = 0x0
> >         pdu = 0x55a93717cde8
> >         s = 0x55a93717b4b8
> >         qid = {type = 2 '\002', version = 1556732739, path = 2399697}
> > 
> > #3  0x000055a93505760b in coroutine_trampoline (i0=<optimized out>,
> > i1=<optimized out>) at ../util/coroutine-ucontext.c:173



Reply via email to