Hi,

When running iotest 169 in parallel (e.g. like so:

$ while TEST_DIR=/tmp/t0 ./check -T -qcow2 169; do; done
$ while TEST_DIR=/tmp/t1 ./check -T -qcow2 169; do; done
$ while TEST_DIR=/tmp/t2 ./check -T -qcow2 169; do; done
$ while TEST_DIR=/tmp/t3 ./check -T -qcow2 169; do; done

in four different shells), I get aborts:

(Often I get segfaults, but that's because of
http://lists.nongnu.org/archive/html/qemu-devel/2018-12/msg05579.html --
feel free to apply the attached patch to make them go away)


WARNING:qemu:qemu received signal 6:
build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64
-chardev socket,id=mon,path=/tmp/t0/tmpbX30XU/qemua-25745-monitor.sock
-mon chardev=mon,mode=control -display none -vga none -qtest
unix:path=/tmp/t0/qemua-25745-qtest.sock -machine accel=qtest
-nodefaults -machine accel=qtest -drive
if=virtio,id=drive0,file=/tmp/t0/disk_a,format=qcow2,cache=writeback
.................E..
======================================================================
ERROR:
test_do_test_migration_resume_source_not_persistent__not_migbitmap
(__main__.TestDirtyBitmapMigration)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "169", line 206, in <lambda>
    setattr(klass, 'test_' + method + name, lambda self: mc(self))
  File "169", line 113, in do_test_migration_resume_source
    self.check_bitmap(self.vm_a, sha256)
  File "169", line 72, in check_bitmap
    node='drive0', name='bitmap0')
  File "tests/qemu-iotests/../../scripts/qemu.py", line 369, in qmp
    return self._qmp.cmd(cmd, args=qmp_args)
  File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 191, in cmd
    return self.cmd_obj(qmp_cmd)
  File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 174, in cmd_obj
    resp = self.__json_read()
  File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 82, in
__json_read
    data = self.__sockfile.readline()
  File "/usr/lib64/python2.7/socket.py", line 451, in readline
    data = self._sock.recv(self._rbufsize)
error: [Errno 104] Connection reset by peer

----------------------------------------------------------------------
Ran 20 tests

FAILED (errors=1)


Or:

WARNING:qemu:qemu received signal 6:
build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64
-chardev socket,id=mon,path=/tmp/t3/tmp0pllWD/qemua-3445-monitor.sock
-mon chardev=mon,mode=control -display none -vga none -qtest
unix:path=/tmp/t3/qemua-3445-qtest.sock -machine accel=qtest -nodefaults
-machine accel=qtest -drive
if=virtio,id=drive0,file=/tmp/t3/disk_a,format=qcow2,cache=writeback
WARNING:qemu:qemu received signal 6:
build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64
-chardev socket,id=mon,path=/tmp/t3/tmp0pllWD/qemua-3445-monitor.sock
-mon chardev=mon,mode=control -display none -vga none -qtest
unix:path=/tmp/t3/qemua-3445-qtest.sock -machine accel=qtest -nodefaults
-machine accel=qtest -drive
if=virtio,id=drive0,file=/tmp/t3/disk_a,format=qcow2,cache=writeback

...................F
======================================================================
FAIL: test_do_test_migration_resume_source_persistent__not_migbitmap
(__main__.TestDirtyBitmapMigration)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "169", line 206, in <lambda>
    setattr(klass, 'test_' + method + name, lambda self: mc(self))
  File "169", line 125, in do_test_migration_resume_source
    self.assertEqual(log, '')
AssertionError: "qemu-system-x86_64: invalid runstate transition:
'running' -> 'postmigrate'\n" != ''

----------------------------------------------------------------------
Ran 20 tests

FAILED (failures=1)


The backtrace always goes like this:

(gdb) bt
#0  0x00007f0acf5cc53f in raise () at /lib64/libc.so.6
#1  0x00007f0acf5b6895 in abort () at /lib64/libc.so.6
#2  0x000055a46ebbb1a6 in runstate_set (new_state=RUN_STATE_POSTMIGRATE)
at vl.c:742
#3  0x000055a46ebbb1a6 in runstate_set
(new_state=new_state@entry=RUN_STATE_POSTMIGRATE) at vl.c:730
#4  0x000055a46ed39129 in migration_iteration_finish (s=0x55a4708be000)
at migration/migration.c:2972
#5  0x000055a46ed39129 in migration_thread
(opaque=opaque@entry=0x55a4708be000) at migration/migration.c:3130
#6  0x000055a46eea665a in qemu_thread_start (args=<optimized out>) at
util/qemu-thread-posix.c:502


#7  0x00007f0acf76258e in start_thread () at /lib64/libpthread.so.0
#8  0x00007f0acf6916a3 in clone () at /lib64/libc.so.6
(gdb) frame 2
#2  0x000055a46ebbb1a6 in runstate_set (new_state=RUN_STATE_POSTMIGRATE)
at vl.c:742
742             abort();
(gdb) print current_run_state
$1 = RUN_STATE_RUNNING


Neither migration nor runstates are my strong suit, so I thought I'd
report it before diving into it.

Max
From 4f141f42f2ae8cf509495ee0962fd45e160f33af Mon Sep 17 00:00:00 2001
From: Max Reitz <mre...@redhat.com>
Date: Wed, 23 Jan 2019 16:48:07 +0100
Subject: [PATCH] Hack to fix race in tcp_chr_disconnect()

---
 chardev/char-socket.c | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/chardev/char-socket.c b/chardev/char-socket.c
index eaa8e8b68f..9c326dcbf3 100644
--- a/chardev/char-socket.c
+++ b/chardev/char-socket.c
@@ -124,7 +124,12 @@ static void tcp_chr_accept(QIONetListener *listener,
                            void *opaque);
 
 static int tcp_chr_read_poll(void *opaque);
-static void tcp_chr_disconnect(Chardev *chr);
+static void tcp_chr_do_disconnect(Chardev *chr, bool locked);
+
+static void tcp_chr_disconnect(Chardev *chr)
+{
+    tcp_chr_do_disconnect(chr, false);
+}
 
 /* Called with chr_write_lock held.  */
 static int tcp_chr_write(Chardev *chr, const uint8_t *buf, int len)
@@ -148,7 +153,7 @@ static int tcp_chr_write(Chardev *chr, const uint8_t *buf, int len)
 
         if (ret < 0 && errno != EAGAIN) {
             if (tcp_chr_read_poll(chr) <= 0) {
-                tcp_chr_disconnect(chr);
+                tcp_chr_do_disconnect(chr, true);
                 return len;
             } /* else let the read handler finish it properly */
         }
@@ -444,8 +449,12 @@ static void update_disconnected_filename(SocketChardev *s)
  * reached, due to TLS or telnet initialization failure,
  * so can *not* assume s->connected == true
  */
-static void tcp_chr_disconnect(Chardev *chr)
+static void tcp_chr_do_disconnect(Chardev *chr, bool locked)
 {
+    if (!locked) {
+        qemu_mutex_lock(&chr->chr_write_lock);
+    }
+
     SocketChardev *s = SOCKET_CHARDEV(chr);
     bool emit_close = s->connected;
 
@@ -462,6 +471,10 @@ static void tcp_chr_disconnect(Chardev *chr)
     if (s->reconnect_time) {
         qemu_chr_socket_restart_timer(chr);
     }
+
+    if (!locked) {
+        qemu_mutex_unlock(&chr->chr_write_lock);
+    }
 }
 
 static gboolean tcp_chr_read(QIOChannel *chan, GIOCondition cond, void *opaque)
-- 
2.20.1

Attachment: signature.asc
Description: OpenPGP digital signature

Reply via email to