24.01.2019 13:15, Kevin Wolf wrote: > Am 24.01.2019 um 10:29 hat Vladimir Sementsov-Ogievskiy geschrieben: >> 23.01.2019 18:48, Max Reitz wrote: >>> Hi, >>> >>> When running 169 in parallel (e.g. like so: >>> >>> $ while TEST_DIR=/tmp/t0 ./check -T -qcow2 169; do; done >>> $ while TEST_DIR=/tmp/t1 ./check -T -qcow2 169; do; done >>> $ while TEST_DIR=/tmp/t2 ./check -T -qcow2 169; do; done >>> $ while TEST_DIR=/tmp/t3 ./check -T -qcow2 169; do; done >>> >>> in four different shells), I get aborts: >>> >>> (Often I get segfaults, but that's because of >>> http://lists.nongnu.org/archive/html/qemu-devel/2018-12/msg05579.html -- >>> feel free to apply the attached patch to make them go away) >>> >>> >>> WARNING:qemu:qemu received signal 6: >>> build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64 >>> -chardev socket,id=mon,path=/tmp/t0/tmpbX30XU/qemua-25745-monitor.sock >>> -mon chardev=mon,mode=control -display none -vga none -qtest >>> unix:path=/tmp/t0/qemua-25745-qtest.sock -machine accel=qtest >>> -nodefaults -machine accel=qtest -drive >>> if=virtio,id=drive0,file=/tmp/t0/disk_a,format=qcow2,cache=writeback >>> .................E.. >>> ====================================================================== >>> ERROR: >>> test_do_test_migration_resume_source_not_persistent__not_migbitmap >>> (__main__.TestDirtyBitmapMigration) >>> ---------------------------------------------------------------------- >>> Traceback (most recent call last): >>> File "169", line 206, in <lambda> >>> setattr(klass, 'test_' + method + name, lambda self: mc(self)) >>> File "169", line 113, in do_test_migration_resume_source >>> self.check_bitmap(self.vm_a, sha256) >>> File "169", line 72, in check_bitmap >>> node='drive0', name='bitmap0') >>> File "tests/qemu-iotests/../../scripts/qemu.py", line 369, in qmp >>> return self._qmp.cmd(cmd, args=qmp_args) >>> File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 191, in cmd >>> return self.cmd_obj(qmp_cmd) >>> File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 174, in cmd_obj >>> resp = self.__json_read() >>> File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 82, in >>> __json_read >>> data = self.__sockfile.readline() >>> File "/usr/lib64/python2.7/socket.py", line 451, in readline >>> data = self._sock.recv(self._rbufsize) >>> error: [Errno 104] Connection reset by peer >>> >>> ---------------------------------------------------------------------- >>> Ran 20 tests >>> >>> FAILED (errors=1) >>> >>> >>> Or: >>> >>> WARNING:qemu:qemu received signal 6: >>> build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64 >>> -chardev socket,id=mon,path=/tmp/t3/tmp0pllWD/qemua-3445-monitor.sock >>> -mon chardev=mon,mode=control -display none -vga none -qtest >>> unix:path=/tmp/t3/qemua-3445-qtest.sock -machine accel=qtest -nodefaults >>> -machine accel=qtest -drive >>> if=virtio,id=drive0,file=/tmp/t3/disk_a,format=qcow2,cache=writeback >>> WARNING:qemu:qemu received signal 6: >>> build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64 >>> -chardev socket,id=mon,path=/tmp/t3/tmp0pllWD/qemua-3445-monitor.sock >>> -mon chardev=mon,mode=control -display none -vga none -qtest >>> unix:path=/tmp/t3/qemua-3445-qtest.sock -machine accel=qtest -nodefaults >>> -machine accel=qtest -drive >>> if=virtio,id=drive0,file=/tmp/t3/disk_a,format=qcow2,cache=writeback >>> >>> ...................F >>> ====================================================================== >>> FAIL: test_do_test_migration_resume_source_persistent__not_migbitmap >>> (__main__.TestDirtyBitmapMigration) >>> ---------------------------------------------------------------------- >>> Traceback (most recent call last): >>> File "169", line 206, in <lambda> >>> setattr(klass, 'test_' + method + name, lambda self: mc(self)) >>> File "169", line 125, in do_test_migration_resume_source >>> self.assertEqual(log, '') >>> AssertionError: "qemu-system-x86_64: invalid runstate transition: >>> 'running' -> 'postmigrate'\n" != '' >>> >>> ---------------------------------------------------------------------- >>> Ran 20 tests >>> >>> FAILED (failures=1) >>> >>> >>> The backtrace always goes like this: >>> >>> (gdb) bt >>> #0 0x00007f0acf5cc53f in raise () at /lib64/libc.so.6 >>> #1 0x00007f0acf5b6895 in abort () at /lib64/libc.so.6 >>> #2 0x000055a46ebbb1a6 in runstate_set (new_state=RUN_STATE_POSTMIGRATE) >>> at vl.c:742 >>> #3 0x000055a46ebbb1a6 in runstate_set >>> (new_state=new_state@entry=RUN_STATE_POSTMIGRATE) at vl.c:730 >>> #4 0x000055a46ed39129 in migration_iteration_finish (s=0x55a4708be000) >>> at migration/migration.c:2972 >>> #5 0x000055a46ed39129 in migration_thread >>> (opaque=opaque@entry=0x55a4708be000) at migration/migration.c:3130 >>> #6 0x000055a46eea665a in qemu_thread_start (args=<optimized out>) at >>> util/qemu-thread-posix.c:502 >>> >>> >>> #7 0x00007f0acf76258e in start_thread () at /lib64/libpthread.so.0 >>> #8 0x00007f0acf6916a3 in clone () at /lib64/libc.so.6 >>> (gdb) frame 2 >>> #2 0x000055a46ebbb1a6 in runstate_set (new_state=RUN_STATE_POSTMIGRATE) >>> at vl.c:742 >>> 742 abort(); >>> (gdb) print current_run_state >>> $1 = RUN_STATE_RUNNING >>> >>> >>> Neither of migration or runstates are my strong suite, so I thought I'd >>> report it before diving into it. >>> >>> Max >>> >> >> [...] >> 450556@1548321072.888979:migrate_set_state new state active >> 450556@1548321072.889022:migration_thread_setup_complete >> 450556@1548321072.988275:migrate_transferred transferred 985298 time_spent >> 100 bandwidth 9852 max_size 2955894 >> 450556@1548321072.988283:migration_bitmap_sync_start >> 450556@1548321072.988293:migration_bitmap_sync_end dirty_pages 32898 >> 450556@1548321072.988495:migration_thread_low_pending 2048 >> migration_completion >> 450556@1548321072.988541:runstate_set current_state 9 new_state 7 >> 450556@1548321072.988780:migration_bitmap_sync_start >> 450556@1548321072.988790:migration_bitmap_sync_end dirty_pages 32898 >> 450556@1548321072.993112:migrate_global_state_pre_save saved state: running >> [1] 450556@1548321072.993237:migrate_set_state new state completed >> 450549@1548321072.993668:handle_qmp_command mon 0x55aebef99830 req: >> {"execute": "x-debug-block-dirty-bitmap-sha256", "arguments": {"name": >> "bitmap0", "node": "drive0"}} >> 450556@1548321072.993697:migration_thread_after_loop >> [2] 450549@1548321072.993917:handle_qmp_command mon 0x55aebef99830 req: >> {"execute": "cont"} >> 450476@1548321072.994461:runstate_set current_state 7 new_state 9 >> qemu-system-x86_64: invalid runstate transition: 'running' -> 'postmigrate' >> >> >> Aha, so, in test we wait for MIGRATION COMPLETED [1] and run qmp_cont [2] >> and go to RUNNING. >> then in migration thread we go to migration_iteration_finish() and fail to >> go to POSTMIGRATE. >> (note, that this testcase is about resuming source after migration) >> >> So, fix for test may be additionally wait for POSTMIGRATE, not only for >> MIGRATION COMPLETION. >> >> But how to fix Qemu not to crash? May be, forbid some transitions >> (FINISH_MIGRATE -> RUNNING), >> or at least error-out qmp_cont if runstate is FINISH_MIGRATE? > > qmp_cont currently checks for RUN_STATE_INMIGRATE and gives it special > treatment. As a quick hack, doing the same for RUN_STATE_FINISH_MIGRATE > might fix the problem.
So, I think it's a good thing to do right now and I will do. Hmm, done. Interesting, after applying --- a/qmp.c +++ b/qmp.c @@ -156,6 +156,9 @@ void qmp_cont(Error **errp) return; } else if (runstate_check(RUN_STATE_SUSPENDED)) { return; + } else if (runstate_check(RUN_STATE_FINISH_MIGRATE)) { + error_setg(errp, "Migration is not finalized yet"); + return; } for (blk = blk_next(NULL); blk; blk = blk_next(blk)) { I got new fail: @@ -1,5 +1,22 @@ -.................... +WARNING:qemu:qemu received signal 6: /work/src/qemu/reitz-migration-bug/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64 -chardev socket,id=mon,path=/ramdisk/t3/tmplN68au/qemua-917541-monitor.sock -mon chardev=mon,mode=control -display none -vga none -qtest unix:path=/ramdisk/t3/qemua-917541-qtest.sock -machine accel=qtest -nodefaults -machine accel=qtest -drive if=virtio,id=drive0,file=/ramdisk/t3/disk_a,format=qcow2,cache=writeback +E................... +====================================================================== +ERROR: test_do_test_migration_not_persistent__migbitmap__offline_nonshared (__main__.TestDirtyBitmapMigration) +---------------------------------------------------------------------- +Traceback (most recent call last): + File "169", line 207, in <lambda> + setattr(klass, 'test_' + method + name, lambda self: mc(self)) + File "169", line 167, in do_test_migration + event = self.vm_a.event_wait('MIGRATION') + File "/work/src/qemu/reitz-migration-bug/tests/qemu-iotests/../../scripts/qemu.py", line 436, in eventwait + event = self._qmp.pull_event(wait=timeout) + File "/work/src/qemu/reitz-migration-bug/tests/qemu-iotests/../../scripts/qmp/qmp.py", line 216, in pul_event + self.__get_events(wait) + File "/work/src/qemu/reitz-migration-bug/tests/qemu-iotests/../../scripts/qmp/qmp.py", line 124, in __et_events + raise QMPTimeoutError("Timeout waiting for event") +QMPTimeoutError: Timeout waiting for event #0 0x00007f95f4cde207 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:56 #1 0x00007f95f4cdf8f8 in __GI_abort () at abort.c:90 #2 0x00007f95f4cd7026 in __assert_fail_base (fmt=0x7f95f4e32520 "%s%s%s:%u: %s%sAssertion `%s' failed.\n%n", assertion=assertion@entry=0x55b63154d370 "((&all_bdrv_states)->tqh_first == ((void *)0))", file=file@entry=0x55b63154c2b0 "block.c", line=line@entry=3534, function=function@entry=0x55b63154e03d <__PRETTY_FUNCTION__.30345> "bdrv_close_all") at assert.c:92 #3 0x00007f95f4cd70d2 in __GI___assert_fail (assertion=0x55b63154d370 "((&all_bdrv_states)->tqh_first == ((void *)0))", file=0x55b63154c2b0 "block.c", line=3534, function=0x55b63154e03d <__PRETTY_FUNCTION__.30345> "bdrv_close_all") at assert.c:101 #4 0x000055b6311e082e in bdrv_close_all () at block.c:3534 #5 0x000055b630f3ae5c in main (argc=18, argv=0x7ffda6f14428, envp=0x7ffda6f144c0) at vl.c:4670 will dig it a bit.. -- Best regards, Vladimir