24.01.2019 13:10, Dr. David Alan Gilbert wrote: > * Vladimir Sementsov-Ogievskiy (vsement...@virtuozzo.com) wrote: >> 24.01.2019 12:29, Vladimir Sementsov-Ogievskiy wrote: >>> 23.01.2019 18:48, Max Reitz wrote: >>>> Hi, >>>> >>>> When running 169 in parallel (e.g. like so: >>>> >>>> $ while TEST_DIR=/tmp/t0 ./check -T -qcow2 169; do; done >>>> $ while TEST_DIR=/tmp/t1 ./check -T -qcow2 169; do; done >>>> $ while TEST_DIR=/tmp/t2 ./check -T -qcow2 169; do; done >>>> $ while TEST_DIR=/tmp/t3 ./check -T -qcow2 169; do; done >>>> >>>> in four different shells), I get aborts: >>>> >>>> (Often I get segfaults, but that's because of >>>> http://lists.nongnu.org/archive/html/qemu-devel/2018-12/msg05579.html -- >>>> feel free to apply the attached patch to make them go away) >>>> >>>> >>>> WARNING:qemu:qemu received signal 6: >>>> build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64 >>>> -chardev socket,id=mon,path=/tmp/t0/tmpbX30XU/qemua-25745-monitor.sock >>>> -mon chardev=mon,mode=control -display none -vga none -qtest >>>> unix:path=/tmp/t0/qemua-25745-qtest.sock -machine accel=qtest >>>> -nodefaults -machine accel=qtest -drive >>>> if=virtio,id=drive0,file=/tmp/t0/disk_a,format=qcow2,cache=writeback >>>> .................E.. >>>> ====================================================================== >>>> ERROR: >>>> test_do_test_migration_resume_source_not_persistent__not_migbitmap >>>> (__main__.TestDirtyBitmapMigration) >>>> ---------------------------------------------------------------------- >>>> Traceback (most recent call last): >>>> File "169", line 206, in <lambda> >>>> setattr(klass, 'test_' + method + name, lambda self: mc(self)) >>>> File "169", line 113, in do_test_migration_resume_source >>>> self.check_bitmap(self.vm_a, sha256) >>>> File "169", line 72, in check_bitmap >>>> node='drive0', name='bitmap0') >>>> File "tests/qemu-iotests/../../scripts/qemu.py", line 369, in qmp >>>> return self._qmp.cmd(cmd, args=qmp_args) >>>> File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 191, in cmd >>>> return self.cmd_obj(qmp_cmd) >>>> File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 174, in >>>> cmd_obj >>>> resp = self.__json_read() >>>> File "tests/qemu-iotests/../../scripts/qmp/qmp.py", line 82, in >>>> __json_read >>>> data = self.__sockfile.readline() >>>> File "/usr/lib64/python2.7/socket.py", line 451, in readline >>>> data = self._sock.recv(self._rbufsize) >>>> error: [Errno 104] Connection reset by peer >>>> >>>> ---------------------------------------------------------------------- >>>> Ran 20 tests >>>> >>>> FAILED (errors=1) >>>> >>>> >>>> Or: >>>> >>>> WARNING:qemu:qemu received signal 6: >>>> build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64 >>>> -chardev socket,id=mon,path=/tmp/t3/tmp0pllWD/qemua-3445-monitor.sock >>>> -mon chardev=mon,mode=control -display none -vga none -qtest >>>> unix:path=/tmp/t3/qemua-3445-qtest.sock -machine accel=qtest -nodefaults >>>> -machine accel=qtest -drive >>>> if=virtio,id=drive0,file=/tmp/t3/disk_a,format=qcow2,cache=writeback >>>> WARNING:qemu:qemu received signal 6: >>>> build/tests/qemu-iotests/../../x86_64-softmmu/qemu-system-x86_64 >>>> -chardev socket,id=mon,path=/tmp/t3/tmp0pllWD/qemua-3445-monitor.sock >>>> -mon chardev=mon,mode=control -display none -vga none -qtest >>>> unix:path=/tmp/t3/qemua-3445-qtest.sock -machine accel=qtest -nodefaults >>>> -machine accel=qtest -drive >>>> if=virtio,id=drive0,file=/tmp/t3/disk_a,format=qcow2,cache=writeback >>>> >>>> ...................F >>>> ====================================================================== >>>> FAIL: test_do_test_migration_resume_source_persistent__not_migbitmap >>>> (__main__.TestDirtyBitmapMigration) >>>> ---------------------------------------------------------------------- >>>> Traceback (most recent call last): >>>> File "169", line 206, in <lambda> >>>> setattr(klass, 'test_' + method + name, lambda self: mc(self)) >>>> File "169", line 125, in do_test_migration_resume_source >>>> self.assertEqual(log, '') >>>> AssertionError: "qemu-system-x86_64: invalid runstate transition: >>>> 'running' -> 'postmigrate'\n" != '' >>>> >>>> ---------------------------------------------------------------------- >>>> Ran 20 tests >>>> >>>> FAILED (failures=1) >>>> >>>> >>>> The backtrace always goes like this: >>>> >>>> (gdb) bt >>>> #0 0x00007f0acf5cc53f in raise () at /lib64/libc.so.6 >>>> #1 0x00007f0acf5b6895 in abort () at /lib64/libc.so.6 >>>> #2 0x000055a46ebbb1a6 in runstate_set (new_state=RUN_STATE_POSTMIGRATE) >>>> at vl.c:742 >>>> #3 0x000055a46ebbb1a6 in runstate_set >>>> (new_state=new_state@entry=RUN_STATE_POSTMIGRATE) at vl.c:730 >>>> #4 0x000055a46ed39129 in migration_iteration_finish (s=0x55a4708be000) >>>> at migration/migration.c:2972 >>>> #5 0x000055a46ed39129 in migration_thread >>>> (opaque=opaque@entry=0x55a4708be000) at migration/migration.c:3130 >>>> #6 0x000055a46eea665a in qemu_thread_start (args=<optimized out>) at >>>> util/qemu-thread-posix.c:502 >>>> >>>> >>>> #7 0x00007f0acf76258e in start_thread () at /lib64/libpthread.so.0 >>>> #8 0x00007f0acf6916a3 in clone () at /lib64/libc.so.6 >>>> (gdb) frame 2 >>>> #2 0x000055a46ebbb1a6 in runstate_set (new_state=RUN_STATE_POSTMIGRATE) >>>> at vl.c:742 >>>> 742 abort(); >>>> (gdb) print current_run_state >>>> $1 = RUN_STATE_RUNNING >>>> >>>> >>>> Neither of migration or runstates are my strong suite, so I thought I'd >>>> report it before diving into it. >>>> >>>> Max >>>> >>> >>> [...] >>> 450556@1548321072.888979:migrate_set_state new state active >>> 450556@1548321072.889022:migration_thread_setup_complete >>> 450556@1548321072.988275:migrate_transferred transferred 985298 time_spent >>> 100 bandwidth 9852 max_size 2955894 >>> 450556@1548321072.988283:migration_bitmap_sync_start >>> 450556@1548321072.988293:migration_bitmap_sync_end dirty_pages 32898 >>> 450556@1548321072.988495:migration_thread_low_pending 2048 >>> migration_completion >>> 450556@1548321072.988541:runstate_set current_state 9 new_state 7 >>> 450556@1548321072.988780:migration_bitmap_sync_start >>> 450556@1548321072.988790:migration_bitmap_sync_end dirty_pages 32898 >>> 450556@1548321072.993112:migrate_global_state_pre_save saved state: running >>> [1] 450556@1548321072.993237:migrate_set_state new state completed >>> 450549@1548321072.993668:handle_qmp_command mon 0x55aebef99830 req: >>> {"execute": "x-debug-block-dirty-bitmap-sha256", "arguments": {"name": >>> "bitmap0", "node": "drive0"}} >>> 450556@1548321072.993697:migration_thread_after_loop >>> [2] 450549@1548321072.993917:handle_qmp_command mon 0x55aebef99830 req: >>> {"execute": "cont"} >>> 450476@1548321072.994461:runstate_set current_state 7 new_state 9 >>> qemu-system-x86_64: invalid runstate transition: 'running' -> 'postmigrate' >>> >>> >>> Aha, so, in test we wait for MIGRATION COMPLETED [1] and run qmp_cont [2] >>> and go to RUNNING. >>> then in migration thread we go to migration_iteration_finish() and fail to >>> go to POSTMIGRATE. >>> (note, that this testcase is about resuming source after migration) >>> >>> So, fix for test may be additionally wait for POSTMIGRATE, not only for >>> MIGRATION COMPLETION. >>> >>> But how to fix Qemu not to crash? May be, forbid some transitions >>> (FINISH_MIGRATE -> RUNNING), >>> or at least error-out qmp_cont if runstate is FINISH_MIGRATE? >>> >>> >>> >> >> >> Ok, this seems to fix test for me: >> >> diff --git a/tests/qemu-iotests/169 b/tests/qemu-iotests/169 >> index 527aebd0cb..dd98168c8d 100755 >> --- a/tests/qemu-iotests/169 >> +++ b/tests/qemu-iotests/169 >> @@ -102,6 +102,10 @@ class TestDirtyBitmapMigration(iotests.QMPTestCase): >> event = self.vm_a.event_wait('MIGRATION') >> if event['data']['status'] == 'completed': >> break >> + while True: >> + result = self.vm_a.qmp('query-status') >> + if (result['return']['status'] == 'postmigrate'): >> + break > > Hmm; I can see that it might help; although I still can't quite see > how the bad transition that was reported can have happened. >
It can be assumed from my trace above: 1. migration thread do migratioN_completion and goes to RUN_STATE_FINISH_MIGRATE through vm_stop_force_state(RUN_STATE_FINISH_MIGRATE) 2. migration_completion do migrate_set_state(&s->state, current_active_state, MIGRATION_STATUS_COMPLETED); and MIGRATION COMPLETED event is reported, so 3. 169 iotest gets event, and think that all is done about migration. And it call qmp_cont 4. qmp_cont do transition from RUN_STATE_FINISH_MIGRATE to RUN_STATE_RUNNING 5. migration thread goes to migration_iteration_finish and do invalid transition > >> # test that bitmap is still here >> removed = (not migrate_bitmaps) and persistent >> >> - >> >> >> And this is the full diff of the test I used, to skip checking log (as I >> enabled traces) and to skip all other test-cases: >> >> (hmm, we definitely need simple way to run one test case from test, without >> editing it) >> >> diff --git a/tests/qemu-iotests/169 b/tests/qemu-iotests/169 >> index 527aebd0cb..dd98168c8d 100755 >> --- a/tests/qemu-iotests/169 >> +++ b/tests/qemu-iotests/169 >> @@ -102,6 +102,10 @@ class TestDirtyBitmapMigration(iotests.QMPTestCase): >> event = self.vm_a.event_wait('MIGRATION') >> if event['data']['status'] == 'completed': >> break >> + while True: >> + result = self.vm_a.qmp('query-status') >> + if (result['return']['status'] == 'postmigrate'): >> + break >> >> # test that bitmap is still here >> removed = (not migrate_bitmaps) and persistent >> @@ -117,12 +121,12 @@ class TestDirtyBitmapMigration(iotests.QMPTestCase): >> >> # catch 'Could not reopen qcow2 layer: Bitmap already exists' >> # possible error >> - log = self.vm_a.get_log() >> - log = re.sub(r'^\[I \d+\.\d+\] OPENED\n', '', log) >> - log = re.sub(r'^(wrote .* bytes at offset >> .*\n.*KiB.*ops.*sec.*\n){3}', >> - '', log) >> - log = re.sub(r'\[I \+\d+\.\d+\] CLOSED\n?$', '', log) >> - self.assertEqual(log, '') >> + #log = self.vm_a.get_log() >> + #log = re.sub(r'^\[I \d+\.\d+\] OPENED\n', '', log) >> + #log = re.sub(r'^(wrote .* bytes at offset >> .*\n.*KiB.*ops.*sec.*\n){3}', >> + #'', log) >> + #log = re.sub(r'\[I \+\d+\.\d+\] CLOSED\n?$', '', log) >> + #self.assertEqual(log, '') >> >> # test that bitmap is still persistent >> self.vm_a.launch() >> @@ -211,14 +215,16 @@ for cmb in list(itertools.product((True, False), >> repeat=4)): >> name += '_online' if cmb[2] else '_offline' >> name += '_shared' if cmb[3] else '_nonshared' >> >> - inject_test_case(TestDirtyBitmapMigration, name, 'do_test_migration', >> + if False: >> + inject_test_case(TestDirtyBitmapMigration, name, >> 'do_test_migration', >> *list(cmb)) >> >> for cmb in list(itertools.product((True, False), repeat=2)): >> name = ('_' if cmb[0] else '_not_') + 'persistent_' >> name += ('_' if cmb[1] else '_not_') + 'migbitmap' >> >> - inject_test_case(TestDirtyBitmapMigration, name, >> + if name == '_persistent__migbitmap': >> + inject_test_case(TestDirtyBitmapMigration, name, >> 'do_test_migration_resume_source', *list(cmb)) >> >> if __name__ == '__main__': >> diff --git a/tests/qemu-iotests/169.out b/tests/qemu-iotests/169.out >> index 3a89159833..ae1213e6f8 100644 >> --- a/tests/qemu-iotests/169.out >> +++ b/tests/qemu-iotests/169.out >> @@ -1,5 +1,5 @@ >> -.................... >> +. >> ---------------------------------------------------------------------- >> -Ran 20 tests >> +Ran 1 tests >> >> OK >> >> >> >> -- >> Best regards, >> Vladimir > -- > Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK > -- Best regards, Vladimir