On 05/20/2016 03:36 PM, Changlong Xie wrote:
+if (!failover) {
+/*
+ * This BDS will be closed, and the job should be completed
+ * before the BDS is closed, because we will access hidden
+ * disk, secondary disk in backup_job_completed().
+ */
+if (s->secondary_disk->bs->job) {
+block_job_cancel_sync(s->secondary_disk->bs->job);
+}
+secondary_do_checkpoint(s, errp);
+s->replication_state = BLOCK_REPLICATION_DONE;
+aio_context_release(aio_context);
+return;
+}
+
+s->replication_state = BLOCK_REPLICATION_FAILOVER;
+if (s->secondary_disk->bs->job) {
+block_job_cancel(s->secondary_disk->bs->job);
+}
Since commit b6d2e599 "block: Convert block job core to BlockBackend",
blockjob uses BB instead of bdrv_ref(), this introduces unexpected
Segmentation fault with COLO.
In the below backtrace, you can see that. During failover, s->target was
changed to an illegal value "0x1e1e1e1e1e1e1e1e" in bakup_complete.
Then the active commit job what also has a pointer that refer to
s->target will use this illegal pointer. To avoid this, we should use
"bloc_job_cancel_sync" to ensure backup job complete synchronously.
% MALLOC_PERTURB_=$(($RANDOM % 255 + 1))
% export MALLOC_PERTURB_
% gdb --args ./tests/test-replication
(gdb) b backup_complete
(gdb) r
(gdb) n
(gdb) n
(gdb) watch s->target
(gdb) c
Old value = (BlockBackend *) 0x55f1d990
New value = (BlockBackend *) 0x1e1e1e1e1e1e1e1e
0x75a811eb in memset () from /lib64/libc.so.6
(gdb) bt
#0 0x75a811eb in memset () from /lib64/libc.so.6
#1 0x75a7500e in _int_free () from /lib64/libc.so.6
#2 0x7705bf7f in g_free () from /lib64/libglib-2.0.so.0
#3 0x5557e924 in block_job_unref (job=0x55f1d630) at
blockjob.c:124
#4 0x5557e9da in block_job_completed_single
(job=0x55f1d630) at blockjob.c:143
#5 0x5557ecc8 in block_job_completed (job=0x55f1d630,
ret=0) at blockjob.c:215
#6 0x555e6d49 in backup_complete (job=0x55f1d630,
opaque=0x55f1dd50) at block/backup.c:325
#7 0x5557f5f4 in block_job_defer_to_main_loop_bh
(opaque=0x596e1dc0) at blockjob.c:500
#8 0x555747d7 in aio_bh_call (bh=0x596e1c30) at async.c:66
#9 0x55574899 in aio_bh_poll (ctx=0x55ce2d60) at async.c:94
#10 0x55581d4d in aio_dispatch (ctx=0x55ce2d60) at
aio-posix.c:308
#11 0x555823cb in aio_poll (ctx=0x55ce2d60, blocking=false)
at aio-posix.c:479
#12 0x555d639b in bdrv_drain_poll (bs=0x55dec210) at
block/io.c:190
#13 0x555d6566 in bdrv_drained_begin (bs=0x55dec210) at
block/io.c:240
#14 0x55577261 in bdrv_child_cb_drained_begin
(child=0x596e1ab0) at block.c:665
#15 0x555d5e81 in bdrv_parent_drained_begin (bs=0x55f0e130)
at block/io.c:54
#16 0x555d652b in bdrv_drained_begin (bs=0x55f0e130) at
block/io.c:232
#17 0x55577261 in bdrv_child_cb_drained_begin
(child=0x596e1810) at block.c:665
#18 0x555d5e81 in bdrv_parent_drained_begin (bs=0x596d7030)
at block/io.c:54
#19 0x555d652b in bdrv_drained_begin (bs=0x596d7030) at
block/io.c:232
#20 0x55577261 in bdrv_child_cb_drained_begin
(child=0x55df8830) at block.c:665
#21 0x555d5e81 in bdrv_parent_drained_begin (bs=0x55df0bb0)
at block/io.c:54
#22 0x555d66ee in bdrv_drain_all () at block/io.c:301
#23 0x55579f9f in bdrv_reopen_multiple (bs_queue=0x55f1dd30,
errp=0x7fffd768) at block.c:1953
#24 0x5557a169 in bdrv_reopen (bs=0x55df0bb0,
bdrv_flags=24578, errp=0x7fffd8e0) at block.c:1994
#25 0x555d54a7 in commit_active_start (bs=0x55f0e130,
base=0x55df0bb0, speed=0, on_error=BLOCKDEV_ON_ERROR_REPORT,
cb=0x555e8b97 ,
opaque=0x55dec210, errp=0x7fffd8e0, auto_complete=true) at
block/mirror.c:901
#26 0x555e8d98 in replication_stop (rs=0x5746f7b0,
failover=true, errp=0x7fffd8e0) at block/replication.c:623
#27 0x555873d1 in replication_stop_all (failover=true,
errp=0x7fffd928) at replication.c:98
#28 0x5557406d in test_secondary_start () at
tests/test-replication.c:403
#29 0x7707a5e1 in g_test_run_suite_internal () from
/lib64/libglib-2.0.so.0
#30 0x7707a7a6 in g_test_run_suite_internal () from
/lib64/libglib-2.0.so.0
#31 0x7707a7a6 in g_test_run_suite_internal () from
/lib64/libglib-2.0.so.0
#32 0x7707ab1b in g_test_run_suite () from /lib64/libglib-2.0.so.0
#33 0x555746c8 in main (argc=1, argv=0x7fffdda8) at
tests/test-replication.c:545
(gdb) d breakpoints
(gdb) c
Program received signal SIGSEGV, Segmentation fault.
0x555c7d6c in blk_bs (blk=0x1e1e1e1e1e1e1e1e) at
block/block-backend.c:389
389 return blk->root ? b