Re: [Qemu-block] [PATCH v19 08/10] Implement new driver for block replication

2016-05-30 Thread Stefan Hajnoczi
On Fri, May 20, 2016 at 03:36:18PM +0800, Changlong Xie wrote:
> +/* start backup job now */
> +error_setg(&s->blocker,
> +   "block device is in use by internal backup job");
> +
> +top_bs = bdrv_lookup_bs(s->top_id, s->top_id, errp);
> +if (!top_bs || !check_top_bs(top_bs, bs)) {
> +reopen_backing_file(s, false, NULL);
> +aio_context_release(aio_context);
> +return;
> +}

Missing error_setg() with an error message when check_top_bs() fails.


signature.asc
Description: PGP signature


Re: [Qemu-block] [PATCH v19 08/10] Implement new driver for block replication

2016-05-30 Thread Changlong Xie

On 05/31/2016 02:14 AM, Stefan Hajnoczi wrote:

On Fri, May 20, 2016 at 03:36:18PM +0800, Changlong Xie wrote:

+/* start backup job now */
+error_setg(&s->blocker,
+   "block device is in use by internal backup job");
+
+top_bs = bdrv_lookup_bs(s->top_id, s->top_id, errp);
+if (!top_bs || !check_top_bs(top_bs, bs)) {
+reopen_backing_file(s, false, NULL);
+aio_context_release(aio_context);
+return;
+}


Missing error_setg() with an error message when check_top_bs() fails.



Will add.

Thanks
-Xie





Re: [Qemu-block] [PATCH v19 08/10] Implement new driver for block replication

2016-06-06 Thread Changlong Xie

On 05/20/2016 03:36 PM, Changlong Xie wrote:

+
+/*
+ * Must protect backup target if backup job was stopped/cancelled
+ * unexpectedly
+ */
+bdrv_ref(s->hidden_disk->bs);
+
+backup_start(s->secondary_disk->bs, s->hidden_disk->bs, 0,
+ MIRROR_SYNC_MODE_NONE, NULL, BLOCKDEV_ON_ERROR_REPORT,
+ BLOCKDEV_ON_ERROR_REPORT, backup_job_completed,
+ s, NULL, &local_err);
+if (local_err) {
+error_propagate(errp, local_err);
+backup_job_cleanup(s);
+bdrv_unref(s->hidden_disk->bs);
+aio_context_release(aio_context);
+return;
+}
+break;
+default:
+aio_context_release(aio_context);
+abort();
+}


commit 5c438bc6 introduce BB for I/O, so we don't need protect backup 
target by ourself now.


-/*
- * Must protect backup target if backup job was stopped/cancelled
- * unexpectedly
- */
-bdrv_ref(s->hidden_disk->bs);
-
 backup_start(s->secondary_disk->bs, s->hidden_disk->bs, 0,
  MIRROR_SYNC_MODE_NONE, NULL, 
BLOCKDEV_ON_ERROR_REPORT,

  BLOCKDEV_ON_ERROR_REPORT, backup_job_completed,
@@ -508,7 +502,6 @@ static void replication_start(ReplicationState *rs, 
ReplicationMode mode,

 if (local_err) {
 error_propagate(errp, local_err);
 backup_job_cleanup(s);
-bdrv_unref(s->hidden_disk->bs);
 aio_context_release(aio_context);
 return;
 }

will update in next version.





Re: [Qemu-block] [PATCH v19 08/10] Implement new driver for block replication

2016-06-06 Thread Changlong Xie

On 05/20/2016 03:36 PM, Changlong Xie wrote:

+if (!failover) {
+/*
+ * This BDS will be closed, and the job should be completed
+ * before the BDS is closed, because we will access hidden
+ * disk, secondary disk in backup_job_completed().
+ */
+if (s->secondary_disk->bs->job) {
+block_job_cancel_sync(s->secondary_disk->bs->job);
+}
+secondary_do_checkpoint(s, errp);
+s->replication_state = BLOCK_REPLICATION_DONE;
+aio_context_release(aio_context);
+return;
+}
+
+s->replication_state = BLOCK_REPLICATION_FAILOVER;
+if (s->secondary_disk->bs->job) {
+block_job_cancel(s->secondary_disk->bs->job);
+}


Since commit b6d2e599 "block: Convert block job core to BlockBackend", 
blockjob uses BB instead of bdrv_ref(), this introduces unexpected 
Segmentation fault with COLO.


In the below backtrace, you can see that. During failover, s->target was 
changed to an illegal value "0x1e1e1e1e1e1e1e1e" in bakup_complete.
Then the active commit job what also has a pointer that refer to 
s->target will use this illegal pointer. To avoid this, we should use 
"bloc_job_cancel_sync" to ensure backup job complete synchronously.


% MALLOC_PERTURB_=$(($RANDOM % 255 + 1))
% export MALLOC_PERTURB_
% gdb --args ./tests/test-replication
(gdb) b backup_complete
(gdb) r
(gdb) n
(gdb) n
(gdb) watch s->target
(gdb) c
Old value = (BlockBackend *) 0x55f1d990
New value = (BlockBackend *) 0x1e1e1e1e1e1e1e1e
0x75a811eb in memset () from /lib64/libc.so.6
(gdb) bt
#0  0x75a811eb in memset () from /lib64/libc.so.6
#1  0x75a7500e in _int_free () from /lib64/libc.so.6
#2  0x7705bf7f in g_free () from /lib64/libglib-2.0.so.0
#3  0x5557e924 in block_job_unref (job=0x55f1d630) at 
blockjob.c:124
#4  0x5557e9da in block_job_completed_single 
(job=0x55f1d630) at blockjob.c:143
#5  0x5557ecc8 in block_job_completed (job=0x55f1d630, 
ret=0) at blockjob.c:215
#6  0x555e6d49 in backup_complete (job=0x55f1d630, 
opaque=0x55f1dd50) at block/backup.c:325
#7  0x5557f5f4 in block_job_defer_to_main_loop_bh 
(opaque=0x596e1dc0) at blockjob.c:500

#8  0x555747d7 in aio_bh_call (bh=0x596e1c30) at async.c:66
#9  0x55574899 in aio_bh_poll (ctx=0x55ce2d60) at async.c:94
#10 0x55581d4d in aio_dispatch (ctx=0x55ce2d60) at 
aio-posix.c:308
#11 0x555823cb in aio_poll (ctx=0x55ce2d60, blocking=false) 
at aio-posix.c:479
#12 0x555d639b in bdrv_drain_poll (bs=0x55dec210) at 
block/io.c:190
#13 0x555d6566 in bdrv_drained_begin (bs=0x55dec210) at 
block/io.c:240
#14 0x55577261 in bdrv_child_cb_drained_begin 
(child=0x596e1ab0) at block.c:665
#15 0x555d5e81 in bdrv_parent_drained_begin (bs=0x55f0e130) 
at block/io.c:54
#16 0x555d652b in bdrv_drained_begin (bs=0x55f0e130) at 
block/io.c:232
#17 0x55577261 in bdrv_child_cb_drained_begin 
(child=0x596e1810) at block.c:665
#18 0x555d5e81 in bdrv_parent_drained_begin (bs=0x596d7030) 
at block/io.c:54
#19 0x555d652b in bdrv_drained_begin (bs=0x596d7030) at 
block/io.c:232
#20 0x55577261 in bdrv_child_cb_drained_begin 
(child=0x55df8830) at block.c:665
#21 0x555d5e81 in bdrv_parent_drained_begin (bs=0x55df0bb0) 
at block/io.c:54

#22 0x555d66ee in bdrv_drain_all () at block/io.c:301
#23 0x55579f9f in bdrv_reopen_multiple (bs_queue=0x55f1dd30, 
errp=0x7fffd768) at block.c:1953
#24 0x5557a169 in bdrv_reopen (bs=0x55df0bb0, 
bdrv_flags=24578, errp=0x7fffd8e0) at block.c:1994
#25 0x555d54a7 in commit_active_start (bs=0x55f0e130, 
base=0x55df0bb0, speed=0, on_error=BLOCKDEV_ON_ERROR_REPORT, 
cb=0x555e8b97 ,
opaque=0x55dec210, errp=0x7fffd8e0, auto_complete=true) at 
block/mirror.c:901
#26 0x555e8d98 in replication_stop (rs=0x5746f7b0, 
failover=true, errp=0x7fffd8e0) at block/replication.c:623
#27 0x555873d1 in replication_stop_all (failover=true, 
errp=0x7fffd928) at replication.c:98
#28 0x5557406d in test_secondary_start () at 
tests/test-replication.c:403
#29 0x7707a5e1 in g_test_run_suite_internal () from 
/lib64/libglib-2.0.so.0
#30 0x7707a7a6 in g_test_run_suite_internal () from 
/lib64/libglib-2.0.so.0
#31 0x7707a7a6 in g_test_run_suite_internal () from 
/lib64/libglib-2.0.so.0

#32 0x7707ab1b in g_test_run_suite () from /lib64/libglib-2.0.so.0
#33 0x555746c8 in main (argc=1, argv=0x7fffdda8) at 
tests/test-replication.c:545

(gdb) d breakpoints
(gdb) c
Program received signal SIGSEGV, Segmentation fault.
0x555c7d6c in blk_bs (blk=0x1e1e1e1e1e1e1e1e) at 
block/block-backend.c:389

389 return blk->root ? b