Re: [Qemu-block] [Qemu-devel] [PATCH 1/2] scsi: add unrealize method for SCSI devices

2018-02-07 Thread Fam Zheng
On Wed, 02/07 17:36, Paolo Bonzini wrote:
> The next patch will introduce a different unrealize implementation
> for scsi-block.  Compared to the code before commit fb7b5c0df6
> ("scsi: devirtualize unrealize of SCSI devices", 2014-10-31), the
> common code for all SCSI devices is kept in scsi-bus.c.
> 
> Signed-off-by: Paolo Bonzini 

Reviewed-by: Fam Zheng 



Re: [Qemu-block] [Qemu-devel] [PATCH 2/2] scsi: add block job opblockers for scsi-block

2018-02-07 Thread Fam Zheng
On Wed, 02/07 17:36, Paolo Bonzini wrote:
> scsi-block bypasses the dirty bitmaps and pre-write notifiers, so it
> cannot be the source of a block job.  The gist of the fix is to add
> op-blockers to the BlockBackend, and remove them at "unrealize" time,
> but things are a little more complex because quit closes the BlockBackend
> without going through unrealize.
> 
> So use Notifiers: the remove_bs notifier is called by bdrv_close_all, and
> the insert_bs notifier might not be really necessary but makes things a
> little more symmetric.
> 
> Suggested-by: Karen Noel 

:)

> Signed-off-by: Paolo Bonzini 

Reviewed-by: Fam Zheng 

Though I have one comment below.

> ---
>  block/block-backend.c  |  9 ++
>  hw/scsi/scsi-disk.c| 62 ++
>  include/sysemu/block-backend.h |  1 +
>  3 files changed, 72 insertions(+)
> 
> diff --git a/block/block-backend.c b/block/block-backend.c
> index baef8e7abc..1759639a4a 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -1747,6 +1747,15 @@ bool blk_op_is_blocked(BlockBackend *blk, BlockOpType op, Error **errp)
>  return bdrv_op_is_blocked(bs, op, errp);
>  }
>  
> +void blk_op_block(BlockBackend *blk, BlockOpType op, Error *reason)
> +{
> +BlockDriverState *bs = blk_bs(blk);
> +
> +if (bs) {
> +bdrv_op_block(bs, op, reason);
> +}
> +}
> +
>  void blk_op_unblock(BlockBackend *blk, BlockOpType op, Error *reason)
>  {
>  BlockDriverState *bs = blk_bs(blk);
> diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
> index 49d2559d93..023673cb04 100644
> --- a/hw/scsi/scsi-disk.c
> +++ b/hw/scsi/scsi-disk.c
> @@ -2578,9 +2578,39 @@ static int get_device_type(SCSIDiskState *s)
>  return 0;
>  }
>  
> +typedef struct SCSIBlockState {
> +SCSIDiskState sd;
> +Error *mirror_source;
> +Error *backup_source;
> +Error *commit_source;
> +Notifier insert_bs;
> +Notifier remove_bs;
> +} SCSIBlockState;
> +
> +static void scsi_block_insert_bs(Notifier *n, void *opaque)
> +{
> +SCSIBlockState *sb = container_of(n, SCSIBlockState, insert_bs);
> +SCSIDiskState *s = &sb->sd;
> +
> +blk_op_block(s->qdev.conf.blk, BLOCK_OP_TYPE_MIRROR_SOURCE, sb->mirror_source);
> +blk_op_block(s->qdev.conf.blk, BLOCK_OP_TYPE_COMMIT_SOURCE, sb->commit_source);
> +blk_op_block(s->qdev.conf.blk, BLOCK_OP_TYPE_BACKUP_SOURCE, sb->backup_source);
> +}
> +
> +static void scsi_block_remove_bs(Notifier *n, void *opaque)
> +{
> +SCSIBlockState *sb = container_of(n, SCSIBlockState, remove_bs);
> +SCSIDiskState *s = &sb->sd;
> +
> +blk_op_unblock(s->qdev.conf.blk, BLOCK_OP_TYPE_MIRROR_SOURCE, sb->mirror_source);
> +blk_op_unblock(s->qdev.conf.blk, BLOCK_OP_TYPE_COMMIT_SOURCE, sb->commit_source);
> +blk_op_unblock(s->qdev.conf.blk, BLOCK_OP_TYPE_BACKUP_SOURCE, sb->backup_source);
> +}
> +
>  static void scsi_block_realize(SCSIDevice *dev, Error **errp)
>  {
>  SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, dev);
> +SCSIBlockState *sb = DO_UPCAST(SCSIBlockState, sd, s);
>  int sg_version;
>  int rc;
>  
> @@ -2626,6 +2656,36 @@ static void scsi_block_realize(SCSIDevice *dev, Error **errp)
>  
>  scsi_realize(&s->qdev, errp);
>  scsi_generic_read_device_identification(&s->qdev);
> +
> +/* For op blockers, due to lack of support for dirty bitmaps.  */
> +error_setg(&sb->mirror_source,
> +   "scsi-block does not support acting as a mirroring source");
> +error_setg(&sb->commit_source,
> +   "scsi-block does not support acting as an active commit 
> source");

An alternative would be to add a BLOCK_OP_TYPE_DIRTY_BITMAP op type. The error
message would not be as nice, but it could be reused by another (block job)
operation that requires dirty bitmap support, or by another device that doesn't
support dirty bitmaps. Though there isn't one for now.
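
For illustration, a rough sketch of that alternative, assuming a hypothetical
BLOCK_OP_TYPE_DIRTY_BITMAP value were added to BlockOpType (it does not exist
in the tree, and the sb->no_dirty_bitmap field below is likewise made up):

    /* Hypothetical: a single op type for "requires dirty bitmap support". */
    error_setg(&sb->no_dirty_bitmap,
               "scsi-block bypasses the block layer and does not support dirty bitmaps");
    blk_op_block(s->qdev.conf.blk, BLOCK_OP_TYPE_DIRTY_BITMAP, sb->no_dirty_bitmap);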

> +
> +/* For op blockers, due to lack of support for write notifiers.  */
> +error_setg(&sb->backup_source,
> +   "scsi-block does not support acting as a backup source");
> +
> +sb->insert_bs.notify = scsi_block_insert_bs;
> +blk_add_insert_bs_notifier(s->qdev.conf.blk, &sb->insert_bs);
> +sb->remove_bs.notify = scsi_block_remove_bs;
> +blk_add_remove_bs_notifier(s->qdev.conf.blk, &sb->remove_bs);
> +
> +scsi_block_insert_bs(&sb->insert_bs, s->qdev.conf.blk);
> +}
> +
> +static void scsi_block_unrealize(SCSIDevice *dev, Error **errp)
> +{
> +SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, dev);
> +SCSIBlockState *sb = DO_UPCAST(SCSIBlockState, sd, s);
> +
> +notifier_remove(&sb->insert_bs);
> +notifier_remove(&sb->remove_bs);
> +scsi_block_remove_bs(&sb->insert_bs, s->qdev.conf.blk);
> +error_free(sb->mirror_source);
> +error_free(sb->commit_source);
> +error_free(sb->backup_source);
>  }
>  
>  typedef struct SCSIBlockReq {
> @@ -3017,6 +3077,7 @@ static void scs

Re: [Qemu-block] [Qemu-devel] [PATCH] block: early check for blockers on drive-mirror

2018-02-07 Thread Fam Zheng
On Wed, 02/07 17:29, Paolo Bonzini wrote:
> Even if an op blocker is present for BLOCK_OP_TYPE_MIRROR_SOURCE,
> it is checked a bit late and the result is that the target is
> created even if drive-mirror subsequently fails.  Add an early
> check to avoid this.
> 
> Signed-off-by: Paolo Bonzini 
> ---
>  blockdev.c | 5 +
>  1 file changed, 5 insertions(+)
> 
> diff --git a/blockdev.c b/blockdev.c
> index 8e977eef11..c7e2e0a00e 100644
> --- a/blockdev.c
> +++ b/blockdev.c
> @@ -3565,6 +3565,11 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp)
>  return;
>  }
>  
> +/* Early check to avoid creating target */
> +if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_MIRROR_SOURCE, errp)) {
> +return;
> +}
> +
>  aio_context = bdrv_get_aio_context(bs);
>  aio_context_acquire(aio_context);
>  
> -- 
> 2.14.3
> 
> 

Reviewed-by: Fam Zheng 



Re: [Qemu-block] [PATCH v2 11/13] block/mirror: remove block_job_sleep_ns calls

2018-02-07 Thread Max Reitz
On 2018-02-07 23:46, Max Reitz wrote:
> On 2018-01-19 21:58, John Snow wrote:
>> We're attempting to slacken the mirror loop in three different places,
>> but we can combine these three attempts. Combine the early loop call to
>> block_job_pause_point with the two late-loop calls to block_job_sleep_ns.
>>
>> When delay_ns is 0 and it has not been SLICE_TIME since the last yield,
>> block_job_relax is merely a call to block_job_pause_point, so this should
>> be equivalent with the exception that if we have managed to not yield at
>> all in the last SLICE_TIME ns, we will now do so.
>>
>> I am not sure that condition was possible,
>> so this loop should be equivalent.
> 
> Well, to me it even sounds like a positive change if it was a change.
> We want the job to yield after SLICE_TIME ns, after all, and I don't
> think it matters where that happens, exactly.
> 
>>
>> Signed-off-by: John Snow 
>> ---
>>  block/mirror.c | 22 +++---
>>  block/trace-events |  2 +-
>>  2 files changed, 12 insertions(+), 12 deletions(-)
>>
>> diff --git a/block/mirror.c b/block/mirror.c
>> index a0e0044de2..192e03694f 100644
>> --- a/block/mirror.c
>> +++ b/block/mirror.c
>> @@ -761,7 +761,7 @@ static void coroutine_fn mirror_run(void *opaque)
>>  assert(!s->dbi);
>>  s->dbi = bdrv_dirty_iter_new(s->dirty_bitmap);
>>  for (;;) {
>> -uint64_t delay_ns = 0;
>> +static uint64_t delay_ns = 0;
> 
> Errr.  Are you sure about that?
> 
> Now every mirror job in the qeny process will share this single
> variable.  Was that your intention?

("Errr" @myself for "qeny")



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [PATCH v2 13/13] blockjob: remove block_job_pause_point from interface

2018-02-07 Thread Max Reitz
On 2018-01-19 21:58, John Snow wrote:
> Remove the last call in block/mirror, using relax instead.
> relax may do nothing if we are canceled, so allow iteration to return
> prematurely and allow mirror_run to handle the cancellation logic.

Ah, now you write it with two l? ;-)

> 
> This is a functional change to mirror that should have the effect of
> cancelled mirror jobs being able to respond to that request a little

??!??!  Such inconsistency.  Many l.

> sooner instead of launching new requests.
> 
> Signed-off-by: John Snow 
> ---
>  block/mirror.c   |  4 +++-
>  blockjob.c   | 10 +-
>  include/block/blockjob_int.h |  9 -
>  3 files changed, 12 insertions(+), 11 deletions(-)
> 
> diff --git a/block/mirror.c b/block/mirror.c
> index 192e03694f..8e6b5b25a9 100644
> --- a/block/mirror.c
> +++ b/block/mirror.c
> @@ -345,7 +345,9 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
>  mirror_wait_for_io(s);
>  }
>  
> -block_job_pause_point(&s->common);
> +if (block_job_relax(&s->common, 0)) {
> +return 0;
> +}

:c

>  
>  /* Find the number of consecutive dirty chunks following the first dirty
>   * one, and wait for in flight requests in them. */
> diff --git a/blockjob.c b/blockjob.c
> index 40167d6896..27c13fdd08 100644
> --- a/blockjob.c
> +++ b/blockjob.c
> @@ -60,6 +60,7 @@ static void __attribute__((__constructor__)) block_job_init(void)
>  static void block_job_event_cancelled(BlockJob *job);
>  static void block_job_event_completed(BlockJob *job, const char *msg);
>  static void block_job_enter_cond(BlockJob *job, bool(*fn)(BlockJob *job));
> +static int coroutine_fn block_job_pause_point(BlockJob *job);
>  
>  /* Transactional group of block jobs */
>  struct BlockJobTxn {
> @@ -793,7 +794,14 @@ static void block_job_do_yield(BlockJob *job, uint64_t ns)
>  assert(job->busy);
>  }
>  
> -int coroutine_fn block_job_pause_point(BlockJob *job)
> +/**
> + * block_job_pause_point:
> + * @job: The job that is ready to pause.
> + *
> + * Pause now if block_job_pause() has been called.  Block jobs that perform
> + * lots of I/O must call this between requests so that the job can be paused.

But jobs can't call this anymore, now.  This part of the comment should
either mention block_job_relax() instead or should be moved there
altogether.

Max

> + */
> +static int coroutine_fn block_job_pause_point(BlockJob *job)
>  {
>  assert(job && block_job_started(job));
>  
> diff --git a/include/block/blockjob_int.h b/include/block/blockjob_int.h
> index c4891a5a9b..57327cbc5a 100644
> --- a/include/block/blockjob_int.h
> +++ b/include/block/blockjob_int.h
> @@ -201,15 +201,6 @@ void block_job_completed(BlockJob *job, int ret);
>   */
>  bool block_job_is_cancelled(BlockJob *job);
>  
> -/**
> - * block_job_pause_point:
> - * @job: The job that is ready to pause.
> - *
> - * Pause now if block_job_pause() has been called.  Block jobs that perform
> - * lots of I/O must call this between requests so that the job can be paused.
> - */
> -int coroutine_fn block_job_pause_point(BlockJob *job);
> -
>  /**
>   * block_job_enter:
>   * @job: The job to enter.
> 




signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [PATCH v2 12/13] blockjob: privatize block_job_sleep_ns

2018-02-07 Thread Max Reitz
On 2018-01-19 21:58, John Snow wrote:
> There's not currently any external caller of it.
> 
> Except in tests, but we'll fix that here too.
> 
> Replace usages in test cases with block_job_relax, which functions
> similarly enough to be used as a drop-in replacement.
> 
> Very technically block_job_sleep_ns(job, 0) behaves differently
> from block_job_relax(job, 0) in that relax may resolve to a no-op,
> but this makes no difference in the test in which it is used.
> 
> Signed-off-by: John Snow 
> ---
>  blockjob.c   | 11 ++-
>  include/block/blockjob_int.h | 11 ---
>  tests/test-bdrv-drain.c  |  2 +-
>  tests/test-blockjob-txn.c|  2 +-
>  4 files changed, 12 insertions(+), 14 deletions(-)

Reviewed-by: Max Reitz 



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [PATCH v2 11/13] block/mirror: remove block_job_sleep_ns calls

2018-02-07 Thread Max Reitz
On 2018-01-19 21:58, John Snow wrote:
> We're attempting to slacken the mirror loop in three different places,
> but we can combine these three attempts. Combine the early loop call to
> block_job_pause_point with the two late-loop calls to block_job_sleep_ns.
> 
> When delay_ns is 0 and it has not been SLICE_TIME since the last yield,
> block_job_relax is merely a call to block_job_pause_point, so this should
> be equivalent with the exception that if we have managed to not yield at
> all in the last SLICE_TIME ns, we will now do so.
> 
> I am not sure that condition was possible,
> so this loop should be equivalent.

Well, to me it even sounds like a positive change if it was a change.
We want the job to yield after SLICE_TIME ns, after all, and I don't
think it matters where that happens, exactly.

> 
> Signed-off-by: John Snow 
> ---
>  block/mirror.c | 22 +++---
>  block/trace-events |  2 +-
>  2 files changed, 12 insertions(+), 12 deletions(-)
> 
> diff --git a/block/mirror.c b/block/mirror.c
> index a0e0044de2..192e03694f 100644
> --- a/block/mirror.c
> +++ b/block/mirror.c
> @@ -761,7 +761,7 @@ static void coroutine_fn mirror_run(void *opaque)
>  assert(!s->dbi);
>  s->dbi = bdrv_dirty_iter_new(s->dirty_bitmap);
>  for (;;) {
> -uint64_t delay_ns = 0;
> +static uint64_t delay_ns = 0;

Errr.  Are you sure about that?

Now every mirror job in the qeny process will share this single
variable.  Was that your intention?
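
A standalone C illustration of the pitfall (nothing below is QEMU code; it is
just the C semantics of a static local):

    #include <stdio.h>

    /* A static local is allocated once per process, not once per invocation,
     * so every caller -- i.e. every mirror job coroutine -- would read and
     * write the same variable. */
    static void fake_iteration(const char *job, unsigned long new_delay)
    {
        static unsigned long delay_ns;      /* shared by all "jobs" */

        if (new_delay) {
            delay_ns = new_delay;
        }
        printf("%s sees delay_ns = %lu\n", job, delay_ns);
    }

    int main(void)
    {
        fake_iteration("job A", 100000);    /* job A sets a delay...   */
        fake_iteration("job B", 0);         /* ...which job B inherits */
        return 0;
    }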

>  int64_t cnt, delta;
>  bool should_complete;
>  
> @@ -770,9 +770,16 @@ static void coroutine_fn mirror_run(void *opaque)
>  goto immediate_exit;
>  }
>  
> -block_job_pause_point(&s->common);
> -
>  cnt = bdrv_get_dirty_count(s->dirty_bitmap);
> +
> +trace_mirror_before_relax(s, cnt, s->synced, delay_ns);
> +if (block_job_relax(&s->common, delay_ns)) {

See the reply to that patch.

> +if (!s->synced) {
> +goto immediate_exit;
> +}
> +}
> +delay_ns = 0;
> +
>  /* s->common.offset contains the number of bytes already processed so
>   * far, cnt is the number of dirty bytes remaining and
>   * s->bytes_in_flight is the number of bytes currently being
> @@ -849,15 +856,8 @@ static void coroutine_fn mirror_run(void *opaque)
>  }
>  
>  ret = 0;
> -trace_mirror_before_sleep(s, cnt, s->synced, delay_ns);
> -if (!s->synced) {
> -block_job_sleep_ns(&s->common, delay_ns);
> -if (block_job_is_cancelled(&s->common)) {
> -break;
> -}
> -} else if (!should_complete) {
> +if (s->synced && !should_complete) {
>  delay_ns = (s->in_flight == 0 && cnt == 0 ? SLICE_TIME : 0);
> -block_job_sleep_ns(&s->common, delay_ns);
>  }
>  }

Basic idea looks good to me (apart from the static delay_ns), but, well,
block-job-cancel on a busy job still breaks.

Max



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [PATCH v2 10/13] block/mirror: condense cancellation and relax calls

2018-02-07 Thread Max Reitz
On 2018-01-19 21:58, John Snow wrote:
> We can count on the relax call to check cancellation for us, so
> condense these concurrent calls.
> 
> Signed-off-by: John Snow 
> ---
>  block/mirror.c | 8 ++--
>  1 file changed, 2 insertions(+), 6 deletions(-)
> 
> diff --git a/block/mirror.c b/block/mirror.c
> index 3c73caed5e..a0e0044de2 100644
> --- a/block/mirror.c
> +++ b/block/mirror.c
> @@ -610,9 +610,7 @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s)
>  int bytes = MIN(s->bdev_length - offset,
>  QEMU_ALIGN_DOWN(INT_MAX, s->granularity));
>  
> -block_job_relax(&s->common, 0);
> -
> -if (block_job_is_cancelled(&s->common)) {
> +if (block_job_relax(&s->common, 0)) {
>  s->initial_zeroing_ongoing = false;
>  return 0;
>  }
> @@ -638,9 +636,7 @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s)
>  int bytes = MIN(s->bdev_length - offset,
>  QEMU_ALIGN_DOWN(INT_MAX, s->granularity));
>  
> -block_job_relax(&s->common, 0);
> -
> -if (block_job_is_cancelled(&s->common)) {
> +if (block_job_relax(&s->common, 0)) {
>  return 0;
>  }

“See last patch.”



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [PATCH v2 09/13] block/backup: remove yield_and_check

2018-02-07 Thread Max Reitz
On 2018-01-19 21:58, John Snow wrote:
> This is a respin of the same functionality as mirror_throttle,
> so trash this and replace it with the generic version.
> 
> yield_and_check returned true if canceled, false otherwise.
> block_job_relax returns -ECANCELED if canceled, 0 otherwise.
> 
> Signed-off-by: John Snow 
> ---
>  block/backup.c | 20 
>  1 file changed, 4 insertions(+), 16 deletions(-)
> 
> diff --git a/block/backup.c b/block/backup.c
> index b4204c0ee4..0624c3b322 100644
> --- a/block/backup.c
> +++ b/block/backup.c
> @@ -334,29 +334,17 @@ static void backup_complete(BlockJob *job, void *opaque)
>  g_free(data);
>  }
>  
> -static bool coroutine_fn yield_and_check(BackupBlockJob *job)
> +static uint64_t get_delay_ns(BackupBlockJob *job)
>  {
>  uint64_t delay_ns = 0;
>  
> -if (block_job_is_cancelled(&job->common)) {
> -return true;
> -}
> -
> -/* we need to yield so that bdrv_drain_all() returns.
> - * (without, VM does not reboot)
> - */
>  if (job->common.speed) {
>  delay_ns = ratelimit_calculate_delay(&job->limit,
>   job->bytes_read);
>  job->bytes_read = 0;
>  }
>  
> -block_job_relax(&job->common, delay_ns);
> -if (block_job_is_cancelled(&job->common)) {
> -return true;
> -}
> -
> -return false;
> +return delay_ns;
>  }
>  
>  static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
> @@ -369,7 +357,7 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
>  hbitmap_iter_init(&hbi, job->copy_bitmap, 0);
>  while ((cluster = hbitmap_iter_next(&hbi)) != -1) {
>  do {
> -if (yield_and_check(job)) {
> +if (block_job_relax(&job->common, get_delay_ns(job))) {
>  return 0;
>  }
>  ret = backup_do_cow(job, cluster * job->cluster_size,
> @@ -465,7 +453,7 @@ static void coroutine_fn backup_run(void *opaque)
>  bool error_is_read;
>  int alloced = 0;
>  
> -if (yield_and_check(job)) {
> +if (block_job_relax(&job->common, get_delay_ns(job))) {
>  break;
>  }
>  
> 

I'd very much prefer an explicit block_job_relax(...) == -ECANCELED, I
have to say.
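
In other words, something along these lines at the call sites (just spelling
out the shape of the check, not a hunk from the series):

    if (block_job_relax(&job->common, get_delay_ns(job)) == -ECANCELED) {
        return 0;
    }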

If you coerce me, I can give an R-b, but you'll have to wrest it from me
by force!

Max



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [PATCH v2 08/13] allow block_job_relax to return -ECANCELED

2018-02-07 Thread Max Reitz
On 2018-01-19 21:58, John Snow wrote:
> This is just an optimization for callers who are likely going to
> want to check quite close to this call if the job was canceled or
> not anyway.

But jobs are “cancelled” and not “canceled”.

!!!

> 
> Along the same lines, add the return to block_job_pause_point and
> block_job_sleep_ns, so we don't have to re-check it quite so
> excessively.
> 
> Signed-off-by: John Snow 
> ---
>  blockjob.c   | 28 +---
>  include/block/blockjob_int.h |  8 +---
>  2 files changed, 22 insertions(+), 14 deletions(-)

[...]

> diff --git a/include/block/blockjob_int.h b/include/block/blockjob_int.h
> index 5f1520fab7..1ceb47e1e6 100644
> --- a/include/block/blockjob_int.h
> +++ b/include/block/blockjob_int.h
> @@ -147,7 +147,7 @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver,
>   * %QEMU_CLOCK_REALTIME nanoseconds.  Canceling the job will immediately
>   * interrupt the wait.
>   */
> -void block_job_sleep_ns(BlockJob *job, int64_t ns);
> +int block_job_sleep_ns(BlockJob *job, int64_t ns);
>  
>  /**
>   * block_job_yield:
> @@ -167,8 +167,10 @@ void block_job_yield(BlockJob *job);
>   * If delay_ns is 0, yield if it has been SLICE_TIME
>   * nanoseconds since the last yield. Otherwise, check
>   * if we need to yield for a pause event.
> + *
> + * returns ECANCELED if the job has been canceled.

-ECANCELED, please.

With that fixed:

Reviewed-by: Max Reitz 

>   */
> -void block_job_relax(BlockJob *job, int64_t delay_ns);
> +int block_job_relax(BlockJob *job, int64_t delay_ns);
>  
>  /**
>   * block_job_pause_all:
> @@ -217,7 +219,7 @@ bool block_job_is_cancelled(BlockJob *job);
>   * Pause now if block_job_pause() has been called.  Block jobs that perform
>   * lots of I/O must call this between requests so that the job can be paused.
>   */
> -void coroutine_fn block_job_pause_point(BlockJob *job);
> +int coroutine_fn block_job_pause_point(BlockJob *job);
>  
>  /**
>   * block_job_enter:
> 




signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [PATCH v2 07/13] block/backup: use block_job_relax

2018-02-07 Thread Max Reitz
On 2018-01-19 21:58, John Snow wrote:
> See two commits back for justification.

When will git-log support hyperlinks so you can write "HEAD^^" here? ^^

Max



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [PATCH v2 04/13] blockjob: allow block_job_throttle to take delay_ns

2018-02-07 Thread Max Reitz
On 2018-01-19 21:58, John Snow wrote:
> Instead of only sleeping for 0ms when we've hit a timeout, optionally
> take a longer more explicit delay_ns that always forces the sleep.
> 
> Signed-off-by: John Snow 
> ---
>  block/mirror.c   |  4 ++--
>  blockjob.c   |  9 -
>  include/block/blockjob_int.h | 10 +++---
>  3 files changed, 13 insertions(+), 10 deletions(-)

[...]

> diff --git a/blockjob.c b/blockjob.c
> index 6f2e709b51..51c0eb5d9e 100644
> --- a/blockjob.c
> +++ b/blockjob.c
> @@ -906,12 +906,11 @@ void block_job_yield(BlockJob *job)
>  block_job_pause_point(job);
>  }
>  
> -void block_job_relax(BlockJob *job)
> +void block_job_relax(BlockJob *job, int64_t delay_ns)
>  {
> -int64_t now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
> -
> -if (now - job->last_enter_ns > SLICE_TIME) {
> -block_job_sleep_ns(job, 0);
> +if (delay_ns || (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - \
> + job->last_enter_ns > SLICE_TIME)) {
> +block_job_sleep_ns(job, delay_ns);

I can't say I like the readability of that any better...

(And I'd argue that if delay_ns > 0, the one superfluous call to
qemu_clock_get_ns() isn't going to harm performance.)
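
For illustration, one way to keep the earlier structure while adding the
parameter (a sketch only, not a hunk from the series):

    void block_job_relax(BlockJob *job, int64_t delay_ns)
    {
        int64_t now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);

        if (delay_ns || now - job->last_enter_ns > SLICE_TIME) {
            block_job_sleep_ns(job, delay_ns);
        } else {
            block_job_pause_point(job);
        }
    }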

>  } else {
>  block_job_pause_point(job);
>  }
> diff --git a/include/block/blockjob_int.h b/include/block/blockjob_int.h
> index 553784d86f..5f1520fab7 100644
> --- a/include/block/blockjob_int.h
> +++ b/include/block/blockjob_int.h
> @@ -160,11 +160,15 @@ void block_job_yield(BlockJob *job);
>  /**
>   * block_job_relax:
>   * @job: The job that calls the function.
> + * @delay_ns: The amount of time to sleep for
>   *
> - * Yield if it has been SLICE_TIME nanoseconds since the last yield.
> - * Otherwise, check if we need to pause, and yield if so.
> + * Sleep for delay_ns nanoseconds.
> + *
> + * If delay_ns is 0, yield if it has been SLICE_TIME
> + * nanoseconds since the last yield. Otherwise, check
> + * if we need to yield for a pause event.

That "Otherwise" now sounds as if it refers to the "If delay_ns is 0".

Code change is OK, but this comment needs some fixing.
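
One possible rewording that avoids the ambiguous "Otherwise" (a suggestion
only, not text from the series):

    /**
     * block_job_relax:
     * @job: The job that calls the function.
     * @delay_ns: The amount of time to sleep for, in nanoseconds.
     *
     * If @delay_ns is non-zero, sleep for @delay_ns nanoseconds.  If it is
     * zero, yield only when at least SLICE_TIME nanoseconds have passed since
     * the last yield; otherwise just check whether we need to pause, and
     * yield if so.
     */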

Max

>   */
> -void block_job_relax(BlockJob *job);
> +void block_job_relax(BlockJob *job, int64_t delay_ns);
>  
>  /**
>   * block_job_pause_all:
> 




signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [PATCH v2 03/13] blockjob: create block_job_relax

2018-02-07 Thread Max Reitz
On 2018-01-19 21:58, John Snow wrote:
> This will replace mirror_throttle, for reuse in other jobs.
> 
> Signed-off-by: John Snow 
> ---
>  block/mirror.c   | 15 ++-
>  blockjob.c   | 11 +++
>  include/block/blockjob_int.h |  9 +
>  3 files changed, 22 insertions(+), 13 deletions(-)

Reviewed-by: Max Reitz 



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [PATCH v2 01/13] blockjob: record time of last entrance

2018-02-07 Thread Max Reitz
On 2018-01-19 21:58, John Snow wrote:
> The mirror job makes a semi-inaccurate record of the last time we yielded
> by recording the last time we left a "pause", but this doesn't always
> correlate to the time we actually last successfully ceded control.
> 
> Record the time we last *exited* a yield centrally. In other words, record
> the time we began execution of this job to know how long we have been
> selfish for.
> 
> Signed-off-by: John Snow 
> ---
>  block/mirror.c   | 8 ++--
>  blockjob.c   | 2 ++
>  include/block/blockjob.h | 5 +
>  3 files changed, 9 insertions(+), 6 deletions(-)
> 
> diff --git a/block/mirror.c b/block/mirror.c
> index c9badc1203..88f4e8964d 100644
> --- a/block/mirror.c
> +++ b/block/mirror.c
> @@ -63,7 +63,6 @@ typedef struct MirrorBlockJob {
>  QSIMPLEQ_HEAD(, MirrorBuffer) buf_free;
>  int buf_free_count;
>  
> -uint64_t last_pause_ns;
>  unsigned long *in_flight_bitmap;
>  int in_flight;
>  int64_t bytes_in_flight;
> @@ -596,8 +595,7 @@ static void mirror_throttle(MirrorBlockJob *s)
>  {
>  int64_t now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
>  
> -if (now - s->last_pause_ns > SLICE_TIME) {
> -s->last_pause_ns = now;
> +if (now - s->common.last_enter_ns > SLICE_TIME) {
>  block_job_sleep_ns(&s->common, 0);
>  } else {
>  block_job_pause_point(&s->common);
> @@ -769,7 +767,6 @@ static void coroutine_fn mirror_run(void *opaque)
>  
>  mirror_free_init(s);
>  
> -s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
>  if (!s->is_none_mode) {
>  ret = mirror_dirty_init(s);
>  if (ret < 0 || block_job_is_cancelled(&s->common)) {
> @@ -803,7 +800,7 @@ static void coroutine_fn mirror_run(void *opaque)
>   * We do so every SLICE_TIME nanoseconds, or when there is an error,
>   * or when the source is clean, whichever comes first.
>   */
> -delta = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - s->last_pause_ns;
> +delta = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - s->common.last_enter_ns;

The horror.

>  if (delta < SLICE_TIME &&
>  s->common.iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
>  if (s->in_flight >= MAX_IN_FLIGHT || s->buf_free_count == 0 ||
> @@ -878,7 +875,6 @@ static void coroutine_fn mirror_run(void *opaque)
>  delay_ns = (s->in_flight == 0 && cnt == 0 ? SLICE_TIME : 0);
>  block_job_sleep_ns(&s->common, delay_ns);
>  }
> -s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
>  }

Hmmm.  So you're now relying on block_job_sleep_ns() updating
last_enter_ns.  But it will only do so if we actually do sleep, which we
will not if the block job has been cancelled.

Then, last_enter_ns will stay at its current value indefinitely because
neither block_job_sleep_ns() nor block_job_pause_point() will yield.
OK, so at this point we should leave the mirror loop.  There are three
points where this is done:

(1) If s->ret < 0.  OK, let's hope it isn't.
(2) If cnt == 0 && should_complete.  We'll come back to that.
(3) If !s->synced && block_job_is_cancelled(...).  So basically now, if
the job has not emitted a READY event yet.

But if we have emitted the READY, we have to wait for cnt == 0 &&
should_complete (note that should_complete in turn will only be set if
s->in_flight == 0 && cnt == 0).  But unless delta < SLICE_TIME, we will
never do another mirror_iteration(), so unless we have already started
the necessary requests, they will never be started and we will loop forever.

So try this on master (I prefixed the QMP lines with in/out depending on
the direction -- note that it's important to have the block-job-cancel
on the same line the human-monitor-command finishes on so they are
executed right after each other!):

$ ./qemu-img create -f qcow2 foo.qcow2 64M
$ x86_64-softmmu/qemu-system-x86_64 -qmp stdio
In: {"QMP": {"version": {"qemu": {"micro": 50, "minor": 11, "major": 2},
"package": " (v2.9.0-632-g4a52d43-dirty)"}, "capabilities": []}}
Out: {"execute":"qmp_capabilities"}
In: {"return": {}}
Out: {"execute":"object-add","arguments":
  {"qom-type":"throttle-group","id":"tg0","props":{"limits":
 {"bps-total":16777216
In: {"return": {}}
Out: {"execute":"blockdev-add","arguments":
  {"node-name":"source","driver":"qcow2","file":
   {"driver":"file","filename":"foo.qcow2"}}}
In: {"return": {}}
Out: {"execute":"blockdev-add","arguments":
  {"node-name":"target","driver":"throttle",
   "throttle-group":"tg0","file":
   {"driver":"null-co","size":67108864}}}
In: {"return": {}}
Out: {"execute":"blockdev-mirror","arguments":
 {"job-id":"mirror","device":"source","target":"target",
  "sync":"full"}}
In: {"return": {}}
In: {"timestamp": {"seconds": 1518040566, "microseconds": 658111},
 "event": "BLOCK_JOB_READY", "data":
 {"device": "mirror", "len": 67108864, "offset": 67108864,
  "speed": 0, "t

Re: [Qemu-block] Block Migration and CPU throttling

2018-02-07 Thread Peter Lieven

> Am 07.02.2018 um 19:29 schrieb Dr. David Alan Gilbert :
> 
> * Peter Lieven (p...@kamp.de) wrote:
>> Am 12.12.2017 um 18:05 schrieb Dr. David Alan Gilbert:
>>> * Peter Lieven (p...@kamp.de) wrote:
 Am 21.09.2017 um 14:36 schrieb Dr. David Alan Gilbert:
> * Peter Lieven (p...@kamp.de) wrote:
>> Am 19.09.2017 um 16:41 schrieb Dr. David Alan Gilbert:
>>> * Peter Lieven (p...@kamp.de) wrote:
 Am 19.09.2017 um 16:38 schrieb Dr. David Alan Gilbert:
> * Peter Lieven (p...@kamp.de) wrote:
>> Hi,
>> 
>> I just noticed that CPU throttling and Block Migration don't work 
>> together very well.
>> During block migration the throttling heuristic detects that we 
>> obviously make no progress
>> in ram transfer. But the reason is the running block migration and 
>> not a too high dirty pages rate.
>> 
>> The result is that any VM is throttled by 99% during block migration.
> Hmm that's unfortunate; do you have a bandwidth set lower than your
> actual network connection? I'm just wondering if it's actually going
> between the block and RAM iterative sections or getting stuck in ne.
 It happens also if source and dest are on the same machine and speed 
 is set to 100G.
>>> But does it happen if they're not and the speed is set low?
>> Yes, it does. I noticed it in our test environment between different 
>> nodes with a 10G
>> link in between. But its totally clear why it happens. During block 
>> migration we transfer
>> all dirty memory pages in each round (if there is moderate memory load), 
>> but all dirty
>> pages are obviously more than 50% of the transferred ram in that round.
>> It is more exactly 100%. But the current logic triggers on this 
>> condition.
>> 
>> I think I will go forward and send a patch which disables auto converge 
>> during
>> block migration bulk stage.
> Yes, that's fair;  it probably would also make sense to throttle the RAM
> migration during the block migration bulk stage, since the chances are
> it's not going to get far.  (I think in the nbd setup, the main
> migration process isn't started until the end of bulk).
 Catching up with the idea of delaying ram migration until block bulk has 
 completed.
 What do you think is the easiest way to achieve this?
>>> 
>>> 
>>> I think the answer depends whether we think this is a 'special' or we
>>> need a new general purpose mechanism.
>>> 
>>> If it was really general then we'd probably want to split the iterative
>>> stage in two somehow, and only do RAM in the second half.
>>> 
>>> But I'm not sure it's worth it; I suspect the easiest way is:
>>> 
>>>a) Add a counter in migration/ram.c or in the RAM state somewhere
>>>b) Make ram_save_inhibit increment the counter
>>>c) Check the counter at the head of ram_save_iterate and just exit
>>>  if it's non-0
>>>d) Call ram_save_inhibit from block_save_setup
>>>e) Then release it when you've finished the bulk stage
>>> 
>>> Make sure you still count the RAM in the pending totals, otherwise
>>> migration might think it's finished a bit early.
>> 
>> Is there any culprit I don't see or is it as easy as this?
> 
> Hmm, looks promising doesn't it;  might need an include or two tidied
> up, but looks worth a try.   Just be careful that there are no cases
> where block migration can't transfer data in that state, otherwise we'll
> keep coming back to here and spewing empty sections.

I already tested it and it actually works.

What would you expect to be cleaned up before it would be a proper patch?

Are there any implications with RDMA and/or post copy migration?
Is block migration possible at all with those?

Peter
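
For reference, a rough sketch of the counter-based a)-e) approach quoted
above.  The names below are hypothetical (nothing like this is in the tree);
the patch Peter actually tested, quoted later in this thread, simply checks
blk_mig_bulk_active() at the top of ram_save_iterate() instead.

    /* Hypothetical sketch of steps a)-e); all names are made up. */
    static int ram_save_inhibit_count;          /* a) counter in migration/ram.c */

    void ram_save_inhibit(void)                 /* b)+d) called from block_save_setup() */
    {
        ram_save_inhibit_count++;
    }

    void ram_save_uninhibit(void)               /* e) called when the bulk stage ends */
    {
        ram_save_inhibit_count--;
    }

    static int ram_save_iterate(QEMUFile *f, void *opaque)
    {
        if (ram_save_inhibit_count) {           /* c) bail out early...               */
            qemu_put_be64(f, RAM_SAVE_FLAG_EOS);/* ...but keep the stream well-formed */
            return 0;
        }
        /* ... normal RAM iteration ... */
        return 0;
    }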



[Qemu-block] [ovirt-users] qcow2 images corruption

2018-02-07 Thread Nicolas Ecarnot

Hello,

TL; DR : qcow2 images keep getting corrupted. Any workaround?

Long version:
I have already started this discussion on the oVirt and on the 
qemu-block mailing lists, under similar circumstances, but I have learned 
more in the months since, so here is some information:


- We are using 2 oVirt 3.6.7.5-1.el7.centos datacenters, using CentOS 
7.{2,3} hosts

- Hosts :
  - CentOS 7.2 1511 :
- Kernel = 3.10.0 327
- KVM : 2.3.0-31
- libvirt : 1.2.17
- vdsm : 4.17.32-1
  - CentOS 7.3 1611 :
- Kernel 3.10.0 514
- KVM : 2.3.0-31
- libvirt 2.0.0-10
- vdsm : 4.17.32-1
- Our storage is 2 Equallogic SANs connected via iSCSI on a dedicated 
network
- Depending on the week, there are all in all around 32 hosts, 8 storage 
domains and, for various reasons, very few VMs (fewer than 200).
- One peculiar point is that most of our VMs get an additional 
dedicated network interface that is iSCSI-connected to some volumes of 
our SAN - these volumes are not part of the oVirt setup. That could 
lead to a lot of additional iSCSI traffic.


From time to time, a random VM is paused by oVirt.
Digging into the oVirt engine logs, then into the host vdsm logs, it 
appears that the host considers the qcow2 image corrupted.
In what I consider conservative behavior, vdsm stops any 
interaction with this image and marks the VM as paused.

Any attempt to unpause it leads to the same conservative pause.

After having found (https://access.redhat.com/solutions/1173623) the 
right logical volume hosting the qcow2 image, I can run qemu-img check 
on it.

- On 80% of my VMs, I find no errors.
- On 15% of them, I find "Leaked cluster" errors that I can correct using 
"qemu-img check -r all".
- On 5% of them, I find "Leaked cluster" errors plus further fatal errors 
which cannot be corrected with qemu-img.
In rare cases, qemu-img can correct them but destroys large parts of 
the image (it becomes unusable), and in other cases it cannot correct them 
at all.


Months ago, I already sent a similar message but the error message was 
about No space left on device 
(https://www.mail-archive.com/qemu-block@gnu.org/msg00110.html).


This time, I don't have this message about space, but only corruption.

I kept reading and found a similar discussion in the Proxmox group :
https://lists.ovirt.org/pipermail/users/2018-February/086750.html

https://forum.proxmox.com/threads/qcow2-corruption-after-snapshot-or-heavy-disk-i-o.32865/page-2

What I read similar to my case is :
- usage of qcow2
- heavy disk I/O
- using the virtio-blk driver

In the proxmox thread, they tend to say that using virtio-scsi is the 
solution. I have asked this question to oVirt experts 
(https://lists.ovirt.org/pipermail/users/2018-February/086753.html), but 
it's not clear whether the driver is to blame.


I agree with the answer Yaniv Kaul gave to me, saying I have to properly 
report the issue, so I would like to know which particular information I 
can give you now.


As you can imagine, all this setup is in production, and for most of the 
VMs I can not "play" with them. Moreover, we launched a campaign of 
nightly stopping every VM, running qemu-img check on each one, then booting it again.

So it might take some time before I find another corrupted image
(which I will carefully keep for debugging).

Other information: we very rarely take snapshots, but I can well 
imagine that automated migrations of VMs could trigger similar behavior 
on qcow2 images.


Last point about the versions we use: yes, they are old; yes, we're 
planning to upgrade, but we don't know when.


Regards,

--
Nicolas ECARNOT



Re: [Qemu-block] [Qemu-devel] [PATCH v2] iotests: 205: support luks format

2018-02-07 Thread Eric Blake

On 02/07/2018 02:37 PM, Eric Blake wrote:

On 02/06/2018 12:57 PM, Eric Blake wrote:

On 02/06/2018 12:26 PM, Daniel P. Berrangé wrote:
On Tue, Feb 06, 2018 at 09:25:07PM +0300, Vladimir 
Sementsov-Ogievskiy wrote:

Support default luks options in VM.add_drive and in new library
function qemu_img_create. Use it in 205 iotests.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---


Reviewed-by: Daniel P. Berrange 


Thanks. I'll take this through my NBD queue.

git git://repo.or.cz/qemu/ericb.git nbd



I'm seeing this failure now :(

$ ./check -luks 205
QEMU  -- "/home/eblake/qemu/x86_64-softmmu/qemu-system-x86_64" 
-nodefaults -machine accel=qtest

QEMU_IMG  -- "/home/eblake/qemu/qemu-img"
QEMU_IO   -- "/home/eblake/qemu/qemu-io"  --cache writeback
QEMU_NBD  -- "/home/eblake/qemu/qemu-nbd"
IMGFMT    -- luks (iter-time=10)
IMGPROTO  -- file
PLATFORM  -- Linux/x86_64 red 4.14.16-300.fc27.x86_64
TEST_DIR  -- /home/eblake/qemu/tests/qemu-iotests/scratch
SOCKET_SCM_HELPER -- /home/eblake/qemu/tests/qemu-iotests/socket_scm_helper

205 [failed, exit status 1] - output mismatch (see 205.out.bad)
--- /home/eblake/qemu/tests/qemu-iotests/205.out    2018-02-07 09:48:13.346107367 -0600
+++ /home/eblake/qemu/tests/qemu-iotests/205.out.bad    2018-02-07 14:35:21.859890826 -0600
@@ -1,5 +1,159 @@
-...
+qemu-img: /home/eblake/qemu/tests/qemu-iotests/scratch/disk: Failed to get "write" lock
+Is another process using the image?


Perhaps a false alarm due to a stale qemu-system-x86 process left over 
from an earlier aborted test run.  When I retried on a fresh system, the 
test passed for me.


--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org



Re: [Qemu-block] [Qemu-devel] [PATCH v2] iotests: 205: support luks format

2018-02-07 Thread Eric Blake

On 02/06/2018 12:57 PM, Eric Blake wrote:

On 02/06/2018 12:26 PM, Daniel P. Berrangé wrote:
On Tue, Feb 06, 2018 at 09:25:07PM +0300, Vladimir Sementsov-Ogievskiy 
wrote:

Support default luks options in VM.add_drive and in new library
function qemu_img_create. Use it in 205 iotests.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---


Reviewed-by: Daniel P. Berrange 


Thanks. I'll take this through my NBD queue.

git git://repo.or.cz/qemu/ericb.git nbd



I'm seeing this failure now :(

$ ./check -luks 205
QEMU  -- "/home/eblake/qemu/x86_64-softmmu/qemu-system-x86_64" 
-nodefaults -machine accel=qtest

QEMU_IMG  -- "/home/eblake/qemu/qemu-img"
QEMU_IO   -- "/home/eblake/qemu/qemu-io"  --cache writeback
QEMU_NBD  -- "/home/eblake/qemu/qemu-nbd"
IMGFMT-- luks (iter-time=10)
IMGPROTO  -- file
PLATFORM  -- Linux/x86_64 red 4.14.16-300.fc27.x86_64
TEST_DIR  -- /home/eblake/qemu/tests/qemu-iotests/scratch
SOCKET_SCM_HELPER -- /home/eblake/qemu/tests/qemu-iotests/socket_scm_helper

205 [failed, exit status 1] - output mismatch (see 205.out.bad)
--- /home/eblake/qemu/tests/qemu-iotests/205.out	2018-02-07 09:48:13.346107367 -0600
+++ /home/eblake/qemu/tests/qemu-iotests/205.out.bad	2018-02-07 14:35:21.859890826 -0600
@@ -1,5 +1,159 @@
-...
+qemu-img: /home/eblake/qemu/tests/qemu-iotests/scratch/disk: Failed to get "write" lock
+Is another process using the image?
+qemu-img: /home/eblake/qemu/tests/qemu-iotests/scratch/disk: Failed to get "write" lock
+Is another process using the image?
+qemu-img: /home/eblake/qemu/tests/qemu-iotests/scratch/disk: Failed to get "write" lock
+Is another process using the image?
+qemu-img: /home/eblake/qemu/tests/qemu-iotests/scratch/disk: Failed to get "write" lock
+Is another process using the image?
+qemu-img: /home/eblake/qemu/tests/qemu-iotests/scratch/disk: Failed to get "write" lock
+Is another process using the image?
+qemu-img: /home/eblake/qemu/tests/qemu-iotests/scratch/disk: Failed to get "write" lock
+Is another process using the image?
+qemu-img: /home/eblake/qemu/tests/qemu-iotests/scratch/disk: Failed to get "write" lock
+Is another process using the image?
+EEE
+==
+ERROR: test_connect_after_remove_default (__main__.TestNbdServerRemove)
+--
+Traceback (most recent call last):
+  File "205", line 37, in setUp
+self.vm.launch()
+  File "/home/eblake/qemu/tests/qemu-iotests/../../scripts/qemu.py", 
line 221, in launch

+self._launch()
+  File "/home/eblake/qemu/tests/qemu-iotests/../../scripts/qemu.py", 
line 244, in _launch

+self._post_launch()
+  File "/home/eblake/qemu/tests/qemu-iotests/../../scripts/qtest.py", 
line 100, in _post_launch

+super(QEMUQtestMachine, self)._post_launch()
+  File "/home/eblake/qemu/tests/qemu-iotests/../../scripts/qemu.py", 
line 196, in _post_launch

+self._qmp.accept()
+  File "/home/eblake/qemu/tests/qemu-iotests/../../scripts/qmp/qmp.py", 
line 157, in accept

+return self.__negotiate_capabilities()
+  File "/home/eblake/qemu/tests/qemu-iotests/../../scripts/qmp/qmp.py", 
line 73, in __negotiate_capabilities

+raise QMPConnectError
+QMPConnectError
...

--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org





Re: [Qemu-block] Block Migration and CPU throttling

2018-02-07 Thread Dr. David Alan Gilbert
* Peter Lieven (p...@kamp.de) wrote:
> Am 12.12.2017 um 18:05 schrieb Dr. David Alan Gilbert:
> > * Peter Lieven (p...@kamp.de) wrote:
> > > Am 21.09.2017 um 14:36 schrieb Dr. David Alan Gilbert:
> > > > * Peter Lieven (p...@kamp.de) wrote:
> > > > > Am 19.09.2017 um 16:41 schrieb Dr. David Alan Gilbert:
> > > > > > * Peter Lieven (p...@kamp.de) wrote:
> > > > > > > Am 19.09.2017 um 16:38 schrieb Dr. David Alan Gilbert:
> > > > > > > > * Peter Lieven (p...@kamp.de) wrote:
> > > > > > > > > Hi,
> > > > > > > > > 
> > > > > > > > > I just noticed that CPU throttling and Block Migration don't 
> > > > > > > > > work together very well.
> > > > > > > > > During block migration the throttling heuristic detects that 
> > > > > > > > > we obviously make no progress
> > > > > > > > > in ram transfer. But the reason is the running block 
> > > > > > > > > migration and not a too high dirty pages rate.
> > > > > > > > > 
> > > > > > > > > The result is that any VM is throttled by 99% during block 
> > > > > > > > > migration.
> > > > > > > > Hmm that's unfortunate; do you have a bandwidth set lower than 
> > > > > > > > your
> > > > > > > > actual network connection? I'm just wondering if it's actually 
> > > > > > > > going
> > > > > > > > between the block and RAM iterative sections or getting stuck 
> > > > > > > > in ne.
> > > > > > > It happens also if source and dest are on the same machine and 
> > > > > > > speed is set to 100G.
> > > > > > But does it happen if they're not and the speed is set low?
> > > > > Yes, it does. I noticed it in our test environment between different 
> > > > > nodes with a 10G
> > > > > link in between. But its totally clear why it happens. During block 
> > > > > migration we transfer
> > > > > all dirty memory pages in each round (if there is moderate memory 
> > > > > load), but all dirty
> > > > > pages are obviously more than 50% of the transferred ram in that 
> > > > > round.
> > > > > It is more exactly 100%. But the current logic triggers on this 
> > > > > condition.
> > > > > 
> > > > > I think I will go forward and send a patch which disables auto 
> > > > > converge during
> > > > > block migration bulk stage.
> > > > Yes, that's fair;  it probably would also make sense to throttle the RAM
> > > > migration during the block migration bulk stage, since the chances are
> > > > it's not going to get far.  (I think in the nbd setup, the main
> > > > migration process isn't started until the end of bulk).
> > > Catching up with the idea of delaying ram migration until block bulk has 
> > > completed.
> > > What do you think is the easiest way to achieve this?
> > 
> > 
> > I think the answer depends whether we think this is a 'special' or we
> > need a new general purpose mechanism.
> > 
> > If it was really general then we'd probably want to split the iterative
> > stage in two somehow, and only do RAM in the second half.
> > 
> > But I'm not sure it's worth it; I suspect the easiest way is:
> > 
> > a) Add a counter in migration/ram.c or in the RAM state somewhere
> > b) Make ram_save_inhibit increment the counter
> > c) Check the counter at the head of ram_save_iterate and just exit
> >   if it's non-0
> > d) Call ram_save_inhibit from block_save_setup
> > e) Then release it when you've finished the bulk stage
> > 
> > Make sure you still count the RAM in the pending totals, otherwise
> > migration might think it's finished a bit early.
> 
> Is there any culprit I don't see or is it as easy as this?

Hmm, looks promising doesn't it;  might need an include or two tidied
up, but looks worth a try.   Just be careful that there are no cases
where block migration can't transfer data in that state, otherwise we'll
keep coming back to here and spewing empty sections.

Dave

> diff --git a/migration/ram.c b/migration/ram.c
> index cb1950f..c67bcf1 100644
> --- a/migration/ram.c
> +++ b/migration/ram.c
> @@ -2255,6 +2255,13 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
>  int64_t t0;
>  int done = 0;
> 
> +    if (blk_mig_bulk_active()) {
> +    /* Avoid transferring RAM during bulk phase of block migration as
> + * the bulk phase will usually take a lot of time and transferring
> + * RAM updates again and again is pointless. */
> +    goto out;
> +    }
> +
>  rcu_read_lock();
>  if (ram_list.version != rs->last_version) {
>  ram_state_reset(rs);
> @@ -2301,6 +2308,7 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
>   */
>  ram_control_after_iterate(f, RAM_CONTROL_ROUND);
> 
> +out:
>  qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
>  ram_counters.transferred += 8;
> 
> 
> Peter
> 
--
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK



[Qemu-block] [PATCH 2/2] scsi: add block job opblockers for scsi-block

2018-02-07 Thread Paolo Bonzini
scsi-block bypasses the dirty bitmaps and pre-write notifiers, so it
cannot be the source of a block job.  The gist of the fix is to add
op-blockers to the BlockBackend, and remove them at "unrealize" time,
but things are a little more complex because quit closes the BlockBackend
without going through unrealize.

So use Notifiers: the remove_bs notifier is called by bdrv_close_all, and
the insert_bs notifier might not be really necessary but makes things a
little more symmetric.

Suggested-by: Karen Noel 
Signed-off-by: Paolo Bonzini 
---
 block/block-backend.c  |  9 ++
 hw/scsi/scsi-disk.c| 62 ++
 include/sysemu/block-backend.h |  1 +
 3 files changed, 72 insertions(+)

diff --git a/block/block-backend.c b/block/block-backend.c
index baef8e7abc..1759639a4a 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1747,6 +1747,15 @@ bool blk_op_is_blocked(BlockBackend *blk, BlockOpType op, Error **errp)
 return bdrv_op_is_blocked(bs, op, errp);
 }
 
+void blk_op_block(BlockBackend *blk, BlockOpType op, Error *reason)
+{
+BlockDriverState *bs = blk_bs(blk);
+
+if (bs) {
+bdrv_op_block(bs, op, reason);
+}
+}
+
 void blk_op_unblock(BlockBackend *blk, BlockOpType op, Error *reason)
 {
 BlockDriverState *bs = blk_bs(blk);
diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
index 49d2559d93..023673cb04 100644
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -2578,9 +2578,39 @@ static int get_device_type(SCSIDiskState *s)
 return 0;
 }
 
+typedef struct SCSIBlockState {
+SCSIDiskState sd;
+Error *mirror_source;
+Error *backup_source;
+Error *commit_source;
+Notifier insert_bs;
+Notifier remove_bs;
+} SCSIBlockState;
+
+static void scsi_block_insert_bs(Notifier *n, void *opaque)
+{
+SCSIBlockState *sb = container_of(n, SCSIBlockState, insert_bs);
+SCSIDiskState *s = &sb->sd;
+
+blk_op_block(s->qdev.conf.blk, BLOCK_OP_TYPE_MIRROR_SOURCE, sb->mirror_source);
+blk_op_block(s->qdev.conf.blk, BLOCK_OP_TYPE_COMMIT_SOURCE, sb->commit_source);
+blk_op_block(s->qdev.conf.blk, BLOCK_OP_TYPE_BACKUP_SOURCE, sb->backup_source);
+}
+
+static void scsi_block_remove_bs(Notifier *n, void *opaque)
+{
+SCSIBlockState *sb = container_of(n, SCSIBlockState, remove_bs);
+SCSIDiskState *s = &sb->sd;
+
+blk_op_unblock(s->qdev.conf.blk, BLOCK_OP_TYPE_MIRROR_SOURCE, sb->mirror_source);
+blk_op_unblock(s->qdev.conf.blk, BLOCK_OP_TYPE_COMMIT_SOURCE, sb->commit_source);
+blk_op_unblock(s->qdev.conf.blk, BLOCK_OP_TYPE_BACKUP_SOURCE, sb->backup_source);
+}
+
 static void scsi_block_realize(SCSIDevice *dev, Error **errp)
 {
 SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, dev);
+SCSIBlockState *sb = DO_UPCAST(SCSIBlockState, sd, s);
 int sg_version;
 int rc;
 
@@ -2626,6 +2656,36 @@ static void scsi_block_realize(SCSIDevice *dev, Error **errp)
 
 scsi_realize(&s->qdev, errp);
 scsi_generic_read_device_identification(&s->qdev);
+
+/* For op blockers, due to lack of support for dirty bitmaps.  */
+error_setg(&sb->mirror_source,
+   "scsi-block does not support acting as a mirroring source");
+error_setg(&sb->commit_source,
+   "scsi-block does not support acting as an active commit 
source");
+
+/* For op blockers, due to lack of support for write notifiers.  */
+error_setg(&sb->backup_source,
+   "scsi-block does not support acting as a backup source");
+
+sb->insert_bs.notify = scsi_block_insert_bs;
+blk_add_insert_bs_notifier(s->qdev.conf.blk, &sb->insert_bs);
+sb->remove_bs.notify = scsi_block_remove_bs;
+blk_add_remove_bs_notifier(s->qdev.conf.blk, &sb->remove_bs);
+
+scsi_block_insert_bs(&sb->insert_bs, s->qdev.conf.blk);
+}
+
+static void scsi_block_unrealize(SCSIDevice *dev, Error **errp)
+{
+SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, dev);
+SCSIBlockState *sb = DO_UPCAST(SCSIBlockState, sd, s);
+
+notifier_remove(&sb->insert_bs);
+notifier_remove(&sb->remove_bs);
+scsi_block_remove_bs(&sb->insert_bs, s->qdev.conf.blk);
+error_free(sb->mirror_source);
+error_free(sb->commit_source);
+error_free(sb->backup_source);
 }
 
 typedef struct SCSIBlockReq {
@@ -3017,6 +3077,7 @@ static void scsi_block_class_initfn(ObjectClass *klass, void *data)
 SCSIDiskClass *sdc = SCSI_DISK_BASE_CLASS(klass);
 
 sc->realize  = scsi_block_realize;
+sc->unrealize= scsi_block_unrealize;
 sc->alloc_req= scsi_block_new_request;
 sc->parse_cdb= scsi_block_parse_cdb;
 sdc->dma_readv   = scsi_block_dma_readv;
@@ -3031,6 +3092,7 @@ static const TypeInfo scsi_block_info = {
 .name  = "scsi-block",
 .parent= TYPE_SCSI_DISK_BASE,
 .class_init= scsi_block_class_initfn,
+.instance_size = sizeof(SCSIBlockState),
 };
 #endif
 
diff --git a/include/sysemu/block-backend.h b/include/sy

[Qemu-block] [PATCH 0/2] scsi: add block job opblockers for scsi-block

2018-02-07 Thread Paolo Bonzini
SCSI passthrough bypasses the block layer and issues SCSI commands
directly to the disk.  This breaks write notifiers and dirty bitmaps,
so that scsi-block devices cannot act as a mirror or backup source
(and commit too, even though that shouldn't be possible at all in the
lack of a backing file).  This series adds op blockers for that purpose.

There is currently a blk_op_unblock but no blk_op_block, so patch 2
adds it.

Paolo

Paolo Bonzini (2):
  scsi: add unrealize method for SCSI devices
  scsi: add block job opblockers for scsi-block

 block/block-backend.c  |  9 ++
 hw/scsi/scsi-bus.c |  4 +++
 hw/scsi/scsi-disk.c| 62 ++
 include/hw/scsi/scsi.h |  1 +
 include/sysemu/block-backend.h |  1 +
 5 files changed, 77 insertions(+)

-- 
2.14.3




[Qemu-block] [PATCH] block: early check for blockers on drive-mirror

2018-02-07 Thread Paolo Bonzini
Even if an op blocker is present for BLOCK_OP_TYPE_MIRROR_SOURCE,
it is checked a bit late and the result is that the target is
created even if drive-mirror subsequently fails.  Add an early
check to avoid this.

Signed-off-by: Paolo Bonzini 
---
 blockdev.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/blockdev.c b/blockdev.c
index 8e977eef11..c7e2e0a00e 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -3565,6 +3565,11 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp)
 return;
 }
 
+/* Early check to avoid creating target */
+if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_MIRROR_SOURCE, errp)) {
+return;
+}
+
 aio_context = bdrv_get_aio_context(bs);
 aio_context_acquire(aio_context);
 
-- 
2.14.3




[Qemu-block] [PATCH 1/2] scsi: add unrealize method for SCSI devices

2018-02-07 Thread Paolo Bonzini
The next patch will introduce a different unrealize implementation
for scsi-block.  Compared to the code before commit fb7b5c0df6
("scsi: devirtualize unrealize of SCSI devices", 2014-10-31), the
common code for all SCSI devices is kept in scsi-bus.c.

Signed-off-by: Paolo Bonzini 
---
 hw/scsi/scsi-bus.c | 4 
 include/hw/scsi/scsi.h | 1 +
 2 files changed, 5 insertions(+)

diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c
index 05e501efd3..a0790438f0 100644
--- a/hw/scsi/scsi-bus.c
+++ b/hw/scsi/scsi-bus.c
@@ -212,12 +212,16 @@ static void scsi_qdev_realize(DeviceState *qdev, Error **errp)
 static void scsi_qdev_unrealize(DeviceState *qdev, Error **errp)
 {
 SCSIDevice *dev = SCSI_DEVICE(qdev);
+SCSIDeviceClass *sc = SCSI_DEVICE_GET_CLASS(dev);
 
 if (dev->vmsentry) {
 qemu_del_vm_change_state_handler(dev->vmsentry);
 }
 
 scsi_device_purge_requests(dev, SENSE_CODE(NO_SENSE));
+if (sc->unrealize) {
+sc->unrealize(dev, errp);
+}
 blockdev_mark_auto_del(dev->conf.blk);
 }
 
diff --git a/include/hw/scsi/scsi.h b/include/hw/scsi/scsi.h
index 802a647cdc..569a4b9d14 100644
--- a/include/hw/scsi/scsi.h
+++ b/include/hw/scsi/scsi.h
@@ -59,6 +59,7 @@ struct SCSIRequest {
 typedef struct SCSIDeviceClass {
 DeviceClass parent_class;
 void (*realize)(SCSIDevice *dev, Error **errp);
+void (*unrealize)(SCSIDevice *dev, Error **errp);
 int (*parse_cdb)(SCSIDevice *dev, SCSICommand *cmd, uint8_t *buf,
  void *hba_private);
 SCSIRequest *(*alloc_req)(SCSIDevice *s, uint32_t tag, uint32_t lun,
-- 
2.14.3





[Qemu-block] [PATCH v10 10/12] migration: add postcopy migration of dirty bitmaps

2018-02-07 Thread Vladimir Sementsov-Ogievskiy
Postcopy migration of dirty bitmaps. Only named dirty bitmaps are migrated.

If the destination QEMU already contains a dirty bitmap with the same name
as a migrated bitmap (for the same node), then, if their granularities are
the same, the migration will proceed; otherwise an error will be generated.

If the destination QEMU doesn't contain such a bitmap, it will be created.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 include/migration/misc.h   |   3 +
 migration/migration.h  |   3 +
 migration/block-dirty-bitmap.c | 737 +
 migration/migration.c  |   5 +
 migration/savevm.c |   2 +
 vl.c   |   1 +
 migration/Makefile.objs|   1 +
 migration/trace-events |  14 +
 8 files changed, 766 insertions(+)
 create mode 100644 migration/block-dirty-bitmap.c

diff --git a/include/migration/misc.h b/include/migration/misc.h
index 77fd4f587c..4ebf24c6c2 100644
--- a/include/migration/misc.h
+++ b/include/migration/misc.h
@@ -56,4 +56,7 @@ bool migration_has_failed(MigrationState *);
 bool migration_in_postcopy_after_devices(MigrationState *);
 void migration_global_dump(Monitor *mon);
 
+/* migration/block-dirty-bitmap.c */
+void dirty_bitmap_mig_init(void);
+
 #endif
diff --git a/migration/migration.h b/migration/migration.h
index 861cdfaa96..79f72b7e50 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -233,4 +233,7 @@ void migrate_send_rp_pong(MigrationIncomingState *mis,
 void migrate_send_rp_req_pages(MigrationIncomingState *mis, const char* rbname,
   ram_addr_t start, size_t len);
 
+void dirty_bitmap_mig_before_vm_start(void);
+void init_dirty_bitmap_incoming_migration(void);
+
 #endif
diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c
new file mode 100644
index 00..5b41f7140d
--- /dev/null
+++ b/migration/block-dirty-bitmap.c
@@ -0,0 +1,737 @@
+/*
+ * Block dirty bitmap postcopy migration
+ *
+ * Copyright IBM, Corp. 2009
+ * Copyright (c) 2016-2017 Virtuozzo International GmbH. All rights reserved.
+ *
+ * Authors:
+ *  Liran Schour   
+ *  Vladimir Sementsov-Ogievskiy 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ * This file is derived from migration/block.c, so its author and IBM 
copyright
+ * are here, although content is quite different.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ *
+ ****
+ *
+ * Here postcopy migration of dirty bitmaps is realized. Only QMP-addressable
+ * bitmaps are migrated.
+ *
+ * Bitmap migration implies creating bitmap with the same name and granularity
+ * in destination QEMU. If the bitmap with the same name (for the same node)
+ * already exists on destination an error will be generated.
+ *
+ * format of migration:
+ *
+ * # Header (shared for different chunk types)
+ * 1, 2 or 4 bytes: flags (see qemu_{put,get}_flags)
+ * [ 1 byte: node name size ] \  flags & DEVICE_NAME
+ * [ n bytes: node name ] /
+ * [ 1 byte: bitmap name size ] \  flags & BITMAP_NAME
+ * [ n bytes: bitmap name ] /
+ *
+ * # Start of bitmap migration (flags & START)
+ * header
+ * be64: granularity
+ * 1 byte: bitmap flags (corresponds to BdrvDirtyBitmap)
+ *   bit 0-  bitmap is enabled
+ *   bit 1-  bitmap is persistent
+ *   bit 2-  bitmap is autoloading
+ *   bits 3-7 - reserved, must be zero
+ *
+ * # Complete of bitmap migration (flags & COMPLETE)
+ * header
+ *
+ * # Data chunk of bitmap migration
+ * header
+ * be64: start sector
+ * be32: number of sectors
+ * [ be64: buffer size  ] \ ! (flags & ZEROES)
+ * [ n bytes: buffer] /
+ *
+ * The last chunk in the stream should contain flags & EOS. The chunk may skip
+ * device and/or bitmap names, assuming them to be the same as in the previous
+ * chunk.
+ */
+
+#include "qemu/osdep.h"
+#include "block/block.h"
+#include "block/block_int.h"
+#include "sysemu/block-backend.h"
+#include "qemu/main-loop.h"
+#include "qemu/error-report.h"
+#include "migration/misc.h"
+#include "migration/migration.h"
+#include "migration/qemu-file.h"
+#include "migration/vmstate.h"
+#include "migration/register.h"
+#include "qemu/hbitmap.h"
+#include "sysemu/sysemu.h"
+#include "qemu/cutils.h"
+#include "qapi/error.h"
+#include "trace.h"
+
+#define CHUNK_SIZE (1 << 10)
+
+/* Flags occupy one, two or four bytes (Big Endian). The size is determined as
+ * follows:
+ * in first (most significant) byte bit 8 is clear  -->  one byte
+ * in first byte bit 8 is set-->  two or four bytes, depending on second
+ *byte:
+ *| in second byte bit 8 is clear  -->  two bytes
+ *| in second byte bit 8 is set-->  four bytes
+ */
+#define DIRTY_BITMAP_MIG_FLAG_EOS   0x01
+#define DIRTY_BITMAP_MIG_FLAG_ZEROES
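
As an illustration of the chunk format documented in the comment above, here is
a simplified, hedged sketch of emitting a header; it assumes the flag constants
DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME/DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME and the
qemu_put_counted_string() helper introduced elsewhere in this series, and it is
not the patch's actual send path:

/* Sketch: emit a one-byte-flags chunk header per the documented format.
 * The real patch also supports 2- and 4-byte flag encodings. */
static void example_send_bitmap_header(QEMUFile *f, uint32_t flags,
                                       const char *node_name,
                                       const char *bitmap_name)
{
    qemu_put_byte(f, flags);

    if (flags & DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME) {
        qemu_put_counted_string(f, node_name);   /* 1 byte size + name */
    }
    if (flags & DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME) {
        qemu_put_counted_string(f, bitmap_name); /* 1 byte size + name */
    }
}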

[Qemu-block] [PATCH v10 00/12] Dirty bitmaps postcopy migration

2018-02-07 Thread Vladimir Sementsov-Ogievskiy
Hi all!

There is a new version of dirty bitmap postcopy migration series.

Now it is based on Max's block tree: 
https://github.com/XanClic/qemu/commits/block,
where it needs only one patch: "block: maintain persistent disabled bitmaps",
but I hope it is near to be merged.

v10

clone: tag postcopy-v10 from https://src.openvz.org/scm/~vsementsov/qemu.git
online: 
https://src.openvz.org/users/vsementsov/repos/qemu/browse?at=postcopy-v10

01,02: r-b Fam
03: adjust comments about locking
04: fixed 124 iotest (was broken because of small mistake in 
block/dirty-bitmap.c)
05: rebased on master, staff from migration_thread is moved to 
migration_iteration_run, so
drop r-b by John and Juan
06: 2.11->2.12, r-b Fam
07,08,09,: r-b Fam

10: move to device names instead of node names, looks like libvirt doesn't care
about same node-names.
flag AUTOLOAD is ignored for now
use QEMU_ALIGN_UP and DIV_ROUND_UP
skip automatically inserted nodes, when search for dirty bitmaps
allow migration when there are no bitmaps (see the new logic in
   dirty_bitmap_load_header with the 'nothing' variable, which avoids
extra errors)
handle return code of dirty_bitmap_load_header
avoid iteration if there are no bitmaps (see new .no_bitmaps field of 
 dirty_bitmap_mig_state)
call dirty_bitmap_mig_before_vm_start from process_incoming_migration_bh 
too,
to enable bitmaps in case of postcopy not actually started.
11: not add r-b Fam
tiny reorganisation of do_test_migration parameters: remove useless default
values and make shared_storage to be the last
disable shared storage test for now, until it will be fixed (it will be 
separate
series, more related to qcow2 than to migration)
12: r-b Fam

also, "iotests: add default node-name" is dropped, as not more needed.


v9

clone: tag postcopy-v9 from https://src.openvz.org/scm/~vsementsov/qemu.git
online: https://src.openvz.org/users/vsementsov/repos/qemu/browse?at=postcopy-v9

01: r-b John
02: was incomplete, now add here bdrv_reclaim_dirty_bitmap fix
03: new
04: new
05: r-b John
07: fix type in commit message, r-b John
09: add comment about is_active_iterate, r-b Snow and keep Juan's r-b, hope 
comment is ok
10: change copyright to Virtuozzo
reword comment at the top of the file
rewrite init_dirty_bitmap_migration, to not do same things twice (John)
  and skip _only_ unnamed bitmaps, error out for unnamed nodes (John)
use new "locked" state of bitmaps instead of frozen on source vm
do not support migrating bitmap to existent one with the same name,
  keep only create-new-bitmap way
break loop in dirty_bitmap_load_complete when bitmap is found
use bitmap locking instead of context acquire
12: rewrite, to add more cases. (note, that 169 iotest is also in my
"[PATCH v2 0/3] fix bitmaps migration through shared storage", which 
probably should
go to qemu-stable. So this patch should rewrite it, but here I make it like 
new patch,
to simplify review. When "[PATCH v2..." merged I'll rebase this on it), 
drop r-b
13: move to separate test, drop r-b


v8.1

clone: tag postcopy-v8.1 from https://src.openvz.org/scm/~vsementsov/qemu.git
online: 
https://src.openvz.org/users/vsementsov/repos/qemu/browse?at=postcopy-v8.1

05: fix compilation, add new version for cmma_save_pending too.


v8

clone: tag postcopy-v8 from https://src.openvz.org/scm/~vsementsov/qemu.git
online: https://src.openvz.org/users/vsementsov/repos/qemu/browse?at=postcopy-v8

- rebased on master
- patches 01-03 from v7 are already merged to master
- patch order is changed to make it possible to merge block/dirty-bitmap patches
  in separate if is needed
01: new patch
03: fixed to use _locked version of bdrv_release_dirty_bitmap
06: qapi-schema.json -> qapi/migration.json
2.9 -> 2.11
10: protocol changed a bit:
  instead of 1 byte "bitmap enabled flag" this byte becomes just "flags"
  and have "enabled", "persistent" and "autoloading" flags inside.
  also, make all migrated bitmaps to be not persistent (to prevent their
  storing on source vm)
14: new patch


patches status:
01-04 - are only about block/dirty-bitmap and have no r-b. Fam, John, Paolo 
(about bitmap lock),
please take a look. These patches are ok to be merged separately (but before 
05-14)
other patches are about migration
05-09 has Juan's r-b (and some of them has John's and Eric's r-bs)
10 - the main patch (dirty bitmaps migration), has no r-b.
11 - preparation for tests, not related to migration directly, has Max's r-b, 
ok to be merged
separately (but before 12-14)
12-14 - tests, 12 and 13 have Max's r-b, 14 is new


v7

clone: tag postcopy-v7 from https://src.openvz.org/scm/~vsementsov/qemu.git
online: https://src.openvz.org/users/vsementsov/repos/qemu/browse?at=postcopy-v7

- rebased on dirty-bitmap byte-based interfaces
(based on git://repo.or.cz/qemu/ericb.git branch nbd-byte-dirty-v4)
- migration o

[Qemu-block] [PATCH v10 06/12] qapi: add dirty-bitmaps migration capability

2018-02-07 Thread Vladimir Sementsov-Ogievskiy
Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: John Snow 
Reviewed-by: Eric Blake 
Reviewed-by: Juan Quintela 
Reviewed-by: Fam Zheng 
---
 qapi/migration.json   | 6 +-
 migration/migration.h | 1 +
 migration/migration.c | 9 +
 3 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/qapi/migration.json b/qapi/migration.json
index 4cd3d13158..6d8181c45f 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -352,12 +352,16 @@
 #
 # @x-multifd: Use more than one fd for migration (since 2.11)
 #
+# @dirty-bitmaps: If enabled, QEMU will migrate named dirty bitmaps.
+# (since 2.12)
+#
 # Since: 1.2
 ##
 { 'enum': 'MigrationCapability',
   'data': ['xbzrle', 'rdma-pin-all', 'auto-converge', 'zero-blocks',
'compress', 'events', 'postcopy-ram', 'x-colo', 'release-ram',
-   'block', 'return-path', 'pause-before-switchover', 'x-multifd' ] }
+   'block', 'return-path', 'pause-before-switchover', 'x-multifd',
+   'dirty-bitmaps' ] }
 
 ##
 # @MigrationCapabilityStatus:
diff --git a/migration/migration.h b/migration/migration.h
index 786d971ce2..861cdfaa96 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -203,6 +203,7 @@ bool migrate_postcopy(void);
 bool migrate_release_ram(void);
 bool migrate_postcopy_ram(void);
 bool migrate_zero_blocks(void);
+bool migrate_dirty_bitmaps(void);
 
 bool migrate_auto_converge(void);
 bool migrate_use_multifd(void);
diff --git a/migration/migration.c b/migration/migration.c
index 3beedd676e..d683c3d693 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1543,6 +1543,15 @@ int migrate_decompress_threads(void)
 return s->parameters.decompress_threads;
 }
 
+bool migrate_dirty_bitmaps(void)
+{
+MigrationState *s;
+
+s = migrate_get_current();
+
+return s->enabled_capabilities[MIGRATION_CAPABILITY_DIRTY_BITMAPS];
+}
+
 bool migrate_use_events(void)
 {
 MigrationState *s;
-- 
2.11.1
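
A hedged illustration of how migration code can gate on the new capability: a
SaveVMHandlers .is_active-style callback that keeps the dirty-bitmap state
inactive unless 'dirty-bitmaps' was enabled. The callback name is made up here,
and the series' real callback also checks whether any bitmaps exist:

/* Sketch: stay inactive unless the 'dirty-bitmaps' capability is enabled. */
static bool example_dirty_bitmap_is_active(void *opaque)
{
    return migrate_dirty_bitmaps();
}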




[Qemu-block] [PATCH v10 02/12] block/dirty-bitmap: fix locking in bdrv_reclaim_dirty_bitmap

2018-02-07 Thread Vladimir Sementsov-Ogievskiy
Like other setters here these functions should take a lock.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Fam Zheng 
---
 block/dirty-bitmap.c | 85 
 1 file changed, 53 insertions(+), 32 deletions(-)

diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c
index 0d0e807216..75435f6c2f 100644
--- a/block/dirty-bitmap.c
+++ b/block/dirty-bitmap.c
@@ -242,6 +242,51 @@ void bdrv_dirty_bitmap_enable_successor(BdrvDirtyBitmap 
*bitmap)
 qemu_mutex_unlock(bitmap->mutex);
 }
 
+/* Called within bdrv_dirty_bitmap_lock..unlock */
+static void bdrv_do_release_matching_dirty_bitmap_locked(
+BlockDriverState *bs, BdrvDirtyBitmap *bitmap,
+bool (*cond)(BdrvDirtyBitmap *bitmap))
+{
+BdrvDirtyBitmap *bm, *next;
+
+QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
+if ((!bitmap || bm == bitmap) && (!cond || cond(bm))) {
+assert(!bm->active_iterators);
+assert(!bdrv_dirty_bitmap_frozen(bm));
+assert(!bm->meta);
+QLIST_REMOVE(bm, list);
+hbitmap_free(bm->bitmap);
+g_free(bm->name);
+g_free(bm);
+
+if (bitmap) {
+return;
+}
+}
+}
+
+if (bitmap) {
+abort();
+}
+}
+
+/* Called with BQL taken.  */
+static void bdrv_do_release_matching_dirty_bitmap(
+BlockDriverState *bs, BdrvDirtyBitmap *bitmap,
+bool (*cond)(BdrvDirtyBitmap *bitmap))
+{
+bdrv_dirty_bitmaps_lock(bs);
+bdrv_do_release_matching_dirty_bitmap_locked(bs, bitmap, cond);
+bdrv_dirty_bitmaps_unlock(bs);
+}
+
+/* Called within bdrv_dirty_bitmap_lock..unlock */
+static void bdrv_release_dirty_bitmap_locked(BlockDriverState *bs,
+ BdrvDirtyBitmap *bitmap)
+{
+bdrv_do_release_matching_dirty_bitmap_locked(bs, bitmap, NULL);
+}
+
 /**
  * For a bitmap with a successor, yield our name to the successor,
  * delete the old bitmap, and return a handle to the new bitmap.
@@ -281,7 +326,11 @@ BdrvDirtyBitmap 
*bdrv_reclaim_dirty_bitmap(BlockDriverState *bs,
BdrvDirtyBitmap *parent,
Error **errp)
 {
-BdrvDirtyBitmap *successor = parent->successor;
+BdrvDirtyBitmap *successor;
+
+qemu_mutex_lock(parent->mutex);
+
+successor = parent->successor;
 
 if (!successor) {
 error_setg(errp, "Cannot reclaim a successor when none is present");
@@ -292,9 +341,11 @@ BdrvDirtyBitmap 
*bdrv_reclaim_dirty_bitmap(BlockDriverState *bs,
 error_setg(errp, "Merging of parent and successor bitmap failed");
 return NULL;
 }
-bdrv_release_dirty_bitmap(bs, successor);
+bdrv_release_dirty_bitmap_locked(bs, successor);
 parent->successor = NULL;
 
+qemu_mutex_unlock(parent->mutex);
+
 return parent;
 }
 
@@ -322,36 +373,6 @@ static bool bdrv_dirty_bitmap_has_name(BdrvDirtyBitmap 
*bitmap)
 }
 
 /* Called with BQL taken.  */
-static void bdrv_do_release_matching_dirty_bitmap(
-BlockDriverState *bs, BdrvDirtyBitmap *bitmap,
-bool (*cond)(BdrvDirtyBitmap *bitmap))
-{
-BdrvDirtyBitmap *bm, *next;
-bdrv_dirty_bitmaps_lock(bs);
-QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
-if ((!bitmap || bm == bitmap) && (!cond || cond(bm))) {
-assert(!bm->active_iterators);
-assert(!bdrv_dirty_bitmap_frozen(bm));
-assert(!bm->meta);
-QLIST_REMOVE(bm, list);
-hbitmap_free(bm->bitmap);
-g_free(bm->name);
-g_free(bm);
-
-if (bitmap) {
-goto out;
-}
-}
-}
-if (bitmap) {
-abort();
-}
-
-out:
-bdrv_dirty_bitmaps_unlock(bs);
-}
-
-/* Called with BQL taken.  */
 void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
 {
 bdrv_do_release_matching_dirty_bitmap(bs, bitmap, NULL);
-- 
2.11.1




[Qemu-block] [PATCH v10 04/12] dirty-bitmap: add locked state

2018-02-07 Thread Vladimir Sementsov-Ogievskiy
Add a special state in which QMP operations on the bitmap are disabled.
It is needed during bitmap migration. The "frozen" state is not
appropriate here, because it looks like the bitmap is unchanged.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 qapi/block-core.json |  5 -
 include/block/dirty-bitmap.h |  3 +++
 block/dirty-bitmap.c | 16 
 blockdev.c   | 19 +++
 4 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/qapi/block-core.json b/qapi/block-core.json
index 1d44c439e9..236586a5e1 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -426,10 +426,13 @@
 # @active: The bitmap is actively monitoring for new writes, and can be 
cleared,
 #  deleted, or used for backup operations.
 #
+# @locked: The bitmap is currently in-use by some operation and can not be
+#  cleared, deleted, or used for backup operations. (Since 2.12)
+#
 # Since: 2.4
 ##
 { 'enum': 'DirtyBitmapStatus',
-  'data': ['active', 'disabled', 'frozen'] }
+  'data': ['active', 'disabled', 'frozen', 'locked'] }
 
 ##
 # @BlockDirtyInfo:
diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h
index b2e1fd913a..9daaf38897 100644
--- a/include/block/dirty-bitmap.h
+++ b/include/block/dirty-bitmap.h
@@ -68,6 +68,8 @@ void bdrv_dirty_bitmap_deserialize_finish(BdrvDirtyBitmap 
*bitmap);
 void bdrv_dirty_bitmap_set_readonly(BdrvDirtyBitmap *bitmap, bool value);
 void bdrv_dirty_bitmap_set_persistance(BdrvDirtyBitmap *bitmap,
bool persistent);
+void bdrv_dirty_bitmap_set_qmp_locked(BdrvDirtyBitmap *bitmap, bool 
qmp_locked);
+
 
 /* Functions that require manual locking.  */
 void bdrv_dirty_bitmap_lock(BdrvDirtyBitmap *bitmap);
@@ -87,6 +89,7 @@ bool bdrv_dirty_bitmap_readonly(const BdrvDirtyBitmap 
*bitmap);
 bool bdrv_has_readonly_bitmaps(BlockDriverState *bs);
 bool bdrv_dirty_bitmap_get_autoload(const BdrvDirtyBitmap *bitmap);
 bool bdrv_dirty_bitmap_get_persistance(BdrvDirtyBitmap *bitmap);
+bool bdrv_dirty_bitmap_qmp_locked(BdrvDirtyBitmap *bitmap);
 bool bdrv_has_changed_persistent_bitmaps(BlockDriverState *bs);
 BdrvDirtyBitmap *bdrv_dirty_bitmap_next(BlockDriverState *bs,
 BdrvDirtyBitmap *bitmap);
diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c
index ce00ff3474..221bcb9996 100644
--- a/block/dirty-bitmap.c
+++ b/block/dirty-bitmap.c
@@ -40,6 +40,8 @@ struct BdrvDirtyBitmap {
 QemuMutex *mutex;
 HBitmap *bitmap;/* Dirty bitmap implementation */
 HBitmap *meta;  /* Meta dirty bitmap */
+bool qmp_locked;/* Bitmap is frozen, it can't be modified
+   through QMP */
 BdrvDirtyBitmap *successor; /* Anonymous child; implies frozen status */
 char *name; /* Optional non-empty unique ID */
 int64_t size;   /* Size of the bitmap, in bytes */
@@ -183,6 +185,18 @@ bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap)
 return bitmap->successor;
 }
 
+void bdrv_dirty_bitmap_set_qmp_locked(BdrvDirtyBitmap *bitmap, bool qmp_locked)
+{
+qemu_mutex_lock(bitmap->mutex);
+bitmap->qmp_locked = qmp_locked;
+qemu_mutex_unlock(bitmap->mutex);
+}
+
+bool bdrv_dirty_bitmap_qmp_locked(BdrvDirtyBitmap *bitmap)
+{
+return bitmap->qmp_locked;
+}
+
 /* Called with BQL taken.  */
 bool bdrv_dirty_bitmap_enabled(BdrvDirtyBitmap *bitmap)
 {
@@ -194,6 +208,8 @@ DirtyBitmapStatus bdrv_dirty_bitmap_status(BdrvDirtyBitmap 
*bitmap)
 {
 if (bdrv_dirty_bitmap_frozen(bitmap)) {
 return DIRTY_BITMAP_STATUS_FROZEN;
+} else if (bdrv_dirty_bitmap_qmp_locked(bitmap)) {
+return DIRTY_BITMAP_STATUS_LOCKED;
 } else if (!bdrv_dirty_bitmap_enabled(bitmap)) {
 return DIRTY_BITMAP_STATUS_DISABLED;
 } else {
diff --git a/blockdev.c b/blockdev.c
index a1dc39345a..2cbe9ff8ea 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -2113,6 +2113,9 @@ static void 
block_dirty_bitmap_clear_prepare(BlkActionState *common,
 if (bdrv_dirty_bitmap_frozen(state->bitmap)) {
 error_setg(errp, "Cannot modify a frozen bitmap");
 return;
+} else if (bdrv_dirty_bitmap_qmp_locked(state->bitmap)) {
+error_setg(errp, "Cannot modify a locked bitmap");
+return;
 } else if (!bdrv_dirty_bitmap_enabled(state->bitmap)) {
 error_setg(errp, "Cannot clear a disabled bitmap");
 return;
@@ -2857,6 +2860,11 @@ void qmp_block_dirty_bitmap_remove(const char *node, 
const char *name,
"Bitmap '%s' is currently frozen and cannot be removed",
name);
 return;
+} else if (bdrv_dirty_bitmap_qmp_locked(bitmap)) {
+error_setg(errp,
+   "Bitmap '%s' is currently locked and cannot be removed",
+   name);
+return;
 }
 
 if (bdrv_dirty_bitmap_get_persistance(bitmap)) {
@@ -2891,6 +2899,11 @@ void qmp_
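
A minimal sketch of the intended use on the migration side: take the QMP lock
for as long as a bitmap's data is in flight so it cannot be cleared or removed
underneath the stream. The start/finish function names are illustrative; only
bdrv_dirty_bitmap_set_qmp_locked() comes from the patch above:

/* Sketch: lock a migrated bitmap against QMP modification while in flight. */
static void example_bitmap_mig_start(BdrvDirtyBitmap *bitmap)
{
    bdrv_dirty_bitmap_set_qmp_locked(bitmap, true);
}

static void example_bitmap_mig_finish(BdrvDirtyBitmap *bitmap)
{
    bdrv_dirty_bitmap_set_qmp_locked(bitmap, false);
}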

[Qemu-block] [PATCH v10 12/12] iotests: add dirty bitmap postcopy test

2018-02-07 Thread Vladimir Sementsov-Ogievskiy
Test
- start two vms (vm_a, vm_b)

- in a
- do writes from set A
- do writes from set B
- fix bitmap sha256
- clear bitmap
- do writes from set A
- start migration
- then, in b
- wait vm start (postcopy should start)
- do writes from set B
- check bitmap sha256

The test should verify postcopy migration and the subsequent merge with the
delta (changes made in the target during the postcopy process).

Reduce supported cache modes to only 'none', because with caching enabled the
time from source.STOP to target.RESUME is unpredictable and we can fail with a
timeout while waiting for target.RESUME.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Fam Zheng 
---
 tests/qemu-iotests/199| 105 ++
 tests/qemu-iotests/199.out|   5 ++
 tests/qemu-iotests/group  |   1 +
 tests/qemu-iotests/iotests.py |   7 ++-
 4 files changed, 117 insertions(+), 1 deletion(-)
 create mode 100755 tests/qemu-iotests/199
 create mode 100644 tests/qemu-iotests/199.out

diff --git a/tests/qemu-iotests/199 b/tests/qemu-iotests/199
new file mode 100755
index 00..f872040a81
--- /dev/null
+++ b/tests/qemu-iotests/199
@@ -0,0 +1,105 @@
+#!/usr/bin/env python
+#
+# Tests for dirty bitmaps postcopy migration.
+#
+# Copyright (c) 2016-2017 Virtuozzo International GmbH. All rights reserved.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see .
+#
+
+import os
+import iotests
+import time
+from iotests import qemu_img
+
+disk_a = os.path.join(iotests.test_dir, 'disk_a')
+disk_b = os.path.join(iotests.test_dir, 'disk_b')
+size = '256G'
+fifo = os.path.join(iotests.test_dir, 'mig_fifo')
+
+class TestDirtyBitmapPostcopyMigration(iotests.QMPTestCase):
+
+def tearDown(self):
+self.vm_a.shutdown()
+self.vm_b.shutdown()
+os.remove(disk_a)
+os.remove(disk_b)
+os.remove(fifo)
+
+def setUp(self):
+os.mkfifo(fifo)
+qemu_img('create', '-f', iotests.imgfmt, disk_a, size)
+qemu_img('create', '-f', iotests.imgfmt, disk_b, size)
+self.vm_a = iotests.VM(path_suffix='a').add_drive(disk_a)
+self.vm_b = iotests.VM(path_suffix='b').add_drive(disk_b)
+self.vm_b.add_incoming("exec: cat '" + fifo + "'")
+self.vm_a.launch()
+self.vm_b.launch()
+
+def test_postcopy(self):
+write_size = 0x4000
+granularity = 512
+chunk = 4096
+
+result = self.vm_a.qmp('block-dirty-bitmap-add', node='drive0',
+   name='bitmap', granularity=granularity)
+self.assert_qmp(result, 'return', {});
+
+s = 0
+while s < write_size:
+self.vm_a.hmp_qemu_io('drive0', 'write %d %d' % (s, chunk))
+s += 0x1
+s = 0x8000
+while s < write_size:
+self.vm_a.hmp_qemu_io('drive0', 'write %d %d' % (s, chunk))
+s += 0x1
+
+result = self.vm_a.qmp('x-debug-block-dirty-bitmap-sha256',
+   node='drive0', name='bitmap')
+sha256 = result['return']['sha256']
+
+result = self.vm_a.qmp('block-dirty-bitmap-clear', node='drive0',
+   name='bitmap')
+self.assert_qmp(result, 'return', {});
+s = 0
+while s < write_size:
+self.vm_a.hmp_qemu_io('drive0', 'write %d %d' % (s, chunk))
+s += 0x1
+
+result = self.vm_a.qmp('migrate-set-capabilities',
+   capabilities=[{'capability': 'dirty-bitmaps',
+  'state': True}])
+self.assert_qmp(result, 'return', {})
+
+result = self.vm_a.qmp('migrate', uri='exec:cat>' + fifo)
+self.assertNotEqual(self.vm_a.event_wait("STOP"), None)
+self.assertNotEqual(self.vm_b.event_wait("RESUME"), None)
+
+s = 0x8000
+while s < write_size:
+self.vm_b.hmp_qemu_io('drive0', 'write %d %d' % (s, chunk))
+s += 0x1
+
+result = self.vm_b.qmp('query-block');
+while len(result['return'][0]['dirty-bitmaps']) > 1:
+time.sleep(2)
+result = self.vm_b.qmp('query-block');
+
+result = self.vm_b.qmp('x-debug-block-dirty-bitmap-sha256',
+   node='drive0', name='bitmap')
+
+self.assert_qmp(result, 'return/sha256', sha256);
+
+if __name__ =

[Qemu-block] [PATCH v10 07/12] migration: include migrate_dirty_bitmaps in migrate_postcopy

2018-02-07 Thread Vladimir Sementsov-Ogievskiy
Enable postcopy if dirty bitmap migration is enabled.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Juan Quintela 
Reviewed-by: John Snow 
Reviewed-by: Fam Zheng 
---
 migration/migration.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/migration/migration.c b/migration/migration.c
index d683c3d693..d09f53d6c3 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1486,7 +1486,7 @@ bool migrate_postcopy_ram(void)
 
 bool migrate_postcopy(void)
 {
-return migrate_postcopy_ram();
+return migrate_postcopy_ram() || migrate_dirty_bitmaps();
 }
 
 bool migrate_auto_converge(void)
-- 
2.11.1




[Qemu-block] [PATCH v10 05/12] migration: introduce postcopy-only pending

2018-02-07 Thread Vladimir Sementsov-Ogievskiy
There are savevm states (dirty-bitmap) which can migrate only in the
postcopy stage. The corresponding pending counter is introduced here.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 include/migration/register.h | 17 +++--
 migration/savevm.h   |  5 +++--
 hw/s390x/s390-stattrib.c |  7 ---
 migration/block.c|  7 ---
 migration/migration.c| 16 +---
 migration/ram.c  |  9 +
 migration/savevm.c   | 13 -
 migration/trace-events   |  2 +-
 8 files changed, 49 insertions(+), 27 deletions(-)

diff --git a/include/migration/register.h b/include/migration/register.h
index f4f7bdc177..9436a87678 100644
--- a/include/migration/register.h
+++ b/include/migration/register.h
@@ -37,8 +37,21 @@ typedef struct SaveVMHandlers {
 int (*save_setup)(QEMUFile *f, void *opaque);
 void (*save_live_pending)(QEMUFile *f, void *opaque,
   uint64_t threshold_size,
-  uint64_t *non_postcopiable_pending,
-  uint64_t *postcopiable_pending);
+  uint64_t *res_precopy_only,
+  uint64_t *res_compatible,
+  uint64_t *res_postcopy_only);
+/* Note for save_live_pending:
+ * - res_precopy_only is for data which must be migrated in precopy phase
+ * or in stopped state, in other words - before target vm start
+ * - res_compatible is for data which may be migrated in any phase
+ * - res_postcopy_only is for data which must be migrated in postcopy phase
+ * or in stopped state, in other words - after source vm stop
+ *
+ * Sum of res_precopy_only, res_compatible and res_postcopy_only is the
+ * whole amount of pending data.
+ */
+
+
 LoadStateHandler *load_state;
 int (*load_setup)(QEMUFile *f, void *opaque);
 int (*load_cleanup)(void *opaque);
diff --git a/migration/savevm.h b/migration/savevm.h
index 295c4a1f2c..cf4f0d37ca 100644
--- a/migration/savevm.h
+++ b/migration/savevm.h
@@ -38,8 +38,9 @@ void qemu_savevm_state_complete_postcopy(QEMUFile *f);
 int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only,
bool inactivate_disks);
 void qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size,
-   uint64_t *res_non_postcopiable,
-   uint64_t *res_postcopiable);
+   uint64_t *res_precopy_only,
+   uint64_t *res_compatible,
+   uint64_t *res_postcopy_only);
 void qemu_savevm_send_ping(QEMUFile *f, uint32_t value);
 void qemu_savevm_send_open_return_path(QEMUFile *f);
 int qemu_savevm_send_packaged(QEMUFile *f, const uint8_t *buf, size_t len);
diff --git a/hw/s390x/s390-stattrib.c b/hw/s390x/s390-stattrib.c
index 2902f54f11..dd3fbfd1eb 100644
--- a/hw/s390x/s390-stattrib.c
+++ b/hw/s390x/s390-stattrib.c
@@ -183,15 +183,16 @@ static int cmma_save_setup(QEMUFile *f, void *opaque)
 }
 
 static void cmma_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
- uint64_t *non_postcopiable_pending,
- uint64_t *postcopiable_pending)
+  uint64_t *res_precopy_only,
+  uint64_t *res_compatible,
+  uint64_t *res_postcopy_only)
 {
 S390StAttribState *sas = S390_STATTRIB(opaque);
 S390StAttribClass *sac = S390_STATTRIB_GET_CLASS(sas);
 long long res = sac->get_dirtycount(sas);
 
 if (res >= 0) {
-*non_postcopiable_pending += res;
+*res_precopy_only += res;
 }
 }
 
diff --git a/migration/block.c b/migration/block.c
index 1f03946797..5652ca3337 100644
--- a/migration/block.c
+++ b/migration/block.c
@@ -866,8 +866,9 @@ static int block_save_complete(QEMUFile *f, void *opaque)
 }
 
 static void block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
-   uint64_t *non_postcopiable_pending,
-   uint64_t *postcopiable_pending)
+   uint64_t *res_precopy_only,
+   uint64_t *res_compatible,
+   uint64_t *res_postcopy_only)
 {
 /* Estimate pending number of bytes to send */
 uint64_t pending;
@@ -888,7 +889,7 @@ static void block_save_pending(QEMUFile *f, void *opaque, 
uint64_t max_size,
 
 DPRINTF("Enter save live pending  %" PRIu64 "\n", pending);
 /* We don't do postcopy */
-*non_postcopiable_pending += pending;
+*res_precopy_only += pending;
 }
 
 static int block_load(QEMUFile *f, void *opaque, int version_id)
diff --git a/migration/migration.c b/migration/migration.c
index c99a4e62d7..3beedd676e 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -2215,21 +2215,23 @@ typedef 
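
For contrast with the precopy-only users updated above (cmma and block), a
hedged sketch of how a postcopy-only user such as the dirty-bitmap code is
expected to fill the new argument; example_pending_bytes() is an illustrative
stub, not something defined in this series:

/* Illustrative helper: how much bitmap data is still queued (stubbed). */
static uint64_t example_pending_bytes(void *opaque)
{
    return 0;
}

/* Sketch: report all pending data as postcopy-only. */
static void example_save_pending(QEMUFile *f, void *opaque,
                                 uint64_t threshold_size,
                                 uint64_t *res_precopy_only,
                                 uint64_t *res_compatible,
                                 uint64_t *res_postcopy_only)
{
    *res_postcopy_only += example_pending_bytes(opaque);
}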

[Qemu-block] [PATCH v10 09/12] migration: add is_active_iterate handler

2018-02-07 Thread Vladimir Sementsov-Ogievskiy
Postcopy-only savevm states (dirty-bitmap) don't need live iteration, so,
to disable it for them and stop transmitting empty sections, there is a new
savevm handler.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Juan Quintela 
Reviewed-by: John Snow 
Reviewed-by: Fam Zheng 
---
 include/migration/register.h | 9 +
 migration/savevm.c   | 5 +
 2 files changed, 14 insertions(+)

diff --git a/include/migration/register.h b/include/migration/register.h
index 9436a87678..f6f12f9b1a 100644
--- a/include/migration/register.h
+++ b/include/migration/register.h
@@ -26,6 +26,15 @@ typedef struct SaveVMHandlers {
 bool (*is_active)(void *opaque);
 bool (*has_postcopy)(void *opaque);
 
+/* is_active_iterate
+ * If it is not NULL then qemu_savevm_state_iterate will skip iteration if
+ * it returns false. For example, it is needed for postcopy-only states,
+ * which need to be handled by qemu_savevm_state_setup and
+ * qemu_savevm_state_pending, but do not need iterations until the
+ * postcopy stage.
+ */
+bool (*is_active_iterate)(void *opaque);
+
 /* This runs outside the iothread lock in the migration case, and
  * within the lock in the savevm case.  The callback had better only
  * use data that is local to the migration thread or protected
diff --git a/migration/savevm.c b/migration/savevm.c
index c3c60a15e3..e5d557458e 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -1026,6 +1026,11 @@ int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy)
 continue;
 }
 }
+if (se->ops && se->ops->is_active_iterate) {
+if (!se->ops->is_active_iterate(se->opaque)) {
+continue;
+}
+}
 /*
  * In the postcopy phase, any device that doesn't know how to
  * do postcopy should have saved it's state in the _complete
-- 
2.11.1
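
A hedged sketch of what such a handler might look like for a postcopy-only
state, assuming a migration_in_postcopy()-style query is available; this is
illustrative and not necessarily the series' actual implementation:

/* Sketch: skip live iteration until the postcopy phase has started. */
static bool example_is_active_iterate(void *opaque)
{
    return migration_in_postcopy();
}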




[Qemu-block] [PATCH v10 01/12] block/dirty-bitmap: add bdrv_dirty_bitmap_enable_successor()

2018-02-07 Thread Vladimir Sementsov-Ogievskiy
Enabling a bitmap's successor is necessary to enable the successors of
bitmaps being migrated before the target VM starts.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: John Snow 
Reviewed-by: Fam Zheng 
---
 include/block/dirty-bitmap.h | 1 +
 block/dirty-bitmap.c | 8 
 2 files changed, 9 insertions(+)

diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h
index 89dc50946b..93e128f81b 100644
--- a/include/block/dirty-bitmap.h
+++ b/include/block/dirty-bitmap.h
@@ -20,6 +20,7 @@ BdrvDirtyBitmap *bdrv_dirty_bitmap_abdicate(BlockDriverState 
*bs,
 BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap(BlockDriverState *bs,
BdrvDirtyBitmap *bitmap,
Error **errp);
+void bdrv_dirty_bitmap_enable_successor(BdrvDirtyBitmap *bitmap);
 BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs,
 const char *name);
 void bdrv_dirty_bitmap_make_anon(BdrvDirtyBitmap *bitmap);
diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c
index 909f0517f8..0d0e807216 100644
--- a/block/dirty-bitmap.c
+++ b/block/dirty-bitmap.c
@@ -234,6 +234,14 @@ int bdrv_dirty_bitmap_create_successor(BlockDriverState 
*bs,
 return 0;
 }
 
+/* Called with BQL taken. */
+void bdrv_dirty_bitmap_enable_successor(BdrvDirtyBitmap *bitmap)
+{
+qemu_mutex_lock(bitmap->mutex);
+bdrv_enable_dirty_bitmap(bitmap->successor);
+qemu_mutex_unlock(bitmap->mutex);
+}
+
 /**
  * For a bitmap with a successor, yield our name to the successor,
  * delete the old bitmap, and return a handle to the new bitmap.
-- 
2.11.1




[Qemu-block] [PATCH v10 11/12] iotests: add dirty bitmap migration test

2018-02-07 Thread Vladimir Sementsov-Ogievskiy
The test starts two VMs (vm_a, vm_b), creates a dirty bitmap in
the first one, does several writes to the corresponding device and
then migrates vm_a to vm_b.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 tests/qemu-iotests/169 | 141 +
 tests/qemu-iotests/169.out |   5 ++
 tests/qemu-iotests/group   |   1 +
 3 files changed, 147 insertions(+)
 create mode 100755 tests/qemu-iotests/169
 create mode 100644 tests/qemu-iotests/169.out

diff --git a/tests/qemu-iotests/169 b/tests/qemu-iotests/169
new file mode 100755
index 00..8801f73b10
--- /dev/null
+++ b/tests/qemu-iotests/169
@@ -0,0 +1,141 @@
+#!/usr/bin/env python
+#
+# Tests for dirty bitmaps migration.
+#
+# Copyright (c) 2016-2017 Virtuozzo International GmbH. All rights reserved.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see .
+#
+
+import os
+import iotests
+import time
+import itertools
+import operator
+import new
+from iotests import qemu_img
+
+
+disk_a = os.path.join(iotests.test_dir, 'disk_a')
+disk_b = os.path.join(iotests.test_dir, 'disk_b')
+size = '1M'
+mig_file = os.path.join(iotests.test_dir, 'mig_file')
+
+
+class TestDirtyBitmapMigration(iotests.QMPTestCase):
+def tearDown(self):
+self.vm_a.shutdown()
+self.vm_b.shutdown()
+os.remove(disk_a)
+os.remove(disk_b)
+os.remove(mig_file)
+
+def setUp(self):
+qemu_img('create', '-f', iotests.imgfmt, disk_a, size)
+qemu_img('create', '-f', iotests.imgfmt, disk_b, size)
+
+self.vm_a = iotests.VM(path_suffix='a').add_drive(disk_a)
+self.vm_a.launch()
+
+self.vm_b = iotests.VM(path_suffix='b')
+self.vm_b.add_incoming("exec: cat '" + mig_file + "'")
+
+def add_bitmap(self, vm, granularity, persistent):
+params = {'node': 'drive0',
+  'name': 'bitmap0',
+  'granularity': granularity}
+if persistent:
+params['persistent'] = True
+params['autoload'] = True
+
+result = vm.qmp('block-dirty-bitmap-add', **params)
+self.assert_qmp(result, 'return', {});
+
+def get_bitmap_hash(self, vm):
+result = vm.qmp('x-debug-block-dirty-bitmap-sha256',
+node='drive0', name='bitmap0')
+return result['return']['sha256']
+
+def check_bitmap(self, vm, sha256):
+result = vm.qmp('x-debug-block-dirty-bitmap-sha256',
+node='drive0', name='bitmap0')
+if sha256:
+self.assert_qmp(result, 'return/sha256', sha256);
+else:
+self.assert_qmp(result, 'error/desc',
+"Dirty bitmap 'bitmap0' not found");
+
+def do_test_migration(self, persistent, migrate_bitmaps, online,
+  shared_storage):
+granularity = 512
+
+# regions = ((start, count), ...)
+regions = ((0, 0x1),
+   (0xf, 0x1),
+   (0xa0201, 0x1000))
+
+should_migrate = migrate_bitmaps or persistent and shared_storage
+
+self.vm_b.add_drive(disk_a if shared_storage else disk_b)
+
+if online:
+os.mkfifo(mig_file)
+self.vm_b.launch()
+
+self.add_bitmap(self.vm_a, granularity, persistent)
+for r in regions:
+self.vm_a.hmp_qemu_io('drive0', 'write %d %d' % r)
+sha256 = self.get_bitmap_hash(self.vm_a)
+
+if migrate_bitmaps:
+result = self.vm_a.qmp('migrate-set-capabilities',
+   capabilities=[{'capability': 
'dirty-bitmaps',
+  'state': True}])
+self.assert_qmp(result, 'return', {})
+
+result = self.vm_a.qmp('migrate', uri='exec:cat>' + mig_file)
+self.vm_a.event_wait("STOP")
+
+if not online:
+self.vm_a.shutdown()
+self.vm_b.launch()
+
+self.vm_b.event_wait("RESUME", timeout=10.0)
+
+self.check_bitmap(self.vm_b, sha256 if should_migrate else False)
+
+if should_migrate:
+self.vm_b.shutdown()
+self.vm_b.launch()
+self.check_bitmap(self.vm_b, sha256 if persistent else False)
+
+
+def inject_test_case(klass, name, method, *args, **kwargs):
+mc = operator.methodcaller(method, *args, **kwargs)
+setattr(klass, 'test_' + name

[Qemu-block] [PATCH v10 08/12] migration/qemu-file: add qemu_put_counted_string()

2018-02-07 Thread Vladimir Sementsov-Ogievskiy
Add a function opposite to qemu_get_counted_string.
qemu_put_counted_string puts a one-byte length of the string (the string
must not be longer than 255 characters), and then puts the string itself,
without the trailing zero byte.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: John Snow 
Reviewed-by: Juan Quintela 
Reviewed-by: Fam Zheng 
---
 migration/qemu-file.h |  2 ++
 migration/qemu-file.c | 13 +
 2 files changed, 15 insertions(+)

diff --git a/migration/qemu-file.h b/migration/qemu-file.h
index aae4e5ed36..f4f356ab12 100644
--- a/migration/qemu-file.h
+++ b/migration/qemu-file.h
@@ -174,4 +174,6 @@ size_t ram_control_save_page(QEMUFile *f, ram_addr_t 
block_offset,
  ram_addr_t offset, size_t size,
  uint64_t *bytes_sent);
 
+void qemu_put_counted_string(QEMUFile *f, const char *name);
+
 #endif
diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index 2ab2bf362d..e85f501f86 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -734,6 +734,19 @@ size_t qemu_get_counted_string(QEMUFile *f, char buf[256])
 }
 
 /*
+ * Put a string with one preceding byte containing its length. The length of
+ * the string should be less than 256.
+ */
+void qemu_put_counted_string(QEMUFile *f, const char *str)
+{
+size_t len = strlen(str);
+
+assert(len < 256);
+qemu_put_byte(f, len);
+qemu_put_buffer(f, (const uint8_t *)str, len);
+}
+
+/*
  * Set the blocking state of the QEMUFile.
  * Note: On some transports the OS only keeps a single blocking state for
  *   both directions, and thus changing the blocking on the main
-- 
2.11.1
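
A short hedged sketch of how the new function pairs with the existing
qemu_get_counted_string() on the receiving side; the wrapper names are
illustrative:

/* Sketch: write a name on the source, read it back on the destination. */
static void example_put_name(QEMUFile *f, const char *name)
{
    qemu_put_counted_string(f, name);   /* 1 length byte + bytes, no NUL */
}

static int example_get_name(QEMUFile *f, char buf[256])
{
    /* qemu_get_counted_string() returns the length; treat 0 as failure. */
    return qemu_get_counted_string(f, buf) ? 0 : -1;
}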




[Qemu-block] [PATCH v10 03/12] block/dirty-bitmap: add _locked version of bdrv_reclaim_dirty_bitmap

2018-02-07 Thread Vladimir Sementsov-Ogievskiy
Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 include/block/dirty-bitmap.h |  3 +++
 block/dirty-bitmap.c | 28 ++--
 2 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h
index 93e128f81b..b2e1fd913a 100644
--- a/include/block/dirty-bitmap.h
+++ b/include/block/dirty-bitmap.h
@@ -92,5 +92,8 @@ BdrvDirtyBitmap *bdrv_dirty_bitmap_next(BlockDriverState *bs,
 BdrvDirtyBitmap *bitmap);
 char *bdrv_dirty_bitmap_sha256(const BdrvDirtyBitmap *bitmap, Error **errp);
 int64_t bdrv_dirty_bitmap_next_zero(BdrvDirtyBitmap *bitmap, uint64_t start);
+BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap_locked(BlockDriverState *bs,
+  BdrvDirtyBitmap *bitmap,
+  Error **errp);
 
 #endif
diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c
index 75435f6c2f..ce00ff3474 100644
--- a/block/dirty-bitmap.c
+++ b/block/dirty-bitmap.c
@@ -320,17 +320,13 @@ BdrvDirtyBitmap 
*bdrv_dirty_bitmap_abdicate(BlockDriverState *bs,
  * In cases of failure where we can no longer safely delete the parent,
  * we may wish to re-join the parent and child/successor.
  * The merged parent will be un-frozen, but not explicitly re-enabled.
- * Called with BQL taken.
+ * Called within bdrv_dirty_bitmap_lock..unlock and with BQL taken.
  */
-BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap(BlockDriverState *bs,
-   BdrvDirtyBitmap *parent,
-   Error **errp)
+BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap_locked(BlockDriverState *bs,
+  BdrvDirtyBitmap *parent,
+  Error **errp)
 {
-BdrvDirtyBitmap *successor;
-
-qemu_mutex_lock(parent->mutex);
-
-successor = parent->successor;
+BdrvDirtyBitmap *successor = parent->successor;
 
 if (!successor) {
 error_setg(errp, "Cannot reclaim a successor when none is present");
@@ -344,9 +340,21 @@ BdrvDirtyBitmap 
*bdrv_reclaim_dirty_bitmap(BlockDriverState *bs,
 bdrv_release_dirty_bitmap_locked(bs, successor);
 parent->successor = NULL;
 
+return parent;
+}
+
+/* Called with BQL taken. */
+BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap(BlockDriverState *bs,
+   BdrvDirtyBitmap *parent,
+   Error **errp)
+{
+BdrvDirtyBitmap *ret;
+
+qemu_mutex_lock(parent->mutex);
+ret = bdrv_reclaim_dirty_bitmap_locked(bs, parent, errp);
 qemu_mutex_unlock(parent->mutex);
 
-return parent;
+return ret;
 }
 
 /**
-- 
2.11.1
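
A brief hedged usage sketch of the new _locked variant for a caller that wants
to combine the reclaim with other work under a single bitmap lock; the wrapper
and the elided operations are illustrative:

/* Sketch: reclaim under an externally held bitmap lock. */
static void example_reclaim_under_lock(BlockDriverState *bs,
                                       BdrvDirtyBitmap *parent, Error **errp)
{
    bdrv_dirty_bitmap_lock(parent);
    /* ... other operations protected by the same lock ... */
    bdrv_reclaim_dirty_bitmap_locked(bs, parent, errp);
    bdrv_dirty_bitmap_unlock(parent);
}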




Re: [Qemu-block] [PATCH] ratelimit: don't align wait time with slices

2018-02-07 Thread Stefan Hajnoczi
On Wed, Feb 07, 2018 at 08:17:58AM +0100, Wolfgang Bumiller wrote:
> It is possible for rate limited writes to keep overshooting a slice's
> quota by a tiny amount causing the slice-aligned waiting period to
> effectively halve the rate.
> 
> Signed-off-by: Wolfgang Bumiller 
> ---
> Copied the Ccs from the discussion thread, hope that's fine, as I also
> just noticed that for my reply containing this snippet I had hit reply
> on the mail that did not contain those Ccs yet, sorry about that.
> 
>  include/qemu/ratelimit.h | 11 +--
>  1 file changed, 5 insertions(+), 6 deletions(-)

Thanks, applied to my block tree:
https://github.com/stefanha/qemu/commits/block

Stefan




Re: [Qemu-block] [PATCH v2] vl: pause vcpus before stopping iothreads

2018-02-07 Thread Stefan Hajnoczi
On Thu, Feb 01, 2018 at 11:07:08AM +, Stefan Hajnoczi wrote:
> Commit dce8921b2baaf95974af8176406881872067adfa ("iothread: Stop threads
> before main() quits") introduced iothread_stop_all() to avoid the
> following virtio-scsi assertion failure:
> 
>   assert(blk_get_aio_context(d->conf.blk) == s->ctx);
> 
> Back then the assertion failed because when bdrv_close_all() made
> d->conf.blk NULL, blk_get_aio_context() returned the global AioContext
> instead of s->ctx.
> 
> The same assertion can still fail today when vcpus submit new I/O
> requests after iothread_stop_all() has moved the BDS to the global
> AioContext.
> 
> This patch hardens the iothread_stop_all() approach by pausing vcpus
> before calling iothread_stop_all().
> 
> Note that the assertion failure is a race condition.  It is not possible
> to reproduce it reliably.
> 
> Signed-off-by: Stefan Hajnoczi 
> ---
> v2:
>  * Add comment explaining rationale for this ordering and why it's safe.
>[Kevin]
> 
>  vl.c | 12 ++--
>  1 file changed, 10 insertions(+), 2 deletions(-)

Thanks, applied to my block tree:
https://github.com/stefanha/qemu/commits/block

Stefan




Re: [Qemu-block] [PATCH v2 7/8] file-posix: account discard operations

2018-02-07 Thread Alberto Garcia
On Fri 19 Jan 2018 01:50:06 PM CET, Anton Nefedov wrote:
> This will help to identify how many of the user-issued discard operations
> (accounted on a device level) have actually suceeded down on the host file
> (even though the numbers will not be exactly the same if non-raw format
> driver is used (e.g. qcow2 sending metadata discards)).
>
> Signed-off-by: Anton Nefedov 
> Reviewed-by: Vladimir Sementsov-Ogievskiy 
> ---
>  block/file-posix.c | 21 +++--
>  1 file changed, 19 insertions(+), 2 deletions(-)
>
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 36ee89e..544ae58 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -158,6 +158,11 @@ typedef struct BDRVRawState {
>  bool page_cache_inconsistent:1;
>  bool has_fallocate;
>  bool needs_alignment;
> +struct {
> +int64_t discard_nb_ok;
> +int64_t discard_nb_failed;
> +int64_t discard_bytes_ok;
> +} stats;

Shouldn't this new structure be defined in a header file so other
drivers can use it? Or did you define it here because you don't see that
happening soon?

The rest of the patch looks good.

Berto



Re: [Qemu-block] [PATCH v2 6/8] scsi: account unmap operations

2018-02-07 Thread Alberto Garcia
On Fri 19 Jan 2018 01:50:05 PM CET, Anton Nefedov wrote:
> Signed-off-by: Anton Nefedov 
> Reviewed-by: Vladimir Sementsov-Ogievskiy 

Reviewed-by: Alberto Garcia 

Berto



Re: [Qemu-block] [PATCH v2 5/8] scsi: move unmap error checking to the complete callback

2018-02-07 Thread Alberto Garcia
On Fri 19 Jan 2018 01:50:04 PM CET, Anton Nefedov wrote:
> This will help to account the operation in the following commit.
>
> The difference is that we don't call scsi_disk_req_check_error() before
> the 1st discard iteration anymore. That function also checks if
> the request is cancelled, however it shouldn't get canceled until it
> yields in blk_aio() functions anyway.
> Same approach is already used for emulate_write_same.
>
> Signed-off-by: Anton Nefedov 
> Reviewed-by: Vladimir Sementsov-Ogievskiy 

Reviewed-by: Alberto Garcia 

> @@ -1665,7 +1662,12 @@ static void scsi_unmap_complete(void *opaque, int ret)
>  r->req.aiocb = NULL;
>  
>  aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
> -scsi_unmap_complete_noio(data, ret);
> +if (scsi_disk_req_check_error(r, ret, false)) {
> +scsi_req_unref(&r->req);
> +g_free(data);

It would be nice not to have the cleanup code duplicated here, but I
don't see any obvious alternative.

Berto



Re: [Qemu-block] [PATCH v2 3/8] ide: account UNMAP (TRIM) operations

2018-02-07 Thread Alberto Garcia
On Fri 19 Jan 2018 01:50:02 PM CET, Anton Nefedov wrote:
> Signed-off-by: Anton Nefedov 

Reviewed-by: Alberto Garcia 

Berto



Re: [Qemu-block] [PATCH for-2.9-rc5 v4 2/2] block: Drain BH in bdrv_drained_begin

2018-02-07 Thread Fam Zheng
On Wed, Feb 7, 2018 at 9:10 PM, Kevin Wolf  wrote:
> Am 07.02.2018 um 13:39 hat Fam Zheng geschrieben:
>> On Wed, Feb 7, 2018 at 6:51 PM, Kevin Wolf  wrote:
>> > Am 07.02.2018 um 02:48 hat Fam Zheng geschrieben:
>> >> On Tue, Feb 6, 2018 at 11:32 PM, Kevin Wolf  wrote:
>> >> > I got a bug assigned where we have a large (200 GB) fragmented qcow2
>> >> > image, and qemu-img convert takes two hours before it even starts to
>> >> > copy data. What it does in this time is iterating through the whole
>> >> > image with bdrv_block_status_above() in order to estimate the work to be
>> >> > done (and of course it will repeat the same calls while actually copying
>> >> > data).
>> >>
>> >> The convert_iteration_sectors loop looks wasteful. Why cannot we
>> >> report progress simply based on (offset/size) * 100% so we don't need
>> >> to do this estimation?
>> >
>> > The assumption is that it's quick and it makes the progress much more
>> > meaningful. You know those progress bars that slowly crawl towards 50%
>> > and then suddenly complete within a second. Or the first 20% are quick,
>> > but then things get really slow. They are not a great example.
>> >
>> > There must have been something wrong with that image file that was
>> > reported, because they reported that it was the only image causing
>> > trouble and if they copied it away, it became quick, too.
>> >
>> > Even for a maximally fragmented 100 GB qcow2 image it only takes about a
>> > second on my laptop. So while I don't feel as certain about the loop as
>> > before, in practice it normally doesn't seem to hurt.
>>
>> No doubt about normal cases. I was unsure about corner cases like
>> slow-ish NFS etc.
>
> Yeah, NFS seems to be a bit slow. Not two-hours-slow, but when I tried
> it with a localhost NFS server, the same operation that took two seconds
> directly from the local file system took about 40s over NFS. They seem
> to go over the network for each lseek() instead of caching the file map.
> Maybe something to fix in the NFS kernel driver.
>
>> A little bit of intelligence would be limiting the time for the loop
>> to a few seconds, for example, "IMG_CVT_EST_SCALE *
>> bdrv_getlength(bs)", or a less linear map.
>
> I don't understand. What would IMG_CVT_EST_SCALE be? Isn't the problem
> that this isn't a constant factor but can be anywhere between 0% and
> 100% depending on the specific image?

This is going to be quite arbitrary, just to make sure we don't waste
a very long time on maximally fragmented images, or on slow lseek()
scenarios. Something like this:

#define IMG_CVT_EST_SCALE ((1 << 40) / 30)

time_t start = time(NULL);
while (sector_num < s->total_sectors) {
if (time(NULL) - start > MAX(30, bdrv_getlength() /
IMG_CVT_EST_SCALE)) {
/* Too much time spent on counting allocation, just fall
back to bdrv_get_allocated_file_size */
s->allocated_sectors = bdrv_get_allocated_file_size(bs);
break;
}
n = convert_iteration_sectors(s, sector_num);
...
}

So we loop for at most 30 seconds (for >1TB images).

>
>> >> > One of the things I saw is that between each pair of lseek() calls, we
>> >> > also have unnecessary poll() calls, and these were introduced by this
>> >> > patch. If I modify bdrv_common_block_status_above() so that it doesn't
>> >> > poll if data.done == true already, the time needed to iterate through my
>> >> > test image is cut in half.
>> >> >
>> >> > Now, of course, I'm still only seeing a few seconds for a 100 GB image,
>> >> > so there must be more that is wrong for the reporter, but it suggests to
>> >> > me that BDRV_POLL_WHILE() shouldn't be polling unconditionally when only
>> >> > one of its users actually needs this.
>> >>
>> >> Sounds fine to me. Maybe we could add a boolean parameter to
>> >> BDRV_POLL_WHILE?
>> >
>> > Why not call aio_poll() explicitly in the BDRV_POLL_WHILE() condition in
>> > the one place that needs it?
>>
>> Yes, that is better. Do you have a patch? Or do you want me to work on one?
>
> I don't have a patch yet. If you want to write one, be my guest.
>
> Otherwise I'd just take a to-do note for when I get back to my
> bdrv_drain work. There is still one or two series left to do anyway, and
> I think it would fit in there.

Thought that so I asked, I'll leave it to you then. :)

Fam



Re: [Qemu-block] [Qemu-devel] [PATCH v2 0/2] block latency histogram

2018-02-07 Thread Vladimir Sementsov-Ogievskiy

looks strange and unrelated.

07.02.2018 16:20, no-re...@patchew.org wrote:

Hi,

This series failed build test on ppc host. Please find the details below.

Type: series
Subject: [Qemu-devel] [PATCH v2 0/2] block latency histogram
Message-id: 20180207125037.13510-1-vsement...@virtuozzo.com

=== TEST SCRIPT BEGIN ===
#!/bin/bash
# Testing script will be invoked under the git checkout with
# HEAD pointing to a commit that has the patches applied on top of "base"
# branch
set -e
echo "=== ENV ==="
env
echo "=== PACKAGES ==="
rpm -qa
echo "=== TEST BEGIN ==="
INSTALL=$PWD/install
BUILD=$PWD/build
mkdir -p $BUILD $INSTALL
SRC=$PWD
cd $BUILD
$SRC/configure --prefix=$INSTALL
make -j100
# XXX: we need reliable clean up
# make check -j100 V=1
make install
=== TEST SCRIPT END ===

Updating 3c8cf5a9c21ff8782164d1def7f44bd888713384
error: RPC failed; result=18, HTTP code = 200
fatal: The remote end hung up unexpectedly
error: Could not fetch 3c8cf5a9c21ff8782164d1def7f44bd888713384
Traceback (most recent call last):
   File "/home/patchew/patchew/patchew-cli", line 504, in test_one
 git_clone_repo(clone, r["repo"], r["head"], logf)
   File "/home/patchew/patchew/patchew-cli", line 48, in git_clone_repo
 stdout=logf, stderr=logf)
   File "/usr/lib64/python3.4/subprocess.py", line 558, in check_call
 raise CalledProcessError(retcode, cmd)
subprocess.CalledProcessError: Command '['git', 'remote', 'add', '-f', 
'--mirror=fetch', '3c8cf5a9c21ff8782164d1def7f44bd888713384', 
'https://github.com/patchew-project/qemu']' returned non-zero exit status 1



---
Email generated automatically by Patchew [http://patchew.org/].
Please send your feedback to patchew-de...@freelists.org



--
Best regards,
Vladimir




Re: [Qemu-block] [PATCH for-2.9-rc5 v4 2/2] block: Drain BH in bdrv_drained_begin

2018-02-07 Thread Kevin Wolf
Am 07.02.2018 um 13:39 hat Fam Zheng geschrieben:
> On Wed, Feb 7, 2018 at 6:51 PM, Kevin Wolf  wrote:
> > Am 07.02.2018 um 02:48 hat Fam Zheng geschrieben:
> >> On Tue, Feb 6, 2018 at 11:32 PM, Kevin Wolf  wrote:
> >> > I got a bug assigned where we have a large (200 GB) fragmented qcow2
> >> > image, and qemu-img convert takes two hours before it even starts to
> >> > copy data. What it does in this time is iterating through the whole
> >> > image with bdrv_block_status_above() in order to estimate the work to be
> >> > done (and of course it will repeat the same calls while actually copying
> >> > data).
> >>
> >> The convert_iteration_sectors loop looks wasteful. Why cannot we
> >> report progress simply based on (offset/size) * 100% so we don't need
> >> to do this estimation?
> >
> > The assumption is that it's quick and it makes the progress much more
> > meaningful. You know those progress bars that slowly crawl towards 50%
> > and then suddenly complete within a second. Or the first 20% are quick,
> > but then things get really slow. They are not a great example.
> >
> > There must have been something wrong with that image file that was
> > reported, because they reported that it was the only image causing
> > trouble and if they copied it away, it became quick, too.
> >
> > Even for a maximally fragmented 100 GB qcow2 image it only takes about a
> > second on my laptop. So while I don't feel as certain about the loop as
> > before, in practice it normally doesn't seem to hurt.
> 
> No doubt about normal cases. I was unsure about corner cases like
> slow-ish NFS etc.

Yeah, NFS seems to be a bit slow. Not two-hours-slow, but when I tried
it with a localhost NFS server, the same operation that took two seconds
directly from the local file system took about 40s over NFS. They seem
to go over the network for each lseek() instead of caching the file map.
Maybe something to fix in the NFS kernel driver.

> A little bit of intelligence would be limiting the time for the loop
> to a few seconds, for example, "IMG_CVT_EST_SCALE *
> bdrv_getlength(bs)", or a less linear map.

I don't understand. What would IMG_CVT_EST_SCALE be? Isn't the problem
that this isn't a constant factor but can be anywhere between 0% and
100% depending on the specific image?

> >> > One of the things I saw is that between each pair of lseek() calls, we
> >> > also have unnecessary poll() calls, and these were introduced by this
> >> > patch. If I modify bdrv_common_block_status_above() so that it doesn't
> >> > poll if data.done == true already, the time needed to iterate through my
> >> > test image is cut in half.
> >> >
> >> > Now, of course, I'm still only seeing a few seconds for a 100 GB image,
> >> > so there must be more that is wrong for the reporter, but it suggests to
> >> > me that BDRV_POLL_WHILE() shouldn't be polling unconditionally when only
> >> > one of its users actually needs this.
> >>
> >> Sounds fine to me. Maybe we could add a boolean parameter to
> >> BDRV_POLL_WHILE?
> >
> > Why not call aio_poll() explicitly in the BDRV_POLL_WHILE() condition in
> > the one place that needs it?
> 
> Yes, that is better. Do you have a patch? Or do you want me to work on one?

I don't have a patch yet. If you want to write one, be my guest.

Otherwise I'd just take a to-do note for when I get back to my
bdrv_drain work. There is still one or two series left to do anyway, and
I think it would fit in there.

Kevin



Re: [Qemu-block] [Qemu-devel] [PATCH v2 0/2] block latency histogram

2018-02-07 Thread no-reply
Hi,

This series failed build test on ppc host. Please find the details below.

Type: series
Subject: [Qemu-devel] [PATCH v2 0/2] block latency histogram
Message-id: 20180207125037.13510-1-vsement...@virtuozzo.com

=== TEST SCRIPT BEGIN ===
#!/bin/bash
# Testing script will be invoked under the git checkout with
# HEAD pointing to a commit that has the patches applied on top of "base"
# branch
set -e
echo "=== ENV ==="
env
echo "=== PACKAGES ==="
rpm -qa
echo "=== TEST BEGIN ==="
INSTALL=$PWD/install
BUILD=$PWD/build
mkdir -p $BUILD $INSTALL
SRC=$PWD
cd $BUILD
$SRC/configure --prefix=$INSTALL
make -j100
# XXX: we need reliable clean up
# make check -j100 V=1
make install
=== TEST SCRIPT END ===

Updating 3c8cf5a9c21ff8782164d1def7f44bd888713384
error: RPC failed; result=18, HTTP code = 200
fatal: The remote end hung up unexpectedly
error: Could not fetch 3c8cf5a9c21ff8782164d1def7f44bd888713384
Traceback (most recent call last):
  File "/home/patchew/patchew/patchew-cli", line 504, in test_one
git_clone_repo(clone, r["repo"], r["head"], logf)
  File "/home/patchew/patchew/patchew-cli", line 48, in git_clone_repo
stdout=logf, stderr=logf)
  File "/usr/lib64/python3.4/subprocess.py", line 558, in check_call
raise CalledProcessError(retcode, cmd)
subprocess.CalledProcessError: Command '['git', 'remote', 'add', '-f', 
'--mirror=fetch', '3c8cf5a9c21ff8782164d1def7f44bd888713384', 
'https://github.com/patchew-project/qemu']' returned non-zero exit status 1



---
Email generated automatically by Patchew [http://patchew.org/].
Please send your feedback to patchew-de...@freelists.org

[Qemu-block] [PATCH v2 1/2] block/accounting: introduce latency histogram

2018-02-07 Thread Vladimir Sementsov-Ogievskiy
Introduce latency histogram statistics for block devices.
For each accounted operation type, the latency region [0, +inf) is
divided into subregions by several points. Then, the number of hits
in each subregion is counted.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 include/block/accounting.h |  9 +
 block/accounting.c | 97 ++
 2 files changed, 106 insertions(+)

diff --git a/include/block/accounting.h b/include/block/accounting.h
index b833d26d6c..9679020f64 100644
--- a/include/block/accounting.h
+++ b/include/block/accounting.h
@@ -45,6 +45,12 @@ struct BlockAcctTimedStats {
 QSLIST_ENTRY(BlockAcctTimedStats) entries;
 };
 
+typedef struct BlockLatencyHistogram {
+int size;
+uint64_t *points; /* @size-1 points here (all points, except 0 and +inf) */
+uint64_t *histogram[BLOCK_MAX_IOTYPE]; /* @size elements for each type */
+} BlockLatencyHistogram;
+
 struct BlockAcctStats {
 QemuMutex lock;
 uint64_t nr_bytes[BLOCK_MAX_IOTYPE];
@@ -57,6 +63,7 @@ struct BlockAcctStats {
 QSLIST_HEAD(, BlockAcctTimedStats) intervals;
 bool account_invalid;
 bool account_failed;
+BlockLatencyHistogram latency_histogram;
 };
 
 typedef struct BlockAcctCookie {
@@ -82,5 +89,7 @@ void block_acct_merge_done(BlockAcctStats *stats, enum 
BlockAcctType type,
 int64_t block_acct_idle_time_ns(BlockAcctStats *stats);
 double block_acct_queue_depth(BlockAcctTimedStats *stats,
   enum BlockAcctType type);
+int block_latency_histogram_set(BlockAcctStats *stats, uint64List *latency);
+void block_latency_histogram_clear(BlockAcctStats *stats);
 
 #endif
diff --git a/block/accounting.c b/block/accounting.c
index 87ef5bbfaa..0051ff0c24 100644
--- a/block/accounting.c
+++ b/block/accounting.c
@@ -94,6 +94,100 @@ void block_acct_start(BlockAcctStats *stats, 
BlockAcctCookie *cookie,
 cookie->type = type;
 }
 
+/* block_latency_histogram_compare_func
+ * Compare @key with interval [@el, @el+1), where @el+1 is a next array element
+ * after @el.
+ * Return: -1 if @key < @el
+ *  0 if @key in [@el, @el+1)
+ * +1 if @key >= @el+1
+ */
+static int block_latency_histogram_compare_func(const void *key, const void 
*el)
+{
+uint64_t k = *(uint64_t *)key;
+uint64_t a = *(uint64_t *)el;
+uint64_t b = *((uint64_t *)el + 1);
+
+return k < a ? -1 : (k < b ? 0 : 1);
+}
+
+static void block_latency_histogram_account(BlockLatencyHistogram *hist,
+enum BlockAcctType type,
+int64_t latency_ns)
+{
+uint64_t *data, *pos;
+
+if (hist->points == NULL) {
+/* histogram disabled */
+return;
+}
+
+data = hist->histogram[type];
+
+if (latency_ns < hist->points[0]) {
+data[0]++;
+return;
+}
+
+if (latency_ns >= hist->points[hist->size - 2]) {
+data[hist->size - 1]++;
+return;
+}
+
+pos = bsearch(&latency_ns, hist->points, hist->size - 2,
+  sizeof(hist->points[0]),
+  block_latency_histogram_compare_func);
+assert(pos != NULL);
+
+data[pos - hist->points + 1]++;
+}
+
+int block_latency_histogram_set(BlockAcctStats *stats, uint64List *latency)
+{
+BlockLatencyHistogram *hist = &stats->latency_histogram;
+uint64List *entry;
+uint64_t *ptr;
+int i;
+uint64_t prev = 0;
+
+hist->size = 1;
+
+for (entry = latency; entry; entry = entry->next) {
+if (entry->value <= prev) {
+return -EINVAL;
+}
+hist->size++;
+prev = entry->value;
+}
+
+hist->points = g_renew(uint64_t, hist->points, hist->size - 1);
+for (entry = latency, ptr = hist->points; entry;
+ entry = entry->next, ptr++)
+{
+*ptr = entry->value;
+}
+
+for (i = 0; i < BLOCK_MAX_IOTYPE; i++) {
+hist->histogram[i] = g_renew(uint64_t, hist->histogram[i], hist->size);
+memset(hist->histogram[i], 0, hist->size * sizeof(uint64_t));
+}
+
+return 0;
+}
+
+void block_latency_histogram_clear(BlockAcctStats *stats)
+{
+BlockLatencyHistogram *hist = &stats->latency_histogram;
+int i;
+
+g_free(hist->points);
+hist->points = NULL;
+
+for (i = 0; i < BLOCK_MAX_IOTYPE; i++) {
+g_free(hist->histogram[i]);
+hist->histogram[i] = NULL;
+}
+}
+
 static void block_account_one_io(BlockAcctStats *stats, BlockAcctCookie 
*cookie,
  bool failed)
 {
@@ -116,6 +210,9 @@ static void block_account_one_io(BlockAcctStats *stats, 
BlockAcctCookie *cookie,
 stats->nr_ops[cookie->type]++;
 }
 
+block_latency_histogram_account(&stats->latency_histogram, cookie->type,
+latency_ns);
+
 if (!failed || stats->account_failed) {
 stats->total_time_ns[cookie->type] += latency_ns;
 stats->last_access_time_ns = time_ns;
-- 
2.11.1
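
As a standalone sketch of the bucket selection performed by
block_latency_histogram_account() in the patch above (the boundary values
and samples below are made up for the example; this program is not part of
the patch):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Compare @key with the interval [el, el+1), as in the patch above. */
static int compare_func(const void *key, const void *el)
{
    uint64_t k = *(const uint64_t *)key;
    uint64_t a = ((const uint64_t *)el)[0];
    uint64_t b = ((const uint64_t *)el)[1];

    return k < a ? -1 : (k < b ? 0 : 1);
}

/* Map @latency to one of n+1 buckets defined by n ascending boundaries. */
static size_t bucket_index(uint64_t latency, const uint64_t *bounds, size_t n)
{
    const uint64_t *pos;

    if (latency < bounds[0]) {
        return 0;                       /* [0, bounds[0]) */
    }
    if (latency >= bounds[n - 1]) {
        return n;                       /* [bounds[n-1], +inf) */
    }
    /* search the n-1 inner intervals [bounds[i], bounds[i+1]) */
    pos = bsearch(&latency, bounds, n - 1, sizeof(bounds[0]), compare_func);
    return (size_t)(pos - bounds) + 1;
}

int main(void)
{
    /* three boundaries -> four buckets, e.g. "latency": [50, 100, 200] */
    const uint64_t bounds[] = { 50, 100, 200 };
    const uint64_t samples[] = { 10, 50, 99, 100, 150, 200, 1000 };
    uint64_t hist[4] = { 0 };
    size_t i;

    for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
        hist[bucket_index(samples[i], bounds, 3)]++;
    }
    printf("[0,50)=%" PRIu64 " [50,100)=%" PRIu64
           " [100,200)=%" PRIu64 " [200,inf)=%" PRIu64 "\n",
           hist[0], hist[1], hist[2], hist[3]);
    return 0;
}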




[Qemu-block] [PATCH v2 2/2] qapi: add block latency histogram interface

2018-02-07 Thread Vladimir Sementsov-Ogievskiy
Set (and clear) histogram through new command
block-latency-histogram-set and show new statistics in
query-blockstats results.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 qapi/block-core.json | 73 +++-
 block/qapi.c | 31 ++
 blockdev.c   | 19 ++
 3 files changed, 122 insertions(+), 1 deletion(-)

diff --git a/qapi/block-core.json b/qapi/block-core.json
index 8225308904..74fe3fe9c4 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -451,6 +451,74 @@
'status': 'DirtyBitmapStatus'} }
 
 ##
+# @BlockLatencyHistogramInfo:
+#
+# Block latency histogram.
+#
+# @latency: list of latency points in microseconds. Matches the @latency
+#   parameter from the last call to block-latency-histogram-set for the
+#   given device.
+#
+# @read: list of read-request counts corresponding to latency region.
+#len(@read) = len(@latency) + 1
+#@read[0] corresponds to latency region [0, @latency[0])
+#for 0 < i < len(@latency): @read[i] corresponds to latency region
+#[@latency[i-1], @latency[i])
+#@read.last_element corresponds to latency region
+#[@latency.last_element, +inf)
+#
+# @write: list of write-request counts (see @read semantics)
+#
+# @flush: list of flush-request counts (see @read semantics)
+#
+# Since: 2.12
+##
+{ 'struct': 'BlockLatencyHistogramInfo',
+  'data': {'latency': ['uint64'],
+   'read': ['uint64'],
+   'write': ['uint64'],
+   'flush': ['uint64'] } }
+
+##
+# @block-latency-histogram-set:
+#
+# Remove old latency histogram (if exist) and (optionally) create a new one.
+# Add a latency histogram to block device. If a latency histogram already
+# exists for the device it will be removed and a new one created. The latency
+# histogram may be queried through query-blockstats.
+# Set, restart or clear latency histogram.
+#
+# @device: device name to set latency histogram for.
+#
+# @latency: If unspecified, the latency histogram will be removed (if exists).
+#   Otherwise it should be list of latency points in microseconds. Old
+#   latency histogram will be removed (if exists) and a new one will be
+#   created. The sequence must be ascending, elements must be greater
+#   than zero. Histogram latency regions would be
+#   [0, @latency[0]), ..., [@latency[i], @latency[i+1]), ...,
+#   [@latency.last_element, +inf)
+#
+# Returns: error if device is not found or latency arrays is invalid.
+#
+# Since: 2.12
+#
+# Example (set new latency histogram):
+#
+# -> { "execute": "block-latency-histogram-set",
+#  "arguments": { "device": "drive0",
+# "latency": [50, 100, 200] } }
+# <- { "return": {} }
+#
+# Example (remove latency histogram):
+#
+# -> { "execute": "block-latency-histogram-set",
+#  "arguments": { "device": "drive0" } }
+# <- { "return": {} }
+##
+{ 'command': 'block-latency-histogram-set',
+  'data': {'device': 'str', '*latency': ['uint64'] } }
+
+##
 # @BlockInfo:
 #
 # Block device information.  This structure describes a virtual device and
@@ -730,6 +798,8 @@
 # @timed_stats: Statistics specific to the set of previously defined
 #   intervals of time (Since 2.5)
 #
+# @latency-histogram: @BlockLatencyHistogramInfo. (Since 2.12)
+#
 # Since: 0.14.0
 ##
 { 'struct': 'BlockDeviceStats',
@@ -742,7 +812,8 @@
'failed_flush_operations': 'int', 'invalid_rd_operations': 'int',
'invalid_wr_operations': 'int', 'invalid_flush_operations': 'int',
'account_invalid': 'bool', 'account_failed': 'bool',
-   'timed_stats': ['BlockDeviceTimedStats'] } }
+   'timed_stats': ['BlockDeviceTimedStats'],
+   '*latency-histogram': 'BlockLatencyHistogramInfo' } }
 
 ##
 # @BlockStats:
diff --git a/block/qapi.c b/block/qapi.c
index fc10f0a565..715ed17a6b 100644
--- a/block/qapi.c
+++ b/block/qapi.c
@@ -389,6 +389,24 @@ static void bdrv_query_info(BlockBackend *blk, BlockInfo 
**p_info,
 qapi_free_BlockInfo(info);
 }
 
+static uint64List *uint64_list(uint64_t *list, int size)
+{
+int i;
+uint64List *out_list = NULL;
+uint64List **pout_list = &out_list;
+
+for (i = 0; i < size; i++) {
+uint64List *entry = g_new(uint64List, 1);
+entry->value = list[i];
+*pout_list = entry;
+pout_list = &entry->next;
+}
+
+*pout_list = NULL;
+
+return out_list;
+}
+
 static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk)
 {
 BlockAcctStats *stats = blk_get_stats(blk);
@@ -454,6 +472,19 @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, 
BlockBackend *blk)
 dev_stats->avg_wr_queue_depth =
 block_acct_queue_depth(ts, BLOCK_ACCT_WRITE);
 }
+
+ds->has_latency_histogram = stats->latency_histogram.points != NULL;
+if (ds->has_latency_histogram) {
+

Re: [Qemu-block] [PATCH] ratelimit: don't align wait time with slices

2018-02-07 Thread Alberto Garcia
On Wed 07 Feb 2018 08:17:58 AM CET, Wolfgang Bumiller wrote:
> It is possible for rate limited writes to keep overshooting a slice's
> quota by a tiny amount causing the slice-aligned waiting period to
> effectively halve the rate.
>
> Signed-off-by: Wolfgang Bumiller 
> ---
> Copied the Ccs from the discussion thread, hope that's fine. I also
> just noticed that for my reply containing this snippet I had hit reply
> on the mail that did not yet contain those Ccs; sorry about that.

Stefan mentioned in another e-mail that someone proposed at some point
to unify this with the code in throttle.c. We can consider it, but that
needs to be evaluated first: the other code is more complex, has extra
features (bursts) and uses a different algorithm, so it may not be worth
the effort. I can take a look in the future when I have some time.

Otherwise, your patch looks good.

Reviewed-by: Alberto Garcia 

Berto



[Qemu-block] [PATCH v2 0/2] block latency histogram

2018-02-07 Thread Vladimir Sementsov-Ogievskiy
v2:

01: add block_latency_histogram_clear()
02: fix spelling (sorry =()
some rewordings
remove histogram if latency parameter unspecified

Vladimir Sementsov-Ogievskiy (2):
  block/accounting: introduce latency histogram
  qapi: add block latency histogram interface

 qapi/block-core.json   | 73 +-
 include/block/accounting.h |  9 +
 block/accounting.c | 97 ++
 block/qapi.c   | 31 +++
 blockdev.c | 19 +
 5 files changed, 228 insertions(+), 1 deletion(-)

-- 
2.11.1




Re: [Qemu-block] [PATCH for-2.9-rc5 v4 2/2] block: Drain BH in bdrv_drained_begin

2018-02-07 Thread Fam Zheng
On Wed, Feb 7, 2018 at 6:51 PM, Kevin Wolf  wrote:
> Am 07.02.2018 um 02:48 hat Fam Zheng geschrieben:
>> On Tue, Feb 6, 2018 at 11:32 PM, Kevin Wolf  wrote:
>> > Am 18.04.2017 um 16:30 hat Fam Zheng geschrieben:
>> >> During block job completion, nothing is preventing
>> >> block_job_defer_to_main_loop_bh from being called in a nested
>> >> aio_poll(), which is a trouble, such as in this code path:
>> >>
>> >> qmp_block_commit
>> >>   commit_active_start
>> >> bdrv_reopen
>> >>   bdrv_reopen_multiple
>> >> bdrv_reopen_prepare
>> >>   bdrv_flush
>> >> aio_poll
>> >>   aio_bh_poll
>> >> aio_bh_call
>> >>   block_job_defer_to_main_loop_bh
>> >> stream_complete
>> >>   bdrv_reopen
>> >>
>> >> block_job_defer_to_main_loop_bh is the last step of the stream job,
>> >> which should have been "paused" by the bdrv_drained_begin/end in
>> >> bdrv_reopen_multiple, but it is not done because it's in the form of a
>> >> main loop BH.
>> >>
>> >> Similar to why block jobs should be paused between drained_begin and
>> >> drained_end, BHs they schedule must be excluded as well.  To achieve
>> >> this, this patch forces draining the BH in BDRV_POLL_WHILE.
>> >>
>> >> As a side effect this fixes a hang in block_job_detach_aio_context
>> >> during system_reset when a block job is ready:
>> >>
>> >> #0  0x55aa79f3 in bdrv_drain_recurse
>> >> #1  0x55aa825d in bdrv_drained_begin
>> >> #2  0x55aa8449 in bdrv_drain
>> >> #3  0x55a9c356 in blk_drain
>> >> #4  0x55aa3cfd in mirror_drain
>> >> #5  0x55a66e11 in block_job_detach_aio_context
>> >> #6  0x55a62f4d in bdrv_detach_aio_context
>> >> #7  0x55a63116 in bdrv_set_aio_context
>> >> #8  0x55a9d326 in blk_set_aio_context
>> >> #9  0x557e38da in virtio_blk_data_plane_stop
>> >> #10 0x559f9d5f in virtio_bus_stop_ioeventfd
>> >> #11 0x559fa49b in virtio_bus_stop_ioeventfd
>> >> #12 0x559f6a18 in virtio_pci_stop_ioeventfd
>> >> #13 0x559f6a18 in virtio_pci_reset
>> >> #14 0x559139a9 in qdev_reset_one
>> >> #15 0x55916738 in qbus_walk_children
>> >> #16 0x55913318 in qdev_walk_children
>> >> #17 0x55916738 in qbus_walk_children
>> >> #18 0x559168ca in qemu_devices_reset
>> >> #19 0x5581fcbb in pc_machine_reset
>> >> #20 0x558a4d96 in qemu_system_reset
>> >> #21 0x5577157a in main_loop_should_exit
>> >> #22 0x5577157a in main_loop
>> >> #23 0x5577157a in main
>> >>
>> >> The rationale is that the loop in block_job_detach_aio_context cannot
>> >> make any progress in pausing/completing the job, because bs->in_flight
>> >> is 0, so bdrv_drain doesn't process the block_job_defer_to_main_loop
>> >> BH. With this patch, it does.
>> >>
>> >> Reported-by: Jeff Cody 
>> >> Signed-off-by: Fam Zheng 
>> >
>> > Fam, do you remember whether this was really only about drain? Because
>> > in that case...
>>
>> Yes I believe so.
>>
>> >
>> >> diff --git a/include/block/block.h b/include/block/block.h
>> >> index 97d4330..5ddc0cf 100644
>> >> --- a/include/block/block.h
>> >> +++ b/include/block/block.h
>> >> @@ -381,12 +381,13 @@ void bdrv_drain_all(void);
>> >>
>> >>  #define BDRV_POLL_WHILE(bs, cond) ({   \
>> >>  bool waited_ = false;  \
>> >> +bool busy_ = true; \
>> >>  BlockDriverState *bs_ = (bs);  \
>> >>  AioContext *ctx_ = bdrv_get_aio_context(bs_);  \
>> >>  if (aio_context_in_iothread(ctx_)) {   \
>> >> -while ((cond)) {   \
>> >> -aio_poll(ctx_, true);  \
>> >> -waited_ = true;\
>> >> +while ((cond) || busy_) {  \
>> >> +busy_ = aio_poll(ctx_, (cond));\
>> >> +waited_ |= !!(cond) | busy_;   \
>> >>  }  \
>> >>  } else {   \
>> >>  assert(qemu_get_current_aio_context() ==   \
>> >> @@ -398,11 +399,16 @@ void bdrv_drain_all(void);
>> >>   */\
>> >>  assert(!bs_->wakeup);  \
>> >>  bs_->wakeup = true;\
>> >> -while ((cond)) {   \
>> >> -aio_context_release(ctx_); \
>> >> -aio_poll(qemu_get_aio_context(), true);\
>> >> -aio_context_acqui

Re: [Qemu-block] [PATCH for-2.9-rc5 v4 2/2] block: Drain BH in bdrv_drained_begin

2018-02-07 Thread Kevin Wolf
Am 07.02.2018 um 02:48 hat Fam Zheng geschrieben:
> On Tue, Feb 6, 2018 at 11:32 PM, Kevin Wolf  wrote:
> > Am 18.04.2017 um 16:30 hat Fam Zheng geschrieben:
> >> During block job completion, nothing is preventing
> >> block_job_defer_to_main_loop_bh from being called in a nested
> >> aio_poll(), which is a trouble, such as in this code path:
> >>
> >> qmp_block_commit
> >>   commit_active_start
> >> bdrv_reopen
> >>   bdrv_reopen_multiple
> >> bdrv_reopen_prepare
> >>   bdrv_flush
> >> aio_poll
> >>   aio_bh_poll
> >> aio_bh_call
> >>   block_job_defer_to_main_loop_bh
> >> stream_complete
> >>   bdrv_reopen
> >>
> >> block_job_defer_to_main_loop_bh is the last step of the stream job,
> >> which should have been "paused" by the bdrv_drained_begin/end in
> >> bdrv_reopen_multiple, but it is not done because it's in the form of a
> >> main loop BH.
> >>
> >> Similar to why block jobs should be paused between drained_begin and
> >> drained_end, BHs they schedule must be excluded as well.  To achieve
> >> this, this patch forces draining the BH in BDRV_POLL_WHILE.
> >>
> >> As a side effect this fixes a hang in block_job_detach_aio_context
> >> during system_reset when a block job is ready:
> >>
> >> #0  0x55aa79f3 in bdrv_drain_recurse
> >> #1  0x55aa825d in bdrv_drained_begin
> >> #2  0x55aa8449 in bdrv_drain
> >> #3  0x55a9c356 in blk_drain
> >> #4  0x55aa3cfd in mirror_drain
> >> #5  0x55a66e11 in block_job_detach_aio_context
> >> #6  0x55a62f4d in bdrv_detach_aio_context
> >> #7  0x55a63116 in bdrv_set_aio_context
> >> #8  0x55a9d326 in blk_set_aio_context
> >> #9  0x557e38da in virtio_blk_data_plane_stop
> >> #10 0x559f9d5f in virtio_bus_stop_ioeventfd
> >> #11 0x559fa49b in virtio_bus_stop_ioeventfd
> >> #12 0x559f6a18 in virtio_pci_stop_ioeventfd
> >> #13 0x559f6a18 in virtio_pci_reset
> >> #14 0x559139a9 in qdev_reset_one
> >> #15 0x55916738 in qbus_walk_children
> >> #16 0x55913318 in qdev_walk_children
> >> #17 0x55916738 in qbus_walk_children
> >> #18 0x559168ca in qemu_devices_reset
> >> #19 0x5581fcbb in pc_machine_reset
> >> #20 0x558a4d96 in qemu_system_reset
> >> #21 0x5577157a in main_loop_should_exit
> >> #22 0x5577157a in main_loop
> >> #23 0x5577157a in main
> >>
> >> The rationale is that the loop in block_job_detach_aio_context cannot
> >> make any progress in pausing/completing the job, because bs->in_flight
> >> is 0, so bdrv_drain doesn't process the block_job_defer_to_main_loop
> >> BH. With this patch, it does.
> >>
> >> Reported-by: Jeff Cody 
> >> Signed-off-by: Fam Zheng 
> >
> > Fam, do you remember whether this was really only about drain? Because
> > in that case...
> 
> Yes I believe so.
> 
> >
> >> diff --git a/include/block/block.h b/include/block/block.h
> >> index 97d4330..5ddc0cf 100644
> >> --- a/include/block/block.h
> >> +++ b/include/block/block.h
> >> @@ -381,12 +381,13 @@ void bdrv_drain_all(void);
> >>
> >>  #define BDRV_POLL_WHILE(bs, cond) ({   \
> >>  bool waited_ = false;  \
> >> +bool busy_ = true; \
> >>  BlockDriverState *bs_ = (bs);  \
> >>  AioContext *ctx_ = bdrv_get_aio_context(bs_);  \
> >>  if (aio_context_in_iothread(ctx_)) {   \
> >> -while ((cond)) {   \
> >> -aio_poll(ctx_, true);  \
> >> -waited_ = true;\
> >> +while ((cond) || busy_) {  \
> >> +busy_ = aio_poll(ctx_, (cond));\
> >> +waited_ |= !!(cond) | busy_;   \
> >>  }  \
> >>  } else {   \
> >>  assert(qemu_get_current_aio_context() ==   \
> >> @@ -398,11 +399,16 @@ void bdrv_drain_all(void);
> >>   */\
> >>  assert(!bs_->wakeup);  \
> >>  bs_->wakeup = true;\
> >> -while ((cond)) {   \
> >> -aio_context_release(ctx_); \
> >> -aio_poll(qemu_get_aio_context(), true);\
> >> -aio_context_acquire(ctx_); \
> >> -waited_ = true;\
> >> +while (busy_) {  

Re: [Qemu-block] [PATCH 2/2] qapi: add block latency histogram interface

2018-02-07 Thread Vladimir Sementsov-Ogievskiy

06.02.2018 22:02, Eric Blake wrote:
> On 02/06/2018 12:06 PM, Vladimir Sementsov-Ogievskiy wrote:
>> 06.02.2018 18:50, Eric Blake wrote:
>>> On 02/06/2018 08:07 AM, Vladimir Sementsov-Ogievskiy wrote:
>>>> Set (and clear) histogram through new command
>>>> block-latency-histogram-set and show new statistics in
>>>> query-blockstats results.
>>>
>>> The commit message mentioned that you can set and clear histogram
>>> tracking; how does this interface let you clear things?  By passing
>>> a 0-length latency array?  If you violate the constraint (pass
>>> non-ascending points, for example), does the previous latency map
>>> get wiped out or is it still preserved unchanged?
>>
>> On error nothing is changed.
>>
>> By "clear" I mean zeroing statistics, not removing the whole
>> histogram. And to zero statistics you can call set with the same
>> latency array.
>> There is no way to remove histogram at all.. We can add
>> block-latency-histogram-unset later if needed.
>
> Maybe "set (or restart) histogram collection points" might read
> better? I also don't think we need a new command; if you make
> 'latency' optional, then omitting it can serve to stop collecting
> statistics altogether, without needing a new command (then again, if
> you do that, now the command is used to "set, restart, and clear
> histogram collection").

I like the idea, will do.

--
Best regards,
Vladimir