Re: [Qemu-block] [Qemu-devel] [PATCH 1/2] scsi: add unrealize method for SCSI devices

2018-02-07 Thread Fam Zheng
On Wed, 02/07 17:36, Paolo Bonzini wrote:
> The next patch will introduce a different unrealize implementation
> for scsi-block.  Compared to the code before commit fb7b5c0df6
> ("scsi: devirtualize unrealize of SCSI devices", 2014-10-31), the
> common code for all SCSI devices is kept in scsi-bus.c.
> 
> Signed-off-by: Paolo Bonzini 

Reviewed-by: Fam Zheng 



Re: [Qemu-block] [Qemu-devel] [PATCH 2/2] scsi: add block job opblockers for scsi-block

2018-02-07 Thread Fam Zheng
On Wed, 02/07 17:36, Paolo Bonzini wrote:
> scsi-block bypasses the dirty bitmaps and pre-write notifiers, so it
> cannot be the source of a block job.  The gist of the fix is to add
> op-blockers to the BlockBackend, and remove them at "unrealize" time,
> but things are a little more complex because quit closes the BlockBackend
> without going through unrealize.
> 
> So use Notifiers: the remove_bs notifier is called by bdrv_close_all, and
> the insert_bs notifier might not be really necessary but makes things a
> little more symmetric.
> 
> Suggested-by: Karen Noel 

:)

> Signed-off-by: Paolo Bonzini 

Reviewed-by: Fam Zheng 

Though I have one comment below.

> ---
>  block/block-backend.c  |  9 ++
>  hw/scsi/scsi-disk.c| 62 ++
>  include/sysemu/block-backend.h |  1 +
>  3 files changed, 72 insertions(+)
> 
> diff --git a/block/block-backend.c b/block/block-backend.c
> index baef8e7abc..1759639a4a 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -1747,6 +1747,15 @@ bool blk_op_is_blocked(BlockBackend *blk, BlockOpType op, Error **errp)
>  return bdrv_op_is_blocked(bs, op, errp);
>  }
>  
> +void blk_op_block(BlockBackend *blk, BlockOpType op, Error *reason)
> +{
> +BlockDriverState *bs = blk_bs(blk);
> +
> +if (bs) {
> +bdrv_op_block(bs, op, reason);
> +}
> +}
> +
>  void blk_op_unblock(BlockBackend *blk, BlockOpType op, Error *reason)
>  {
>  BlockDriverState *bs = blk_bs(blk);
> diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
> index 49d2559d93..023673cb04 100644
> --- a/hw/scsi/scsi-disk.c
> +++ b/hw/scsi/scsi-disk.c
> @@ -2578,9 +2578,39 @@ static int get_device_type(SCSIDiskState *s)
>  return 0;
>  }
>  
> +typedef struct SCSIBlockState {
> +SCSIDiskState sd;
> +Error *mirror_source;
> +Error *backup_source;
> +Error *commit_source;
> +Notifier insert_bs;
> +Notifier remove_bs;
> +} SCSIBlockState;
> +
> +static void scsi_block_insert_bs(Notifier *n, void *opaque)
> +{
> +SCSIBlockState *sb = container_of(n, SCSIBlockState, insert_bs);
> +SCSIDiskState *s = &sb->sd;
> +
> +blk_op_block(s->qdev.conf.blk, BLOCK_OP_TYPE_MIRROR_SOURCE, sb->mirror_source);
> +blk_op_block(s->qdev.conf.blk, BLOCK_OP_TYPE_COMMIT_SOURCE, sb->commit_source);
> +blk_op_block(s->qdev.conf.blk, BLOCK_OP_TYPE_BACKUP_SOURCE, sb->backup_source);
> +}
> +
> +static void scsi_block_remove_bs(Notifier *n, void *opaque)
> +{
> +SCSIBlockState *sb = container_of(n, SCSIBlockState, remove_bs);
> +SCSIDiskState *s = &sb->sd;
> +
> +blk_op_unblock(s->qdev.conf.blk, BLOCK_OP_TYPE_MIRROR_SOURCE, sb->mirror_source);
> +blk_op_unblock(s->qdev.conf.blk, BLOCK_OP_TYPE_COMMIT_SOURCE, sb->commit_source);
> +blk_op_unblock(s->qdev.conf.blk, BLOCK_OP_TYPE_BACKUP_SOURCE, sb->backup_source);
> +}
> +
>  static void scsi_block_realize(SCSIDevice *dev, Error **errp)
>  {
>  SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, dev);
> +SCSIBlockState *sb = DO_UPCAST(SCSIBlockState, sd, s);
>  int sg_version;
>  int rc;
>  
> @@ -2626,6 +2656,36 @@ static void scsi_block_realize(SCSIDevice *dev, Error **errp)
>  
>  scsi_realize(&s->qdev, errp);
>  scsi_generic_read_device_identification(&s->qdev);
> +
> +/* For op blockers, due to lack of support for dirty bitmaps.  */
> +error_setg(&sb->mirror_source,
> +   "scsi-block does not support acting as a mirroring source");
> +error_setg(&sb->commit_source,
> +   "scsi-block does not support acting as an active commit 
> source");

An alternative would be to add a BLOCK_OP_TYPE_DIRTY_BITMAP op type. The error
message would not be as nice, but it could be reused by another (block job)
operation that requires dirty bitmap support, or by another device that doesn't
support dirty bitmaps. Though there isn't one for now.
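
For illustration, a rough sketch of that alternative, assuming a hypothetical
BLOCK_OP_TYPE_DIRTY_BITMAP value were added to BlockOpType (it does not exist
in the tree, and the sb->no_dirty_bitmap field below is likewise made up):

    /* Hypothetical: a single op type for "requires dirty bitmap support". */
    error_setg(&sb->no_dirty_bitmap,
               "scsi-block bypasses the block layer and does not support dirty bitmaps");
    blk_op_block(s->qdev.conf.blk, BLOCK_OP_TYPE_DIRTY_BITMAP, sb->no_dirty_bitmap);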

> +
> +/* For op blockers, due to lack of support for write notifiers.  */
> +error_setg(&sb->backup_source,
> +   "scsi-block does not support acting as a backup source");
> +
> +sb->insert_bs.notify = scsi_block_insert_bs;
> +blk_add_insert_bs_notifier(s->qdev.conf.blk, &sb->insert_bs);
> +sb->remove_bs.notify = scsi_block_remove_bs;
> +blk_add_remove_bs_notifier(s->qdev.conf.blk, &sb->remove_bs);
> +
> +scsi_block_insert_bs(&sb->insert_bs, s->qdev.conf.blk);
> +}
> +
> +static void scsi_block_unrealize(SCSIDevice *dev, Error **errp)
> +{
> +SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, dev);
> +SCSIBlockState *sb = DO_UPCAST(SCSIBlockState, sd, s);
> +
> +notifier_remove(&sb->insert_bs);
> +notifier_remove(&sb->remove_bs);
> +scsi_block_remove_bs(&sb->insert_bs, s->qdev.conf.blk);
> +error_free(sb->mirror_source);
> +error_free(sb->commit_source);
> +error_free(sb->backup_source);
>  }
>  
>  typedef struct SCSIBlockReq {
> @@ -3017,6 +3077,7 @@ static void scs

Re: [Qemu-block] [Qemu-devel] [PATCH] block: early check for blockers on drive-mirror

2018-02-07 Thread Fam Zheng
On Wed, 02/07 17:29, Paolo Bonzini wrote:
> Even if an op blocker is present for BLOCK_OP_TYPE_MIRROR_SOURCE,
> it is checked a bit late and the result is that the target is
> created even if drive-mirror subsequently fails.  Add an early
> check to avoid this.
> 
> Signed-off-by: Paolo Bonzini 
> ---
>  blockdev.c | 5 +
>  1 file changed, 5 insertions(+)
> 
> diff --git a/blockdev.c b/blockdev.c
> index 8e977eef11..c7e2e0a00e 100644
> --- a/blockdev.c
> +++ b/blockdev.c
> @@ -3565,6 +3565,11 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp)
>  return;
>  }
>  
> +/* Early check to avoid creating target */
> +if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_MIRROR_SOURCE, errp)) {
> +return;
> +}
> +
>  aio_context = bdrv_get_aio_context(bs);
>  aio_context_acquire(aio_context);
>  
> -- 
> 2.14.3
> 
> 

Reviewed-by: Fam Zheng 



Re: [Qemu-block] [PATCH v2 11/13] block/mirror: remove block_job_sleep_ns calls

2018-02-07 Thread Max Reitz
On 2018-02-07 23:46, Max Reitz wrote:
> On 2018-01-19 21:58, John Snow wrote:
>> We're attempting to slacken the mirror loop in three different places,
>> but we can combine these three attempts. Combine the early loop call to
>> block_job_pause_point with the two late-loop calls to block_job_sleep_ns.
>>
>> When delay_ns is 0 and it has not been SLICE_TIME since the last yield,
>> block_job_relax is merely a call to block_job_pause_point, so this should
>> be equivalent with the exception that if we have managed to not yield at
>> all in the last SLICE_TIME ns, we will now do so.
>>
>> I am not sure that condition was possible,
>> so this loop should be equivalent.
> 
> Well, to me it even sounds like a positive change if it was a change.
> We want the job to yield after SLICE_TIME ns, after all, and I don't
> think it matters where that happens, exactly.
> 
>>
>> Signed-off-by: John Snow 
>> ---
>>  block/mirror.c | 22 +++---
>>  block/trace-events |  2 +-
>>  2 files changed, 12 insertions(+), 12 deletions(-)
>>
>> diff --git a/block/mirror.c b/block/mirror.c
>> index a0e0044de2..192e03694f 100644
>> --- a/block/mirror.c
>> +++ b/block/mirror.c
>> @@ -761,7 +761,7 @@ static void coroutine_fn mirror_run(void *opaque)
>>  assert(!s->dbi);
>>  s->dbi = bdrv_dirty_iter_new(s->dirty_bitmap);
>>  for (;;) {
>> -uint64_t delay_ns = 0;
>> +static uint64_t delay_ns = 0;
> 
> Errr.  Are you sure about that?
> 
> Now every mirror job in the qeny process will share this single
> variable.  Was that your intention?

("Errr" @myself for "qeny")



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [PATCH v2 13/13] blockjob: remove block_job_pause_point from interface

2018-02-07 Thread Max Reitz
On 2018-01-19 21:58, John Snow wrote:
> Remove the last call in block/mirror, using relax instead.
> relax may do nothing if we are canceled, so allow iteration to return
> prematurely and allow mirror_run to handle the cancellation logic.

Ah, now you write it with two l? ;-)

> 
> This is a functional change to mirror that should have the effect of
> cancelled mirror jobs being able to respond to that request a little

??!??!  Such inconsistency.  Many l.

> sooner instead of launching new requests.
> 
> Signed-off-by: John Snow 
> ---
>  block/mirror.c   |  4 +++-
>  blockjob.c   | 10 +-
>  include/block/blockjob_int.h |  9 -
>  3 files changed, 12 insertions(+), 11 deletions(-)
> 
> diff --git a/block/mirror.c b/block/mirror.c
> index 192e03694f..8e6b5b25a9 100644
> --- a/block/mirror.c
> +++ b/block/mirror.c
> @@ -345,7 +345,9 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
>  mirror_wait_for_io(s);
>  }
>  
> -block_job_pause_point(&s->common);
> +if (block_job_relax(&s->common, 0)) {
> +return 0;
> +}

:c

>  
>  /* Find the number of consecutive dirty chunks following the first dirty
>   * one, and wait for in flight requests in them. */
> diff --git a/blockjob.c b/blockjob.c
> index 40167d6896..27c13fdd08 100644
> --- a/blockjob.c
> +++ b/blockjob.c
> @@ -60,6 +60,7 @@ static void __attribute__((__constructor__)) block_job_init(void)
>  static void block_job_event_cancelled(BlockJob *job);
>  static void block_job_event_completed(BlockJob *job, const char *msg);
>  static void block_job_enter_cond(BlockJob *job, bool(*fn)(BlockJob *job));
> +static int coroutine_fn block_job_pause_point(BlockJob *job);
>  
>  /* Transactional group of block jobs */
>  struct BlockJobTxn {
> @@ -793,7 +794,14 @@ static void block_job_do_yield(BlockJob *job, uint64_t ns)
>  assert(job->busy);
>  }
>  
> -int coroutine_fn block_job_pause_point(BlockJob *job)
> +/**
> + * block_job_pause_point:
> + * @job: The job that is ready to pause.
> + *
> + * Pause now if block_job_pause() has been called.  Block jobs that perform
> + * lots of I/O must call this between requests so that the job can be paused.

But jobs can't call this anymore, now.  This part of the comment should
either mention block_job_relax() instead or should be moved there
altogether.

Max

> + */
> +static int coroutine_fn block_job_pause_point(BlockJob *job)
>  {
>  assert(job && block_job_started(job));
>  
> diff --git a/include/block/blockjob_int.h b/include/block/blockjob_int.h
> index c4891a5a9b..57327cbc5a 100644
> --- a/include/block/blockjob_int.h
> +++ b/include/block/blockjob_int.h
> @@ -201,15 +201,6 @@ void block_job_completed(BlockJob *job, int ret);
>   */
>  bool block_job_is_cancelled(BlockJob *job);
>  
> -/**
> - * block_job_pause_point:
> - * @job: The job that is ready to pause.
> - *
> - * Pause now if block_job_pause() has been called.  Block jobs that perform
> - * lots of I/O must call this between requests so that the job can be paused.
> - */
> -int coroutine_fn block_job_pause_point(BlockJob *job);
> -
>  /**
>   * block_job_enter:
>   * @job: The job to enter.
> 




signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [PATCH v2 12/13] blockjob: privatize block_job_sleep_ns

2018-02-07 Thread Max Reitz
On 2018-01-19 21:58, John Snow wrote:
> There's not currently any external caller of it.
> 
> Except in tests, but we'll fix that here too.
> 
> Replace usages in test cases with block_job_relax, which functions
> similarly enough to be used as a drop-in replacement.
> 
> Very technically block_job_sleep_ns(job, 0) behaves differently
> from block_job_relax(job, 0) in that relax may resolve to a no-op,
> but this makes no difference in the test in which it is used.
> 
> Signed-off-by: John Snow 
> ---
>  blockjob.c   | 11 ++-
>  include/block/blockjob_int.h | 11 ---
>  tests/test-bdrv-drain.c  |  2 +-
>  tests/test-blockjob-txn.c|  2 +-
>  4 files changed, 12 insertions(+), 14 deletions(-)

Reviewed-by: Max Reitz 



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [PATCH v2 11/13] block/mirror: remove block_job_sleep_ns calls

2018-02-07 Thread Max Reitz
On 2018-01-19 21:58, John Snow wrote:
> We're attempting to slacken the mirror loop in three different places,
> but we can combine these three attempts. Combine the early loop call to
> block_job_pause_point with the two late-loop calls to block_job_sleep_ns.
> 
> When delay_ns is 0 and it has not been SLICE_TIME since the last yield,
> block_job_relax is merely a call to block_job_pause_point, so this should
> be equivalent with the exception that if we have managed to not yield at
> all in the last SLICE_TIME ns, we will now do so.
> 
> I am not sure that condition was possible,
> so this loop should be equivalent.

Well, to me it even sounds like a positive change if it was a change.
We want the job to yield after SLICE_TIME ns, after all, and I don't
think it matters where that happens, exactly.

> 
> Signed-off-by: John Snow 
> ---
>  block/mirror.c | 22 +++---
>  block/trace-events |  2 +-
>  2 files changed, 12 insertions(+), 12 deletions(-)
> 
> diff --git a/block/mirror.c b/block/mirror.c
> index a0e0044de2..192e03694f 100644
> --- a/block/mirror.c
> +++ b/block/mirror.c
> @@ -761,7 +761,7 @@ static void coroutine_fn mirror_run(void *opaque)
>  assert(!s->dbi);
>  s->dbi = bdrv_dirty_iter_new(s->dirty_bitmap);
>  for (;;) {
> -uint64_t delay_ns = 0;
> +static uint64_t delay_ns = 0;

Errr.  Are you sure about that?

Now every mirror job in the qeny process will share this single
variable.  Was that your intention?
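
A standalone C illustration of the pitfall (nothing below is QEMU code; it is
just the C semantics of a static local):

    #include <stdio.h>

    /* A static local is allocated once per process, not once per invocation,
     * so every caller -- i.e. every mirror job coroutine -- would read and
     * write the same variable. */
    static void fake_iteration(const char *job, unsigned long new_delay)
    {
        static unsigned long delay_ns;      /* shared by all "jobs" */

        if (new_delay) {
            delay_ns = new_delay;
        }
        printf("%s sees delay_ns = %lu\n", job, delay_ns);
    }

    int main(void)
    {
        fake_iteration("job A", 100000);    /* job A sets a delay...   */
        fake_iteration("job B", 0);         /* ...which job B inherits */
        return 0;
    }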

>  int64_t cnt, delta;
>  bool should_complete;
>  
> @@ -770,9 +770,16 @@ static void coroutine_fn mirror_run(void *opaque)
>  goto immediate_exit;
>  }
>  
> -block_job_pause_point(&s->common);
> -
>  cnt = bdrv_get_dirty_count(s->dirty_bitmap);
> +
> +trace_mirror_before_relax(s, cnt, s->synced, delay_ns);
> +if (block_job_relax(&s->common, delay_ns)) {

See the reply to that patch.

> +if (!s->synced) {
> +goto immediate_exit;
> +}
> +}
> +delay_ns = 0;
> +
>  /* s->common.offset contains the number of bytes already processed so
>   * far, cnt is the number of dirty bytes remaining and
>   * s->bytes_in_flight is the number of bytes currently being
> @@ -849,15 +856,8 @@ static void coroutine_fn mirror_run(void *opaque)
>  }
>  
>  ret = 0;
> -trace_mirror_before_sleep(s, cnt, s->synced, delay_ns);
> -if (!s->synced) {
> -block_job_sleep_ns(&s->common, delay_ns);
> -if (block_job_is_cancelled(&s->common)) {
> -break;
> -}
> -} else if (!should_complete) {
> +if (s->synced && !should_complete) {
>  delay_ns = (s->in_flight == 0 && cnt == 0 ? SLICE_TIME : 0);
> -block_job_sleep_ns(&s->common, delay_ns);
>  }
>  }

Basic idea looks good to me (apart from the static delay_ns), but, well,
block-job-cancel on a busy job still breaks.

Max



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [PATCH v2 10/13] block/mirror: condense cancellation and relax calls

2018-02-07 Thread Max Reitz
On 2018-01-19 21:58, John Snow wrote:
> We can count on the relax call to check cancellation for us, so
> condense these concurrent calls.
> 
> Signed-off-by: John Snow 
> ---
>  block/mirror.c | 8 ++--
>  1 file changed, 2 insertions(+), 6 deletions(-)
> 
> diff --git a/block/mirror.c b/block/mirror.c
> index 3c73caed5e..a0e0044de2 100644
> --- a/block/mirror.c
> +++ b/block/mirror.c
> @@ -610,9 +610,7 @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s)
>  int bytes = MIN(s->bdev_length - offset,
>  QEMU_ALIGN_DOWN(INT_MAX, s->granularity));
>  
> -block_job_relax(&s->common, 0);
> -
> -if (block_job_is_cancelled(&s->common)) {
> +if (block_job_relax(&s->common, 0)) {
>  s->initial_zeroing_ongoing = false;
>  return 0;
>  }
> @@ -638,9 +636,7 @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s)
>  int bytes = MIN(s->bdev_length - offset,
>  QEMU_ALIGN_DOWN(INT_MAX, s->granularity));
>  
> -block_job_relax(&s->common, 0);
> -
> -if (block_job_is_cancelled(&s->common)) {
> +if (block_job_relax(&s->common, 0)) {
>  return 0;
>  }

“See last patch.”



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [PATCH v2 09/13] block/backup: remove yield_and_check

2018-02-07 Thread Max Reitz
On 2018-01-19 21:58, John Snow wrote:
> This is a respin of the same functionality as mirror_throttle,
> so trash this and replace it with the generic version.
> 
> yield_and_check returned true if canceled, false otherwise.
> block_job_relax returns -ECANCELED if canceled, 0 otherwise.
> 
> Signed-off-by: John Snow 
> ---
>  block/backup.c | 20 
>  1 file changed, 4 insertions(+), 16 deletions(-)
> 
> diff --git a/block/backup.c b/block/backup.c
> index b4204c0ee4..0624c3b322 100644
> --- a/block/backup.c
> +++ b/block/backup.c
> @@ -334,29 +334,17 @@ static void backup_complete(BlockJob *job, void *opaque)
>  g_free(data);
>  }
>  
> -static bool coroutine_fn yield_and_check(BackupBlockJob *job)
> +static uint64_t get_delay_ns(BackupBlockJob *job)
>  {
>  uint64_t delay_ns = 0;
>  
> -if (block_job_is_cancelled(&job->common)) {
> -return true;
> -}
> -
> -/* we need to yield so that bdrv_drain_all() returns.
> - * (without, VM does not reboot)
> - */
>  if (job->common.speed) {
>  delay_ns = ratelimit_calculate_delay(&job->limit,
>   job->bytes_read);
>  job->bytes_read = 0;
>  }
>  
> -block_job_relax(&job->common, delay_ns);
> -if (block_job_is_cancelled(&job->common)) {
> -return true;
> -}
> -
> -return false;
> +return delay_ns;
>  }
>  
>  static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
> @@ -369,7 +357,7 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
>  hbitmap_iter_init(&hbi, job->copy_bitmap, 0);
>  while ((cluster = hbitmap_iter_next(&hbi)) != -1) {
>  do {
> -if (yield_and_check(job)) {
> +if (block_job_relax(&job->common, get_delay_ns(job))) {
>  return 0;
>  }
>  ret = backup_do_cow(job, cluster * job->cluster_size,
> @@ -465,7 +453,7 @@ static void coroutine_fn backup_run(void *opaque)
>  bool error_is_read;
>  int alloced = 0;
>  
> -if (yield_and_check(job)) {
> +if (block_job_relax(&job->common, get_delay_ns(job))) {
>  break;
>  }
>  
> 

I'd very much prefer an explicit block_job_relax(...) == -ECANCELED, I
have to say.
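
In other words, something along these lines at the call sites (just spelling
out the shape of the check, not a hunk from the series):

    if (block_job_relax(&job->common, get_delay_ns(job)) == -ECANCELED) {
        return 0;
    }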

If you coerce me, I can give an R-b, but you'll have to wrest it from me
by force!

Max



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [PATCH v2 08/13] allow block_job_relax to return -ECANCELED

2018-02-07 Thread Max Reitz
On 2018-01-19 21:58, John Snow wrote:
> This is just an optimization for callers who are likely going to
> want to check quite close to this call if the job was canceled or
> not anyway.

But jobs are “cancelled” and not “canceled”.

!!!

> 
> Along the same lines, add the return to block_job_pause_point and
> block_job_sleep_ns, so we don't have to re-check it quite so
> excessively.
> 
> Signed-off-by: John Snow 
> ---
>  blockjob.c   | 28 +---
>  include/block/blockjob_int.h |  8 +---
>  2 files changed, 22 insertions(+), 14 deletions(-)

[...]

> diff --git a/include/block/blockjob_int.h b/include/block/blockjob_int.h
> index 5f1520fab7..1ceb47e1e6 100644
> --- a/include/block/blockjob_int.h
> +++ b/include/block/blockjob_int.h
> @@ -147,7 +147,7 @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver,
>   * %QEMU_CLOCK_REALTIME nanoseconds.  Canceling the job will immediately
>   * interrupt the wait.
>   */
> -void block_job_sleep_ns(BlockJob *job, int64_t ns);
> +int block_job_sleep_ns(BlockJob *job, int64_t ns);
>  
>  /**
>   * block_job_yield:
> @@ -167,8 +167,10 @@ void block_job_yield(BlockJob *job);
>   * If delay_ns is 0, yield if it has been SLICE_TIME
>   * nanoseconds since the last yield. Otherwise, check
>   * if we need to yield for a pause event.
> + *
> + * returns ECANCELED if the job has been canceled.

-ECANCELED, please.

With that fixed:

Reviewed-by: Max Reitz 

>   */
> -void block_job_relax(BlockJob *job, int64_t delay_ns);
> +int block_job_relax(BlockJob *job, int64_t delay_ns);
>  
>  /**
>   * block_job_pause_all:
> @@ -217,7 +219,7 @@ bool block_job_is_cancelled(BlockJob *job);
>   * Pause now if block_job_pause() has been called.  Block jobs that perform
>   * lots of I/O must call this between requests so that the job can be paused.
>   */
> -void coroutine_fn block_job_pause_point(BlockJob *job);
> +int coroutine_fn block_job_pause_point(BlockJob *job);
>  
>  /**
>   * block_job_enter:
> 




signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [PATCH v2 07/13] block/backup: use block_job_relax

2018-02-07 Thread Max Reitz
On 2018-01-19 21:58, John Snow wrote:
> See two commits back for justification.

When will git-log support hyperlinks so you can write "HEAD^^" here? ^^

Max



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [PATCH v2 04/13] blockjob: allow block_job_throttle to take delay_ns

2018-02-07 Thread Max Reitz
On 2018-01-19 21:58, John Snow wrote:
> Instead of only sleeping for 0ms when we've hit a timeout, optionally
> take a longer more explicit delay_ns that always forces the sleep.
> 
> Signed-off-by: John Snow 
> ---
>  block/mirror.c   |  4 ++--
>  blockjob.c   |  9 -
>  include/block/blockjob_int.h | 10 +++---
>  3 files changed, 13 insertions(+), 10 deletions(-)

[...]

> diff --git a/blockjob.c b/blockjob.c
> index 6f2e709b51..51c0eb5d9e 100644
> --- a/blockjob.c
> +++ b/blockjob.c
> @@ -906,12 +906,11 @@ void block_job_yield(BlockJob *job)
>  block_job_pause_point(job);
>  }
>  
> -void block_job_relax(BlockJob *job)
> +void block_job_relax(BlockJob *job, int64_t delay_ns)
>  {
> -int64_t now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
> -
> -if (now - job->last_enter_ns > SLICE_TIME) {
> -block_job_sleep_ns(job, 0);
> +if (delay_ns || (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - \
> + job->last_enter_ns > SLICE_TIME)) {
> +block_job_sleep_ns(job, delay_ns);

I can't say I like the readability of that any better...

(And I'd argue that if delay_ns > 0, the one superfluous call to
qemu_clock_get_ns() isn't going to harm performance.)
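
For illustration, one way to keep the earlier structure while adding the
parameter (a sketch only, not a hunk from the series):

    void block_job_relax(BlockJob *job, int64_t delay_ns)
    {
        int64_t now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);

        if (delay_ns || now - job->last_enter_ns > SLICE_TIME) {
            block_job_sleep_ns(job, delay_ns);
        } else {
            block_job_pause_point(job);
        }
    }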

>  } else {
>  block_job_pause_point(job);
>  }
> diff --git a/include/block/blockjob_int.h b/include/block/blockjob_int.h
> index 553784d86f..5f1520fab7 100644
> --- a/include/block/blockjob_int.h
> +++ b/include/block/blockjob_int.h
> @@ -160,11 +160,15 @@ void block_job_yield(BlockJob *job);
>  /**
>   * block_job_relax:
>   * @job: The job that calls the function.
> + * @delay_ns: The amount of time to sleep for
>   *
> - * Yield if it has been SLICE_TIME nanoseconds since the last yield.
> - * Otherwise, check if we need to pause, and yield if so.
> + * Sleep for delay_ns nanoseconds.
> + *
> + * If delay_ns is 0, yield if it has been SLICE_TIME
> + * nanoseconds since the last yield. Otherwise, check
> + * if we need to yield for a pause event.

That "Otherwise" now sounds as if it refers to the "If delay_ns is 0".

Code change is OK, but this comment needs some fixing.
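
One possible rewording that avoids the ambiguous "Otherwise" (a suggestion
only, not text from the series):

    /**
     * block_job_relax:
     * @job: The job that calls the function.
     * @delay_ns: The amount of time to sleep for, in nanoseconds.
     *
     * If @delay_ns is non-zero, sleep for @delay_ns nanoseconds.  If it is
     * zero, yield only when at least SLICE_TIME nanoseconds have passed since
     * the last yield; otherwise just check whether we need to pause, and
     * yield if so.
     */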

Max

>   */
> -void block_job_relax(BlockJob *job);
> +void block_job_relax(BlockJob *job, int64_t delay_ns);
>  
>  /**
>   * block_job_pause_all:
> 




signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [PATCH v2 03/13] blockjob: create block_job_relax

2018-02-07 Thread Max Reitz
On 2018-01-19 21:58, John Snow wrote:
> This will replace mirror_throttle, for reuse in other jobs.
> 
> Signed-off-by: John Snow 
> ---
>  block/mirror.c   | 15 ++-
>  blockjob.c   | 11 +++
>  include/block/blockjob_int.h |  9 +
>  3 files changed, 22 insertions(+), 13 deletions(-)

Reviewed-by: Max Reitz 



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [PATCH v2 01/13] blockjob: record time of last entrance

2018-02-07 Thread Max Reitz
On 2018-01-19 21:58, John Snow wrote:
> The mirror job makes a semi-inaccurate record of the last time we yielded
> by recording the last time we left a "pause", but this doesn't always
> correlate to the time we actually last successfully ceded control.
> 
> Record the time we last *exited* a yield centrally. In other words, record
> the time we began execution of this job to know how long we have been
> selfish for.
> 
> Signed-off-by: John Snow 
> ---
>  block/mirror.c   | 8 ++--
>  blockjob.c   | 2 ++
>  include/block/blockjob.h | 5 +
>  3 files changed, 9 insertions(+), 6 deletions(-)
> 
> diff --git a/block/mirror.c b/block/mirror.c
> index c9badc1203..88f4e8964d 100644
> --- a/block/mirror.c
> +++ b/block/mirror.c
> @@ -63,7 +63,6 @@ typedef struct MirrorBlockJob {
>  QSIMPLEQ_HEAD(, MirrorBuffer) buf_free;
>  int buf_free_count;
>  
> -uint64_t last_pause_ns;
>  unsigned long *in_flight_bitmap;
>  int in_flight;
>  int64_t bytes_in_flight;
> @@ -596,8 +595,7 @@ static void mirror_throttle(MirrorBlockJob *s)
>  {
>  int64_t now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
>  
> -if (now - s->last_pause_ns > SLICE_TIME) {
> -s->last_pause_ns = now;
> +if (now - s->common.last_enter_ns > SLICE_TIME) {
>  block_job_sleep_ns(&s->common, 0);
>  } else {
>  block_job_pause_point(&s->common);
> @@ -769,7 +767,6 @@ static void coroutine_fn mirror_run(void *opaque)
>  
>  mirror_free_init(s);
>  
> -s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
>  if (!s->is_none_mode) {
>  ret = mirror_dirty_init(s);
>  if (ret < 0 || block_job_is_cancelled(&s->common)) {
> @@ -803,7 +800,7 @@ static void coroutine_fn mirror_run(void *opaque)
>   * We do so every SLICE_TIME nanoseconds, or when there is an error,
>   * or when the source is clean, whichever comes first.
>   */
> -delta = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - s->last_pause_ns;
> +delta = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - s->common.last_enter_ns;

The horror.

>  if (delta < SLICE_TIME &&
>  s->common.iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
>  if (s->in_flight >= MAX_IN_FLIGHT || s->buf_free_count == 0 ||
> @@ -878,7 +875,6 @@ static void coroutine_fn mirror_run(void *opaque)
>  delay_ns = (s->in_flight == 0 && cnt == 0 ? SLICE_TIME : 0);
>  block_job_sleep_ns(&s->common, delay_ns);
>  }
> -s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
>  }

Hmmm.  So you're now relying on block_job_sleep_ns() updating
last_enter_ns.  But it will only do so if we actually do sleep, which we
will not if the block job has been cancelled.

Then, last_enter_ns will stay at its current value indefinitely because
neither block_job_sleep_ns() nor block_job_pause_point() will yield.
OK, so at this point we should leave the mirror loop.  There are three
points where this is done:

(1) If s->ret < 0.  OK, let's hope it isn't.
(2) If cnt == 0 && should_complete.  We'll come back to that.
(3) If !s->synced && block_job_is_cancelled(...).  So basically now, if
the job has not emitted a READY event yet.

But if we have emitted the READY, we have to wait for cnt == 0 &&
should_complete (note that should_complete in turn will only be set if
s->in_flight == 0 && cnt == 0).  But unless delta < SLICE_TIME, we will
never do another mirror_iteration(), so unless we have already started
the necessary requests, they will never be started and we will loop forever.

So try this on master (I prefixed the QMP lines with in/out depending on
the direction -- note that it's important to have the block-job-cancel
on the same line the human-monitor-command finishes on so they are
executed right after each other!):

$ ./qemu-img create -f qcow2 foo.qcow2 64M
$ x86_64-softmmu/qemu-system-x86_64 -qmp stdio
In: {"QMP": {"version": {"qemu": {"micro": 50, "minor": 11, "major": 2},
"package": " (v2.9.0-632-g4a52d43-dirty)"}, "capabilities": []}}
Out: {"execute":"qmp_capabilities"}
In: {"return": {}}
Out: {"execute":"object-add","arguments":
  {"qom-type":"throttle-group","id":"tg0","props":{"limits":
 {"bps-total":16777216
In: {"return": {}}
Out: {"execute":"blockdev-add","arguments":
  {"node-name":"source","driver":"qcow2","file":
   {"driver":"file","filename":"foo.qcow2"}}}
In: {"return": {}}
Out: {"execute":"blockdev-add","arguments":
  {"node-name":"target","driver":"throttle",
   "throttle-group":"tg0","file":
   {"driver":"null-co","size":67108864}}}
In: {"return": {}}
Out: {"execute":"blockdev-mirror","arguments":
 {"job-id":"mirror","device":"source","target":"target",
  "sync":"full"}}
In: {"return": {}}
In: {"timestamp": {"seconds": 1518040566, "microseconds": 658111},
 "event": "BLOCK_JOB_READY", "data":
 {"device": "mirror", "len": 67108864, "offset": 67108864,
  "speed": 0, "t

Re: [Qemu-block] Block Migration and CPU throttling

2018-02-07 Thread Peter Lieven

> Am 07.02.2018 um 19:29 schrieb Dr. David Alan Gilbert :
> 
> * Peter Lieven (p...@kamp.de) wrote:
>> Am 12.12.2017 um 18:05 schrieb Dr. David Alan Gilbert:
>>> * Peter Lieven (p...@kamp.de) wrote:
 Am 21.09.2017 um 14:36 schrieb Dr. David Alan Gilbert:
> * Peter Lieven (p...@kamp.de) wrote:
>> Am 19.09.2017 um 16:41 schrieb Dr. David Alan Gilbert:
>>> * Peter Lieven (p...@kamp.de) wrote:
 Am 19.09.2017 um 16:38 schrieb Dr. David Alan Gilbert:
> * Peter Lieven (p...@kamp.de) wrote:
>> Hi,
>> 
>> I just noticed that CPU throttling and Block Migration don't work 
>> together very well.
>> During block migration the throttling heuristic detects that we 
>> obviously make no progress
>> in ram transfer. But the reason is the running block migration and 
>> not a too high dirty pages rate.
>> 
>> The result is that any VM is throttled by 99% during block migration.
> Hmm that's unfortunate; do you have a bandwidth set lower than your
> actual network connection? I'm just wondering if it's actually going
> between the block and RAM iterative sections or getting stuck in ne.
 It happens also if source and dest are on the same machine and speed 
 is set to 100G.
>>> But does it happen if they're not and the speed is set low?
>> Yes, it does. I noticed it in our test environment between different 
>> nodes with a 10G
>> link in between. But its totally clear why it happens. During block 
>> migration we transfer
>> all dirty memory pages in each round (if there is moderate memory load), 
>> but all dirty
>> pages are obviously more than 50% of the transferred ram in that round.
>> It is more exactly 100%. But the current logic triggers on this 
>> condition.
>> 
>> I think I will go forward and send a patch which disables auto converge 
>> during
>> block migration bulk stage.
> Yes, that's fair;  it probably would also make sense to throttle the RAM
> migration during the block migration bulk stage, since the chances are
> it's not going to get far.  (I think in the nbd setup, the main
> migration process isn't started until the end of bulk).
 Catching up with the idea of delaying ram migration until block bulk has 
 completed.
 What do you think is the easiest way to achieve this?
>>> 
>>> 
>>> I think the answer depends whether we think this is a 'special' or we
>>> need a new general purpose mechanism.
>>> 
>>> If it was really general then we'd probably want to split the iterative
>>> stage in two somehow, and only do RAM in the second half.
>>> 
>>> But I'm not sure it's worth it; I suspect the easiest way is:
>>> 
>>>a) Add a counter in migration/ram.c or in the RAM state somewhere
>>>b) Make ram_save_inhibit increment the counter
>>>c) Check the counter at the head of ram_save_iterate and just exit
>>>  if it's non-0
>>>d) Call ram_save_inhibit from block_save_setup
>>>e) Then release it when you've finished the bulk stage
>>> 
>>> Make sure you still count the RAM in the pending totals, otherwise
>>> migration might think it's finished a bit early.
>> 
>> Is there any culprit I don't see or is it as easy as this?
> 
> Hmm, looks promising doesn't it;  might need an include or two tidied
> up, but looks worth a try.   Just be careful that there are no cases
> where block migration can't transfer data in that state, otherwise we'll
> keep coming back to here and spewing empty sections.

I already tested it and it actually works.

What would you expect to be cleaned up before it would be a proper patch?

Are there any implications with RDMA and/or post copy migration?
Is block migration possible at all with those?

Peter
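
For reference, a rough sketch of the counter-based a)-e) approach quoted
above.  The names below are hypothetical (nothing like this is in the tree);
the patch Peter actually tested, quoted later in this thread, simply checks
blk_mig_bulk_active() at the top of ram_save_iterate() instead.

    /* Hypothetical sketch of steps a)-e); all names are made up. */
    static int ram_save_inhibit_count;          /* a) counter in migration/ram.c */

    void ram_save_inhibit(void)                 /* b)+d) called from block_save_setup() */
    {
        ram_save_inhibit_count++;
    }

    void ram_save_uninhibit(void)               /* e) called when the bulk stage ends */
    {
        ram_save_inhibit_count--;
    }

    static int ram_save_iterate(QEMUFile *f, void *opaque)
    {
        if (ram_save_inhibit_count) {           /* c) bail out early...               */
            qemu_put_be64(f, RAM_SAVE_FLAG_EOS);/* ...but keep the stream well-formed */
            return 0;
        }
        /* ... normal RAM iteration ... */
        return 0;
    }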



[Qemu-block] [ovirt-users] qcow2 images corruption

2018-02-07 Thread Nicolas Ecarnot

Hello,

TL; DR : qcow2 images keep getting corrupted. Any workaround?

Long version:
I have already started this discussion on the oVirt and on the 
qemu-block mailing lists, under similar circumstances, but I have learned 
more in the months since, so here is some information:


- We are using 2 oVirt 3.6.7.5-1.el7.centos datacenters, using CentOS 
7.{2,3} hosts

- Hosts :
  - CentOS 7.2 1511 :
- Kernel = 3.10.0 327
- KVM : 2.3.0-31
- libvirt : 1.2.17
- vdsm : 4.17.32-1
  - CentOS 7.3 1611 :
- Kernel 3.10.0 514
- KVM : 2.3.0-31
- libvirt 2.0.0-10
- vdsm : 4.17.32-1
- Our storage is 2 Equallogic SANs connected via iSCSI on a dedicated 
network
- Depending on the week, there are all in all around 32 hosts, 8 storage 
domains and, for various reasons, very few VMs (fewer than 200).
- One peculiar point is that most of our VMs get an additional 
dedicated network interface that is iSCSI-connected to some volumes of 
our SAN - these volumes are not part of the oVirt setup. That could 
lead to a lot of additional iSCSI traffic.


From time to time, a random VM is paused by oVirt.
Digging into the oVirt engine logs, then into the host vdsm logs, it 
appears that the host considers the qcow2 image corrupted.
In what I consider conservative behavior, vdsm stops any 
interaction with this image and marks the VM as paused.

Any attempt to unpause it leads to the same conservative pause.

After having found (https://access.redhat.com/solutions/1173623) the 
right logical volume hosting the qcow2 image, I can run qemu-img check 
on it.

- On 80% of my VMs, I find no errors.
- On 15% of them, I find "Leaked cluster" errors that I can correct using 
"qemu-img check -r all".
- On 5% of them, I find "Leaked cluster" errors plus further fatal errors 
which cannot be corrected with qemu-img.
In rare cases, qemu-img can correct them but destroys large parts of 
the image (it becomes unusable), and in other cases it cannot correct them 
at all.


Months ago, I already sent a similar message but the error message was 
about No space left on device 
(https://www.mail-archive.com/qemu-block@gnu.org/msg00110.html).


This time, I don't have this message about space, but only corruption.

I kept reading and found a similar discussion in the Proxmox group :
https://lists.ovirt.org/pipermail/users/2018-February/086750.html

https://forum.proxmox.com/threads/qcow2-corruption-after-snapshot-or-heavy-disk-i-o.32865/page-2

What I read similar to my case is :
- usage of qcow2
- heavy disk I/O
- using the virtio-blk driver

In the proxmox thread, they tend to say that using virtio-scsi is the 
solution. I have asked this question to oVirt experts 
(https://lists.ovirt.org/pipermail/users/2018-February/086753.html), but 
it's not clear whether the driver is to blame.


I agree with the answer Yaniv Kaul gave to me, saying I have to properly 
report the issue, so I would like to know which particular information I 
can give you now.


As you can imagine, all this setup is in production, and for most of the 
VMs I can not "play" with them. Moreover, we launched a campaign of 
nightly stopping every VM, running qemu-img check on each one, then booting it again.

So it might take some time before I find another corrupted image
(which I will carefully keep for debugging).

Other information: we very rarely take snapshots, but I can well 
imagine that automated migrations of VMs could trigger similar behavior 
on qcow2 images.


Last point about the versions we use: yes, they are old; yes, we're 
planning to upgrade, but we don't know when.


Regards,

--
Nicolas ECARNOT



Re: [Qemu-block] [Qemu-devel] [PATCH v2] iotests: 205: support luks format

2018-02-07 Thread Eric Blake

On 02/07/2018 02:37 PM, Eric Blake wrote:

On 02/06/2018 12:57 PM, Eric Blake wrote:

On 02/06/2018 12:26 PM, Daniel P. Berrangé wrote:
On Tue, Feb 06, 2018 at 09:25:07PM +0300, Vladimir 
Sementsov-Ogievskiy wrote:

Support default luks options in VM.add_drive and in new library
function qemu_img_create. Use it in 205 iotests.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---


Reviewed-by: Daniel P. Berrange 


Thanks. I'll take this through my NBD queue.

git git://repo.or.cz/qemu/ericb.git nbd



I'm seeing this failure now :(

$ ./check -luks 205
QEMU  -- "/home/eblake/qemu/x86_64-softmmu/qemu-system-x86_64" 
-nodefaults -machine accel=qtest

QEMU_IMG  -- "/home/eblake/qemu/qemu-img"
QEMU_IO   -- "/home/eblake/qemu/qemu-io"  --cache writeback
QEMU_NBD  -- "/home/eblake/qemu/qemu-nbd"
IMGFMT    -- luks (iter-time=10)
IMGPROTO  -- file
PLATFORM  -- Linux/x86_64 red 4.14.16-300.fc27.x86_64
TEST_DIR  -- /home/eblake/qemu/tests/qemu-iotests/scratch
SOCKET_SCM_HELPER -- /home/eblake/qemu/tests/qemu-iotests/socket_scm_helper

205 [failed, exit status 1] - output mismatch (see 205.out.bad)
--- /home/eblake/qemu/tests/qemu-iotests/205.out    2018-02-07 09:48:13.346107367 -0600
+++ /home/eblake/qemu/tests/qemu-iotests/205.out.bad    2018-02-07 14:35:21.859890826 -0600
@@ -1,5 +1,159 @@
-...
+qemu-img: /home/eblake/qemu/tests/qemu-iotests/scratch/disk: Failed to get "write" lock
+Is another process using the image?


Perhaps a false alarm due to a stale qemu-system-x86 process left over 
from an earlier aborted test run.  When I retried on a fresh system, the 
test passed for me.


--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org



Re: [Qemu-block] [Qemu-devel] [PATCH v2] iotests: 205: support luks format

2018-02-07 Thread Eric Blake

On 02/06/2018 12:57 PM, Eric Blake wrote:

On 02/06/2018 12:26 PM, Daniel P. Berrangé wrote:
On Tue, Feb 06, 2018 at 09:25:07PM +0300, Vladimir Sementsov-Ogievskiy 
wrote:

Support default luks options in VM.add_drive and in new library
function qemu_img_create. Use it in 205 iotests.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---


Reviewed-by: Daniel P. Berrange 


Thanks. I'll take this through my NBD queue.

git git://repo.or.cz/qemu/ericb.git nbd



I'm seeing this failure now :(

$ ./check -luks 205
QEMU  -- "/home/eblake/qemu/x86_64-softmmu/qemu-system-x86_64" 
-nodefaults -machine accel=qtest

QEMU_IMG  -- "/home/eblake/qemu/qemu-img"
QEMU_IO   -- "/home/eblake/qemu/qemu-io"  --cache writeback
QEMU_NBD  -- "/home/eblake/qemu/qemu-nbd"
IMGFMT-- luks (iter-time=10)
IMGPROTO  -- file
PLATFORM  -- Linux/x86_64 red 4.14.16-300.fc27.x86_64
TEST_DIR  -- /home/eblake/qemu/tests/qemu-iotests/scratch
SOCKET_SCM_HELPER -- /home/eblake/qemu/tests/qemu-iotests/socket_scm_helper

205 [failed, exit status 1] - output mismatch (see 205.out.bad)
--- /home/eblake/qemu/tests/qemu-iotests/205.out	2018-02-07 09:48:13.346107367 -0600
+++ /home/eblake/qemu/tests/qemu-iotests/205.out.bad	2018-02-07 14:35:21.859890826 -0600
@@ -1,5 +1,159 @@
-...
+qemu-img: /home/eblake/qemu/tests/qemu-iotests/scratch/disk: Failed to get "write" lock
+Is another process using the image?
+qemu-img: /home/eblake/qemu/tests/qemu-iotests/scratch/disk: Failed to get "write" lock
+Is another process using the image?
+qemu-img: /home/eblake/qemu/tests/qemu-iotests/scratch/disk: Failed to get "write" lock
+Is another process using the image?
+qemu-img: /home/eblake/qemu/tests/qemu-iotests/scratch/disk: Failed to get "write" lock
+Is another process using the image?
+qemu-img: /home/eblake/qemu/tests/qemu-iotests/scratch/disk: Failed to get "write" lock
+Is another process using the image?
+qemu-img: /home/eblake/qemu/tests/qemu-iotests/scratch/disk: Failed to get "write" lock
+Is another process using the image?
+qemu-img: /home/eblake/qemu/tests/qemu-iotests/scratch/disk: Failed to get "write" lock
+Is another process using the image?
+EEE
+==
+ERROR: test_connect_after_remove_default (__main__.TestNbdServerRemove)
+--
+Traceback (most recent call last):
+  File "205", line 37, in setUp
+self.vm.launch()
+  File "/home/eblake/qemu/tests/qemu-iotests/../../scripts/qemu.py", 
line 221, in launch

+self._launch()
+  File "/home/eblake/qemu/tests/qemu-iotests/../../scripts/qemu.py", 
line 244, in _launch

+self._post_launch()
+  File "/home/eblake/qemu/tests/qemu-iotests/../../scripts/qtest.py", 
line 100, in _post_launch

+super(QEMUQtestMachine, self)._post_launch()
+  File "/home/eblake/qemu/tests/qemu-iotests/../../scripts/qemu.py", 
line 196, in _post_launch

+self._qmp.accept()
+  File "/home/eblake/qemu/tests/qemu-iotests/../../scripts/qmp/qmp.py", 
line 157, in accept

+return self.__negotiate_capabilities()
+  File "/home/eblake/qemu/tests/qemu-iotests/../../scripts/qmp/qmp.py", 
line 73, in __negotiate_capabilities

+raise QMPConnectError
+QMPConnectError
...

--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org





Re: [Qemu-block] Block Migration and CPU throttling

2018-02-07 Thread Dr. David Alan Gilbert
* Peter Lieven (p...@kamp.de) wrote:
> Am 12.12.2017 um 18:05 schrieb Dr. David Alan Gilbert:
> > * Peter Lieven (p...@kamp.de) wrote:
> > > Am 21.09.2017 um 14:36 schrieb Dr. David Alan Gilbert:
> > > > * Peter Lieven (p...@kamp.de) wrote:
> > > > > Am 19.09.2017 um 16:41 schrieb Dr. David Alan Gilbert:
> > > > > > * Peter Lieven (p...@kamp.de) wrote:
> > > > > > > Am 19.09.2017 um 16:38 schrieb Dr. David Alan Gilbert:
> > > > > > > > * Peter Lieven (p...@kamp.de) wrote:
> > > > > > > > > Hi,
> > > > > > > > > 
> > > > > > > > > I just noticed that CPU throttling and Block Migration don't 
> > > > > > > > > work together very well.
> > > > > > > > > During block migration the throttling heuristic detects that 
> > > > > > > > > we obviously make no progress
> > > > > > > > > in ram transfer. But the reason is the running block 
> > > > > > > > > migration and not a too high dirty pages rate.
> > > > > > > > > 
> > > > > > > > > The result is that any VM is throttled by 99% during block 
> > > > > > > > > migration.
> > > > > > > > Hmm that's unfortunate; do you have a bandwidth set lower than 
> > > > > > > > your
> > > > > > > > actual network connection? I'm just wondering if it's actually 
> > > > > > > > going
> > > > > > > > between the block and RAM iterative sections or getting stuck 
> > > > > > > > in ne.
> > > > > > > It happens also if source and dest are on the same machine and 
> > > > > > > speed is set to 100G.
> > > > > > But does it happen if they're not and the speed is set low?
> > > > > Yes, it does. I noticed it in our test environment between different 
> > > > > nodes with a 10G
> > > > > link in between. But its totally clear why it happens. During block 
> > > > > migration we transfer
> > > > > all dirty memory pages in each round (if there is moderate memory 
> > > > > load), but all dirty
> > > > > pages are obviously more than 50% of the transferred ram in that 
> > > > > round.
> > > > > It is more exactly 100%. But the current logic triggers on this 
> > > > > condition.
> > > > > 
> > > > > I think I will go forward and send a patch which disables auto 
> > > > > converge during
> > > > > block migration bulk stage.
> > > > Yes, that's fair;  it probably would also make sense to throttle the RAM
> > > > migration during the block migration bulk stage, since the chances are
> > > > it's not going to get far.  (I think in the nbd setup, the main
> > > > migration process isn't started until the end of bulk).
> > > Catching up with the idea of delaying ram migration until block bulk has 
> > > completed.
> > > What do you think is the easiest way to achieve this?
> > 
> > 
> > I think the answer depends whether we think this is a 'special' or we
> > need a new general purpose mechanism.
> > 
> > If it was really general then we'd probably want to split the iterative
> > stage in two somehow, and only do RAM in the second half.
> > 
> > But I'm not sure it's worth it; I suspect the easiest way is:
> > 
> > a) Add a counter in migration/ram.c or in the RAM state somewhere
> > b) Make ram_save_inhibit increment the counter
> > c) Check the counter at the head of ram_save_iterate and just exit
> >   if it's non-0
> > d) Call ram_save_inhibit from block_save_setup
> > e) Then release it when you've finished the bulk stage
> > 
> > Make sure you still count the RAM in the pending totals, otherwise
> > migration might think it's finished a bit early.
> 
> Is there any culprit I don't see or is it as easy as this?

Hmm, looks promising doesn't it;  might need an include or two tidied
up, but looks worth a try.   Just be careful that there are no cases
where block migration can't transfer data in that state, otherwise we'll
keep coming back to here and spewing empty sections.

Dave

> diff --git a/migration/ram.c b/migration/ram.c
> index cb1950f..c67bcf1 100644
> --- a/migration/ram.c
> +++ b/migration/ram.c
> @@ -2255,6 +2255,13 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
>  int64_t t0;
>  int done = 0;
> 
> +    if (blk_mig_bulk_active()) {
> +    /* Avoid transferring RAM during bulk phase of block migration as
> + * the bulk phase will usually take a lot of time and transferring
> + * RAM updates again and again is pointless. */
> +    goto out;
> +    }
> +
>  rcu_read_lock();
>  if (ram_list.version != rs->last_version) {
>  ram_state_reset(rs);
> @@ -2301,6 +2308,7 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
>   */
>  ram_control_after_iterate(f, RAM_CONTROL_ROUND);
> 
> +out:
>  qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
>  ram_counters.transferred += 8;
> 
> 
> Peter
> 
--
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK



[Qemu-block] [PATCH 2/2] scsi: add block job opblockers for scsi-block

2018-02-07 Thread Paolo Bonzini
scsi-block bypasses the dirty bitmaps and pre-write notifiers, so it
cannot be the source of a block job.  The gist of the fix is to add
op-blockers to the BlockBackend, and remove them at "unrealize" time,
but things are a little more complex because quit closes the BlockBackend
without going through unrealize.

So use Notifiers: the remove_bs notifier is called by bdrv_close_all, and
the insert_bs notifier might not be really necessary but makes things a
little more symmetric.

Suggested-by: Karen Noel 
Signed-off-by: Paolo Bonzini 
---
 block/block-backend.c  |  9 ++
 hw/scsi/scsi-disk.c| 62 ++
 include/sysemu/block-backend.h |  1 +
 3 files changed, 72 insertions(+)

diff --git a/block/block-backend.c b/block/block-backend.c
index baef8e7abc..1759639a4a 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1747,6 +1747,15 @@ bool blk_op_is_blocked(BlockBackend *blk, BlockOpType op, Error **errp)
 return bdrv_op_is_blocked(bs, op, errp);
 }
 
+void blk_op_block(BlockBackend *blk, BlockOpType op, Error *reason)
+{
+BlockDriverState *bs = blk_bs(blk);
+
+if (bs) {
+bdrv_op_block(bs, op, reason);
+}
+}
+
 void blk_op_unblock(BlockBackend *blk, BlockOpType op, Error *reason)
 {
 BlockDriverState *bs = blk_bs(blk);
diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
index 49d2559d93..023673cb04 100644
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -2578,9 +2578,39 @@ static int get_device_type(SCSIDiskState *s)
 return 0;
 }
 
+typedef struct SCSIBlockState {
+SCSIDiskState sd;
+Error *mirror_source;
+Error *backup_source;
+Error *commit_source;
+Notifier insert_bs;
+Notifier remove_bs;
+} SCSIBlockState;
+
+static void scsi_block_insert_bs(Notifier *n, void *opaque)
+{
+SCSIBlockState *sb = container_of(n, SCSIBlockState, insert_bs);
+SCSIDiskState *s = &sb->sd;
+
+blk_op_block(s->qdev.conf.blk, BLOCK_OP_TYPE_MIRROR_SOURCE, sb->mirror_source);
+blk_op_block(s->qdev.conf.blk, BLOCK_OP_TYPE_COMMIT_SOURCE, sb->commit_source);
+blk_op_block(s->qdev.conf.blk, BLOCK_OP_TYPE_BACKUP_SOURCE, sb->backup_source);
+}
+
+static void scsi_block_remove_bs(Notifier *n, void *opaque)
+{
+SCSIBlockState *sb = container_of(n, SCSIBlockState, remove_bs);
+SCSIDiskState *s = &sb->sd;
+
+blk_op_unblock(s->qdev.conf.blk, BLOCK_OP_TYPE_MIRROR_SOURCE, sb->mirror_source);
+blk_op_unblock(s->qdev.conf.blk, BLOCK_OP_TYPE_COMMIT_SOURCE, sb->commit_source);
+blk_op_unblock(s->qdev.conf.blk, BLOCK_OP_TYPE_BACKUP_SOURCE, sb->backup_source);
+}
+
 static void scsi_block_realize(SCSIDevice *dev, Error **errp)
 {
 SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, dev);
+SCSIBlockState *sb = DO_UPCAST(SCSIBlockState, sd, s);
 int sg_version;
 int rc;
 
@@ -2626,6 +2656,36 @@ static void scsi_block_realize(SCSIDevice *dev, Error **errp)
 
 scsi_realize(&s->qdev, errp);
 scsi_generic_read_device_identification(&s->qdev);
+
+/* For op blockers, due to lack of support for dirty bitmaps.  */
+error_setg(&sb->mirror_source,
+   "scsi-block does not support acting as a mirroring source");
+error_setg(&sb->commit_source,
+   "scsi-block does not support acting as an active commit 
source");
+
+/* For op blockers, due to lack of support for write notifiers.  */
+error_setg(&sb->backup_source,
+   "scsi-block does not support acting as a backup source");
+
+sb->insert_bs.notify = scsi_block_insert_bs;
+blk_add_insert_bs_notifier(s->qdev.conf.blk, &sb->insert_bs);
+sb->remove_bs.notify = scsi_block_remove_bs;
+blk_add_remove_bs_notifier(s->qdev.conf.blk, &sb->remove_bs);
+
+scsi_block_insert_bs(&sb->insert_bs, s->qdev.conf.blk);
+}
+
+static void scsi_block_unrealize(SCSIDevice *dev, Error **errp)
+{
+SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, dev);
+SCSIBlockState *sb = DO_UPCAST(SCSIBlockState, sd, s);
+
+notifier_remove(&sb->insert_bs);
+notifier_remove(&sb->remove_bs);
+scsi_block_remove_bs(&sb->insert_bs, s->qdev.conf.blk);
+error_free(sb->mirror_source);
+error_free(sb->commit_source);
+error_free(sb->backup_source);
 }
 
 typedef struct SCSIBlockReq {
@@ -3017,6 +3077,7 @@ static void scsi_block_class_initfn(ObjectClass *klass, void *data)
 SCSIDiskClass *sdc = SCSI_DISK_BASE_CLASS(klass);
 
 sc->realize  = scsi_block_realize;
+sc->unrealize= scsi_block_unrealize;
 sc->alloc_req= scsi_block_new_request;
 sc->parse_cdb= scsi_block_parse_cdb;
 sdc->dma_readv   = scsi_block_dma_readv;
@@ -3031,6 +3092,7 @@ static const TypeInfo scsi_block_info = {
 .name  = "scsi-block",
 .parent= TYPE_SCSI_DISK_BASE,
 .class_init= scsi_block_class_initfn,
+.instance_size = sizeof(SCSIBlockState),
 };
 #endif
 
diff --git a/include/sysemu/block-backend.h b/include/sy

[Qemu-block] [PATCH 0/2] scsi: add block job opblockers for scsi-block

2018-02-07 Thread Paolo Bonzini
SCSI passthrough bypasses the block layer and issues SCSI commands
directly to the disk.  This breaks write notifiers and dirty bitmaps,
so that scsi-block devices cannot act as a mirror or backup source
(and commit too, even though that shouldn't be possible at all in the
lack of a backing file).  This series adds op blockers for that purpose.

There is currently a blk_op_unblock but no blk_op_block, so patch 2
adds it.

Paolo

Paolo Bonzini (2):
  scsi: add unrealize method for SCSI devices
  scsi: add block job opblockers for scsi-block

 block/block-backend.c  |  9 ++
 hw/scsi/scsi-bus.c |  4 +++
 hw/scsi/scsi-disk.c| 62 ++
 include/hw/scsi/scsi.h |  1 +
 include/sysemu/block-backend.h |  1 +
 5 files changed, 77 insertions(+)

-- 
2.14.3




[Qemu-block] [PATCH] block: early check for blockers on drive-mirror

2018-02-07 Thread Paolo Bonzini
Even if an op blocker is present for BLOCK_OP_TYPE_MIRROR_SOURCE,
it is checked a bit late and the result is that the target is
created even if drive-mirror subsequently fails.  Add an early
check to avoid this.

Signed-off-by: Paolo Bonzini 
---
 blockdev.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/blockdev.c b/blockdev.c
index 8e977eef11..c7e2e0a00e 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -3565,6 +3565,11 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp)
 return;
 }
 
+/* Early check to avoid creating target */
+if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_MIRROR_SOURCE, errp)) {
+return;
+}
+
 aio_context = bdrv_get_aio_context(bs);
 aio_context_acquire(aio_context);
 
-- 
2.14.3




[Qemu-block] [PATCH 1/2] scsi: add unrealize method for SCSI devices

2018-02-07 Thread Paolo Bonzini
The next patch will introduce a different unrealize implementation
for scsi-block.  Compared to the code before commit fb7b5c0df6
("scsi: devirtualize unrealize of SCSI devices", 2014-10-31), the
common code for all SCSI devices is kept in scsi-bus.c.

Signed-off-by: Paolo Bonzini 
---
 hw/scsi/scsi-bus.c | 4 
 include/hw/scsi/scsi.h | 1 +
 2 files changed, 5 insertions(+)

diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c
index 05e501efd3..a0790438f0 100644
--- a/hw/scsi/scsi-bus.c
+++ b/hw/scsi/scsi-bus.c
@@ -212,12 +212,16 @@ static void scsi_qdev_realize(DeviceState *qdev, Error **errp)
 static void scsi_qdev_unrealize(DeviceState *qdev, Error **errp)
 {
 SCSIDevice *dev = SCSI_DEVICE(qdev);
+SCSIDeviceClass *sc = SCSI_DEVICE_GET_CLASS(dev);
 
 if (dev->vmsentry) {
 qemu_del_vm_change_state_handler(dev->vmsentry);
 }
 
 scsi_device_purge_requests(dev, SENSE_CODE(NO_SENSE));
+if (sc->unrealize) {
+sc->unrealize(dev, errp);
+}
 blockdev_mark_auto_del(dev->conf.blk);
 }
 
diff --git a/include/hw/scsi/scsi.h b/include/hw/scsi/scsi.h
index 802a647cdc..569a4b9d14 100644
--- a/include/hw/scsi/scsi.h
+++ b/include/hw/scsi/scsi.h
@@ -59,6 +59,7 @@ struct SCSIRequest {
 typedef struct SCSIDeviceClass {
 DeviceClass parent_class;
 void (*realize)(SCSIDevice *dev, Error **errp);
+void (*unrealize)(SCSIDevice *dev, Error **errp);
 int (*parse_cdb)(SCSIDevice *dev, SCSICommand *cmd, uint8_t *buf,
  void *hba_private);
 SCSIRequest *(*alloc_req)(SCSIDevice *s, uint32_t tag, uint32_t lun,
-- 
2.14.3





[Qemu-block] [PATCH v10 10/12] migration: add postcopy migration of dirty bitmaps

2018-02-07 Thread Vladimir Sementsov-Ogievskiy
Postcopy migration of dirty bitmaps. Only named dirty bitmaps are migrated.

If the destination QEMU already contains a dirty bitmap with the same name
as a migrated bitmap (for the same node), then, if their granularities are
the same, the migration will proceed; otherwise an error will be generated.

If the destination QEMU doesn't contain such a bitmap, it will be created.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 include/migration/misc.h   |   3 +
 migration/migration.h  |   3 +
 migration/block-dirty-bitmap.c | 737 +
 migration/migration.c  |   5 +
 migration/savevm.c |   2 +
 vl.c   |   1 +
 migration/Makefile.objs|   1 +
 migration/trace-events |  14 +
 8 files changed, 766 insertions(+)
 create mode 100644 migration/block-dirty-bitmap.c

diff --git a/include/migration/misc.h b/include/migration/misc.h
index 77fd4f587c..4ebf24c6c2 100644
--- a/include/migration/misc.h
+++ b/include/migration/misc.h
@@ -56,4 +56,7 @@ bool migration_has_failed(MigrationState *);
 bool migration_in_postcopy_after_devices(MigrationState *);
 void migration_global_dump(Monitor *mon);
 
+/* migration/block-dirty-bitmap.c */
+void dirty_bitmap_mig_init(void);
+
 #endif
diff --git a/migration/migration.h b/migration/migration.h
index 861cdfaa96..79f72b7e50 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -233,4 +233,7 @@ void migrate_send_rp_pong(MigrationIncomingState *mis,
 void migrate_send_rp_req_pages(MigrationIncomingState *mis, const char* rbname,
   ram_addr_t start, size_t len);
 
+void dirty_bitmap_mig_before_vm_start(void);
+void init_dirty_bitmap_incoming_migration(void);
+
 #endif
diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c
new file mode 100644
index 00..5b41f7140d
--- /dev/null
+++ b/migration/block-dirty-bitmap.c
@@ -0,0 +1,737 @@
+/*
+ * Block dirty bitmap postcopy migration
+ *
+ * Copyright IBM, Corp. 2009
+ * Copyright (c) 2016-2017 Virtuozzo International GmbH. All rights reserved.
+ *
+ * Authors:
+ *  Liran Schour   
+ *  Vladimir Sementsov-Ogievskiy 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ * This file is derived from migration/block.c, so its author and IBM 
copyright
+ * are here, although content is quite different.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ *
+ ****
+ *
+ * Here postcopy migration of dirty bitmaps is realized. Only QMP-addressable
+ * bitmaps are migrated.
+ *
+ * Bitmap migration implies creating bitmap with the same name and granularity
+ * in destination QEMU. If the bitmap with the same name (for the same node)
+ * already exists on destination an error will be generated.
+ *
+ * format of migration:
+ *
+ * # Header (shared for different chunk types)
+ * 1, 2 or 4 bytes: flags (see qemu_{put,get}_flags)
+ * [ 1 byte: node name size ] \  flags & DEVICE_NAME
+ * [ n bytes: node name ] /
+ * [ 1 byte: bitmap name size ] \  flags & BITMAP_NAME
+ * [ n bytes: bitmap name ] /
+ *
+ * # Start of bitmap migration (flags & START)
+ * header
+ * be64: granularity
+ * 1 byte: bitmap flags (corresponds to BdrvDirtyBitmap)
+ *   bit 0-  bitmap is enabled
+ *   bit 1-  bitmap is persistent
+ *   bit 2-  bitmap is autoloading
+ *   bits 3-7 - reserved, must be zero
+ *
+ * # Complete of bitmap migration (flags & COMPLETE)
+ * header
+ *
+ * # Data chunk of bitmap migration
+ * header
+ * be64: start sector
+ * be32: number of sectors
+ * [ be64: buffer size  ] \ ! (flags & ZEROES)
+ * [ n bytes: buffer] /
+ *
+ * The last chunk in the stream should contain flags & EOS. The chunk may skip
+ * device and/or bitmap names, assuming them to be the same as in the previous
+ * chunk.
+ */
+
+#include "qemu/osdep.h"
+#include "block/block.h"
+#include "block/block_int.h"
+#include "sysemu/block-backend.h"
+#include "qemu/main-loop.h"
+#include "qemu/error-report.h"
+#include "migration/misc.h"
+#include "migration/migration.h"
+#include "migration/qemu-file.h"
+#include "migration/vmstate.h"
+#include "migration/register.h"
+#include "qemu/hbitmap.h"
+#include "sysemu/sysemu.h"
+#include "qemu/cutils.h"
+#include "qapi/error.h"
+#include "trace.h"
+
+#define CHUNK_SIZE (1 << 10)
+
+/* Flags occupy one, two or four bytes (Big Endian). The size is determined as
+ * follows:
+ * in first (most significant) byte bit 8 is clear  -->  one byte
+ * in first byte bit 8 is set-->  two or four bytes, depending on second
+ *byte:
+ *| in second byte bit 8 is clear  -->  two bytes
+ *| in second byte bit 8 is set-->  four bytes
+ */
+#define DIRTY_BITMAP_MIG_FLAG_EOS   0x01
+#define DIRTY_BITMAP_MIG_FLAG_ZEROES
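
As an illustration of the chunk format documented in the comment above, here is
a simplified, hedged sketch of emitting a header; it assumes the flag constants
DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME/DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME and the
qemu_put_counted_string() helper introduced elsewhere in this series, and it is
not the patch's actual send path:

/* Sketch: emit a one-byte-flags chunk header per the documented format.
 * The real patch also supports 2- and 4-byte flag encodings. */
static void example_send_bitmap_header(QEMUFile *f, uint32_t flags,
                                       const char *node_name,
                                       const char *bitmap_name)
{
    qemu_put_byte(f, flags);

    if (flags & DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME) {
        qemu_put_counted_string(f, node_name);   /* 1 byte size + name */
    }
    if (flags & DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME) {
        qemu_put_counted_string(f, bitmap_name); /* 1 byte size + name */
    }
}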

[Qemu-block] [PATCH v10 00/12] Dirty bitmaps postcopy migration

2018-02-07 Thread Vladimir Sementsov-Ogievskiy
Hi all!

There is a new version of dirty bitmap postcopy migration series.

Now it is based on Max's block tree: 
https://github.com/XanClic/qemu/commits/block,
where it needs only one patch: "block: maintain persistent disabled bitmaps",
but I hope it is near to be merged.

v10

clone: tag postcopy-v10 from https://src.openvz.org/scm/~vsementsov/qemu.git
online: 
https://src.openvz.org/users/vsementsov/repos/qemu/browse?at=postcopy-v10

01,02: r-b Fam
03: adjust comments about locking
04: fixed 124 iotest (was broken because of small mistake in 
block/dirty-bitmap.c)
05: rebased on master, staff from migration_thread is moved to 
migration_iteration_run, so
drop r-b by John and Juan
06: 2.11->2.12, r-b Fam
07,08,09,: r-b Fam

10: move to device names instead of node names, looks like libvirt doesn't care
about same node-names.
flag AUTOLOAD is ignored for now
use QEMU_ALIGN_UP and DIV_ROUND_UP
skip automatically inserted nodes, when search for dirty bitmaps
allow migration when there are no bitmaps (see the new logic in
   dirty_bitmap_load_header with the 'nothing' variable, which avoids
extra errors)
handle return code of dirty_bitmap_load_header
avoid iteration if there are no bitmaps (see new .no_bitmaps field of 
 dirty_bitmap_mig_state)
call dirty_bitmap_mig_before_vm_start from process_incoming_migration_bh 
too,
to enable bitmaps in case of postcopy not actually started.
11: not add r-b Fam
tiny reorganisation of do_test_migration parameters: remove useless default
values and make shared_storage to be the last
disable shared storage test for now, until it will be fixed (it will be 
separate
series, more related to qcow2 than to migration)
12: r-b Fam

also, "iotests: add default node-name" is dropped, as not more needed.


v9

clone: tag postcopy-v9 from https://src.openvz.org/scm/~vsementsov/qemu.git
online: https://src.openvz.org/users/vsementsov/repos/qemu/browse?at=postcopy-v9

01: r-b John
02: was incomplete, now add here bdrv_reclaim_dirty_bitmap fix
03: new
04: new
05: r-b John
07: fix type in commit message, r-b John
09: add comment about is_active_iterate, r-b Snow and keep Juan's r-b, hope 
comment is ok
10: change copyright to Virtuozzo
reword comment at the top of the file
rewrite init_dirty_bitmap_migration, to not do same things twice (John)
  and skip _only_ unnamed bitmaps, error out for unnamed nodes (John)
use new "locked" state of bitmaps instead of frozen on source vm
do not support migrating bitmap to existent one with the same name,
  keep only create-new-bitmap way
break loop in dirty_bitmap_load_complete when bitmap is found
use bitmap locking instead of context acquire
12: rewrite, to add more cases. (note, that 169 iotest is also in my
"[PATCH v2 0/3] fix bitmaps migration through shared storage", which 
probably should
go to qemu-stable. So this patch should rewrite it, but here I make it like 
new patch,
to simplify review. When "[PATCH v2..." merged I'll rebase this on it), 
drop r-b
13: move to separate test, drop r-b


v8.1

clone: tag postcopy-v8.1 from https://src.openvz.org/scm/~vsementsov/qemu.git
online: 
https://src.openvz.org/users/vsementsov/repos/qemu/browse?at=postcopy-v8.1

05: fix compilation, add new version for cmma_save_pending too.


v8

clone: tag postcopy-v8 from https://src.openvz.org/scm/~vsementsov/qemu.git
online: https://src.openvz.org/users/vsementsov/repos/qemu/browse?at=postcopy-v8

- rebased on master
- patches 01-03 from v7 are already merged to master
- patch order is changed to make it possible to merge block/dirty-bitmap patches
  in separate if is needed
01: new patch
03: fixed to use _locked version of bdrv_release_dirty_bitmap
06: qapi-schema.json -> qapi/migration.json
2.9 -> 2.11
10: protocol changed a bit:
  instead of 1 byte "bitmap enabled flag" this byte becomes just "flags"
  and have "enabled", "persistent" and "autoloading" flags inside.
  also, make all migrated bitmaps to be not persistent (to prevent their
  storing on source vm)
14: new patch


patches status:
01-04 - are only about block/dirty-bitmap and have no r-b. Fam, John, Paolo 
(about bitmap lock),
please take a look. These patches are ok to be merged separately (but before 
05-14)
other patches are about migration
05-09 has Juan's r-b (and some of them has John's and Eric's r-bs)
10 - the main patch (dirty bitmaps migration), has no r-b.
11 - preparation for tests, not related to migration directly, has Max's r-b, 
ok to be merged
separately (but before 12-14)
12-14 - tests, 12 and 13 have Max's r-b, 14 is new


v7

clone: tag postcopy-v7 from https://src.openvz.org/scm/~vsementsov/qemu.git
online: https://src.openvz.org/users/vsementsov/repos/qemu/browse?at=postcopy-v7

- rebased on dirty-bitmap byte-based interfaces
(based on git://repo.or.cz/qemu/ericb.git branch nbd-byte-dirty-v4)
- migration o

[Qemu-block] [PATCH v10 06/12] qapi: add dirty-bitmaps migration capability

2018-02-07 Thread Vladimir Sementsov-Ogievskiy
Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: John Snow 
Reviewed-by: Eric Blake 
Reviewed-by: Juan Quintela 
Reviewed-by: Fam Zheng 
---
 qapi/migration.json   | 6 +-
 migration/migration.h | 1 +
 migration/migration.c | 9 +
 3 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/qapi/migration.json b/qapi/migration.json
index 4cd3d13158..6d8181c45f 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -352,12 +352,16 @@
 #
 # @x-multifd: Use more than one fd for migration (since 2.11)
 #
+# @dirty-bitmaps: If enabled, QEMU will migrate named dirty bitmaps.
+# (since 2.12)
+#
 # Since: 1.2
 ##
 { 'enum': 'MigrationCapability',
   'data': ['xbzrle', 'rdma-pin-all', 'auto-converge', 'zero-blocks',
'compress', 'events', 'postcopy-ram', 'x-colo', 'release-ram',
-   'block', 'return-path', 'pause-before-switchover', 'x-multifd' ] }
+   'block', 'return-path', 'pause-before-switchover', 'x-multifd',
+   'dirty-bitmaps' ] }
 
 ##
 # @MigrationCapabilityStatus:
diff --git a/migration/migration.h b/migration/migration.h
index 786d971ce2..861cdfaa96 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -203,6 +203,7 @@ bool migrate_postcopy(void);
 bool migrate_release_ram(void);
 bool migrate_postcopy_ram(void);
 bool migrate_zero_blocks(void);
+bool migrate_dirty_bitmaps(void);
 
 bool migrate_auto_converge(void);
 bool migrate_use_multifd(void);
diff --git a/migration/migration.c b/migration/migration.c
index 3beedd676e..d683c3d693 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1543,6 +1543,15 @@ int migrate_decompress_threads(void)
 return s->parameters.decompress_threads;
 }
 
+bool migrate_dirty_bitmaps(void)
+{
+MigrationState *s;
+
+s = migrate_get_current();
+
+return s->enabled_capabilities[MIGRATION_CAPABILITY_DIRTY_BITMAPS];
+}
+
 bool migrate_use_events(void)
 {
 MigrationState *s;
-- 
2.11.1
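
A hedged illustration of how migration code can gate on the new capability: a
SaveVMHandlers .is_active-style callback that keeps the dirty-bitmap state
inactive unless 'dirty-bitmaps' was enabled. The callback name is made up here,
and the series' real callback also checks whether any bitmaps exist:

/* Sketch: stay inactive unless the 'dirty-bitmaps' capability is enabled. */
static bool example_dirty_bitmap_is_active(void *opaque)
{
    return migrate_dirty_bitmaps();
}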




[Qemu-block] [PATCH v10 02/12] block/dirty-bitmap: fix locking in bdrv_reclaim_dirty_bitmap

2018-02-07 Thread Vladimir Sementsov-Ogievskiy
Like other setters here these functions should take a lock.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Fam Zheng 
---
 block/dirty-bitmap.c | 85 
 1 file changed, 53 insertions(+), 32 deletions(-)

diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c
index 0d0e807216..75435f6c2f 100644
--- a/block/dirty-bitmap.c
+++ b/block/dirty-bitmap.c
@@ -242,6 +242,51 @@ void bdrv_dirty_bitmap_enable_successor(BdrvDirtyBitmap 
*bitmap)
 qemu_mutex_unlock(bitmap->mutex);
 }
 
+/* Called within bdrv_dirty_bitmap_lock..unlock */
+static void bdrv_do_release_matching_dirty_bitmap_locked(
+BlockDriverState *bs, BdrvDirtyBitmap *bitmap,
+bool (*cond)(BdrvDirtyBitmap *bitmap))
+{
+BdrvDirtyBitmap *bm, *next;
+
+QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
+if ((!bitmap || bm == bitmap) && (!cond || cond(bm))) {
+assert(!bm->active_iterators);
+assert(!bdrv_dirty_bitmap_frozen(bm));
+assert(!bm->meta);
+QLIST_REMOVE(bm, list);
+hbitmap_free(bm->bitmap);
+g_free(bm->name);
+g_free(bm);
+
+if (bitmap) {
+return;
+}
+}
+}
+
+if (bitmap) {
+abort();
+}
+}
+
+/* Called with BQL taken.  */
+static void bdrv_do_release_matching_dirty_bitmap(
+BlockDriverState *bs, BdrvDirtyBitmap *bitmap,
+bool (*cond)(BdrvDirtyBitmap *bitmap))
+{
+bdrv_dirty_bitmaps_lock(bs);
+bdrv_do_release_matching_dirty_bitmap_locked(bs, bitmap, cond);
+bdrv_dirty_bitmaps_unlock(bs);
+}
+
+/* Called within bdrv_dirty_bitmap_lock..unlock */
+static void bdrv_release_dirty_bitmap_locked(BlockDriverState *bs,
+ BdrvDirtyBitmap *bitmap)
+{
+bdrv_do_release_matching_dirty_bitmap_locked(bs, bitmap, NULL);
+}
+
 /**
  * For a bitmap with a successor, yield our name to the successor,
  * delete the old bitmap, and return a handle to the new bitmap.
@@ -281,7 +326,11 @@ BdrvDirtyBitmap 
*bdrv_reclaim_dirty_bitmap(BlockDriverState *bs,
BdrvDirtyBitmap *parent,
Error **errp)
 {
-BdrvDirtyBitmap *successor = parent->successor;
+BdrvDirtyBitmap *successor;
+
+qemu_mutex_lock(parent->mutex);
+
+successor = parent->successor;
 
 if (!successor) {
 error_setg(errp, "Cannot reclaim a successor when none is present");
@@ -292,9 +341,11 @@ BdrvDirtyBitmap 
*bdrv_reclaim_dirty_bitmap(BlockDriverState *bs,
 error_setg(errp, "Merging of parent and successor bitmap failed");
 return NULL;
 }
-bdrv_release_dirty_bitmap(bs, successor);
+bdrv_release_dirty_bitmap_locked(bs, successor);
 parent->successor = NULL;
 
+qemu_mutex_unlock(parent->mutex);
+
 return parent;
 }
 
@@ -322,36 +373,6 @@ static bool bdrv_dirty_bitmap_has_name(BdrvDirtyBitmap 
*bitmap)
 }
 
 /* Called with BQL taken.  */
-static void bdrv_do_release_matching_dirty_bitmap(
-BlockDriverState *bs, BdrvDirtyBitmap *bitmap,
-bool (*cond)(BdrvDirtyBitmap *bitmap))
-{
-BdrvDirtyBitmap *bm, *next;
-bdrv_dirty_bitmaps_lock(bs);
-QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
-if ((!bitmap || bm == bitmap) && (!cond || cond(bm))) {
-assert(!bm->active_iterators);
-assert(!bdrv_dirty_bitmap_frozen(bm));
-assert(!bm->meta);
-QLIST_REMOVE(bm, list);
-hbitmap_free(bm->bitmap);
-g_free(bm->name);
-g_free(bm);
-
-if (bitmap) {
-goto out;
-}
-}
-}
-if (bitmap) {
-abort();
-}
-
-out:
-bdrv_dirty_bitmaps_unlock(bs);
-}
-
-/* Called with BQL taken.  */
 void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
 {
 bdrv_do_release_matching_dirty_bitmap(bs, bitmap, NULL);
-- 
2.11.1




[Qemu-block] [PATCH v10 04/12] dirty-bitmap: add locked state

2018-02-07 Thread Vladimir Sementsov-Ogievskiy
Add a special state in which QMP operations on the bitmap are disabled.
It is needed during bitmap migration. The "frozen" state is not
appropriate here, because it looks like the bitmap is unchanged.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 qapi/block-core.json |  5 -
 include/block/dirty-bitmap.h |  3 +++
 block/dirty-bitmap.c | 16 
 blockdev.c   | 19 +++
 4 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/qapi/block-core.json b/qapi/block-core.json
index 1d44c439e9..236586a5e1 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -426,10 +426,13 @@
 # @active: The bitmap is actively monitoring for new writes, and can be 
cleared,
 #  deleted, or used for backup operations.
 #
+# @locked: The bitmap is currently in-use by some operation and can not be
+#  cleared, deleted, or used for backup operations. (Since 2.12)
+#
 # Since: 2.4
 ##
 { 'enum': 'DirtyBitmapStatus',
-  'data': ['active', 'disabled', 'frozen'] }
+  'data': ['active', 'disabled', 'frozen', 'locked'] }
 
 ##
 # @BlockDirtyInfo:
diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h
index b2e1fd913a..9daaf38897 100644
--- a/include/block/dirty-bitmap.h
+++ b/include/block/dirty-bitmap.h
@@ -68,6 +68,8 @@ void bdrv_dirty_bitmap_deserialize_finish(BdrvDirtyBitmap 
*bitmap);
 void bdrv_dirty_bitmap_set_readonly(BdrvDirtyBitmap *bitmap, bool value);
 void bdrv_dirty_bitmap_set_persistance(BdrvDirtyBitmap *bitmap,
bool persistent);
+void bdrv_dirty_bitmap_set_qmp_locked(BdrvDirtyBitmap *bitmap, bool 
qmp_locked);
+
 
 /* Functions that require manual locking.  */
 void bdrv_dirty_bitmap_lock(BdrvDirtyBitmap *bitmap);
@@ -87,6 +89,7 @@ bool bdrv_dirty_bitmap_readonly(const BdrvDirtyBitmap 
*bitmap);
 bool bdrv_has_readonly_bitmaps(BlockDriverState *bs);
 bool bdrv_dirty_bitmap_get_autoload(const BdrvDirtyBitmap *bitmap);
 bool bdrv_dirty_bitmap_get_persistance(BdrvDirtyBitmap *bitmap);
+bool bdrv_dirty_bitmap_qmp_locked(BdrvDirtyBitmap *bitmap);
 bool bdrv_has_changed_persistent_bitmaps(BlockDriverState *bs);
 BdrvDirtyBitmap *bdrv_dirty_bitmap_next(BlockDriverState *bs,
 BdrvDirtyBitmap *bitmap);
diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c
index ce00ff3474..221bcb9996 100644
--- a/block/dirty-bitmap.c
+++ b/block/dirty-bitmap.c
@@ -40,6 +40,8 @@ struct BdrvDirtyBitmap {
 QemuMutex *mutex;
 HBitmap *bitmap;/* Dirty bitmap implementation */
 HBitmap *meta;  /* Meta dirty bitmap */
+bool qmp_locked;/* Bitmap is frozen, it can't be modified
+   through QMP */
 BdrvDirtyBitmap *successor; /* Anonymous child; implies frozen status */
 char *name; /* Optional non-empty unique ID */
 int64_t size;   /* Size of the bitmap, in bytes */
@@ -183,6 +185,18 @@ bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap)
 return bitmap->successor;
 }
 
+void bdrv_dirty_bitmap_set_qmp_locked(BdrvDirtyBitmap *bitmap, bool qmp_locked)
+{
+qemu_mutex_lock(bitmap->mutex);
+bitmap->qmp_locked = qmp_locked;
+qemu_mutex_unlock(bitmap->mutex);
+}
+
+bool bdrv_dirty_bitmap_qmp_locked(BdrvDirtyBitmap *bitmap)
+{
+return bitmap->qmp_locked;
+}
+
 /* Called with BQL taken.  */
 bool bdrv_dirty_bitmap_enabled(BdrvDirtyBitmap *bitmap)
 {
@@ -194,6 +208,8 @@ DirtyBitmapStatus bdrv_dirty_bitmap_status(BdrvDirtyBitmap 
*bitmap)
 {
 if (bdrv_dirty_bitmap_frozen(bitmap)) {
 return DIRTY_BITMAP_STATUS_FROZEN;
+} else if (bdrv_dirty_bitmap_qmp_locked(bitmap)) {
+return DIRTY_BITMAP_STATUS_LOCKED;
 } else if (!bdrv_dirty_bitmap_enabled(bitmap)) {
 return DIRTY_BITMAP_STATUS_DISABLED;
 } else {
diff --git a/blockdev.c b/blockdev.c
index a1dc39345a..2cbe9ff8ea 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -2113,6 +2113,9 @@ static void 
block_dirty_bitmap_clear_prepare(BlkActionState *common,
 if (bdrv_dirty_bitmap_frozen(state->bitmap)) {
 error_setg(errp, "Cannot modify a frozen bitmap");
 return;
+} else if (bdrv_dirty_bitmap_qmp_locked(state->bitmap)) {
+error_setg(errp, "Cannot modify a locked bitmap");
+return;
 } else if (!bdrv_dirty_bitmap_enabled(state->bitmap)) {
 error_setg(errp, "Cannot clear a disabled bitmap");
 return;
@@ -2857,6 +2860,11 @@ void qmp_block_dirty_bitmap_remove(const char *node, 
const char *name,
"Bitmap '%s' is currently frozen and cannot be removed",
name);
 return;
+} else if (bdrv_dirty_bitmap_qmp_locked(bitmap)) {
+error_setg(errp,
+   "Bitmap '%s' is currently locked and cannot be removed",
+   name);
+return;
 }
 
 if (bdrv_dirty_bitmap_get_persistance(bitmap)) {
@@ -2891,6 +2899,11 @@ void qmp_
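
A minimal sketch of the intended use on the migration side: take the QMP lock
for as long as a bitmap's data is in flight so it cannot be cleared or removed
underneath the stream. The start/finish function names are illustrative; only
bdrv_dirty_bitmap_set_qmp_locked() comes from the patch above:

/* Sketch: lock a migrated bitmap against QMP modification while in flight. */
static void example_bitmap_mig_start(BdrvDirtyBitmap *bitmap)
{
    bdrv_dirty_bitmap_set_qmp_locked(bitmap, true);
}

static void example_bitmap_mig_finish(BdrvDirtyBitmap *bitmap)
{
    bdrv_dirty_bitmap_set_qmp_locked(bitmap, false);
}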

[Qemu-block] [PATCH v10 12/12] iotests: add dirty bitmap postcopy test

2018-02-07 Thread Vladimir Sementsov-Ogievskiy
Test
- start two vms (vm_a, vm_b)

- in a
- do writes from set A
- do writes from set B
- fix bitmap sha256
- clear bitmap
- do writes from set A
- start migration
- then, in b
- wait vm start (postcopy should start)
- do writes from set B
- check bitmap sha256

The test should verify postcopy migration and the subsequent merge with the
delta (changes made in the target during the postcopy process).

Reduce supported cache modes to only 'none', because with caching enabled the
time from source.STOP to target.RESUME is unpredictable and we can fail with a
timeout while waiting for target.RESUME.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Fam Zheng 
---
 tests/qemu-iotests/199| 105 ++
 tests/qemu-iotests/199.out|   5 ++
 tests/qemu-iotests/group  |   1 +
 tests/qemu-iotests/iotests.py |   7 ++-
 4 files changed, 117 insertions(+), 1 deletion(-)
 create mode 100755 tests/qemu-iotests/199
 create mode 100644 tests/qemu-iotests/199.out

diff --git a/tests/qemu-iotests/199 b/tests/qemu-iotests/199
new file mode 100755
index 00..f872040a81
--- /dev/null
+++ b/tests/qemu-iotests/199
@@ -0,0 +1,105 @@
+#!/usr/bin/env python
+#
+# Tests for dirty bitmaps postcopy migration.
+#
+# Copyright (c) 2016-2017 Virtuozzo International GmbH. All rights reserved.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see .
+#
+
+import os
+import iotests
+import time
+from iotests import qemu_img
+
+disk_a = os.path.join(iotests.test_dir, 'disk_a')
+disk_b = os.path.join(iotests.test_dir, 'disk_b')
+size = '256G'
+fifo = os.path.join(iotests.test_dir, 'mig_fifo')
+
+class TestDirtyBitmapPostcopyMigration(iotests.QMPTestCase):
+
+def tearDown(self):
+self.vm_a.shutdown()
+self.vm_b.shutdown()
+os.remove(disk_a)
+os.remove(disk_b)
+os.remove(fifo)
+
+def setUp(self):
+os.mkfifo(fifo)
+qemu_img('create', '-f', iotests.imgfmt, disk_a, size)
+qemu_img('create', '-f', iotests.imgfmt, disk_b, size)
+self.vm_a = iotests.VM(path_suffix='a').add_drive(disk_a)
+self.vm_b = iotests.VM(path_suffix='b').add_drive(disk_b)
+self.vm_b.add_incoming("exec: cat '" + fifo + "'")
+self.vm_a.launch()
+self.vm_b.launch()
+
+def test_postcopy(self):
+write_size = 0x4000
+granularity = 512
+chunk = 4096
+
+result = self.vm_a.qmp('block-dirty-bitmap-add', node='drive0',
+   name='bitmap', granularity=granularity)
+self.assert_qmp(result, 'return', {});
+
+s = 0
+while s < write_size:
+self.vm_a.hmp_qemu_io('drive0', 'write %d %d' % (s, chunk))
+s += 0x1
+s = 0x8000
+while s < write_size:
+self.vm_a.hmp_qemu_io('drive0', 'write %d %d' % (s, chunk))
+s += 0x1
+
+result = self.vm_a.qmp('x-debug-block-dirty-bitmap-sha256',
+   node='drive0', name='bitmap')
+sha256 = result['return']['sha256']
+
+result = self.vm_a.qmp('block-dirty-bitmap-clear', node='drive0',
+   name='bitmap')
+self.assert_qmp(result, 'return', {});
+s = 0
+while s < write_size:
+self.vm_a.hmp_qemu_io('drive0', 'write %d %d' % (s, chunk))
+s += 0x1
+
+result = self.vm_a.qmp('migrate-set-capabilities',
+   capabilities=[{'capability': 'dirty-bitmaps',
+  'state': True}])
+self.assert_qmp(result, 'return', {})
+
+result = self.vm_a.qmp('migrate', uri='exec:cat>' + fifo)
+self.assertNotEqual(self.vm_a.event_wait("STOP"), None)
+self.assertNotEqual(self.vm_b.event_wait("RESUME"), None)
+
+s = 0x8000
+while s < write_size:
+self.vm_b.hmp_qemu_io('drive0', 'write %d %d' % (s, chunk))
+s += 0x1
+
+result = self.vm_b.qmp('query-block');
+while len(result['return'][0]['dirty-bitmaps']) > 1:
+time.sleep(2)
+result = self.vm_b.qmp('query-block');
+
+result = self.vm_b.qmp('x-debug-block-dirty-bitmap-sha256',
+   node='drive0', name='bitmap')
+
+self.assert_qmp(result, 'return/sha256', sha256);
+
+if __name__ =

[Qemu-block] [PATCH v10 07/12] migration: include migrate_dirty_bitmaps in migrate_postcopy

2018-02-07 Thread Vladimir Sementsov-Ogievskiy
Enable postcopy if dirty bitmap migration is enabled.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Juan Quintela 
Reviewed-by: John Snow 
Reviewed-by: Fam Zheng 
---
 migration/migration.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/migration/migration.c b/migration/migration.c
index d683c3d693..d09f53d6c3 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1486,7 +1486,7 @@ bool migrate_postcopy_ram(void)
 
 bool migrate_postcopy(void)
 {
-return migrate_postcopy_ram();
+return migrate_postcopy_ram() || migrate_dirty_bitmaps();
 }
 
 bool migrate_auto_converge(void)
-- 
2.11.1




[Qemu-block] [PATCH v10 05/12] migration: introduce postcopy-only pending

2018-02-07 Thread Vladimir Sementsov-Ogievskiy
There are savevm states (dirty-bitmap) which can migrate only in the
postcopy stage. The corresponding pending counter is introduced here.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 include/migration/register.h | 17 +++--
 migration/savevm.h   |  5 +++--
 hw/s390x/s390-stattrib.c |  7 ---
 migration/block.c|  7 ---
 migration/migration.c| 16 +---
 migration/ram.c  |  9 +
 migration/savevm.c   | 13 -
 migration/trace-events   |  2 +-
 8 files changed, 49 insertions(+), 27 deletions(-)

diff --git a/include/migration/register.h b/include/migration/register.h
index f4f7bdc177..9436a87678 100644
--- a/include/migration/register.h
+++ b/include/migration/register.h
@@ -37,8 +37,21 @@ typedef struct SaveVMHandlers {
 int (*save_setup)(QEMUFile *f, void *opaque);
 void (*save_live_pending)(QEMUFile *f, void *opaque,
   uint64_t threshold_size,
-  uint64_t *non_postcopiable_pending,
-  uint64_t *postcopiable_pending);
+  uint64_t *res_precopy_only,
+  uint64_t *res_compatible,
+  uint64_t *res_postcopy_only);
+/* Note for save_live_pending:
+ * - res_precopy_only is for data which must be migrated in precopy phase
+ * or in stopped state, in other words - before target vm start
+ * - res_compatible is for data which may be migrated in any phase
+ * - res_postcopy_only is for data which must be migrated in postcopy phase
+ * or in stopped state, in other words - after source vm stop
+ *
+ * Sum of res_precopy_only, res_compatible and res_postcopy_only is the
+ * whole amount of pending data.
+ */
+
+
 LoadStateHandler *load_state;
 int (*load_setup)(QEMUFile *f, void *opaque);
 int (*load_cleanup)(void *opaque);
diff --git a/migration/savevm.h b/migration/savevm.h
index 295c4a1f2c..cf4f0d37ca 100644
--- a/migration/savevm.h
+++ b/migration/savevm.h
@@ -38,8 +38,9 @@ void qemu_savevm_state_complete_postcopy(QEMUFile *f);
 int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only,
bool inactivate_disks);
 void qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size,
-   uint64_t *res_non_postcopiable,
-   uint64_t *res_postcopiable);
+   uint64_t *res_precopy_only,
+   uint64_t *res_compatible,
+   uint64_t *res_postcopy_only);
 void qemu_savevm_send_ping(QEMUFile *f, uint32_t value);
 void qemu_savevm_send_open_return_path(QEMUFile *f);
 int qemu_savevm_send_packaged(QEMUFile *f, const uint8_t *buf, size_t len);
diff --git a/hw/s390x/s390-stattrib.c b/hw/s390x/s390-stattrib.c
index 2902f54f11..dd3fbfd1eb 100644
--- a/hw/s390x/s390-stattrib.c
+++ b/hw/s390x/s390-stattrib.c
@@ -183,15 +183,16 @@ static int cmma_save_setup(QEMUFile *f, void *opaque)
 }
 
 static void cmma_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
- uint64_t *non_postcopiable_pending,
- uint64_t *postcopiable_pending)
+  uint64_t *res_precopy_only,
+  uint64_t *res_compatible,
+  uint64_t *res_postcopy_only)
 {
 S390StAttribState *sas = S390_STATTRIB(opaque);
 S390StAttribClass *sac = S390_STATTRIB_GET_CLASS(sas);
 long long res = sac->get_dirtycount(sas);
 
 if (res >= 0) {
-*non_postcopiable_pending += res;
+*res_precopy_only += res;
 }
 }
 
diff --git a/migration/block.c b/migration/block.c
index 1f03946797..5652ca3337 100644
--- a/migration/block.c
+++ b/migration/block.c
@@ -866,8 +866,9 @@ static int block_save_complete(QEMUFile *f, void *opaque)
 }
 
 static void block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
-   uint64_t *non_postcopiable_pending,
-   uint64_t *postcopiable_pending)
+   uint64_t *res_precopy_only,
+   uint64_t *res_compatible,
+   uint64_t *res_postcopy_only)
 {
 /* Estimate pending number of bytes to send */
 uint64_t pending;
@@ -888,7 +889,7 @@ static void block_save_pending(QEMUFile *f, void *opaque, 
uint64_t max_size,
 
 DPRINTF("Enter save live pending  %" PRIu64 "\n", pending);
 /* We don't do postcopy */
-*non_postcopiable_pending += pending;
+*res_precopy_only += pending;
 }
 
 static int block_load(QEMUFile *f, void *opaque, int version_id)
diff --git a/migration/migration.c b/migration/migration.c
index c99a4e62d7..3beedd676e 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -2215,21 +2215,23 @@ typedef 
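
For contrast with the precopy-only users updated above (cmma and block), a
hedged sketch of how a postcopy-only user such as the dirty-bitmap code is
expected to fill the new argument; example_pending_bytes() is an illustrative
stub, not something defined in this series:

/* Illustrative helper: how much bitmap data is still queued (stubbed). */
static uint64_t example_pending_bytes(void *opaque)
{
    return 0;
}

/* Sketch: report all pending data as postcopy-only. */
static void example_save_pending(QEMUFile *f, void *opaque,
                                 uint64_t threshold_size,
                                 uint64_t *res_precopy_only,
                                 uint64_t *res_compatible,
                                 uint64_t *res_postcopy_only)
{
    *res_postcopy_only += example_pending_bytes(opaque);
}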

[Qemu-block] [PATCH v10 09/12] migration: add is_active_iterate handler

2018-02-07 Thread Vladimir Sementsov-Ogievskiy
Postcopy-only savevm states (dirty-bitmap) don't need live iteration, so,
to disable it for them and stop transmitting empty sections, there is a new
savevm handler.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Juan Quintela 
Reviewed-by: John Snow 
Reviewed-by: Fam Zheng 
---
 include/migration/register.h | 9 +
 migration/savevm.c   | 5 +
 2 files changed, 14 insertions(+)

diff --git a/include/migration/register.h b/include/migration/register.h
index 9436a87678..f6f12f9b1a 100644
--- a/include/migration/register.h
+++ b/include/migration/register.h
@@ -26,6 +26,15 @@ typedef struct SaveVMHandlers {
 bool (*is_active)(void *opaque);
 bool (*has_postcopy)(void *opaque);
 
+/* is_active_iterate
+ * If it is not NULL then qemu_savevm_state_iterate will skip iteration if
+ * it returns false. For example, it is needed for postcopy-only states,
+ * which need to be handled by qemu_savevm_state_setup and
+ * qemu_savevm_state_pending, but do not need iterations until the
+ * postcopy stage.
+ */
+bool (*is_active_iterate)(void *opaque);
+
 /* This runs outside the iothread lock in the migration case, and
  * within the lock in the savevm case.  The callback had better only
  * use data that is local to the migration thread or protected
diff --git a/migration/savevm.c b/migration/savevm.c
index c3c60a15e3..e5d557458e 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -1026,6 +1026,11 @@ int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy)
 continue;
 }
 }
+if (se->ops && se->ops->is_active_iterate) {
+if (!se->ops->is_active_iterate(se->opaque)) {
+continue;
+}
+}
 /*
  * In the postcopy phase, any device that doesn't know how to
  * do postcopy should have saved it's state in the _complete
-- 
2.11.1
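
A hedged sketch of what such a handler might look like for a postcopy-only
state, assuming a migration_in_postcopy()-style query is available; this is
illustrative and not necessarily the series' actual implementation:

/* Sketch: skip live iteration until the postcopy phase has started. */
static bool example_is_active_iterate(void *opaque)
{
    return migration_in_postcopy();
}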




[Qemu-block] [PATCH v10 01/12] block/dirty-bitmap: add bdrv_dirty_bitmap_enable_successor()

2018-02-07 Thread Vladimir Sementsov-Ogievskiy
Enabling a bitmap's successor is necessary to enable the successors of
bitmaps being migrated before the target VM starts.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: John Snow 
Reviewed-by: Fam Zheng 
---
 include/block/dirty-bitmap.h | 1 +
 block/dirty-bitmap.c | 8 
 2 files changed, 9 insertions(+)

diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h
index 89dc50946b..93e128f81b 100644
--- a/include/block/dirty-bitmap.h
+++ b/include/block/dirty-bitmap.h
@@ -20,6 +20,7 @@ BdrvDirtyBitmap *bdrv_dirty_bitmap_abdicate(BlockDriverState 
*bs,
 BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap(BlockDriverState *bs,
BdrvDirtyBitmap *bitmap,
Error **errp);
+void bdrv_dirty_bitmap_enable_successor(BdrvDirtyBitmap *bitmap);
 BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs,
 const char *name);
 void bdrv_dirty_bitmap_make_anon(BdrvDirtyBitmap *bitmap);
diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c
index 909f0517f8..0d0e807216 100644
--- a/block/dirty-bitmap.c
+++ b/block/dirty-bitmap.c
@@ -234,6 +234,14 @@ int bdrv_dirty_bitmap_create_successor(BlockDriverState 
*bs,
 return 0;
 }
 
+/* Called with BQL taken. */
+void bdrv_dirty_bitmap_enable_successor(BdrvDirtyBitmap *bitmap)
+{
+qemu_mutex_lock(bitmap->mutex);
+bdrv_enable_dirty_bitmap(bitmap->successor);
+qemu_mutex_unlock(bitmap->mutex);
+}
+
 /**
  * For a bitmap with a successor, yield our name to the successor,
  * delete the old bitmap, and return a handle to the new bitmap.
-- 
2.11.1




[Qemu-block] [PATCH v10 11/12] iotests: add dirty bitmap migration test

2018-02-07 Thread Vladimir Sementsov-Ogievskiy
The test starts two VMs (vm_a, vm_b), creates a dirty bitmap in
the first one, does several writes to the corresponding device and
then migrates vm_a to vm_b.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 tests/qemu-iotests/169 | 141 +
 tests/qemu-iotests/169.out |   5 ++
 tests/qemu-iotests/group   |   1 +
 3 files changed, 147 insertions(+)
 create mode 100755 tests/qemu-iotests/169
 create mode 100644 tests/qemu-iotests/169.out

diff --git a/tests/qemu-iotests/169 b/tests/qemu-iotests/169
new file mode 100755
index 00..8801f73b10
--- /dev/null
+++ b/tests/qemu-iotests/169
@@ -0,0 +1,141 @@
+#!/usr/bin/env python
+#
+# Tests for dirty bitmaps migration.
+#
+# Copyright (c) 2016-2017 Virtuozzo International GmbH. All rights reserved.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see .
+#
+
+import os
+import iotests
+import time
+import itertools
+import operator
+import new
+from iotests import qemu_img
+
+
+disk_a = os.path.join(iotests.test_dir, 'disk_a')
+disk_b = os.path.join(iotests.test_dir, 'disk_b')
+size = '1M'
+mig_file = os.path.join(iotests.test_dir, 'mig_file')
+
+
+class TestDirtyBitmapMigration(iotests.QMPTestCase):
+def tearDown(self):
+self.vm_a.shutdown()
+self.vm_b.shutdown()
+os.remove(disk_a)
+os.remove(disk_b)
+os.remove(mig_file)
+
+def setUp(self):
+qemu_img('create', '-f', iotests.imgfmt, disk_a, size)
+qemu_img('create', '-f', iotests.imgfmt, disk_b, size)
+
+self.vm_a = iotests.VM(path_suffix='a').add_drive(disk_a)
+self.vm_a.launch()
+
+self.vm_b = iotests.VM(path_suffix='b')
+self.vm_b.add_incoming("exec: cat '" + mig_file + "'")
+
+def add_bitmap(self, vm, granularity, persistent):
+params = {'node': 'drive0',
+  'name': 'bitmap0',
+  'granularity': granularity}
+if persistent:
+params['persistent'] = True
+params['autoload'] = True
+
+result = vm.qmp('block-dirty-bitmap-add', **params)
+self.assert_qmp(result, 'return', {});
+
+def get_bitmap_hash(self, vm):
+result = vm.qmp('x-debug-block-dirty-bitmap-sha256',
+node='drive0', name='bitmap0')
+return result['return']['sha256']
+
+def check_bitmap(self, vm, sha256):
+result = vm.qmp('x-debug-block-dirty-bitmap-sha256',
+node='drive0', name='bitmap0')
+if sha256:
+self.assert_qmp(result, 'return/sha256', sha256);
+else:
+self.assert_qmp(result, 'error/desc',
+"Dirty bitmap 'bitmap0' not found");
+
+def do_test_migration(self, persistent, migrate_bitmaps, online,
+  shared_storage):
+granularity = 512
+
+# regions = ((start, count), ...)
+regions = ((0, 0x1),
+   (0xf, 0x1),
+   (0xa0201, 0x1000))
+
+should_migrate = migrate_bitmaps or persistent and shared_storage
+
+self.vm_b.add_drive(disk_a if shared_storage else disk_b)
+
+if online:
+os.mkfifo(mig_file)
+self.vm_b.launch()
+
+self.add_bitmap(self.vm_a, granularity, persistent)
+for r in regions:
+self.vm_a.hmp_qemu_io('drive0', 'write %d %d' % r)
+sha256 = self.get_bitmap_hash(self.vm_a)
+
+if migrate_bitmaps:
+result = self.vm_a.qmp('migrate-set-capabilities',
+   capabilities=[{'capability': 
'dirty-bitmaps',
+  'state': True}])
+self.assert_qmp(result, 'return', {})
+
+result = self.vm_a.qmp('migrate', uri='exec:cat>' + mig_file)
+self.vm_a.event_wait("STOP")
+
+if not online:
+self.vm_a.shutdown()
+self.vm_b.launch()
+
+self.vm_b.event_wait("RESUME", timeout=10.0)
+
+self.check_bitmap(self.vm_b, sha256 if should_migrate else False)
+
+if should_migrate:
+self.vm_b.shutdown()
+self.vm_b.launch()
+self.check_bitmap(self.vm_b, sha256 if persistent else False)
+
+
+def inject_test_case(klass, name, method, *args, **kwargs):
+mc = operator.methodcaller(method, *args, **kwargs)
+setattr(klass, 'test_' + name

[Qemu-block] [PATCH v10 08/12] migration/qemu-file: add qemu_put_counted_string()

2018-02-07 Thread Vladimir Sementsov-Ogievskiy
Add a function opposite to qemu_get_counted_string.
qemu_put_counted_string puts a one-byte length of the string (the string
must not be longer than 255 characters), and then puts the string itself,
without the trailing zero byte.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: John Snow 
Reviewed-by: Juan Quintela 
Reviewed-by: Fam Zheng 
---
 migration/qemu-file.h |  2 ++
 migration/qemu-file.c | 13 +
 2 files changed, 15 insertions(+)

diff --git a/migration/qemu-file.h b/migration/qemu-file.h
index aae4e5ed36..f4f356ab12 100644
--- a/migration/qemu-file.h
+++ b/migration/qemu-file.h
@@ -174,4 +174,6 @@ size_t ram_control_save_page(QEMUFile *f, ram_addr_t 
block_offset,
  ram_addr_t offset, size_t size,
  uint64_t *bytes_sent);
 
+void qemu_put_counted_string(QEMUFile *f, const char *name);
+
 #endif
diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index 2ab2bf362d..e85f501f86 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -734,6 +734,19 @@ size_t qemu_get_counted_string(QEMUFile *f, char buf[256])
 }
 
 /*
+ * Put a string with one preceding byte containing its length. The length of
+ * the string should be less than 256.
+ */
+void qemu_put_counted_string(QEMUFile *f, const char *str)
+{
+size_t len = strlen(str);
+
+assert(len < 256);
+qemu_put_byte(f, len);
+qemu_put_buffer(f, (const uint8_t *)str, len);
+}
+
+/*
  * Set the blocking state of the QEMUFile.
  * Note: On some transports the OS only keeps a single blocking state for
  *   both directions, and thus changing the blocking on the main
-- 
2.11.1
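
A short hedged sketch of how the new function pairs with the existing
qemu_get_counted_string() on the receiving side; the wrapper names are
illustrative:

/* Sketch: write a name on the source, read it back on the destination. */
static void example_put_name(QEMUFile *f, const char *name)
{
    qemu_put_counted_string(f, name);   /* 1 length byte + bytes, no NUL */
}

static int example_get_name(QEMUFile *f, char buf[256])
{
    /* qemu_get_counted_string() returns the length; treat 0 as failure. */
    return qemu_get_counted_string(f, buf) ? 0 : -1;
}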




[Qemu-block] [PATCH v10 03/12] block/dirty-bitmap: add _locked version of bdrv_reclaim_dirty_bitmap

2018-02-07 Thread Vladimir Sementsov-Ogievskiy
Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 include/block/dirty-bitmap.h |  3 +++
 block/dirty-bitmap.c | 28 ++--
 2 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h
index 93e128f81b..b2e1fd913a 100644
--- a/include/block/dirty-bitmap.h
+++ b/include/block/dirty-bitmap.h
@@ -92,5 +92,8 @@ BdrvDirtyBitmap *bdrv_dirty_bitmap_next(BlockDriverState *bs,
 BdrvDirtyBitmap *bitmap);
 char *bdrv_dirty_bitmap_sha256(const BdrvDirtyBitmap *bitmap, Error **errp);
 int64_t bdrv_dirty_bitmap_next_zero(BdrvDirtyBitmap *bitmap, uint64_t start);
+BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap_locked(BlockDriverState *bs,
+  BdrvDirtyBitmap *bitmap,
+  Error **errp);
 
 #endif
diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c
index 75435f6c2f..ce00ff3474 100644
--- a/block/dirty-bitmap.c
+++ b/block/dirty-bitmap.c
@@ -320,17 +320,13 @@ BdrvDirtyBitmap 
*bdrv_dirty_bitmap_abdicate(BlockDriverState *bs,
  * In cases of failure where we can no longer safely delete the parent,
  * we may wish to re-join the parent and child/successor.
  * The merged parent will be un-frozen, but not explicitly re-enabled.
- * Called with BQL taken.
+ * Called within bdrv_dirty_bitmap_lock..unlock and with BQL taken.
  */
-BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap(BlockDriverState *bs,
-   BdrvDirtyBitmap *parent,
-   Error **errp)
+BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap_locked(BlockDriverState *bs,
+  BdrvDirtyBitmap *parent,
+  Error **errp)
 {
-BdrvDirtyBitmap *successor;
-
-qemu_mutex_lock(parent->mutex);
-
-successor = parent->successor;
+BdrvDirtyBitmap *successor = parent->successor;
 
 if (!successor) {
 error_setg(errp, "Cannot reclaim a successor when none is present");
@@ -344,9 +340,21 @@ BdrvDirtyBitmap 
*bdrv_reclaim_dirty_bitmap(BlockDriverState *bs,
 bdrv_release_dirty_bitmap_locked(bs, successor);
 parent->successor = NULL;
 
+return parent;
+}
+
+/* Called with BQL taken. */
+BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap(BlockDriverState *bs,
+   BdrvDirtyBitmap *parent,
+   Error **errp)
+{
+BdrvDirtyBitmap *ret;
+
+qemu_mutex_lock(parent->mutex);
+ret = bdrv_reclaim_dirty_bitmap_locked(bs, parent, errp);
 qemu_mutex_unlock(parent->mutex);
 
-return parent;
+return ret;
 }
 
 /**
-- 
2.11.1
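
A brief hedged usage sketch of the new _locked variant for a caller that wants
to combine the reclaim with other work under a single bitmap lock; the wrapper
and the elided operations are illustrative:

/* Sketch: reclaim under an externally held bitmap lock. */
static void example_reclaim_under_lock(BlockDriverState *bs,
                                       BdrvDirtyBitmap *parent, Error **errp)
{
    bdrv_dirty_bitmap_lock(parent);
    /* ... other operations protected by the same lock ... */
    bdrv_reclaim_dirty_bitmap_locked(bs, parent, errp);
    bdrv_dirty_bitmap_unlock(parent);
}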




Re: [Qemu-block] [PATCH] ratelimit: don't align wait time with slices

2018-02-07 Thread Stefan Hajnoczi
On Wed, Feb 07, 2018 at 08:17:58AM +0100, Wolfgang Bumiller wrote:
> It is possible for rate limited writes to keep overshooting a slice's
> quota by a tiny amount causing the slice-aligned waiting period to
> effectively halve the rate.
> 
> Signed-off-by: Wolfgang Bumiller 
> ---
> Copied the Ccs from the discussion thread, hope that's fine, as I also
> just noticed that for my reply containing this snippet I had hit reply
> on the mail that did not contain those Ccs yet, sorry about that.
> 
>  include/qemu/ratelimit.h | 11 +--
>  1 file changed, 5 insertions(+), 6 deletions(-)

Thanks, applied to my block tree:
https://github.com/stefanha/qemu/commits/block

Stefan




Re: [Qemu-block] [PATCH v2] vl: pause vcpus before stopping iothreads

2018-02-07 Thread Stefan Hajnoczi
On Thu, Feb 01, 2018 at 11:07:08AM +, Stefan Hajnoczi wrote:
> Commit dce8921b2baaf95974af8176406881872067adfa ("iothread: Stop threads
> before main() quits") introduced iothread_stop_all() to avoid the
> following virtio-scsi assertion failure:
> 
>   assert(blk_get_aio_context(d->conf.blk) == s->ctx);
> 
> Back then the assertion failed because when bdrv_close_all() made
> d->conf.blk NULL, blk_get_aio_context() returned the global AioContext
> instead of s->ctx.
> 
> The same assertion can still fail today when vcpus submit new I/O
> requests after iothread_stop_all() has moved the BDS to the global
> AioContext.
> 
> This patch hardens the iothread_stop_all() approach by pausing vcpus
> before calling iothread_stop_all().
> 
> Note that the assertion failure is a race condition.  It is not possible
> to reproduce it reliably.
> 
> Signed-off-by: Stefan Hajnoczi 
> ---
> v2:
>  * Add comment explaining rationale for this ordering and why it's safe.
>[Kevin]
> 
>  vl.c | 12 ++--
>  1 file changed, 10 insertions(+), 2 deletions(-)

Thanks, applied to my block tree:
https://github.com/stefanha/qemu/commits/block

Stefan




Re: [Qemu-block] [PATCH v2 7/8] file-posix: account discard operations

2018-02-07 Thread Alberto Garcia
On Fri 19 Jan 2018 01:50:06 PM CET, Anton Nefedov wrote:
> This will help to identify how many of the user-issued discard operations
> (accounted on a device level) have actually suceeded down on the host file
> (even though the numbers will not be exactly the same if non-raw format
> driver is used (e.g. qcow2 sending metadata discards)).
>
> Signed-off-by: Anton Nefedov 
> Reviewed-by: Vladimir Sementsov-Ogievskiy 
> ---
>  block/file-posix.c | 21 +++--
>  1 file changed, 19 insertions(+), 2 deletions(-)
>
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 36ee89e..544ae58 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -158,6 +158,11 @@ typedef struct BDRVRawState {
>  bool page_cache_inconsistent:1;
>  bool has_fallocate;
>  bool needs_alignment;
> +struct {
> +int64_t discard_nb_ok;
> +int64_t discard_nb_failed;
> +int64_t discard_bytes_ok;
> +} stats;

Shouldn't this new structure be defined in a header file so other
drivers can use it? Or did you define it here because you don't see that
happening soon?

The rest of the patch looks good.

Berto



Re: [Qemu-block] [PATCH v2 6/8] scsi: account unmap operations

2018-02-07 Thread Alberto Garcia
On Fri 19 Jan 2018 01:50:05 PM CET, Anton Nefedov wrote:
> Signed-off-by: Anton Nefedov 
> Reviewed-by: Vladimir Sementsov-Ogievskiy 

Reviewed-by: Alberto Garcia 

Berto



Re: [Qemu-block] [PATCH v2 5/8] scsi: move unmap error checking to the complete callback

2018-02-07 Thread Alberto Garcia
On Fri 19 Jan 2018 01:50:04 PM CET, Anton Nefedov wrote:
> This will help to account the operation in the following commit.
>
> The difference is that we don't call scsi_disk_req_check_error() before
> the 1st discard iteration anymore. That function also checks if
> the request is cancelled, however it shouldn't get canceled until it
> yields in blk_aio() functions anyway.
> Same approach is already used for emulate_write_same.
>
> Signed-off-by: Anton Nefedov 
> Reviewed-by: Vladimir Sementsov-Ogievskiy 

Reviewed-by: Alberto Garcia 

> @@ -1665,7 +1662,12 @@ static void scsi_unmap_complete(void *opaque, int ret)
>  r->req.aiocb = NULL;
>  
>  aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
> -scsi_unmap_complete_noio(data, ret);
> +if (scsi_disk_req_check_error(r, ret, false)) {
> +scsi_req_unref(&r->req);
> +g_free(data);

It would be nice not to have the cleanup code duplicated here, but I
don't see any obvious alternative.

Berto



Re: [Qemu-block] [PATCH v2 3/8] ide: account UNMAP (TRIM) operations

2018-02-07 Thread Alberto Garcia
On Fri 19 Jan 2018 01:50:02 PM CET, Anton Nefedov wrote:
> Signed-off-by: Anton Nefedov 

Reviewed-by: Alberto Garcia 

Berto



Re: [Qemu-block] [PATCH for-2.9-rc5 v4 2/2] block: Drain BH in bdrv_drained_begin

2018-02-07 Thread Fam Zheng
On Wed, Feb 7, 2018 at 9:10 PM, Kevin Wolf  wrote:
> Am 07.02.2018 um 13:39 hat Fam Zheng geschrieben:
>> On Wed, Feb 7, 2018 at 6:51 PM, Kevin Wolf  wrote:
>> > Am 07.02.2018 um 02:48 hat Fam Zheng geschrieben:
>> >> On Tue, Feb 6, 2018 at 11:32 PM, Kevin Wolf  wrote:
>> >> > I got a bug assigned where we have a large (200 GB) fragmented qcow2
>> >> > image, and qemu-img convert takes two hours before it even starts to
>> >> > copy data. What it does in this time is iterating through the whole
>> >> > image with bdrv_block_status_above() in order to estimate the work to be
>> >> > done (and of course it will repeat the same calls while actually copying
>> >> > data).
>> >>
>> >> The convert_iteration_sectors loop looks wasteful. Why cannot we
>> >> report progress simply based on (offset/size) * 100% so we don't need
>> >> to do this estimation?
>> >
>> > The assumption is that it's quick and it makes the progress much more
>> > meaningful. You know those progress bars that slowly crawl towards 50%
>> > and then suddenly complete within a second. Or the first 20% are quick,
>> > but then things get really slow. They are not a great example.
>> >
>> > There must have been something wrong with that image file that was
>> > reported, because they reported that it was the only image causing
>> > trouble and if they copied it away, it became quick, too.
>> >
>> > Even for a maximally fragmented 100 GB qcow2 image it only takes about a
>> > second on my laptop. So while I don't feel as certain about the loop as
>> > before, in practice it normally doesn't seem to hurt.
>>
>> No doubt about normal cases. I was unsure about corner cases like
>> slow-ish NFS etc.
>
> Yeah, NFS seems to be a bit slow. Not two-hours-slow, but when I tried
> it with a localhost NFS server, the same operation that took two seconds
> directly from the local file system took about 40s over NFS. They seem
> to go over the network for each lseek() instead of caching the file map.
> Maybe something to fix in the NFS kernel driver.
>
>> A little bit of intelligence would be limiting the time for the loop
>> to a few seconds, for example, "IMG_CVT_EST_SCALE *
>> bdrv_getlength(bs)", or a less linear map.
>
> I don't understand. What would IMG_CVT_EST_SCALE be? Isn't the problem
> that this isn't a constant factor but can be anywhere between 0% and
> 100% depending on the specific image?

This is going to be quite arbitrary, just to make sure we don't waste
a very long time on maximally fragmented images, or on slow lseek()
scenarios. Something like this:

#define IMG_CVT_EST_SCALE ((1 << 40) / 30)

time_t start = time(NULL);
while (sector_num < s->total_sectors) {
if (time(NULL) - start > MAX(30, bdrv_getlength() /
IMG_CVT_EST_SCALE)) {
/* Too much time spent on counting allocation, just fall
back to bdrv_get_allocated_file_size */
s->allocated_sectors = bdrv_get_allocated_file_size(bs);
break;
}
n = convert_iteration_sectors(s, sector_num);
...
}

So we loop for at most 30 seconds (for >1TB images).

>
>> >> > One of the things I saw is that between each pair of lseek() calls, we
>> >> > also have unnecessary poll() calls, and these were introduced by this
>> >> > patch. If I modify bdrv_common_block_status_above() so that it doesn't
>> >> > poll if data.done == true already, the time needed to iterate through my
>> >> > test image is cut in half.
>> >> >
>> >> > Now, of course, I'm still only seeing a few seconds for a 100 GB image,
>> >> > so there must be more that is wrong for the reporter, but it suggests to
>> >> > me that BDRV_POLL_WHILE() shouldn't be polling unconditionally when only
>> >> > one of its users actually needs this.
>> >>
>> >> Sounds fine to me. Maybe we could add a boolean parameter to
>> >> BDRV_POLL_WHILE?
>> >
>> > Why not call aio_poll() explicitly in the BDRV_POLL_WHILE() condition in
>> > the one place that needs it?
>>
>> Yes, that is better. Do you have a patch? Or do you want me to work on one?
>
> I don't have a patch yet. If you want to write one, be my guest.
>
> Otherwise I'd just take a to-do note for when I get back to my
> bdrv_drain work. There is still one or two series left to do anyway, and
> I think it would fit in there.

Thought that so I asked, I'll leave it to you then. :)

Fam



Re: [Qemu-block] [Qemu-devel] [PATCH v2 0/2] block latency histogram

2018-02-07 Thread Vladimir Sementsov-Ogievskiy

looks strange and unrelated.

07.02.2018 16:20, no-re...@patchew.org wrote:

Hi,

This series failed build test on ppc host. Please find the details below.

Type: series
Subject: [Qemu-devel] [PATCH v2 0/2] block latency histogram
Message-id: 20180207125037.13510-1-vsement...@virtuozzo.com

=== TEST SCRIPT BEGIN ===
#!/bin/bash
# Testing script will be invoked under the git checkout with
# HEAD pointing to a commit that has the patches applied on top of "base"
# branch
set -e
echo "=== ENV ==="
env
echo "=== PACKAGES ==="
rpm -qa
echo "=== TEST BEGIN ==="
INSTALL=$PWD/install
BUILD=$PWD/build
mkdir -p $BUILD $INSTALL
SRC=$PWD
cd $BUILD
$SRC/configure --prefix=$INSTALL
make -j100
# XXX: we need reliable clean up
# make check -j100 V=1
make install
=== TEST SCRIPT END ===

Updating 3c8cf5a9c21ff8782164d1def7f44bd888713384
error: RPC failed; result=18, HTTP code = 200
fatal: The remote end hung up unexpectedly
error: Could not fetch 3c8cf5a9c21ff8782164d1def7f44bd888713384
Traceback (most recent call last):
   File "/home/patchew/patchew/patchew-cli", line 504, in test_one
 git_clone_repo(clone, r["repo"], r["head"], logf)
   File "/home/patchew/patchew/patchew-cli", line 48, in git_clone_repo
 stdout=logf, stderr=logf)
   File "/usr/lib64/python3.4/subprocess.py", line 558, in check_call
 raise CalledProcessError(retcode, cmd)
subprocess.CalledProcessError: Command '['git', 'remote', 'add', '-f', 
'--mirror=fetch', '3c8cf5a9c21ff8782164d1def7f44bd888713384', 
'https://github.com/patchew-project/qemu']' returned non-zero exit status 1



---
Email generated automatically by Patchew [http://patchew.org/].
Please send your feedback to patchew-de...@freelists.org



--
Best regards,
Vladimir




Re: [Qemu-block] [PATCH for-2.9-rc5 v4 2/2] block: Drain BH in bdrv_drained_begin

2018-02-07 Thread Kevin Wolf
Am 07.02.2018 um 13:39 hat Fam Zheng geschrieben:
> On Wed, Feb 7, 2018 at 6:51 PM, Kevin Wolf  wrote:
> > Am 07.02.2018 um 02:48 hat Fam Zheng geschrieben:
> >> On Tue, Feb 6, 2018 at 11:32 PM, Kevin Wolf  wrote:
> >> > I got a bug assigned where we have a large (200 GB) fragmented qcow2
> >> > image, and qemu-img convert takes two hours before it even starts to
> >> > copy data. What it does in this time is iterating through the whole
> >> > image with bdrv_block_status_above() in order to estimate the work to be
> >> > done (and of course it will repeat the same calls while actually copying
> >> > data).
> >>
> >> The convert_iteration_sectors loop looks wasteful. Why cannot we
> >> report progress simply based on (offset/size) * 100% so we don't need
> >> to do this estimation?
> >
> > The assumption is that it's quick and it makes the progress much more
> > meaningful. You know those progress bars that slowly crawl towards 50%
> > and then suddenly complete within a second. Or the first 20% are quick,
> > but then things get really slow. They are not a great example.
> >
> > There must have been something wrong with that image file that was
> > reported, because they reported that it was the only image causing
> > trouble and if they copied it away, it became quick, too.
> >
> > Even for a maximally fragmented 100 GB qcow2 image it only takes about a
> > second on my laptop. So while I don't feel as certain about the loop as
> > before, in practice it normally doesn't seem to hurt.
> 
> No doubt about normal cases. I was unsure about corner cases like
> slow-ish NFS etc.

Yeah, NFS seems to be a bit slow. Not two-hours-slow, but when I tried
it with a localhost NFS server, the same operation that took two seconds
directly from the local file system took about 40s over NFS. They seem
to go over the network for each lseek() instead of caching the file map.
Maybe something to fix in the NFS kernel driver.

> A little bit of intelligence would be limiting the time for the loop
> to a few seconds, for example, "IMG_CVT_EST_SCALE *
> bdrv_getlength(bs)", or a less linear map.

I don't understand. What would IMG_CVT_EST_SCALE be? Isn't the problem
that this isn't a constant factor but can be anywhere between 0% and
100% depending on the specific image?

> >> > One of the things I saw is that between each pair of lseek() calls, we
> >> > also have unnecessary poll() calls, and these were introduced by this
> >> > patch. If I modify bdrv_common_block_status_above() so that it doesn't
> >> > poll if data.done == true already, the time needed to iterate through my
> >> > test image is cut in half.
> >> >
> >> > Now, of course, I'm still only seeing a few seconds for a 100 GB image,
> >> > so there must be more that is wrong for the reporter, but it suggests to
> >> > me that BDRV_POLL_WHILE() shouldn't be polling unconditionally when only
> >> > one of its users actually needs this.
> >>
> >> Sounds fine to me. Maybe we could add a boolean parameter to
> >> BDRV_POLL_WHILE?
> >
> > Why not call aio_poll() explicitly in the BDRV_POLL_WHILE() condition in
> > the one place that needs it?
> 
> Yes, that is better. Do you have a patch? Or do you want me to work on one?

I don't have a patch yet. If you want to write one, be my guest.

Otherwise I'd just take a to-do note for when I get back to my
bdrv_drain work. There is still one or two series left to do anyway, and
I think it would fit in there.

Kevin



Re: [Qemu-block] [Qemu-devel] [PATCH v2 0/2] block latency histogram

2018-02-07 Thread no-reply
Hi,

This series failed build test on ppc host. Please find the details below.

Type: series
Subject: [Qemu-devel] [PATCH v2 0/2] block latency histogram
Message-id: 20180207125037.13510-1-vsement...@virtuozzo.com

=== TEST SCRIPT BEGIN ===
#!/bin/bash
# Testing script will be invoked under the git checkout with
# HEAD pointing to a commit that has the patches applied on top of "base"
# branch
set -e
echo "=== ENV ==="
env
echo "=== PACKAGES ==="
rpm -qa
echo "=== TEST BEGIN ==="
INSTALL=$PWD/install
BUILD=$PWD/build
mkdir -p $BUILD $INSTALL
SRC=$PWD
cd $BUILD
$SRC/configure --prefix=$INSTALL
make -j100
# XXX: we need reliable clean up
# make check -j100 V=1
make install
=== TEST SCRIPT END ===

Updating 3c8cf5a9c21ff8782164d1def7f44bd888713384
error: RPC failed; result=18, HTTP code = 200
fatal: The remote end hung up unexpectedly
error: Could not fetch 3c8cf5a9c21ff8782164d1def7f44bd888713384
Traceback (most recent call last):
  File "/home/patchew/patchew/patchew-cli", line 504, in test_one
git_clone_repo(clone, r["repo"], r["head"], logf)
  File "/home/patchew/patchew/patchew-cli", line 48, in git_clone_repo
stdout=logf, stderr=logf)
  File "/usr/lib64/python3.4/subprocess.py", line 558, in check_call
raise CalledProcessError(retcode, cmd)
subprocess.CalledProcessError: Command '['git', 'remote', 'add', '-f', 
'--mirror=fetch', '3c8cf5a9c21ff8782164d1def7f44bd888713384', 
'https://github.com/patchew-project/qemu']' returned non-zero exit status 1



---
Email generated automatically by Patchew [http://patchew.org/].
Please send your feedback to patchew-de...@freelists.org

[Qemu-block] [PATCH v2 1/2] block/accounting: introduce latency histogram

2018-02-07 Thread Vladimir Sementsov-Ogievskiy
Introduce latency histogram statistics for block devices.
For each accounted operation type, the latency region [0, +inf) is
divided into subregions by several points. Then, the number of hits
in each subregion is counted.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 include/block/accounting.h |  9 +
 block/accounting.c | 97 ++
 2 files changed, 106 insertions(+)

diff --git a/include/block/accounting.h b/include/block/accounting.h
index b833d26d6c..9679020f64 100644
--- a/include/block/accounting.h
+++ b/include/block/accounting.h
@@ -45,6 +45,12 @@ struct BlockAcctTimedStats {
 QSLIST_ENTRY(BlockAcctTimedStats) entries;
 };
 
+typedef struct BlockLatencyHistogram {
+int size;
+uint64_t *points; /* @size-1 points here (all points, except 0 and +inf) */
+uint64_t *histogram[BLOCK_MAX_IOTYPE]; /* @size elements for each type */
+} BlockLatencyHistogram;
+
 struct BlockAcctStats {
 QemuMutex lock;
 uint64_t nr_bytes[BLOCK_MAX_IOTYPE];
@@ -57,6 +63,7 @@ struct BlockAcctStats {
 QSLIST_HEAD(, BlockAcctTimedStats) intervals;
 bool account_invalid;
 bool account_failed;
+BlockLatencyHistogram latency_histogram;
 };
 
 typedef struct BlockAcctCookie {
@@ -82,5 +89,7 @@ void block_acct_merge_done(BlockAcctStats *stats, enum 
BlockAcctType type,
 int64_t block_acct_idle_time_ns(BlockAcctStats *stats);
 double block_acct_queue_depth(BlockAcctTimedStats *stats,
   enum BlockAcctType type);
+int block_latency_histogram_set(BlockAcctStats *stats, uint64List *latency);
+void block_latency_histogram_clear(BlockAcctStats *stats);
 
 #endif
diff --git a/block/accounting.c b/block/accounting.c
index 87ef5bbfaa..0051ff0c24 100644
--- a/block/accounting.c
+++ b/block/accounting.c
@@ -94,6 +94,100 @@ void block_acct_start(BlockAcctStats *stats, 
BlockAcctCookie *cookie,
 cookie->type = type;
 }
 
+/* block_latency_histogram_compare_func
+ * Compare @key with interval [@el, @el+1), where @el+1 is a next array element
+ * after @el.
+ * Return: -1 if @key < @el
+ *  0 if @key in [@el, @el+1)
+ * +1 if @key >= @el+1
+ */
+static int block_latency_histogram_compare_func(const void *key, const void 
*el)
+{
+uint64_t k = *(uint64_t *)key;
+uint64_t a = *(uint64_t *)el;
+uint64_t b = *((uint64_t *)el + 1);
+
+return k < a ? -1 : (k < b ? 0 : 1);
+}
+
+static void block_latency_histogram_account(BlockLatencyHistogram *hist,
+enum BlockAcctType type,
+int64_t latency_ns)
+{
+uint64_t *data, *pos;
+
+if (hist->points == NULL) {
+/* histogram disabled */
+return;
+}
+
+data = hist->histogram[type];
+
+if (latency_ns < hist->points[0]) {
+data[0]++;
+return;
+}
+
+if (latency_ns >= hist->points[hist->size - 2]) {
+data[hist->size - 1]++;
+return;
+}
+
+pos = bsearch(&latency_ns, hist->points, hist->size - 2,
+  sizeof(hist->points[0]),
+  block_latency_histogram_compare_func);
+assert(pos != NULL);
+
+data[pos - hist->points + 1]++;
+}
+
+int block_latency_histogram_set(BlockAcctStats *stats, uint64List *latency)
+{
+BlockLatencyHistogram *hist = &stats->latency_histogram;
+uint64List *entry;
+uint64_t *ptr;
+int i;
+uint64_t prev = 0;
+
+hist->size = 1;
+
+for (entry = latency; entry; entry = entry->next) {
+if (entry->value <= prev) {
+return -EINVAL;
+}
+hist->size++;
+prev = entry->value;
+}
+
+hist->points = g_renew(uint64_t, hist->points, hist->size - 1);
+for (entry = latency, ptr = hist->points; entry;
+ entry = entry->next, ptr++)
+{
+*ptr = entry->value;
+}
+
+for (i = 0; i < BLOCK_MAX_IOTYPE; i++) {
+hist->histogram[i] = g_renew(uint64_t, hist->histogram[i], hist->size);
+memset(hist->histogram[i], 0, hist->size * sizeof(uint64_t));
+}
+
+return 0;
+}
+
+void block_latency_histogram_clear(BlockAcctStats *stats)
+{
+BlockLatencyHistogram *hist = &stats->latency_histogram;
+int i;
+
+g_free(hist->points);
+hist->points = NULL;
+
+for (i = 0; i < BLOCK_MAX_IOTYPE; i++) {
+g_free(hist->histogram[i]);
+hist->histogram[i] = NULL;
+}
+}
+
 static void block_account_one_io(BlockAcctStats *stats, BlockAcctCookie 
*cookie,
  bool failed)
 {
@@ -116,6 +210,9 @@ static void block_account_one_io(BlockAcctStats *stats, 
BlockAcctCookie *cookie,
 stats->nr_ops[cookie->type]++;
 }
 
+block_latency_histogram_account(&stats->latency_histogram, cookie->type,
+latency_ns);
+
 if (!failed || stats->account_failed) {
 stats->total_time_ns[cookie->type] += latency_ns;
 stats->last_access_time_ns = time_ns;
-- 
2.11.1
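
As a standalone sketch of the bucket selection performed by
block_latency_histogram_account() in the patch above (the boundary values
and samples below are made up for the example; this program is not part of
the patch):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Compare @key with the interval [el, el+1), as in the patch above. */
static int compare_func(const void *key, const void *el)
{
    uint64_t k = *(const uint64_t *)key;
    uint64_t a = ((const uint64_t *)el)[0];
    uint64_t b = ((const uint64_t *)el)[1];

    return k < a ? -1 : (k < b ? 0 : 1);
}

/* Map @latency to one of n+1 buckets defined by n ascending boundaries. */
static size_t bucket_index(uint64_t latency, const uint64_t *bounds, size_t n)
{
    const uint64_t *pos;

    if (latency < bounds[0]) {
        return 0;                       /* [0, bounds[0]) */
    }
    if (latency >= bounds[n - 1]) {
        return n;                       /* [bounds[n-1], +inf) */
    }
    /* search the n-1 inner intervals [bounds[i], bounds[i+1]) */
    pos = bsearch(&latency, bounds, n - 1, sizeof(bounds[0]), compare_func);
    return (size_t)(pos - bounds) + 1;
}

int main(void)
{
    /* three boundaries -> four buckets, e.g. "latency": [50, 100, 200] */
    const uint64_t bounds[] = { 50, 100, 200 };
    const uint64_t samples[] = { 10, 50, 99, 100, 150, 200, 1000 };
    uint64_t hist[4] = { 0 };
    size_t i;

    for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
        hist[bucket_index(samples[i], bounds, 3)]++;
    }
    printf("[0,50)=%" PRIu64 " [50,100)=%" PRIu64
           " [100,200)=%" PRIu64 " [200,inf)=%" PRIu64 "\n",
           hist[0], hist[1], hist[2], hist[3]);
    return 0;
}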




[Qemu-block] [PATCH v2 2/2] qapi: add block latency histogram interface

2018-02-07 Thread Vladimir Sementsov-Ogievskiy
Set (and clear) histogram through new command
block-latency-histogram-set and show new statistics in
query-blockstats results.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 qapi/block-core.json | 73 +++-
 block/qapi.c | 31 ++
 blockdev.c   | 19 ++
 3 files changed, 122 insertions(+), 1 deletion(-)

diff --git a/qapi/block-core.json b/qapi/block-core.json
index 8225308904..74fe3fe9c4 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -451,6 +451,74 @@
'status': 'DirtyBitmapStatus'} }
 
 ##
+# @BlockLatencyHistogramInfo:
+#
+# Block latency histogram.
+#
+# @latency: list of latency points in microseconds. Matches the @latency
+#   parameter from the last call to block-latency-histogram-set for the
+#   given device.
+#
+# @read: list of read-request counts corresponding to latency region.
+#len(@read) = len(@latency) + 1
+#@read[0] corresponds to latency region [0, @latency[0])
+#for 0 < i < len(@latency): @read[i] corresponds to latency region
+#[@latency[i-1], @latency[i])
+#@read.last_element corresponds to latency region
+#[@latency.last_element, +inf)
+#
+# @write: list of write-request counts (see @read semantics)
+#
+# @flush: list of flush-request counts (see @read semantics)
+#
+# Since: 2.12
+##
+{ 'struct': 'BlockLatencyHistogramInfo',
+  'data': {'latency': ['uint64'],
+   'read': ['uint64'],
+   'write': ['uint64'],
+   'flush': ['uint64'] } }
+
+##
+# @block-latency-histogram-set:
+#
+# Remove old latency histogram (if exist) and (optionally) create a new one.
+# Add a latency histogram to block device. If a latency histogram already
+# exists for the device it will be removed and a new one created. The latency
+# histogram may be queried through query-blockstats.
+# Set, restart or clear latency histogram.
+#
+# @device: device name to set latency histogram for.
+#
+# @latency: If unspecified, the latency histogram will be removed (if exists).
+#   Otherwise it should be list of latency points in microseconds. Old
+#   latency histogram will be removed (if exists) and a new one will be
+#   created. The sequence must be ascending, elements must be greater
+#   than zero. Histogram latency regions would be
+#   [0, @latency[0]), ..., [@latency[i], @latency[i+1]), ...,
+#   [@latency.last_element, +inf)
+#
+# Returns: error if device is not found or latency arrays is invalid.
+#
+# Since: 2.12
+#
+# Example (set new latency histogram):
+#
+# -> { "execute": "block-latency-histogram-set",
+#  "arguments": { "device": "drive0",
+# "latency": [50, 100, 200] } }
+# <- { "return": {} }
+#
+# Example (remove latency histogram):
+#
+# -> { "execute": "block-latency-histogram-set",
+#  "arguments": { "device": "drive0" } }
+# <- { "return": {} }
+##
+{ 'command': 'block-latency-histogram-set',
+  'data': {'device': 'str', '*latency': ['uint64'] } }
+
+##
 # @BlockInfo:
 #
 # Block device information.  This structure describes a virtual device and
@@ -730,6 +798,8 @@
 # @timed_stats: Statistics specific to the set of previously defined
 #   intervals of time (Since 2.5)
 #
+# @latency-histogram: @BlockLatencyHistogramInfo. (Since 2.12)
+#
 # Since: 0.14.0
 ##
 { 'struct': 'BlockDeviceStats',
@@ -742,7 +812,8 @@
'failed_flush_operations': 'int', 'invalid_rd_operations': 'int',
'invalid_wr_operations': 'int', 'invalid_flush_operations': 'int',
'account_invalid': 'bool', 'account_failed': 'bool',
-   'timed_stats': ['BlockDeviceTimedStats'] } }
+   'timed_stats': ['BlockDeviceTimedStats'],
+   '*latency-histogram': 'BlockLatencyHistogramInfo' } }
 
 ##
 # @BlockStats:
diff --git a/block/qapi.c b/block/qapi.c
index fc10f0a565..715ed17a6b 100644
--- a/block/qapi.c
+++ b/block/qapi.c
@@ -389,6 +389,24 @@ static void bdrv_query_info(BlockBackend *blk, BlockInfo 
**p_info,
 qapi_free_BlockInfo(info);
 }
 
+static uint64List *uint64_list(uint64_t *list, int size)
+{
+int i;
+uint64List *out_list = NULL;
+uint64List **pout_list = &out_list;
+
+for (i = 0; i < size; i++) {
+uint64List *entry = g_new(uint64List, 1);
+entry->value = list[i];
+*pout_list = entry;
+pout_list = &entry->next;
+}
+
+*pout_list = NULL;
+
+return out_list;
+}
+
 static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk)
 {
 BlockAcctStats *stats = blk_get_stats(blk);
@@ -454,6 +472,19 @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, 
BlockBackend *blk)
 dev_stats->avg_wr_queue_depth =
 block_acct_queue_depth(ts, BLOCK_ACCT_WRITE);
 }
+
+ds->has_latency_histogram = stats->latency_histogram.points != NULL;
+if (ds->has_latency_histogram) {
+

Re: [Qemu-block] [PATCH] ratelimit: don't align wait time with slices

2018-02-07 Thread Alberto Garcia
On Wed 07 Feb 2018 08:17:58 AM CET, Wolfgang Bumiller wrote:
> It is possible for rate limited writes to keep overshooting a slice's
> quota by a tiny amount causing the slice-aligned waiting period to
> effectively halve the rate.
>
> Signed-off-by: Wolfgang Bumiller 
> ---
> Copied the Ccs from the discussion thread, hope that's fine. I also
> just noticed that for my reply containing this snippet I had hit reply
> on the mail that did not yet contain those Ccs; sorry about that.

Stefan mentioned in another e-mail that someone proposed at some point
to unify this with the code in throttle.c. We can consider it, but that
needs to be evaluated first: the other code is more complex, has extra
features (bursts) and uses a different algorithm, so it may not be worth
the effort. I can take a look in the future when I have some time.

Otherwise, your patch looks good.

Reviewed-by: Alberto Garcia 

Berto



[Qemu-block] [PATCH v2 0/2] block latency histogram

2018-02-07 Thread Vladimir Sementsov-Ogievskiy
v2:

01: add block_latency_histogram_clear()
02: fix spelling (sorry =()
some rewordings
remove histogram if latency parameter unspecified

Vladimir Sementsov-Ogievskiy (2):
  block/accounting: introduce latency histogram
  qapi: add block latency histogram interface

 qapi/block-core.json   | 73 +-
 include/block/accounting.h |  9 +
 block/accounting.c | 97 ++
 block/qapi.c   | 31 +++
 blockdev.c | 19 +
 5 files changed, 228 insertions(+), 1 deletion(-)

-- 
2.11.1




Re: [Qemu-block] [PATCH for-2.9-rc5 v4 2/2] block: Drain BH in bdrv_drained_begin

2018-02-07 Thread Fam Zheng
On Wed, Feb 7, 2018 at 6:51 PM, Kevin Wolf  wrote:
> Am 07.02.2018 um 02:48 hat Fam Zheng geschrieben:
>> On Tue, Feb 6, 2018 at 11:32 PM, Kevin Wolf  wrote:
>> > Am 18.04.2017 um 16:30 hat Fam Zheng geschrieben:
>> >> During block job completion, nothing is preventing
>> >> block_job_defer_to_main_loop_bh from being called in a nested
>> >> aio_poll(), which is a trouble, such as in this code path:
>> >>
>> >> qmp_block_commit
>> >>   commit_active_start
>> >> bdrv_reopen
>> >>   bdrv_reopen_multiple
>> >> bdrv_reopen_prepare
>> >>   bdrv_flush
>> >> aio_poll
>> >>   aio_bh_poll
>> >> aio_bh_call
>> >>   block_job_defer_to_main_loop_bh
>> >> stream_complete
>> >>   bdrv_reopen
>> >>
>> >> block_job_defer_to_main_loop_bh is the last step of the stream job,
>> >> which should have been "paused" by the bdrv_drained_begin/end in
>> >> bdrv_reopen_multiple, but it is not done because it's in the form of a
>> >> main loop BH.
>> >>
>> >> Similar to why block jobs should be paused between drained_begin and
>> >> drained_end, BHs they schedule must be excluded as well.  To achieve
>> >> this, this patch forces draining the BH in BDRV_POLL_WHILE.
>> >>
>> >> As a side effect this fixes a hang in block_job_detach_aio_context
>> >> during system_reset when a block job is ready:
>> >>
>> >> #0  0x55aa79f3 in bdrv_drain_recurse
>> >> #1  0x55aa825d in bdrv_drained_begin
>> >> #2  0x55aa8449 in bdrv_drain
>> >> #3  0x55a9c356 in blk_drain
>> >> #4  0x55aa3cfd in mirror_drain
>> >> #5  0x55a66e11 in block_job_detach_aio_context
>> >> #6  0x55a62f4d in bdrv_detach_aio_context
>> >> #7  0x55a63116 in bdrv_set_aio_context
>> >> #8  0x55a9d326 in blk_set_aio_context
>> >> #9  0x557e38da in virtio_blk_data_plane_stop
>> >> #10 0x559f9d5f in virtio_bus_stop_ioeventfd
>> >> #11 0x559fa49b in virtio_bus_stop_ioeventfd
>> >> #12 0x559f6a18 in virtio_pci_stop_ioeventfd
>> >> #13 0x559f6a18 in virtio_pci_reset
>> >> #14 0x559139a9 in qdev_reset_one
>> >> #15 0x55916738 in qbus_walk_children
>> >> #16 0x55913318 in qdev_walk_children
>> >> #17 0x55916738 in qbus_walk_children
>> >> #18 0x559168ca in qemu_devices_reset
>> >> #19 0x5581fcbb in pc_machine_reset
>> >> #20 0x558a4d96 in qemu_system_reset
>> >> #21 0x5577157a in main_loop_should_exit
>> >> #22 0x5577157a in main_loop
>> >> #23 0x5577157a in main
>> >>
>> >> The rationale is that the loop in block_job_detach_aio_context cannot
>> >> make any progress in pausing/completing the job, because bs->in_flight
>> >> is 0, so bdrv_drain doesn't process the block_job_defer_to_main_loop
>> >> BH. With this patch, it does.
>> >>
>> >> Reported-by: Jeff Cody 
>> >> Signed-off-by: Fam Zheng 
>> >
>> > Fam, do you remember whether this was really only about drain? Because
>> > in that case...
>>
>> Yes I believe so.
>>
>> >
>> >> diff --git a/include/block/block.h b/include/block/block.h
>> >> index 97d4330..5ddc0cf 100644
>> >> --- a/include/block/block.h
>> >> +++ b/include/block/block.h
>> >> @@ -381,12 +381,13 @@ void bdrv_drain_all(void);
>> >>
>> >>  #define BDRV_POLL_WHILE(bs, cond) ({   \
>> >>  bool waited_ = false;  \
>> >> +bool busy_ = true; \
>> >>  BlockDriverState *bs_ = (bs);  \
>> >>  AioContext *ctx_ = bdrv_get_aio_context(bs_);  \
>> >>  if (aio_context_in_iothread(ctx_)) {   \
>> >> -while ((cond)) {   \
>> >> -aio_poll(ctx_, true);  \
>> >> -waited_ = true;\
>> >> +while ((cond) || busy_) {  \
>> >> +busy_ = aio_poll(ctx_, (cond));\
>> >> +waited_ |= !!(cond) | busy_;   \
>> >>  }  \
>> >>  } else {   \
>> >>  assert(qemu_get_current_aio_context() ==   \
>> >> @@ -398,11 +399,16 @@ void bdrv_drain_all(void);
>> >>   */\
>> >>  assert(!bs_->wakeup);  \
>> >>  bs_->wakeup = true;\
>> >> -while ((cond)) {   \
>> >> -aio_context_release(ctx_); \
>> >> -aio_poll(qemu_get_aio_context(), true);\
>> >> -aio_context_acqui

Re: [Qemu-block] [PATCH for-2.9-rc5 v4 2/2] block: Drain BH in bdrv_drained_begin

2018-02-07 Thread Kevin Wolf
Am 07.02.2018 um 02:48 hat Fam Zheng geschrieben:
> On Tue, Feb 6, 2018 at 11:32 PM, Kevin Wolf  wrote:
> > Am 18.04.2017 um 16:30 hat Fam Zheng geschrieben:
> >> During block job completion, nothing is preventing
> >> block_job_defer_to_main_loop_bh from being called in a nested
> >> aio_poll(), which is a trouble, such as in this code path:
> >>
> >> qmp_block_commit
> >>   commit_active_start
> >> bdrv_reopen
> >>   bdrv_reopen_multiple
> >> bdrv_reopen_prepare
> >>   bdrv_flush
> >> aio_poll
> >>   aio_bh_poll
> >> aio_bh_call
> >>   block_job_defer_to_main_loop_bh
> >> stream_complete
> >>   bdrv_reopen
> >>
> >> block_job_defer_to_main_loop_bh is the last step of the stream job,
> >> which should have been "paused" by the bdrv_drained_begin/end in
> >> bdrv_reopen_multiple, but it is not done because it's in the form of a
> >> main loop BH.
> >>
> >> Similar to why block jobs should be paused between drained_begin and
> >> drained_end, BHs they schedule must be excluded as well.  To achieve
> >> this, this patch forces draining the BH in BDRV_POLL_WHILE.
> >>
> >> As a side effect this fixes a hang in block_job_detach_aio_context
> >> during system_reset when a block job is ready:
> >>
> >> #0  0x55aa79f3 in bdrv_drain_recurse
> >> #1  0x55aa825d in bdrv_drained_begin
> >> #2  0x55aa8449 in bdrv_drain
> >> #3  0x55a9c356 in blk_drain
> >> #4  0x55aa3cfd in mirror_drain
> >> #5  0x55a66e11 in block_job_detach_aio_context
> >> #6  0x55a62f4d in bdrv_detach_aio_context
> >> #7  0x55a63116 in bdrv_set_aio_context
> >> #8  0x55a9d326 in blk_set_aio_context
> >> #9  0x557e38da in virtio_blk_data_plane_stop
> >> #10 0x559f9d5f in virtio_bus_stop_ioeventfd
> >> #11 0x559fa49b in virtio_bus_stop_ioeventfd
> >> #12 0x559f6a18 in virtio_pci_stop_ioeventfd
> >> #13 0x559f6a18 in virtio_pci_reset
> >> #14 0x559139a9 in qdev_reset_one
> >> #15 0x55916738 in qbus_walk_children
> >> #16 0x55913318 in qdev_walk_children
> >> #17 0x55916738 in qbus_walk_children
> >> #18 0x559168ca in qemu_devices_reset
> >> #19 0x5581fcbb in pc_machine_reset
> >> #20 0x558a4d96 in qemu_system_reset
> >> #21 0x5577157a in main_loop_should_exit
> >> #22 0x5577157a in main_loop
> >> #23 0x5577157a in main
> >>
> >> The rationale is that the loop in block_job_detach_aio_context cannot
> >> make any progress in pausing/completing the job, because bs->in_flight
> >> is 0, so bdrv_drain doesn't process the block_job_defer_to_main_loop
> >> BH. With this patch, it does.
> >>
> >> Reported-by: Jeff Cody 
> >> Signed-off-by: Fam Zheng 
> >
> > Fam, do you remember whether this was really only about drain? Because
> > in that case...
> 
> Yes I believe so.
> 
> >
> >> diff --git a/include/block/block.h b/include/block/block.h
> >> index 97d4330..5ddc0cf 100644
> >> --- a/include/block/block.h
> >> +++ b/include/block/block.h
> >> @@ -381,12 +381,13 @@ void bdrv_drain_all(void);
> >>
> >>  #define BDRV_POLL_WHILE(bs, cond) ({   \
> >>  bool waited_ = false;  \
> >> +bool busy_ = true; \
> >>  BlockDriverState *bs_ = (bs);  \
> >>  AioContext *ctx_ = bdrv_get_aio_context(bs_);  \
> >>  if (aio_context_in_iothread(ctx_)) {   \
> >> -while ((cond)) {   \
> >> -aio_poll(ctx_, true);  \
> >> -waited_ = true;\
> >> +while ((cond) || busy_) {  \
> >> +busy_ = aio_poll(ctx_, (cond));\
> >> +waited_ |= !!(cond) | busy_;   \
> >>  }  \
> >>  } else {   \
> >>  assert(qemu_get_current_aio_context() ==   \
> >> @@ -398,11 +399,16 @@ void bdrv_drain_all(void);
> >>   */\
> >>  assert(!bs_->wakeup);  \
> >>  bs_->wakeup = true;\
> >> -while ((cond)) {   \
> >> -aio_context_release(ctx_); \
> >> -aio_poll(qemu_get_aio_context(), true);\
> >> -aio_context_acquire(ctx_); \
> >> -waited_ = true;\
> >> +while (busy_) {  

Re: [Qemu-block] [PATCH 2/2] qapi: add block latency histogram interface

2018-02-07 Thread Vladimir Sementsov-Ogievskiy

06.02.2018 22:02, Eric Blake wrote:
> On 02/06/2018 12:06 PM, Vladimir Sementsov-Ogievskiy wrote:
>> 06.02.2018 18:50, Eric Blake wrote:
>>> On 02/06/2018 08:07 AM, Vladimir Sementsov-Ogievskiy wrote:
>>>> Set (and clear) histogram through new command
>>>> block-latency-histogram-set and show new statistics in
>>>> query-blockstats results.
>>>
>>> The commit message mentioned that you can set and clear histogram
>>> tracking; how does this interface let you clear things?  By passing
>>> a 0-length latency array?  If you violate the constraint (pass
>>> non-ascending points, for example), does the previous latency map
>>> get wiped out or is it still preserved unchanged?
>>
>> On error nothing is changed.
>>
>> By "clear" I mean zeroing statistics, not removing the whole
>> histogram. And to zero statistics you can call set with the same
>> latency array.
>> There is no way to remove histogram at all.. We can add
>> block-latency-histogram-unset later if needed.
>
> Maybe "set (or restart) histogram collection points" might read
> better? I also don't think we need a new command; if you make
> 'latency' optional, then omitting it can serve to stop collecting
> statistics altogether, without needing a new command (then again, if
> you do that, now the command is used to "set, restart, and clear
> histogram collection").

I like the idea, will do.

--
Best regards,
Vladimir