Re: [Qemu-block] [PATCH v5 11/11] block/backup: use backup-top instead of write notifiers

2019-01-18 Thread Max Reitz
On 29.12.18 13:20, Vladimir Sementsov-Ogievskiy wrote:
> Drop write notifiers and use the filter node instead. Changes:
> 
> 1. Copy-before-write is now handled by the filter node, so drop all
>    is_write_notifier arguments.
> 
> 2. We no longer have intersecting requests, so their handling is dropped.
> Instead, synchronization works as follows:
> when backup or backup-top starts copying some area, it first clears the
> corresponding copy-bitmap bits, and nobody touches areas that are not
> marked with dirty bits in the copy-bitmap, so there is no intersection
> (a minimal model of this is sketched after this list). Also, backup job
> copy operations are surrounded by a bdrv region lock, which effectively
> serializes requests, so that we do not interfere with guest writes and
> do not read changed data from the source (before reading we clear the
> corresponding bit in the copy-bitmap, so the area is no longer handled
> by backup-top).
> 
> 3. To sync with in-flight requests we now just drain the hook node; we
> don't need the rw-lock.
> 
> 4. After the whole backup loop (top, full, incremental modes), we need
> to check for clusters that were not copied: clusters we skipped because
> they were being handled by backup-top, but whose backup-top operation
> then failed, returned an error to the guest and set the dirty bits back.
> 
> 5. Don't create additional blk, use backup-top children for copy
> operations.
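A minimal, self-contained model of the claim-by-clearing scheme from point 2 (not QEMU code: every name below is invented for illustration, and the real series relies on coroutine scheduling plus the bdrv region lock rather than C11 atomics, which are used here only to keep the standalone example honest). Whoever clears a cluster's bit first owns that cluster, so the job and the filter never copy the same area concurrently; a failed copy hands the cluster back by setting its bit again, which is what point 4 then has to mop up:

/*
 * Toy model of the copy-bitmap handshake -- not QEMU code.
 * A set bit means "not copied yet"; clearing it claims the cluster.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define CLUSTERS 8

static atomic_bool copy_bitmap[CLUSTERS];   /* true = dirty, still to copy */

/* Returns true if the caller won the right to copy this cluster. */
static bool claim_cluster(int64_t cluster)
{
    bool expected = true;
    return atomic_compare_exchange_strong(&copy_bitmap[cluster],
                                          &expected, false);
}

/* Called when a copy attempt failed: hand the cluster back. */
static void set_dirty_again(int64_t cluster)
{
    atomic_store(&copy_bitmap[cluster], true);
}

int main(void)
{
    for (int64_t i = 0; i < CLUSTERS; i++) {
        atomic_store(&copy_bitmap[i], true);
    }

    /* The filter claims cluster 3 on a guest write... */
    printf("filter claims 3: %d\n", claim_cluster(3));
    /* ...so the job loop skips it instead of copying it twice. */
    printf("job claims 3:    %d\n", claim_cluster(3));

    /* A failed copy-before-write sets the bit back; the job retries. */
    set_dirty_again(3);
    printf("job retries 3:   %d\n", claim_cluster(3));
    return 0;
}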
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> ---
>  block/backup.c | 285 ++---
>  1 file changed, 149 insertions(+), 136 deletions(-)
> 
> diff --git a/block/backup.c b/block/backup.c
> index 88c0242b4e..e332909fb7 100644
> --- a/block/backup.c
> +++ b/block/backup.c

[...]

> @@ -300,21 +231,23 @@ static void backup_abort(Job *job)
>  static void backup_clean(Job *job)
>  {
>  BackupBlockJob *s = container_of(job, BackupBlockJob, common.job);
> -assert(s->target);
> -blk_unref(s->target);
> +
> +/* We must clean it to not crash in backup_drain. */
>  s->target = NULL;

Why not set s->source to NULL along with it?  It makes sense if you're
going to drop the backup-top node because both of these are its children.
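For reference, a sketch of how the cleanup could read with that suggestion applied. This is not a compilable unit on its own; it only rearranges the calls already visible in the quoted hunk, plus the proposed s->source assignment:

static void backup_clean(Job *job)
{
    BackupBlockJob *s = container_of(job, BackupBlockJob, common.job);

    /*
     * Clear both pointers before dropping backup-top: target and source
     * are backup-top's children, and backup_drain must not see stale
     * pointers once the filter node is gone.
     */
    s->target = NULL;
    s->source = NULL;

    if (s->copy_bitmap) {
        hbitmap_free(s->copy_bitmap);
        s->copy_bitmap = NULL;
    }

    bdrv_backup_top_drop(s->backup_top);
}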

>  
>  if (s->copy_bitmap) {
>  hbitmap_free(s->copy_bitmap);
>  s->copy_bitmap = NULL;
>  }
> +
> +bdrv_backup_top_drop(s->backup_top);
>  }

[...]

> @@ -386,21 +319,45 @@ static int coroutine_fn 
> backup_run_incremental(BackupBlockJob *job)
>  bool error_is_read;
>  int64_t offset;
>  HBitmapIter hbi;
> +void *lock = NULL;
>  
>  hbitmap_iter_init(&hbi, job->copy_bitmap, 0);
> -while ((offset = hbitmap_iter_next(&hbi)) != -1) {
> +while (hbitmap_count(job->copy_bitmap)) {
> +offset = hbitmap_iter_next(&hbi);
> +if (offset == -1) {
> +/*
> + * we may have skipped some clusters, which were handled by
> + * backup-top, but failed and finished by returning error to
> + * the guest and set dirty bit back.
> + */
> +hbitmap_iter_init(&hbi, job->copy_bitmap, 0);
> +offset = hbitmap_iter_next(&hbi);
> +assert(offset);

I think you want to assert "offset >= 0".
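A tiny standalone illustration of why the plain assert(offset) is the wrong guard here (nothing QEMU-specific in it): the iterator reports exhaustion as -1, and 0 is a perfectly valid offset for the first cluster, so assert(offset) passes on the failure value and would abort on valid data, while assert(offset >= 0) expresses what the comment above actually intends:

/* Standalone demo, not QEMU code: -1 is truthy, 0 is a valid offset. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    int64_t exhausted = -1; /* what the iterator returns when it runs dry */
    int64_t first = 0;      /* a legitimate offset of the first cluster   */

    assert(exhausted);      /* passes silently -- the bug goes unnoticed  */
    assert(first >= 0);     /* the suggested check: rejects -1, accepts 0 */
    /* assert(first); */    /* would abort here on perfectly valid data   */

    printf("assert(offset) cannot tell -1 from a real offset\n");
    return 0;
}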

> +}
> +
> +lock = bdrv_co_try_lock(job->source, offset, job->cluster_size);
> +/*
> + * Dirty bit is set, which means that there are no in-flight
> + * write requests on this area. We must succeed.
> + */
> +assert(lock);

I'm not sure that is true right now, but more on that below in backup_run().

> +
>  do {
>  if (yield_and_check(job)) {
> +bdrv_co_unlock(lock);
>  return 0;
>  }
> -ret = backup_do_cow(job, offset,
> -job->cluster_size, &error_is_read, false);
> +ret = backup_do_cow(job, offset, job->cluster_size, 
> &error_is_read);
>  if (ret < 0 && backup_error_action(job, error_is_read, -ret) ==
> BLOCK_ERROR_ACTION_REPORT)
>  {
> +bdrv_co_unlock(lock);
>  return ret;
>  }
>  } while (ret < 0);
> +
> +bdrv_co_unlock(lock);
> +lock = NULL;

This statement seems unnecessary here.

>  }
>  
>  return 0;

[...]

> @@ -447,26 +402,39 @@ static int coroutine_fn backup_run(Job *job, Error 
> **errp)
>  hbitmap_set(s->copy_bitmap, 0, s->len);
>  }
>  
> -s->before_write.notify = backup_before_write_notify;
> -bdrv_add_before_write_notifier(bs, &s->before_write);
> -
>  if (s->sync_mode == MIRROR_SYNC_MODE_NONE) {
>  /* All bits are set in copy_bitmap to allow any cluster to be copied.
>   * This does not actually require them to be copied. */
>  while (!job_is_cancelled(job)) {
> -/* Yield until the job is cancelled.  We just let our 
> before_write
> - * notify callback service CoW requests. */
> +/*
> + * Yield until the job is cancelled.  We just let our backup-top
> +

Re: [Qemu-block] [PATCH v5 11/11] block/backup: use backup-top instead of write notifiers

2019-01-28 Thread Vladimir Sementsov-Ogievskiy
18.01.2019 17:56, Max Reitz wrote:
> On 29.12.18 13:20, Vladimir Sementsov-Ogievskiy wrote:
>> [...]
> 
> [...]
> 
>> @@ -300,21 +231,23 @@ static void backup_abort(Job *job)
>>   static void backup_clean(Job *job)
>>   {
>>   BackupBlockJob *s = container_of(job, BackupBlockJob, common.job);
>> -assert(s->target);
>> -blk_unref(s->target);
>> +
>> +/* We must clean it to not crash in backup_drain. */
>>   s->target = NULL;
> 
> Why not set s->source to NULL along with it?  It makes sense if you're
> going to drop the backup-top node because both of these are its children.

agree.

> [...]

Re: [Qemu-block] [PATCH v5 11/11] block/backup: use backup-top instead of write notifiers

2019-01-28 Thread Max Reitz
On 28.01.19 12:29, Vladimir Sementsov-Ogievskiy wrote:
> [...]

Re: [Qemu-block] [PATCH v5 11/11] block/backup: use backup-top instead of write notifiers

2019-01-28 Thread Vladimir Sementsov-Ogievskiy
28.01.2019 18:59, Max Reitz wrote:
> [...]
 

Re: [Qemu-block] [PATCH v5 11/11] block/backup: use backup-top instead of write notifiers

2019-01-28 Thread Max Reitz
On 28.01.19 17:44, Vladimir Sementsov-Ogievskiy wrote:
> 28.01.2019 18:59, Max Reitz wrote:
>> On 28.01.19 12:29, Vladimir Sementsov-Ogievskiy wrote:
>>> 18.01.2019 17:56, Max Reitz wrote:
>>>> On 29.12.18 13:20, Vladimir Sementsov-Ogievskiy wrote:

[...]

>>>>> @@ -505,8 +474,20 @@ static int coroutine_fn backup_run(Job *job, Error **errp)
>>>>>              if (alloced < 0) {
>>>>>                  ret = alloced;
>>>>>              } else {
>>>>> +                if (!hbitmap_get(s->copy_bitmap, offset)) {
>>>>> +                    trace_backup_do_cow_skip(job, offset);
>>>>> +                    continue; /* already copied */
>>>>> +                }
>>>>> +                if (!lock) {
>>>>> +                    lock = bdrv_co_try_lock(s->source, offset, s->cluster_size);
>>>>> +                    /*
>>>>> +                     * Dirty bit is set, which means that there are no in-flight
>>>>> +                     * write requests on this area. We must succeed.
>>>>> +                     */
>>>>> +                    assert(lock);

>>>> What if I have a different parent node for the source that issues
>>>> concurrent writes?  This used to work fine because the before_write
>>>> notifier would still work.  After this patch, that would be broken
>>>> because those writes would not cause a CbW.
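A standalone toy of the failure mode described above (no QEMU APIs; all names are invented): a writer that goes through the copy-before-write hook preserves the point-in-time data, while a writer attached directly to the source clobbers data that was never copied, so the backup silently captures post-backup-start contents:

/* Toy model of the bypass problem -- every name here is made up. */
#include <stdbool.h>
#include <stdio.h>

static char source[2] = { 'A', 'A' };  /* guest-visible data            */
static char backup[2];                 /* what the backup job produces  */
static bool copied[2];                 /* per-"cluster" CbW bookkeeping */

/* Copy-before-write: save the old byte before letting the write through. */
static void write_through_filter(int i, char c)
{
    if (!copied[i]) {
        backup[i] = source[i];
        copied[i] = true;
    }
    source[i] = c;
}

/* A parent attached directly to the source node: no CbW happens. */
static void write_bypassing_filter(int i, char c)
{
    source[i] = c;
}

int main(void)
{
    write_through_filter(0, 'B');    /* backup[0] correctly keeps 'A'      */
    write_bypassing_filter(1, 'B');  /* old contents of cluster 1 are lost */

    /* The job later copies cluster 1 and gets the new data instead of the
     * point-in-time data the backup was supposed to capture. */
    if (!copied[1]) {
        backup[1] = source[1];
        copied[1] = true;
    }
    printf("backup: %c %c (expected: A A)\n", backup[0], backup[1]);
    return 0;
}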
>>>
>>> But how could you have this different parent node? After appending the
>>> filter, there should not be such nodes.
>>
>> Unless you append them afterwards:
>>
>>> And I think that during backup it should be forbidden to append new
>>> parents to the source, bypassing the filter, as that definitely breaks
>>> what the filter does.
>>
>> Agreed, but then this needs to be implemented.
>>
>>> And it applies to other block jobs with their filters. If we appended a
>>> filter, we don't want anyone else to write around our filter.
>>>

>>>> That's not so bad because we just have to make sure that all writes go
>>>> through the backup-top node.  That would make this assertion valid
>>>> again, too.  But that means we cannot share PERM_WRITE; see [1].
>>>
>>> But we don't share PERM_WRITE on source in backup_top, only on target.
>>
>> Are you sure?  The job itself shares it, and the filter shares it, too,
>> as far as I can see.  It uses bdrv_filter_default_perms(), and that does
>> seem to share PERM_WRITE.
> 
> And in bdrv_filter_default_perms() it does "*nshared = *nshared |
> BLK_PERM_WRITE" only for child_file, which is the target. The source is
> child_backing.

Hm?  bdrv_filter_default_perms() does this, unconditionally:

> *nshared = (shared & DEFAULT_PERM_PASSTHROUGH) |
>(c->shared_perm & DEFAULT_PERM_UNCHANGED);

The backup_top filter does what you describe, but it just leaves
*nshared untouched after bdrv_filter_default_perms() has done the above.
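To make the masks concrete, here is a small standalone calculation of that line. The BLK_PERM_* bit values below are illustrative rather than QEMU's actual definitions, and the assumption spelled out in the code (which is exactly the point being made) is that DEFAULT_PERM_PASSTHROUGH contains BLK_PERM_WRITE: if the parent shares WRITE and the filter does not mask it out afterwards, WRITE stays shared on the child.

/* Standalone sketch with made-up bit values -- only the formula is real. */
#include <stdint.h>
#include <stdio.h>

enum {
    BLK_PERM_CONSISTENT_READ = 1u << 0,
    BLK_PERM_WRITE           = 1u << 1,
    BLK_PERM_WRITE_UNCHANGED = 1u << 2,
    BLK_PERM_RESIZE          = 1u << 3,
    BLK_PERM_GRAPH_MOD       = 1u << 4,
};

/* Assumption: the passthrough set includes WRITE (that is the point here). */
#define DEFAULT_PERM_PASSTHROUGH \
    (BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE | \
     BLK_PERM_WRITE_UNCHANGED | BLK_PERM_RESIZE)
#define DEFAULT_PERM_UNCHANGED (~(uint64_t)DEFAULT_PERM_PASSTHROUGH)

int main(void)
{
    uint64_t shared = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE; /* from parent    */
    uint64_t child_shared_perm = BLK_PERM_GRAPH_MOD;             /* c->shared_perm */

    /* The quoted line from bdrv_filter_default_perms(): */
    uint64_t nshared = (shared & DEFAULT_PERM_PASSTHROUGH) |
                       (child_shared_perm & DEFAULT_PERM_UNCHANGED);

    printf("WRITE still shared: %s\n",
           (nshared & BLK_PERM_WRITE) ? "yes" : "no");  /* -> yes */

    /* What the filter would have to do on its source child to forbid
     * writes that bypass it: */
    nshared &= ~(uint64_t)BLK_PERM_WRITE;
    printf("after masking:      %s\n",
           (nshared & BLK_PERM_WRITE) ? "yes" : "no");  /* -> no  */
    return 0;
}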

[...]

>>>>> @@ -655,25 +656,31 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
>>>>>  
>>>>>      copy_bitmap = hbitmap_alloc(len, ctz32(cluster_size));
>>>>>  
>>>>> -    /* job->len is fixed, so we can't allow resize */
>>>>> -    job = block_job_create(job_id, &backup_job_driver, txn, bs,
>>>>> -                           BLK_PERM_CONSISTENT_READ,
>>>>> -                           BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE |
>>>>> -                           BLK_PERM_WRITE_UNCHANGED | BLK_PERM_GRAPH_MOD,
>>>>> -                           speed, creation_flags, cb, opaque, errp);
>>>>> -    if (!job) {
>>>>> +    /*
>>>>> +     * bdrv_get_device_name will not help to find device name starting from
>>>>> +     * @bs after backup-top append,

>>>> Why not?  Since backup-top is appended, shouldn't all parents of @bs be
>>>> parents of @backup_top then?  (Making bdrv_get_parent_name() return the
>>>> same result)
>>>
>>> bdrv_get_device_name() finally goes through role->get_name, and only the
>>> root role has this handler. After the append we'll have the backing role
>>> instead of root.
>>
>> Ah, I see, I asked the wrong question.
>>
>> Why is block_job_create() called on bs and not on backup_top?  mirror
>> calls it on mirror_top_bs.
> 
> Good question. I don't remember exactly; maybe there were more troubles
> with permissions or something. So I'll have to try it again.
> 
> What is more beneficial?
> 
> My current approach is that the job and the filter are two sibling users
> of the source node; they both do copying, and they are synchronized. In
> this scheme it is better to read from the source directly, so as not to
> create extra intersections between the job and the filter.
> 
> On the other hand, if we read through the filter, we should possibly do
> the whole copy operation through the filter.
> 
> What is the difference between a guest read and a backup-job read, from
> the filter's point of view? I think:
> 
> For a guest read, the filter MUST read (as we must handle the guest
> request), and then, if we don't have too many in-flight requests, the
> RAM cache is not full, etc., we can handle the already-read data in
> terms of backup, i.e. copy it

Re: [Qemu-block] [PATCH v5 11/11] block/backup: use backup-top instead of write notifiers

2019-01-28 Thread Vladimir Sementsov-Ogievskiy
28.01.2019 19:53, Max Reitz wrote:
> [...]
> 
> Hm?  bdrv_filter_default_perms() does this, unconditionally:
> 
>>  *nshared = (shared & DEFAULT_PERM_PASSTHROUGH) |
>> (c->shared_perm & DEFAULT_PERM_UNCHANGED);
> 
> The backup_top filter does what you describe, but it just leaves
> *nshared untouched after bdrv_filter_default_perms() has done the above.


Oops, yes, I meant my backup_top_child_perm(), not bdrv_filter_default_perms(),
which it calls too. So, OK, I will try to fix it. Sorry for the long extra
arguing :(
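For the record, the fix being discussed would look roughly like the sketch below. This is a sketch only, under two assumptions: that backup_top_child_perm() has the usual .bdrv_child_perm signature of this period, and that it currently just adds BLK_PERM_WRITE to *nshared for child_file (the target) after calling bdrv_filter_default_perms(), as described earlier in the thread. The key change is masking BLK_PERM_WRITE out of *nshared for the source (backing) child, so that no parent attached directly to the source can write around the copy-before-write filter:

static void backup_top_child_perm(BlockDriverState *bs, BdrvChild *c,
                                  const BdrvChildRole *role,
                                  BlockReopenQueue *reopen_queue,
                                  uint64_t perm, uint64_t shared,
                                  uint64_t *nperm, uint64_t *nshared)
{
    bdrv_filter_default_perms(bs, c, role, reopen_queue,
                              perm, shared, nperm, nshared);

    if (role == &child_file) {
        /* Target child: others may still write to it, as before. */
        *nshared |= BLK_PERM_WRITE;
    } else {
        /*
         * Source (backing) child: do not share BLK_PERM_WRITE, so nobody
         * can attach to the source and write past the CbW filter.
         */
        *nshared &= ~(uint64_t)BLK_PERM_WRITE;
    }
}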

> 
> [...]

Re: [Qemu-block] [PATCH v5 11/11] block/backup: use backup-top instead of write notifiers

2019-01-28 Thread Kevin Wolf
On 28.01.2019 at 17:44, Vladimir Sementsov-Ogievskiy wrote:
> 28.01.2019 18:59, Max Reitz wrote:
> > On 28.01.19 12:29, Vladimir Sementsov-Ogievskiy wrote:
> >> 18.01.2019 17:56, Max Reitz wrote:
> >>> On 29.12.18 13:20, Vladimir Sementsov-Ogievskiy wrote:
> [...]

Re: [Qemu-block] [PATCH v5 11/11] block/backup: use backup-top instead of write notifiers

2019-01-28 Thread Vladimir Sementsov-Ogievskiy
On 28 Jan 2019 at 20:40, Kevin Wolf wrote:

> [...]