[PATCH 3/3] qemu-img: Speed up checksum

2022-09-01 Thread Nir Soffer
Add coroutine based loop inspired by `qemu-img convert` design.

Changes compared to `qemu-img convert`:

- State for the entire image is kept in ImgChecksumState

- State for single worker coroutine is kept in ImgChecksumworker.

- "Writes" are always in-order, ensured using a queue.

- Calling block status once per image extent, when the current extent is
  consumed by the workers.

- Using 1m buffer size - testings shows that this gives best read
  performance both with buffered and direct I/O.

- Number of coroutines is not configurable. Testing does not show
  improvement when using more than 8 coroutines.

- Progress include entire image, not only the allocated state.

Comparing to the simple read loop shows that this version is up to 4.67
times faster when computing a checksum for an image full of zeroes. For
real images it is 1.59 times faster with direct I/O, and with buffered
I/O there is no difference.

Test results on Dell PowerEdge R640 in a CentOS Stream 9 container:

| image| size | i/o   | before | after  | change |
|--|--|---||||
| zero [1] |   6g | buffered  | 1.600s ±0.014s | 0.342s ±0.016s |  x4.67 |
| zero |   6g | direct| 4.684s ±0.093s | 2.211s ±0.009s |  x2.12 |
| real [2] |   6g | buffered  | 1.841s ±0.075s | 1.806s ±0.036s |  x1.02 |
| real |   6g | direct| 3.094s ±0.079s | 1.947s ±0.017s |  x1.59 |
| nbd  [3] |   6g | buffered  | 2.455s ±0.183s | 1.808s ±0.016s |  x1.36 |
| nbd  |   6g | direct| 3.540s ±0.020s | 1.749s ±0.018s |  x2.02 |

[1] raw image full of zeroes
[2] raw fedora 35 image with additional random data, 50% full
[3] image [2] exported by qemu-nbd via unix socket

Signed-off-by: Nir Soffer 
---
 qemu-img.c | 343 +
 1 file changed, 270 insertions(+), 73 deletions(-)

diff --git a/qemu-img.c b/qemu-img.c
index 7edcfe4bc8..bfa8e2862f 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -1613,48 +1613,288 @@ out:
 qemu_vfree(buf2);
 blk_unref(blk2);
 out2:
 blk_unref(blk1);
 out3:
 qemu_progress_end();
 return ret;
 }
 
 #ifdef CONFIG_BLKHASH
+
+#define CHECKSUM_COROUTINES 8
+#define CHECKSUM_BUF_SIZE (1 * MiB)
+#define CHECKSUM_ZERO_SIZE MIN(16 * GiB, SIZE_MAX)
+
+typedef struct ImgChecksumState ImgChecksumState;
+
+typedef struct ImgChecksumWorker {
+QTAILQ_ENTRY(ImgChecksumWorker) entry;
+ImgChecksumState *state;
+Coroutine *co;
+uint8_t *buf;
+
+/* The current chunk. */
+int64_t offset;
+int64_t length;
+bool zero;
+
+/* Always true for zero extent, false for data extent. Set to true
+ * when reading the chunk completes. */
+bool ready;
+} ImgChecksumWorker;
+
+struct ImgChecksumState {
+const char *filename;
+BlockBackend *blk;
+BlockDriverState *bs;
+int64_t total_size;
+
+/* Current extent, modified in checksum_co_next. */
+int64_t offset;
+int64_t length;
+bool zero;
+
+int running_coroutines;
+CoMutex lock;
+ImgChecksumWorker workers[CHECKSUM_COROUTINES];
+
+/* Ensure in-order updates. Update are scheduled at the tail of the
+ * queue and processed from the head of the queue when a worker is
+ * ready. */
+QTAILQ_HEAD(, ImgChecksumWorker) update_queue;
+
+struct blkhash *hash;
+int ret;
+};
+
+static int checksum_block_status(ImgChecksumState *s)
+{
+int64_t length;
+int status;
+
+/* Must be called when current extent is consumed. */
+assert(s->length == 0);
+
+status = bdrv_block_status_above(s->bs, NULL, s->offset,
+ s->total_size - s->offset, &length, NULL,
+ NULL);
+if (status < 0) {
+error_report("Error checking status at offset %" PRId64 " for %s",
+ s->offset, s->filename);
+s->ret = status;
+return -1;
+}
+
+assert(length > 0);
+
+s->length = length;
+s->zero = !!(status & BDRV_BLOCK_ZERO);
+
+return 0;
+}
+
+/**
+ * Grab the next chunk from the current extent, getting the next extent if
+ * needed, and schecule the next update at the end fo the update queue.
+ *
+ * Retrun true if the worker has work to do, false if the worker has
+ * finished or there was an error getting the next extent.
+ */
+static coroutine_fn bool checksum_co_next(ImgChecksumWorker *w)
+{
+ImgChecksumState *s = w->state;
+
+qemu_co_mutex_lock(&s->lock);
+
+if (s->offset == s->total_size || s->ret != -EINPROGRESS) {
+qemu_co_mutex_unlock(&s->lock);
+return false;
+}
+
+if (s->length == 0 && checksum_block_status(s)) {
+qemu_co_mutex_unlock(&s->lock);
+return false;
+}
+
+/* Grab one chunk from current extent. */
+w->offset = s->offset;
+w->length = MIN(s->length,
+s->zero ?  CHECKSUM_ZERO_SIZE : CHECKSUM_BUF_SIZE);
+w->zero = s->zero;
+w->ready = s->

Re: [PATCH 3/3] qemu-img: Speed up checksum

2022-10-26 Thread Hanna Reitz

On 01.09.22 16:32, Nir Soffer wrote:

Add coroutine based loop inspired by `qemu-img convert` design.

Changes compared to `qemu-img convert`:

- State for the entire image is kept in ImgChecksumState

- State for single worker coroutine is kept in ImgChecksumworker.

- "Writes" are always in-order, ensured using a queue.

- Calling block status once per image extent, when the current extent is
   consumed by the workers.

- Using 1m buffer size - testings shows that this gives best read
   performance both with buffered and direct I/O.


Why does patch 1 then choose to use 2 MB?


- Number of coroutines is not configurable. Testing does not show
   improvement when using more than 8 coroutines.

- Progress include entire image, not only the allocated state.

Comparing to the simple read loop shows that this version is up to 4.67
times faster when computing a checksum for an image full of zeroes. For
real images it is 1.59 times faster with direct I/O, and with buffered
I/O there is no difference.

Test results on Dell PowerEdge R640 in a CentOS Stream 9 container:

| image| size | i/o   | before | after  | change |
|--|--|---||||
| zero [1] |   6g | buffered  | 1.600s ±0.014s | 0.342s ±0.016s |  x4.67 |
| zero |   6g | direct| 4.684s ±0.093s | 2.211s ±0.009s |  x2.12 |
| real [2] |   6g | buffered  | 1.841s ±0.075s | 1.806s ±0.036s |  x1.02 |
| real |   6g | direct| 3.094s ±0.079s | 1.947s ±0.017s |  x1.59 |
| nbd  [3] |   6g | buffered  | 2.455s ±0.183s | 1.808s ±0.016s |  x1.36 |
| nbd  |   6g | direct| 3.540s ±0.020s | 1.749s ±0.018s |  x2.02 |

[1] raw image full of zeroes
[2] raw fedora 35 image with additional random data, 50% full
[3] image [2] exported by qemu-nbd via unix socket

Signed-off-by: Nir Soffer 
---
  qemu-img.c | 343 +
  1 file changed, 270 insertions(+), 73 deletions(-)


Looks good!

Just a couple of style comments below.


diff --git a/qemu-img.c b/qemu-img.c
index 7edcfe4bc8..bfa8e2862f 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -1613,48 +1613,288 @@ out:
  qemu_vfree(buf2);
  blk_unref(blk2);
  out2:
  blk_unref(blk1);
  out3:
  qemu_progress_end();
  return ret;
  }
  
  #ifdef CONFIG_BLKHASH

+
+#define CHECKSUM_COROUTINES 8
+#define CHECKSUM_BUF_SIZE (1 * MiB)
+#define CHECKSUM_ZERO_SIZE MIN(16 * GiB, SIZE_MAX)
+
+typedef struct ImgChecksumState ImgChecksumState;
+
+typedef struct ImgChecksumWorker {
+QTAILQ_ENTRY(ImgChecksumWorker) entry;
+ImgChecksumState *state;
+Coroutine *co;
+uint8_t *buf;
+
+/* The current chunk. */
+int64_t offset;
+int64_t length;
+bool zero;
+
+/* Always true for zero extent, false for data extent. Set to true
+ * when reading the chunk completes. */


Qemu codestyle requires /* and */ to be on separate lines for multi-line 
comments (see checkpatch.pl).



+bool ready;
+} ImgChecksumWorker;
+
+struct ImgChecksumState {
+const char *filename;
+BlockBackend *blk;
+BlockDriverState *bs;
+int64_t total_size;
+
+/* Current extent, modified in checksum_co_next. */
+int64_t offset;
+int64_t length;
+bool zero;
+
+int running_coroutines;
+CoMutex lock;
+ImgChecksumWorker workers[CHECKSUM_COROUTINES];
+
+/* Ensure in-order updates. Update are scheduled at the tail of the
+ * queue and processed from the head of the queue when a worker is
+ * ready. */


Qemu codestyle requires /* and */ to be on separate lines for multi-line 
comments.



+QTAILQ_HEAD(, ImgChecksumWorker) update_queue;
+
+struct blkhash *hash;
+int ret;
+};


[...]


+static coroutine_fn bool checksum_co_next(ImgChecksumWorker *w)
+{
+ImgChecksumState *s = w->state;
+
+qemu_co_mutex_lock(&s->lock);
+
+if (s->offset == s->total_size || s->ret != -EINPROGRESS) {
+qemu_co_mutex_unlock(&s->lock);
+return false;
+}
+
+if (s->length == 0 && checksum_block_status(s)) {


I’d prefer `checksum_block_status(s) < 0` so that it is clear that 
negative values indicate errors.  Otherwise, based on this code alone, 
it looks to me more like `checksum_block_status()` returned a boolean, 
where `false` would generally indicate an error, which is confusing.


Same in other places below.


+qemu_co_mutex_unlock(&s->lock);
+return false;
+}


[...]


+/* Enter the next worker coroutine if the worker is ready. */
+static void coroutine_fn checksum_co_enter_next(ImgChecksumWorker *w)
+{
+ImgChecksumState *s = w->state;
+ImgChecksumWorker *next;
+
+if (!QTAILQ_EMPTY(&s->update_queue)) {
+next = QTAILQ_FIRST(&s->update_queue);
+if (next->ready)
+qemu_coroutine_enter(next->co);


Qemu codestyle requires braces here.

Hanna




Re: [PATCH 3/3] qemu-img: Speed up checksum

2022-10-30 Thread Nir Soffer
On Wed, Oct 26, 2022 at 4:54 PM Hanna Reitz  wrote:

> On 01.09.22 16:32, Nir Soffer wrote:
> > Add coroutine based loop inspired by `qemu-img convert` design.
> >
> > Changes compared to `qemu-img convert`:
> >
> > - State for the entire image is kept in ImgChecksumState
> >
> > - State for single worker coroutine is kept in ImgChecksumworker.
> >
> > - "Writes" are always in-order, ensured using a queue.
> >
> > - Calling block status once per image extent, when the current extent is
> >consumed by the workers.
> >
> > - Using 1m buffer size - testings shows that this gives best read
> >performance both with buffered and direct I/O.
>
> Why does patch 1 then choose to use 2 MB?
>

The first patch uses sync I/O, and in this case 2 MB is a little faster.


> > - Number of coroutines is not configurable. Testing does not show
> >improvement when using more than 8 coroutines.
> >
> > - Progress include entire image, not only the allocated state.
> >
> > Comparing to the simple read loop shows that this version is up to 4.67
> > times faster when computing a checksum for an image full of zeroes. For
> > real images it is 1.59 times faster with direct I/O, and with buffered
> > I/O there is no difference.
> >
> > Test results on Dell PowerEdge R640 in a CentOS Stream 9 container:
> >
> > | image| size | i/o   | before | after  | change
> |
> >
> |--|--|---||||
> > | zero [1] |   6g | buffered  | 1.600s ±0.014s | 0.342s ±0.016s |  x4.67
> |
> > | zero |   6g | direct| 4.684s ±0.093s | 2.211s ±0.009s |  x2.12
> |
> > | real [2] |   6g | buffered  | 1.841s ±0.075s | 1.806s ±0.036s |  x1.02
> |
> > | real |   6g | direct| 3.094s ±0.079s | 1.947s ±0.017s |  x1.59
> |
> > | nbd  [3] |   6g | buffered  | 2.455s ±0.183s | 1.808s ±0.016s |  x1.36
> |
> > | nbd  |   6g | direct| 3.540s ±0.020s | 1.749s ±0.018s |  x2.02
> |
> >
> > [1] raw image full of zeroes
> > [2] raw fedora 35 image with additional random data, 50% full
> > [3] image [2] exported by qemu-nbd via unix socket
> >
> > Signed-off-by: Nir Soffer 
> > ---
> >   qemu-img.c | 343 +
> >   1 file changed, 270 insertions(+), 73 deletions(-)
>
> Looks good!
>
> Just a couple of style comments below.
>
> > diff --git a/qemu-img.c b/qemu-img.c
> > index 7edcfe4bc8..bfa8e2862f 100644
> > --- a/qemu-img.c
> > +++ b/qemu-img.c
> > @@ -1613,48 +1613,288 @@ out:
> >   qemu_vfree(buf2);
> >   blk_unref(blk2);
> >   out2:
> >   blk_unref(blk1);
> >   out3:
> >   qemu_progress_end();
> >   return ret;
> >   }
> >
> >   #ifdef CONFIG_BLKHASH
> > +
> > +#define CHECKSUM_COROUTINES 8
> > +#define CHECKSUM_BUF_SIZE (1 * MiB)
> > +#define CHECKSUM_ZERO_SIZE MIN(16 * GiB, SIZE_MAX)
> > +
> > +typedef struct ImgChecksumState ImgChecksumState;
> > +
> > +typedef struct ImgChecksumWorker {
> > +QTAILQ_ENTRY(ImgChecksumWorker) entry;
> > +ImgChecksumState *state;
> > +Coroutine *co;
> > +uint8_t *buf;
> > +
> > +/* The current chunk. */
> > +int64_t offset;
> > +int64_t length;
> > +bool zero;
> > +
> > +/* Always true for zero extent, false for data extent. Set to true
> > + * when reading the chunk completes. */
>
> Qemu codestyle requires /* and */ to be on separate lines for multi-line
> comments (see checkpatch.pl).
>

I'll change that. Do we have a good way to run checkpatch.pl when using
git-publish?

Maybe a way to run checkpatch.pl on all patches generated by git publish
automatically?


> > +bool ready;
> > +} ImgChecksumWorker;
> > +
> > +struct ImgChecksumState {
> > +const char *filename;
> > +BlockBackend *blk;
> > +BlockDriverState *bs;
> > +int64_t total_size;
> > +
> > +/* Current extent, modified in checksum_co_next. */
> > +int64_t offset;
> > +int64_t length;
> > +bool zero;
> > +
> > +int running_coroutines;
> > +CoMutex lock;
> > +ImgChecksumWorker workers[CHECKSUM_COROUTINES];
> > +
> > +/* Ensure in-order updates. Update are scheduled at the tail of the
> > + * queue and processed from the head of the queue when a worker is
> > + * ready. */
>
> Qemu codestyle requires /* and */ to be on separate lines for multi-line
> comments.
>
> > +QTAILQ_HEAD(, ImgChecksumWorker) update_queue;
> > +
> > +struct blkhash *hash;
> > +int ret;
> > +};
>
> [...]
>
> > +static coroutine_fn bool checksum_co_next(ImgChecksumWorker *w)
> > +{
> > +ImgChecksumState *s = w->state;
> > +
> > +qemu_co_mutex_lock(&s->lock);
> > +
> > +if (s->offset == s->total_size || s->ret != -EINPROGRESS) {
> > +qemu_co_mutex_unlock(&s->lock);
> > +return false;
> > +}
> > +
> > +if (s->length == 0 && checksum_block_status(s)) {
>
> I’d prefer `checksum_block_status(s) < 0` so that it is clear that
> negative values indicate errors.  Otherwise, based on this code

Re: [PATCH 3/3] qemu-img: Speed up checksum

2022-10-30 Thread Nir Soffer
On Sun, Oct 30, 2022 at 7:38 PM Nir Soffer  wrote:

> On Wed, Oct 26, 2022 at 4:54 PM Hanna Reitz  wrote:
>
>> On 01.09.22 16:32, Nir Soffer wrote:
>>
> [...]

> > +/* The current chunk. */
>> > +int64_t offset;
>> > +int64_t length;
>> > +bool zero;
>> > +
>> > +/* Always true for zero extent, false for data extent. Set to true
>> > + * when reading the chunk completes. */
>>
>> Qemu codestyle requires /* and */ to be on separate lines for multi-line
>> comments (see checkpatch.pl).
>>
>
> I'll change that. Do we have a good way to run checkpatch.pl when using
> git-publish?
>
> Maybe a way to run checkpatch.pl on all patches generated by git publish
> automatically?
>

I found
https://blog.vmsplice.net/2011/03/how-to-automatically-run-checkpatchpl.html
and it seems to work well.


Re: [PATCH 3/3] qemu-img: Speed up checksum

2022-11-07 Thread Hanna Reitz

On 30.10.22 18:38, Nir Soffer wrote:

On Wed, Oct 26, 2022 at 4:54 PM Hanna Reitz  wrote:

On 01.09.22 16:32, Nir Soffer wrote:
> Add coroutine based loop inspired by `qemu-img convert` design.
>
> Changes compared to `qemu-img convert`:
>
> - State for the entire image is kept in ImgChecksumState
>
> - State for single worker coroutine is kept in ImgChecksumworker.
>
> - "Writes" are always in-order, ensured using a queue.
>
> - Calling block status once per image extent, when the current
extent is
>    consumed by the workers.
>
> - Using 1m buffer size - testings shows that this gives best read
>    performance both with buffered and direct I/O.

Why does patch 1 then choose to use 2 MB?


The first patch uses sync I/O, and in this case 2 MB is a little faster.


Interesting.


> - Number of coroutines is not configurable. Testing does not show
>    improvement when using more than 8 coroutines.
>
> - Progress include entire image, not only the allocated state.
>
> Comparing to the simple read loop shows that this version is up
to 4.67
> times faster when computing a checksum for an image full of
zeroes. For
> real images it is 1.59 times faster with direct I/O, and with
buffered
> I/O there is no difference.
>
> Test results on Dell PowerEdge R640 in a CentOS Stream 9 container:
>
> | image    | size | i/o       | before         | after         |
change |
>
|--|--|---||||
> | zero [1] |   6g | buffered  | 1.600s ±0.014s | 0.342s ±0.016s
|  x4.67 |
> | zero     |   6g | direct    | 4.684s ±0.093s | 2.211s ±0.009s
|  x2.12 |
> | real [2] |   6g | buffered  | 1.841s ±0.075s | 1.806s ±0.036s
|  x1.02 |
> | real     |   6g | direct    | 3.094s ±0.079s | 1.947s ±0.017s
|  x1.59 |
> | nbd  [3] |   6g | buffered  | 2.455s ±0.183s | 1.808s ±0.016s
|  x1.36 |
> | nbd      |   6g | direct    | 3.540s ±0.020s | 1.749s ±0.018s
|  x2.02 |
>
> [1] raw image full of zeroes
> [2] raw fedora 35 image with additional random data, 50% full
> [3] image [2] exported by qemu-nbd via unix socket
>
> Signed-off-by: Nir Soffer 
> ---
>   qemu-img.c | 343
+
>   1 file changed, 270 insertions(+), 73 deletions(-)

Looks good!

Just a couple of style comments below.

> diff --git a/qemu-img.c b/qemu-img.c
> index 7edcfe4bc8..bfa8e2862f 100644
> --- a/qemu-img.c
> +++ b/qemu-img.c
> @@ -1613,48 +1613,288 @@ out:
>       qemu_vfree(buf2);
>       blk_unref(blk2);
>   out2:
>       blk_unref(blk1);
>   out3:
>       qemu_progress_end();
>       return ret;
>   }
>
>   #ifdef CONFIG_BLKHASH
> +
> +#define CHECKSUM_COROUTINES 8
> +#define CHECKSUM_BUF_SIZE (1 * MiB)
> +#define CHECKSUM_ZERO_SIZE MIN(16 * GiB, SIZE_MAX)
> +
> +typedef struct ImgChecksumState ImgChecksumState;
> +
> +typedef struct ImgChecksumWorker {
> +    QTAILQ_ENTRY(ImgChecksumWorker) entry;
> +    ImgChecksumState *state;
> +    Coroutine *co;
> +    uint8_t *buf;
> +
> +    /* The current chunk. */
> +    int64_t offset;
> +    int64_t length;
> +    bool zero;
> +
> +    /* Always true for zero extent, false for data extent. Set
to true
> +     * when reading the chunk completes. */

Qemu codestyle requires /* and */ to be on separate lines for
multi-line
comments (see checkpatch.pl ).


I'll change that. Do we have a good way to run checkpatch.pl 
 when using

git-publish?

Maybe a way to run checkpatch.pl  on all patches 
generated by git publish

automatically?


I don’t know myself, I don’t use git-publish.  Stefan should know. ;)



> +    bool ready;
> +} ImgChecksumWorker;
> +
> +struct ImgChecksumState {
> +    const char *filename;
> +    BlockBackend *blk;
> +    BlockDriverState *bs;
> +    int64_t total_size;
> +
> +    /* Current extent, modified in checksum_co_next. */
> +    int64_t offset;
> +    int64_t length;
> +    bool zero;
> +
> +    int running_coroutines;
> +    CoMutex lock;
> +    ImgChecksumWorker workers[CHECKSUM_COROUTINES];
> +
> +    /* Ensure in-order updates. Update are scheduled at the
tail of the
> +     * queue and processed from the head of the queue when a
worker is
> +     * ready. */

Qemu codestyle requires /* and */ to be on separate lines for
multi-line
comments.

> +    QTAILQ_HEAD(, ImgChecksumWorker) update_queue;
> +
> +    struct blkhash *hash;
> +    int ret;
> +};

[...]

> +static coroutine_fn bool checksum_co_next