Re: [PATCH v3 5/6] block/linux-aio: convert to blk_io_plug_call() API

2023-05-31 Thread Stefano Garzarella

On Tue, May 30, 2023 at 02:09:58PM -0400, Stefan Hajnoczi wrote:

Stop using the .bdrv_co_io_plug() API because it is not multi-queue
block layer friendly. Use the new blk_io_plug_call() API to batch I/O
submission instead.

Note that a dev_max_batch check is dropped in laio_io_unplug() because
the semantics of unplug_fn() are different from .bdrv_co_unplug():
1. unplug_fn() is only called when the last blk_io_unplug() call occurs,
  not every time blk_io_unplug() is called.
2. unplug_fn() is per-thread, not per-BlockDriverState, so there is no
  way to get per-BlockDriverState fields like dev_max_batch.

Therefore this condition cannot be moved to laio_unplug_fn(). It is not
obvious that this condition affects performance in practice, so I am
removing it instead of trying to come up with a more complex mechanism
to preserve the condition.

Signed-off-by: Stefan Hajnoczi 
Reviewed-by: Eric Blake 
---
include/block/raw-aio.h |  7 ---
block/file-posix.c  | 28 
block/linux-aio.c   | 41 +++--
3 files changed, 11 insertions(+), 65 deletions(-)
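
For readers skimming the thread: after this conversion the linux-aio submission
path has roughly the following shape (a sketch pieced together from the hunks
quoted below, not the exact committed code; ioq_submit() and the io_q fields
are pre-existing linux-aio internals):

/* Sketch: runs once, when the last blk_io_unplug() in this thread occurs */
static void laio_unplug_fn(void *opaque)
{
    LinuxAioState *s = opaque;

    if (!s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
        ioq_submit(s);
    }
}

At enqueue time the driver simply defers with blk_io_plug_call(laio_unplug_fn, s)
instead of checking a per-BlockDriverState plugged counter, which is also why
the per-BDS dev_max_batch value is no longer available at unplug time.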


LGTM!

Reviewed-by: Stefano Garzarella 




Re: [PATCH v2 5/6] block/linux-aio: convert to blk_io_plug_call() API

2023-05-29 Thread Stefano Garzarella

On Wed, May 24, 2023 at 03:36:34PM -0400, Stefan Hajnoczi wrote:

On Wed, May 24, 2023 at 10:52:03AM +0200, Stefano Garzarella wrote:

On Tue, May 23, 2023 at 01:12:59PM -0400, Stefan Hajnoczi wrote:
> Stop using the .bdrv_co_io_plug() API because it is not multi-queue
> block layer friendly. Use the new blk_io_plug_call() API to batch I/O
> submission instead.
>
> Signed-off-by: Stefan Hajnoczi 
> Reviewed-by: Eric Blake 
> ---
> include/block/raw-aio.h |  7 ---
> block/file-posix.c  | 28 
> block/linux-aio.c   | 41 +++--
> 3 files changed, 11 insertions(+), 65 deletions(-)
>
> diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
> index da60ca13ef..0f63c2800c 100644
> --- a/include/block/raw-aio.h
> +++ b/include/block/raw-aio.h
> @@ -62,13 +62,6 @@ int coroutine_fn laio_co_submit(int fd, uint64_t offset, QEMUIOVector *qiov,
>
> void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context);
> void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context);
> -
> -/*
> - * laio_io_plug/unplug work in the thread's current AioContext, therefore the
> - * caller must ensure that they are paired in the same IOThread.
> - */
> -void laio_io_plug(void);
> -void laio_io_unplug(uint64_t dev_max_batch);
> #endif
> /* io_uring.c - Linux io_uring implementation */
> #ifdef CONFIG_LINUX_IO_URING
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 7baa8491dd..ac1ed54811 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -2550,26 +2550,6 @@ static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, int64_t offset,
> return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE);
> }
>
> -static void coroutine_fn raw_co_io_plug(BlockDriverState *bs)
> -{
> -BDRVRawState __attribute__((unused)) *s = bs->opaque;
> -#ifdef CONFIG_LINUX_AIO
> -if (s->use_linux_aio) {
> -laio_io_plug();
> -}
> -#endif
> -}
> -
> -static void coroutine_fn raw_co_io_unplug(BlockDriverState *bs)
> -{
> -BDRVRawState __attribute__((unused)) *s = bs->opaque;
> -#ifdef CONFIG_LINUX_AIO
> -if (s->use_linux_aio) {
> -laio_io_unplug(s->aio_max_batch);
> -}
> -#endif
> -}
> -
> static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs)
> {
> BDRVRawState *s = bs->opaque;
> @@ -3914,8 +3894,6 @@ BlockDriver bdrv_file = {
> .bdrv_co_copy_range_from = raw_co_copy_range_from,
> .bdrv_co_copy_range_to  = raw_co_copy_range_to,
> .bdrv_refresh_limits = raw_refresh_limits,
> -.bdrv_co_io_plug= raw_co_io_plug,
> -.bdrv_co_io_unplug  = raw_co_io_unplug,
> .bdrv_attach_aio_context = raw_aio_attach_aio_context,
>
> .bdrv_co_truncate   = raw_co_truncate,
> @@ -4286,8 +4264,6 @@ static BlockDriver bdrv_host_device = {
> .bdrv_co_copy_range_from = raw_co_copy_range_from,
> .bdrv_co_copy_range_to  = raw_co_copy_range_to,
> .bdrv_refresh_limits = raw_refresh_limits,
> -.bdrv_co_io_plug= raw_co_io_plug,
> -.bdrv_co_io_unplug  = raw_co_io_unplug,
> .bdrv_attach_aio_context = raw_aio_attach_aio_context,
>
> .bdrv_co_truncate   = raw_co_truncate,
> @@ -4424,8 +4400,6 @@ static BlockDriver bdrv_host_cdrom = {
> .bdrv_co_pwritev= raw_co_pwritev,
> .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
> .bdrv_refresh_limits= cdrom_refresh_limits,
> -.bdrv_co_io_plug= raw_co_io_plug,
> -.bdrv_co_io_unplug  = raw_co_io_unplug,
> .bdrv_attach_aio_context = raw_aio_attach_aio_context,
>
> .bdrv_co_truncate   = raw_co_truncate,
> @@ -4552,8 +4526,6 @@ static BlockDriver bdrv_host_cdrom = {
> .bdrv_co_pwritev= raw_co_pwritev,
> .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
> .bdrv_refresh_limits= cdrom_refresh_limits,
> -.bdrv_co_io_plug= raw_co_io_plug,
> -.bdrv_co_io_unplug  = raw_co_io_unplug,
> .bdrv_attach_aio_context = raw_aio_attach_aio_context,
>
> .bdrv_co_truncate   = raw_co_truncate,
> diff --git a/block/linux-aio.c b/block/linux-aio.c
> index 442c86209b..5021aed68f 100644
> --- a/block/linux-aio.c
> +++ b/block/linux-aio.c
> @@ -15,6 +15,7 @@
> #include "qemu/event_notifier.h"
> #include "qemu/coroutine.h"
> #include "qapi/error.h"
> +#include "sysemu/block-backend.h"
>
> /* Only used for assertions.  */
> #include "qemu/coroutine_int.h"
> @@ -46,7 +47,6 @@ struct qemu_laiocb {
> };
>
> typedef struct {
> -int plugged;
> u

Re: [PATCH v2 5/6] block/linux-aio: convert to blk_io_plug_call() API

2023-05-24 Thread Stefano Garzarella

On Tue, May 23, 2023 at 01:12:59PM -0400, Stefan Hajnoczi wrote:

Stop using the .bdrv_co_io_plug() API because it is not multi-queue
block layer friendly. Use the new blk_io_plug_call() API to batch I/O
submission instead.

Signed-off-by: Stefan Hajnoczi 
Reviewed-by: Eric Blake 
---
include/block/raw-aio.h |  7 ---
block/file-posix.c  | 28 
block/linux-aio.c   | 41 +++--
3 files changed, 11 insertions(+), 65 deletions(-)

diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
index da60ca13ef..0f63c2800c 100644
--- a/include/block/raw-aio.h
+++ b/include/block/raw-aio.h
@@ -62,13 +62,6 @@ int coroutine_fn laio_co_submit(int fd, uint64_t offset, QEMUIOVector *qiov,

void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context);
void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context);
-
-/*
- * laio_io_plug/unplug work in the thread's current AioContext, therefore the
- * caller must ensure that they are paired in the same IOThread.
- */
-void laio_io_plug(void);
-void laio_io_unplug(uint64_t dev_max_batch);
#endif
/* io_uring.c - Linux io_uring implementation */
#ifdef CONFIG_LINUX_IO_URING
diff --git a/block/file-posix.c b/block/file-posix.c
index 7baa8491dd..ac1ed54811 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -2550,26 +2550,6 @@ static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, int64_t offset,
return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE);
}

-static void coroutine_fn raw_co_io_plug(BlockDriverState *bs)
-{
-BDRVRawState __attribute__((unused)) *s = bs->opaque;
-#ifdef CONFIG_LINUX_AIO
-if (s->use_linux_aio) {
-laio_io_plug();
-}
-#endif
-}
-
-static void coroutine_fn raw_co_io_unplug(BlockDriverState *bs)
-{
-BDRVRawState __attribute__((unused)) *s = bs->opaque;
-#ifdef CONFIG_LINUX_AIO
-if (s->use_linux_aio) {
-laio_io_unplug(s->aio_max_batch);
-}
-#endif
-}
-
static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs)
{
BDRVRawState *s = bs->opaque;
@@ -3914,8 +3894,6 @@ BlockDriver bdrv_file = {
.bdrv_co_copy_range_from = raw_co_copy_range_from,
.bdrv_co_copy_range_to  = raw_co_copy_range_to,
.bdrv_refresh_limits = raw_refresh_limits,
-.bdrv_co_io_plug= raw_co_io_plug,
-.bdrv_co_io_unplug  = raw_co_io_unplug,
.bdrv_attach_aio_context = raw_aio_attach_aio_context,

.bdrv_co_truncate   = raw_co_truncate,
@@ -4286,8 +4264,6 @@ static BlockDriver bdrv_host_device = {
.bdrv_co_copy_range_from = raw_co_copy_range_from,
.bdrv_co_copy_range_to  = raw_co_copy_range_to,
.bdrv_refresh_limits = raw_refresh_limits,
-.bdrv_co_io_plug= raw_co_io_plug,
-.bdrv_co_io_unplug  = raw_co_io_unplug,
.bdrv_attach_aio_context = raw_aio_attach_aio_context,

.bdrv_co_truncate   = raw_co_truncate,
@@ -4424,8 +4400,6 @@ static BlockDriver bdrv_host_cdrom = {
.bdrv_co_pwritev= raw_co_pwritev,
.bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
.bdrv_refresh_limits= cdrom_refresh_limits,
-.bdrv_co_io_plug= raw_co_io_plug,
-.bdrv_co_io_unplug  = raw_co_io_unplug,
.bdrv_attach_aio_context = raw_aio_attach_aio_context,

.bdrv_co_truncate   = raw_co_truncate,
@@ -4552,8 +4526,6 @@ static BlockDriver bdrv_host_cdrom = {
.bdrv_co_pwritev= raw_co_pwritev,
.bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
.bdrv_refresh_limits= cdrom_refresh_limits,
-.bdrv_co_io_plug= raw_co_io_plug,
-.bdrv_co_io_unplug  = raw_co_io_unplug,
.bdrv_attach_aio_context = raw_aio_attach_aio_context,

.bdrv_co_truncate   = raw_co_truncate,
diff --git a/block/linux-aio.c b/block/linux-aio.c
index 442c86209b..5021aed68f 100644
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -15,6 +15,7 @@
#include "qemu/event_notifier.h"
#include "qemu/coroutine.h"
#include "qapi/error.h"
+#include "sysemu/block-backend.h"

/* Only used for assertions.  */
#include "qemu/coroutine_int.h"
@@ -46,7 +47,6 @@ struct qemu_laiocb {
};

typedef struct {
-int plugged;
unsigned int in_queue;
unsigned int in_flight;
bool blocked;
@@ -236,7 +236,7 @@ static void qemu_laio_process_completions_and_submit(LinuxAioState *s)
{
qemu_laio_process_completions(s);

-if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
+if (!QSIMPLEQ_EMPTY(&s->io_q.pending)) {
ioq_submit(s);
}
}
@@ -277,7 +277,6 @@ static void qemu_laio_poll_ready(EventNotifier *opaque)
static void ioq_init(LaioQueue *io_q)
{
QSIMPLEQ_INIT(&io_q->pending);
-io_q->plugged = 0;
io_q->in_queue = 0;
io_q->in_flight = 0;
io_q->blocked = false;
@@ -354,31 +353,11 @@ static uint64_t laio_max_batch(LinuxAioState *s, uint64_t dev_max_batch)
return max_batch;
}

-void laio_io_plug(void)
+static void laio_unplug_fn(void *opaque)
{
- 

Re: [PATCH v2 6/6] block: remove bdrv_co_io_plug() API

2023-05-24 Thread Stefano Garzarella

On Tue, May 23, 2023 at 01:13:00PM -0400, Stefan Hajnoczi wrote:

No block driver implements .bdrv_co_io_plug() anymore. Get rid of the
function pointers.

Signed-off-by: Stefan Hajnoczi 
Reviewed-by: Eric Blake 
---
include/block/block-io.h |  3 ---
include/block/block_int-common.h | 11 --
block/io.c   | 37 
3 files changed, 51 deletions(-)


Reviewed-by: Stefano Garzarella 




Re: [PATCH v2 4/6] block/io_uring: convert to blk_io_plug_call() API

2023-05-24 Thread Stefano Garzarella

On Tue, May 23, 2023 at 01:12:58PM -0400, Stefan Hajnoczi wrote:

Stop using the .bdrv_co_io_plug() API because it is not multi-queue
block layer friendly. Use the new blk_io_plug_call() API to batch I/O
submission instead.

Signed-off-by: Stefan Hajnoczi 
Reviewed-by: Eric Blake 
---
v2
- Removed whitespace hunk [Eric]
---
include/block/raw-aio.h |  7 ---
block/file-posix.c  | 10 --
block/io_uring.c| 44 -
block/trace-events  |  5 ++---
4 files changed, 19 insertions(+), 47 deletions(-)


Reviewed-by: Stefano Garzarella 




Re: [PATCH v2 3/6] block/blkio: convert to blk_io_plug_call() API

2023-05-24 Thread Stefano Garzarella

On Tue, May 23, 2023 at 01:12:57PM -0400, Stefan Hajnoczi wrote:

Stop using the .bdrv_co_io_plug() API because it is not multi-queue
block layer friendly. Use the new blk_io_plug_call() API to batch I/O
submission instead.

Signed-off-by: Stefan Hajnoczi 
Reviewed-by: Eric Blake 
---
v2
- Add missing #include and fix blkio_unplug_fn() prototype [Stefano]
---
block/blkio.c | 43 ---
1 file changed, 24 insertions(+), 19 deletions(-)


Reviewed-by: Stefano Garzarella 




Re: [PATCH v2 2/6] block/nvme: convert to blk_io_plug_call() API

2023-05-24 Thread Stefano Garzarella

On Tue, May 23, 2023 at 01:12:56PM -0400, Stefan Hajnoczi wrote:

Stop using the .bdrv_co_io_plug() API because it is not multi-queue
block layer friendly. Use the new blk_io_plug_call() API to batch I/O
submission instead.

Signed-off-by: Stefan Hajnoczi 
Reviewed-by: Eric Blake 
---
v2
- Remove unused nvme_process_completion_queue_plugged trace event
 [Stefano]
---
block/nvme.c   | 44 
block/trace-events |  1 -
2 files changed, 12 insertions(+), 33 deletions(-)


Reviewed-by: Stefano Garzarella 




Re: [PATCH v2 1/6] block: add blk_io_plug_call() API

2023-05-24 Thread Stefano Garzarella

On Tue, May 23, 2023 at 01:12:55PM -0400, Stefan Hajnoczi wrote:

Introduce a new API for thread-local blk_io_plug() that does not
traverse the block graph. The goal is to make blk_io_plug() multi-queue
friendly.

Instead of having block drivers track whether or not we're in a plugged
section, provide an API that allows them to defer a function call until
we're unplugged: blk_io_plug_call(fn, opaque). If blk_io_plug_call() is
called multiple times with the same fn/opaque pair, then fn() is only
called once at the end of the function - resulting in batching.

This patch introduces the API and changes blk_io_plug()/blk_io_unplug().
blk_io_plug()/blk_io_unplug() no longer require a BlockBackend argument
because the plug state is now thread-local.

Later patches convert block drivers to blk_io_plug_call() and then we
can finally remove .bdrv_co_io_plug() once all block drivers have been
converted.

Signed-off-by: Stefan Hajnoczi 
Reviewed-by: Eric Blake 
---
v2
- "is not be freed" -> "is not freed" [Eric]
---
MAINTAINERS   |   1 +
include/sysemu/block-backend-io.h |  13 +--
block/block-backend.c |  22 -
block/plug.c  | 159 ++
hw/block/dataplane/xen-block.c|   8 +-
hw/block/virtio-blk.c |   4 +-
hw/scsi/virtio-scsi.c |   6 +-
block/meson.build |   1 +
8 files changed, 173 insertions(+), 41 deletions(-)
create mode 100644 block/plug.c
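
To make the dedupe-and-flush semantics concrete, here is a minimal,
self-contained model of the API (an illustration only, with invented names;
the real block/plug.c additionally uses coroutine-TLS state and a thread-exit
notifier):

/* toy_plug.c: a standalone model of the blk_io_plug_call() semantics */
#include <assert.h>
#include <stddef.h>
#include <stdio.h>

typedef struct {
    void (*fn)(void *);
    void *opaque;
} DeferredCall;

static __thread unsigned plug_count;        /* nesting depth of plug() */
static __thread DeferredCall calls[16];     /* deferred fn/opaque pairs */
static __thread size_t num_calls;

static void plug(void)
{
    plug_count++;
}

static void plug_call(void (*fn)(void *), void *opaque)
{
    if (plug_count == 0) {
        fn(opaque);                         /* not plugged: call immediately */
        return;
    }
    for (size_t i = 0; i < num_calls; i++) {
        if (calls[i].fn == fn && calls[i].opaque == opaque) {
            return;                         /* same pair already deferred */
        }
    }
    assert(num_calls < 16);
    calls[num_calls++] = (DeferredCall){ .fn = fn, .opaque = opaque };
}

static void unplug(void)
{
    assert(plug_count > 0);
    if (--plug_count > 0) {
        return;                             /* only the outermost unplug flushes */
    }
    for (size_t i = 0; i < num_calls; i++) {
        calls[i].fn(calls[i].opaque);
    }
    num_calls = 0;
}

static void submit(void *opaque)
{
    printf("submitting %s\n", (const char *)opaque);
}

int main(void)
{
    plug();
    plug_call(submit, (void *)"q0");
    plug_call(submit, (void *)"q0");        /* deduplicated */
    plug_call(submit, (void *)"q0");
    unplug();                               /* prints "submitting q0" once */
    return 0;
}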


Reviewed-by: Stefano Garzarella 




Re: [PATCH 1/6] block: add blk_io_plug_call() API

2023-05-24 Thread Stefano Garzarella

On Tue, May 23, 2023 at 11:47:08AM -0400, Stefan Hajnoczi wrote:

On Fri, May 19, 2023 at 10:45:57AM +0200, Stefano Garzarella wrote:

On Wed, May 17, 2023 at 06:10:17PM -0400, Stefan Hajnoczi wrote:
> Introduce a new API for thread-local blk_io_plug() that does not
> traverse the block graph. The goal is to make blk_io_plug() multi-queue
> friendly.
>
> Instead of having block drivers track whether or not we're in a plugged
> section, provide an API that allows them to defer a function call until
> we're unplugged: blk_io_plug_call(fn, opaque). If blk_io_plug_call() is
> called multiple times with the same fn/opaque pair, then fn() is only
> called once at the end of the function - resulting in batching.
>
> This patch introduces the API and changes blk_io_plug()/blk_io_unplug().
> blk_io_plug()/blk_io_unplug() no longer require a BlockBackend argument
> because the plug state is now thread-local.
>
> Later patches convert block drivers to blk_io_plug_call() and then we
> can finally remove .bdrv_co_io_plug() once all block drivers have been
> converted.
>
> Signed-off-by: Stefan Hajnoczi 
> ---
> MAINTAINERS   |   1 +
> include/sysemu/block-backend-io.h |  13 +--
> block/block-backend.c |  22 -
> block/plug.c  | 159 ++
> hw/block/dataplane/xen-block.c|   8 +-
> hw/block/virtio-blk.c |   4 +-
> hw/scsi/virtio-scsi.c |   6 +-
> block/meson.build |   1 +
> 8 files changed, 173 insertions(+), 41 deletions(-)
> create mode 100644 block/plug.c
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 50585117a0..574202295c 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -2644,6 +2644,7 @@ F: util/aio-*.c
> F: util/aio-*.h
> F: util/fdmon-*.c
> F: block/io.c
> +F: block/plug.c
> F: migration/block*
> F: include/block/aio.h
> F: include/block/aio-wait.h
> diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h
> index d62a7ee773..be4dcef59d 100644
> --- a/include/sysemu/block-backend-io.h
> +++ b/include/sysemu/block-backend-io.h
> @@ -100,16 +100,9 @@ void blk_iostatus_set_err(BlockBackend *blk, int error);
> int blk_get_max_iov(BlockBackend *blk);
> int blk_get_max_hw_iov(BlockBackend *blk);
>
> -/*
> - * blk_io_plug/unplug are thread-local operations. This means that multiple
> - * IOThreads can simultaneously call plug/unplug, but the caller must ensure
> - * that each unplug() is called in the same IOThread of the matching plug().
> - */
> -void coroutine_fn blk_co_io_plug(BlockBackend *blk);
> -void co_wrapper blk_io_plug(BlockBackend *blk);
> -
> -void coroutine_fn blk_co_io_unplug(BlockBackend *blk);
> -void co_wrapper blk_io_unplug(BlockBackend *blk);
> +void blk_io_plug(void);
> +void blk_io_unplug(void);
> +void blk_io_plug_call(void (*fn)(void *), void *opaque);
>
> AioContext *blk_get_aio_context(BlockBackend *blk);
> BlockAcctStats *blk_get_stats(BlockBackend *blk);
> diff --git a/block/block-backend.c b/block/block-backend.c
> index ca537cd0ad..1f1d226ba6 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -2568,28 +2568,6 @@ void blk_add_insert_bs_notifier(BlockBackend *blk, Notifier *notify)
> notifier_list_add(&blk->insert_bs_notifiers, notify);
> }
>
> -void coroutine_fn blk_co_io_plug(BlockBackend *blk)
> -{
> -BlockDriverState *bs = blk_bs(blk);
> -IO_CODE();
> -GRAPH_RDLOCK_GUARD();
> -
> -if (bs) {
> -bdrv_co_io_plug(bs);
> -}
> -}
> -
> -void coroutine_fn blk_co_io_unplug(BlockBackend *blk)
> -{
> -BlockDriverState *bs = blk_bs(blk);
> -IO_CODE();
> -GRAPH_RDLOCK_GUARD();
> -
> -if (bs) {
> -bdrv_co_io_unplug(bs);
> -}
> -}
> -
> BlockAcctStats *blk_get_stats(BlockBackend *blk)
> {
> IO_CODE();
> diff --git a/block/plug.c b/block/plug.c
> new file mode 100644
> index 00..6738a568ba
> --- /dev/null
> +++ b/block/plug.c
> @@ -0,0 +1,159 @@
> +/* SPDX-License-Identifier: GPL-2.0-or-later */
> +/*
> + * Block I/O plugging
> + *
> + * Copyright Red Hat.
> + *
> + * This API defers a function call within a blk_io_plug()/blk_io_unplug()
> + * section, allowing multiple calls to batch up. This is a performance
> + * optimization that is used in the block layer to submit several I/O requests
> + * at once instead of individually:
> + *
> + *   blk_io_plug(); <-- start of plugged region
> + *   ...
> + *   blk_io_plug_call(my_func, my_obj); <-- deferred my_func(my_obj) call
> + *   blk_io_plug_call(my_func, my_obj); <-- another
> + *   blk_io_plug_call(my_func, my_obj); <-- another

Re: [PATCH 3/6] block/blkio: convert to blk_io_plug_call() API

2023-05-19 Thread Stefano Garzarella

On Wed, May 17, 2023 at 06:10:19PM -0400, Stefan Hajnoczi wrote:

Stop using the .bdrv_co_io_plug() API because it is not multi-queue
block layer friendly. Use the new blk_io_plug_call() API to batch I/O
submission instead.

Signed-off-by: Stefan Hajnoczi 
---
block/blkio.c | 40 +---
1 file changed, 21 insertions(+), 19 deletions(-)


With this patch, the build fails in several places, maybe it's an old
version:

../block/blkio.c:347:5: error: implicit declaration of function 
‘blk_io_plug_call’ [-Werror=implicit-function-declaration]

  347 | blk_io_plug_call(blkio_unplug_fn, bs);

../block/blkio.c:348:22: error: passing argument 1 of ‘blk_io_plug_call’ from incompatible pointer type [-Werror=incompatible-pointer-types]

  348 | blk_io_plug_call(blkio_unplug_fn, bs);

Thanks,
Stefano
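
For reference, this report lines up with the v2 changelog quoted earlier in
this digest ("Add missing #include and fix blkio_unplug_fn() prototype
[Stefano]"), so the fix presumably looked something like this sketch (not the
exact v2 hunk):

#include "sysemu/block-backend.h"   /* declares blk_io_plug_call() */

/* Callbacks passed to blk_io_plug_call() take void *opaque */
static void blkio_unplug_fn(void *opaque)
{
    BlockDriverState *bs = opaque;
    BDRVBlkioState *s = bs->opaque;

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_do_io(s->blkioq, NULL, 0, 0, NULL);
    }
}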



diff --git a/block/blkio.c b/block/blkio.c
index 0cdc99a729..f2a1dc1fb2 100644
--- a/block/blkio.c
+++ b/block/blkio.c
@@ -325,16 +325,28 @@ static void blkio_detach_aio_context(BlockDriverState *bs)
   false, NULL, NULL, NULL, NULL, NULL);
}

-/* Call with s->blkio_lock held to submit I/O after enqueuing a new request */
-static void blkio_submit_io(BlockDriverState *bs)
+/*
+ * Called by blk_io_unplug() or immediately if not plugged. Called without
+ * blkio_lock.
+ */
+static void blkio_unplug_fn(BlockDriverState *bs)
{
-if (qatomic_read(&bs->io_plugged) == 0) {
-BDRVBlkioState *s = bs->opaque;
+BDRVBlkioState *s = bs->opaque;

+WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
blkioq_do_io(s->blkioq, NULL, 0, 0, NULL);
}
}

+/*
+ * Schedule I/O submission after enqueuing a new request. Called without
+ * blkio_lock.
+ */
+static void blkio_submit_io(BlockDriverState *bs)
+{
+blk_io_plug_call(blkio_unplug_fn, bs);
+}
+
static int coroutine_fn
blkio_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
{
@@ -345,9 +357,9 @@ blkio_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)

WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
blkioq_discard(s->blkioq, offset, bytes, &cod, 0);
-blkio_submit_io(bs);
}

+blkio_submit_io(bs);
qemu_coroutine_yield();
return cod.ret;
}
@@ -378,9 +390,9 @@ blkio_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,

WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
blkioq_readv(s->blkioq, offset, iov, iovcnt, &cod, 0);
-blkio_submit_io(bs);
}

+blkio_submit_io(bs);
qemu_coroutine_yield();

if (use_bounce_buffer) {
@@ -423,9 +435,9 @@ static int coroutine_fn blkio_co_pwritev(BlockDriverState *bs, int64_t offset,

WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
blkioq_writev(s->blkioq, offset, iov, iovcnt, &cod, blkio_flags);
-blkio_submit_io(bs);
}

+blkio_submit_io(bs);
qemu_coroutine_yield();

if (use_bounce_buffer) {
@@ -444,9 +456,9 @@ static int coroutine_fn blkio_co_flush(BlockDriverState *bs)

WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
blkioq_flush(s->blkioq, &cod, 0);
-blkio_submit_io(bs);
}

+blkio_submit_io(bs);
qemu_coroutine_yield();
return cod.ret;
}
@@ -472,22 +484,13 @@ static int coroutine_fn blkio_co_pwrite_zeroes(BlockDriverState *bs,

WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
blkioq_write_zeroes(s->blkioq, offset, bytes, &cod, blkio_flags);
-blkio_submit_io(bs);
}

+blkio_submit_io(bs);
qemu_coroutine_yield();
return cod.ret;
}

-static void coroutine_fn blkio_co_io_unplug(BlockDriverState *bs)
-{
-BDRVBlkioState *s = bs->opaque;
-
-WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
-blkio_submit_io(bs);
-}
-}
-
typedef enum {
BMRR_OK,
BMRR_SKIP,
@@ -1009,7 +1012,6 @@ static void blkio_refresh_limits(BlockDriverState *bs, Error **errp)
.bdrv_co_pwritev = blkio_co_pwritev, \
.bdrv_co_flush_to_disk   = blkio_co_flush, \
.bdrv_co_pwrite_zeroes   = blkio_co_pwrite_zeroes, \
-.bdrv_co_io_unplug   = blkio_co_io_unplug, \
.bdrv_refresh_limits = blkio_refresh_limits, \
.bdrv_register_buf   = blkio_register_buf, \
.bdrv_unregister_buf = blkio_unregister_buf, \
--
2.40.1






Re: [PATCH 2/6] block/nvme: convert to blk_io_plug_call() API

2023-05-19 Thread Stefano Garzarella

On Wed, May 17, 2023 at 06:10:18PM -0400, Stefan Hajnoczi wrote:

Stop using the .bdrv_co_io_plug() API because it is not multi-queue
block layer friendly. Use the new blk_io_plug_call() API to batch I/O
submission instead.

Signed-off-by: Stefan Hajnoczi 
---
block/nvme.c | 44 
1 file changed, 12 insertions(+), 32 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index 5b744c2bda..100b38b592 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -25,6 +25,7 @@
#include "qemu/vfio-helpers.h"
#include "block/block-io.h"
#include "block/block_int.h"
+#include "sysemu/block-backend.h"
#include "sysemu/replay.h"
#include "trace.h"

@@ -119,7 +120,6 @@ struct BDRVNVMeState {
int blkshift;

uint64_t max_transfer;
-bool plugged;

bool supports_write_zeroes;
bool supports_discard;
@@ -282,7 +282,7 @@ static void nvme_kick(NVMeQueuePair *q)
{
BDRVNVMeState *s = q->s;

-if (s->plugged || !q->need_kick) {
+if (!q->need_kick) {
return;
}
trace_nvme_kick(s, q->index);
@@ -387,10 +387,6 @@ static bool nvme_process_completion(NVMeQueuePair *q)
NvmeCqe *c;

trace_nvme_process_completion(s, q->index, q->inflight);
-if (s->plugged) {
-trace_nvme_process_completion_queue_plugged(s, q->index);


Should we remove "nvme_process_completion_queue_plugged(void *s,
unsigned q_index) "s %p q #%u" from block/trace-events?

The rest LGTM!

Thanks,
Stefano




Re: [PATCH 1/6] block: add blk_io_plug_call() API

2023-05-19 Thread Stefano Garzarella

On Wed, May 17, 2023 at 06:10:17PM -0400, Stefan Hajnoczi wrote:

Introduce a new API for thread-local blk_io_plug() that does not
traverse the block graph. The goal is to make blk_io_plug() multi-queue
friendly.

Instead of having block drivers track whether or not we're in a plugged
section, provide an API that allows them to defer a function call until
we're unplugged: blk_io_plug_call(fn, opaque). If blk_io_plug_call() is
called multiple times with the same fn/opaque pair, then fn() is only
called once at the end of the function - resulting in batching.

This patch introduces the API and changes blk_io_plug()/blk_io_unplug().
blk_io_plug()/blk_io_unplug() no longer require a BlockBackend argument
because the plug state is now thread-local.

Later patches convert block drivers to blk_io_plug_call() and then we
can finally remove .bdrv_co_io_plug() once all block drivers have been
converted.

Signed-off-by: Stefan Hajnoczi 
---
MAINTAINERS   |   1 +
include/sysemu/block-backend-io.h |  13 +--
block/block-backend.c |  22 -
block/plug.c  | 159 ++
hw/block/dataplane/xen-block.c|   8 +-
hw/block/virtio-blk.c |   4 +-
hw/scsi/virtio-scsi.c |   6 +-
block/meson.build |   1 +
8 files changed, 173 insertions(+), 41 deletions(-)
create mode 100644 block/plug.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 50585117a0..574202295c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2644,6 +2644,7 @@ F: util/aio-*.c
F: util/aio-*.h
F: util/fdmon-*.c
F: block/io.c
+F: block/plug.c
F: migration/block*
F: include/block/aio.h
F: include/block/aio-wait.h
diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h
index d62a7ee773..be4dcef59d 100644
--- a/include/sysemu/block-backend-io.h
+++ b/include/sysemu/block-backend-io.h
@@ -100,16 +100,9 @@ void blk_iostatus_set_err(BlockBackend *blk, int error);
int blk_get_max_iov(BlockBackend *blk);
int blk_get_max_hw_iov(BlockBackend *blk);

-/*
- * blk_io_plug/unplug are thread-local operations. This means that multiple
- * IOThreads can simultaneously call plug/unplug, but the caller must ensure
- * that each unplug() is called in the same IOThread of the matching plug().
- */
-void coroutine_fn blk_co_io_plug(BlockBackend *blk);
-void co_wrapper blk_io_plug(BlockBackend *blk);
-
-void coroutine_fn blk_co_io_unplug(BlockBackend *blk);
-void co_wrapper blk_io_unplug(BlockBackend *blk);
+void blk_io_plug(void);
+void blk_io_unplug(void);
+void blk_io_plug_call(void (*fn)(void *), void *opaque);

AioContext *blk_get_aio_context(BlockBackend *blk);
BlockAcctStats *blk_get_stats(BlockBackend *blk);
diff --git a/block/block-backend.c b/block/block-backend.c
index ca537cd0ad..1f1d226ba6 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -2568,28 +2568,6 @@ void blk_add_insert_bs_notifier(BlockBackend *blk, Notifier *notify)
notifier_list_add(&blk->insert_bs_notifiers, notify);
}

-void coroutine_fn blk_co_io_plug(BlockBackend *blk)
-{
-BlockDriverState *bs = blk_bs(blk);
-IO_CODE();
-GRAPH_RDLOCK_GUARD();
-
-if (bs) {
-bdrv_co_io_plug(bs);
-}
-}
-
-void coroutine_fn blk_co_io_unplug(BlockBackend *blk)
-{
-BlockDriverState *bs = blk_bs(blk);
-IO_CODE();
-GRAPH_RDLOCK_GUARD();
-
-if (bs) {
-bdrv_co_io_unplug(bs);
-}
-}
-
BlockAcctStats *blk_get_stats(BlockBackend *blk)
{
IO_CODE();
diff --git a/block/plug.c b/block/plug.c
new file mode 100644
index 00..6738a568ba
--- /dev/null
+++ b/block/plug.c
@@ -0,0 +1,159 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Block I/O plugging
+ *
+ * Copyright Red Hat.
+ *
+ * This API defers a function call within a blk_io_plug()/blk_io_unplug()
+ * section, allowing multiple calls to batch up. This is a performance
+ * optimization that is used in the block layer to submit several I/O requests
+ * at once instead of individually:
+ *
+ *   blk_io_plug(); <-- start of plugged region
+ *   ...
+ *   blk_io_plug_call(my_func, my_obj); <-- deferred my_func(my_obj) call
+ *   blk_io_plug_call(my_func, my_obj); <-- another
+ *   blk_io_plug_call(my_func, my_obj); <-- another
+ *   ...
+ *   blk_io_unplug(); <-- end of plugged region, my_func(my_obj) is called once
+ *
+ * This code is actually generic and not tied to the block layer. If another
+ * subsystem needs this functionality, it could be renamed.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/coroutine-tls.h"
+#include "qemu/notify.h"
+#include "qemu/thread.h"
+#include "sysemu/block-backend.h"
+
+/* A function call that has been deferred until unplug() */
+typedef struct {
+void (*fn)(void *);
+void *opaque;
+} UnplugFn;
+
+/* Per-thread state */
+typedef struct {
+unsigned count;   /* how many times has plug() been called? */
+GArray *unplug_fns;   /* functions to call at unplug time */
+} Plug;
+
+/* Use get_ptr_plug() to fetch this 

Re: [PATCH v3 0/6] aio-posix: split poll check from ready handler

2021-12-09 Thread Stefano Garzarella

On Tue, Dec 07, 2021 at 01:23:30PM +, Stefan Hajnoczi wrote:

v3:
- Fixed FUSE export aio_set_fd_handler() call that I missed and double-checked
 for any other missing call sites using Coccinelle [Rich]
v2:
- Cleaned up unused return values in nvme and virtio-blk [Stefano]
- Documented try_poll_mode() ready_list argument [Stefano]
- Unified virtio-blk/scsi dataplane and non-dataplane virtqueue handlers 
[Stefano]

The first patch improves AioContext's adaptive polling execution time
measurement. This can result in better performance because the algorithm makes
better decisions about when to poll versus when to fall back to file descriptor
monitoring.

The remaining patches unify the virtio-blk and virtio-scsi dataplane and
non-dataplane virtqueue handlers. This became possible because the dataplane
handler function now has the same function signature as the non-dataplane
handler function. Stefano Garzarella prompted me to make this refactoring.
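
Concretely, the unification is possible because both paths can now register a
handler of QEMU's standard virtqueue-handler type rather than a separate
dataplane variant (sketch of the shared signature, for illustration):

/* Shared handler shape used by dataplane and non-dataplane code */
typedef void (*VirtIOHandleOutput)(VirtIODevice *vdev, VirtQueue *vq);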


Great clean up! Thanks for doing this, everything LGTM:

Reviewed-by: Stefano Garzarella 




Re: An error due to installation that require binutils package

2021-03-30 Thread Stefano Garzarella

Hi John,

On Mon, Mar 29, 2021 at 09:46:49PM +0300, John Simpson wrote:

Hello,

Kindly ask you to have a look at this bug.
Thank you for your replies.


It's already fixed in QEMU upstream and the fix will be released with 
the 6.0 version next month (the rc0 is already available):

https://gitlab.com/qemu-project/qemu/-/commit/bbd2d5a8120771ec59b86a80a1f51884e0a26e53

I guess xen-4.14.1 is using an older version, so if you want you can
backport that patch to your version; the change should be simple.


Thanks,
Stefano



On Mon, Mar 29, 2021 at 7:07 PM George Dunlap  wrote:


John,

Thanks for your report.  Can you post your bug report to
xen-devel@lists.xenproject.org?

The bug is in the compilation of QEMU, which is an external project; so
it’s possible that we’ll end up having to raise this with that community as
well.

Thanks,
 -George Dunlap

> On Mar 28, 2021, at 2:26 PM, John Simpson  wrote:
>
> Hello,
>
> Just forwarding this message to you. Can you give some thoughs about
this? Thanks a lot.
>
>
> -- Forwarded message -
> From: Alan Modra 
> Date: Sun, Mar 28, 2021 at 2:21 PM
> Subject: Re: An error due to installation that require binutils package.
> To: John Simpson 
> Cc: 
>
>
> On Sun, Mar 28, 2021 at 12:55:23PM +0300, John Simpson via Binutils wrote:
> >   BUILD   pc-bios/optionrom/kvmvapic.img
> > ld: Error: unable to disambiguate: -no-pie (did you mean --no-pie ?)
>
> -no-pie is a gcc option.  Neither -no-pie nor --no-pie is a valid ld
> option.  The fault lies with whatever passed -no-pie to ld.
>
> --
> Alan Modra
> Australia Development Lab, IBM
>
>
>
> -- Forwarded message -
> From: Andreas Schwab 
> Date: Sun, Mar 28, 2021 at 2:17 PM
> Subject: Re: An error due to installation that require binutils package.
> To: John Simpson via Binutils 
> Cc: John Simpson 
>
>
> Please report that to the xen project.  ld -no-pie doesn't have a useful
> meaning.  It used to mean the same as ld -n -o-pie, which sets "-pie" as
> the output file name.
>
> Andreas.
>
> --
> Andreas Schwab, sch...@linux-m68k.org
> GPG Key fingerprint = 7578 EB47 D4E5 4D69 2510  2552 DF73 E780 A9DA AEC1
> "And now for something completely different."
>
>
>
> -- Forwarded message -
> From: John Simpson 
> Date: Sun, Mar 28, 2021 at 12:55 PM
> Subject: An error due to installation that require binutils package.
> To: 
>
>
> Hello,
>
> Recently I got the following error installing Xen on the 5.11.6-1-MANJARO kernel:
>
>   GEN target/riscv/trace.c
>   GEN target/s390x/trace.c
>   GEN target/sparc/trace.c
>   GEN util/trace.c
>   GEN config-all-devices.mak
> make[1]: Entering directory '/home/username/xen/src/xen-4.14.1/tools/qemu-xen/slirp'
> make[1]: Nothing to be done for 'all'.
> make[1]: Leaving directory '/home/username/xen/src/xen-4.14.1/tools/qemu-xen/slirp'
>   BUILD   pc-bios/optionrom/multiboot.img
>   BUILD   pc-bios/optionrom/linuxboot.img
>   BUILD   pc-bios/optionrom/linuxboot_dma.img
>   BUILD   pc-bios/optionrom/kvmvapic.img
> ld: Error: unable to disambiguate: -no-pie (did you mean --no-pie ?)
> make[1]: *** [Makefile:53: multiboot.img] Error 1
> make[1]: *** Waiting for unfinished jobs
> ld: Error: unable to disambiguate: -no-pie (did you mean --no-pie ?)
> make[1]: *** [Makefile:53: linuxboot_dma.img] Error 1
>   BUILD   pc-bios/optionrom/pvh.img
> ld: Error: unable to disambiguate: -no-pie (did you mean --no-pie ?)
> make[1]: *** [Makefile:53: linuxboot.img] Error 1
> ld: Error: unable to disambiguate: -no-pie (did you mean --no-pie ?)
> make[1]: *** [Makefile:53: kvmvapic.img] Error 1
> ld: Error: unable to disambiguate: -no-pie (did you mean --no-pie ?)
> make[1]: *** [Makefile:50: pvh.img] Error 1
> make: *** [Makefile:581: pc-bios/optionrom/all] Error 2
> make: Leaving directory '/home/username/xen/src/xen-4.14.1/tools/qemu-xen-build'
> make[3]: *** [Makefile:218: subdir-all-qemu-xen-dir] Error 2
> make[3]: Leaving directory '/home/username/xen/src/xen-4.14.1/tools'
> make[2]: *** [/home/username/xen/src/xen-4.14.1/tools/../tools/Rules.mk:235: subdirs-install] Error 2
> make[2]: Leaving directory '/home/username/xen/src/xen-4.14.1/tools'
> make[1]: *** [Makefile:72: install] Error 2
> make[1]: Leaving directory '/home/username/xen/src/xen-4.14.1/tools'
> make: *** [Makefile:134: install-tools] Error 2
> ==> ERROR: A failure occurred in build().
> Aborting...
>
> Currently I have fresh binutils 2.36.1-2 and it seems to me that the
issue is related to this part of the code:
>
> https://github.com/bminor/binutils-gdb/blob/master/ld/lexsup.c#L451
>
> It seems to me that this could impact far more users than just me.
>







Re: [Xen-devel] [RFC v2 0/4] QEMU changes to do PVH boot

2019-01-10 Thread Stefano Garzarella
On Wed, Jan 09, 2019 at 01:18:12PM -0800, Maran Wilson wrote:
> On 1/9/2019 11:53 AM, Boris Ostrovsky wrote:
> > On 1/9/19 6:53 AM, Stefano Garzarella wrote:
> > > Hi Liam,
> > > 
> > > On Tue, Jan 8, 2019 at 3:47 PM Liam Merwick  wrote:
> > > > QEMU sets the hvm_modlist_entry in load_linux() after the call to
> > > > load_elfboot() and then qboot loads it in boot_pvh_from_fw_cfg()
> > > > 
> > > > But the current PVH patches don't handle initrd (they have
> > > > start_info.nr_modules == 1).
> > > Looking in the linux kernel (arch/x86/platform/pvh/enlighten.c) I saw:
> > >  /* The first module is always ramdisk. */
> > >  if (pvh_start_info.nr_modules) {
> > >  struct hvm_modlist_entry *modaddr =
> > >  __va(pvh_start_info.modlist_paddr);
> > >  pvh_bootparams.hdr.ramdisk_image = modaddr->paddr;
> > >  pvh_bootparams.hdr.ramdisk_size = modaddr->size;
> > >  }
> > > 
> > > So, setting start_info.nr_modules = 1 means that the first
> > > hvm_modlist_entry should have the ramdisk paddr and size. Is that
> > > correct?
> 
> That's my understanding.
> 
> I think what's missing, is that we just need Qemu or qboot/seabios to
> properly populate the pvh_start_info.modlist_paddr with the address (as
> usable by the guest) of the hvm_modlist_entry which correctly defines the
> details of the initrd that has already been loaded into memory that is
> accessible by the guest.
> 
> -Maran
> 

I tried it and it works: I modified QEMU to load the initrd and expose it
through fw_cfg, then qboot loads it and correctly sets the hvm_modlist_entry.

You can find the patch of QEMU at the end of this email and the qboot
patch here: 
https://github.com/stefano-garzarella/qboot/commit/41e1fd765c8419e270fd79d9b3af5d53576e88a8

Do you think this could be a good approach? If you want, you can add this patch
to your series.

Thanks,
Stefano


From d5c0d51768f5a8fb214be6c2bb0cb7e86e9917b7 Mon Sep 17 00:00:00 2001
From: Stefano Garzarella 
Date: Thu, 10 Jan 2019 15:16:44 +0100
Subject: [PATCH] pvh: load initrd and expose it through fw_cfg

When initrd is specified, load and expose it to the guest firmware
through fw_cfg. The firmware will fill the hvm_start_info for the
kernel.

Signed-off-by: Stefano Garzarella 
Based-on: <1545422632-2-5-git-send-email-liam.merw...@oracle.com>
---
 hw/i386/pc.c | 38 +-
 1 file changed, 29 insertions(+), 9 deletions(-)

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 06bce6a101..f6721f51be 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -986,25 +986,45 @@ static void load_linux(PCMachineState *pcms,
  */
 if (load_elfboot(kernel_filename, kernel_size,
  header, pvh_start_addr, fw_cfg)) {
-struct hvm_modlist_entry ramdisk_mod = { 0 };
-
 fclose(f);
 
 fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_SIZE,
 strlen(kernel_cmdline) + 1);
 fw_cfg_add_string(fw_cfg, FW_CFG_CMDLINE_DATA, kernel_cmdline);
 
-assert(machine->device_memory != NULL);
-ramdisk_mod.paddr = machine->device_memory->base;
-ramdisk_mod.size =
-memory_region_size(&machine->device_memory->mr);
-
-fw_cfg_add_bytes(fw_cfg, FW_CFG_KERNEL_DATA, &ramdisk_mod,
- sizeof(ramdisk_mod));
 fw_cfg_add_i32(fw_cfg, FW_CFG_SETUP_SIZE, sizeof(header));
 fw_cfg_add_bytes(fw_cfg, FW_CFG_SETUP_DATA,
  header, sizeof(header));
 
+/* load initrd */
+if (initrd_filename) {
+gsize initrd_size;
+gchar *initrd_data;
+GError *gerr = NULL;
+
+if (!g_file_get_contents(initrd_filename, &initrd_data,
+&initrd_size, &gerr)) {
+fprintf(stderr, "qemu: error reading initrd %s: %s\n",
+initrd_filename, gerr->message);
+exit(1);
+}
+
+initrd_max = pcms->below_4g_mem_size - pcmc->acpi_data_size - 1;
+if (initrd_size >= initrd_max) {
+fprintf(stderr, "qemu: initrd is too large, cannot 
support."
+"(max: %"PRIu32", need %"PRId64")\n",
+initrd_max, (uint64_t)initrd_size);
+exit(1);
+}
+
+initrd_addr = (initrd_max - initrd_size) & ~4095;
+
+fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_ADDR, initrd_addr);
+fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_SIZE, initrd_size);
+fw_cfg_add_bytes(fw_cfg, FW_CFG_INITRD_DATA, initrd_data, initrd_size);

Re: [Xen-devel] [RFC v2 0/4] QEMU changes to do PVH boot

2019-01-09 Thread Stefano Garzarella
Hi Liam,

On Tue, Jan 8, 2019 at 3:47 PM Liam Merwick  wrote:
> QEMU sets the hvm_modlist_entry in load_linux() after the call to
> load_elfboot() and then qboot loads it in boot_pvh_from_fw_cfg()
>
> But the current PVH patches don't handle initrd (they have
> start_info.nr_modules == 1).

Looking in the linux kernel (arch/x86/platform/pvh/enlighten.c) I saw:
/* The first module is always ramdisk. */
if (pvh_start_info.nr_modules) {
struct hvm_modlist_entry *modaddr =
__va(pvh_start_info.modlist_paddr);
pvh_bootparams.hdr.ramdisk_image = modaddr->paddr;
pvh_bootparams.hdr.ramdisk_size = modaddr->size;
}

So, setting start_info.nr_modules = 1 means that the first
hvm_modlist_entry should have the ramdisk paddr and size. Is that
correct?


>
> During (or after) the call to load_elfboot() it looks like we'd need to
> do something like what load_multiboot() does below (along with the
> associated initialisation)
>
> 400 fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_ADDR, ADDR_MBI);
> 401 fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_SIZE, sizeof(bootinfo));
> 402 fw_cfg_add_bytes(fw_cfg, FW_CFG_INITRD_DATA, mb_bootinfo_data,
> 403  sizeof(bootinfo));
>

In this case I think they used the FW_CFG_INITRD_* entries to pass bootinfo
variables to the guest; maybe we could add something like what
linux_load() does:

/* load initrd */
if (initrd_filename) {
...
initrd_addr = (initrd_max-initrd_size) & ~4095;

fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_ADDR, initrd_addr);
fw_cfg_add_i32(fw_cfg, FW_CFG_INITRD_SIZE, initrd_size);
fw_cfg_add_bytes(fw_cfg, FW_CFG_INITRD_DATA, initrd_data, initrd_size);
...
}

Then we can load the initrd in qboot or in the optionrom that I'm writing.

What do you think?

Thanks,
Stefano

> I'm checking to see if that has any implications for the kernel side.
>
> Regards,
> Liam
>


-- 
Stefano Garzarella
Red Hat

___
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [RFC v2 0/4] QEMU changes to do PVH boot

2019-01-03 Thread Stefano Garzarella
>  hw/microblaze/boot.c   |   7 +-
>  hw/mips/mips_fulong2e.c|   5 +-
>  hw/mips/mips_malta.c   |   5 +-
>  hw/mips/mips_mipssim.c |   5 +-
>  hw/mips/mips_r4k.c |   5 +-
>  hw/moxie/moxiesim.c|   2 +-
>  hw/nios2/boot.c|   7 +-
>  hw/openrisc/openrisc_sim.c |   2 +-
>  hw/pci-host/prep.c |   2 +-
>  hw/ppc/e500.c  |   3 +-
>  hw/ppc/mac_newworld.c  |   5 +-
>  hw/ppc/mac_oldworld.c  |   5 +-
>  hw/ppc/ppc440_bamboo.c |   2 +-
>  hw/ppc/sam460ex.c  |   3 +-
>  hw/ppc/spapr.c |   7 +-
>  hw/ppc/virtex_ml507.c  |   2 +-
>  hw/riscv/sifive_e.c|   2 +-
>  hw/riscv/sifive_u.c|   2 +-
>  hw/riscv/spike.c   |   2 +-
>  hw/riscv/virt.c|   2 +-
>  hw/s390x/ipl.c |   9 ++-
>  hw/sparc/leon3.c   |   3 +-
>  hw/sparc/sun4m.c   |   6 +-
>  hw/sparc64/sun4u.c |   4 +-
>  hw/tricore/tricore_testboard.c |   2 +-
>  hw/xtensa/sim.c|  12 ++--
>  hw/xtensa/xtfpga.c |   2 +-
>  include/elf.h  |  10 +++
>  include/hw/elf_ops.h   |  72 
>  include/hw/loader.h|   9 ++-
>  include/hw/xen/start_info.h| 146 +
>  44 files changed, 469 insertions(+), 71 deletions(-)
>  create mode 100644 include/hw/xen/start_info.h
>
> --
> 1.8.3.1
>

--
Stefano Garzarella
Red Hat

___
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [RFC 3/3] pvh: Boot uncompressed kernel using direct boot ABI

2018-12-13 Thread Stefano Garzarella
Hi Liam,
in order to support PVH also with SeaBIOS, I'm going to work on a new
option rom (like linuxboot/multiboot) that can be used in this case.

I'll keep you updated on it!

Cheers,
Stefano
On Wed, Dec 5, 2018 at 11:38 PM Liam Merwick  wrote:
>
> These changes (along with corresponding qboot and Linux kernel changes)
> enable a guest to be booted using the x86/HVM direct boot ABI.
>
> This commit adds a load_elfboot() routine to pass the size and
> location of the kernel entry point to qboot (which will fill in
> the start_info struct information needed to boot the guest).
> Having loaded the ELF binary, load_linux() will run qboot
> which continues the boot.
>
> The address for the kernel entry point has already been read
> from an ELF Note in the uncompressed kernel binary earlier
> in pc_memory_init().
>
> Signed-off-by: George Kennedy 
> Signed-off-by: Liam Merwick 
> ---
>  hw/i386/pc.c | 72 
>  1 file changed, 72 insertions(+)
>
> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> index 056aa46d99b9..d3012cbd8597 100644
> --- a/hw/i386/pc.c
> +++ b/hw/i386/pc.c
> @@ -54,6 +54,7 @@
>  #include "sysemu/qtest.h"
>  #include "kvm_i386.h"
>  #include "hw/xen/xen.h"
> +#include "hw/xen/start_info.h"
>  #include "ui/qemu-spice.h"
>  #include "exec/memory.h"
>  #include "exec/address-spaces.h"
> @@ -1098,6 +1099,50 @@ done:
>  return pvh_start_addr != 0;
>  }
>
> +static bool load_elfboot(const char *kernel_filename,
> +   int kernel_file_size,
> +   uint8_t *header,
> +   size_t pvh_xen_start_addr,
> +   FWCfgState *fw_cfg)
> +{
> +uint32_t flags = 0;
> +uint32_t mh_load_addr = 0;
> +uint32_t elf_kernel_size = 0;
> +uint64_t elf_entry;
> +uint64_t elf_low, elf_high;
> +int kernel_size;
> +
> +if (ldl_p(header) != 0x464c457f) {
> +return false; /* no elfboot */
> +}
> +
> +bool elf_is64 = header[EI_CLASS] == ELFCLASS64;
> +flags = elf_is64 ?
> +((Elf64_Ehdr *)header)->e_flags : ((Elf32_Ehdr *)header)->e_flags;
> +
> +if (flags & 0x00010004) { /* LOAD_ELF_HEADER_HAS_ADDR */
> +error_report("elfboot unsupported flags = %x", flags);
> +exit(1);
> +}
> +
> +kernel_size = load_elf(kernel_filename, NULL, NULL, &elf_entry,
> +   &elf_low, &elf_high, 0, I386_ELF_MACHINE,
> +   0, 0);
> +
> +if (kernel_size < 0) {
> +error_report("Error while loading elf kernel");
> +exit(1);
> +}
> +mh_load_addr = elf_low;
> +elf_kernel_size = elf_high - elf_low;
> +
> +fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_ENTRY, pvh_xen_start_addr);
> +fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_ADDR, mh_load_addr);
> +fw_cfg_add_i32(fw_cfg, FW_CFG_KERNEL_SIZE, elf_kernel_size);
> +
> +return true;
> +}
> +
>  static void load_linux(PCMachineState *pcms,
> FWCfgState *fw_cfg)
>  {
> @@ -1138,6 +1183,33 @@ static void load_linux(PCMachineState *pcms,
>  if (ldl_p(header+0x202) == 0x53726448) {
>  protocol = lduw_p(header+0x206);
>  } else {
> +/* If the kernel address for using the x86/HVM direct boot ABI has
> + * been saved then proceed with booting the uncompressed kernel */
> +if (pvh_start_addr) {
> +if (load_elfboot(kernel_filename, kernel_size,
> + header, pvh_start_addr, fw_cfg)) {
> +struct hvm_modlist_entry ramdisk_mod = { 0 };
> +
> +fclose(f);
> +
> +fw_cfg_add_i32(fw_cfg, FW_CFG_CMDLINE_SIZE,
> +strlen(kernel_cmdline) + 1);
> +fw_cfg_add_string(fw_cfg, FW_CFG_CMDLINE_DATA, kernel_cmdline);
> +
> +assert(machine->device_memory != NULL);
> +ramdisk_mod.paddr = machine->device_memory->base;
> +ramdisk_mod.size =
> +memory_region_size(&machine->device_memory->mr);
> +
> +fw_cfg_add_bytes(fw_cfg, FW_CFG_KERNEL_DATA, &ramdisk_mod,
> + sizeof(ramdisk_mod));
> +fw_cfg_add_i32(fw_cfg, FW_CFG_SETUP_SIZE, sizeof(header));
> +fw_cfg_add_bytes(fw_cfg, FW_CFG_SETUP_DATA,
> + header, sizeof(header));
> +
> +return;
> +}
> +}
>  /* This looks like a multiboot kernel. If it is, let's stop
> treating it like a Linux kernel. */
>  if (load_multiboot(fw_cfg, f, kernel_filename, initrd_filename,
> --
> 1.8.3.1
>


-- 
Stefano Garzarella
Red Hat

___
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [RFC 3/3] pvh: Boot uncompressed kernel using direct boot ABI

2018-12-13 Thread Stefano Garzarella
On Tue, Dec 11, 2018 at 7:35 PM Maran Wilson  wrote:
>
> On 12/11/2018 9:11 AM, Stefano Garzarella wrote:
> > Hi Liam,
> > in order to support PVH also with SeaBIOS, I'm going to work on a new
> > option rom (like linuxboot/multiboot) that can be used in this case.
>
> That is awesome. Yes, please keep us posted when you have something working.

Yes, I'll keep you updated!

>
> Just FYI, before switching over to using Qemu+qboot, we had been using a
> Qemu only solution (but not using an option rom) internally that worked
> very well using no FW at all. We had Qemu simply parse the ELF file and
> jump to the PVH entry point if one is found. The only gotcha was that we
> had to include a pair of patches that were originally written by folks
> at Intel as part of the clear containers work. Specifically, in order to
> be able to skip firmware entirely, we had to do 2 additional things: (1)
> ACPI tables generated by Qemu are usually patched up by FW. Since we
> were running no FW, we needed to do that patching up of the ACPI tables
> in Qemu when it was detected that we were going to enter the OS via the
> PVH entry point. (2) We also needed to add a patch to Qemu to enable a
> few PM registers -- something typically done by FW.

I had a look at qemu-lite; are you referring to this?

>
> But if SeaBIOS is involved in the solution you are working on, I guess
> you won't really need those extra patches. Just figured I'd mention it
> so you have the full picture.

Thank you very much for sharing these details with me!

Cheers,
Stefano

>
> Thanks,
> -Maran
>
> > I'll keep you updated on it!
> >
> > Cheers,
> > Stefano
> > On Wed, Dec 5, 2018 at 11:38 PM Liam Merwick  wrote:
> >> These changes (along with corresponding qboot and Linux kernel changes)
> >> enable a guest to be booted using the x86/HVM direct boot ABI.
> >>
> >> This commit adds a load_elfboot() routine to pass the size and
> >> location of the kernel entry point to qboot (which will fill in
> >> the start_info struct information needed to boot the guest).
> >> Having loaded the ELF binary, load_linux() will run qboot
> >> which continues the boot.
> >>
> >> The address for the kernel entry point has already been read
> >> from an ELF Note in the uncompressed kernel binary earlier
> >> in pc_memory_init().
> >>
> >> Signed-off-by: George Kennedy 
> >> Signed-off-by: Liam Merwick 
> >> ---
> >>   hw/i386/pc.c | 72 
> >>   1 file changed, 72 insertions(+)
> >>
> >> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> >> index 056aa46d99b9..d3012cbd8597 100644
> >> --- a/hw/i386/pc.c
> >> +++ b/hw/i386/pc.c
> >> @@ -54,6 +54,7 @@
> >>   #include "sysemu/qtest.h"
> >>   #include "kvm_i386.h"
> >>   #include "hw/xen/xen.h"
> >> +#include "hw/xen/start_info.h"
> >>   #include "ui/qemu-spice.h"
> >>   #include "exec/memory.h"
> >>   #include "exec/address-spaces.h"
> >> @@ -1098,6 +1099,50 @@ done:
> >>   return pvh_start_addr != 0;
> >>   }
> >>
> >> +static bool load_elfboot(const char *kernel_filename,
> >> +   int kernel_file_size,
> >> +   uint8_t *header,
> >> +   size_t pvh_xen_start_addr,
> >> +   FWCfgState *fw_cfg)
> >> +{
> >> +uint32_t flags = 0;
> >> +uint32_t mh_load_addr = 0;
> >> +uint32_t elf_kernel_size = 0;
> >> +uint64_t elf_entry;
> >> +uint64_t elf_low, elf_high;
> >> +int kernel_size;
> >> +
> >> +if (ldl_p(header) != 0x464c457f) {
> >> +return false; /* no elfboot */
> >> +}
> >> +
> >> +bool elf_is64 = header[EI_CLASS] == ELFCLASS64;
> >> +flags = elf_is64 ?
> >> +((Elf64_Ehdr *)header)->e_flags : ((Elf32_Ehdr *)header)->e_flags;
> >> +
> >> +if (flags & 0x00010004) { /* LOAD_ELF_HEADER_HAS_ADDR */
> >> +error_report("elfboot unsupported flags = %x", flags);
> >> +exit(1);
> >> +}
> >> +
> >> +kernel_size = load_elf(kernel_filename, NULL, NULL, &elf_entry,
> >> +   &elf_low, &elf_high, 0, I386_ELF_MACHINE,
> >> +   0, 0);
> >> +
> >> +if (kernel_size < 0) {
>