Re: [PATCH] hw/nvme: Add helper functions for qid-db conversion

2022-08-01 Thread Jinhao Fan
at 4:07 PM, Jinhao Fan  wrote:

> With the introduction of shadow doorbell and ioeventfd, we need to do
> frequent conversion between qid and its doorbell offset. The original
> hard-coded calculation is confusing and error-prone. Add several helper
> functions to do this task.
> 
> Signed-off-by: Jinhao Fan 
> ---
> hw/nvme/ctrl.c | 61 --
> 1 file changed, 39 insertions(+), 22 deletions(-)
> 
> diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
> index 533ad14e7a..6116c0e660 100644
> --- a/hw/nvme/ctrl.c
> +++ b/hw/nvme/ctrl.c
> @@ -487,6 +487,29 @@ static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
> {
> return cqid < n->conf_ioqpairs + 1 && n->cq[cqid] != NULL ? 0 : -1;
> }
> +static inline bool nvme_db_offset_is_cq(NvmeCtrl *n, hwaddr offset)
> +{
> +hwaddr stride = 4 << NVME_CAP_DSTRD(ldq_le_p(&n->bar.cap));
> +return (offset / stride) & 1;
> +}
> +
> +static inline uint16_t nvme_db_offset_to_qid(NvmeCtrl *n, hwaddr offset)
> +{
> +hwaddr stride = 4 << NVME_CAP_DSTRD(ldq_le_p(&n->bar.cap));
> +return offset / (2 * stride);
> +}
> +
> +static inline hwaddr nvme_cqid_to_db_offset(NvmeCtrl *n, uint16_t cqid)
> +{
> +hwaddr stride = 4 << NVME_CAP_DSTRD(ldq_le_p(&n->bar.cap));
> +return stride * (cqid * 2 + 1);
> +}
> +
> +static inline hwaddr nvme_sqid_to_db_offset(NvmeCtrl *n, uint16_t sqid)
> +{
> +hwaddr stride = 4 << NVME_CAP_DSTRD(ldq_le_p(&n->bar.cap));
> +return stride * sqid * 2;
> +}
> 
> static void nvme_inc_cq_tail(NvmeCQueue *cq)
> {
> @@ -4256,7 +4279,7 @@ static void nvme_cq_notifier(EventNotifier *e)
> static int nvme_init_cq_ioeventfd(NvmeCQueue *cq)
> {
> NvmeCtrl *n = cq->ctrl;
> -uint16_t offset = (cq->cqid << 3) + (1 << 2);
> +uint16_t offset = nvme_cqid_to_db_offset(n, cq->cqid);
> int ret;
> 
> ret = event_notifier_init(&cq->notifier, 0);
> @@ -4283,7 +4306,7 @@ static void nvme_sq_notifier(EventNotifier *e)
> static int nvme_init_sq_ioeventfd(NvmeSQueue *sq)
> {
> NvmeCtrl *n = sq->ctrl;
> -uint16_t offset = sq->sqid << 3;
> +uint16_t offset = nvme_sqid_to_db_offset(n, sq->sqid);
> int ret;
> 
> ret = event_notifier_init(&sq->notifier, 0);
> @@ -4300,7 +4323,7 @@ static int nvme_init_sq_ioeventfd(NvmeSQueue *sq)
> 
> static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
> {
> -uint16_t offset = sq->sqid << 3;
> +uint16_t offset = nvme_sqid_to_db_offset(n, sq->sqid);
> 
> n->sq[sq->sqid] = NULL;
> timer_free(sq->timer);
> @@ -4379,8 +4402,8 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
> sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);
> 
> if (n->dbbuf_enabled) {
> -sq->db_addr = n->dbbuf_dbs + (sqid << 3);
> -sq->ei_addr = n->dbbuf_eis + (sqid << 3);
> +sq->db_addr = n->dbbuf_dbs + nvme_sqid_to_db_offset(n, sqid);
> +sq->ei_addr = n->dbbuf_eis + nvme_sqid_to_db_offset(n, sqid);
> 
> if (n->params.ioeventfd && sq->sqid != 0) {
> if (!nvme_init_sq_ioeventfd(sq)) {
> @@ -4690,8 +4713,8 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
> 
> static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
> {
> -uint16_t offset = (cq->cqid << 3) + (1 << 2);
> -
> +uint16_t offset = nvme_cqid_to_db_offset(n, cq->cqid);
> +
> n->cq[cq->cqid] = NULL;
> timer_free(cq->timer);
> if (cq->ioeventfd_enabled) {
> @@ -4755,8 +4778,8 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
> QTAILQ_INIT(&cq->req_list);
> QTAILQ_INIT(&cq->sq_list);
> if (n->dbbuf_enabled) {
> -cq->db_addr = n->dbbuf_dbs + (cqid << 3) + (1 << 2);
> -cq->ei_addr = n->dbbuf_eis + (cqid << 3) + (1 << 2);
> +cq->db_addr = n->dbbuf_dbs + nvme_cqid_to_db_offset(n, cqid);
> +cq->ei_addr = n->dbbuf_eis + nvme_cqid_to_db_offset(n, cqid);
> 
> if (n->params.ioeventfd && cqid != 0) {
> if (!nvme_init_cq_ioeventfd(cq)) {
> @@ -6128,13 +6151,8 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
> NvmeCQueue *cq = n->cq[i];
> 
> if (sq) {
> -/*
> - * CAP.DSTRD is 0, so offset of ith sq db_addr is (i<<3)
> - * nvme_process_db() uses this hard-coded way to calculate
> - * doorbell offsets. Be consistent with that here.
> - */
> -sq->db_addr = dbs_addr + (i << 3);
> -sq->ei_addr = eis_addr + (i << 3);
> +sq->db_addr = dbs_addr + nvme_sqid_to_db_offset(n, i);
> +sq->ei_addr = eis_addr + nvme_sqid_to_db_offset(n, i);
> pci_dma_write(&n->parent_obj, sq->db_addr, &sq->tail,
> sizeof(sq->tail));
> 
> @@ -6146,9 +6164,8 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
> }
> 
> if (cq) {
> -/* CAP.DSTRD is 0, so offset of ith cq db_addr is (i<<3)+(1<<2) */
> -
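
For readers following along, the doorbell layout these helpers encode comes
straight from the NVMe spec: with stride = 4 << CAP.DSTRD, the SQ tail
doorbell of queue qid sits at offset stride * 2 * qid and the CQ head
doorbell at stride * (2 * qid + 1). A standalone sketch (ours, not part of
the patch) that checks this against the old hard-coded expressions:

  #include <assert.h>
  #include <stdint.h>

  /* SQ tail doorbell: the even slot of queue sqid's doorbell pair. */
  static uint64_t sq_db_offset(uint16_t sqid, unsigned dstrd)
  {
      uint64_t stride = 4ULL << dstrd;
      return stride * sqid * 2;
  }

  /* CQ head doorbell: the odd slot of queue cqid's doorbell pair. */
  static uint64_t cq_db_offset(uint16_t cqid, unsigned dstrd)
  {
      uint64_t stride = 4ULL << dstrd;
      return stride * (cqid * 2 + 1);
  }

  int main(void)
  {
      /* With the default CAP.DSTRD of 0 these reduce to the (qid << 3)
       * and (qid << 3) + (1 << 2) expressions the patch replaces. */
      assert(sq_db_offset(1, 0) == (1 << 3));
      assert(cq_db_offset(1, 0) == (1 << 3) + (1 << 2));
      return 0;
  }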

Re: [RFC 0/3] Add Generic SPI GPIO model

2022-08-01 Thread Peter Delevoryas
On Mon, Aug 01, 2022 at 11:49:09AM +0930, Andrew Jeffery wrote:
> 
> 
> On Sun, 31 Jul 2022, at 06:48, Cédric Le Goater wrote:
> > On 7/29/22 19:30, Peter Delevoryas wrote:
> >> Certainly we'd like to use IRQs instead, but she ran into correctness
> >> problems. Maybe we can investigate that further and fix it.
> 
> Yes, let's not work around problems that we have the ability to fix.

+1

> 
> >
> > So much is happening in that mode.
> 
> Yes, though while there's no-doubt accidental complexity in terms of 
> its implementation, the underlying hardware is also quite haphazard and 
> we need an approach that scales to the large number of GPIOs it 
> provides. So I'd argue there's a bunch of essential complexity involved 
> as well.

+1

> 
> Do we start with a fresh implementation that allows us to get the 
> expected behaviour and then build that out to replace the current 
> implementation?
> 
> Or, can we add more tests for the existing model to pin down where the 
> bugs are?

Mmmm good question, I think we might end up doing both. Tests would make
it much easier to develop either way though.

> 
> > We need more trace events in the Aspeed
> > GPIO model, at least an extra one in aspeed_gpio_update().
> 
> We can always fall back to this but my hope is we can get better test 
> coverage to iron out the bugs. Maybe that gets us a more refined and 
> maintainable model implementation along the way.

+1

> 
> Cheers,
> 
> Andrew
> 



Feature - Add delay to NVMe device model

2022-08-01 Thread Stephen Bates
Hi

I am interested in exploring the impact of increased storage IO delay on 
applications and was thinking of doing this by adding a delay argument to the 
NVMe device model in QEMU. Is this something that others would be interested 
in? If so I would be happy to work up a patch for upstream consideration.

If this is already do-able in QEMU I would appreciate a pointer to that.

Thanks

Stephen


Re: [RFC v5 00/11] Add support for zoned device

2022-08-01 Thread Stefan Hajnoczi
Hi Hannes, Damien, and Dmitry,
This patch series introduces zoned_host_device for passing through
host zoned storage devices.

How can one host zoned storage device be split up for multiple VMs?

For NVMe it may be possible to allocate multiple Namespaces on the
device using management tools. Then Linux sees individual /dev/nvme0nX
block device nodes and QEMU uses them with zoned_host_device.

For other types of devices, can dm-linear create separate
device-mapper targets? How do max open/active zones, etc work when
multiple untrusted users are sharing a device?

I'm asking because splitting up a single physical device for multiple
VMs is a common virtualization use case that we should document.

Thanks,
Stefan



Re: [RFC v5 11/11] docs/zoned-storage: add zoned device documentation

2022-08-01 Thread Stefan Hajnoczi
On Sun, 31 Jul 2022 at 21:42, Sam Li  wrote:
>
> Add the documentation about the zoned device support to virtio-blk
> emulation.
>
> Signed-off-by: Sam Li 
> ---
>  docs/devel/zoned-storage.rst   | 68 ++
>  docs/system/qemu-block-drivers.rst.inc |  6 +++
>  2 files changed, 74 insertions(+)
>  create mode 100644 docs/devel/zoned-storage.rst
>
> diff --git a/docs/devel/zoned-storage.rst b/docs/devel/zoned-storage.rst
> new file mode 100644
> index 00..e62927dceb
> --- /dev/null
> +++ b/docs/devel/zoned-storage.rst
> @@ -0,0 +1,68 @@
> +=
> +zoned-storage
> +=
> +
> +Zoned Block Devices (ZBDs) devide the LBA space to block regions called zones

s/space to block/space into block/

> +that are larger than the LBA size. It can only allow sequential writes, which
> +reduces write amplification in SSD, leading to higher throughput and 
> increased

s/SSD/SSDs/

> +capacity. More details about ZBDs can be found at:
> +
> +https://zonedstorage.io/docs/introduction/zoned-storage
> +
> +zone emulation
> +--
> +In its current status, the virtio-blk device is not aware of ZBDs but the 
> guest
> +sees host-managed drives as regular drive that will runs correctly under the
> +most common write workloads.

I don't understand this paragraph. I suggest dropping it. virtio-blk
can be mentioned further down.

> +
> +The zoned device support aims to let guests (virtual machines) access zoned
> +storage devices on the host (hypervisor) through a virtio-blk device. This
> +involves extending QEMU's block layer and virtio-blk emulation code.

This text is specific to your Outreachy project with the goal of
passing through zoned devices via virtio-blk. I suggest writing this
documentation from a broader point of view (emulated storage
controllers and core block layer APIs) and not focussing specifically
on virtio-blk:

1. Block layer APIs for zoned storage
Discuss the QEMU block layer's zoned storage model here. Mention
BlockLimits and how block drivers declare support for zoned storage.

2. Emulating zoned storage controllers
Discuss how emulated storage controllers can use the block layer APIs
when the BlockBackend's BlockLimits model reports a zoned storage
device.

This will help developers who aren't familiar with virtio-blk.
References to virtio-blk make it harder for them to understand how
they should use zoned storage.

> +
> +If the host supports zoned block devices, it can set VIRTIO_BLK_F_ZONED. Then
> +in the guest side, it appears following situations:
> +1) If the guest virtio-blk driver sees the VIRTIO_BLK_F_ZONED bit set, then 
> it
> +will assume that the zoned characteristics fields of the config space are 
> valid.
> +2) If the guest virtio-blk driver sees a zoned model that is NONE, then it is
> +known that is a regular block device.
> +3) If the guest virtio-blk driver sees a zoned model that is HM(or HA), then 
> it
> +is known that is a zoned block device and probes the other zone fields.
> +
> +On QEMU sides,
> +1) The DEFINE PROP BIT macro must be used to declare that the host supports
> +zones.
> +2) BlockDrivers can declare zoned device support once known the zoned model
> +for the block device is not NONE.
> +
> +zoned storage APIs
> +--
> +
> +Zone emulation part extends the block layer APIs and virtio-blk emulation 
> section

It's often clearer to describe new features directly instead of the
extensions, changes, or old vs new. The person reading this might not
be familiar with QEMU's block APIs before zoned storage and they don't
care what changed. They just need to know what zoned storage APIs are
available and how to use them.

Here I would say only "The block layer APIs support commands needed
for zoned storage devices, including report zones, four zone
operations, and zone append".

When discussing the zoned storage APIs, avoid mentioning storage
controller emulation as the user of the block API. That is just one
use case. It's possible that zoned storage APIs will be used in other
ways in the future. For example, the qcow2 image format driver could
be optimized for zoned storage devices by laying out and accessing its
metadata on zoned devices using the block APIs. The documentation
should be general enough that someone intending to use the block APIs
differently shouldn't have to know about zoned storage controller
emulation in order to figure out how to use the block APIs for other
purposes.
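
For instance, an emulated controller (or any other block layer user) could
query zones roughly like this, a minimal fragment against the API from patch
05/11, in coroutine context with an open zoned BlockBackend *blk; the
start/wp descriptor field names are an assumption, since the descriptor
layout is not quoted here:

  BlockZoneDescriptor zones[8];
  unsigned int nr_zones = 8;   /* IN: array capacity, OUT: zones returned */
  int ret = blk_co_zone_report(blk, 0 /* byte offset */, &nr_zones, zones);

  if (ret < 0) {
      return ret;
  }
  for (unsigned int i = 0; i < nr_zones; i++) {
      printf("zone at 0x%" PRIx64 ", wp 0x%" PRIx64 "\n",
             zones[i].start, zones[i].wp);
  }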

> +with the minimum set of zoned commands that are necessary to support zoned
> +devices. The commands are - Report Zones, four zone operations and Zone 
> Append
> +(developing).
> +
> +testing
> +---
> +
> +It can be tested on a null_blk device using qemu-io, qemu-iotests or 
> blkzone(8)
> +command in the guest os.
> +
> +1. For example, the command line for zone report using qemu-io is:
> +
> +$ path/to/qemu-io --image-opts driver=zoned_host_device,filename=/dev/nullb0 
> -c
> +"zrp offset nr_zones"
> +

Re: [RFC v5 10/11] qemu-iotests: test new zone operations

2022-08-01 Thread Stefan Hajnoczi
On Sun, 31 Jul 2022 at 21:39, Sam Li  wrote:
>
> We have added new block layer APIs for zoned block devices. Test them by
> creating a null_blk device, running each zone operation on it, and checking
> whether the right zone information is reported.
>
> Signed-off-by: Sam Li 
> ---
>  tests/qemu-iotests/tests/zoned.out | 53 ++
>  tests/qemu-iotests/tests/zoned.sh  | 86 ++
>  2 files changed, 139 insertions(+)
>  create mode 100644 tests/qemu-iotests/tests/zoned.out
>  create mode 100755 tests/qemu-iotests/tests/zoned.sh

Reviewed-by: Stefan Hajnoczi 



Re: [RFC v5 09/11] qemu-io: add zoned block device operations.

2022-08-01 Thread Stefan Hajnoczi
On Sun, 31 Jul 2022 at 21:42, Sam Li  wrote:
>
> Add zoned storage commands of the device: zone_report(zrp), zone_open(zo),
> zone_close(zc), zone_reset(zrs), zone_finish(zf).
>
> For example, to test zone_report, use following command:
> $ ./build/qemu-io --image-opts driver=zoned_host_device,filename=/dev/nullb0
> -c "zrp offset nr_zones"
>
> Signed-off-by: Sam Li 
> ---
>  block/io.c |  24 ++---
>  qemu-io-cmds.c | 144 +
>  2 files changed, 148 insertions(+), 20 deletions(-)
>
> diff --git a/block/io.c b/block/io.c
> index a4625fb0e1..de9ec1d740 100644
> --- a/block/io.c
> +++ b/block/io.c
> @@ -3209,19 +3209,11 @@ int bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
>  IO_CODE();
>
>  bdrv_inc_in_flight(bs);
> -if (!drv || (!drv->bdrv_co_zone_report)) {
> +if (!drv || !drv->bdrv_co_zone_report) {
>  co.ret = -ENOTSUP;
>  goto out;
>  }
> -
> -if (drv->bdrv_co_zone_report) {
> -co.ret = drv->bdrv_co_zone_report(bs, offset, nr_zones, zones);
> -} else {
> -co.ret = -ENOTSUP;
> -goto out;
> -qemu_coroutine_yield();
> -}
> -
> +co.ret = drv->bdrv_co_zone_report(bs, offset, nr_zones, zones);
>  out:
>  bdrv_dec_in_flight(bs);
>  return co.ret;
> @@ -3237,19 +3229,11 @@ int bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
>  IO_CODE();
>
>  bdrv_inc_in_flight(bs);
> -if (!drv || (!drv->bdrv_co_zone_mgmt)) {
> +if (!drv || !drv->bdrv_co_zone_mgmt) {
>  co.ret = -ENOTSUP;
>  goto out;
>  }
> -
> -if (drv->bdrv_co_zone_mgmt) {
> -co.ret = drv->bdrv_co_zone_mgmt(bs, op, offset, len);
> -} else {
> -co.ret = -ENOTSUP;
> -goto out;
> -qemu_coroutine_yield();
> -}
> -
> +co.ret = drv->bdrv_co_zone_mgmt(bs, op, offset, len);
>  out:
>  bdrv_dec_in_flight(bs);
>  return co.ret;

Please squash these changes into the earlier patch that introduced
these functions.

Otherwise:

Reviewed-by: Stefan Hajnoczi 



Re: [PULL for-7.1 0/3] hw/nvme fixes

2022-08-01 Thread Richard Henderson

On 8/1/22 03:05, Klaus Jensen wrote:

From: Klaus Jensen 

Hi,

The following changes since commit 3916603e0c1d909e14e09d5ebcbdaa9c9e21adf3:

   Merge tag 'pull-la-20220729' of https://gitlab.com/rth7680/qemu into staging 
(2022-07-29 17:39:17 -0700)

are available in the Git repository at:

   git://git.infradead.org/qemu-nvme.git tags/nvme-next-pull-request

for you to fetch changes up to e2e137f64282a2ee2f359b6df4cd93c83a308e7b:

   hw/nvme: do not enable ioeventfd by default (2022-08-01 12:01:21 +0200)


hw/nvme fixes

Some fixes for hw/nvme ioeventfd support.


Applied, thanks.  Please update https://wiki.qemu.org/ChangeLog/7.1 as 
appropriate.


r~






Klaus Jensen (3):
   hw/nvme: skip queue processing if notifier is cleared
   hw/nvme: unregister the event notifier handler on the main loop
   hw/nvme: do not enable ioeventfd by default

  hw/nvme/ctrl.c | 12 +---
  1 file changed, 9 insertions(+), 3 deletions(-)






Re: [RFC v5 08/11] virtio-blk: add zoned storage APIs for zoned devices

2022-08-01 Thread Stefan Hajnoczi
On Sun, 31 Jul 2022 at 21:43, Sam Li  wrote:
>
> This patch extends virtio-blk emulation to handle zoned device commands
> by calling the new block layer APIs to perform zoned device I/O on
> behalf of the guest. It supports Report Zone and four zone operations (open,
> close, finish, reset). The virtio-blk zoned device command specification
> is currently under review.
>
> VIRTIO_BLK_F_ZONED will only be set if the host supports zoned block
> devices; it will not be set for regular block devices. A guest OS with
> zoned device support can use blkzone(8) to test those commands.
>
> Signed-off-by: Sam Li 
> ---
>  block/block-backend.c |  92 
>  hw/block/virtio-blk.c | 172 +-
>  include/sysemu/block-backend-io.h |   6 ++
>  3 files changed, 268 insertions(+), 2 deletions(-)
>
> diff --git a/block/block-backend.c b/block/block-backend.c
> index ef6a1f33d5..8f2cfcbd9d 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -1431,6 +1431,15 @@ typedef struct BlkRwCo {
>  void *iobuf;
>  int ret;
>  BdrvRequestFlags flags;
> +union {
> +struct {
> +unsigned int *nr_zones;
> +BlockZoneDescriptor *zones;
> +} zone_report;
> +struct {
> +BlockZoneOp op;
> +} zone_mgmt;
> +};
>  } BlkRwCo;
>
>  int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags)
> @@ -1775,6 +1784,89 @@ int coroutine_fn blk_co_flush(BlockBackend *blk)
>  return ret;
>  }
>
> +static void blk_aio_zone_report_entry(void *opaque) {
> +BlkAioEmAIOCB *acb = opaque;
> +BlkRwCo *rwco = &acb->rwco;
> +
> +rwco->ret = blk_co_zone_report(rwco->blk, rwco->offset,
> +   rwco->zone_report.nr_zones,
> +   rwco->zone_report.zones);
> +blk_aio_complete(acb);
> +}
> +
> +BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
> +unsigned int *nr_zones,
> +BlockZoneDescriptor  *zones,
> +BlockCompletionFunc *cb, void *opaque)
> +{
> +BlkAioEmAIOCB *acb;
> +Coroutine *co;
> +
> +blk_inc_in_flight(blk);
> +acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
> +acb->rwco = (BlkRwCo) {
> +.blk= blk,
> +.offset = offset,
> +.ret= NOT_DONE,
> +.zone_report = {
> +.zones = zones,
> +.nr_zones = nr_zones,
> +},
> +};
> +acb->has_returned = false;
> +
> +co = qemu_coroutine_create(blk_aio_zone_report_entry, acb);
> +bdrv_coroutine_enter(blk_bs(blk), co);
> +
> +acb->has_returned = true;
> +if (acb->rwco.ret != NOT_DONE) {
> +replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
> + blk_aio_complete_bh, acb);
> +}
> +
> +return &acb->common;
> +}
> +
> +static void blk_aio_zone_mgmt_entry(void *opaque) {
> +BlkAioEmAIOCB *acb = opaque;
> +BlkRwCo *rwco = &acb->rwco;
> +
> +rwco->ret = blk_co_zone_mgmt(rwco->blk, rwco->zone_mgmt.op,
> + rwco->offset, acb->bytes);
> +blk_aio_complete(acb);
> +}
> +
> +BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
> +  int64_t offset, int64_t len,
> +  BlockCompletionFunc *cb, void *opaque) {
> +BlkAioEmAIOCB *acb;
> +Coroutine *co;
> +
> +blk_inc_in_flight(blk);
> +acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
> +acb->rwco = (BlkRwCo) {
> +.blk= blk,
> +.offset = offset,
> +.ret= NOT_DONE,
> +.zone_mgmt = {
> +.op = op,
> +},
> +};
> +acb->bytes = len;
> +acb->has_returned = false;
> +
> +co = qemu_coroutine_create(blk_aio_zone_mgmt_entry, acb);
> +bdrv_coroutine_enter(blk_bs(blk), co);
> +
> +acb->has_returned = true;
> +if (acb->rwco.ret != NOT_DONE) {
> +replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
> + blk_aio_complete_bh, acb);
> +}
> +
> +return &acb->common;
> +}
> +
>  /*
>   * Send a zone_report command.
>   * offset is a byte offset from the start of the device. No alignment
> diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
> index e9ba752f6b..9722f447a2 100644
> --- a/hw/block/virtio-blk.c
> +++ b/hw/block/virtio-blk.c
> @@ -37,6 +37,7 @@
>  /* Config size before the discard support (hide associated config fields) */
>  #define VIRTIO_BLK_CFG_SIZE offsetof(struct virtio_blk_config, \
>   max_discard_sectors)
> +
>  /*
>   * Starting from the discard feature, we can use this array to properly
>   * set the config size depending on the features enabled.
> @@ -46,6 +47,8 @@ static 

Re: [RFC v5 07/11] config: add check to block layer

2022-08-01 Thread Stefan Hajnoczi
On Sun, 31 Jul 2022 at 21:39, Sam Li  wrote:
>
> Putting zoned/non-zoned BlockDrivers on top of each other is not
> allowed.
>
> Signed-off-by: Sam Li 
> ---
>  block.c  | 13 +
>  block/file-posix.c   |  2 ++
>  block/raw-format.c   |  1 +
>  include/block/block_int-common.h | 10 ++
>  4 files changed, 26 insertions(+)
>
> diff --git a/block.c b/block.c
> index bc85f46eed..8a259b158c 100644
> --- a/block.c
> +++ b/block.c
> @@ -7947,6 +7947,19 @@ void bdrv_add_child(BlockDriverState *parent_bs, BlockDriverState *child_bs,
>  return;
>  }
>
> +/*
> + * Non-zoned block drivers do not follow zoned storage constraints
> + * (i.e. sequential writes to zones). Refuse mixing zoned and non-zoned
> + * drivers in a graph.
> + */
> +if (!parent_bs->drv->supports_zoned_children && child_bs->drv->is_zoned) {
> +error_setg(errp, "Cannot add a %s child to a %s parent",
> +   child_bs->drv->is_zoned ? "zoned" : "non-zoned",
> +   parent_bs->drv->supports_zoned_children ?
> +   "support zoned children" : "not support zoned children");
> +return;
> +}
> +
>  if (!QLIST_EMPTY(&child_bs->parents)) {
>  error_setg(errp, "The node %s already has a parent",
> child_bs->node_name);
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 6c045eb6e8..8eb0b7bc9b 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -4023,6 +4023,8 @@ static BlockDriver bdrv_zoned_host_device = {
>  .format_name = "zoned_host_device",
>  .protocol_name = "zoned_host_device",
>  .instance_size = sizeof(BDRVRawState),
> +.is_zoned = true,
> +.supports_zoned_children = true,

zoned_host_device never has children, so supports_zoned_children
should not be set.

>  .bdrv_needs_filename = true,
>  .bdrv_probe_device  = hdev_probe_device,
>  .bdrv_parse_filename = zoned_host_device_parse_filename,
> diff --git a/block/raw-format.c b/block/raw-format.c
> index 6b20bd22ef..9441536819 100644
> --- a/block/raw-format.c
> +++ b/block/raw-format.c
> @@ -614,6 +614,7 @@ static void raw_child_perm(BlockDriverState *bs, BdrvChild *c,
>  BlockDriver bdrv_raw = {
>  .format_name  = "raw",
>  .instance_size= sizeof(BDRVRawState),
> +.supports_zoned_children = true,
>  .bdrv_probe   = &raw_probe,
>  .bdrv_reopen_prepare  = &raw_reopen_prepare,
>  .bdrv_reopen_commit   = &raw_reopen_commit,
> diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
> index de44c7b6f4..0476cd0491 100644
> --- a/include/block/block_int-common.h
> +++ b/include/block/block_int-common.h
> @@ -126,6 +126,16 @@ struct BlockDriver {
>   */
>  bool is_format;
>
> +/*
> + * Set to true if the BlockDriver is a zoned block driver.
> + */
> +bool is_zoned;
> +
> +/*
> + * Set to true if the BlockDriver supports zoned children.
> + */
> +bool supports_zoned_children;
> +
>  /*
>   * Drivers not implementing bdrv_parse_filename nor bdrv_open should have
>   * this field set to true, except ones that are defined only by their
> --
> 2.37.1
>
>



Re: [RFC v5 06/11] raw-format: add zone operations to pass through requests

2022-08-01 Thread Stefan Hajnoczi
Reviewed-by: Stefan Hajnoczi 



Re: [RFC v5 05/11] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls

2022-08-01 Thread Stefan Hajnoczi
On Sun, 31 Jul 2022 at 21:34, Sam Li  wrote:
>
> By adding zone management operations in BlockDriver, storage controller
> emulation can use the new block layer APIs including Report Zone and
> four zone management operations (open, close, finish, reset).
>
> BlockDriver can get zone information from null_blk device by refreshing
> BLockLimits.
>
> Signed-off-by: Sam Li 
> ---
>  block/block-backend.c|  47 ++
>  block/coroutines.h   |   6 +
>  block/file-posix.c   | 272 ++-
>  block/io.c   |  57 +++
>  include/block/block-common.h |   1 -
>  include/block/block-io.h |  13 ++
>  include/block/block_int-common.h |  22 ++-
>  include/block/raw-aio.h  |   6 +-
>  meson.build  |   1 +
>  qapi/block-core.json |   7 +-
>  10 files changed, 426 insertions(+), 6 deletions(-)
>
> diff --git a/block/block-backend.c b/block/block-backend.c
> index d4a5df2ac2..ef6a1f33d5 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -1775,6 +1775,53 @@ int coroutine_fn blk_co_flush(BlockBackend *blk)
>  return ret;
>  }
>
> +/*
> + * Send a zone_report command.
> + * offset is a byte offset from the start of the device. No alignment
> + * required for offset.
> + * nr_zones represents IN maximum and OUT actual.
> + */
> +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
> +unsigned int *nr_zones,
> +BlockZoneDescriptor *zones)
> +{
> +int ret;
> +IO_CODE();
> +
> +blk_inc_in_flight(blk); /* increase before waiting */
> +blk_wait_while_drained(blk);
> +if (!blk_is_available(blk)) {
> +return -ENOMEDIUM;

Missing blk_dec_in_flight().
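
That is, something like:

  if (!blk_is_available(blk)) {
      blk_dec_in_flight(blk);
      return -ENOMEDIUM;
  }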

> +}
> +ret = bdrv_co_zone_report(blk_bs(blk), offset, nr_zones, zones);
> +blk_dec_in_flight(blk);
> +return ret;
> +}
> +
> +/*
> + * Send a zone_management command.
> + * offset is the starting zone specified as a sector offset.
> + * len is the maximum number of sectors the command should operate on.
> + */
> +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
> +int64_t offset, int64_t len)
> +{
> +int ret;
> +IO_CODE();
> +
> +ret = blk_check_byte_request(blk, offset, len);
> +if (ret < 0)
> +return ret;

QEMU coding style uses {} around the if statement body.
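
I.e.:

  ret = blk_check_byte_request(blk, offset, len);
  if (ret < 0) {
      return ret;
  }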

> +blk_inc_in_flight(blk);
> +blk_wait_while_drained(blk);
> +if (!blk_is_available(blk)) {
> +return -ENOMEDIUM;

Missing blk_dec_in_flight().

> +}
> +ret = bdrv_co_zone_mgmt(blk_bs(blk), op, offset, len);
> +blk_dec_in_flight(blk);
> +return ret;
> +}
> +
>  void blk_drain(BlockBackend *blk)
>  {
>  BlockDriverState *bs = blk_bs(blk);
> diff --git a/block/coroutines.h b/block/coroutines.h
> index 3a2bad564f..e3f62d94e5 100644
> --- a/block/coroutines.h
> +++ b/block/coroutines.h
> @@ -63,6 +63,12 @@ nbd_co_do_establish_connection(BlockDriverState *bs, bool blocking,
> Error **errp);
>
>
> +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
> +unsigned int *nr_zones,
> +BlockZoneDescriptor *zones);
> +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
> +  int64_t offset, int64_t len);
> +
>  /*
>   * "I/O or GS" API functions. These functions can run without
>   * the BQL, but only in one specific iothread/main loop.
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 0d8b4acdc7..6c045eb6e8 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -67,6 +67,9 @@
>  #include 
>  #include 
>  #include 
> +#if defined(CONFIG_BLKZONED)
> +#include 
> +#endif
>  #include 
>  #include 
>  #include 
> @@ -216,6 +219,13 @@ typedef struct RawPosixAIOData {
>  PreallocMode prealloc;
>  Error **errp;
>  } truncate;
> +struct {
> +unsigned int *nr_zones;
> +BlockZoneDescriptor *zones;
> +} zone_report;
> +struct {
> +BlockZoneOp op;
> +} zone_mgmt;
>  };
>  } RawPosixAIOData;
>
> @@ -1386,7 +1396,7 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
>  #endif
>
>  if (bs->sg || S_ISBLK(st.st_mode)) {
> -int ret = hdev_get_max_hw_transfer(s->fd, &st);
> +ret = hdev_get_max_hw_transfer(s->fd, &st);
>
>  if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
>  bs->bl.max_hw_transfer = ret;
> @@ -1402,6 +1412,27 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
>  if (ret < 0)
>  zoned = BLK_Z_NONE;
>  bs->bl.zoned = zoned;
> +if (zoned != BLK_Z_NONE) {
> +ret = get_sysfs_long_val(s->fd, &st, "chunk_sectors");
> +if (ret > 0) {
> +bs->bl.zone_sectors = 

Re: [RFC v5 04/11] file-posix: introduce get_sysfs_str_val for device zoned model

2022-08-01 Thread Stefan Hajnoczi
On Sun, 31 Jul 2022 at 21:34, Sam Li  wrote:
>
> Use sysfs attribute files to get the string value of device
> zoned model. Then get_sysfs_zoned_model can convert it to
> BlockZoneModel type in QEMU.
>
> Signed-off-by: Sam Li 
> ---
>  block/file-posix.c   | 86 
>  include/block/block_int-common.h |  3 ++
>  2 files changed, 89 insertions(+)
>
> diff --git a/block/file-posix.c b/block/file-posix.c
> index bcf898f0cb..0d8b4acdc7 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -1271,6 +1271,85 @@ out:
>  #endif
>  }
>
> +/*
> + * Convert the zoned attribute file in sysfs to internal value.
> + */
> +static int get_sysfs_str_val(int fd, struct stat *st,
> +  const char *attribute,
> +  char **val) {
> +#ifdef CONFIG_LINUX
> +char buf[32];
> +char *sysfspath = NULL;

This can be g_autofree and then the explicit g_free(sysfspath) call is
no longer necessary.

> +int ret, offset;
> +int sysfd = -1;
> +
> +if (S_ISCHR(st->st_mode)) {
> +if (ioctl(fd, SG_GET_SG_TABLESIZE, &ret) == 0) {
> +return ret;
> +}
> +return -ENOTSUP;
> +}

This is for max_segments only. See my comment on the previous patch.

> +
> +if (!S_ISBLK(st->st_mode)) {
> +return -ENOTSUP;
> +}
> +
> +sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/%s",
> +major(st->st_rdev), minor(st->st_rdev),
> +attribute);

The code that follows can be replaced by a call to g_file_get_contents():
https://docs.gtk.org/glib/func.file_get_contents.html

Then the caller doesn't need to pre-allocate a 32-byte buffer. Instead
this function returns a string on the heap. The caller can free it
with g_free().
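
A rough sketch of that approach (untested, replacing the open/read loop):

  g_autofree char *path = g_strdup_printf("/sys/dev/block/%u:%u/queue/%s",
                                          major(st->st_rdev),
                                          minor(st->st_rdev), attribute);
  g_autofree char *contents = NULL;

  if (!g_file_get_contents(path, &contents, NULL, NULL)) {
      return -ENOTSUP;
  }
  g_strchomp(contents);             /* strip the trailing '\n' sysfs appends */
  *val = g_steal_pointer(&contents);
  return 0;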

> +sysfd = open(sysfspath, O_RDONLY);
> +if (sysfd == -1) {
> +ret = -errno;
> +goto out;
> +}
> +offset = 0;
> +do {
> +ret = read(sysfd, buf + offset, sizeof(buf) - 1 + offset);
> +if (ret > 0) {
> +offset += ret;
> +}
> +} while (ret == -1);

I think this should be:

  } while (ret == -1 && errno == EINTR);

  if (ret < 0) {
  ret = -errno;
  goto out;
  }

Otherwise there is an infinite loop when an unrecoverable error occurs.

> +/* The file is ended with '\n' */
> +if (buf[ret - 1] == '\n') {
> +buf[ret - 1] = '\0';
> +}
> +
> +if (!strncpy(*val, buf, ret)) {
> +goto out;
> +}
> +
> +out:
> +if (sysfd != -1) {
> +close(sysfd);
> +}
> +g_free(sysfspath);
> +return ret;
> +#else
> +return -ENOTSUP;
> +#endif
> +}
> +
> +static int get_sysfs_zoned_model(int fd, struct stat *st,
> + BlockZoneModel *zoned) {
> +g_autofree char *val = NULL;
> +val = g_malloc(32);
> +get_sysfs_str_val(fd, st, "zoned", &val);
> +if (!val) {
> +return -ENOTSUP;
> +}
> +
> +if (strcmp(val, "host-managed") == 0) {
> +*zoned = BLK_Z_HM;
> +} else if (strcmp(val, "host-aware") == 0) {
> +*zoned = BLK_Z_HA;
> +} else if (strcmp(val, "none") == 0) {
> +*zoned = BLK_Z_NONE;
> +} else {
> +return -ENOTSUP;
> +}
> +return 0;
> +}
> +
>  static int hdev_get_max_segments(int fd, struct stat *st) {
>  return get_sysfs_long_val(fd, st, "max_segments");
>  }
> @@ -1279,6 +1358,8 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
>  {
>  BDRVRawState *s = bs->opaque;
>  struct stat st;
> +int ret;
> +BlockZoneModel zoned;
>
>  s->needs_alignment = raw_needs_alignment(bs);
>  raw_probe_alignment(bs, s->fd, errp);
> @@ -1316,6 +1397,11 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
>  bs->bl.max_hw_iov = ret;
>  }
>  }
> +
> +ret = get_sysfs_zoned_model(s->fd, &st, &zoned);
> +if (ret < 0)
> +zoned = BLK_Z_NONE;

QEMU coding style always uses {}:

  if (ret < 0) {
  zoned = BLK_Z_NONE;
  }

> +bs->bl.zoned = zoned;
>  }
>
>  static int check_for_dasd(int fd)
> diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
> index 8947abab76..7f7863cc9e 100644
> --- a/include/block/block_int-common.h
> +++ b/include/block/block_int-common.h
> @@ -825,6 +825,9 @@ typedef struct BlockLimits {
>
>  /* maximum number of iovec elements */
>  int max_iov;
> +
> +/* device zone model */
> +BlockZoneModel zoned;
>  } BlockLimits;
>
>  typedef struct BdrvOpBlocker BdrvOpBlocker;
> --
> 2.37.1
>
>



Re: [RFC v5 03/11] file-posix: introduce get_sysfs_long_val for the long sysfs attribute

2022-08-01 Thread Stefan Hajnoczi
On Sun, 31 Jul 2022 at 21:39, Sam Li  wrote:
>
> Use sysfs attribute files to get the long value of zoned device
> information.
>
> Signed-off-by: Sam Li 
> ---
>  block/file-posix.c | 23 ---
>  1 file changed, 16 insertions(+), 7 deletions(-)
>
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 48cd096624..bcf898f0cb 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -1210,15 +1210,19 @@ static int hdev_get_max_hw_transfer(int fd, struct stat *st)
>  #endif
>  }
>
> -static int hdev_get_max_segments(int fd, struct stat *st)
> -{
> +/*
> + * Get zoned device information (chunk_sectors, zoned_append_max_bytes,
> + * max_open_zones, max_active_zones) through sysfs attribute files.
> + */
> +static long get_sysfs_long_val(int fd, struct stat *st,
> +   const char *attribute) {
>  #ifdef CONFIG_LINUX
>  char buf[32];
>  const char *end;
>  char *sysfspath = NULL;
>  int ret;
>  int sysfd = -1;
> -long max_segments;
> +long val;
>
>  if (S_ISCHR(st->st_mode)) {
>  if (ioctl(fd, SG_GET_SG_TABLESIZE, &ret) == 0) {

This code is specific to max_segments and not relevant to other
attributes. It's an alternative code path for SCSI generic types of
devices. It should be moved into hdev_get_max_segments() before
get_sysfs_long_val() is called.
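
Something along these lines (a sketch of the suggested split):

  static int hdev_get_max_segments(int fd, struct stat *st)
  {
      if (S_ISCHR(st->st_mode)) {
          int ret;

          /* SG character devices report the limit via an ioctl instead. */
          if (ioctl(fd, SG_GET_SG_TABLESIZE, &ret) == 0) {
              return ret;
          }
          return -ENOTSUP;
      }
      return get_sysfs_long_val(fd, st, "max_segments");
  }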

> @@ -1231,8 +1235,9 @@ static int hdev_get_max_segments(int fd, struct stat *st)
>  return -ENOTSUP;
>  }
>
> -sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
> -major(st->st_rdev), minor(st->st_rdev));
> +sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/%s",
> +major(st->st_rdev), minor(st->st_rdev),
> +attribute);
>  sysfd = open(sysfspath, O_RDONLY);
>  if (sysfd == -1) {
>  ret = -errno;
> @@ -1250,9 +1255,9 @@ static int hdev_get_max_segments(int fd, struct stat *st)
>  }
>  buf[ret] = 0;
>  /* The file is ended with '\n', pass 'end' to accept that. */
> -ret = qemu_strtol(buf, &end, 10, &max_segments);
> +ret = qemu_strtol(buf, &end, 10, &val);
>  if (ret == 0 && end && *end == '\n') {
> -ret = max_segments;
> +ret = val;
>  }
>
>  out:
> @@ -1266,6 +1271,10 @@ out:
>  #endif
>  }
>
> +static int hdev_get_max_segments(int fd, struct stat *st) {
> +return get_sysfs_long_val(fd, st, "max_segments");
> +}
> +
>  static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
>  {
>  BDRVRawState *s = bs->opaque;
> --
> 2.37.1
>
>



Re: [RFC v5 02/11] include: import virtio_blk headers from linux with zoned storage support

2022-08-01 Thread Stefan Hajnoczi
On Sun, 31 Jul 2022 at 21:33, Sam Li  wrote:
>
> Add file from Dmitry's "virtio-blk:add support for zoned block devices"
> linux patch using scripts/update-linux-headers.sh. There is a link for
> more information: https://github.com/dmitry-fomichev/virtblk-zbd
>
> Signed-off-by: Sam Li 
> ---
>  include/standard-headers/linux/virtio_blk.h | 118 
>  1 file changed, 118 insertions(+)

This version of the  header file is not yet
upstream. Please move this patch and the virtio-blk emulation code
into a separate patch series. It depends on Dmitry's VIRTIO spec
changes being accepted and the Linux code being merged.



Re: [RFC v5 01/11] include: add zoned device structs

2022-08-01 Thread Stefan Hajnoczi
On Sun, 31 Jul 2022 at 21:32, Sam Li  wrote:
>
> Signed-off-by: Sam Li 
> ---
>  include/block/block-common.h | 43 
>  1 file changed, 43 insertions(+)
>
> diff --git a/include/block/block-common.h b/include/block/block-common.h
> index fdb7306e78..c9d28b1c51 100644
> --- a/include/block/block-common.h
> +++ b/include/block/block-common.h
> @@ -49,6 +49,49 @@ typedef struct BlockDriver BlockDriver;
>  typedef struct BdrvChild BdrvChild;
>  typedef struct BdrvChildClass BdrvChildClass;
>
> +typedef enum BlockZoneOp {
> +BLK_ZO_OPEN,
> +BLK_ZO_CLOSE,
> +BLK_ZO_FINISH,
> +BLK_ZO_RESET,
> +} BlockZoneOp;
> +
> +typedef enum BlockZoneModel {
> +BLK_Z_NONE = 0x0, /* Regular block device */
> +BLK_Z_HM = 0x1, /* Host-aware zoned block device */
> +BLK_Z_HA = 0x2, /* Host-managed zoned block device */

The HM and HA comments are swapped.

> +} BlockZoneModel;
> +
> +typedef enum BlockZoneCondition {
> +BLK_ZS_NOT_WP = 0x0,
> +BLK_ZS_EMPTY = 0x1,
> +BLK_ZS_IOPEN = 0x2,
> +BLK_ZS_EOPEN = 0x3,
> +BLK_ZS_CLOSED = 0x4,
> +BLK_ZS_RDONLY = 0xD,
> +BLK_ZS_FULL = 0xE,
> +BLK_ZS_OFFLINE = 0xF,
> +} BlockZoneCondition;
> +
> +typedef enum BlockZoneType {
> +BLK_ZT_CONV = 0x1,

/* Conventional random writes supported */

> +BLK_ZT_SWR = 0x2,

/* Sequential writes required */

> +BLK_ZT_SWP = 0x3,

/* Sequential writes preferred */

> +} BlockZoneType;
> +
> +/*
> + * Zone descriptor data structure.
> + * Provide information on a zone with all position and size values in bytes.

s/Provide/Provides/

Once these items have been addressed feel free to add:

Reviewed-by: Stefan Hajnoczi 



Re: [RFC v5 05/11] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls

2022-08-01 Thread Eric Blake
On Mon, Aug 01, 2022 at 09:33:05AM +0800, Sam Li wrote:
> By adding zone management operations in BlockDriver, storage controller
> emulation can use the new block layer APIs including Report Zone and
> four zone management operations (open, close, finish, reset).
> 
> BlockDriver can get zone information from null_blk device by refreshing
> BLockLimits.
> 
> Signed-off-by: Sam Li 
> ---
>  block/block-backend.c|  47 ++
>  block/coroutines.h   |   6 +
>  block/file-posix.c   | 272 ++-
>  block/io.c   |  57 +++
>  include/block/block-common.h |   1 -
>  include/block/block-io.h |  13 ++
>  include/block/block_int-common.h |  22 ++-
>  include/block/raw-aio.h  |   6 +-
>  meson.build  |   1 +
>  qapi/block-core.json |   7 +-
>  10 files changed, 426 insertions(+), 6 deletions(-)
> 

> +++ b/qapi/block-core.json
> @@ -2955,7 +2955,8 @@
>  'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels',
>  'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
>  { 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
> -'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] }
> +'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat',
> +{ 'name': 'zoned_host_device', 'if': 'CONFIG_BLKZONED' } ] }

Missing a documentation line of '# @zoned_host_device: Since 7.2'.

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org




Re: [PATCH 0/2] migration: fix coverity nits

2022-08-01 Thread Peter Maydell
On Thu, 21 Jul 2022 at 12:52, Peter Maydell  wrote:
>
> This patchset fixes four Coverity nits in the migration code.
> The first patch is just adding an assert() to clue coverity in
> that an array index must be in-bounds. The second adds an ULL
> suffix to force a multiplication to be done at 64 bits.
>
> thanks
> -- PMM
>
> Peter Maydell (2):
>   migration: Assert that migrate_multifd_compression() returns an
> in-range value
>   migration: Define BLK_MIG_BLOCK_SIZE as unsigned long long

Ping? This series got reviewed but doesn't seem to have
made it into a migration pullreq yet.

thanks
-- PMM
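
For anyone unfamiliar with these two Coverity patterns, a sketch; the
identifiers here are illustrative, not the actual migration code:

  /* 1. An assert() tells Coverity the array index is in bounds. */
  assert(compression < MULTIFD_COMPRESSION__MAX);
  name = compression_names[compression];

  /* 2. A ULL suffix widens the arithmetic to 64 bits before a
   * 32-bit multiply can overflow. */
  #define BLK_MIG_BLOCK_SIZE (1ULL << 20)
  uint64_t total = BLK_MIG_BLOCK_SIZE * nr_blocks;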



[PULL for-7.1 3/3] hw/nvme: do not enable ioeventfd by default

2022-08-01 Thread Klaus Jensen
From: Klaus Jensen 

Do not enable ioeventfd by default. Let the feature mature a bit before
we consider enabling it by default.

Fixes: 2e53b0b45024 ("hw/nvme: Use ioeventfd to handle doorbell updates")
Reviewed-by: Keith Busch 
Reviewed-by: Jinhao Fan 
Signed-off-by: Klaus Jensen 
---
 hw/nvme/ctrl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 70b454eedbd8..87aeba056499 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -7670,7 +7670,7 @@ static Property nvme_props[] = {
 DEFINE_PROP_UINT8("vsl", NvmeCtrl, params.vsl, 7),
 DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
 DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false),
-DEFINE_PROP_BOOL("ioeventfd", NvmeCtrl, params.ioeventfd, true),
+DEFINE_PROP_BOOL("ioeventfd", NvmeCtrl, params.ioeventfd, false),
 DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
 DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
  params.auto_transition_zones, true),
-- 
2.36.1




[PULL for-7.1 0/3] hw/nvme fixes

2022-08-01 Thread Klaus Jensen
From: Klaus Jensen 

Hi,

The following changes since commit 3916603e0c1d909e14e09d5ebcbdaa9c9e21adf3:

  Merge tag 'pull-la-20220729' of https://gitlab.com/rth7680/qemu into staging 
(2022-07-29 17:39:17 -0700)

are available in the Git repository at:

  git://git.infradead.org/qemu-nvme.git tags/nvme-next-pull-request

for you to fetch changes up to e2e137f64282a2ee2f359b6df4cd93c83a308e7b:

  hw/nvme: do not enable ioeventfd by default (2022-08-01 12:01:21 +0200)


hw/nvme fixes

Some fixes for hw/nvme ioeventfd support.



Klaus Jensen (3):
  hw/nvme: skip queue processing if notifier is cleared
  hw/nvme: unregister the event notifier handler on the main loop
  hw/nvme: do not enable ioeventfd by default

 hw/nvme/ctrl.c | 12 +---
 1 file changed, 9 insertions(+), 3 deletions(-)

-- 
2.36.1




[PULL for-7.1 1/3] hw/nvme: skip queue processing if notifier is cleared

2022-08-01 Thread Klaus Jensen
From: Klaus Jensen 

While it is safe to process the queues when they are empty, skip it if
the event notifier callback was invoked spuriously.

Reviewed-by: Keith Busch 
Reviewed-by: Jinhao Fan 
Signed-off-by: Klaus Jensen 
---
 hw/nvme/ctrl.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 533ad14e7a61..8aa73b048d51 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -4238,7 +4238,9 @@ static void nvme_cq_notifier(EventNotifier *e)
 NvmeCQueue *cq = container_of(e, NvmeCQueue, notifier);
 NvmeCtrl *n = cq->ctrl;
 
-event_notifier_test_and_clear(&cq->notifier);
+if (!event_notifier_test_and_clear(e)) {
+return;
+}
 
 nvme_update_cq_head(cq);
 
@@ -4275,7 +4277,9 @@ static void nvme_sq_notifier(EventNotifier *e)
 {
 NvmeSQueue *sq = container_of(e, NvmeSQueue, notifier);
 
-event_notifier_test_and_clear(&sq->notifier);
+if (!event_notifier_test_and_clear(e)) {
+return;
+}
 
 nvme_process_sq(sq);
 }
-- 
2.36.1




[PULL for-7.1 2/3] hw/nvme: unregister the event notifier handler on the main loop

2022-08-01 Thread Klaus Jensen
From: Klaus Jensen 

Make sure the notifier handler is unregistered in the main loop prior to
cleaning it up.

Fixes: 2e53b0b45024 ("hw/nvme: Use ioeventfd to handle doorbell updates")
Reviewed-by: Keith Busch 
Reviewed-by: Jinhao Fan 
Signed-off-by: Klaus Jensen 
---
 hw/nvme/ctrl.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 8aa73b048d51..70b454eedbd8 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -4311,6 +4311,7 @@ static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
 if (sq->ioeventfd_enabled) {
 memory_region_del_eventfd(&n->iomem,
   0x1000 + offset, 4, false, 0, &sq->notifier);
+event_notifier_set_handler(&sq->notifier, NULL);
 event_notifier_cleanup(&sq->notifier);
 }
 g_free(sq->io_req);
@@ -4701,6 +4702,7 @@ static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
 if (cq->ioeventfd_enabled) {
 memory_region_del_eventfd(&n->iomem,
   0x1000 + offset, 4, false, 0, &cq->notifier);
+event_notifier_set_handler(&cq->notifier, NULL);
 event_notifier_cleanup(&cq->notifier);
 }
 if (msix_enabled(&n->parent_obj)) {
-- 
2.36.1