[PATCH v3 4/7] hw/isa/isa-bus: Remove isabus_dev_print()

2022-03-01 Thread Bernhard Beschow
All isabus_dev_print() did was print up to two IRQ numbers per
device. This is redundant now that the IRQ numbers are present as QOM
properties (see e.g. the modified tests/qemu-iotests/172.out).

Now that the last devices relying on isabus_dev_print() have had their
IRQ numbers QOM'ified, the function no longer contributes any
information. Remove it.
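
For illustration, the same information remains visible as plain qdev
properties in the "info qtree"-style output that iotest 172 captures;
an isa-fdc device now lists something like (values and formatting are
a sketch, see the actual 172.out for the authoritative output):

    dev: isa-fdc, id ""
      iobase = 1008 (0x3f0)
      irq = 6 (0x6)
      dma = 2 (0x2)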

Signed-off-by: Bernhard Beschow 
Reviewed-by: Philippe Mathieu-Daudé 
---
 hw/isa/isa-bus.c           | 16 ----------------
 tests/qemu-iotests/172.out | 26 --------------------------
 2 files changed, 42 deletions(-)

diff --git a/hw/isa/isa-bus.c b/hw/isa/isa-bus.c
index 6c31398dda..af5add6a26 100644
--- a/hw/isa/isa-bus.c
+++ b/hw/isa/isa-bus.c
@@ -21,21 +21,18 @@
 #include "qemu/error-report.h"
 #include "qemu/module.h"
 #include "qapi/error.h"
-#include "monitor/monitor.h"
 #include "hw/sysbus.h"
 #include "sysemu/sysemu.h"
 #include "hw/isa/isa.h"
 
 static ISABus *isabus;
 
-static void isabus_dev_print(Monitor *mon, DeviceState *dev, int indent);
 static char *isabus_get_fw_dev_path(DeviceState *dev);
 
 static void isa_bus_class_init(ObjectClass *klass, void *data)
 {
 BusClass *k = BUS_CLASS(klass);
 
-k->print_dev = isabus_dev_print;
 k->get_fw_dev_path = isabus_get_fw_dev_path;
 }
 
@@ -222,19 +219,6 @@ void isa_build_aml(ISABus *bus, Aml *scope)
 }
 }
 
-static void isabus_dev_print(Monitor *mon, DeviceState *dev, int indent)
-{
-ISADevice *d = ISA_DEVICE(dev);
-
-if (d->isairq[1] != -1) {
-monitor_printf(mon, "%*sisa irqs %d,%d\n", indent, "",
-   d->isairq[0], d->isairq[1]);
-} else if (d->isairq[0] != -1) {
-monitor_printf(mon, "%*sisa irq %d\n", indent, "",
-   d->isairq[0]);
-}
-}
-
 static void isabus_bridge_class_init(ObjectClass *klass, void *data)
 {
 DeviceClass *dc = DEVICE_CLASS(klass);
diff --git a/tests/qemu-iotests/172.out b/tests/qemu-iotests/172.out
index 4cf4d536b4..9479b92185 100644
--- a/tests/qemu-iotests/172.out
+++ b/tests/qemu-iotests/172.out
@@ -15,7 +15,6 @@ Testing:
 fdtypeA = "auto"
 fdtypeB = "auto"
 fallback = "288"
-isa irq 6
 bus: floppy-bus.0
   type floppy-bus
   dev: floppy, id ""
@@ -43,7 +42,6 @@ Testing: -fda TEST_DIR/t.qcow2
 fdtypeA = "auto"
 fdtypeB = "auto"
 fallback = "288"
-isa irq 6
 bus: floppy-bus.0
   type floppy-bus
   dev: floppy, id ""
@@ -81,7 +79,6 @@ Testing: -fdb TEST_DIR/t.qcow2
 fdtypeA = "auto"
 fdtypeB = "auto"
 fallback = "288"
-isa irq 6
 bus: floppy-bus.0
   type floppy-bus
   dev: floppy, id ""
@@ -135,7 +132,6 @@ Testing: -fda TEST_DIR/t.qcow2 -fdb TEST_DIR/t.qcow2.2
 fdtypeA = "auto"
 fdtypeB = "auto"
 fallback = "288"
-isa irq 6
 bus: floppy-bus.0
   type floppy-bus
   dev: floppy, id ""
@@ -190,7 +186,6 @@ Testing: -fdb
 fdtypeA = "auto"
 fdtypeB = "auto"
 fallback = "288"
-isa irq 6
 bus: floppy-bus.0
   type floppy-bus
   dev: floppy, id ""
@@ -230,7 +225,6 @@ Testing: -drive if=floppy,file=TEST_DIR/t.qcow2
 fdtypeA = "auto"
 fdtypeB = "auto"
 fallback = "288"
-isa irq 6
 bus: floppy-bus.0
   type floppy-bus
   dev: floppy, id ""
@@ -268,7 +262,6 @@ Testing: -drive if=floppy,file=TEST_DIR/t.qcow2,index=1
 fdtypeA = "auto"
 fdtypeB = "auto"
 fallback = "288"
-isa irq 6
 bus: floppy-bus.0
   type floppy-bus
   dev: floppy, id ""
@@ -322,7 +315,6 @@ Testing: -drive if=floppy,file=TEST_DIR/t.qcow2 -drive 
if=floppy,file=TEST_DIR/t
 fdtypeA = "auto"
 fdtypeB = "auto"
 fallback = "288"
-isa irq 6
 bus: floppy-bus.0
   type floppy-bus
   dev: floppy, id ""
@@ -380,7 +372,6 @@ Testing: -drive if=none,file=TEST_DIR/t.qcow2 -device 
floppy,drive=none0
 fdtypeA = "auto"
 fdtypeB = "auto"
 fallback = "288"
-isa irq 6
 bus: floppy-bus.0
   type floppy-bus
   dev: floppy, id ""
@@ -418,7 +409,6 @@ Testing: -drive if=none,file=TEST_DIR/t.qcow2 -device 
floppy,drive=none0,unit=1
 fdtypeA = "auto"
 fdtypeB = "auto"
 fallback = "288"
-isa irq 6
 bus: floppy-bus.0
   type floppy-bus
   dev: floppy, id ""
@@ -456,7 +446,6 @@ Testing: -drive if=none,file=TEST_DIR/t.qcow2 -drive 
if=none,file=TEST_DIR/t.qco
 fdtypeA = "auto"
 fdtypeB = "auto"
 fal

[PATCH v3 7/7] isa: Inline and remove one-line isa_init_irq()

2022-03-01 Thread Bernhard Beschow
isa_init_irq() has become a trivial one-line wrapper around
isa_get_irq(). Inline it at all call sites and remove it.
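
For reference, the removed wrapper amounted to no more than this (a
sketch reconstructed from the call-site conversions below, not a
verbatim copy of the tree):

    void isa_init_irq(ISADevice *dev, qemu_irq *p, unsigned isairq)
    {
        /* forward to isa_get_irq() and store the result */
        *p = isa_get_irq(dev, isairq);
    }

so every isa_init_irq(d, &x, n) call becomes x = isa_get_irq(d, n).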

Signed-off-by: Bernhard Beschow 
Reviewed-by: Stefan Berger  (tpm_tis_isa)
Acked-by: Corey Minyard  (isa_ipmi_bt,
isa_ipmi_kcs)
Reviewed-by: Philippe Mathieu-Daudé 
---
 hw/audio/cs4231a.c | 2 +-
 hw/audio/gus.c | 2 +-
 hw/audio/sb16.c| 2 +-
 hw/block/fdc-isa.c | 2 +-
 hw/char/parallel.c | 2 +-
 hw/char/serial-isa.c   | 2 +-
 hw/ide/isa.c   | 2 +-
 hw/input/pckbd.c   | 4 ++--
 hw/ipmi/isa_ipmi_bt.c  | 2 +-
 hw/ipmi/isa_ipmi_kcs.c | 2 +-
 hw/isa/isa-bus.c   | 8 +-------
 hw/isa/piix4.c | 2 +-
 hw/net/ne2000-isa.c| 2 +-
 hw/rtc/m48t59-isa.c| 2 +-
 hw/tpm/tpm_tis_isa.c   | 2 +-
 include/hw/isa/isa.h   | 1 -
 16 files changed, 16 insertions(+), 23 deletions(-)

diff --git a/hw/audio/cs4231a.c b/hw/audio/cs4231a.c
index 7d60ce6f0e..0723e39430 100644
--- a/hw/audio/cs4231a.c
+++ b/hw/audio/cs4231a.c
@@ -677,7 +677,7 @@ static void cs4231a_realizefn (DeviceState *dev, Error 
**errp)
 return;
 }
 
-isa_init_irq(d, &s->pic, s->irq);
+s->pic = isa_get_irq(d, s->irq);
 k = ISADMA_GET_CLASS(s->isa_dma);
 k->register_channel(s->isa_dma, s->dma, cs_dma_read, s);
 
diff --git a/hw/audio/gus.c b/hw/audio/gus.c
index e8719ee117..42f010b671 100644
--- a/hw/audio/gus.c
+++ b/hw/audio/gus.c
@@ -282,7 +282,7 @@ static void gus_realizefn (DeviceState *dev, Error **errp)
 s->emu.himemaddr = s->himem;
 s->emu.gusdatapos = s->emu.himemaddr + 1024 * 1024 + 32;
 s->emu.opaque = s;
-isa_init_irq (d, &s->pic, s->emu.gusirq);
+s->pic = isa_get_irq(d, s->emu.gusirq);
 
 AUD_set_active_out (s->voice, 1);
 }
diff --git a/hw/audio/sb16.c b/hw/audio/sb16.c
index 60f1f75e3a..2215386ddb 100644
--- a/hw/audio/sb16.c
+++ b/hw/audio/sb16.c
@@ -1408,7 +1408,7 @@ static void sb16_realizefn (DeviceState *dev, Error 
**errp)
 return;
 }
 
-isa_init_irq (isadev, &s->pic, s->irq);
+s->pic = isa_get_irq(isadev, s->irq);
 
 s->mixer_regs[0x80] = magic_of_irq (s->irq);
 s->mixer_regs[0x81] = (1 << s->dma) | (1 << s->hdma);
diff --git a/hw/block/fdc-isa.c b/hw/block/fdc-isa.c
index ab663dce93..fa20450747 100644
--- a/hw/block/fdc-isa.c
+++ b/hw/block/fdc-isa.c
@@ -94,7 +94,7 @@ static void isabus_fdc_realize(DeviceState *dev, Error **errp)
  isa->iobase, fdc_portio_list, fdctrl,
  "fdc");
 
-isa_init_irq(isadev, &fdctrl->irq, isa->irq);
+fdctrl->irq = isa_get_irq(isadev, isa->irq);
 fdctrl->dma_chann = isa->dma;
 if (fdctrl->dma_chann != -1) {
 IsaDmaClass *k;
diff --git a/hw/char/parallel.c b/hw/char/parallel.c
index b45e67bfbb..adb9bd9be3 100644
--- a/hw/char/parallel.c
+++ b/hw/char/parallel.c
@@ -553,7 +553,7 @@ static void parallel_isa_realizefn(DeviceState *dev, Error 
**errp)
 index++;
 
 base = isa->iobase;
-isa_init_irq(isadev, &s->irq, isa->isairq);
+s->irq = isa_get_irq(isadev, isa->isairq);
 qemu_register_reset(parallel_reset, s);
 
 qemu_chr_fe_set_handlers(&s->chr, parallel_can_receive, NULL,
diff --git a/hw/char/serial-isa.c b/hw/char/serial-isa.c
index 1b8b303079..7a7ed239cd 100644
--- a/hw/char/serial-isa.c
+++ b/hw/char/serial-isa.c
@@ -75,7 +75,7 @@ static void serial_isa_realizefn(DeviceState *dev, Error 
**errp)
 }
 index++;
 
-isa_init_irq(isadev, &s->irq, isa->isairq);
+s->irq = isa_get_irq(isadev, isa->isairq);
 qdev_realize(DEVICE(s), NULL, errp);
 qdev_set_legacy_instance_id(dev, isa->iobase, 3);
 
diff --git a/hw/ide/isa.c b/hw/ide/isa.c
index 24bbde24c2..8bedbd13f1 100644
--- a/hw/ide/isa.c
+++ b/hw/ide/isa.c
@@ -75,7 +75,7 @@ static void isa_ide_realizefn(DeviceState *dev, Error **errp)
 
 ide_bus_init(&s->bus, sizeof(s->bus), dev, 0, 2);
 ide_init_ioport(&s->bus, isadev, s->iobase, s->iobase2);
-isa_init_irq(isadev, &s->irq, s->isairq);
+s->irq = isa_get_irq(isadev, s->isairq);
 ide_init2(&s->bus, s->irq);
 vmstate_register(VMSTATE_IF(dev), 0, &vmstate_ide_isa, s);
 ide_register_restart_cb(&s->bus);
diff --git a/hw/input/pckbd.c b/hw/input/pckbd.c
index eb77e12f6f..1773db0d25 100644
--- a/hw/input/pckbd.c
+++ b/hw/input/pckbd.c
@@ -749,8 +749,8 @@ static void i8042_realizefn(DeviceState *dev, Error **errp)
 return;
 }
 
-isa_init_irq(isadev, &s->irq_kbd, isa_s->kbd_irq);
-isa_init_irq(isadev, &s->irq_mouse, isa_s->mouse_irq);
+s->irq_kbd = isa_get_irq(isadev, isa_s->kbd_irq);
+s->irq_mouse = isa_get_irq(isadev, isa_s->mouse_irq);
 
 isa_register_ioport(isadev, isa_s->io + 0, 0x60);
 isa_register_ioport(isadev, isa_s->io + 1, 0x64);
diff --git a/hw/ipmi/isa_ipmi_bt.c b/hw/ipmi/isa_ipmi_bt.c
index 02625eb94e..88aa734e9e 100644
--- a/hw/ipmi/isa_ipmi_bt.c
+++ b/hw/ipmi/isa_ipmi_bt.c
@@ -92,7 +92,7 @@ static void isa_ipmi_bt_realize(DeviceState *dev, Error 
**errp)
 }
 
 if (iib->isairq > 0) {
- 

[PATCH 2/3] block/copy-before-write: add on-cbw-error open parameter

2022-03-01 Thread Vladimir Sementsov-Ogievskiy
Currently, behavior on copy-before-write operation failure is simple:
report the error to the guest.

Let's implement an alternative behavior: break the whole
copy-before-write process (and the corresponding backup job or NBD
client) but keep the guest working. This is needed when we consider
guest stability more important.

The implementation is simple: on copy-before-write failure we
immediately continue the guest write operation and set the
s->snapshot_error variable, which makes all further and in-flight
snapshot-API requests fail.
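
As a usage sketch, the new behavior can be requested when creating the
filter node (QMP; the node names are illustrative, option names come
from the schema below):

    { "execute": "blockdev-add",
      "arguments": {
          "driver": "copy-before-write",
          "node-name": "cbw",
          "file": "source-node",
          "target": "target-node",
          "on-cbw-error": "break-snapshot" } }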

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 qapi/block-core.json  | 27 ++-
 block/copy-before-write.c | 57 +--
 2 files changed, 75 insertions(+), 9 deletions(-)

diff --git a/qapi/block-core.json b/qapi/block-core.json
index f13b5ff942..e5206272aa 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -4159,6 +4159,27 @@
   'base': 'BlockdevOptionsGenericFormat',
   'data': { '*bottom': 'str' } }
 
+##
+# @OnCbwError:
+#
+# An enumeration of possible behaviors for copy-before-write operation
+# failures.
+#
+# @break-guest-write: report the error to the guest. This way, the state
+# of the copy-before-write process is kept OK and the
+# copy-before-write filter continues to work normally.
+#
+# @break-snapshot: continue the guest write. From this point on, the
+#  snapshot state provided by the copy-before-write filter
+#  is broken, so all in-flight and further snapshot-access
+#  operations (through the snapshot-access block driver)
+#  will fail.
+#
+# Since: 7.0
+##
+{ 'enum': 'OnCbwError',
+  'data': [ 'break-guest-write', 'break-snapshot' ] }
+
 ##
 # @BlockdevOptionsCbw:
 #
@@ -4180,11 +4201,15 @@
 #  modifications (or removing) of specified bitmap doesn't
 #  influence the filter. (Since 7.0)
 #
+# @on-cbw-error: Behavior on failure of copy-before-write operation.
+#Default is @break-guest-write. (Since 7.0)
+#
 # Since: 6.2
 ##
 { 'struct': 'BlockdevOptionsCbw',
   'base': 'BlockdevOptionsGenericFormat',
-  'data': { 'target': 'BlockdevRef', '*bitmap': 'BlockDirtyBitmap' } }
+  'data': { 'target': 'BlockdevRef', '*bitmap': 'BlockDirtyBitmap',
+'*on-cbw-error': 'OnCbwError' } }
 
 ##
 # @BlockdevOptions:
diff --git a/block/copy-before-write.c b/block/copy-before-write.c
index 56aa7577c3..e89cc9799c 100644
--- a/block/copy-before-write.c
+++ b/block/copy-before-write.c
@@ -41,6 +41,7 @@
 typedef struct BDRVCopyBeforeWriteState {
 BlockCopyState *bcs;
 BdrvChild *target;
+OnCbwError on_cbw_error;
 
 /*
  * @lock: protects access to @access_bitmap, @done_bitmap and
@@ -65,6 +66,14 @@ typedef struct BDRVCopyBeforeWriteState {
  * node. These areas must not be rewritten by guest.
  */
 BlockReqList frozen_read_reqs;
+
+/*
+ * @snapshot_error is normally zero. But on the first copy-before-write
+ * failure when @on_cbw_error == ON_CBW_ERROR_BREAK_SNAPSHOT,
+ * @snapshot_error takes the value of that error (< 0). After that, all
+ * in-flight and further snapshot-API requests will fail with that error.
+ */
+int snapshot_error;
 } BDRVCopyBeforeWriteState;
 
 static coroutine_fn int cbw_co_preadv(
@@ -99,11 +108,25 @@ static coroutine_fn int 
cbw_do_copy_before_write(BlockDriverState *bs,
 end = QEMU_ALIGN_UP(offset + bytes, cluster_size);
 
 ret = block_copy(s->bcs, off, end - off, true);
-if (ret < 0) {
+if (ret < 0 && s->on_cbw_error == ON_CBW_ERROR_BREAK_GUEST_WRITE) {
 return ret;
 }
 
 WITH_QEMU_LOCK_GUARD(&s->lock) {
+if (ret < 0) {
+assert(s->on_cbw_error == ON_CBW_ERROR_BREAK_SNAPSHOT);
+if (!s->snapshot_error) {
+s->snapshot_error = ret;
+}
+/*
+ * No need to wait for s->frozen_read_reqs: they will fail anyway,
+ * as s->snapshot_error is set.
+ *
+ * We return 0, as the error is handled. The guest operation
+ * should be continued.
+ */
+return 0;
+}
 bdrv_set_dirty_bitmap(s->done_bitmap, off, end - off);
 reqlist_wait_all(&s->frozen_read_reqs, off, end - off, &s->lock);
 }
@@ -176,6 +199,11 @@ static BlockReq *cbw_snapshot_read_lock(BlockDriverState 
*bs,
 
 QEMU_LOCK_GUARD(&s->lock);
 
+if (s->snapshot_error) {
+g_free(req);
+return NULL;
+}
+
 if (bdrv_dirty_bitmap_next_zero(s->access_bitmap, offset, bytes) != -1) {
 g_free(req);
 return NULL;
@@ -198,19 +226,26 @@ static BlockReq *cbw_snapshot_read_lock(BlockDriverState 
*bs,
 return req;
 }
 
-static void cbw_snapshot_read_unlock(BlockDriverState *bs, BlockReq *req)
+static int cbw_snapshot_read_unlock(BlockDriverState *bs, BlockReq *req)
 {
 BDRVCopyBeforeWriteState *s = bs->opaque;
 
 if (req->offset == -1 && req->by

[PATCH 1/3] block/copy-before-write: refactor option parsing

2022-03-01 Thread Vladimir Sementsov-Ogievskiy
We are going to add one more option, of enum type. Let's refactor
option parsing so that we can simply work with a BlockdevOptionsCbw
object.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/copy-before-write.c | 66 ---
 1 file changed, 40 insertions(+), 26 deletions(-)

diff --git a/block/copy-before-write.c b/block/copy-before-write.c
index 0b6d26605c..56aa7577c3 100644
--- a/block/copy-before-write.c
+++ b/block/copy-before-write.c
@@ -24,6 +24,7 @@
  */
 
 #include "qemu/osdep.h"
+#include "qapi/qmp/qjson.h"
 
 #include "sysemu/block-backend.h"
 #include "qemu/cutils.h"
@@ -328,46 +329,49 @@ static void cbw_child_perm(BlockDriverState *bs, 
BdrvChild *c,
 }
 }
 
-static bool cbw_parse_bitmap_option(QDict *options, BdrvDirtyBitmap **bitmap,
-Error **errp)
+static BlockdevOptionsCbw *cbw_parse_options(QDict *options, Error **errp)
 {
-QDict *bitmap_qdict = NULL;
-BlockDirtyBitmap *bmp_param = NULL;
+QDict *cbw_qdict = NULL;
+BlockdevOptionsCbw *opts = NULL;
 Visitor *v = NULL;
-bool ret = false;
 
-*bitmap = NULL;
+cbw_qdict = qdict_clone_shallow(options);
 
-qdict_extract_subqdict(options, &bitmap_qdict, "bitmap.");
-if (!qdict_size(bitmap_qdict)) {
-ret = true;
-goto out;
-}
+/*
+ * Delete the BlockdevOptions base fields that are not part of
+ * BlockdevOptionsCbw.
+ */
+qdict_del(cbw_qdict, "driver");
+qdict_del(cbw_qdict, "node-name");
+qdict_del(cbw_qdict, "discard");
+qdict_del(cbw_qdict, "cache");
+qdict_extract_subqdict(cbw_qdict, NULL, "cache.");
+qdict_del(cbw_qdict, "read-only");
+qdict_del(cbw_qdict, "auto-read-only");
+qdict_del(cbw_qdict, "force-share");
+qdict_del(cbw_qdict, "detect-zeroes");
 
-v = qobject_input_visitor_new_flat_confused(bitmap_qdict, errp);
+v = qobject_input_visitor_new_flat_confused(cbw_qdict, errp);
 if (!v) {
 goto out;
 }
 
-visit_type_BlockDirtyBitmap(v, NULL, &bmp_param, errp);
-if (!bmp_param) {
+visit_type_BlockdevOptionsCbw(v, NULL, &opts, errp);
+if (!opts) {
 goto out;
 }
 
-*bitmap = block_dirty_bitmap_lookup(bmp_param->node, bmp_param->name, NULL,
-errp);
-if (!*bitmap) {
-goto out;
-}
-
-ret = true;
+/*
+ * Remove the options that we have parsed through the
+ * BlockdevOptionsCbw object from the original options QDict.
+ */
+qdict_extract_subqdict(options, NULL, "bitmap");
 
 out:
-qapi_free_BlockDirtyBitmap(bmp_param);
 visit_free(v);
-qobject_unref(bitmap_qdict);
+qobject_unref(cbw_qdict);
 
-return ret;
+return opts;
 }
 
 static int cbw_open(BlockDriverState *bs, QDict *options, int flags,
@@ -376,6 +380,12 @@ static int cbw_open(BlockDriverState *bs, QDict *options, 
int flags,
 BDRVCopyBeforeWriteState *s = bs->opaque;
 BdrvDirtyBitmap *bitmap = NULL;
 int64_t cluster_size;
+g_autoptr(BlockdevOptionsCbw) opts = NULL;
+
+opts = cbw_parse_options(options, errp);
+if (!opts) {
+return -EINVAL;
+}
 
 bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY,
@@ -390,8 +400,12 @@ static int cbw_open(BlockDriverState *bs, QDict *options, 
int flags,
 return -EINVAL;
 }
 
-if (!cbw_parse_bitmap_option(options, &bitmap, errp)) {
-return -EINVAL;
+if (opts->has_bitmap) {
+bitmap = block_dirty_bitmap_lookup(opts->bitmap->node,
+   opts->bitmap->name, NULL, errp);
+if (!bitmap) {
+return -EINVAL;
+}
 }
 
 bs->total_sectors = bs->file->bs->total_sectors;
-- 
2.31.1




[PATCH 3/3] iotests: add copy-before-write: on-cbw-error tests

2022-03-01 Thread Vladimir Sementsov-Ogievskiy
Add tests for the new copy-before-write filter option: on-cbw-error.
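
The test can be invoked like the other directory-based iotests (a
sketch; run from the build tree's tests/qemu-iotests directory, test
name resolution for the tests/ subdirectory assumed):

    ./check -qcow2 copy-before-write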

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 tests/qemu-iotests/tests/copy-before-write| 128 ++
 .../qemu-iotests/tests/copy-before-write.out  |   5 +
 2 files changed, 133 insertions(+)
 create mode 100755 tests/qemu-iotests/tests/copy-before-write
 create mode 100644 tests/qemu-iotests/tests/copy-before-write.out

diff --git a/tests/qemu-iotests/tests/copy-before-write 
b/tests/qemu-iotests/tests/copy-before-write
new file mode 100755
index 00..a32608f597
--- /dev/null
+++ b/tests/qemu-iotests/tests/copy-before-write
@@ -0,0 +1,128 @@
+#!/usr/bin/env python3
+# group: auto backup
+#
+# Copyright (c) 2022 Virtuozzo International GmbH
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+import os
+import re
+
+import iotests
+from iotests import qemu_img_create, qemu_io
+
+
+temp_img = os.path.join(iotests.test_dir, 'temp')
+source_img = os.path.join(iotests.test_dir, 'source')
+size = '1M'
+
+
+class TestCbwError(iotests.QMPTestCase):
+def tearDown(self):
+self.vm.shutdown()
+os.remove(temp_img)
+os.remove(source_img)
+
+def setUp(self):
+qemu_img_create('-f', iotests.imgfmt, source_img, size)
+qemu_img_create('-f', iotests.imgfmt, temp_img, size)
+qemu_io('-c', 'write 0 1M', source_img)
+
+self.vm = iotests.VM()
+self.vm.launch()
+
+def do_cbw_error(self, on_cbw_error):
+result = self.vm.qmp('blockdev-add', {
+'node-name': 'cbw',
+'driver': 'copy-before-write',
+'on-cbw-error': on_cbw_error,
+'file': {
+'driver': iotests.imgfmt,
+'file': {
+'driver': 'file',
+'filename': source_img,
+}
+},
+'target': {
+'driver': iotests.imgfmt,
+'file': {
+'driver': 'blkdebug',
+'image': {
+'driver': 'file',
+'filename': temp_img
+},
+'inject-error': [
+{
+'event': 'write_aio',
+'errno': 5,
+'immediately': False,
+'once': True
+}
+]
+}
+}
+})
+self.assert_qmp(result, 'return', {})
+
+result = self.vm.qmp('blockdev-add', {
+'node-name': 'access',
+'driver': 'snapshot-access',
+'file': 'cbw'
+})
+self.assert_qmp(result, 'return', {})
+
+result = self.vm.hmp_qemu_io('cbw', 'write 0 1M')
+self.assert_qmp(result, 'return', '')
+
+result = self.vm.hmp_qemu_io('access', 'read 0 1M')
+self.assert_qmp(result, 'return', '')
+
+self.vm.shutdown()
+log = self.vm.get_log()
+log = re.sub(r'^\[I \d+\.\d+\] OPENED\n', '', log)
+log = re.sub(r'\[I \+\d+\.\d+\] CLOSED\n?$', '', log)
+log = iotests.filter_qemu_io(log)
+return log
+
+def test_break_snapshot_on_cbw_error(self):
+"""break-snapshot behavior:
+Guest write succeeds, but further snapshot-read fails, as the snapshot
+is broken.
+"""
+log = self.do_cbw_error('break-snapshot')
+
+self.assertEqual(log, """\
+wrote 1048576/1048576 bytes at offset 0
+1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read failed: Permission denied
+""")
+
+def test_break_guest_write_on_cbw_error(self):
+"""break-guest-write behavior:
+Guest write fails, but snapshot-access continues working and further
+snapshot-read succeeds.
+"""
+log = self.do_cbw_error('break-guest-write')
+
+self.assertEqual(log, """\
+write failed: Input/output error
+read 1048576/1048576 bytes at offset 0
+1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+""")
+
+
+if __name__ == '__main__':
+iotests.main(supported_fmts=['qcow2'],
+ supported_protocols=['file'])
diff --git a/tests/qemu-iotests/tests/copy-before-write.out 
b/tests/qemu-iotests/tests/copy-before-write.out
new file mode 100644
index 00..fbc63e62f8
--- /dev/null
+++ b/tests/qemu-iotests

[PATCH 0/3] block: copy-before-write: on-cbw-error behavior

2022-03-01 Thread Vladimir Sementsov-Ogievskiy
Hi all!

Here is a new option for copy-before-write filter, to alter its behavior
on copy-before-write operation failure.

Based-on: <20220228113927.1852146-1-vsement...@virtuozzo.com>
   ([PATCH v5 00/16] Make image fleecing more usable)

Vladimir Sementsov-Ogievskiy (3):
  block/copy-before-write: refactor option parsing
  block/copy-before-write: add on-cbw-error open parameter
  iotests: add copy-before-write: on-cbw-error tests

 qapi/block-core.json  |  27 +++-
 block/copy-before-write.c | 123 -
 tests/qemu-iotests/tests/copy-before-write| 128 ++
 .../qemu-iotests/tests/copy-before-write.out  |   5 +
 4 files changed, 248 insertions(+), 35 deletions(-)
 create mode 100755 tests/qemu-iotests/tests/copy-before-write
 create mode 100644 tests/qemu-iotests/tests/copy-before-write.out

-- 
2.31.1




[PATCH] block: Fix BB.root changing across bdrv_next()

2022-03-01 Thread Hanna Reitz
bdrv_next() has no guarantee that its caller has stopped all block graph
operations; for example, bdrv_flush_all() does not.

The latter can actually provoke such operations, because its
bdrv_flush() call, which runs a coroutine (bdrv_co_flush()), may run
this coroutine in a different AioContext than the main one, and then
when this coroutine is done and invokes aio_wait_kick(), the monitor may
get a chance to run and start executing some graph-modifying QMP
command.

One example of this is when the VM encounters an I/O error on a block
device and stops, triggering a bdrv_flush_all(), and a blockdev-mirror
is started simultaneously on a block node in an I/O thread.  When
bdrv_flush_all() comes to that node[1] and flushes it, the
aio_wait_kick() at the end of bdrv_co_flush_entry() may cause the
monitor to process the mirror request, and mirror_start_job() will then
replace the node by a mirror filter node, before bdrv_flush_all()
resumes and can invoke bdrv_next() again to continue iterating.

[1] Say there is a BlockBackend on top of the node in question, and so
bdrv_next() finds that BB and returns the node as the BB's blk_bs().
bdrv_next() will bdrv_ref() the node such that it remains valid through
bdrv_flush_all()'s iteration, and drop the reference when it is called
the next time.

The problem is that bdrv_next() does not store to which BDS it retains a
strong reference when the BDS is a BB's child, so on the subsequent
call, it will just invoke blk_bs() again and bdrv_unref() the returned
node -- but as the example above shows, this node might be a different
one than the one that was bdrv_ref()-ed before.  This can lead to a
use-after-free (for the mirror filter node in our example), because this
negligent bdrv_unref() would steal a strong reference from someone else.

We can solve this problem by always storing the returned (and strongly
referenced) BDS in BdrvNextIterator.bs.  When we want to drop the strong
reference of a BDS previously returned, always drop BdrvNextIterator.bs
instead of trying to figure out by other means which BDS we returned
last time.
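
For context, callers iterate with the usual pattern (a sketch, cf.
bdrv_flush_all()):

    BdrvNextIterator it;
    BlockDriverState *bs;

    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        /* bs is kept alive by the strong reference bdrv_next() took;
         * with this fix, that reference is recorded in it.bs, so the
         * next call drops exactly the node it previously returned. */
    }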

Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=2058457
Signed-off-by: Hanna Reitz 
---
Sadly, I didn't find a nice way to test this, primarily because I
believe reproducing this requires a bdrv_flush_all() to come from the
VM (without QMP command).  The only way I know that this can happen is
when the VM encounters an I/O error and responds with stopping the
guest.
It's also anything but easily reproducible, and I can't think of a way
to improve on that, because it really seems to be based on chance
whether the aio_wait_kick() wakes up the monitor and has it process an
incoming QMP command.
---
 block/block-backend.c | 9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/block/block-backend.c b/block/block-backend.c
index 4ff6b4d785..c822f257dc 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -575,7 +575,7 @@ BlockBackend *blk_next(BlockBackend *blk)
  * the monitor or attached to a BlockBackend */
 BlockDriverState *bdrv_next(BdrvNextIterator *it)
 {
-BlockDriverState *bs, *old_bs;
+BlockDriverState *bs, *old_bs = it->bs;
 
 /* Must be called from the main loop */
 assert(qemu_get_current_aio_context() == qemu_get_aio_context());
@@ -586,8 +586,6 @@ BlockDriverState *bdrv_next(BdrvNextIterator *it)
 if (it->phase == BDRV_NEXT_BACKEND_ROOTS) {
 BlockBackend *old_blk = it->blk;
 
-old_bs = old_blk ? blk_bs(old_blk) : NULL;
-
 do {
 it->blk = blk_all_next(it->blk);
 bs = it->blk ? blk_bs(it->blk) : NULL;
@@ -601,11 +599,12 @@ BlockDriverState *bdrv_next(BdrvNextIterator *it)
 if (bs) {
 bdrv_ref(bs);
 bdrv_unref(old_bs);
+it->bs = bs;
 return bs;
 }
 it->phase = BDRV_NEXT_MONITOR_OWNED;
-} else {
-old_bs = it->bs;
+/* Start from the first monitor-owned BDS */
+it->bs = NULL;
 }
 
 /* Then return the monitor-owned BDSes without a BB attached. Ignore all
-- 
2.34.1




Re: [PATCH v7 00/31] block layer: split block APIs in global state and I/O

2022-03-01 Thread Kevin Wolf
On 11.02.2022 at 15:51, Emanuele Giuseppe Esposito wrote:
> Currently, block layer APIs like block.h contain a mix of
> functions that are either running in the main loop and under the
> BQL, or are thread-safe functions and run in iothreads performing I/O.
> The functions running under BQL also take care of modifying the
> block graph, by using drain and/or aio_context_acquire/release.
> This makes it very confusing to understand where each function
> runs, and what assumptions it provides with regard to thread
> safety.
> 
> We call the functions running under BQL "global state (GS) API", and
> distinguish them from the thread-safe "I/O API".
> 
> The aim of this series is to split the relevant block headers in
> global state and I/O sub-headers. The division will be done in
> this way:
> header.h will be split into header-global-state.h, header-io.h and
> header-common.h. The latter will just contain the data structures
> needed by header-global-state and header-io, and common helpers
> that are neither in GS nor in I/O. header.h will remain for
> legacy and to avoid changing all includes in all QEMU c files,
> but will only include the two new headers. No function shall be
> added in header.c.
> Once we split all relevant headers, it will be much easier to see what
> uses the AioContext lock and remove it, which is the overall main
> goal of this and other series that I posted/will post.
> 
> In addition to splitting the relevant headers shown in this series,
> it is also very helpful splitting the function pointers in some
> block structures, to understand what runs under AioContext lock and
> what doesn't. This is what patches 21-27 do.
> 
> Each function in the GS API will have an assertion, checking
> that it is always running under BQL.
> I/O functions are instead thread safe (or so should be), meaning
> that they *can* run under BQL, but also in an iothread in another
> AioContext. Therefore they do not provide any assertion, and
> need to be audited manually to verify the correctness.
> 
> Adding assertions has already helped find 2 bugs, as shown in
> my series "Migration: fix missing iothread locking".
> 
> Tested this series by running unit tests, qemu-iotests and qtests
> (x86_64).
> Some functions in the GS API are used everywhere but not
> properly tested. Therefore their assertion is never actually run in
> the tests, so despite my very careful auditing, it cannot be
> excluded that some will trigger while actually using QEMU.
> 
> Patch 1 introduces qemu_in_main_thread(), the function used in
> all assertions. This had to be introduced otherwise all unit tests
> would fail, since they run in the main loop but use the code in
> stubs/iothread.c
> Patches 2-27 (with the exception of patch 9-10, that are an additional
> assert) are all structured in the same way: first we split the header
> and in the next (or same, if small) patch we add assertions.
> Patch 28-31 take care instead of the block layer permission API,
> fixing some bugs where they are used in I/O functions.
> 
> This series depends on my previous series "block layer: permission API
> refactoring in preparation to the API split"
> 
> Based-on: <20220209105452.1694545-1-eespo...@redhat.com>
> 
> Signed-off-by: Emanuele Giuseppe Esposito 

Verifying the correctness of all the categorisations looked hard, so
instead of that I'll give you something to review now. :-)

The best way I could think of is to hack up a small script that checks
the consistency of all the macro annotations, i.e. no direct caller of
IO_CODE() may indirectly call GLOBAL_STATE_CODE() etc.

I'll attach that script so you can verify if the approach makes sense to
you and if my code is correct.

The good news is that the resulting list of errors is relatively small,
but it's not entirely empty. A good number of them look like false
positives (probably everything going through bdrv_co_drain_bh_cb()), but
there seem to be a few real ones, too:

Error: bdrv_get_full_backing_filename() is IO_CODE(), but calls 
GLOBAL_STATE_CODE() code
   bdrv_get_full_backing_filename() -> bdrv_make_absolute_filename() -> 
bdrv_dirname() -> GLOBAL_STATE_CODE()

Error: blk_error_action() is IO_CODE(), but calls GLOBAL_STATE_CODE() code
   blk_error_action() -> send_qmp_error_event() -> 
blk_iostatus_is_enabled() -> GLOBAL_STATE_CODE()

Error: bdrv_drained_end_no_poll() is IO_CODE(), but calls GLOBAL_STATE_CODE() 
code
   bdrv_drained_end_no_poll() -> bdrv_do_drained_end() -> 
bdrv_do_drained_end() -> bdrv_co_yield_to_drain() -> bdrv_co_drain_bh_cb() -> 
bdrv_drain_all_begin() -> GLOBAL_STATE_CODE()

Error: nbd_co_do_establish_connection() is IO_CODE(), but calls 
GLOBAL_STATE_CODE() code
   nbd_co_do_establish_connection() -> nbd_handle_updated_info() -> 
bdrv_apply_auto_read_only() -> GLOBAL_STATE_CODE()

Error: bdrv_drained_end_no_poll() is IO_CODE(), but calls IO_OR_GS_CODE() code
   bdrv_drained_end_no_poll() -> bdrv_do_drained_end() -> 
bd

Re: [PATCH v2 0/6] hw/nvme: enhanced protection information (64-bit guard)

2022-03-01 Thread Keith Busch
On Tue, Mar 01, 2022 at 11:44:22AM +0100, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> This adds support for one possible new protection information format
> introduced in TP4068 (and integrated in NVMe 2.0): the 64-bit CRC guard
> and 48-bit reference tag. This version does not support storage tags.
> 
> Like the CRC16 support already present, this uses a software
> implementation of CRC64 (so it is naturally pretty slow). But it's good
> enough for verification purposes.
> 
> This goes hand-in-hand with the support that Keith submitted for the
> Linux kernel[1].
> 
>   [1]: 
> https://lore.kernel.org/linux-nvme/20220201190128.3075065-1-kbu...@kernel.org/

Thanks Klaus, this looks good to me.

Reviewed-by: Keith Busch 
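
For context, a software CRC-64 like the one referenced above is
typically a plain bit-by-bit loop; a generic sketch follows (the
reflected CRC-64/NVME polynomial constant is quoted from memory of
TP4068 and should be double-checked against the spec):

    #include <stddef.h>
    #include <stdint.h>

    /* Rocksoft-model CRC-64/NVME: init and xorout all-ones, reflected
     * input/output; 0x9a6c9329ac4bc9b5 is the reflected polynomial. */
    static uint64_t crc64_nvme(const uint8_t *buf, size_t len)
    {
        uint64_t crc = ~0ULL;

        while (len--) {
            crc ^= *buf++;
            for (int i = 0; i < 8; i++) {
                crc = (crc >> 1) ^ ((crc & 1) ? 0x9a6c9329ac4bc9b5ULL : 0);
            }
        }
        return ~crc;
    }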



Re: [RFC PATCH 0/5] Removal of AioContext lock, bs->parents and ->children: proof of concept

2022-03-01 Thread Emanuele Giuseppe Esposito
I would really love to hear opinions on this, since we already had some
discussions on other similar patches.

Thank you,
Emanuele

On 01/03/2022 15:21, Emanuele Giuseppe Esposito wrote:
> [...]

[RFC PATCH 4/5] child_job_drained_poll: override polling condition only when in home thread

2022-03-01 Thread Emanuele Giuseppe Esposito
drv->drained_poll() is only implemented in mirror, and allows
it to drain from the coroutine. The mirror implementation uses the
in_drain flag to recognize when it is draining from a coroutine,
and consequently avoids deadlocking (having the poll condition in
child_job_drained_poll wait for itself).

The problem is that this flag is dangerous, because it breaks
the bdrv_drained_begin() invariant: once drained_begin ends, all
jobs, in-flight requests, and anything running in the iothread
are blocked.

This can be broken in such way:
iothread(mirror): s->in_drain = true; // mirror.c:1112
main loop: bdrv_drained_begin(mirror_bs);
/*
 * drained_begin waits for the bdrv_drain_poll_top_level() condition,
 * which translates to child_job_drained_poll() for jobs; but
 * mirror implements drv->drained_poll(), so it returns
 * !!in_flight_requests, which is 0 (assertion in mirror.c:1105).
 */
main loop: thinks iothread is stopped and is modifying the graph...
iothread(mirror): *continues*, as nothing is stopping it
iothread(mirror): bdrv_drained_begin(bs);
/* draining reads the graph while it is modified!! */
main loop: done modifying the graph...

In order to fix this, we can simply allow drv->drained_poll()
to be called only by the iothread, and not the main loop.
We distinguish the two by using in_aio_context_home_thread(), which
returns false if the calling thread is not the one that runs @ctx.

Co-Developed-by: Paolo Bonzini 
Signed-off-by: Emanuele Giuseppe Esposito 
---
 blockjob.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/blockjob.c b/blockjob.c
index 10815a89fe..e132d9587e 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -107,6 +107,7 @@ static bool child_job_drained_poll(BdrvChild *c)
 BlockJob *bjob = c->opaque;
 Job *job = &bjob->job;
 const BlockJobDriver *drv = block_job_driver(bjob);
+AioContext *ctx = block_job_get_aio_context(bjob);
 
 /* An inactive or completed job doesn't have any pending requests. Jobs
  * with !job->busy are either already paused or have a pause point after
@@ -117,7 +118,7 @@ static bool child_job_drained_poll(BdrvChild *c)
 
 /* Otherwise, assume that it isn't fully stopped yet, but allow the job to
  * override this assumption. */
-if (drv->drained_poll) {
+if (in_aio_context_home_thread(ctx) && drv->drained_poll) {
 return drv->drained_poll(bjob);
 } else {
 return true;
-- 
2.31.1




[RFC PATCH 1/5] aio-wait.h: introduce AIO_WAIT_WHILE_UNLOCKED

2022-03-01 Thread Emanuele Giuseppe Esposito
Same as the AIO_WAIT_WHILE macro, but if we are in the main loop
do not release and then re-acquire ctx_'s AioContext lock.

Once all AioContext locks go away, this macro will replace
AIO_WAIT_WHILE.
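
An illustrative use (a sketch; the condition and caller are made up
for the example):

    /* Poll the main loop until all requests on bs have settled, without
     * releasing an AioContext lock that the caller does not hold. */
    AIO_WAIT_WHILE_UNLOCKED(bdrv_get_aio_context(bs),
                            qatomic_read(&bs->in_flight) > 0);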

Signed-off-by: Emanuele Giuseppe Esposito 
---
 include/block/aio-wait.h | 15 +++
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/include/block/aio-wait.h b/include/block/aio-wait.h
index b39eefb38d..ff27fe4eab 100644
--- a/include/block/aio-wait.h
+++ b/include/block/aio-wait.h
@@ -59,10 +59,11 @@ typedef struct {
 extern AioWait global_aio_wait;
 
 /**
- * AIO_WAIT_WHILE:
+ * _AIO_WAIT_WHILE:
  * @ctx: the aio context, or NULL if multiple aio contexts (for which the
  *   caller does not hold a lock) are involved in the polling condition.
  * @cond: wait while this conditional expression is true
+ * @unlock: whether to unlock and then lock again @ctx
  *
  * Wait while a condition is true.  Use this to implement synchronous
  * operations that require event loop activity.
@@ -75,7 +76,7 @@ extern AioWait global_aio_wait;
  * wait on conditions between two IOThreads since that could lead to deadlock,
  * go via the main loop instead.
  */
-#define AIO_WAIT_WHILE(ctx, cond) ({   \
+#define _AIO_WAIT_WHILE(ctx, cond, unlock) ({  \
 bool waited_ = false;  \
 AioWait *wait_ = &global_aio_wait; \
 AioContext *ctx_ = (ctx);  \
@@ -90,11 +91,11 @@ extern AioWait global_aio_wait;
 assert(qemu_get_current_aio_context() ==   \
qemu_get_aio_context());\
 while ((cond)) {   \
-if (ctx_) {\
+if (unlock && ctx_) {  \
 aio_context_release(ctx_); \
 }  \
 aio_poll(qemu_get_aio_context(), true);\
-if (ctx_) {\
+if (unlock && ctx_) {  \
 aio_context_acquire(ctx_); \
 }  \
 waited_ = true;\
@@ -103,6 +104,12 @@ extern AioWait global_aio_wait;
 qatomic_dec(&wait_->num_waiters);  \
 waited_; })
 
+#define AIO_WAIT_WHILE(ctx, cond)  \
+_AIO_WAIT_WHILE(ctx, cond, true)
+
+#define AIO_WAIT_WHILE_UNLOCKED(ctx, cond) \
+_AIO_WAIT_WHILE(ctx, cond, false)
+
 /**
  * aio_wait_kick:
  * Wake up the main thread if it is waiting on AIO_WAIT_WHILE().  During
-- 
2.31.1




[RFC PATCH 5/5] test-bdrv-drain: ensure draining from main loop stops iothreads

2022-03-01 Thread Emanuele Giuseppe Esposito
Add 2 tests: test_main_and_then_iothread_drain ensures that if the
main thread drains, the iothread cannot drain (and thus read
the graph). test_main_and_iothread_drain instead lets the main loop
and the iothread drain together, and makes sure that no drain
happens in parallel.

Note that we are using bdrv_subtree_drained_{begin/end}_unlocked
to avoid using the AioContext lock as much as possible, since
it will eventually go away.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 tests/unit/test-bdrv-drain.c | 218 +++
 1 file changed, 218 insertions(+)

diff --git a/tests/unit/test-bdrv-drain.c b/tests/unit/test-bdrv-drain.c
index 36be84ae55..bf7fdcb568 100644
--- a/tests/unit/test-bdrv-drain.c
+++ b/tests/unit/test-bdrv-drain.c
@@ -1534,6 +1534,219 @@ static void test_set_aio_context(void)
 iothread_join(b);
 }
 
+typedef struct ParallelDrainJob {
+BlockJob common;
+BlockBackend *blk;
+BlockDriverState *bs;
+IOThread *a;
+bool should_fail;
+bool allowed_to_run;
+bool conclude_test;
+bool job_started;
+} ParallelDrainJob;
+
+typedef struct BDRVParallelTestState {
+ParallelDrainJob *job;
+} BDRVParallelTestState;
+
+static void coroutine_fn bdrv_test_par_co_drain(BlockDriverState *bs)
+{
+BDRVParallelTestState *s = bs->opaque;
+ParallelDrainJob *job = s->job;
+assert(!qatomic_read(&job->should_fail));
+}
+
+static int parallel_run_test(Job *job, Error **errp)
+{
+ParallelDrainJob *s = container_of(job, ParallelDrainJob, common.job);
+s->job_started = true;
+
+while (!s->conclude_test) {
+if (s->allowed_to_run) {
+bdrv_subtree_drained_begin_unlocked(s->bs);
+bdrv_subtree_drained_end_unlocked(s->bs);
+}
+job_pause_point(&s->common.job);
+}
+s->job_started = false;
+
+return 0;
+}
+
+static BlockDriver bdrv_test_parallel = {
+.format_name= "test",
+.instance_size  = sizeof(BDRVParallelTestState),
+.supports_backing   = true,
+
+.bdrv_co_drain_begin= bdrv_test_par_co_drain,
+.bdrv_co_drain_end  = bdrv_test_par_co_drain,
+
+.bdrv_child_perm= bdrv_default_perms,
+};
+
+static bool parallel_drained_poll(BlockJob *job)
+{
+/* need to return false otherwise a drain in coroutine deadlocks */
+return false;
+}
+
+static const BlockJobDriver test_block_job_driver_parallel = {
+.job_driver = {
+.instance_size = sizeof(ParallelDrainJob),
+.run   = parallel_run_test,
+.user_resume   = block_job_user_resume,
+.free  = block_job_free,
+},
+.drained_poll  = parallel_drained_poll,
+};
+
+static ParallelDrainJob *setup_job_test(void)
+{
+BlockBackend *blk;
+BlockDriverState *bs;
+Error *err = NULL;
+IOThread *a = iothread_new();
+AioContext *ctx_a = iothread_get_aio_context(a);
+ParallelDrainJob *s;
+BDRVParallelTestState *p;
+int ret;
+
+blk = blk_new(qemu_get_aio_context(),
+ BLK_PERM_CONSISTENT_READ, BLK_PERM_ALL);
+blk_set_allow_aio_context_change(blk, true);
+
+bs = bdrv_new_open_driver(&bdrv_test_parallel, "parent", 0,
+ &error_abort);
+p = bs->opaque;
+
+ret = blk_insert_bs(blk, bs, &error_abort);
+assert(ret == 0);
+
+s = block_job_create("job1", &test_block_job_driver_parallel,
+ NULL, blk_bs(blk), 0, BLK_PERM_ALL, 0, JOB_DEFAULT,
+ NULL, NULL, &err);
+s->bs = bs;
+s->a = a;
+s->blk = blk;
+p->job = s;
+
+ret = bdrv_try_set_aio_context(bs, ctx_a, &error_abort);
+assert(ret == 0);
+assert(blk_get_aio_context(blk) == ctx_a);
+assert(s->common.job.aio_context == ctx_a);
+
+return s;
+}
+
+static void stop_and_tear_down_test(ParallelDrainJob *s)
+{
+AioContext *ctx = iothread_get_aio_context(s->a);
+
+/* stop iothread */
+s->conclude_test = true;
+
+/* wait that it's stopped */
+while (s->job_started) {
+aio_poll(qemu_get_current_aio_context(), false);
+}
+
+aio_context_acquire(ctx);
+bdrv_unref(s->bs);
+blk_unref(s->blk);
+aio_context_release(ctx);
+iothread_join(s->a);
+}
+
+/**
+ * test_main_and_then_iothread_drain: test that if the main
+ * loop drains, iothread cannot possibly drain.
+ *
+ * In this test, make sure that the main loop has firstly
+ * drained, and then allow the iothread to try and drain.
+ *
+ * Therefore, if the main loop drains, there is no way that
+ * the graph can be read or written by the iothread.
+ */
+static void test_main_and_then_iothread_drain(void)
+{
+ParallelDrainJob *s = setup_job_test();
+
+s->allowed_to_run = false;
+job_start(&s->common.job);
+
+/*
+ * Wait that the iothread starts, even though it just
+ * loops without doing anything (allowed_to_run is false)
+ */
+while (!s->job_started) {
+aio_poll(qemu_g

[RFC PATCH 3/5] block/io.c: introduce bdrv_subtree_drained_{begin/end}_unlocked

2022-03-01 Thread Emanuele Giuseppe Esposito
Same as the locked version, but use BDRV_POLL_WHILE_UNLOCKED.
We are going to add drains to all graph modifications, and they are
generally performed without the AioContext lock taken.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 block/io.c| 48 ---
 include/block/block.h |  2 ++
 2 files changed, 38 insertions(+), 12 deletions(-)

diff --git a/block/io.c b/block/io.c
index 4e4cb556c5..d474449d2d 100644
--- a/block/io.c
+++ b/block/io.c
@@ -242,6 +242,7 @@ typedef struct {
 bool begin;
 bool recursive;
 bool poll;
+bool unlocked;
 BdrvChild *parent;
 bool ignore_bds_parents;
 int *drained_end_counter;
@@ -332,7 +333,7 @@ static bool bdrv_drain_poll_top_level(BlockDriverState *bs, 
bool recursive,
 
 static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
   BdrvChild *parent, bool ignore_bds_parents,
-  bool poll);
+  bool poll, bool unlocked);
 static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
 BdrvChild *parent, bool ignore_bds_parents,
 int *drained_end_counter);
@@ -350,7 +351,8 @@ static void bdrv_co_drain_bh_cb(void *opaque)
 if (data->begin) {
 assert(!data->drained_end_counter);
 bdrv_do_drained_begin(bs, data->recursive, data->parent,
-  data->ignore_bds_parents, data->poll);
+  data->ignore_bds_parents, data->poll,
+  data->unlocked);
 } else {
 assert(!data->poll);
 bdrv_do_drained_end(bs, data->recursive, data->parent,
@@ -372,6 +374,7 @@ static void coroutine_fn 
bdrv_co_yield_to_drain(BlockDriverState *bs,
 BdrvChild *parent,
 bool ignore_bds_parents,
 bool poll,
+bool unlocked,
 int *drained_end_counter)
 {
 BdrvCoDrainData data;
@@ -392,6 +395,7 @@ static void coroutine_fn 
bdrv_co_yield_to_drain(BlockDriverState *bs,
 .parent = parent,
 .ignore_bds_parents = ignore_bds_parents,
 .poll = poll,
+.unlocked = unlocked,
 .drained_end_counter = drained_end_counter,
 };
 
@@ -439,13 +443,13 @@ void bdrv_do_drained_begin_quiesce(BlockDriverState *bs,
 
 static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
   BdrvChild *parent, bool ignore_bds_parents,
-  bool poll)
+  bool poll, bool unlocked)
 {
 BdrvChild *child, *next;
 
 if (qemu_in_coroutine()) {
 bdrv_co_yield_to_drain(bs, true, recursive, parent, ignore_bds_parents,
-   poll, NULL);
+   poll, unlocked, NULL);
 return;
 }
 
@@ -456,7 +460,7 @@ static void bdrv_do_drained_begin(BlockDriverState *bs, 
bool recursive,
 bs->recursive_quiesce_counter++;
 QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
 bdrv_do_drained_begin(child->bs, true, child, ignore_bds_parents,
-  false);
+  false, false);
 }
 }
 
@@ -471,18 +475,30 @@ static void bdrv_do_drained_begin(BlockDriverState *bs, 
bool recursive,
  */
 if (poll) {
 assert(!ignore_bds_parents);
-BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, recursive, parent));
+if (unlocked) {
+BDRV_POLL_WHILE_UNLOCKED(bs,
+ bdrv_drain_poll_top_level(bs, recursive,
+   parent));
+} else {
+BDRV_POLL_WHILE(bs,
+bdrv_drain_poll_top_level(bs, recursive, parent));
+}
 }
 }
 
 void bdrv_drained_begin(BlockDriverState *bs)
 {
-bdrv_do_drained_begin(bs, false, NULL, false, true);
+bdrv_do_drained_begin(bs, false, NULL, false, true, false);
 }
 
 void bdrv_subtree_drained_begin(BlockDriverState *bs)
 {
-bdrv_do_drained_begin(bs, true, NULL, false, true);
+bdrv_do_drained_begin(bs, true, NULL, false, true, false);
+}
+
+void bdrv_subtree_drained_begin_unlocked(BlockDriverState *bs)
+{
+bdrv_do_drained_begin(bs, true, NULL, false, true, true);
 }
 
 /**
@@ -510,7 +526,7 @@ static void bdrv_do_drained_end(BlockDriverState *bs, bool 
recursive,
 
 if (qemu_in_coroutine()) {
 bdrv_co_yield_to_drain(bs, false, recursive, parent, 
ignore_bds_parents,
-   false, drained_end_counter);
+   false, false, drained_end_counter);

[RFC PATCH 0/5] Removal of AioContext lock, bs->parents and ->children: proof of concept

2022-03-01 Thread Emanuele Giuseppe Esposito
This series tries to provide a proof of concept and a clear explanation
of why we need to use drains (and more precisely subtree_drains)
to replace the AioContext lock, especially to protect the
BlockDriverState ->children and ->parents lists.

Just a small recap on the key concepts:
* We split block layer APIs in "global state" (GS), "I/O", and
"global state or I/O".
  GS are running in the main loop, under BQL, and are the only
  one allowed to modify the BlockDriverState graph.

  I/O APIs are thread safe and can run in any thread

  "global state or I/O" are essentially all APIs that use
  BDRV_POLL_WHILE. This is because there can be only 2 threads
  that can use BDRV_POLL_WHILE: main loop and the iothread that
  runs the aiocontext.

* Drains allow the caller (either the main loop or the iothread running
the context) to wait for all in-flight requests and operations
of a BDS: normal drains target a given node and its parents, while
subtree ones also include the subgraph of the node. Siblings are
not affected by either of these two kinds of drain.
After bdrv_drained_begin, no more requests are allowed to come
from the affected nodes. Therefore the only actor left working
on a drained part of the graph should be the main loop.

What do we intend to do
---
We want to remove the AioContext lock. It is not 100% clear how
many things we are protecting with it, and why.
As a starter, we want to protect BlockDriverState ->parents and
->children lists, since they are read by main loop and I/O but
only written by main loop under BQL. The function that modifies
these lists is bdrv_replace_child_common().

How do we want to do it
---
We identified the subtree_drain API as the ideal substitute for the
AioContext lock. The reason is simple: draining prevents the iothread
from reading or writing the nodes, so once the main loop finishes
executing bdrv_drained_begin() on the interested graph, we are sure that
the iothread is not going to look at or interfere with that part of the
graph.
We are also sure that the only two actors that can look at a specific
BlockDriverState in any given context are the main loop and the
iothread running the AioContext (ensured by "global state or IO" logic).
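
The intended replacement pattern for graph-modifying code then looks
like this (a sketch using the helpers introduced in patch 3):

    bdrv_subtree_drained_begin_unlocked(bs);
    /* Iothreads are quiesced: the main loop can now safely modify
     * bs->parents and bs->children, e.g. via
     * bdrv_replace_child_common(). */
    bdrv_subtree_drained_end_unlocked(bs);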

Why use _subtree_ instead of normal drain
---
A simple drain "blocks" a given node and all its parents.
But it doesn't touch the children.
This means that if we use a simple drain, a child can always
keep processing requests, and eventually end up calling
bdrv_drained_begin itself, reading the parent nodes while the main loop
is modifying them. Therefore a subtree drain is necessary.

Possible scenarios
---
Keeping in mind that we can only have an iothread and the main loop
draining on a certain node, we could have:

main loop successfully drains and then iothread tries to drain:
  impossible scenario, as iothread is already stopped once main
  successfully drains.

iothread successfully drains and then main loop drains:
  should not be a problem, as:
  1) the iothread should be already "blocked" by its own drain
  2) main loop would still wait for it to completely block
  There is the issue of mirror overriding such scenario to avoid
  having deadlocks, but that is handled in the next section.

main loop and iothread try to drain together:
  As above, this case doesn't really matter. As long as the
  bdrv_drained_begin invariant is respected, the main loop will
  continue only once the iothread is "blocked" on that part of the graph.

A note on iothread draining
---
Theoretically draining from an iothread should not be possible,
as the iothread would be scheduling a bh in the main loop waiting
for itself to stop, even though it is not yet stopped since it is waiting for 
the bh.

This is what would happen in the tests in patch 5 if .drained_poll
was not implemented.

Therefore, one solution is to use the .drained_poll callback in BlockJobDriver.
This callback overrides the default job poll() behavior, and
allows the polling condition to stop waiting for the job.
It is actually used only in mirror.
This however breaks the bdrv_drained_begin invariant, because the
iothread is not really blocked on that node but continues running.
In order to fix this, patch 4 allows the polling condition to be
used only by the iothread, and not the main loop too, preventing
the drain from returning before the iothread is effectively stopped.
This is also shown in the tests in patch 5. If the fix in patch
4 is removed, then the main loop drain will return earlier and
allow the iothread to run and drain together.

The other patches in this series are cherry-picked from the various
series I already sent, and are included here just to allow the
subtree_drained_begin/end_unlocked implementation.

Emanuele Giuseppe Esposito (5):
  aio-wait.h: introduce AIO_WAIT_WHILE_UNLOCKED
  introduce BDRV_POLL_WHILE_UNLOCKED
  block/io.c: introduce bdrv_subtree_drained_{begin/end}_unlocked
  chil

[RFC PATCH 2/5] introduce BDRV_POLL_WHILE_UNLOCKED

2022-03-01 Thread Emanuele Giuseppe Esposito
Same as BDRV_POLL_WHILE, but uses AIO_WAIT_WHILE_UNLOCKED.
See doc comment for more info.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 include/block/block.h | 5 +
 1 file changed, 5 insertions(+)

diff --git a/include/block/block.h b/include/block/block.h
index e1713ee306..5a7a850c16 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -512,6 +512,11 @@ void bdrv_drain_all(void);
 AIO_WAIT_WHILE(bdrv_get_aio_context(bs_),  \
cond); })
 
+#define BDRV_POLL_WHILE_UNLOCKED(bs, cond) ({  \
+BlockDriverState *bs_ = (bs);  \
+AIO_WAIT_WHILE_UNLOCKED(bdrv_get_aio_context(bs_), \
+cond); })
+
 int generated_co_wrapper bdrv_pdiscard(BdrvChild *child, int64_t offset,
int64_t bytes);
 int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes);
-- 
2.31.1




Re: [PATCH v5 13/15] hw/nvme: Add support for the Virtualization Management command

2022-03-01 Thread Klaus Jensen
On Feb 17 18:45, Lukasz Maniak wrote:
> From: Łukasz Gieryk 
> 
> With the new command one can:
>  - assign flexible resources (queues, interrupts) to primary and
>secondary controllers,
>  - toggle the online/offline state of given controller.
> 

QEMU segfaults (or asserts, depending on the wind blowing) if the
SR-IOV-enabled device is hot-unplugged after being configured (i.e.
follow the docs for a simple setup and then do a `device_del ` in the
monitor). I suspect this is related to freeing the queues and something
getting double-freed.

The device can be removed just fine if SR-IOV is configured (as in,
parameters are set), but no resources are reserved, onlined etc.


Snip from the backtrace (assert):

qemu-system-x86_64: ../util/qemu-thread-posix.c:78: qemu_mutex_lock_impl: 
Assertion `mutex->initialized' failed.

Thread 4 "qemu-system-x86" received signal SIGABRT, Aborted.
[Switching to Thread 0x7fcb8640 (LWP 174907)]
0x7fcb9a85534c in __pthread_kill_implementation () from /usr/lib64/libc.so.6
(gdb) bt
#0  0x7fcb9a85534c in __pthread_kill_implementation () at 
/usr/lib64/libc.so.6
#1  0x7fcb9a8084b8 in raise () at /usr/lib64/libc.so.6
#2  0x7fcb9a7f2534 in abort () at /usr/lib64/libc.so.6
#3  0x7fcb9a7f245c in _nl_load_domain.cold () at /usr/lib64/libc.so.6
#4  0x7fcb9a801116 in  () at /usr/lib64/libc.so.6
#5  0x556c1fffc342 in qemu_mutex_lock_impl (mutex=, 
file=, line=) at ../util/qemu-thread-posix.c:78
#6  qemu_mutex_lock_impl (mutex=, file=, 
line=) at ../util/qemu-thread-posix.c:74
#7  0x556c2001af05 in timer_del (ts=ts@entry=0x7fc978a0) at 
../util/qemu-timer.c:432
#8  0x556c1fc28657 in timer_free (ts=0x7fc978a0) at 
/home/kbj/work/src/qemu/include/qemu/timer.h:633
#9  timer_free (ts=0x7fc978a0) at 
/home/kbj/work/src/qemu/include/qemu/timer.h:630
#10 nvme_free_sq (sq=0x7fc97890, n=, n=) at 
../hw/nvme/ctrl.c:4129
#11 0x556c1fc2a369 in nvme_ctrl_reset (n=0x7fc978436e70, 
rst=NVME_RESET_FUNCTION) at ../hw/nvme/ctrl.c:6007
#12 0x556c1fc2a84c in nvme_virt_set_state (n=n@entry=0x556c22d486b0, 
cntlid=, online=online@entry=0x0) at ../hw/nvme/ctrl.c:5815
#13 0x556c1fc2a5c6 in nvme_ctrl_reset (n=0x556c22d486b0, 
rst=NVME_RESET_FUNCTION) at ../hw/nvme/ctrl.c:6026
#14 0x556c1fc2a9e3 in nvme_exit (pci_dev=0x556c22d486b0) at 
../hw/nvme/ctrl.c:7265
#15 0x556c1fc450e3 in pci_qdev_unrealize (dev=) at 
../hw/pci/pci.c:1200
... more here


Snip from the backtrace (segfault)

Thread 7 "qemu-system-x86" received signal SIGSEGV, Segmentation fault.
[Switching to Thread 0x7f6c635fe640 (LWP 175552)]
0x555e275ab63a in nvme_free_sq (sq=0xfea03000, n=, 
n=) at ../hw/nvme/ctrl.c:4128
4128        n->sq[sq->sqid] = NULL;
(gdb) bt
#0  0x555e275ab63a in nvme_free_sq (sq=0xfea03000, n=, 
n=) at ../hw/nvme/ctrl.c:4128
#1  0x555e275ad369 in nvme_ctrl_reset (n=0x7f6e683793e0, 
rst=NVME_RESET_FUNCTION) at ../hw/nvme/ctrl.c:6007
#2  0x555e275ad84c in nvme_virt_set_state (n=n@entry=0x555e2a2626b0, 
cntlid=, online=online@entry=0x0) at ../hw/nvme/ctrl.c:5815
#3  0x555e275ad5c6 in nvme_ctrl_reset (n=0x555e2a2626b0, 
rst=NVME_RESET_FUNCTION) at ../hw/nvme/ctrl.c:6026
#4  0x555e275ad9e3 in nvme_exit (pci_dev=0x555e2a2626b0) at 
../hw/nvme/ctrl.c:7265
#5  0x555e275c80e3 in pci_qdev_unrealize (dev=) at 
../hw/pci/pci.c:1200
... more here




Re: [PATCH v5 14/15] docs: Add documentation for SR-IOV and Virtualization Enhancements

2022-03-01 Thread Klaus Jensen
On Feb 17 18:45, Lukasz Maniak wrote:
> Signed-off-by: Lukasz Maniak 

Please add a short commit description as well. Otherwise,

Reviewed-by: Klaus Jensen 

> ---
>  docs/system/devices/nvme.rst | 82 
>  1 file changed, 82 insertions(+)
> 
> diff --git a/docs/system/devices/nvme.rst b/docs/system/devices/nvme.rst
> index b5acb2a9c19..aba253304e4 100644
> --- a/docs/system/devices/nvme.rst
> +++ b/docs/system/devices/nvme.rst
> @@ -239,3 +239,85 @@ The virtual namespace device supports DIF- and DIX-based 
> protection information
>to ``1`` to transfer protection information as the first eight bytes of
>metadata. Otherwise, the protection information is transferred as the last
>eight bytes.
> +
> +Virtualization Enhancements and SR-IOV (Experimental Support)
> +--------------------------------------------------------------
> +
> +The ``nvme`` device supports Single Root I/O Virtualization and Sharing
> +along with Virtualization Enhancements. The controller has to be linked to
> +an NVM Subsystem device (``nvme-subsys``) for use with SR-IOV.
> +
> +A number of parameters are present (**please note that they may be
> +subject to change**):
> +
> +``sriov_max_vfs`` (default: ``0``)
> +  Indicates the maximum number of PCIe virtual functions supported
> +  by the controller. Specifying a non-zero value enables reporting of both
> +  SR-IOV and ARI (Alternative Routing-ID Interpretation) capabilities
> +  by the NVMe device. Virtual function controllers will not report SR-IOV.
> +
> +``sriov_vq_flexible``
> +  Indicates the total number of flexible queue resources assignable to all
> +  the secondary controllers. Implicitly sets the number of primary
> +  controller's private resources to ``(max_ioqpairs - sriov_vq_flexible)``.
> +
> +``sriov_vi_flexible``
> +  Indicates the total number of flexible interrupt resources assignable to
> +  all the secondary controllers. Implicitly sets the number of primary
> +  controller's private resources to ``(msix_qsize - sriov_vi_flexible)``.
> +
> +``sriov_max_vi_per_vf`` (default: ``0``)
> +  Indicates the maximum number of virtual interrupt resources assignable
> +  to a secondary controller. The default ``0`` resolves to
> +  ``(sriov_vi_flexible / sriov_max_vfs)``.
> +
> +``sriov_max_vq_per_vf`` (default: ``0``)
> +  Indicates the maximum number of virtual queue resources assignable to
> +  a secondary controller. The default ``0`` resolves to
> +  ``(sriov_vq_flexible / sriov_max_vfs)``.
> +
> +The simplest possible invocation enables the capability to set up one VF
> +controller and assign an admin queue, an IO queue, and an MSI-X interrupt.
> +
> +.. code-block:: console
> +
> +   -device nvme-subsys,id=subsys0
> +   -device nvme,serial=deadbeef,subsys=subsys0,sriov_max_vfs=1,
> +            sriov_vq_flexible=2,sriov_vi_flexible=1
> +
> +The minimum steps required to configure a functional NVMe secondary
> +controller are:
> +
> +  * unbind flexible resources from the primary controller
> +
> +.. code-block:: console
> +
> +   nvme virt-mgmt /dev/nvme0 -c 0 -r 1 -a 1 -n 0
> +   nvme virt-mgmt /dev/nvme0 -c 0 -r 0 -a 1 -n 0
> +
> +  * perform a Function Level Reset on the primary controller to actually
> +release the resources
> +
> +.. code-block:: console
> +
> +   echo 1 > /sys/bus/pci/devices/:01:00.0/reset
> +
> +  * enable VF
> +
> +.. code-block:: console
> +
> +   echo 1 > /sys/bus/pci/devices/:01:00.0/sriov_numvfs
> +
> +  * assign the flexible resources to the VF and set it ONLINE
> +
> +.. code-block:: console
> +
> +   nvme virt-mgmt /dev/nvme0 -c 1 -r 1 -a 8 -n 1
> +   nvme virt-mgmt /dev/nvme0 -c 1 -r 0 -a 8 -n 2
> +   nvme virt-mgmt /dev/nvme0 -c 1 -r 0 -a 9 -n 0
> +
> +  * bind the NVMe driver to the VF
> +
> +.. code-block:: console
> +
> +   echo :01:00.1 > /sys/bus/pci/drivers/nvme/bind
> \ No newline at end of file
> -- 
> 2.25.1
> 

-- 
One of us - No more doubt, silence or taboo about mental illness.




Re: [PATCH v5 09/15] hw/nvme: Make max_ioqpairs and msix_qsize configurable in runtime

2022-03-01 Thread Klaus Jensen
On Feb 17 18:44, Lukasz Maniak wrote:
> From: Łukasz Gieryk 
> 
> The NVMe device defines two properties: max_ioqpairs, msix_qsize. Having
> them as constants is problematic for SR-IOV support.
> 
> SR-IOV introduces virtual resources (queues, interrupts) that can be
> assigned to PF and its dependent VFs. Each device, following a reset,
> should work with the configured number of queues. A single constant is
> no longer sufficient to hold the whole state.
> 
> This patch tries to solve the problem by introducing additional
> variables in NvmeCtrl’s state. The variables for, e.g., managing queues
> are therefore organized as:
>  - n->params.max_ioqpairs – no changes, constant set by the user
>  - n->(mutable_state) – (not a part of this patch) user-configurable,
> specifies number of queues available _after_
> reset
>  - n->conf_ioqpairs - (new) used in all the places instead of the ‘old’
>   n->params.max_ioqpairs; initialized in realize()
>   and updated during reset() to reflect user’s
>   changes to the mutable state
> 
> Since the number of available i/o queues and interrupts can change in
> runtime, buffers for sq/cqs and the MSIX-related structures are
> allocated big enough to handle the limits, to completely avoid the
> complicated reallocation. A helper function (nvme_update_msixcap_ts)
> updates the corresponding capability register, to signal configuration
> changes.
> 
> Signed-off-by: Łukasz Gieryk 

LGTM.

Reviewed-by: Klaus Jensen 

> ---
>  hw/nvme/ctrl.c | 52 ++
>  hw/nvme/nvme.h |  2 ++
>  2 files changed, 38 insertions(+), 16 deletions(-)
> 
> diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
> index 7c1dd80f21d..f1b4026e4f8 100644
> --- a/hw/nvme/ctrl.c
> +++ b/hw/nvme/ctrl.c
> @@ -445,12 +445,12 @@ static bool nvme_nsid_valid(NvmeCtrl *n, uint32_t nsid)
>  
>  static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
>  {
> -return sqid < n->params.max_ioqpairs + 1 && n->sq[sqid] != NULL ? 0 : -1;
> +return sqid < n->conf_ioqpairs + 1 && n->sq[sqid] != NULL ? 0 : -1;
>  }
>  
>  static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
>  {
> -return cqid < n->params.max_ioqpairs + 1 && n->cq[cqid] != NULL ? 0 : -1;
> +return cqid < n->conf_ioqpairs + 1 && n->cq[cqid] != NULL ? 0 : -1;
>  }
>  
>  static void nvme_inc_cq_tail(NvmeCQueue *cq)
> @@ -4188,8 +4188,7 @@ static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeRequest 
> *req)
>  trace_pci_nvme_err_invalid_create_sq_cqid(cqid);
>  return NVME_INVALID_CQID | NVME_DNR;
>  }
> -if (unlikely(!sqid || sqid > n->params.max_ioqpairs ||
> -n->sq[sqid] != NULL)) {
> +if (unlikely(!sqid || sqid > n->conf_ioqpairs || n->sq[sqid] != NULL)) {
>  trace_pci_nvme_err_invalid_create_sq_sqid(sqid);
>  return NVME_INVALID_QID | NVME_DNR;
>  }
> @@ -4541,8 +4540,7 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest 
> *req)
>  trace_pci_nvme_create_cq(prp1, cqid, vector, qsize, qflags,
>   NVME_CQ_FLAGS_IEN(qflags) != 0);
>  
> -if (unlikely(!cqid || cqid > n->params.max_ioqpairs ||
> -n->cq[cqid] != NULL)) {
> +if (unlikely(!cqid || cqid > n->conf_ioqpairs || n->cq[cqid] != NULL)) {
>  trace_pci_nvme_err_invalid_create_cq_cqid(cqid);
>  return NVME_INVALID_QID | NVME_DNR;
>  }
> @@ -4558,7 +4556,7 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest 
> *req)
>  trace_pci_nvme_err_invalid_create_cq_vector(vector);
>  return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
>  }
> -if (unlikely(vector >= n->params.msix_qsize)) {
> +if (unlikely(vector >= n->conf_msix_qsize)) {
>  trace_pci_nvme_err_invalid_create_cq_vector(vector);
>  return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
>  }
> @@ -5155,13 +5153,12 @@ defaults:
>  
>  break;
>  case NVME_NUMBER_OF_QUEUES:
> -result = (n->params.max_ioqpairs - 1) |
> -((n->params.max_ioqpairs - 1) << 16);
> +result = (n->conf_ioqpairs - 1) | ((n->conf_ioqpairs - 1) << 16);
>  trace_pci_nvme_getfeat_numq(result);
>  break;
>  case NVME_INTERRUPT_VECTOR_CONF:
>  iv = dw11 & 0x;
> -if (iv >= n->params.max_ioqpairs + 1) {
> +if (iv >= n->conf_ioqpairs + 1) {
>  return NVME_INVALID_FIELD | NVME_DNR;
>  }
>  
> @@ -5316,10 +5313,10 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, 
> NvmeRequest *req)
>  
>  trace_pci_nvme_setfeat_numq((dw11 & 0x) + 1,
>  ((dw11 >> 16) & 0x) + 1,
> -n->params.max_ioqpairs,
> -n->params.max_ioqpairs);
> -req->cqe.result = cpu_to_le32((n->params.max_ioqpairs - 1) |
> -  ((n->params.max_ioqpairs

Re: [PATCH v7 02/31] main loop: macros to mark GS and I/O functions

2022-03-01 Thread Kevin Wolf
Am 11.02.2022 um 15:51 hat Emanuele Giuseppe Esposito geschrieben:
> Right now, IO_CODE and IO_OR_GS_CODE are nops, as there isn't
> really a way to check that a function is only called in I/O.
> On the other hand, we can use qemu_in_main_thread to check if
> we are in the main loop.
> 
> Signed-off-by: Emanuele Giuseppe Esposito 
> ---
>  include/qemu/main-loop.h | 9 +
>  1 file changed, 9 insertions(+)
> 
> diff --git a/include/qemu/main-loop.h b/include/qemu/main-loop.h
> index bc42b5939d..77adc51627 100644
> --- a/include/qemu/main-loop.h
> +++ b/include/qemu/main-loop.h
> @@ -269,6 +269,15 @@ bool qemu_mutex_iothread_locked(void);
>   */
>  bool qemu_in_main_thread(void);
>  
> +/* Mark and check that the function is part of the global state API. */
> +#define GLOBAL_STATE_CODE() assert(qemu_in_main_thread())
> +
> +/* Mark and check that the function is part of the I/O API. */
> +#define IO_CODE() /* nop */
> +
> +/* Mark and check that the function is part of the "I/O OR GS" API. */
> +#define IO_OR_GS_CODE() /* nop */
> +

I don't think it is actually a problem with the current macro expansions
and the places where they are used are limited, but if you have to
respin, I'd consider wrapping things in the usual do { ... } while (0)
just to be sure.
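
For illustration, the wrapped variants would look like this (a sketch of
the suggestion, not the actual patch):

    /* Mark and check that the function is part of the global state API. */
    #define GLOBAL_STATE_CODE()                \
        do {                                   \
            assert(qemu_in_main_thread());     \
        } while (0)

    /* Mark and check that the function is part of the I/O API. */
    #define IO_CODE()        do { } while (0)

    /* Mark and check that the function is part of the "I/O OR GS" API. */
    #define IO_OR_GS_CODE()  do { } while (0)

The do/while (0) keeps the macros behaving like ordinary statements
(e.g. inside an unbraced if/else) even when they expand to nothing.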

Kevin




[PATCH v2 4/6] hw/nvme: add support for the lbafee hbs feature

2022-03-01 Thread Klaus Jensen
From: Naveen Nagar 

Add support for up to 64 LBA formats through the LBAFEE field of the
Host Behavior Support feature.

Reviewed-by: Keith Busch 
Signed-off-by: Naveen Nagar 
Signed-off-by: Klaus Jensen 
---
 hw/nvme/ctrl.c   | 34 +++---
 hw/nvme/ns.c | 15 +--
 hw/nvme/nvme.h   |  1 +
 include/block/nvme.h |  7 +--
 4 files changed, 46 insertions(+), 11 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index d8701ebf2fa8..52ab3450b975 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -5165,6 +5165,7 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest 
*req)
 uint32_t nsid = le32_to_cpu(cmd->nsid);
 uint8_t fid = NVME_GETSETFEAT_FID(dw10);
 uint8_t save = NVME_SETFEAT_SAVE(dw10);
+uint16_t status;
 int i;
 
 trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11);
@@ -5287,8 +5288,26 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, 
NvmeRequest *req)
 case NVME_TIMESTAMP:
 return nvme_set_feature_timestamp(n, req);
 case NVME_HOST_BEHAVIOR_SUPPORT:
-return nvme_h2c(n, (uint8_t *)&n->features.hbs,
-sizeof(n->features.hbs), req);
+status = nvme_h2c(n, (uint8_t *)&n->features.hbs,
+  sizeof(n->features.hbs), req);
+if (status) {
+return status;
+}
+
+for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
+ns = nvme_ns(n, i);
+
+if (!ns) {
+continue;
+}
+
+ns->id_ns.nlbaf = ns->nlbaf - 1;
+if (!n->features.hbs.lbafee) {
+ns->id_ns.nlbaf = MIN(ns->id_ns.nlbaf, 15);
+}
+}
+
+return status;
 case NVME_COMMAND_SET_PROFILE:
 if (dw11 & 0x1ff) {
 trace_pci_nvme_err_invalid_iocsci(dw11 & 0x1ff);
@@ -5479,10 +5498,13 @@ static const AIOCBInfo nvme_format_aiocb_info = {
 static void nvme_format_set(NvmeNamespace *ns, uint8_t lbaf, uint8_t mset,
 uint8_t pi, uint8_t pil)
 {
+uint8_t lbafl = lbaf & 0xf;
+uint8_t lbafu = lbaf >> 4;
+
 trace_pci_nvme_format_set(ns->params.nsid, lbaf, mset, pi, pil);
 
 ns->id_ns.dps = (pil << 3) | pi;
-ns->id_ns.flbas = lbaf | (mset << 4);
+ns->id_ns.flbas = (lbafu << 5) | (mset << 4) | lbafl;
 
 nvme_ns_init_format(ns);
 }
@@ -5596,6 +5618,7 @@ static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest *req)
 uint8_t mset = (dw10 >> 4) & 0x1;
 uint8_t pi = (dw10 >> 5) & 0x7;
 uint8_t pil = (dw10 >> 8) & 0x1;
+uint8_t lbafu = (dw10 >> 12) & 0x3;
 uint16_t status;
 
 iocb = qemu_aio_get(&nvme_format_aiocb_info, NULL, nvme_misc_cb, req);
@@ -5612,6 +5635,10 @@ static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest 
*req)
 iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
 iocb->offset = 0;
 
+if (n->features.hbs.lbafee) {
+iocb->lbaf |= lbafu << 4;
+}
+
 if (!iocb->broadcast) {
 if (!nvme_nsid_valid(n, nsid)) {
 status = NVME_INVALID_NSID | NVME_DNR;
@@ -6587,6 +6614,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice 
*pci_dev)
 id->cntlid = cpu_to_le16(n->cntlid);
 
 id->oaes = cpu_to_le32(NVME_OAES_NS_ATTR);
+id->ctratt |= cpu_to_le32(NVME_CTRATT_ELBAS);
 
 id->rab = 6;
 
diff --git a/hw/nvme/ns.c b/hw/nvme/ns.c
index ee673f1a5bef..8dfb55130beb 100644
--- a/hw/nvme/ns.c
+++ b/hw/nvme/ns.c
@@ -112,10 +112,11 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
 [7] = { .ds = 12, .ms = 64 },
 };
 
+ns->nlbaf = 8;
+
 memcpy(&id_ns->lbaf, &lbaf, sizeof(lbaf));
-id_ns->nlbaf = 7;
 
-for (i = 0; i <= id_ns->nlbaf; i++) {
+for (i = 0; i < ns->nlbaf; i++) {
 NvmeLBAF *lbaf = &id_ns->lbaf[i];
 if (lbaf->ds == ds) {
 if (lbaf->ms == ms) {
@@ -126,12 +127,14 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
 }
 
 /* add non-standard lba format */
-id_ns->nlbaf++;
-id_ns->lbaf[id_ns->nlbaf].ds = ds;
-id_ns->lbaf[id_ns->nlbaf].ms = ms;
-id_ns->flbas |= id_ns->nlbaf;
+id_ns->lbaf[ns->nlbaf].ds = ds;
+id_ns->lbaf[ns->nlbaf].ms = ms;
+ns->nlbaf++;
+
+id_ns->flbas |= i;
 
 lbaf_found:
+id_ns->nlbaf = ns->nlbaf - 1;
 nvme_ns_init_format(ns);
 
 return 0;
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 103407038e74..e715c3255a29 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -128,6 +128,7 @@ typedef struct NvmeNamespace {
 int64_t  moff;
 NvmeIdNs id_ns;
 NvmeLBAF lbaf;
+unsigned int nlbaf;
 size_t   lbasz;
 const uint32_t *iocs;
 uint8_t  csi;
diff --git a/include/block/nvme.h b/include/block/nvme.h
index e527c728f975..37afc9be9b18 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -,6 +,10 @@ enum NvmeIdCtrlOaes {
 NVME_OAES_NS_ATTR   = 1 << 8,
 };
 
+enum NvmeIdCtrlCtratt {
+NVME_CTRATT_ELBAS   = 1 << 15,

[PATCH v2 3/6] hw/nvme: move format parameter parsing

2022-03-01 Thread Klaus Jensen
From: Klaus Jensen 

There is no need to extract the format command parameters for each
namespace. Move it to the entry point.

Reviewed-by: Keith Busch 
Signed-off-by: Klaus Jensen 
---
 hw/nvme/ctrl.c | 31 ++-
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 71c60482c75f..d8701ebf2fa8 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -5452,6 +5452,11 @@ typedef struct NvmeFormatAIOCB {
 uint32_t nsid;
 bool broadcast;
 int64_t offset;
+
+uint8_t lbaf;
+uint8_t mset;
+uint8_t pi;
+uint8_t pil;
 } NvmeFormatAIOCB;
 
 static void nvme_format_bh(void *opaque);
@@ -5471,14 +5476,9 @@ static const AIOCBInfo nvme_format_aiocb_info = {
 .get_aio_context = nvme_get_aio_context,
 };
 
-static void nvme_format_set(NvmeNamespace *ns, NvmeCmd *cmd)
+static void nvme_format_set(NvmeNamespace *ns, uint8_t lbaf, uint8_t mset,
+uint8_t pi, uint8_t pil)
 {
-uint32_t dw10 = le32_to_cpu(cmd->cdw10);
-uint8_t lbaf = dw10 & 0xf;
-uint8_t pi = (dw10 >> 5) & 0x7;
-uint8_t mset = (dw10 >> 4) & 0x1;
-uint8_t pil = (dw10 >> 8) & 0x1;
-
 trace_pci_nvme_format_set(ns->params.nsid, lbaf, mset, pi, pil);
 
 ns->id_ns.dps = (pil << 3) | pi;
@@ -5490,7 +5490,6 @@ static void nvme_format_set(NvmeNamespace *ns, NvmeCmd 
*cmd)
 static void nvme_format_ns_cb(void *opaque, int ret)
 {
 NvmeFormatAIOCB *iocb = opaque;
-NvmeRequest *req = iocb->req;
 NvmeNamespace *ns = iocb->ns;
 int bytes;
 
@@ -5512,7 +5511,7 @@ static void nvme_format_ns_cb(void *opaque, int ret)
 return;
 }
 
-nvme_format_set(ns, &req->cmd);
+nvme_format_set(ns, iocb->lbaf, iocb->mset, iocb->pi, iocb->pil);
 ns->status = 0x0;
 iocb->ns = NULL;
 iocb->offset = 0;
@@ -5548,9 +5547,6 @@ static void nvme_format_bh(void *opaque)
 NvmeFormatAIOCB *iocb = opaque;
 NvmeRequest *req = iocb->req;
 NvmeCtrl *n = nvme_ctrl(req);
-uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
-uint8_t lbaf = dw10 & 0xf;
-uint8_t pi = (dw10 >> 5) & 0x7;
 uint16_t status;
 int i;
 
@@ -5572,7 +5568,7 @@ static void nvme_format_bh(void *opaque)
 goto done;
 }
 
-status = nvme_format_check(iocb->ns, lbaf, pi);
+status = nvme_format_check(iocb->ns, iocb->lbaf, iocb->pi);
 if (status) {
 req->status = status;
 goto done;
@@ -5595,6 +5591,11 @@ static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest 
*req)
 {
 NvmeFormatAIOCB *iocb;
 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
+uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
+uint8_t lbaf = dw10 & 0xf;
+uint8_t mset = (dw10 >> 4) & 0x1;
+uint8_t pi = (dw10 >> 5) & 0x7;
+uint8_t pil = (dw10 >> 8) & 0x1;
 uint16_t status;
 
 iocb = qemu_aio_get(&nvme_format_aiocb_info, NULL, nvme_misc_cb, req);
@@ -5604,6 +5605,10 @@ static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest 
*req)
 iocb->ret = 0;
 iocb->ns = NULL;
 iocb->nsid = 0;
+iocb->lbaf = lbaf;
+iocb->mset = mset;
+iocb->pi = pi;
+iocb->pil = pil;
 iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
 iocb->offset = 0;
 
-- 
2.35.1




[PATCH v2 2/6] hw/nvme: add host behavior support feature

2022-03-01 Thread Klaus Jensen
From: Naveen Nagar 

Add support for getting and setting the Host Behavior Support feature.

Reviewed-by: Keith Busch 
Signed-off-by: Naveen Nagar 
Signed-off-by: Klaus Jensen 
---
 hw/nvme/ctrl.c   | 8 
 hw/nvme/nvme.h   | 4 +++-
 include/block/nvme.h | 9 +
 3 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index d08af3bdc1a2..71c60482c75f 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -196,6 +196,7 @@ static const bool nvme_feature_support[NVME_FID_MAX] = {
 [NVME_WRITE_ATOMICITY]  = true,
 [NVME_ASYNCHRONOUS_EVENT_CONF]  = true,
 [NVME_TIMESTAMP]= true,
+[NVME_HOST_BEHAVIOR_SUPPORT]= true,
 [NVME_COMMAND_SET_PROFILE]  = true,
 };
 
@@ -206,6 +207,7 @@ static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
 [NVME_NUMBER_OF_QUEUES] = NVME_FEAT_CAP_CHANGE,
 [NVME_ASYNCHRONOUS_EVENT_CONF]  = NVME_FEAT_CAP_CHANGE,
 [NVME_TIMESTAMP]= NVME_FEAT_CAP_CHANGE,
+[NVME_HOST_BEHAVIOR_SUPPORT]= NVME_FEAT_CAP_CHANGE,
 [NVME_COMMAND_SET_PROFILE]  = NVME_FEAT_CAP_CHANGE,
 };
 
@@ -5091,6 +5093,9 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest 
*req)
 goto out;
 case NVME_TIMESTAMP:
 return nvme_get_feature_timestamp(n, req);
+case NVME_HOST_BEHAVIOR_SUPPORT:
+return nvme_c2h(n, (uint8_t *)&n->features.hbs,
+sizeof(n->features.hbs), req);
 default:
 break;
 }
@@ -5281,6 +5286,9 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest 
*req)
 break;
 case NVME_TIMESTAMP:
 return nvme_set_feature_timestamp(n, req);
+case NVME_HOST_BEHAVIOR_SUPPORT:
+return nvme_h2c(n, (uint8_t *)&n->features.hbs,
+sizeof(n->features.hbs), req);
 case NVME_COMMAND_SET_PROFILE:
 if (dw11 & 0x1ff) {
 trace_pci_nvme_err_invalid_iocsci(dw11 & 0x1ff);
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 801176a2bd5e..103407038e74 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -468,7 +468,9 @@ typedef struct NvmeCtrl {
 uint16_t temp_thresh_hi;
 uint16_t temp_thresh_low;
 };
-uint32_tasync_config;
+
+uint32_tasync_config;
+NvmeHostBehaviorSupport hbs;
 } features;
 } NvmeCtrl;
 
diff --git a/include/block/nvme.h b/include/block/nvme.h
index cd068ac89142..e527c728f975 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -1216,6 +1216,7 @@ enum NvmeFeatureIds {
 NVME_WRITE_ATOMICITY= 0xa,
 NVME_ASYNCHRONOUS_EVENT_CONF= 0xb,
 NVME_TIMESTAMP  = 0xe,
+NVME_HOST_BEHAVIOR_SUPPORT  = 0x16,
 NVME_COMMAND_SET_PROFILE= 0x19,
 NVME_SOFTWARE_PROGRESS_MARKER   = 0x80,
 NVME_FID_MAX= 0x100,
@@ -1257,6 +1258,13 @@ typedef struct QEMU_PACKED NvmeRangeType {
 uint8_t rsvd48[16];
 } NvmeRangeType;
 
+typedef struct NvmeHostBehaviorSupport {
+uint8_t acre;
+uint8_t etdas;
+uint8_t lbafee;
+uint8_t rsvd3[509];
+} NvmeHostBehaviorSupport;
+
 typedef struct QEMU_PACKED NvmeLBAF {
 uint16_tms;
 uint8_t ds;
@@ -1520,6 +1528,7 @@ static inline void _nvme_check_size(void)
 QEMU_BUILD_BUG_ON(sizeof(NvmeDsmCmd) != 64);
 QEMU_BUILD_BUG_ON(sizeof(NvmeCopyCmd) != 64);
 QEMU_BUILD_BUG_ON(sizeof(NvmeRangeType) != 64);
+QEMU_BUILD_BUG_ON(sizeof(NvmeHostBehaviorSupport) != 512);
 QEMU_BUILD_BUG_ON(sizeof(NvmeErrorLog) != 64);
 QEMU_BUILD_BUG_ON(sizeof(NvmeFwSlotInfoLog) != 512);
 QEMU_BUILD_BUG_ON(sizeof(NvmeSmartLog) != 512);
-- 
2.35.1




[PATCH v2 6/6] hw/nvme: 64-bit pi support

2022-03-01 Thread Klaus Jensen
From: Naveen Nagar 

This adds support for one possible new protection information format
introduced in TP4068 (and integrated in NVMe 2.0): the 64-bit CRC guard
and 48-bit reference tag. This version does not support storage tags.

Like the CRC16 support already present, this uses a software
implementation of CRC64 (so it is naturally pretty slow). But it's good
enough for verification purposes.
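
For illustration, the table-driven update step is roughly the following
(a generic reflected-CRC sketch; crc64_table is a hypothetical 256-entry
lookup table for the CRC64 polynomial, not the code in this series):

    extern const uint64_t crc64_table[256]; /* hypothetical table */

    static uint64_t crc64_update(uint64_t crc, const uint8_t *buf, size_t len)
    {
        while (len--) {
            /* one table lookup per input byte, reflected bit order */
            crc = (crc >> 8) ^ crc64_table[(crc ^ *buf++) & 0xff];
        }

        return crc;
    }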

This may go nicely hand-in-hand with the support that Keith submitted
for the Linux kernel[1].

  [1]: 
https://lore.kernel.org/linux-nvme/20220126165214.ga1782...@dhcp-10-100-145-180.wdc.com/T/

Signed-off-by: Naveen Nagar 
Signed-off-by: Klaus Jensen 
---
 hw/nvme/ctrl.c   | 163 +++
 hw/nvme/dif.c| 363 +--
 hw/nvme/dif.h| 143 -
 hw/nvme/ns.c |  35 -
 hw/nvme/nvme.h   |   3 +
 hw/nvme/trace-events |  12 +-
 include/block/nvme.h |  67 ++--
 7 files changed, 648 insertions(+), 138 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index f1683960b87e..03760ddeae8c 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -2050,9 +2050,12 @@ static void nvme_verify_cb(void *opaque, int ret)
 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
 uint16_t apptag = le16_to_cpu(rw->apptag);
 uint16_t appmask = le16_to_cpu(rw->appmask);
-uint32_t reftag = le32_to_cpu(rw->reftag);
+uint64_t reftag = le32_to_cpu(rw->reftag);
+uint64_t cdw3 = le32_to_cpu(rw->cdw3);
 uint16_t status;
 
+reftag |= cdw3 << 32;
+
 trace_pci_nvme_verify_cb(nvme_cid(req), prinfo, apptag, appmask, reftag);
 
 if (ret) {
@@ -2141,7 +2144,8 @@ static void nvme_compare_mdata_cb(void *opaque, int ret)
 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
 uint16_t apptag = le16_to_cpu(rw->apptag);
 uint16_t appmask = le16_to_cpu(rw->appmask);
-uint32_t reftag = le32_to_cpu(rw->reftag);
+uint64_t reftag = le32_to_cpu(rw->reftag);
+uint64_t cdw3 = le32_to_cpu(rw->cdw3);
 struct nvme_compare_ctx *ctx = req->opaque;
 g_autofree uint8_t *buf = NULL;
 BlockBackend *blk = ns->blkconf.blk;
@@ -2149,6 +2153,8 @@ static void nvme_compare_mdata_cb(void *opaque, int ret)
 BlockAcctStats *stats = blk_get_stats(blk);
 uint16_t status = NVME_SUCCESS;
 
+reftag |= cdw3 << 32;
+
 trace_pci_nvme_compare_mdata_cb(nvme_cid(req));
 
 if (ret) {
@@ -2527,7 +2533,8 @@ typedef struct NvmeCopyAIOCB {
 QEMUBH *bh;
 int ret;
 
-NvmeCopySourceRange *ranges;
+void *ranges;
+unsigned int format;
 int nr;
 int idx;
 
@@ -2538,7 +2545,7 @@ typedef struct NvmeCopyAIOCB {
 BlockAcctCookie write;
 } acct;
 
-uint32_t reftag;
+uint64_t reftag;
 uint64_t slba;
 
 NvmeZone *zone;
@@ -2592,13 +2599,101 @@ static void nvme_copy_bh(void *opaque)
 
 static void nvme_copy_cb(void *opaque, int ret);
 
+static void nvme_copy_source_range_parse_format0(void *ranges, int idx,
+ uint64_t *slba, uint32_t *nlb,
+ uint16_t *apptag,
+ uint16_t *appmask,
+ uint64_t *reftag)
+{
+NvmeCopySourceRangeFormat0 *_ranges = ranges;
+
+if (slba) {
+*slba = le64_to_cpu(_ranges[idx].slba);
+}
+
+if (nlb) {
+*nlb = le16_to_cpu(_ranges[idx].nlb) + 1;
+}
+
+if (apptag) {
+*apptag = le16_to_cpu(_ranges[idx].apptag);
+}
+
+if (appmask) {
+*appmask = le16_to_cpu(_ranges[idx].appmask);
+}
+
+if (reftag) {
+*reftag = le32_to_cpu(_ranges[idx].reftag);
+}
+}
+
+static void nvme_copy_source_range_parse_format1(void *ranges, int idx,
+ uint64_t *slba, uint32_t *nlb,
+ uint16_t *apptag,
+ uint16_t *appmask,
+ uint64_t *reftag)
+{
+NvmeCopySourceRangeFormat1 *_ranges = ranges;
+
+if (slba) {
+*slba = le64_to_cpu(_ranges[idx].slba);
+}
+
+if (nlb) {
+*nlb = le16_to_cpu(_ranges[idx].nlb) + 1;
+}
+
+if (apptag) {
+*apptag = le16_to_cpu(_ranges[idx].apptag);
+}
+
+if (appmask) {
+*appmask = le16_to_cpu(_ranges[idx].appmask);
+}
+
+if (reftag) {
+*reftag = 0;
+
+*reftag |= (uint64_t)_ranges[idx].sr[4] << 40;
+*reftag |= (uint64_t)_ranges[idx].sr[5] << 32;
+*reftag |= (uint64_t)_ranges[idx].sr[6] << 24;
+*reftag |= (uint64_t)_ranges[idx].sr[7] << 16;
+*reftag |= (uint64_t)_ranges[idx].sr[8] << 8;
+*reftag |= (uint64_t)_ranges[idx].sr[9];
+}
+}
+
+static void nvme_copy_source_range_parse(void *ranges, int idx, uint8_t format,
+ uint64_t

[PATCH v2 1/6] hw/nvme: move dif/pi prototypes into dif.h

2022-03-01 Thread Klaus Jensen
From: Klaus Jensen 

Move dif/pi data structures and inlines to dif.h.

Reviewed-by: Keith Busch 
Signed-off-by: Klaus Jensen 
---
 hw/nvme/ctrl.c |  1 +
 hw/nvme/dif.c  |  1 +
 hw/nvme/dif.h  | 53 ++
 hw/nvme/nvme.h | 50 ---
 4 files changed, 55 insertions(+), 50 deletions(-)
 create mode 100644 hw/nvme/dif.h

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 98aac98bef5f..d08af3bdc1a2 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -163,6 +163,7 @@
 #include "migration/vmstate.h"
 
 #include "nvme.h"
+#include "dif.h"
 #include "trace.h"
 
 #define NVME_MAX_IOQPAIRS 0x
diff --git a/hw/nvme/dif.c b/hw/nvme/dif.c
index 5dbd18b2a4a5..cd0cea2b5ebd 100644
--- a/hw/nvme/dif.c
+++ b/hw/nvme/dif.c
@@ -13,6 +13,7 @@
 #include "sysemu/block-backend.h"
 
 #include "nvme.h"
+#include "dif.h"
 #include "trace.h"
 
 uint16_t nvme_check_prinfo(NvmeNamespace *ns, uint8_t prinfo, uint64_t slba,
diff --git a/hw/nvme/dif.h b/hw/nvme/dif.h
new file mode 100644
index ..e36fea30e71e
--- /dev/null
+++ b/hw/nvme/dif.h
@@ -0,0 +1,53 @@
+#ifndef HW_NVME_DIF_H
+#define HW_NVME_DIF_H
+
+/* from Linux kernel (crypto/crct10dif_common.c) */
+static const uint16_t t10_dif_crc_table[256] = {
+0x, 0x8BB7, 0x9CD9, 0x176E, 0xB205, 0x39B2, 0x2EDC, 0xA56B,
+0xEFBD, 0x640A, 0x7364, 0xF8D3, 0x5DB8, 0xD60F, 0xC161, 0x4AD6,
+0x54CD, 0xDF7A, 0xC814, 0x43A3, 0xE6C8, 0x6D7F, 0x7A11, 0xF1A6,
+0xBB70, 0x30C7, 0x27A9, 0xAC1E, 0x0975, 0x82C2, 0x95AC, 0x1E1B,
+0xA99A, 0x222D, 0x3543, 0xBEF4, 0x1B9F, 0x9028, 0x8746, 0x0CF1,
+0x4627, 0xCD90, 0xDAFE, 0x5149, 0xF422, 0x7F95, 0x68FB, 0xE34C,
+0xFD57, 0x76E0, 0x618E, 0xEA39, 0x4F52, 0xC4E5, 0xD38B, 0x583C,
+0x12EA, 0x995D, 0x8E33, 0x0584, 0xA0EF, 0x2B58, 0x3C36, 0xB781,
+0xD883, 0x5334, 0x445A, 0xCFED, 0x6A86, 0xE131, 0xF65F, 0x7DE8,
+0x373E, 0xBC89, 0xABE7, 0x2050, 0x853B, 0x0E8C, 0x19E2, 0x9255,
+0x8C4E, 0x07F9, 0x1097, 0x9B20, 0x3E4B, 0xB5FC, 0xA292, 0x2925,
+0x63F3, 0xE844, 0xFF2A, 0x749D, 0xD1F6, 0x5A41, 0x4D2F, 0xC698,
+0x7119, 0xFAAE, 0xEDC0, 0x6677, 0xC31C, 0x48AB, 0x5FC5, 0xD472,
+0x9EA4, 0x1513, 0x027D, 0x89CA, 0x2CA1, 0xA716, 0xB078, 0x3BCF,
+0x25D4, 0xAE63, 0xB90D, 0x32BA, 0x97D1, 0x1C66, 0x0B08, 0x80BF,
+0xCA69, 0x41DE, 0x56B0, 0xDD07, 0x786C, 0xF3DB, 0xE4B5, 0x6F02,
+0x3AB1, 0xB106, 0xA668, 0x2DDF, 0x88B4, 0x0303, 0x146D, 0x9FDA,
+0xD50C, 0x5EBB, 0x49D5, 0xC262, 0x6709, 0xECBE, 0xFBD0, 0x7067,
+0x6E7C, 0xE5CB, 0xF2A5, 0x7912, 0xDC79, 0x57CE, 0x40A0, 0xCB17,
+0x81C1, 0x0A76, 0x1D18, 0x96AF, 0x33C4, 0xB873, 0xAF1D, 0x24AA,
+0x932B, 0x189C, 0x0FF2, 0x8445, 0x212E, 0xAA99, 0xBDF7, 0x3640,
+0x7C96, 0xF721, 0xE04F, 0x6BF8, 0xCE93, 0x4524, 0x524A, 0xD9FD,
+0xC7E6, 0x4C51, 0x5B3F, 0xD088, 0x75E3, 0xFE54, 0xE93A, 0x628D,
+0x285B, 0xA3EC, 0xB482, 0x3F35, 0x9A5E, 0x11E9, 0x0687, 0x8D30,
+0xE232, 0x6985, 0x7EEB, 0xF55C, 0x5037, 0xDB80, 0xCCEE, 0x4759,
+0x0D8F, 0x8638, 0x9156, 0x1AE1, 0xBF8A, 0x343D, 0x2353, 0xA8E4,
+0xB6FF, 0x3D48, 0x2A26, 0xA191, 0x04FA, 0x8F4D, 0x9823, 0x1394,
+0x5942, 0xD2F5, 0xC59B, 0x4E2C, 0xEB47, 0x60F0, 0x779E, 0xFC29,
+0x4BA8, 0xC01F, 0xD771, 0x5CC6, 0xF9AD, 0x721A, 0x6574, 0xEEC3,
+0xA415, 0x2FA2, 0x38CC, 0xB37B, 0x1610, 0x9DA7, 0x8AC9, 0x017E,
+0x1F65, 0x94D2, 0x83BC, 0x080B, 0xAD60, 0x26D7, 0x31B9, 0xBA0E,
+0xF0D8, 0x7B6F, 0x6C01, 0xE7B6, 0x42DD, 0xC96A, 0xDE04, 0x55B3
+};
+
+uint16_t nvme_check_prinfo(NvmeNamespace *ns, uint8_t prinfo, uint64_t slba,
+   uint32_t reftag);
+uint16_t nvme_dif_mangle_mdata(NvmeNamespace *ns, uint8_t *mbuf, size_t mlen,
+   uint64_t slba);
+void nvme_dif_pract_generate_dif(NvmeNamespace *ns, uint8_t *buf, size_t len,
+ uint8_t *mbuf, size_t mlen, uint16_t apptag,
+ uint32_t *reftag);
+uint16_t nvme_dif_check(NvmeNamespace *ns, uint8_t *buf, size_t len,
+uint8_t *mbuf, size_t mlen, uint8_t prinfo,
+uint64_t slba, uint16_t apptag,
+uint16_t appmask, uint32_t *reftag);
+uint16_t nvme_dif_rw(NvmeCtrl *n, NvmeRequest *req);
+
+#endif /* HW_NVME_DIF_H */
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 90c0bb7ce236..801176a2bd5e 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -513,54 +513,4 @@ void nvme_rw_complete_cb(void *opaque, int ret);
 uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
NvmeCmd *cmd);
 
-/* from Linux kernel (crypto/crct10dif_common.c) */
-static const uint16_t t10_dif_crc_table[256] = {
-0x, 0x8BB7, 0x9CD9, 0x176E, 0xB205, 0x39B2, 0x2EDC, 0xA56B,
-0xEFBD, 0x640A, 0x7364, 0xF8D3, 0x5DB8, 0xD60F, 0xC161, 0x4AD6,
-0x54CD, 0xDF7A, 0xC814, 0x43A3, 0xE6C8, 0x6D7F, 0x7A11, 0xF1A6,
-0xBB70, 0x30C7, 0x27A9, 0xAC1E, 0x0975, 0x82C2, 0x95AC, 0x1E1B,
-0xA99A

[PATCH v2 5/6] hw/nvme: add pi tuple size helper

2022-03-01 Thread Klaus Jensen
From: Klaus Jensen 

A subsequent patch will introduce a new tuple size; so add a helper and
use that instead of sizeof() and magic numbers.
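
A sketch of what such a helper can look like (the ns->pif field that
selects the 64-bit guard format is an assumption based on the later
patches in this series):

    static inline size_t nvme_pi_tuple_size(NvmeNamespace *ns)
    {
        /* 16-byte tuple for the 64-bit guard format, 8 bytes otherwise */
        return ns->pif ? 16 : 8;
    }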

Reviewed-by: Keith Busch 
Signed-off-by: Klaus Jensen 
---
 hw/nvme/ctrl.c | 14 --
 hw/nvme/dif.c  | 16 
 hw/nvme/dif.h  |  5 +
 3 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 52ab3450b975..f1683960b87e 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -1068,7 +1068,8 @@ static uint16_t nvme_map_data(NvmeCtrl *n, uint32_t nlb, 
NvmeRequest *req)
 size_t len = nvme_l2b(ns, nlb);
 uint16_t status;
 
-if (nvme_ns_ext(ns) && !(pi && pract && ns->lbaf.ms == 8)) {
+if (nvme_ns_ext(ns) &&
+!(pi && pract && ns->lbaf.ms == nvme_pi_tuple_size(ns))) {
 NvmeSg sg;
 
 len += nvme_m2b(ns, nlb);
@@ -1247,7 +1248,8 @@ uint16_t nvme_bounce_data(NvmeCtrl *n, void *ptr, 
uint32_t len,
 bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
 bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);
 
-if (nvme_ns_ext(ns) && !(pi && pract && ns->lbaf.ms == 8)) {
+if (nvme_ns_ext(ns) &&
+!(pi && pract && ns->lbaf.ms == nvme_pi_tuple_size(ns))) {
 return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbasz,
ns->lbaf.ms, 0, dir);
 }
@@ -2184,7 +2186,7 @@ static void nvme_compare_mdata_cb(void *opaque, int ret)
  * tuple.
  */
 if (!(ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT)) {
-pil = ns->lbaf.ms - sizeof(NvmeDifTuple);
+pil = ns->lbaf.ms - nvme_pi_tuple_size(ns);
 }
 
 for (bufp = buf; mbufp < end; bufp += ns->lbaf.ms, mbufp += 
ns->lbaf.ms) {
@@ -3167,7 +3169,7 @@ static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
 bool pract = prinfo & NVME_PRINFO_PRACT;
 
-if (pract && ns->lbaf.ms == 8) {
+if (pract && ns->lbaf.ms == nvme_pi_tuple_size(ns)) {
 mapped_size = data_size;
 }
 }
@@ -3244,7 +3246,7 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest 
*req, bool append,
 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
 bool pract = prinfo & NVME_PRINFO_PRACT;
 
-if (pract && ns->lbaf.ms == 8) {
+if (pract && ns->lbaf.ms == nvme_pi_tuple_size(ns)) {
 mapped_size -= nvme_m2b(ns, nlb);
 }
 }
@@ -5553,7 +,7 @@ static uint16_t nvme_format_check(NvmeNamespace *ns, 
uint8_t lbaf, uint8_t pi)
 return NVME_INVALID_FORMAT | NVME_DNR;
 }
 
-if (pi && (ns->id_ns.lbaf[lbaf].ms < sizeof(NvmeDifTuple))) {
+if (pi && (ns->id_ns.lbaf[lbaf].ms < nvme_pi_tuple_size(ns))) {
 return NVME_INVALID_FORMAT | NVME_DNR;
 }
 
diff --git a/hw/nvme/dif.c b/hw/nvme/dif.c
index cd0cea2b5ebd..891385f33f20 100644
--- a/hw/nvme/dif.c
+++ b/hw/nvme/dif.c
@@ -48,7 +48,7 @@ void nvme_dif_pract_generate_dif(NvmeNamespace *ns, uint8_t 
*buf, size_t len,
 int16_t pil = 0;
 
 if (!(ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT)) {
-pil = ns->lbaf.ms - sizeof(NvmeDifTuple);
+pil = ns->lbaf.ms - nvme_pi_tuple_size(ns);
 }
 
 trace_pci_nvme_dif_pract_generate_dif(len, ns->lbasz, ns->lbasz + pil,
@@ -145,7 +145,7 @@ uint16_t nvme_dif_check(NvmeNamespace *ns, uint8_t *buf, 
size_t len,
 }
 
 if (!(ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT)) {
-pil = ns->lbaf.ms - sizeof(NvmeDifTuple);
+pil = ns->lbaf.ms - nvme_pi_tuple_size(ns);
 }
 
 trace_pci_nvme_dif_check(prinfo, ns->lbasz + pil);
@@ -184,7 +184,7 @@ uint16_t nvme_dif_mangle_mdata(NvmeNamespace *ns, uint8_t 
*mbuf, size_t mlen,
 
 
 if (!(ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT)) {
-pil = ns->lbaf.ms - sizeof(NvmeDifTuple);
+pil = ns->lbaf.ms - nvme_pi_tuple_size(ns);
 }
 
 do {
@@ -210,7 +210,7 @@ uint16_t nvme_dif_mangle_mdata(NvmeNamespace *ns, uint8_t 
*mbuf, size_t mlen,
 end = mbufp + mlen;
 
 for (; mbufp < end; mbufp += ns->lbaf.ms) {
-memset(mbufp + pil, 0xff, sizeof(NvmeDifTuple));
+memset(mbufp + pil, 0xff, nvme_pi_tuple_size(ns));
 }
 }
 
@@ -284,7 +284,7 @@ static void nvme_dif_rw_check_cb(void *opaque, int ret)
 goto out;
 }
 
-if (prinfo & NVME_PRINFO_PRACT && ns->lbaf.ms == 8) {
+if (prinfo & NVME_PRINFO_PRACT && ns->lbaf.ms == nvme_pi_tuple_size(ns)) {
 goto out;
 }
 
@@ -388,7 +388,7 @@ uint16_t nvme_dif_rw(NvmeCtrl *n, NvmeRequest *req)
 
 if (pract) {
 uint8_t *mbuf, *end;
-int16_t pil = ns->lbaf.ms - sizeof(NvmeDifTuple);
+int16_t pil = ns->lbaf.ms - nvme_pi_tuple_size(ns);
 
 status = nvme_check_prinfo(ns, prinfo, slba, reftag);
 if (status) {
@@ -428,7 +428,7 @@ uint16_t nvme_dif_rw(NvmeCtrl *n, 

[PATCH v2 0/6] hw/nvme: enhanced protection information (64-bit guard)

2022-03-01 Thread Klaus Jensen
From: Klaus Jensen 

This adds support for one possible new protection information format
introduced in TP4068 (and integrated in NVMe 2.0): the 64-bit CRC guard
and 48-bit reference tag. This version does not support storage tags.

Like the CRC16 support already present, this uses a software
implementation of CRC64 (so it is naturally pretty slow). But it's good
enough for verification purposes.

This goes hand-in-hand with the support that Keith submitted for the
Linux kernel[1].

  [1]: 
https://lore.kernel.org/linux-nvme/20220201190128.3075065-1-kbu...@kernel.org/

Changes since v1

- Check metadata size depending on pi guard type selected. (Keith)

Klaus Jensen (3):
  hw/nvme: move dif/pi prototypes into dif.h
  hw/nvme: move format parameter parsing
  hw/nvme: add pi tuple size helper

Naveen Nagar (3):
  hw/nvme: add host behavior support feature
  hw/nvme: add support for the lbafee hbs feature
  hw/nvme: 64-bit pi support

 hw/nvme/ctrl.c   | 235 +--
 hw/nvme/dif.c| 378 +--
 hw/nvme/dif.h| 191 ++
 hw/nvme/ns.c |  50 --
 hw/nvme/nvme.h   |  58 +--
 hw/nvme/trace-events |  12 +-
 include/block/nvme.h |  81 --
 7 files changed, 793 insertions(+), 212 deletions(-)
 create mode 100644 hw/nvme/dif.h

-- 
2.35.1




Re: [PATCH 0/5] block layer: permission API refactoring in preparation

2022-03-01 Thread Kevin Wolf
Am 09.02.2022 um 11:54 hat Emanuele Giuseppe Esposito geschrieben:
> This series aims to refactor and fix permission-API-related bugs that
> came up in the series "block layer: split block APIs in global state
> and I/O". In that series, we are splitting all block layer headers into
> Global State (GS) APIs, which always hold the BQL and run in the
> main loop, and I/O APIs, which run in iothreads.
> 
> The patches in this series are taken from v6 of the API split,
> to reduce its size and apply these fixes independently.
> 
> Patches 1 and 2 take care of crypto and amend jobs, since they
> incorrectly use the permission API also in iothreads.
> Patches 3-4-5 take care of bdrv_invalidate_cache and callers,
> since this function also checks for permissions while being
> called by an iothread.

Thanks, applied to the block branch.

Kevin




Re: [PATCH 1/5] crypto: perform permission checks under BQL

2022-03-01 Thread Kevin Wolf
Am 09.02.2022 um 11:54 hat Emanuele Giuseppe Esposito geschrieben:
> Move the permission API calls into driver-specific callbacks
> that always run under BQL. In this case, bdrv_crypto_luks
> needs to perform permission checks before and after
> qcrypto_block_amend_options(). The problem is that the caller,
> block_crypto_amend_options_generic_luks(), can also run in I/O
> from .bdrv_co_amend(). This does not comply with the Global State-I/O API
> split, as the permission API must always run under BQL.
> 
> Firstly, introduce .bdrv_amend_pre_run() and .bdrv_amend_clean()
> callbacks. These two callbacks are guaranteed to be invoked under
> BQL, respectively before and after .bdrv_co_amend().
> They take care of performing the permission checks
> in the same way as they are currently done before and after
> qcrypto_block_amend_options().
> These callbacks are in preparation for next patch, where we
> delete the original permission check. Right now they just add redundant
> control.
> 
> Then, call .bdrv_amend_pre_run() before job_start in
> qmp_x_blockdev_amend(), so that it will be run before the job coroutine
> is created and stay in the main loop.
> As a cleanup, use JobDriver's .clean() callback to call
> .bdrv_amend_clean(), and run amend-specific cleanup callbacks under BQL.
> 
> Signed-off-by: Emanuele Giuseppe Esposito 
> ---
>  block/amend.c | 24 
>  block/crypto.c| 27 +++
>  include/block/block_int.h | 14 ++
>  3 files changed, 65 insertions(+)
> 
> diff --git a/block/amend.c b/block/amend.c
> index 392df9ef83..329bca53dc 100644
> --- a/block/amend.c
> +++ b/block/amend.c
> @@ -53,10 +53,29 @@ static int coroutine_fn blockdev_amend_run(Job *job, 
> Error **errp)
>  return ret;
>  }
>  
> +static int blockdev_amend_pre_run(BlockdevAmendJob *s, Error **errp)
> +{
> +if (s->bs->drv->bdrv_amend_pre_run) {
> +return s->bs->drv->bdrv_amend_pre_run(s->bs, errp);
> +}
> +
> +return 0;
> +}
> +
> +static void blockdev_amend_clean(Job *job)
> +{
> +BlockdevAmendJob *s = container_of(job, BlockdevAmendJob, common);
> +
> +if (s->bs->drv->bdrv_amend_clean) {
> +s->bs->drv->bdrv_amend_clean(s->bs);
> +}
> +}
> +
>  static const JobDriver blockdev_amend_job_driver = {
>  .instance_size = sizeof(BlockdevAmendJob),
>  .job_type  = JOB_TYPE_AMEND,
>  .run   = blockdev_amend_run,
> +.clean = blockdev_amend_clean,
>  };
>  
>  void qmp_x_blockdev_amend(const char *job_id,
> @@ -113,5 +132,10 @@ void qmp_x_blockdev_amend(const char *job_id,
>  s->bs = bs,
>  s->opts = QAPI_CLONE(BlockdevAmendOptions, options),
>  s->force = has_force ? force : false;
> +
> +if (blockdev_amend_pre_run(s, errp)) {
> +return;
> +}
> +
>  job_start(&s->common);
>  }
> diff --git a/block/crypto.c b/block/crypto.c
> index c8ba4681e2..59f768ea8d 100644
> --- a/block/crypto.c
> +++ b/block/crypto.c
> @@ -777,6 +777,31 @@ block_crypto_get_specific_info_luks(BlockDriverState 
> *bs, Error **errp)
>  return spec_info;
>  }
>  
> +static int
> +block_crypto_amend_prepare(BlockDriverState *bs, Error **errp)
> +{
> +BlockCrypto *crypto = bs->opaque;
> +
> +/* apply for exclusive read/write permissions to the underlying file*/

Missing space before the end of the comment.

> +crypto->updating_keys = true;
> +return bdrv_child_refresh_perms(bs, bs->file, errp);
> +}
> +
> +static void
> +block_crypto_amend_cleanup(BlockDriverState *bs)
> +{
> +BlockCrypto *crypto = bs->opaque;
> +Error *errp = NULL;
> +
> +/* release exclusive read/write permissions to the underlying file*/

And here.

I can fix this up while applying.

Kevin




Re: [PATCH] aio-posix: fix spurious ->poll_ready() callbacks in main loop

2022-03-01 Thread Stefan Hajnoczi
On Wed, Feb 23, 2022 at 03:57:03PM +, Stefan Hajnoczi wrote:
> When ->poll() succeeds the AioHandler is placed on the ready list with
> revents set to the magic value 0. This magic value causes
> aio_dispatch_handler() to invoke ->poll_ready() instead of ->io_read()
> for G_IO_IN or ->io_write() for G_IO_OUT.
> 
> This magic value 0 hack works for the IOThread where AioHandlers are
> placed on ->ready_list and processed by aio_dispatch_ready_handlers().
> It does not work for the main loop where all AioHandlers are processed
> by aio_dispatch_handlers(), even those that are not ready and have a
> revents value of 0.
> 
> As a result the main loop invokes ->poll_ready() on AioHandlers that are
> not ready. These spurious ->poll_ready() calls waste CPU cycles and
> could lead to crashes if the code assumes ->poll() must have succeeded
> before ->poll_ready() is called (a reasonable assumption, but I haven't
> seen it in practice).
> 
> Stop using revents to track whether ->poll_ready() will be called on an
> AioHandler. Introduce a separate AioHandler->poll_ready field instead.
> This eliminates spurious ->poll_ready() calls in the main loop.
> 
> Fixes: 826cc32423db2a99d184dbf4f507c737d7e7a4ae ("aio-posix: split poll check 
> from ready handler")
> Signed-off-by: Stefan Hajnoczi 
> ---
>  util/aio-posix.h |  1 +
>  util/aio-posix.c | 32 ++--
>  2 files changed, 19 insertions(+), 14 deletions(-)

Applied to my block tree:
https://gitlab.com/stefanha/qemu/commits/block

Stefan




Re: [PATCH 1/3] util & iothread: Introduce event-loop abstract class

2022-03-01 Thread Stefan Hajnoczi
On Mon, Feb 28, 2022 at 08:05:52PM +0100, Nicolas Saenz Julienne wrote:
> On Thu, 2022-02-24 at 09:48 +, Stefan Hajnoczi wrote:
> > On Mon, Feb 21, 2022 at 06:08:43PM +0100, Nicolas Saenz Julienne wrote:
> > > diff --git a/qom/meson.build b/qom/meson.build
> > > index 062a3789d8..c20e5dd1cb 100644
> > > --- a/qom/meson.build
> > > +++ b/qom/meson.build
> > > @@ -4,6 +4,7 @@ qom_ss.add(files(
> > >'object.c',
> > >'object_interfaces.c',
> > >'qom-qobject.c',
> > > +  '../util/event-loop.c',
> > 
> > This looks strange. I expected util/event-loop.c to be in
> > util/meson.build and added to the util_ss SourceSet instead of qom_ss.
> > 
> > What is the reason for this?
> 
> Sorry I meant to move it into the qom directory while cleaning up the series
> but forgot about it.
> 
> That said, I can see how moving 'event-loop-backend' in qom_ss isn't the
> cleanest.

Yes, qom/ is meant for the QEMU Object Model infrastructure itself, not
for all the QOM classes that rely on it.

> So I tried moving it into util_ss, but for some reason nobody is calling
> 'type_init(event_loop_register_type)'. My guess is there's some compilation
> quirk I'm missing.

Maybe the issue is that libqemuutil.a (util_ss) object files are linked
on demand. If there are no symbol dependencies in the main QEMU code to
event-loop.o then it won't be linked into the executable. That may be
why event_loop_register_type() isn't being called (it's set up by an
__attribute__((constructor)) function in event-loop.o so it doesn't help
create a symbol dependency).
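
For reference, the registration pattern being discussed is roughly the
following (the type name and TypeInfo contents are assumptions):

    static const TypeInfo event_loop_info = {
        .name = "event-loop-backend",   /* hypothetical */
        .parent = TYPE_OBJECT,
        .abstract = true,
    };

    static void event_loop_register_type(void)
    {
        type_register_static(&event_loop_info);
    }

    /*
     * Expands to an __attribute__((constructor)) hook that queues the
     * registration; it creates no symbol the rest of QEMU refers to,
     * so nothing forces the object file out of libqemuutil.a.
     */
    type_init(event_loop_register_type);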

> Any suggestions? I wonder if util_ss is the right spot for 
> 'event-loop-backend'
> anyway, but I don't have a better idea.

What Paolo suggested sounds good: move event-loop.c next to iothread.c
in the top-level source directory.

Stefan

