date:20200512

[PATCH v3 1/3] target: Remove unnecessary CPU() cast

2020-05-12 Thread Philippe Mathieu-Daudé

The CPU() macro is defined as:

  #define CPU(obj) ((CPUState *)(obj))

which expands to:

  ((CPUState *)object_dynamic_cast_assert((Object *)(obj), (name),
  __FILE__, __LINE__, __func__))

This assertion can only fail when @obj points to something other
than its stated type, i.e. when we're in undefined behavior country.

Remove the unnecessary CPU() casts when we already know the pointer
is of CPUState type.

Patch created mechanically using spatch with this script:

  @@
  typedef CPUState;
  CPUState *s;
  @@
  -   CPU(s)
  +   s

Acked-by: David Gibson 
Reviewed-by: Cédric Le Goater 
Reviewed-by: Richard Henderson 
Reviewed-by: Markus Armbruster 
Signed-off-by: Philippe Mathieu-Daudé 
---
 target/ppc/mmu_helper.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/target/ppc/mmu_helper.c b/target/ppc/mmu_helper.c
index 86c667b094..8972714775 100644
--- a/target/ppc/mmu_helper.c
+++ b/target/ppc/mmu_helper.c
@@ -1820,7 +1820,7 @@ static inline void do_invalidate_BAT(CPUPPCState *env, 
target_ulong BATu,
 if (((end - base) >> TARGET_PAGE_BITS) > 1024) {
 /* Flushing 1024 4K pages is slower than a complete flush */
 LOG_BATS("Flush all BATs\n");
-tlb_flush(CPU(cs));
+tlb_flush(cs);
 LOG_BATS("Flush done\n");
 return;
 }
-- 
2.21.3

[PATCH v3 0/3] various: Remove unnecessary casts

2020-05-12 Thread Philippe Mathieu-Daudé

Remove unnecessary casts using coccinelle scripts.

The CPU()/OBJECT() patches don't introduce logical change,
The DEVICE() one removes various OBJECT_CHECK() calls.

Since v3:
- Fixed patch #2 description (Markus)
- Add A-b/R-b tags

Since v2:
- Reword description (Markus)
- Add A-b/R-b tags

Philippe Mathieu-Daudé (3):
  target: Remove unnecessary CPU() cast
  various: Remove unnecessary OBJECT() cast
  hw: Remove unnecessary DEVICE() cast

 hw/core/bus.c   | 2 +-
 hw/display/artist.c | 2 +-
 hw/display/cg3.c| 2 +-
 hw/display/sm501.c  | 2 +-
 hw/display/tcx.c| 4 ++--
 hw/display/vga-isa.c| 2 +-
 hw/i2c/imx_i2c.c| 2 +-
 hw/i2c/mpc_i2c.c| 2 +-
 hw/ide/ahci-allwinner.c | 2 +-
 hw/ide/piix.c   | 2 +-
 hw/ipmi/smbus_ipmi.c| 2 +-
 hw/microblaze/petalogix_ml605_mmu.c | 8 
 hw/misc/macio/pmu.c | 2 +-
 hw/net/ftgmac100.c  | 3 +--
 hw/net/imx_fec.c| 2 +-
 hw/nubus/nubus-device.c | 2 +-
 hw/pci-host/bonito.c| 2 +-
 hw/ppc/spapr.c  | 2 +-
 hw/s390x/sclp.c | 2 +-
 hw/sh4/sh_pci.c | 2 +-
 hw/xen/xen-legacy-backend.c | 2 +-
 monitor/misc.c  | 3 +--
 qom/object.c| 4 ++--
 target/ppc/mmu_helper.c | 2 +-
 24 files changed, 29 insertions(+), 31 deletions(-)

-- 
2.21.3

[PATCH v3 2/3] various: Remove unnecessary OBJECT() cast

2020-05-12 Thread Philippe Mathieu-Daudé

The OBJECT() macro is defined as:

  #define OBJECT(obj) ((Object *)(obj))

Remove the unnecessary OBJECT() casts when we already know the
pointer is of Object type.

Patch created mechanically using spatch with this script:

  @@
  typedef Object;
  Object *o;
  @@
  -   OBJECT(o)
  +   o

Acked-by: Cornelia Huck 
Acked-by: Corey Minyard 
Acked-by: John Snow 
Reviewed-by: Richard Henderson 
Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/core/bus.c   | 2 +-
 hw/ide/ahci-allwinner.c | 2 +-
 hw/ipmi/smbus_ipmi.c| 2 +-
 hw/microblaze/petalogix_ml605_mmu.c | 8 
 hw/s390x/sclp.c | 2 +-
 monitor/misc.c  | 3 +--
 qom/object.c| 4 ++--
 7 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/hw/core/bus.c b/hw/core/bus.c
index 3dc0a825f0..4ea5870de8 100644
--- a/hw/core/bus.c
+++ b/hw/core/bus.c
@@ -25,7 +25,7 @@
 
 void qbus_set_hotplug_handler(BusState *bus, Object *handler, Error **errp)
 {
-object_property_set_link(OBJECT(bus), OBJECT(handler),
+object_property_set_link(OBJECT(bus), handler,
  QDEV_HOTPLUG_HANDLER_PROPERTY, errp);
 }
 
diff --git a/hw/ide/ahci-allwinner.c b/hw/ide/ahci-allwinner.c
index bb8393d2b6..8536b9eb5a 100644
--- a/hw/ide/ahci-allwinner.c
+++ b/hw/ide/ahci-allwinner.c
@@ -90,7 +90,7 @@ static void allwinner_ahci_init(Object *obj)
 SysbusAHCIState *s = SYSBUS_AHCI(obj);
 AllwinnerAHCIState *a = ALLWINNER_AHCI(obj);
 
-memory_region_init_io(&a->mmio, OBJECT(obj), &allwinner_ahci_mem_ops, a,
+memory_region_init_io(&a->mmio, obj, &allwinner_ahci_mem_ops, a,
   "allwinner-ahci", ALLWINNER_AHCI_MMIO_SIZE);
 memory_region_add_subregion(&s->ahci.mem, ALLWINNER_AHCI_MMIO_OFF,
 &a->mmio);
diff --git a/hw/ipmi/smbus_ipmi.c b/hw/ipmi/smbus_ipmi.c
index 2a9470d9df..f1a0148755 100644
--- a/hw/ipmi/smbus_ipmi.c
+++ b/hw/ipmi/smbus_ipmi.c
@@ -329,7 +329,7 @@ static void smbus_ipmi_init(Object *obj)
 {
 SMBusIPMIDevice *sid = SMBUS_IPMI(obj);
 
-ipmi_bmc_find_and_link(OBJECT(obj), (Object **) &sid->bmc);
+ipmi_bmc_find_and_link(obj, (Object **) &sid->bmc);
 }
 
 static void smbus_ipmi_get_fwinfo(struct IPMIInterface *ii, IPMIFwInfo *info)
diff --git a/hw/microblaze/petalogix_ml605_mmu.c 
b/hw/microblaze/petalogix_ml605_mmu.c
index 0a2640c40b..52dcea9abd 100644
--- a/hw/microblaze/petalogix_ml605_mmu.c
+++ b/hw/microblaze/petalogix_ml605_mmu.c
@@ -150,9 +150,9 @@ petalogix_ml605_init(MachineState *machine)
 qdev_set_nic_properties(eth0, &nd_table[0]);
 qdev_prop_set_uint32(eth0, "rxmem", 0x1000);
 qdev_prop_set_uint32(eth0, "txmem", 0x1000);
-object_property_set_link(OBJECT(eth0), OBJECT(ds),
+object_property_set_link(OBJECT(eth0), ds,
  "axistream-connected", &error_abort);
-object_property_set_link(OBJECT(eth0), OBJECT(cs),
+object_property_set_link(OBJECT(eth0), cs,
  "axistream-control-connected", &error_abort);
 qdev_init_nofail(eth0);
 sysbus_mmio_map(SYS_BUS_DEVICE(eth0), 0, AXIENET_BASEADDR);
@@ -163,9 +163,9 @@ petalogix_ml605_init(MachineState *machine)
 cs = object_property_get_link(OBJECT(eth0),
   "axistream-control-connected-target", NULL);
 qdev_prop_set_uint32(dma, "freqhz", 100 * 100);
-object_property_set_link(OBJECT(dma), OBJECT(ds),
+object_property_set_link(OBJECT(dma), ds,
  "axistream-connected", &error_abort);
-object_property_set_link(OBJECT(dma), OBJECT(cs),
+object_property_set_link(OBJECT(dma), cs,
  "axistream-control-connected", &error_abort);
 qdev_init_nofail(dma);
 sysbus_mmio_map(SYS_BUS_DEVICE(dma), 0, AXIDMA_BASEADDR);
diff --git a/hw/s390x/sclp.c b/hw/s390x/sclp.c
index ede056b3ef..4132286db7 100644
--- a/hw/s390x/sclp.c
+++ b/hw/s390x/sclp.c
@@ -322,7 +322,7 @@ void s390_sclp_init(void)
 
 object_property_add_child(qdev_get_machine(), TYPE_SCLP, new,
   NULL);
-object_unref(OBJECT(new));
+object_unref(new);
 qdev_init_nofail(DEVICE(new));
 }
 
diff --git a/monitor/misc.c b/monitor/misc.c
index 9723b466cd..f5207cd242 100644
--- a/monitor/misc.c
+++ b/monitor/misc.c
@@ -1837,8 +1837,7 @@ void object_add_completion(ReadLineState *rs, int 
nb_args, const char *str)
 static int qdev_add_hotpluggable_device(Object *obj, void *opaque)
 {
 GSList **list = opaque;
-DeviceState *dev = (DeviceState *)object_dynamic_cast(OBJECT(obj),
-  TYPE_DEVICE);
+DeviceState *dev = (DeviceState *)object_dynamic_cast(obj, TYPE_DEVICE);
 
 if (dev == NULL) {
 return 0;
diff --git a/qom/object.c b/qom/object.c
index be700e831f..07c1443d0e 100644
--- a/qom/object.c
+++ b/qom/object.c
@@ -762,7 +762,7 @@ Object *ob

[PATCH v3 3/3] hw: Remove unnecessary DEVICE() cast

2020-05-12 Thread Philippe Mathieu-Daudé

The DEVICE() macro is defined as:

  #define DEVICE(obj) OBJECT_CHECK(DeviceState, (obj), TYPE_DEVICE)

which expands to:

  ((DeviceState *)object_dynamic_cast_assert((Object *)(obj), (name),
 __FILE__, __LINE__,
 __func__))

This assertion can only fail when @obj points to something other
than its stated type, i.e. when we're in undefined behavior country.

Remove the unnecessary DEVICE() casts when we already know the
pointer is of DeviceState type.

Patch created mechanically using spatch with this script:

  @@
  typedef DeviceState;
  DeviceState *s;
  @@
  -   DEVICE(s)
  +   s

Acked-by: David Gibson 
Acked-by: Paul Durrant 
Reviewed-by: Markus Armbruster 
Reviewed-by: Cédric Le Goater 
Acked-by: John Snow 
Reviewed-by: Richard Henderson 
Reviewed-by: Markus Armbruster 
Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/display/artist.c | 2 +-
 hw/display/cg3.c| 2 +-
 hw/display/sm501.c  | 2 +-
 hw/display/tcx.c| 4 ++--
 hw/display/vga-isa.c| 2 +-
 hw/i2c/imx_i2c.c| 2 +-
 hw/i2c/mpc_i2c.c| 2 +-
 hw/ide/piix.c   | 2 +-
 hw/misc/macio/pmu.c | 2 +-
 hw/net/ftgmac100.c  | 3 +--
 hw/net/imx_fec.c| 2 +-
 hw/nubus/nubus-device.c | 2 +-
 hw/pci-host/bonito.c| 2 +-
 hw/ppc/spapr.c  | 2 +-
 hw/sh4/sh_pci.c | 2 +-
 hw/xen/xen-legacy-backend.c | 2 +-
 16 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/hw/display/artist.c b/hw/display/artist.c
index 753dbb9a77..7e2a4556bd 100644
--- a/hw/display/artist.c
+++ b/hw/display/artist.c
@@ -1353,7 +1353,7 @@ static void artist_realizefn(DeviceState *dev, Error 
**errp)
 s->cursor_height = 32;
 s->cursor_width = 32;
 
-s->con = graphic_console_init(DEVICE(dev), 0, &artist_ops, s);
+s->con = graphic_console_init(dev, 0, &artist_ops, s);
 qemu_console_resize(s->con, s->width, s->height);
 }
 
diff --git a/hw/display/cg3.c b/hw/display/cg3.c
index a1ede10394..f7f1c199ce 100644
--- a/hw/display/cg3.c
+++ b/hw/display/cg3.c
@@ -321,7 +321,7 @@ static void cg3_realizefn(DeviceState *dev, Error **errp)
 
 sysbus_init_irq(sbd, &s->irq);
 
-s->con = graphic_console_init(DEVICE(dev), 0, &cg3_ops, s);
+s->con = graphic_console_init(dev, 0, &cg3_ops, s);
 qemu_console_resize(s->con, s->width, s->height);
 }
 
diff --git a/hw/display/sm501.c b/hw/display/sm501.c
index de0ab9d977..2a564889bd 100644
--- a/hw/display/sm501.c
+++ b/hw/display/sm501.c
@@ -1839,7 +1839,7 @@ static void sm501_init(SM501State *s, DeviceState *dev,
 &s->twoD_engine_region);
 
 /* create qemu graphic console */
-s->con = graphic_console_init(DEVICE(dev), 0, &sm501_ops, s);
+s->con = graphic_console_init(dev, 0, &sm501_ops, s);
 }
 
 static const VMStateDescription vmstate_sm501_state = {
diff --git a/hw/display/tcx.c b/hw/display/tcx.c
index 76de16e8ea..1fb45b1aab 100644
--- a/hw/display/tcx.c
+++ b/hw/display/tcx.c
@@ -868,9 +868,9 @@ static void tcx_realizefn(DeviceState *dev, Error **errp)
 sysbus_init_irq(sbd, &s->irq);
 
 if (s->depth == 8) {
-s->con = graphic_console_init(DEVICE(dev), 0, &tcx_ops, s);
+s->con = graphic_console_init(dev, 0, &tcx_ops, s);
 } else {
-s->con = graphic_console_init(DEVICE(dev), 0, &tcx24_ops, s);
+s->con = graphic_console_init(dev, 0, &tcx24_ops, s);
 }
 s->thcmisc = 0;
 
diff --git a/hw/display/vga-isa.c b/hw/display/vga-isa.c
index 0633ed382c..3aaeeeca1e 100644
--- a/hw/display/vga-isa.c
+++ b/hw/display/vga-isa.c
@@ -74,7 +74,7 @@ static void vga_isa_realizefn(DeviceState *dev, Error **errp)
 0x000a,
 vga_io_memory, 1);
 memory_region_set_coalescing(vga_io_memory);
-s->con = graphic_console_init(DEVICE(dev), 0, s->hw_ops, s);
+s->con = graphic_console_init(dev, 0, s->hw_ops, s);
 
 memory_region_add_subregion(isa_address_space(isadev),
 VBE_DISPI_LFB_PHYSICAL_ADDRESS,
diff --git a/hw/i2c/imx_i2c.c b/hw/i2c/imx_i2c.c
index 30b9aea247..2e02e1c4fa 100644
--- a/hw/i2c/imx_i2c.c
+++ b/hw/i2c/imx_i2c.c
@@ -305,7 +305,7 @@ static void imx_i2c_realize(DeviceState *dev, Error **errp)
   IMX_I2C_MEM_SIZE);
 sysbus_init_mmio(SYS_BUS_DEVICE(dev), &s->iomem);
 sysbus_init_irq(SYS_BUS_DEVICE(dev), &s->irq);
-s->bus = i2c_init_bus(DEVICE(dev), NULL);
+s->bus = i2c_init_bus(dev, NULL);
 }
 
 static void imx_i2c_class_init(ObjectClass *klass, void *data)
diff --git a/hw/i2c/mpc_i2c.c b/hw/i2c/mpc_i2c.c
index 0aa1be3ce7..9a724f3a3e 100644
--- a/hw/i2c/mpc_i2c.c
+++ b/hw/i2c/mpc_i2c.c
@@ -332,7 +332,7 @@ static void mpc_i2c_realize(DeviceState *dev, Error **errp)
 memory_region_init_io(&i2c->iomem, OBJECT(i2c), &i2c_ops, i2c,

[PATCH v3 1/2] scripts/qemugdb: Remove shebang header

2020-05-12 Thread Philippe Mathieu-Daudé

These scripts are loaded as plugin by GDB (and they don't
have any __main__ entry point). Remove the shebang header.

Acked-by: Alex Bennée 
Signed-off-by: Philippe Mathieu-Daudé 
---
 scripts/qemugdb/__init__.py  | 3 +--
 scripts/qemugdb/aio.py   | 3 +--
 scripts/qemugdb/coroutine.py | 3 +--
 scripts/qemugdb/mtree.py | 4 +---
 scripts/qemugdb/tcg.py   | 1 -
 5 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/scripts/qemugdb/__init__.py b/scripts/qemugdb/__init__.py
index 969f552b26..da8ff612e5 100644
--- a/scripts/qemugdb/__init__.py
+++ b/scripts/qemugdb/__init__.py
@@ -1,5 +1,4 @@
-#!/usr/bin/python
-
+#
 # GDB debugging support
 #
 # Copyright (c) 2015 Linaro Ltd
diff --git a/scripts/qemugdb/aio.py b/scripts/qemugdb/aio.py
index 2ba00c..d7c1ba0c28 100644
--- a/scripts/qemugdb/aio.py
+++ b/scripts/qemugdb/aio.py
@@ -1,5 +1,4 @@
-#!/usr/bin/python
-
+#
 # GDB debugging support: aio/iohandler debug
 #
 # Copyright (c) 2015 Red Hat, Inc.
diff --git a/scripts/qemugdb/coroutine.py b/scripts/qemugdb/coroutine.py
index 41e079d0e2..db61389022 100644
--- a/scripts/qemugdb/coroutine.py
+++ b/scripts/qemugdb/coroutine.py
@@ -1,5 +1,4 @@
-#!/usr/bin/python
-
+#
 # GDB debugging support
 #
 # Copyright 2012 Red Hat, Inc. and/or its affiliates
diff --git a/scripts/qemugdb/mtree.py b/scripts/qemugdb/mtree.py
index 3030a60d3f..8fe42c3c12 100644
--- a/scripts/qemugdb/mtree.py
+++ b/scripts/qemugdb/mtree.py
@@ -1,5 +1,4 @@
-#!/usr/bin/python
-
+#
 # GDB debugging support
 #
 # Copyright 2012 Red Hat, Inc. and/or its affiliates
@@ -84,4 +83,3 @@ def print_item(self, ptr, offset = gdb.Value(0), level = 0):
 while not isnull(subregion):
 self.print_item(subregion, addr, level)
 subregion = subregion['subregions_link']['tqe_next']
-
diff --git a/scripts/qemugdb/tcg.py b/scripts/qemugdb/tcg.py
index 18880fc9a7..16c03c06a9 100644
--- a/scripts/qemugdb/tcg.py
+++ b/scripts/qemugdb/tcg.py
@@ -1,4 +1,3 @@
-#!/usr/bin/python
 # -*- coding: utf-8 -*-
 #
 # GDB debugging support, TCG status
-- 
2.21.3

[PATCH v3 0/2] scripts: More Python fixes

2020-05-12 Thread Philippe Mathieu-Daudé

Trivial Python3 fixes, again...

Since v2:
- Remove patch updating MAINTAINERS

Since v1:
- Added Alex Bennée A-b tags
- Addressed John Snow review comments
  - Use /usr/bin/env
  - Do not modify os.path (dropped last patch)

Philippe Mathieu-Daudé (2):
  scripts/qemugdb: Remove shebang header
  scripts/qmp: Use Python 3 interpreter

 scripts/qemugdb/__init__.py  | 3 +--
 scripts/qemugdb/aio.py   | 3 +--
 scripts/qemugdb/coroutine.py | 3 +--
 scripts/qemugdb/mtree.py | 4 +---
 scripts/qemugdb/tcg.py   | 1 -
 scripts/qmp/qom-get  | 2 +-
 scripts/qmp/qom-list | 2 +-
 scripts/qmp/qom-set  | 2 +-
 scripts/qmp/qom-tree | 2 +-
 9 files changed, 8 insertions(+), 14 deletions(-)

-- 
2.21.3

[PATCH v3 2/2] scripts/qmp: Use Python 3 interpreter

2020-05-12 Thread Philippe Mathieu-Daudé

Signed-off-by: Philippe Mathieu-Daudé 
---
 scripts/qmp/qom-get  | 2 +-
 scripts/qmp/qom-list | 2 +-
 scripts/qmp/qom-set  | 2 +-
 scripts/qmp/qom-tree | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/scripts/qmp/qom-get b/scripts/qmp/qom-get
index 007b4cd442..7c5ede91bb 100755
--- a/scripts/qmp/qom-get
+++ b/scripts/qmp/qom-get
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python3
 ##
 # QEMU Object Model test tools
 #
diff --git a/scripts/qmp/qom-list b/scripts/qmp/qom-list
index 03bda3446b..bb68fd65d4 100755
--- a/scripts/qmp/qom-list
+++ b/scripts/qmp/qom-list
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python3
 ##
 # QEMU Object Model test tools
 #
diff --git a/scripts/qmp/qom-set b/scripts/qmp/qom-set
index c37fe78b00..19881d85e9 100755
--- a/scripts/qmp/qom-set
+++ b/scripts/qmp/qom-set
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python3
 ##
 # QEMU Object Model test tools
 #
diff --git a/scripts/qmp/qom-tree b/scripts/qmp/qom-tree
index 1c8acf61e7..fa91147a03 100755
--- a/scripts/qmp/qom-tree
+++ b/scripts/qmp/qom-tree
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python3
 ##
 # QEMU Object Model test tools
 #
-- 
2.21.3

[PATCH v2 1/3] hw/ide/ahci: Use qdev gpio rather than qemu_allocate_irqs()

2020-05-12 Thread Philippe Mathieu-Daudé

When plugging/unplugging devices on a AHCI bus, we might
leak the memory returned by qemu_allocate_irqs(). We can
avoid this leak by switching to using qdev_init_gpio_in();
the base class finalize will free the irqs that this allocates
under the hood.

Patch created mechanically using spatch with this script
inspired from commit d6ef883d9d7:

  @@
  typedef qemu_irq;
  identifier irqs, handler;
  expression opaque, count, i;
  @@
  -   qemu_irq *irqs;
  ...
  -   irqs = qemu_allocate_irqs(handler, opaque, count);
  +   qdev_init_gpio_in(DEVICE(opaque), handler, count);
  <+...
  -   irqs[i]
  +   qdev_get_gpio_in(DEVICE(opaque), i)
  ...+>
  ?-  g_free(irqs);

Inspired-by: Peter Maydell 
Reviewed-by: Alistair Francis 
Reviewed-by: John Snow 
Acked-by: John Snow 
Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/ide/ahci.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/hw/ide/ahci.c b/hw/ide/ahci.c
index 13d91e109a..991bb7c9c8 100644
--- a/hw/ide/ahci.c
+++ b/hw/ide/ahci.c
@@ -1534,19 +1534,18 @@ void ahci_init(AHCIState *s, DeviceState *qdev)
 
 void ahci_realize(AHCIState *s, DeviceState *qdev, AddressSpace *as, int ports)
 {
-qemu_irq *irqs;
 int i;
 
 s->as = as;
 s->ports = ports;
 s->dev = g_new0(AHCIDevice, ports);
 ahci_reg_init(s);
-irqs = qemu_allocate_irqs(ahci_irq_set, s, s->ports);
+qdev_init_gpio_in(qdev, ahci_irq_set, s->ports);
 for (i = 0; i < s->ports; i++) {
 AHCIDevice *ad = &s->dev[i];
 
 ide_bus_new(&ad->port, sizeof(ad->port), qdev, i, 1);
-ide_init2(&ad->port, irqs[i]);
+ide_init2(&ad->port, qdev_get_gpio_in(qdev, i));
 
 ad->hba = s;
 ad->port_no = i;
@@ -1554,7 +1553,6 @@ void ahci_realize(AHCIState *s, DeviceState *qdev, 
AddressSpace *as, int ports)
 ad->port.dma->ops = &ahci_dma_ops;
 ide_register_restart_cb(&ad->port);
 }
-g_free(irqs);
 }
 
 void ahci_uninit(AHCIState *s)
-- 
2.21.3

[PATCH v2 0/3] hw: Use qdev gpio rather than qemu_allocate_irqs()

2020-05-12 Thread Philippe Mathieu-Daudé

Use a coccinelle script to convert few qemu_allocate_irqs()
calls to the qdev gpio API.

One memory leak removed in hw/openrisc/pic_cpu.c

Since v1:
- Referrenced Coverity CID (Stafford)
- Reword AHCI description (Zoltan)

Philippe Mathieu-Daudé (3):
  hw/ide/ahci: Use qdev gpio rather than qemu_allocate_irqs()
  hw/mips/mips_int: Use qdev gpio rather than qemu_allocate_irqs()
  hw/openrisc/pic_cpu: Use qdev gpio rather than qemu_allocate_irqs()

 hw/ide/ahci.c | 6 ++
 hw/mips/mips_int.c| 6 ++
 hw/openrisc/pic_cpu.c | 5 ++---
 3 files changed, 6 insertions(+), 11 deletions(-)

-- 
2.21.3

[PATCH v2 3/3] hw/openrisc/pic_cpu: Use qdev gpio rather than qemu_allocate_irqs()

2020-05-12 Thread Philippe Mathieu-Daudé

Coverity points out (CID 1421934) that we are leaking the
memory returned by qemu_allocate_irqs(). We can avoid this
leak by switching to using qdev_init_gpio_in(); the base
class finalize will free the irqs that this allocates under
the hood.

Patch created mechanically using spatch with this script
inspired from commit d6ef883d9d7:

  @@
  typedef qemu_irq;
  identifier irqs, handler;
  expression opaque, count, i;
  @@
  -   qemu_irq *irqs;
  ...
  -   irqs = qemu_allocate_irqs(handler, opaque, count);
  +   qdev_init_gpio_in(DEVICE(opaque), handler, count);
  <+...
  -   irqs[i]
  +   qdev_get_gpio_in(DEVICE(opaque), i)
  ...+>
  ?-  g_free(irqs);

Fixes: Coverity CID 1421934 RESOURCE_LEAK
Inspired-by: Peter Maydell 
Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/openrisc/pic_cpu.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/hw/openrisc/pic_cpu.c b/hw/openrisc/pic_cpu.c
index 36f9350830..4b0c92f842 100644
--- a/hw/openrisc/pic_cpu.c
+++ b/hw/openrisc/pic_cpu.c
@@ -52,10 +52,9 @@ static void openrisc_pic_cpu_handler(void *opaque, int irq, 
int level)
 void cpu_openrisc_pic_init(OpenRISCCPU *cpu)
 {
 int i;
-qemu_irq *qi;
-qi = qemu_allocate_irqs(openrisc_pic_cpu_handler, cpu, NR_IRQS);
+qdev_init_gpio_in(DEVICE(cpu), openrisc_pic_cpu_handler, NR_IRQS);
 
 for (i = 0; i < NR_IRQS; i++) {
-cpu->env.irq[i] = qi[i];
+cpu->env.irq[i] = qdev_get_gpio_in(DEVICE(cpu), i);
 }
 }
-- 
2.21.3

[PATCH v2 2/3] hw/mips/mips_int: Use qdev gpio rather than qemu_allocate_irqs()

2020-05-12 Thread Philippe Mathieu-Daudé

Switch to using the qdev gpio API which is preferred over
qemu_allocate_irqs(). One step to eventually deprecate and
remove qemu_allocate_irqs() one day.

Patch created mechanically using spatch with this script
inspired from commit d6ef883d9d7:

  @@
  typedef qemu_irq;
  identifier irqs, handler;
  expression opaque, count, i;
  @@
  -   qemu_irq *irqs;
  ...
  -   irqs = qemu_allocate_irqs(handler, opaque, count);
  +   qdev_init_gpio_in(DEVICE(opaque), handler, count);
  <+...
  -   irqs[i]
  +   qdev_get_gpio_in(DEVICE(opaque), i)
  ...+>
  ?-  g_free(irqs);

Fixes: Coverity CID 1421934 RESOURCE_LEAK
Inspired-by: Peter Maydell 
Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/mips/mips_int.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/hw/mips/mips_int.c b/hw/mips/mips_int.c
index 796730b11d..91788c51a9 100644
--- a/hw/mips/mips_int.c
+++ b/hw/mips/mips_int.c
@@ -74,14 +74,12 @@ static void cpu_mips_irq_request(void *opaque, int irq, int 
level)
 void cpu_mips_irq_init_cpu(MIPSCPU *cpu)
 {
 CPUMIPSState *env = &cpu->env;
-qemu_irq *qi;
 int i;
 
-qi = qemu_allocate_irqs(cpu_mips_irq_request, cpu, 8);
+qdev_init_gpio_in(DEVICE(cpu), cpu_mips_irq_request, 8);
 for (i = 0; i < 8; i++) {
-env->irq[i] = qi[i];
+env->irq[i] = qdev_get_gpio_in(DEVICE(cpu), i);
 }
-g_free(qi);
 }
 
 void cpu_mips_soft_irq(CPUMIPSState *env, int irq, int level)
-- 
2.21.3

Re: [PATCH 2/5] io/channel.c,io/channel-socket.c: Add yank feature

2020-05-12 Thread Daniel P . Berrangé

On Mon, May 11, 2020 at 10:00:42PM +0200, Lukas Straub wrote:
> On Mon, 11 May 2020 12:51:46 +0100
> Daniel P. Berrangé  wrote:
> 
> > On Mon, May 11, 2020 at 01:14:41PM +0200, Lukas Straub wrote:
> > > Add qio_channel_set_yank function to channel and to channel-socket,
> > > which will register a yank function. The yank function calls
> > > shutdown() on the socket.
> > > 
> > > Signed-off-by: Lukas Straub 
> > > ---
> > >  Makefile.objs   |  1 +
> > >  include/io/channel-socket.h |  1 +
> > >  include/io/channel.h| 12 
> > >  io/channel-socket.c | 29 +
> > >  io/channel.c|  9 +
> > >  5 files changed, 52 insertions(+)  
> > 
> > Assuming we want the yank feature (which I'm not entirely convinced
> > of), then I don't think any of this addition should exist. The
> > QIOChannel class already provides a "qio_channel_shutdown" method
> > which can be invoked. The layer above which is using the QIOChannel
> > should be calling this existing qio_channel_shutdown method in
> > response to any yank request.  The I/O layer shouldn't have any
> > direct dependancy on the yank feature.
> 
> Having the code here simplifys the code in the other places.

This introduces a depedancy from the IO channel code into the system
emulator infra which is not desired. The goal is to keep the io/ module
isolated and as self-contained as possible.

I don't think it makes a significant difference to the yank code to
keep it out of the io layer, and simply call the QIOChannel objects
via their existing public API.

Regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|

Re: [PATCH 3/5] block/nbd.c: Add yank feature

2020-05-12 Thread Daniel P . Berrangé

On Mon, May 11, 2020 at 07:05:24PM +0200, Lukas Straub wrote:
> On Mon, 11 May 2020 17:19:09 +0100
> "Dr. David Alan Gilbert"  wrote:
> 
> > * Lukas Straub (lukasstra...@web.de) wrote:
> > > Add yank option, pass it to the socket-channel and register a yank
> > > function which sets s->state = NBD_CLIENT_QUIT. This is the same
> > > behaviour as if an error occured.
> > > 
> > > Signed-off-by: Lukas Straub   
> > 
> > > +static void nbd_yank(void *opaque)
> > > +{
> > > +BlockDriverState *bs = opaque;
> > > +BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
> > > +
> > > +atomic_set(&s->state, NBD_CLIENT_QUIT);  
> > 
> > I think I was expecting a shutdown on the socket here - why doesn't it
> > have one?
> 
> For nbd, we register two yank functions: This one and we enable
> the yank feature on the qio channel (see function
> nbd_establish_connection below).

As mentioned on the earlier patch, I don't want to see any yank
code in the QIOChannel object directly. This nbd_yank function
can simply call the qio_channel_shutdown() function directly
and avoid need for modifying the QIOChannel object with yank
support.

This will make the NBD code clearer too, as we can see exactly
what actions are performed at NBD layer when a yank happens,
which is what David was not seeing clearly here.

Regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|

Re: [PATCH v3 1/2] scripts/qemugdb: Remove shebang header

2020-05-12 Thread Kevin Wolf

Am 12.05.2020 um 09:06 hat Philippe Mathieu-DaudÃ© geschrieben:
> These scripts are loaded as plugin by GDB (and they don't
> have any __main__ entry point). Remove the shebang header.
> 
> Acked-by: Alex BennÃ©e 
> Signed-off-by: Philippe Mathieu-DaudÃ© 
> ---
>  scripts/qemugdb/__init__.py  | 3 +--
>  scripts/qemugdb/aio.py   | 3 +--
>  scripts/qemugdb/coroutine.py | 3 +--
>  scripts/qemugdb/mtree.py | 4 +---
>  scripts/qemugdb/tcg.py   | 1 -

There is still a shebang line left in scripts/qemugdb/timers.py.

Kevin

Re: [PATCH 4/5] chardev/char-socket.c: Add yank feature

2020-05-12 Thread Daniel P . Berrangé

On Mon, May 11, 2020 at 01:14:47PM +0200, Lukas Straub wrote:
> Add yank option which is passed to the socket-channel.
> 
> Signed-off-by: Lukas Straub 
> ---
>  chardev/char-socket.c | 8 
>  chardev/char.c| 3 +++
>  qapi/char.json| 5 -
>  3 files changed, 15 insertions(+), 1 deletion(-)
> 
> diff --git a/chardev/char-socket.c b/chardev/char-socket.c
> index 185fe38dda..e476358941 100644
> --- a/chardev/char-socket.c
> +++ b/chardev/char-socket.c
> @@ -65,6 +65,7 @@ typedef struct {
>  int max_size;
>  int do_telnetopt;
>  int do_nodelay;
> +int do_yank;
>  int *read_msgfds;
>  size_t read_msgfds_num;
>  int *write_msgfds;
> @@ -877,6 +878,9 @@ static int tcp_chr_new_client(Chardev *chr, 
> QIOChannelSocket *sioc)
>  if (s->do_nodelay) {
>  qio_channel_set_delay(s->ioc, false);
>  }
> +if (s->do_yank) {
> +qio_channel_set_yank(s->ioc, true);
> +}

This should call yank_register_function() to  register a local
callback, which then invokes qio_channel_shutdown() upon acting,
avoiding the need to add this qio_channel_set_yank method.

Likewise for the migration code in the next patch.

Regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|

Re: [PATCH v3 2/2] scripts/qmp: Use Python 3 interpreter

2020-05-12 Thread Kevin Wolf

Am 12.05.2020 um 09:06 hat Philippe Mathieu-DaudÃ© geschrieben:
> Signed-off-by: Philippe Mathieu-DaudÃ© 

Reviewed-by: Kevin Wolf

Re: [PATCH 0/5] Introduce 'yank' oob qmp command to recover from hanging qemu

2020-05-12 Thread Daniel P . Berrangé

On Mon, May 11, 2020 at 08:12:18PM +0200, Lukas Straub wrote:
> On Mon, 11 May 2020 12:49:47 +0100
> Daniel P. Berrangé  wrote:
> 
> > On Mon, May 11, 2020 at 01:14:34PM +0200, Lukas Straub wrote:
> > > Hello Everyone,
> > > In many cases, if qemu has a network connection (qmp, migration, chardev, 
> > > etc.)
> > > to some other server and that server dies or hangs, qemu hangs too.  
> > 
> > If qemu as a whole hangs due to a stalled network connection, that is a
> > bug in QEMU that we should be fixing IMHO. QEMU should be doing non-blocking
> > I/O in general, such that if the network connection or remote server stalls,
> > we simply stop sending I/O - we shouldn't ever hang the QEMU process or main
> > loop.
> > 
> > There are places in QEMU code which are not well behaved in this respect,
> > but many are, and others are getting fixed where found to be important.
> > 
> > Arguably any place in QEMU code which can result in a hang of QEMU in the
> > event of a stalled network should be considered a security flaw, because
> > the network is untrusted in general.
> 
> The fact that out-of-band qmp commands exist at all shows that we have to 
> make tradeoffs of developer time vs. doing things right. Sure, the migration 
> code can be rewritten to use non-blocking i/o and finegrained locks. But as a 
> hobbyist I don't have time to fix this.
> 
> > > These patches introduce the new 'yank' out-of-band qmp command to recover 
> > > from
> > > these kinds of hangs. The different subsystems register callbacks which 
> > > get
> > > executed with the yank command. For example the callback can shutdown() a
> > > socket. This is intended for the colo use-case, but it can be used for 
> > > other
> > > things too of course.  
> > 
> > IIUC, invoking the "yank" command unconditionally kills every single
> > network connection in QEMU that has registered with the "yank" subsystem.
> > IMHO this is way too big of a hammer, even if we accept there are bugs in
> > QEMU not handling stalled networking well.
> > 
> > eg if a chardev hangs QEMU, and we tear down everything, killing the NBD
> > connection used for the guest disk, we needlessly break I/O.
> 
> Yeah, these patches are intended to solve the problems with the colo
> use-case where all external connections (migration, chardevs, nbd)
> are just for replication. In other use-cases you'd enable the yank
> feature only on the non-essential connections.

That is a pretty inflexible design for other use cases though,
as "non-essential" is not a black & white list in general. There
are varying levels of importance to the different channels. We
can afford to loose migration without any user visible effects.
If that doesn't solve it, a serial device chardev, or VNC connection
can be dropped at the inconvenience of loosing interactive console
which is end user visible impact, so may only be want to be yanked
if the migration yank didn't fix it. 

Regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|

Re: [PATCH 0/5] Introduce 'yank' oob qmp command to recover from hanging qemu

2020-05-12 Thread Dr. David Alan Gilbert

* Daniel P. BerrangÃ© (berra...@redhat.com) wrote:
> On Mon, May 11, 2020 at 08:12:18PM +0200, Lukas Straub wrote:
> > On Mon, 11 May 2020 12:49:47 +0100
> > Daniel P. BerrangÃ©  wrote:
> > 
> > > On Mon, May 11, 2020 at 01:14:34PM +0200, Lukas Straub wrote:
> > > > Hello Everyone,
> > > > In many cases, if qemu has a network connection (qmp, migration, 
> > > > chardev, etc.)
> > > > to some other server and that server dies or hangs, qemu hangs too.  
> > > 
> > > If qemu as a whole hangs due to a stalled network connection, that is a
> > > bug in QEMU that we should be fixing IMHO. QEMU should be doing 
> > > non-blocking
> > > I/O in general, such that if the network connection or remote server 
> > > stalls,
> > > we simply stop sending I/O - we shouldn't ever hang the QEMU process or 
> > > main
> > > loop.
> > > 
> > > There are places in QEMU code which are not well behaved in this respect,
> > > but many are, and others are getting fixed where found to be important.
> > > 
> > > Arguably any place in QEMU code which can result in a hang of QEMU in the
> > > event of a stalled network should be considered a security flaw, because
> > > the network is untrusted in general.
> > 
> > The fact that out-of-band qmp commands exist at all shows that we have to 
> > make tradeoffs of developer time vs. doing things right. Sure, the 
> > migration code can be rewritten to use non-blocking i/o and finegrained 
> > locks. But as a hobbyist I don't have time to fix this.
> > 
> > > > These patches introduce the new 'yank' out-of-band qmp command to 
> > > > recover from
> > > > these kinds of hangs. The different subsystems register callbacks which 
> > > > get
> > > > executed with the yank command. For example the callback can shutdown() 
> > > > a
> > > > socket. This is intended for the colo use-case, but it can be used for 
> > > > other
> > > > things too of course.  
> > > 
> > > IIUC, invoking the "yank" command unconditionally kills every single
> > > network connection in QEMU that has registered with the "yank" subsystem.
> > > IMHO this is way too big of a hammer, even if we accept there are bugs in
> > > QEMU not handling stalled networking well.
> > > 
> > > eg if a chardev hangs QEMU, and we tear down everything, killing the NBD
> > > connection used for the guest disk, we needlessly break I/O.
> > 
> > Yeah, these patches are intended to solve the problems with the colo
> > use-case where all external connections (migration, chardevs, nbd)
> > are just for replication. In other use-cases you'd enable the yank
> > feature only on the non-essential connections.
> 
> That is a pretty inflexible design for other use cases though,
> as "non-essential" is not a black & white list in general. There
> are varying levels of importance to the different channels. We
> can afford to loose migration without any user visible effects.
> If that doesn't solve it, a serial device chardev, or VNC connection
> can be dropped at the inconvenience of loosing interactive console
> which is end user visible impact, so may only be want to be yanked
> if the migration yank didn't fix it. 

In the case of COLO that's not the case though - here we explicitly want
to kill the migration to be able to ensure that we can recover - and
we're under time pressure to get the other member of the pair running
again. 

Dave

> Regards,
> Daniel
> -- 
> |: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
> |: https://libvirt.org -o-https://fstop138.berrange.com :|
> |: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|
--
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK

Re: [PATCH v2 4/5] vhost: check vring address before calling unmap

2020-05-12 Thread Dima Stepanov

On Tue, May 12, 2020 at 11:26:11AM +0800, Jason Wang wrote:
> 
> On 2020/5/11 下午5:11, Dima Stepanov wrote:
> >On Mon, May 11, 2020 at 11:05:58AM +0800, Jason Wang wrote:
> >>On 2020/4/30 下午9:36, Dima Stepanov wrote:
> >>>Since disconnect can happen at any time during initialization not all
> >>>vring buffers (for instance used vring) can be intialized successfully.
> >>>If the buffer was not initialized then vhost_memory_unmap call will lead
> >>>to SIGSEGV. Add checks for the vring address value before calling unmap.
> >>>Also add assert() in the vhost_memory_unmap() routine.
> >>>
> >>>Signed-off-by: Dima Stepanov 
> >>>---
> >>>  hw/virtio/vhost.c | 27 +--
> >>>  1 file changed, 21 insertions(+), 6 deletions(-)
> >>>
> >>>diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> >>>index ddbdc53..3ee50c4 100644
> >>>--- a/hw/virtio/vhost.c
> >>>+++ b/hw/virtio/vhost.c
> >>>@@ -314,6 +314,8 @@ static void vhost_memory_unmap(struct vhost_dev *dev, 
> >>>void *buffer,
> >>> hwaddr len, int is_write,
> >>> hwaddr access_len)
> >>>  {
> >>>+assert(buffer);
> >>>+
> >>>  if (!vhost_dev_has_iommu(dev)) {
> >>>  cpu_physical_memory_unmap(buffer, len, is_write, access_len);
> >>>  }
> >>>@@ -1132,12 +1134,25 @@ static void vhost_virtqueue_stop(struct vhost_dev 
> >>>*dev,
> >>>  vhost_vq_index);
> >>>  }
> >>>-vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, 
> >>>idx),
> >>>-   1, virtio_queue_get_used_size(vdev, idx));
> >>>-vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, 
> >>>idx),
> >>>-   0, virtio_queue_get_avail_size(vdev, idx));
> >>>-vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, 
> >>>idx),
> >>>-   0, virtio_queue_get_desc_size(vdev, idx));
> >>>+/*
> >>>+ * Since the vhost-user disconnect can happen during initialization
> >>>+ * check if vring was initialized, before making unmap.
> >>>+ */
> >>>+if (vq->used) {
> >>>+vhost_memory_unmap(dev, vq->used,
> >>>+   virtio_queue_get_used_size(vdev, idx),
> >>>+   1, virtio_queue_get_used_size(vdev, idx));
> >>>+}
> >>>+if (vq->avail) {
> >>>+vhost_memory_unmap(dev, vq->avail,
> >>>+   virtio_queue_get_avail_size(vdev, idx),
> >>>+   0, virtio_queue_get_avail_size(vdev, idx));
> >>>+}
> >>>+if (vq->desc) {
> >>>+vhost_memory_unmap(dev, vq->desc,
> >>>+   virtio_queue_get_desc_size(vdev, idx),
> >>>+   0, virtio_queue_get_desc_size(vdev, idx));
> >>>+}
> >>
> >>Any reason not checking hdev->started instead? vhost_dev_start() will set it
> >>to true if virtqueues were correctly mapped.
> >>
> >>Thanks
> >Well i see it a little bit different:
> >  - vhost_dev_start() sets hdev->started to true before starting
> >virtqueues
> >  - vhost_virtqueue_start() maps all the memory
> >If we hit the vhost disconnect at the start of the
> >vhost_virtqueue_start(), for instance for this call:
> >   r = dev->vhost_ops->vhost_set_vring_base(dev, &state);
> >Then we will call vhost_user_blk_disconnect:
> >   vhost_user_blk_disconnect()->
> > vhost_user_blk_stop()->
> >   vhost_dev_stop()->
> > vhost_virtqueue_stop()
> >As a result we will come in this routine with the hdev->started still
> >set to true, but if used/avail/desc fields still uninitialized and set
> >to 0.
> 
> 
> I may miss something, but consider both vhost_dev_start() and
> vhost_user_blk_disconnect() were serialized in main loop. Can this really
> happen?
Yes, consider the case when we start the vhost-user-blk device:
  vhost_dev_start->
vhost_virtqueue_start
And we got a disconnect in the middle of vhost_virtqueue_start()
routine, for instance:
  1000 vq->num = state.num = virtio_queue_get_num(vdev, idx);
  1001 r = dev->vhost_ops->vhost_set_vring_num(dev, &state);
  1002 if (r) {
  1003 VHOST_OPS_DEBUG("vhost_set_vring_num failed");
  1004 return -errno;
  1005 }
  --> Here we got a disconnect <--
  1006 
  1007 state.num = virtio_queue_get_last_avail_idx(vdev, idx);
  1008 r = dev->vhost_ops->vhost_set_vring_base(dev, &state);
  1009 if (r) {
  1010 VHOST_OPS_DEBUG("vhost_set_vring_base failed");
  1011 return -errno;
  1012 }
As a result call to vhost_set_vring_base will call the disconnect
routine. The backtrace log for SIGSEGV is as follows:
  Thread 4 "qemu-system-x86" received signal SIGSEGV, Segmentation fault.
  [Switching to Thread 0x72ea9700 (LWP 183150)]
  0x74d60840 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
  (gdb) bt
  #0  0x74d60840 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
  #1  0x55

Re: [PATCH 0/5] Introduce 'yank' oob qmp command to recover from hanging qemu

2020-05-12 Thread Dr. David Alan Gilbert

* Lukas Straub (lukasstra...@web.de) wrote:
> On Mon, 11 May 2020 12:49:47 +0100
> Daniel P. BerrangÃ©  wrote:
> 
> > On Mon, May 11, 2020 at 01:14:34PM +0200, Lukas Straub wrote:
> > > Hello Everyone,
> > > In many cases, if qemu has a network connection (qmp, migration, chardev, 
> > > etc.)
> > > to some other server and that server dies or hangs, qemu hangs too.  
> > 
> > If qemu as a whole hangs due to a stalled network connection, that is a
> > bug in QEMU that we should be fixing IMHO. QEMU should be doing non-blocking
> > I/O in general, such that if the network connection or remote server stalls,
> > we simply stop sending I/O - we shouldn't ever hang the QEMU process or main
> > loop.
> > 
> > There are places in QEMU code which are not well behaved in this respect,
> > but many are, and others are getting fixed where found to be important.
> > 
> > Arguably any place in QEMU code which can result in a hang of QEMU in the
> > event of a stalled network should be considered a security flaw, because
> > the network is untrusted in general.
> 
> The fact that out-of-band qmp commands exist at all shows that we have to 
> make tradeoffs of developer time vs. doing things right. Sure, the migration 
> code can be rewritten to use non-blocking i/o and finegrained locks. But as a 
> hobbyist I don't have time to fix this.

If it was just an hobbyist vs fulltime thing then I'd say that was a bad
way to make the decision; however the reality is that even those who are
paid to watch this code don't have the feeling we can make it fail
quickly/non-blocking - and for COLO you need to be absolutely sure you
nail every case, so you'd some how have to audit the whole lot, and keep
watching it.

(and thank you for taking your time to do this!)

Dave


> > > These patches introduce the new 'yank' out-of-band qmp command to recover 
> > > from
> > > these kinds of hangs. The different subsystems register callbacks which 
> > > get
> > > executed with the yank command. For example the callback can shutdown() a
> > > socket. This is intended for the colo use-case, but it can be used for 
> > > other
> > > things too of course.  
> > 
> > IIUC, invoking the "yank" command unconditionally kills every single
> > network connection in QEMU that has registered with the "yank" subsystem.
> > IMHO this is way too big of a hammer, even if we accept there are bugs in
> > QEMU not handling stalled networking well.
> > 
> > eg if a chardev hangs QEMU, and we tear down everything, killing the NBD
> > connection used for the guest disk, we needlessly break I/O.
> 
> Yeah, these patches are intended to solve the problems with the colo use-case 
> where all external connections (migration, chardevs, nbd) are just for 
> replication. In other use-cases you'd enable the yank feature only on the 
> non-essential connections.
> 
> > eg doing this in the chardev backend is not desirable, because the bugs
> > with hanging QEMU are typically caused by the way the frontend device
> > uses the chardev blocking I/O calls, instead of non-blocking I/O calls.
> > 
> > 
> > Regards,
> > Daniel
> 


--
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK

Re: [PATCH v2 5/5] vhost: add device started check in migration set log

2020-05-12 Thread Dima Stepanov

On Tue, May 12, 2020 at 11:47:34AM +0800, Li Feng wrote:
> Hi, Dima.
> 
> If vhost_migration_log return < 0, then vhost_log_global_start will
> trigger a crash.
> Does your patch have process this abort?
> If a disconnect happens in the migration stage, the correct operation
> is to stop the migration, right?
> 
>  841 static void vhost_log_global_start(MemoryListener *listener)
>  842 {
>  843 int r;
>  844
>  845 r = vhost_migration_log(listener, true);
>  846 if (r < 0) {
>  847 abort();
>  848 }
>  849 }
Yes, my patch process it by not returning an error ). That is one of the
point we've talked about with Raphael and Michael in this thread. First
of all in my patches i'm still following the same logic which has been
already in upstream ./hw/virtio/vhost.c:vhost_migration_log():
  ...
  820 if (!dev->started) {
  821 dev->log_enabled = enable;
  822 return 0;
  823 }
  ...
It means, that if device not started, then continue migration without
returning any error. So i followed the same logic, if we got a
disconnect, then it will mean that device isn't started and we can
continue migration. As a result no error is returned and assert() isn't
hit.
Also there is a question from Raphael to Michael about it you can find
it in this thread, by i will add it also:
  > Subject: Re: [PATCH v2 5/5] vhost: add device started check in
  > migration set log

  > On Wed, May 06, 2020 at 06:08:34PM -0400, Raphael Norwitz wrote:
  >> In particular, we need to decide whether a migration should be
  >> allowed to continue if a device disconnects durning the migration
  >> stage.
  >>
  >> mst, any thoughts?

  > Why not? It can't change state while disconnected, so it just makes
  > things easier.

So it looks like a correct way to handle it. Also our internal tests
passed. Some words about our tests:
  - run src VM with vhost-usr-blk daemon used
  - run fio inside it
  - perform reconnect every X seconds (just kill and restart
daemon), X is random
  - run dst VM
  - perform migration
  - fio should complete in dst VM
And we cycle this test like forever. At least for now we see no new
issues.

No other comments mixed in below.

> 
> Thanks,
> 
> Feng Li
> 
> Jason Wang  于2020年5月12日周二 上午11:33写道：
> >
> >
> > On 2020/5/11 下午5:25, Dima Stepanov wrote:
> > > On Mon, May 11, 2020 at 11:15:53AM +0800, Jason Wang wrote:
> > >> On 2020/4/30 下午9:36, Dima Stepanov wrote:
> > >>> If vhost-user daemon is used as a backend for the vhost device, then we
> > >>> should consider a possibility of disconnect at any moment. If such
> > >>> disconnect happened in the vhost_migration_log() routine the vhost
> > >>> device structure will be clean up.
> > >>> At the start of the vhost_migration_log() function there is a check:
> > >>>if (!dev->started) {
> > >>>dev->log_enabled = enable;
> > >>>return 0;
> > >>>}
> > >>> To be consistent with this check add the same check after calling the
> > >>> vhost_dev_set_log() routine. This in general help not to break a
> > >>> migration due the assert() message. But it looks like that this code
> > >>> should be revised to handle these errors more carefully.
> > >>>
> > >>> In case of vhost-user device backend the fail paths should consider the
> > >>> state of the device. In this case we should skip some function calls
> > >>> during rollback on the error paths, so not to get the NULL dereference
> > >>> errors.
> > >>>
> > >>> Signed-off-by: Dima Stepanov 
> > >>> ---
> > >>>   hw/virtio/vhost.c | 39 +++
> > >>>   1 file changed, 35 insertions(+), 4 deletions(-)
> > >>>
> > >>> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> > >>> index 3ee50c4..d5ab96d 100644
> > >>> --- a/hw/virtio/vhost.c
> > >>> +++ b/hw/virtio/vhost.c
> > >>> @@ -787,6 +787,17 @@ static int vhost_dev_set_features(struct vhost_dev 
> > >>> *dev,
> > >>>   static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log)
> > >>>   {
> > >>>   int r, i, idx;
> > >>> +
> > >>> +if (!dev->started) {
> > >>> +/*
> > >>> + * If vhost-user daemon is used as a backend for the
> > >>> + * device and the connection is broken, then the vhost_dev
> > >>> + * structure will be reset all its values to 0.
> > >>> + * Add additional check for the device state.
> > >>> + */
> > >>> +return -1;
> > >>> +}
> > >>> +
> > >>>   r = vhost_dev_set_features(dev, enable_log);
> > >>>   if (r < 0) {
> > >>>   goto err_features;
> > >>> @@ -801,12 +812,19 @@ static int vhost_dev_set_log(struct vhost_dev 
> > >>> *dev, bool enable_log)
> > >>>   }
> > >>>   return 0;
> > >>>   err_vq:
> > >>> -for (; i >= 0; --i) {
> > >>> +/*
> > >>> + * Disconnect with the vhost-user daemon can lead to the
> > >>> + * vhost_dev_cleanup() call which will clean up vhost_dev
> > >>> + * structure.
> > >>> + */
> > >>> +for (; dev->started &&

[PATCH] qemu-nbd: Close inherited stderr

2020-05-12 Thread Raphael Pour

Hello,

after e6df58a5, the inherited stderr 'old_stderr' won't get closed
anymore if 'fork_process' is false. This causes other processes relying
on EOF to infinitely block or crash.

From 47ab9b517038d13117876a8bb3ef45c53d7f2f9e Mon Sep 17 00:00:00 2001
From: "Raphael Pour" 
Date: Tue, 12 May 2020 10:18:44 +0200
Subject: [PATCH] qemu-nbd: Close inherited stderr

Close inherited stderr of the parent if fork_process is false.
Otherwise no one will close it. (introduced by e6df58a5)

Signed-off-by: Raphael Pour 
---
 qemu-nbd.c | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/qemu-nbd.c b/qemu-nbd.c
index 108a51f7e..f2981e18a 100644
--- a/qemu-nbd.c
+++ b/qemu-nbd.c
@@ -1032,8 +1032,15 @@ int main(int argc, char **argv)
 exit(EXIT_FAILURE);
 }

-/* ... close the descriptor we inherited and go on.  */
+/* ... close the descriptor we inherited and ...  */
 close(stderr_fd[1]);
+
+/* ... also close the old_stderr IF fork_process is false
otherwise
+ * it will never get closed.
+ */
+if (!fork_process) {
+  close(old_stderr);
+}
 } else {
 bool errors = false;
 char *buf;
-- 
2.25.4


-- 
Hetzner Online GmbH
Am Datacenter-Park 1
08223 Falkenstein/Vogtland
raphael.p...@hetzner.com
www.hetzner.com

Registergericht Ansbach, HRB 6089
Geschäftsführer: Martin Hetzner, Stephan Konvickova, Günther Müller



signature.asc
Description: OpenPGP digital signature

Re: [PATCH 0/5] Introduce 'yank' oob qmp command to recover from hanging qemu

2020-05-12 Thread Lukas Straub

On Mon, 11 May 2020 16:46:45 +0100
"Dr. David Alan Gilbert"  wrote:

> * Daniel P. BerrangÃ© (berra...@redhat.com) wrote: 
> > ...
> > That way if QEMU does get stuck, you can start by tearing down the
> > least distruptive channel. eg try tearing down the migration connection
> > first (which shouldn't negatively impact the guest), and only if that
> > doesn't work then, move on to tear down the NBD connection (which risks
> > data loss)  
> 
> I wonder if a different way would be to make all network connections
> register with yank, but then make yank take a list of connections to
> shutdown(2).

Good Idea. We could name the connections (/yank callbacks) in the form 
"nbd:", "chardev:" and "migration" (and add 
"netdev:...", etc. in the future). Then make yank take a list of connection 
names as you suggest and silently ignore connections that don't exist. And 
maybe even add a 'query-yank' oob command returning a list of registered 
connections so the management application can do pattern matching if it wants.

Comments?

Regards,
Lukas Straub

> Dave
> 
> > Regards,
> > Daniel
> > -- 
> > |: https://berrange.com  -o-https://www.flickr.com/photos/dberrange 
> > :|
> > |: https://libvirt.org -o-https://fstop138.berrange.com 
> > :|
> > |: https://entangle-photo.org-o-https://www.instagram.com/dberrange 
> > :|  
> --
> Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
> 



pgpYTKir4lhNS.pgp
Description: OpenPGP digital signature

Re: [PATCH v2 5/5] vhost: add device started check in migration set log

2020-05-12 Thread Dima Stepanov

On Tue, May 12, 2020 at 11:32:50AM +0800, Jason Wang wrote:
> 
> On 2020/5/11 下午5:25, Dima Stepanov wrote:
> >On Mon, May 11, 2020 at 11:15:53AM +0800, Jason Wang wrote:
> >>On 2020/4/30 下午9:36, Dima Stepanov wrote:
> >>>If vhost-user daemon is used as a backend for the vhost device, then we
> >>>should consider a possibility of disconnect at any moment. If such
> >>>disconnect happened in the vhost_migration_log() routine the vhost
> >>>device structure will be clean up.
> >>>At the start of the vhost_migration_log() function there is a check:
> >>>   if (!dev->started) {
> >>>   dev->log_enabled = enable;
> >>>   return 0;
> >>>   }
> >>>To be consistent with this check add the same check after calling the
> >>>vhost_dev_set_log() routine. This in general help not to break a
> >>>migration due the assert() message. But it looks like that this code
> >>>should be revised to handle these errors more carefully.
> >>>
> >>>In case of vhost-user device backend the fail paths should consider the
> >>>state of the device. In this case we should skip some function calls
> >>>during rollback on the error paths, so not to get the NULL dereference
> >>>errors.
> >>>
> >>>Signed-off-by: Dima Stepanov 
> >>>---
> >>>  hw/virtio/vhost.c | 39 +++
> >>>  1 file changed, 35 insertions(+), 4 deletions(-)
> >>>
> >>>diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> >>>index 3ee50c4..d5ab96d 100644
> >>>--- a/hw/virtio/vhost.c
> >>>+++ b/hw/virtio/vhost.c
> >>>@@ -787,6 +787,17 @@ static int vhost_dev_set_features(struct vhost_dev 
> >>>*dev,
> >>>  static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log)
> >>>  {
> >>>  int r, i, idx;
> >>>+
> >>>+if (!dev->started) {
> >>>+/*
> >>>+ * If vhost-user daemon is used as a backend for the
> >>>+ * device and the connection is broken, then the vhost_dev
> >>>+ * structure will be reset all its values to 0.
> >>>+ * Add additional check for the device state.
> >>>+ */
> >>>+return -1;
> >>>+}
> >>>+
> >>>  r = vhost_dev_set_features(dev, enable_log);
> >>>  if (r < 0) {
> >>>  goto err_features;
> >>>@@ -801,12 +812,19 @@ static int vhost_dev_set_log(struct vhost_dev *dev, 
> >>>bool enable_log)
> >>>  }
> >>>  return 0;
> >>>  err_vq:
> >>>-for (; i >= 0; --i) {
> >>>+/*
> >>>+ * Disconnect with the vhost-user daemon can lead to the
> >>>+ * vhost_dev_cleanup() call which will clean up vhost_dev
> >>>+ * structure.
> >>>+ */
> >>>+for (; dev->started && (i >= 0); --i) {
> >>>  idx = dev->vhost_ops->vhost_get_vq_index(
> >>
> >>Why need the check of dev->started here, can started be modified outside
> >>mainloop? If yes, I don't get the check of !dev->started in the beginning of
> >>this function.
> >>
> >No dev->started can't change outside the mainloop. The main problem is
> >only for the vhost_user_blk daemon. Consider the case when we
> >successfully pass the dev->started check at the beginning of the
> >function, but after it we hit the disconnect on the next call on the
> >second or third iteration:
> >  r = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx, enable_log);
> >The unix socket backend device will call the disconnect routine for this
> >device and reset the structure. So the structure will be reset (and
> >dev->started set to false) inside this set_addr() call.
> 
> 
> I still don't get here. I think the disconnect can not happen in the middle
> of vhost_dev_set_log() since both of them were running in mainloop. And even
> if it can, we probably need other synchronization mechanism other than
> simple check here.
Disconnect isn't happened in the separate thread it is happened in this
routine inside vhost_dev_set_log. When for instance vhost_user_write()
call failed:
  vhost_user_set_log_base()
vhost_user_write()
  vhost_user_blk_disconnect()
vhost_dev_cleanup()
  vhost_user_backend_cleanup()
So the point is that if we somehow got a disconnect with the
vhost-user-blk daemon before the vhost_user_write() call then it will
continue clean up by running vhost_user_blk_disconnect() function. I
wrote a more detailed backtrace stack in the separate thread, which is
pretty similar to what we have here:
  Re: [PATCH v2 4/5] vhost: check vring address before calling unmap
The places are different but the problem is pretty similar.

So if vhost-user commands handshake then everything is fine and
reconnect will work as expected. The only problem is how to handle
reconnect properly between vhost-user command send/receive.

As i wrote we have a test:
  - run src VM with vhost-usr-blk daemon used
  - run fio inside it
  - perform reconnect every X seconds (just kill and restart daemon),
X is random
  - run dst VM
  - perform migration
  - fio should complete in dst VM
And we cycle this test like forever.
So it fails once per ~25 iteration. By adding some delays in

Re: [PATCH v2 1/5] qemu-img: remove check that cvtnum value > MAX_INT

2020-05-12 Thread Eyal Moscovici




On 07/05/2020 0:49, Eric Blake wrote:

On 5/6/20 4:34 PM, Eyal Moscovici wrote:
Following commit f46bfdbfc8f95cf65d7818ef68a801e063c40332 
(util/cutils: Change

qemu_strtosz*() from int64_t to uint64_t) which added a similar check to
cvtnum. As a result there is no need to check it separately outside 
of cvtnum.


Acked-by: Mark Kanda 
Signed-off-by: Eyal Moscovici 
---
  qemu-img.c | 6 +++---
  1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/qemu-img.c b/qemu-img.c
index 6a4327aaba..116a9c6349 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -4307,7 +4307,7 @@ static int img_bench(int argc, char **argv)
  int64_t sval;
    sval = cvtnum(optarg);
-    if (sval < 0 || sval > INT_MAX) {
+    if (sval < 0) {
  error_report("Invalid buffer size specified");


INT_MAX is smaller than cvtnum's check for INT64_MAX.  This code 
change allows larger buffer sizes, which is probably not a good idea.
I was the most hesitant about this patch because of the size difference. 
I decided to submit it because the type is int64 which pairs better with 
the MAX_INT64 check and I couldn't find a concrete reason to cap the 
variable at MAX_INT. Do you a concrete reason? Because the max size 
should rerally come into effect on very fringe cases and if you are 
asking for a really big buffer you should know the risks.



  return 1;
  }
@@ -4320,7 +4320,7 @@ static int img_bench(int argc, char **argv)
  int64_t sval;
    sval = cvtnum(optarg);
-    if (sval < 0 || sval > INT_MAX) {
+    if (sval < 0) {
  error_report("Invalid step size specified");
  return 1;
  }
@@ -4493,7 +4493,7 @@ static int img_dd_bs(const char *arg,
    res = cvtnum(arg);
  -    if (res <= 0 || res > INT_MAX) {
+    if (res <= 0) {
  error_report("invalid number: '%s'", arg);
  return 1;
  }



NACK.

Re: [PATCH 0/5] Introduce 'yank' oob qmp command to recover from hanging qemu

2020-05-12 Thread Daniel P . Berrangé

On Tue, May 12, 2020 at 11:32:06AM +0200, Lukas Straub wrote:
> On Mon, 11 May 2020 16:46:45 +0100
> "Dr. David Alan Gilbert"  wrote:
> 
> > * Daniel P. BerrangÃ© (berra...@redhat.com) wrote: 
> > > ...
> > > That way if QEMU does get stuck, you can start by tearing down the
> > > least distruptive channel. eg try tearing down the migration connection
> > > first (which shouldn't negatively impact the guest), and only if that
> > > doesn't work then, move on to tear down the NBD connection (which risks
> > > data loss)  
> > 
> > I wonder if a different way would be to make all network connections
> > register with yank, but then make yank take a list of connections to
> > shutdown(2).
> 
> Good Idea. We could name the connections (/yank callbacks) in the
> form "nbd:", "chardev:" and "migration"
> (and add "netdev:...", etc. in the future). Then make yank take a
> list of connection names as you suggest and silently ignore connections
> that don't exist. And maybe even add a 'query-yank' oob command returning
> a list of registered connections so the management application can do
> pattern matching if it wants.

Yes, that would make the yank command much more flexible in how it can
be used.

As an alternative to using formatted strings like this, it could be
modelled more explicitly in QAPI

  { 'struct':  'YankChannels',
'data': { 'chardev': [ 'string' ],
  'nbd': ['string'],
  'migration': bool } }

In this example, 'chardev' would accept a list of chardev IDs which
have it enabled, 'nbd' would accept a list of block node IDs which
have it enabled, and migration is a singleton on/off.

The benefit of this modelling is that you can introspect QEMU
to discover what classes of channels support being yanked by
this QEMU build, as well as what instances are configured to
be yanked. ie you can distinguish between a QEMU that doesn't
support yanking network devices, from a QEMU that does support
yanking network devices, but doesn't have it enabled for any
network device instances.

Regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|

Re: [PATCH v2 2/5] qemu_img: add error report to cvtnum

2020-05-12 Thread Eyal Moscovici




On 07/05/2020 0:59, Eric Blake wrote:

On 5/6/20 4:34 PM, Eyal Moscovici wrote:
All calls to cvtnum check the return value and print the same error 
message more
or less. And so error reporting moved to cvtnum to reduce duplicate 
code and

provide a single error message.

Acked-by: Mark Kanda 
Signed-off-by: Eyal Moscovici 
---
  qemu-img.c | 63 --
  tests/qemu-iotests/049.out |  4 +--
  2 files changed, 28 insertions(+), 39 deletions(-)





-    err = qemu_strtosz(s, NULL, &value);
-    if (err < 0) {
+    err = qemu_strtosz(arg_value, NULL, &value);
+    if (err < 0 && err != -ERANGE) {
+    error_report("Invalid %s specified! You may use "
+ "k, M, G, T, P or E suffixes for ", arg_name);
+    error_report("kilobytes, megabytes, gigabytes, terabytes, "
+ "petabytes and exabytes.");
 return err;
 }
-    if (value > INT64_MAX) {
+    if (err == -ERANGE || value > INT64_MAX) {
+    error_report("Invalid %s specified! Must be less than 8 EiB!",


Copied from our pre-existing errors, but why are we shouting at our 
user?  This would be a good time to s/!/./ to tone it down a bit.

Sure, will fix.



@@ -4491,10 +4488,12 @@ static int img_dd_bs(const char *arg,
  {
  int64_t res;
  -    res = cvtnum(arg);
+    res = cvtnum("bs", arg);
  -    if (res <= 0) {
-    error_report("invalid number: '%s'", arg);
+    if (res < 0) {
+    return 1;
+    } else if (res == 0) {
+    error_report("Invalid bs specified! It cannot be 0.");


Maybe it's worth two functions:

int64_t cvtnum_full(const char *name, const char *value, int64_t min, 
int64_t max)


and then a common helper:

int64_t cvtnum(const char *name, const char *value) {
    return cvtnum_full(name, value, 0, INT64_MAX);
}

many existing callers remain with cvtnum(), but callers like this 
could be cvtnum("bs", arg, 1, INT64_MAX).  You'd still have to 
special-case other restrictions, such as whether a number must a 
power-of-2, but that's fewer places.



Great idea, I will create two functions.

+++ b/tests/qemu-iotests/049.out
@@ -92,13 +92,13 @@ Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 
size=1649267441664 cluster_size=65536 l

  == 3. Invalid sizes ==
    qemu-img create -f qcow2 TEST_DIR/t.qcow2 -- -1024
-qemu-img: Image size must be less than 8 EiB!
+qemu-img: Invalid image size specified! Must be less than 8 EiB!


Nice that you checked for iotest fallout.  Is this really the only 
impacted test?



Thanks, yes.

Re: [PATCH v2 5/5] qemu-img: Add --start-offset and --max-length to map

2020-05-12 Thread Eyal Moscovici




On 07/05/2020 1:04, Eric Blake wrote:

On 5/6/20 4:34 PM, Eyal Moscovici wrote:

The mapping operation of large disks especially ones stored over a
long chain of QCOW2 files can take a long time to finish.
Additionally when mapping fails there was no way recover by
restarting the mapping from the failed location.

The new options, --start-offset and --max-length allows the user to
divide these type of map operations into shorter independent tasks.

Reviewed-by: Eric Blake 


This patch has some changes from v1.  Among others,...


@@ -3041,6 +3045,18 @@ static int img_map(int argc, char **argv)
  case OPTION_OUTPUT:
  output = optarg;
  break;
+    case 's':
+    start_offset = cvtnum("start offset", optarg);
+    if (start_offset < 0) {
+    return 1;
+    }
+    break;


the new semantics of cvtnum() in this series is enough of a difference 
that I would have removed R-b to make sure the updated patch gets 
re-reviewed, if it had been me as author.  But in this case, it does 
look like the changes are all addressed to comments I suggested in v1, 
so I'm fine that you left my R-b.



Ok, got it.

Re: [PATCH v3 7/9] qcow2: Expose bitmaps' size during measure

2020-05-12 Thread Nir Soffer

On Fri, May 8, 2020 at 9:03 PM Eric Blake  wrote:
>
> It's useful to know how much space can be occupied by qcow2 persistent
> bitmaps, even though such metadata is unrelated to the guest-visible
> data.  Report this value as an additional field, present when
> measuring an existing image and the output format supports bitmaps.
> Update iotest 178 and 190 to updated output, as well as new coverage
> in 190 demonstrating non-zero values made possible with the
> recently-added qemu-img bitmap command.
>
> The addition of a new field demonstrates why we should always
> zero-initialize qapi C structs; while the qcow2 driver still fully
> populates all fields, the raw and crypto drivers had to be tweaked to
> avoid uninitialized data.
>
> See also: https://bugzilla.redhat.com/1779904
>
> Reported-by: Nir Soffer 
> Signed-off-by: Eric Blake 
> ---
>  qapi/block-core.json | 15 +++
>  block/crypto.c   |  2 +-
>  block/qcow2.c| 37 ---
>  block/raw-format.c   |  2 +-
>  qemu-img.c   |  3 +++
>  tests/qemu-iotests/178.out.qcow2 | 16 
>  tests/qemu-iotests/190   | 43 ++--
>  tests/qemu-iotests/190.out   | 23 -
>  8 files changed, 128 insertions(+), 13 deletions(-)
>
> diff --git a/qapi/block-core.json b/qapi/block-core.json
> index 943df1926a91..9a7a388c7ad3 100644
> --- a/qapi/block-core.json
> +++ b/qapi/block-core.json
> @@ -633,18 +633,23 @@
>  # efficiently so file size may be smaller than virtual disk size.
>  #
>  # The values are upper bounds that are guaranteed to fit the new image file.
> -# Subsequent modification, such as internal snapshot or bitmap creation, may
> -# require additional space and is not covered here.
> +# Subsequent modification, such as internal snapshot or further bitmap
> +# creation, may require additional space and is not covered here.
>  #
> -# @required: Size required for a new image file, in bytes.
> +# @required: Size required for a new image file, in bytes, when copying just
> +#guest-visible contents.
>  #
>  # @fully-allocated: Image file size, in bytes, once data has been written
> -#   to all sectors.
> +#   to all sectors, when copying just guest-visible contents.

This does not break old code since previously we always reported only
guest visible content
here, but it changes the semantics, and now you cannot allocate
"required" size, you need
to allocate "required" size with "bitmaps" size. If we add a new
extension all users will have to
change the calculation again.

I think keeping the semantics that "required" and "fully-allocated"
are the size you need based
on the parameters of the command is easier to use and more consistent.
Current the measure
command contract is that invoking it with similar parameters as used
in convert will give
the right measurement needed for the convert command.

> +#
> +# @bitmaps: Additional size required for bitmap metadata in a source image,
> +#   if that bitmap metadata can be copied in addition to guest
> +#   contents. (since 5.1)

Reporting bitmaps without specifying that bitmaps are needed is less consistent
with "qemu-img convert", but has the advantage that we don't need to
check if measure
supports bitmaps. But this will work only if new versions always
report "bitmaps" even when
the value is zero.

With the current way, to measure an image we need to do:

qemu-img info --output json ...
check if image contains bitmaps
qemu-img measure --output json ...
check if bitmaps are reported.

If image has bitmaps and bitmaps are not reported, we know that we have an old
version of qemu-img that does not know how to measure bitmaps.

If bitmaps are reported in both commands we will use the value when creating
block devices.

If we always report bitmaps even when they are zero, we don't need to
run "qemu-img info".

But  my preferred interface is:

   qemu-img measure --bitmaps ...

And report bitmaps only if the user asked to get the value. In this
case the required and
fully-allocated values will include bitmaps.

To learn if qemu-img measure understand bitmaps we can check --help
output, like we did
in the past until we can require the version on all platforms.

An advantage is not having to change old tests.

Nir

>  #
>  # Since: 2.10
>  ##
>  { 'struct': 'BlockMeasureInfo',
> -  'data': {'required': 'int', 'fully-allocated': 'int'} }
> +  'data': {'required': 'int', 'fully-allocated': 'int', '*bitmaps': 'int'} }
>
>  ##
>  # @query-block:
> diff --git a/block/crypto.c b/block/crypto.c
> index 6b21d6bf6c01..eadbcb248563 100644
> --- a/block/crypto.c
> +++ b/block/crypto.c
> @@ -552,7 +552,7 @@ static BlockMeasureInfo *block_crypto_measure(QemuOpts 
> *opts,
>   * Unallocated blocks are still encrypted so allocation status makes no
>   * difference to the file size.
>   */
> -info = g

[PATCH v4 0/6] scripts: More Python fixes

2020-05-12 Thread Philippe Mathieu-Daudé

Trivial Python3 fixes, again...

Since v3:
- Fixed missing scripts/qemugdb/timers.py (kwolf)
- Cover more scripts
- Check for __main__ in few scripts

Since v2:
- Remove patch updating MAINTAINERS

Since v1:
- Added Alex Bennée A-b tags
- Addressed John Snow review comments
  - Use /usr/bin/env
  - Do not modify os.path (dropped last patch)

Philippe Mathieu-Daudé (6):
  scripts/qemugdb: Remove shebang header
  scripts/qemu-gdb: Use Python 3 interpreter
  scripts/qmp: Use Python 3 interpreter
  scripts/kvm/vmxcap: Use Python 3 interpreter and add pseudo-main()
  scripts/modules/module_block: Use Python 3 interpreter & add
pseudo-main
  tests/migration/guestperf: Use Python 3 interpreter

 scripts/kvm/vmxcap |  7 ---
 scripts/modules/module_block.py| 31 +++---
 scripts/qemu-gdb.py|  4 ++--
 scripts/qemugdb/__init__.py|  3 +--
 scripts/qemugdb/aio.py |  3 +--
 scripts/qemugdb/coroutine.py   |  3 +--
 scripts/qemugdb/mtree.py   |  4 +---
 scripts/qemugdb/tcg.py |  1 -
 scripts/qemugdb/timers.py  |  1 -
 scripts/qmp/qom-get|  2 +-
 scripts/qmp/qom-list   |  2 +-
 scripts/qmp/qom-set|  2 +-
 scripts/qmp/qom-tree   |  2 +-
 tests/migration/guestperf-batch.py |  2 +-
 tests/migration/guestperf-plot.py  |  2 +-
 tests/migration/guestperf.py   |  2 +-
 16 files changed, 33 insertions(+), 38 deletions(-)

-- 
2.21.3

[PATCH v4 3/6] scripts/qmp: Use Python 3 interpreter

2020-05-12 Thread Philippe Mathieu-Daudé

From: Philippe Mathieu-Daudé 

Signed-off-by: Philippe Mathieu-Daudé 
---
 scripts/qmp/qom-get  | 2 +-
 scripts/qmp/qom-list | 2 +-
 scripts/qmp/qom-set  | 2 +-
 scripts/qmp/qom-tree | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/scripts/qmp/qom-get b/scripts/qmp/qom-get
index 007b4cd442..7c5ede91bb 100755
--- a/scripts/qmp/qom-get
+++ b/scripts/qmp/qom-get
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python3
 ##
 # QEMU Object Model test tools
 #
diff --git a/scripts/qmp/qom-list b/scripts/qmp/qom-list
index 03bda3446b..bb68fd65d4 100755
--- a/scripts/qmp/qom-list
+++ b/scripts/qmp/qom-list
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python3
 ##
 # QEMU Object Model test tools
 #
diff --git a/scripts/qmp/qom-set b/scripts/qmp/qom-set
index c37fe78b00..19881d85e9 100755
--- a/scripts/qmp/qom-set
+++ b/scripts/qmp/qom-set
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python3
 ##
 # QEMU Object Model test tools
 #
diff --git a/scripts/qmp/qom-tree b/scripts/qmp/qom-tree
index 1c8acf61e7..fa91147a03 100755
--- a/scripts/qmp/qom-tree
+++ b/scripts/qmp/qom-tree
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python3
 ##
 # QEMU Object Model test tools
 #
-- 
2.21.3

[PATCH v4 4/6] scripts/kvm/vmxcap: Use Python 3 interpreter and add pseudo-main()

2020-05-12 Thread Philippe Mathieu-Daudé

Signed-off-by: Philippe Mathieu-Daudé 
---
 scripts/kvm/vmxcap | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/scripts/kvm/vmxcap b/scripts/kvm/vmxcap
index 971ed0e721..6fe66d5f57 100755
--- a/scripts/kvm/vmxcap
+++ b/scripts/kvm/vmxcap
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python3
 #
 # tool for querying VMX capabilities
 #
@@ -275,5 +275,6 @@ controls = [
 ),
 ]
 
-for c in controls:
-c.show()
+if __name__ == '__main__':
+for c in controls:
+c.show()
-- 
2.21.3

[PATCH v4 5/6] scripts/modules/module_block: Use Python 3 interpreter & add pseudo-main

2020-05-12 Thread Philippe Mathieu-Daudé

Signed-off-by: Philippe Mathieu-Daudé 
---
 scripts/modules/module_block.py | 31 ---
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/scripts/modules/module_block.py b/scripts/modules/module_block.py
index f23191fac1..2e7021b952 100644
--- a/scripts/modules/module_block.py
+++ b/scripts/modules/module_block.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python3
 #
 # Module information generator
 #
@@ -10,7 +10,6 @@
 # This work is licensed under the terms of the GNU GPL, version 2.
 # See the COPYING file in the top-level directory.
 
-import sys
 import os
 
 def get_string_struct(line):
@@ -80,19 +79,21 @@ def print_bottom(fheader):
 #endif
 ''')
 
-# First argument: output file
-# All other arguments: modules source files (.c)
-output_file = sys.argv[1]
-with open(output_file, 'w') as fheader:
-print_top(fheader)
+if __name__ == '__main__':
+import sys
+# First argument: output file
+# All other arguments: modules source files (.c)
+output_file = sys.argv[1]
+with open(output_file, 'w') as fheader:
+print_top(fheader)
 
-for filename in sys.argv[2:]:
-if os.path.isfile(filename):
-process_file(fheader, filename)
-else:
-print("File " + filename + " does not exist.", file=sys.stderr)
-sys.exit(1)
+for filename in sys.argv[2:]:
+if os.path.isfile(filename):
+process_file(fheader, filename)
+else:
+print("File " + filename + " does not exist.", file=sys.stderr)
+sys.exit(1)
 
-print_bottom(fheader)
+print_bottom(fheader)
 
-sys.exit(0)
+sys.exit(0)
-- 
2.21.3

[PATCH v4 2/6] scripts/qemu-gdb: Use Python 3 interpreter

2020-05-12 Thread Philippe Mathieu-Daudé

From: Philippe Mathieu-Daudé 

Signed-off-by: Philippe Mathieu-Daudé 
---
 scripts/qemu-gdb.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/qemu-gdb.py b/scripts/qemu-gdb.py
index f2a305c42e..e0bfa7b5a4 100644
--- a/scripts/qemu-gdb.py
+++ b/scripts/qemu-gdb.py
@@ -1,5 +1,5 @@
-#!/usr/bin/python
-
+#!/usr/bin/env python3
+#
 # GDB debugging support
 #
 # Copyright 2012 Red Hat, Inc. and/or its affiliates
-- 
2.21.3

[PATCH v4 1/6] scripts/qemugdb: Remove shebang header

2020-05-12 Thread Philippe Mathieu-Daudé

From: Philippe Mathieu-Daudé 

These scripts are loaded as plugin by GDB (and they don't
have any __main__ entry point). Remove the shebang header.

Acked-by: Alex Bennée 
Signed-off-by: Philippe Mathieu-Daudé 
---
 scripts/qemugdb/__init__.py  | 3 +--
 scripts/qemugdb/aio.py   | 3 +--
 scripts/qemugdb/coroutine.py | 3 +--
 scripts/qemugdb/mtree.py | 4 +---
 scripts/qemugdb/tcg.py   | 1 -
 scripts/qemugdb/timers.py| 1 -
 6 files changed, 4 insertions(+), 11 deletions(-)

diff --git a/scripts/qemugdb/__init__.py b/scripts/qemugdb/__init__.py
index 969f552b26..da8ff612e5 100644
--- a/scripts/qemugdb/__init__.py
+++ b/scripts/qemugdb/__init__.py
@@ -1,5 +1,4 @@
-#!/usr/bin/python
-
+#
 # GDB debugging support
 #
 # Copyright (c) 2015 Linaro Ltd
diff --git a/scripts/qemugdb/aio.py b/scripts/qemugdb/aio.py
index 2ba00c..d7c1ba0c28 100644
--- a/scripts/qemugdb/aio.py
+++ b/scripts/qemugdb/aio.py
@@ -1,5 +1,4 @@
-#!/usr/bin/python
-
+#
 # GDB debugging support: aio/iohandler debug
 #
 # Copyright (c) 2015 Red Hat, Inc.
diff --git a/scripts/qemugdb/coroutine.py b/scripts/qemugdb/coroutine.py
index 41e079d0e2..db61389022 100644
--- a/scripts/qemugdb/coroutine.py
+++ b/scripts/qemugdb/coroutine.py
@@ -1,5 +1,4 @@
-#!/usr/bin/python
-
+#
 # GDB debugging support
 #
 # Copyright 2012 Red Hat, Inc. and/or its affiliates
diff --git a/scripts/qemugdb/mtree.py b/scripts/qemugdb/mtree.py
index 3030a60d3f..8fe42c3c12 100644
--- a/scripts/qemugdb/mtree.py
+++ b/scripts/qemugdb/mtree.py
@@ -1,5 +1,4 @@
-#!/usr/bin/python
-
+#
 # GDB debugging support
 #
 # Copyright 2012 Red Hat, Inc. and/or its affiliates
@@ -84,4 +83,3 @@ def print_item(self, ptr, offset = gdb.Value(0), level = 0):
 while not isnull(subregion):
 self.print_item(subregion, addr, level)
 subregion = subregion['subregions_link']['tqe_next']
-
diff --git a/scripts/qemugdb/tcg.py b/scripts/qemugdb/tcg.py
index 18880fc9a7..16c03c06a9 100644
--- a/scripts/qemugdb/tcg.py
+++ b/scripts/qemugdb/tcg.py
@@ -1,4 +1,3 @@
-#!/usr/bin/python
 # -*- coding: utf-8 -*-
 #
 # GDB debugging support, TCG status
diff --git a/scripts/qemugdb/timers.py b/scripts/qemugdb/timers.py
index f0e132d27a..46537b27cf 100644
--- a/scripts/qemugdb/timers.py
+++ b/scripts/qemugdb/timers.py
@@ -1,4 +1,3 @@
-#!/usr/bin/python
 # -*- coding: utf-8 -*-
 # GDB debugging support
 #
-- 
2.21.3

[PATCH v4 6/6] tests/migration/guestperf: Use Python 3 interpreter

2020-05-12 Thread Philippe Mathieu-Daudé

Signed-off-by: Philippe Mathieu-Daudé 
---
 tests/migration/guestperf-batch.py | 2 +-
 tests/migration/guestperf-plot.py  | 2 +-
 tests/migration/guestperf.py   | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/migration/guestperf-batch.py 
b/tests/migration/guestperf-batch.py
index cb150ce804..f1e900908d 100755
--- a/tests/migration/guestperf-batch.py
+++ b/tests/migration/guestperf-batch.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python3
 #
 # Migration test batch comparison invokation
 #
diff --git a/tests/migration/guestperf-plot.py 
b/tests/migration/guestperf-plot.py
index d70bb7a557..907151011a 100755
--- a/tests/migration/guestperf-plot.py
+++ b/tests/migration/guestperf-plot.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python3
 #
 # Migration test graph plotting command
 #
diff --git a/tests/migration/guestperf.py b/tests/migration/guestperf.py
index 99b027e8ba..ba1c4bc4ca 100755
--- a/tests/migration/guestperf.py
+++ b/tests/migration/guestperf.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python3
 #
 # Migration test direct invokation command
 #
-- 
2.21.3

Re: [PATCH v3 1/2] scripts/qemugdb: Remove shebang header

2020-05-12 Thread Philippe Mathieu-Daudé


On 5/12/20 10:55 AM, Kevin Wolf wrote:

Am 12.05.2020 um 09:06 hat Philippe Mathieu-DaudÃ© geschrieben:

These scripts are loaded as plugin by GDB (and they don't
have any __main__ entry point). Remove the shebang header.

Acked-by: Alex BennÃ©e 
Signed-off-by: Philippe Mathieu-DaudÃ© 
---
  scripts/qemugdb/__init__.py  | 3 +--
  scripts/qemugdb/aio.py   | 3 +--
  scripts/qemugdb/coroutine.py | 3 +--
  scripts/qemugdb/mtree.py | 4 +---
  scripts/qemugdb/tcg.py   | 1 -


There is still a shebang line left in scripts/qemugdb/timers.py.


Oops, thanks for catching this.



Kevin

Re: [RFC] bdrv_flush: only use fast path when in owned AioContext

2020-05-12 Thread Kevin Wolf

Am 11.05.2020 um 18:50 hat Stefan Reiter geschrieben:
> Just because we're in a coroutine doesn't imply ownership of the context
> of the flushed drive. In such a case use the slow path which explicitly
> enters bdrv_flush_co_entry in the correct AioContext.
> 
> Signed-off-by: Stefan Reiter 
> ---
> 
> We've experienced some lockups in this codepath when taking snapshots of VMs
> with drives that have IO-Threads enabled (we have an async 'savevm'
> implementation running from a coroutine).
> 
> Currently no reproducer for upstream versions I could find, but in testing 
> this
> patch fixes all issues we're seeing and I think the logic checks out.
> 
> The fast path pattern is repeated a few times in this file, so if this change
> makes sense, it's probably worth evaluating the other occurences as well.

What do you mean by "owning" the context? If it's about taking the
AioContext lock, isn't the problem more with calling bdrv_flush() from
code that doesn't take the locks?

Though I think we have some code that doesn't only rely on holding the
AioContext locks, but that actually depends on running in the right
thread, so the change looks right anyway.

Kevin

>  block/io.c | 5 +++--
>  1 file changed, 3 insertions(+), 2 deletions(-)
> 
> diff --git a/block/io.c b/block/io.c
> index aba67f66b9..ee7310fa13 100644
> --- a/block/io.c
> +++ b/block/io.c
> @@ -2895,8 +2895,9 @@ int bdrv_flush(BlockDriverState *bs)
>  .ret = NOT_DONE,
>  };
>  
> -if (qemu_in_coroutine()) {
> -/* Fast-path if already in coroutine context */
> +if (qemu_in_coroutine() &&
> +bdrv_get_aio_context(bs) == qemu_get_current_aio_context()) {
> +/* Fast-path if already in coroutine and we own the drive's context 
> */
>  bdrv_flush_co_entry(&flush_co);
>  } else {
>  co = qemu_coroutine_create(bdrv_flush_co_entry, &flush_co);
> -- 
> 2.20.1
> 
>

Re: [PATCH v3 7/9] qcow2: Expose bitmaps' size during measure

2020-05-12 Thread Max Reitz

On 12.05.20 12:17, Nir Soffer wrote:
> On Fri, May 8, 2020 at 9:03 PM Eric Blake  wrote:
>>
>> It's useful to know how much space can be occupied by qcow2 persistent
>> bitmaps, even though such metadata is unrelated to the guest-visible
>> data.  Report this value as an additional field, present when
>> measuring an existing image and the output format supports bitmaps.
>> Update iotest 178 and 190 to updated output, as well as new coverage
>> in 190 demonstrating non-zero values made possible with the
>> recently-added qemu-img bitmap command.
>>
>> The addition of a new field demonstrates why we should always
>> zero-initialize qapi C structs; while the qcow2 driver still fully
>> populates all fields, the raw and crypto drivers had to be tweaked to
>> avoid uninitialized data.
>>
>> See also: https://bugzilla.redhat.com/1779904
>>
>> Reported-by: Nir Soffer 
>> Signed-off-by: Eric Blake 
>> ---
>>  qapi/block-core.json | 15 +++
>>  block/crypto.c   |  2 +-
>>  block/qcow2.c| 37 ---
>>  block/raw-format.c   |  2 +-
>>  qemu-img.c   |  3 +++
>>  tests/qemu-iotests/178.out.qcow2 | 16 
>>  tests/qemu-iotests/190   | 43 ++--
>>  tests/qemu-iotests/190.out   | 23 -
>>  8 files changed, 128 insertions(+), 13 deletions(-)
>>
>> diff --git a/qapi/block-core.json b/qapi/block-core.json
>> index 943df1926a91..9a7a388c7ad3 100644
>> --- a/qapi/block-core.json
>> +++ b/qapi/block-core.json
>> @@ -633,18 +633,23 @@
>>  # efficiently so file size may be smaller than virtual disk size.
>>  #
>>  # The values are upper bounds that are guaranteed to fit the new image file.
>> -# Subsequent modification, such as internal snapshot or bitmap creation, may
>> -# require additional space and is not covered here.
>> +# Subsequent modification, such as internal snapshot or further bitmap
>> +# creation, may require additional space and is not covered here.
>>  #
>> -# @required: Size required for a new image file, in bytes.
>> +# @required: Size required for a new image file, in bytes, when copying just
>> +#guest-visible contents.
>>  #
>>  # @fully-allocated: Image file size, in bytes, once data has been written
>> -#   to all sectors.
>> +#   to all sectors, when copying just guest-visible 
>> contents.
> 
> This does not break old code since previously we always reported only
> guest visible content
> here, but it changes the semantics, and now you cannot allocate
> "required" size, you need
> to allocate "required" size with "bitmaps" size.

Only if you copy the bitmaps, though, which requires using the --bitmaps
switch for convert.

> If we add a new
> extension all users will have to
> change the calculation again.

It was my impression that existing users won’t have to do that, because
they don’t use --bitmaps yet.

In contrast, if we included the bitmap size in @required or
@fully-allocated, then previous users that didn’t copy bitmaps to the
destination (which they couldn’t) would allocate too much space.

...revisiting this after reading more of your mail: With a --bitmaps
switch, existing users wouldn’t suffer from such compatibility problems.
 However, then users (that now want to copy bitmaps) will have to pass
the new --bitmaps flag in the command line, and I don’t see how that’s
less complicated than just adding @bitmaps to @required.

> I think keeping the semantics that "required" and "fully-allocated"
> are the size you need based
> on the parameters of the command is easier to use and more consistent.
> Current the measure
> command contract is that invoking it with similar parameters as used
> in convert will give
> the right measurement needed for the convert command.
> 
>> +#
>> +# @bitmaps: Additional size required for bitmap metadata in a source image,
>> +#   if that bitmap metadata can be copied in addition to guest
>> +#   contents. (since 5.1)
> 
> Reporting bitmaps without specifying that bitmaps are needed is less 
> consistent
> with "qemu-img convert", but has the advantage that we don't need to
> check if measure
> supports bitmaps. But this will work only if new versions always
> report "bitmaps" even when
> the value is zero.
> 
> With the current way, to measure an image we need to do:
> 
> qemu-img info --output json ...
> check if image contains bitmaps
> qemu-img measure --output json ...
> check if bitmaps are reported.
> 
> If image has bitmaps and bitmaps are not reported, we know that we have an old
> version of qemu-img that does not know how to measure bitmaps.

Well, but you’ll also see that convert --bitmaps will fail.  But I can
see that you probably want to know about that before you create the
target image.

> If bitmaps are reported in both commands we will use the value when creating
> block devices.

And otherwise?  Do you error out, or d

Re: [PATCH v5 13/15] acpi: drop build_piix4_pm()

2020-05-12 Thread Gerd Hoffmann

On Mon, May 11, 2020 at 09:37:32PM +0200, Igor Mammedov wrote:
> On Thu,  7 May 2020 15:16:38 +0200
> Gerd Hoffmann  wrote:
> 
> > The _SB.PCI0.PX13.P13C opregion (holds isa device enable bits)
> > is not used any more, remove it from DSDT.
> > 
> > Signed-off-by: Gerd Hoffmann 
> > ---
> >  hw/i386/acpi-build.c | 16 
> >  1 file changed, 16 deletions(-)
> > 
> > diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
> > index 765409a90eb6..c1e63cce5e8e 100644
> > --- a/hw/i386/acpi-build.c
> > +++ b/hw/i386/acpi-build.c
> > @@ -1441,21 +1441,6 @@ static void build_q35_isa_bridge(Aml *table)
> >  aml_append(table, scope);
> >  }
> >  
> > -static void build_piix4_pm(Aml *table)
> > -{
> > -Aml *dev;
> > -Aml *scope;
> > -
> > -scope =  aml_scope("_SB.PCI0");
> > -dev = aml_device("PX13");
> > -aml_append(dev, aml_name_decl("_ADR", aml_int(0x00010003)));
> I agree about removing P13C but I'm not sure if it's safe to remove
> whole isa bridge

The isa bridge is _SB0.PCI0.ISA (piix4 function 0).
_SB.PCI0.PX13 is the acpi pm device (piix4 function 3).

So this does _not_ remove the isa bridge.  And given that PX13 has
nothing declared beside the opregion we are removing and I have not
found any other references to the PX13 device I assumed it is ok to drop
that one too.  Didn't notice any odd side effects in testing.

take care,
  Gerd

Re: [PATCH v4 0/6] scripts: More Python fixes

2020-05-12 Thread Kevin Wolf

Am 12.05.2020 um 12:32 hat Philippe Mathieu-Daudé geschrieben:
> Trivial Python3 fixes, again...
> 
> Since v3:
> - Fixed missing scripts/qemugdb/timers.py (kwolf)
> - Cover more scripts
> - Check for __main__ in few scripts

I'm not sure if the __main__ check actually provides anything useful in
source files of standalone tools that aren't supposed to be imported
from somewhere else. But of course, it's not wrong either.

Reviewed-by: Kevin Wolf

Re: [RFC] bdrv_flush: only use fast path when in owned AioContext

2020-05-12 Thread Kevin Wolf

Am 12.05.2020 um 12:57 hat Kevin Wolf geschrieben:
> Am 11.05.2020 um 18:50 hat Stefan Reiter geschrieben:
> > Just because we're in a coroutine doesn't imply ownership of the context
> > of the flushed drive. In such a case use the slow path which explicitly
> > enters bdrv_flush_co_entry in the correct AioContext.
> > 
> > Signed-off-by: Stefan Reiter 
> > ---
> > 
> > We've experienced some lockups in this codepath when taking snapshots of VMs
> > with drives that have IO-Threads enabled (we have an async 'savevm'
> > implementation running from a coroutine).
> > 
> > Currently no reproducer for upstream versions I could find, but in testing 
> > this
> > patch fixes all issues we're seeing and I think the logic checks out.
> > 
> > The fast path pattern is repeated a few times in this file, so if this 
> > change
> > makes sense, it's probably worth evaluating the other occurences as well.
> 
> What do you mean by "owning" the context? If it's about taking the
> AioContext lock, isn't the problem more with calling bdrv_flush() from
> code that doesn't take the locks?
> 
> Though I think we have some code that doesn't only rely on holding the
> AioContext locks, but that actually depends on running in the right
> thread, so the change looks right anyway.

Well, the idea is right, but the change itself isn't, of course. If
we're already in coroutine context, we must not busy wait with
BDRV_POLL_WHILE(). I'll see if I can put something together after lunch.

Kevin

Re: [PATCH v4 4/6] scripts/kvm/vmxcap: Use Python 3 interpreter and add pseudo-main()

2020-05-12 Thread Paolo Bonzini

On 12/05/20 12:32, Philippe Mathieu-Daudé wrote:
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  scripts/kvm/vmxcap | 7 ---
>  1 file changed, 4 insertions(+), 3 deletions(-)
> 
> diff --git a/scripts/kvm/vmxcap b/scripts/kvm/vmxcap
> index 971ed0e721..6fe66d5f57 100755
> --- a/scripts/kvm/vmxcap
> +++ b/scripts/kvm/vmxcap
> @@ -1,4 +1,4 @@
> -#!/usr/bin/python
> +#!/usr/bin/env python3
>  #
>  # tool for querying VMX capabilities
>  #
> @@ -275,5 +275,6 @@ controls = [
>  ),
>  ]
>  
> -for c in controls:
> -c.show()
> +if __name__ == '__main__':
> +for c in controls:
> +c.show()
> 

Acked-by: Paolo Bonzini

Re: [RFC] bdrv_flush: only use fast path when in owned AioContext

2020-05-12 Thread Stefan Reiter


On 5/12/20 1:32 PM, Kevin Wolf wrote:

Am 12.05.2020 um 12:57 hat Kevin Wolf geschrieben:

Am 11.05.2020 um 18:50 hat Stefan Reiter geschrieben:

Just because we're in a coroutine doesn't imply ownership of the context
of the flushed drive. In such a case use the slow path which explicitly
enters bdrv_flush_co_entry in the correct AioContext.

Signed-off-by: Stefan Reiter 
---

We've experienced some lockups in this codepath when taking snapshots of VMs
with drives that have IO-Threads enabled (we have an async 'savevm'
implementation running from a coroutine).

Currently no reproducer for upstream versions I could find, but in testing this
patch fixes all issues we're seeing and I think the logic checks out.

The fast path pattern is repeated a few times in this file, so if this change
makes sense, it's probably worth evaluating the other occurences as well.


What do you mean by "owning" the context? If it's about taking the
AioContext lock, isn't the problem more with calling bdrv_flush() from
code that doesn't take the locks?

Though I think we have some code that doesn't only rely on holding the
AioContext locks, but that actually depends on running in the right
thread, so the change looks right anyway.


"Owning" as in it only works (doesn't hang) when bdrv_flush_co_entry 
runs on the same AioContext that the BlockDriverState it's flushing 
belongs to.


We hold the locks for all AioContexts we want to flush in our code (in 
this case called from do_vm_stop/bdrv_flush_all so we're even in a 
drained section).




Well, the idea is right, but the change itself isn't, of course. If
we're already in coroutine context, we must not busy wait with
BDRV_POLL_WHILE(). I'll see if I can put something together after lunch.

Kevin




Thanks for taking a look!

Re: [PATCH] qemu-nbd: Close inherited stderr

2020-05-12 Thread Eric Blake


On 5/12/20 3:56 AM, Raphael Pour wrote:

Hello,

after e6df58a5, the inherited stderr 'old_stderr' won't get closed
anymore if 'fork_process' is false. This causes other processes relying
on EOF to infinitely block or crash.


From 47ab9b517038d13117876a8bb3ef45c53d7f2f9e Mon Sep 17 00:00:00 2001

From: "Raphael Pour" 
Date: Tue, 12 May 2020 10:18:44 +0200
Subject: [PATCH] qemu-nbd: Close inherited stderr

Close inherited stderr of the parent if fork_process is false.
Otherwise no one will close it. (introduced by e6df58a5)

Signed-off-by: Raphael Pour 
---
  qemu-nbd.c | 9 -
  1 file changed, 8 insertions(+), 1 deletion(-)



Wouldn't it just be simpler to not dup in the first place?

diff --git i/qemu-nbd.c w/qemu-nbd.c
index 4aa005004ebd..6ba2544feb3a 100644
--- i/qemu-nbd.c
+++ w/qemu-nbd.c
@@ -916,7 +916,9 @@ int main(int argc, char **argv)
 } else if (pid == 0) {
 close(stderr_fd[0]);

-old_stderr = dup(STDERR_FILENO);
+if (fork_process) {
+old_stderr = dup(STDERR_FILENO);
+}
 ret = qemu_daemon(1, 0);

 /* Temporarily redirect stderr to the parent's pipe...  */


--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org

Re: [PATCH v2 1/5] qemu-img: remove check that cvtnum value > MAX_INT

2020-05-12 Thread Eric Blake


On 5/12/20 4:39 AM, Eyal Moscovici wrote:


+++ b/qemu-img.c
@@ -4307,7 +4307,7 @@ static int img_bench(int argc, char **argv)
Â Â Â Â Â Â Â Â Â Â Â Â Â  int64_t sval;
Â  Â Â Â Â Â Â Â Â Â Â Â Â Â  sval = cvtnum(optarg);
-Â Â Â Â Â Â Â Â Â Â Â  if (sval < 0 || sval > INT_MAX) {
+Â Â Â Â Â Â Â Â Â Â Â  if (sval < 0) {
Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â  error_report("Invalid buffer size 
specified");


INT_MAX is smaller than cvtnum's check for INT64_MAX.Â  This code 
change allows larger buffer sizes, which is probably not a good idea.
I was the most hesitant about this patch because of the size difference. 
I decided to submit it because the type is int64 which pairs better with 
the MAX_INT64 check and I couldn't find a concrete reason to cap the 
variable at MAX_INT. Do you a concrete reason? Because the max size 
should rerally come into effect on very fringe cases and if you are 
asking for a really big buffer you should know the risks.


The commit message does not call out a change in behavior.  If you are 
sure that you want a change in behavior, then justify that: mention that 
your patch specifically changes the behavior to allow larger buffers, 
and why that is okay.  Otherwise, it looks like your patch is accidental 
in its behavior change, which costs reviewer time.


In this particular case, I see no reason to allow larger buffers.  That 
is, the existing limits made sense (benchmarking anything larger than 
the maximum qcow2 cluster size of 2M is unlikely to produce useful 
results, let alone something as large as 2G, so allowing > 2G is not 
helping the user).  So even if your commit message did a good job of 
explaining that the change was intentional, it is a tough sell why we 
need it.  If all your commit is intended to do is refactoring, then it 
should not be changing behavior.



--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org

[RFC PATCH 0/3] block: Synchronous bdrv_*() from coroutine in different AioContext

2020-05-12 Thread Kevin Wolf

Stefan (Reiter), after looking a bit closer at this, I think there is no
bug in QEMU, but the bug is in your coroutine code that calls block
layer functions without moving into the right AioContext first. I've
written this series anyway as it potentially makes the life of callers
easier and would probably make your buggy code correct.

However, it doesn't feel right to commit something like patch 2 without
having a user for it. Is there a reason why you can't upstream your
async snapshot code?

The series would also happen fix a bug in my recent patch to convert
qmp_block_resize() to coroutines, but I feel it's not how I would
naturally fix it. Switching the thread already in the QMP handler before
calling bdrv_truncate() would feel more appropriate. I wonder if it
wouldn't actually be the same for your snapshot code.

Kevin Wolf (3):
  block: Factor out bdrv_run_co()
  block: Allow bdrv_run_co() from different AioContext
  block: Assert we're running in the right thread

 block/io.c | 122 -
 1 file changed, 45 insertions(+), 77 deletions(-)

-- 
2.25.3

[RFC PATCH 1/3] block: Factor out bdrv_run_co()

2020-05-12 Thread Kevin Wolf

We have a few bdrv_*() functions that can either spawn a new coroutine
and wait for it with BDRV_POLL_WHILE() or use a fastpath if they are
alreeady running in a coroutine. All of them duplicate basically the
same code.

Factor the common code into a new function bdrv_run_co().

Signed-off-by: Kevin Wolf 
---
 block/io.c | 104 +++--
 1 file changed, 28 insertions(+), 76 deletions(-)

diff --git a/block/io.c b/block/io.c
index 7d30e61edc..c1badaadc9 100644
--- a/block/io.c
+++ b/block/io.c
@@ -891,6 +891,22 @@ static int bdrv_check_byte_request(BlockDriverState *bs, 
int64_t offset,
 return 0;
 }
 
+static int bdrv_run_co(BlockDriverState *bs, CoroutineEntry *entry,
+   void *opaque, int *ret)
+{
+if (qemu_in_coroutine()) {
+/* Fast-path if already in coroutine context */
+entry(opaque);
+} else {
+Coroutine *co = qemu_coroutine_create(entry, opaque);
+*ret = NOT_DONE;
+bdrv_coroutine_enter(bs, co);
+BDRV_POLL_WHILE(bs, *ret == NOT_DONE);
+}
+
+return *ret;
+}
+
 typedef struct RwCo {
 BdrvChild *child;
 int64_t offset;
@@ -923,25 +939,15 @@ static int bdrv_prwv_co(BdrvChild *child, int64_t offset,
 QEMUIOVector *qiov, bool is_write,
 BdrvRequestFlags flags)
 {
-Coroutine *co;
 RwCo rwco = {
 .child = child,
 .offset = offset,
 .qiov = qiov,
 .is_write = is_write,
-.ret = NOT_DONE,
 .flags = flags,
 };
 
-if (qemu_in_coroutine()) {
-/* Fast-path if already in coroutine context */
-bdrv_rw_co_entry(&rwco);
-} else {
-co = qemu_coroutine_create(bdrv_rw_co_entry, &rwco);
-bdrv_coroutine_enter(child->bs, co);
-BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE);
-}
-return rwco.ret;
+return bdrv_run_co(child->bs, bdrv_rw_co_entry, &rwco, &rwco.ret);
 }
 
 int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,
@@ -2230,7 +2236,6 @@ typedef struct BdrvCoBlockStatusData {
 int64_t *map;
 BlockDriverState **file;
 int ret;
-bool done;
 } BdrvCoBlockStatusData;
 
 int coroutine_fn bdrv_co_block_status_from_file(BlockDriverState *bs,
@@ -2492,7 +2497,6 @@ static void coroutine_fn 
bdrv_block_status_above_co_entry(void *opaque)
data->want_zero,
data->offset, data->bytes,
data->pnum, data->map, data->file);
-data->done = true;
 aio_wait_kick();
 }
 
@@ -2508,7 +2512,6 @@ static int 
bdrv_common_block_status_above(BlockDriverState *bs,
   int64_t *map,
   BlockDriverState **file)
 {
-Coroutine *co;
 BdrvCoBlockStatusData data = {
 .bs = bs,
 .base = base,
@@ -2518,18 +2521,9 @@ static int 
bdrv_common_block_status_above(BlockDriverState *bs,
 .pnum = pnum,
 .map = map,
 .file = file,
-.done = false,
 };
 
-if (qemu_in_coroutine()) {
-/* Fast-path if already in coroutine context */
-bdrv_block_status_above_co_entry(&data);
-} else {
-co = qemu_coroutine_create(bdrv_block_status_above_co_entry, &data);
-bdrv_coroutine_enter(bs, co);
-BDRV_POLL_WHILE(bs, !data.done);
-}
-return data.ret;
+return bdrv_run_co(bs, bdrv_block_status_above_co_entry, &data, &data.ret);
 }
 
 int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base,
@@ -2669,22 +2663,13 @@ static inline int
 bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
 bool is_read)
 {
-if (qemu_in_coroutine()) {
-return bdrv_co_rw_vmstate(bs, qiov, pos, is_read);
-} else {
-BdrvVmstateCo data = {
-.bs = bs,
-.qiov   = qiov,
-.pos= pos,
-.is_read= is_read,
-.ret= -EINPROGRESS,
-};
-Coroutine *co = qemu_coroutine_create(bdrv_co_rw_vmstate_entry, &data);
-
-bdrv_coroutine_enter(bs, co);
-BDRV_POLL_WHILE(bs, data.ret == -EINPROGRESS);
-return data.ret;
-}
+BdrvVmstateCo data = {
+.bs = bs,
+.qiov   = qiov,
+.pos= pos,
+.is_read= is_read,
+};
+return bdrv_run_co(bs, bdrv_co_rw_vmstate_entry, &data, &data.ret);
 }
 
 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
@@ -2890,22 +2875,11 @@ early_exit:
 
 int bdrv_flush(BlockDriverState *bs)
 {
-Coroutine *co;
 FlushCo flush_co = {
 .bs = bs,
-.ret = NOT_DONE,
 };
 
-if (qemu_in_coroutine()) {
-/* Fast-path if already in coroutine context */
-bdrv_flush_co_entry(&flush_co);
-} else {
-co = qemu_coroutine_create(bdrv_flush_co_en

[RFC PATCH 2/3] block: Allow bdrv_run_co() from different AioContext

2020-05-12 Thread Kevin Wolf

Coroutine functions that are entered through bdrv_run_co() are already
safe to call from synchronous code in a different AioContext because
bdrv_coroutine_enter() will schedule them in the context of the node.

However, the coroutine fastpath still requires that we're already in the
right AioContext when called in coroutine context.

In order to make the behaviour more consistent and to make life a bit
easier for callers, let's check the AioContext and automatically move
the current coroutine around if we're not in the right context yet.

Signed-off-by: Kevin Wolf 
---
 block/io.c | 15 ++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/block/io.c b/block/io.c
index c1badaadc9..7808e8bdc0 100644
--- a/block/io.c
+++ b/block/io.c
@@ -895,8 +895,21 @@ static int bdrv_run_co(BlockDriverState *bs, 
CoroutineEntry *entry,
void *opaque, int *ret)
 {
 if (qemu_in_coroutine()) {
-/* Fast-path if already in coroutine context */
+Coroutine *self = qemu_coroutine_self();
+AioContext *bs_ctx = bdrv_get_aio_context(bs);
+AioContext *co_ctx = qemu_coroutine_get_aio_context(self);
+
+if (bs_ctx != co_ctx) {
+/* Move to the iothread of the node */
+aio_co_schedule(bs_ctx, self);
+qemu_coroutine_yield();
+}
 entry(opaque);
+if (bs_ctx != co_ctx) {
+/* Move back to the original AioContext */
+aio_co_schedule(bs_ctx, self);
+qemu_coroutine_yield();
+}
 } else {
 Coroutine *co = qemu_coroutine_create(entry, opaque);
 *ret = NOT_DONE;
-- 
2.25.3

[RFC PATCH 3/3] block: Assert we're running in the right thread

2020-05-12 Thread Kevin Wolf

tracked_request_begin() is called for most I/O operations, so it's a
good place to assert that we're indeed running in the home thread of the
node's AioContext.

Signed-off-by: Kevin Wolf 
---
 block/io.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/block/io.c b/block/io.c
index 7808e8bdc0..924bf5ba46 100644
--- a/block/io.c
+++ b/block/io.c
@@ -695,14 +695,17 @@ static void tracked_request_begin(BdrvTrackedRequest *req,
   uint64_t bytes,
   enum BdrvTrackedRequestType type)
 {
+Coroutine *self = qemu_coroutine_self();
+
 assert(bytes <= INT64_MAX && offset <= INT64_MAX - bytes);
+assert(bs->aio_context == qemu_coroutine_get_aio_context(self));
 
 *req = (BdrvTrackedRequest){
 .bs = bs,
 .offset = offset,
 .bytes  = bytes,
 .type   = type,
-.co = qemu_coroutine_self(),
+.co = self,
 .serialising= false,
 .overlap_offset = offset,
 .overlap_bytes  = bytes,
-- 
2.25.3

[PATCH v3 11/15] copy-on-read: Support preadv/pwritev_part functions

2020-05-12 Thread Andrey Shinkevich

Add support for the recently introduced functions
bdrv_co_preadv_part()
and
bdrv_co_pwritev_part()
to the COR-filter driver.

Signed-off-by: Andrey Shinkevich 
---
 block/copy-on-read.c | 28 
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/block/copy-on-read.c b/block/copy-on-read.c
index 74e01ee..4030d56 100644
--- a/block/copy-on-read.c
+++ b/block/copy-on-read.c
@@ -74,21 +74,25 @@ static int64_t cor_getlength(BlockDriverState *bs)
 }
 
 
-static int coroutine_fn cor_co_preadv(BlockDriverState *bs,
-  uint64_t offset, uint64_t bytes,
-  QEMUIOVector *qiov, int flags)
+static int coroutine_fn cor_co_preadv_part(BlockDriverState *bs,
+   uint64_t offset, uint64_t bytes,
+   QEMUIOVector *qiov,
+   size_t qiov_offset,
+   int flags)
 {
-return bdrv_co_preadv(bs->file, offset, bytes, qiov,
-  flags | BDRV_REQ_COPY_ON_READ);
+return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset,
+   flags | BDRV_REQ_COPY_ON_READ);
 }
 
 
-static int coroutine_fn cor_co_pwritev(BlockDriverState *bs,
-   uint64_t offset, uint64_t bytes,
-   QEMUIOVector *qiov, int flags)
+static int coroutine_fn cor_co_pwritev_part(BlockDriverState *bs,
+uint64_t offset,
+uint64_t bytes,
+QEMUIOVector *qiov,
+size_t qiov_offset, int flags)
 {
-
-return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags);
+return bdrv_co_pwritev_part(bs->file, offset, bytes, qiov, qiov_offset,
+flags);
 }
 
 
@@ -152,8 +156,8 @@ static BlockDriver bdrv_copy_on_read = {
 
 .bdrv_getlength = cor_getlength,
 
-.bdrv_co_preadv = cor_co_preadv,
-.bdrv_co_pwritev= cor_co_pwritev,
+.bdrv_co_preadv_part= cor_co_preadv_part,
+.bdrv_co_pwritev_part   = cor_co_pwritev_part,
 .bdrv_co_pwrite_zeroes  = cor_co_pwrite_zeroes,
 .bdrv_co_pdiscard   = cor_co_pdiscard,
 .bdrv_co_pwritev_compressed = cor_co_pwritev_compressed,
-- 
1.8.3.1

[PATCH v3 02/15] copy-on-read: Support compressed writes

2020-05-12 Thread Andrey Shinkevich

From: Max Reitz 

Signed-off-by: Max Reitz 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 block/copy-on-read.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/block/copy-on-read.c b/block/copy-on-read.c
index 242d3ff..c4fa468 100644
--- a/block/copy-on-read.c
+++ b/block/copy-on-read.c
@@ -106,6 +106,16 @@ static int coroutine_fn cor_co_pdiscard(BlockDriverState 
*bs,
 }
 
 
+static int coroutine_fn cor_co_pwritev_compressed(BlockDriverState *bs,
+  uint64_t offset,
+  uint64_t bytes,
+  QEMUIOVector *qiov)
+{
+return bdrv_co_pwritev(bs->file, offset, bytes, qiov,
+   BDRV_REQ_WRITE_COMPRESSED);
+}
+
+
 static void cor_eject(BlockDriverState *bs, bool eject_flag)
 {
 bdrv_eject(bs->file->bs, eject_flag);
@@ -130,6 +140,7 @@ static BlockDriver bdrv_copy_on_read = {
 .bdrv_co_pwritev= cor_co_pwritev,
 .bdrv_co_pwrite_zeroes  = cor_co_pwrite_zeroes,
 .bdrv_co_pdiscard   = cor_co_pdiscard,
+.bdrv_co_pwritev_compressed = cor_co_pwritev_compressed,
 
 .bdrv_eject = cor_eject,
 .bdrv_lock_medium   = cor_lock_medium,
-- 
1.8.3.1

[PATCH v3 00/15] Apply COR-filter to the block-stream permanently

2020-05-12 Thread Andrey Shinkevich

With this series, all the block-stream COR operations pass through
the COR-filter. The patches 01-08/15 are taken from the series
"block: Deal with filters" by Max Reitz, the full version of that
can be found in the branches:

  https://git.xanclic.moe/XanClic/qemu child-access-functions-v6
  https://github.com/XanClic/qemu child-access-functions-v6

  When running iotests, apply "char-socket: Fix race condition"
  to avoid sporadic segmentation faults.

v3:
  01: The COR filter insert/remove functions moved to block/copy-on-read.c
  to be a part of API.
  02: block/stream.c code refactoring.
  03: The separate call to block_job_add_bdrv() is used to block operations
  on the active node after the filter inserted and the job created.
  04: The iotests case 030::test_overlapping_4 was modified to unbound
  the block-stream job from the base node.
  05: The COR driver functions preadv/pwritev replaced with their analogous
  preadv/pwritev_part.

v2:
  01: No more skipping filters while checking for operation blockers.
  However, we exclude filters between the bottom node and base
  because we do not set the operation blockers for filters anymore.
  02: As stated above, we do not set the operation blockers for filters
  anymore. So, skip filters when we block operations for the target
  node.
  03: The comment added for the patch 4/7.
  04: The QAPI target version changed to 5.1.
  05: The 'filter-node-name' now excluded from using in the test #030.
  If we need it no more in a final version of the series, the patch
  5/7 may be removed.
  06: The COR-filter included into the frozen chain of a block-stream job.
  The 'above_base' node pointer is left because it is essential for
  finding the base node in case of filters above.


Andrey Shinkevich (7):
  block: prepare block-stream for using COR-filter
  copy-on-read: Support change filename functions
  copy-on-read: Support preadv/pwritev_part functions
  copy-on-read: add filter append/drop functions
  qapi: add filter-node-name to block-stream
  iotests: prepare 245 for using filter in block-stream
  block: apply COR-filter to block-stream jobs

Max Reitz (8):
  block: Mark commit and mirror as filter drivers
  copy-on-read: Support compressed writes
  block: Add child access functions
  block: Add chain helper functions
  block: Include filters when freezing backing chain
  block: Use CAFs in block status functions
  commit: Deal with filters when blocking intermediate nodes
  block: Use CAFs when working with backing chains

 block.c| 275 ++---
 block/commit.c |  85 ++---
 block/copy-on-read.c   | 155 +--
 block/io.c |  19 +--
 block/mirror.c |   6 +-
 block/monitor/block-hmp-cmds.c |   4 +-
 block/stream.c |  89 +
 blockdev.c |  19 ++-
 include/block/block.h  |   6 +-
 include/block/block_int.h  |  67 +-
 qapi/block-core.json   |   6 +
 tests/qemu-iotests/030 |  17 +--
 tests/qemu-iotests/141.out |   2 +-
 tests/qemu-iotests/245 |  10 +-
 14 files changed, 621 insertions(+), 139 deletions(-)

-- 
1.8.3.1

[PATCH v3 12/15] copy-on-read: add filter append/drop functions

2020-05-12 Thread Andrey Shinkevich

Provide API for the COR-filter insertion/removal.
Also, drop the filter child permissions for an inactive state when the
filter node is being removed.

Signed-off-by: Andrey Shinkevich 
---
 block/copy-on-read.c | 98 
 1 file changed, 98 insertions(+)

diff --git a/block/copy-on-read.c b/block/copy-on-read.c
index 4030d56..31e1619 100644
--- a/block/copy-on-read.c
+++ b/block/copy-on-read.c
@@ -24,8 +24,15 @@
 #include "qemu/cutils.h"
 #include "block/block_int.h"
 #include "qemu/module.h"
+#include "qapi/error.h"
+#include "qapi/qmp/qdict.h"
+#include "block/copy-on-read.h"
 
 
+typedef struct BDRVStateCOR {
+bool active;
+} BDRVStateCOR;
+
 static int cor_open(BlockDriverState *bs, QDict *options, int flags,
 Error **errp)
 {
@@ -57,6 +64,17 @@ static void cor_child_perm(BlockDriverState *bs, BdrvChild 
*c,
uint64_t perm, uint64_t shared,
uint64_t *nperm, uint64_t *nshared)
 {
+BDRVStateCOR *s = bs->opaque;
+
+if (!s->active) {
+/*
+ * While the filter is being removed
+ */
+*nperm = 0;
+*nshared = BLK_PERM_ALL;
+return;
+}
+
 *nperm = perm & PERM_PASSTHROUGH;
 *nshared = (shared & PERM_PASSTHROUGH) | PERM_UNCHANGED;
 
@@ -150,6 +168,7 @@ static int cor_change_backing_file(BlockDriverState *bs,
 
 static BlockDriver bdrv_copy_on_read = {
 .format_name= "copy-on-read",
+.instance_size  = sizeof(BDRVStateCOR),
 
 .bdrv_open  = cor_open,
 .bdrv_child_perm= cor_child_perm,
@@ -178,4 +197,83 @@ static void bdrv_copy_on_read_init(void)
 bdrv_register(&bdrv_copy_on_read);
 }
 
+
+static BlockDriverState *create_filter_node(BlockDriverState *bs,
+const char *filter_node_name,
+Error **errp)
+{
+QDict *opts = qdict_new();
+
+qdict_put_str(opts, "driver", "copy-on-read");
+qdict_put_str(opts, "file", bdrv_get_node_name(bs));
+if (filter_node_name) {
+qdict_put_str(opts, "node-name", filter_node_name);
+}
+
+return bdrv_open(NULL, NULL, opts, BDRV_O_RDWR, errp);
+}
+
+
+BlockDriverState *bdrv_cor_filter_append(BlockDriverState *bs,
+ const char *filter_node_name,
+ Error **errp)
+{
+BlockDriverState *cor_filter_bs;
+BDRVStateCOR *state;
+Error *local_err = NULL;
+
+cor_filter_bs = create_filter_node(bs, filter_node_name, errp);
+if (cor_filter_bs == NULL) {
+error_prepend(errp, "Could not create filter node: ");
+return NULL;
+}
+
+if (!filter_node_name) {
+cor_filter_bs->implicit = true;
+}
+
+bdrv_drained_begin(bs);
+bdrv_replace_node(bs, cor_filter_bs, &local_err);
+bdrv_drained_end(bs);
+
+if (local_err) {
+bdrv_unref(cor_filter_bs);
+error_propagate(errp, local_err);
+return NULL;
+}
+
+state = cor_filter_bs->opaque;
+state->active = true;
+
+return cor_filter_bs;
+}
+
+
+void bdrv_cor_filter_drop(BlockDriverState *cor_filter_bs)
+{
+BdrvChild *child;
+BlockDriverState *bs;
+BDRVStateCOR *s = cor_filter_bs->opaque;
+
+child = bdrv_filtered_rw_child(cor_filter_bs);
+if (!child) {
+return;
+}
+bs = child->bs;
+
+/* Retain the BDS until we complete the graph change. */
+bdrv_ref(bs);
+/* Hold a guest back from writing while permissions are being reset. */
+bdrv_drained_begin(bs);
+/* Drop permissions before the graph change. */
+s->active = false;
+bdrv_child_refresh_perms(cor_filter_bs, child, &error_abort);
+bdrv_replace_node(cor_filter_bs, bs, &error_abort);
+
+bdrv_drained_end(bs);
+bdrv_unref(bs);
+bdrv_unref(cor_filter_bs);
+}
+
+
 block_init(bdrv_copy_on_read_init);
-- 
1.8.3.1

[PATCH v3 09/15] block: prepare block-stream for using COR-filter

2020-05-12 Thread Andrey Shinkevich

This patch is the first one in the series where the COR-filter node
will be hard-coded for using in the block-stream job. The block jobs
may be run in parallel. Exclude conflicts with filter nodes used for
a concurrent job while checking for the blocked operations. It incurs
changes in the iotests 030::test_overlapping_4 that now passes with no
conflict because the stream job does not have a real dependency on its
base and on a filter above it.

Signed-off-by: Andrey Shinkevich 
---
 blockdev.c | 11 +--
 tests/qemu-iotests/030 |  9 -
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index b3c840e..97c2ba2 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -2763,6 +2763,7 @@ void qmp_block_stream(bool has_job_id, const char 
*job_id, const char *device,
 Error *local_err = NULL;
 const char *base_name = NULL;
 int job_flags = JOB_DEFAULT;
+BlockDriverState *bottom_cow_node;
 
 if (!has_on_error) {
 on_error = BLOCKDEV_ON_ERROR_REPORT;
@@ -2807,8 +2808,14 @@ void qmp_block_stream(bool has_job_id, const char 
*job_id, const char *device,
 base_name = base_bs->filename;
 }
 
-/* Check for op blockers in the whole chain between bs and base */
-for (iter = bs; iter && iter != base_bs; iter = backing_bs(iter)) {
+bottom_cow_node = bdrv_find_overlay(bs, base_bs);
+if (!bottom_cow_node) {
+error_setg(errp, "bottom node is not found, nothing to stream");
+goto out;
+}
+/* Check for op blockers in the whole chain between bs and bottom */
+for (iter = bs; iter && iter != bdrv_filtered_bs(bottom_cow_node);
+ iter = bdrv_filtered_bs(iter)) {
 if (bdrv_op_is_blocked(iter, BLOCK_OP_TYPE_STREAM, errp)) {
 goto out;
 }
diff --git a/tests/qemu-iotests/030 b/tests/qemu-iotests/030
index 104e3ce..d7638cd 100755
--- a/tests/qemu-iotests/030
+++ b/tests/qemu-iotests/030
@@ -368,8 +368,7 @@ class TestParallelOps(iotests.QMPTestCase):
 self.wait_until_completed(drive='commit-drive0')
 
 # In this case the base node of the stream job is the same as the
-# top node of commit job. Since this results in the commit filter
-# node being part of the stream chain, this is not allowed.
+# top node of commit job.
 def test_overlapping_4(self):
 self.assert_no_active_block_jobs()
 
@@ -381,13 +380,13 @@ class TestParallelOps(iotests.QMPTestCase):
 
 # Stream from node2 into node4
 result = self.vm.qmp('block-stream', device='node4', 
base_node='node2', job_id='node4')
-self.assert_qmp(result, 'error/desc',
-"Cannot freeze 'backing' link to 'commit-filter'")
+self.assert_qmp(result, 'return', {})
 
 result = self.vm.qmp('block-job-set-speed', device='drive0', speed=0)
 self.assert_qmp(result, 'return', {})
 
-self.wait_until_completed()
+self.vm.run_job(job='drive0', auto_dismiss=True)
+self.vm.run_job(job='node4', auto_dismiss=True)
 self.assert_no_active_block_jobs()
 
 # In this case the base node of the stream job is the commit job's
-- 
1.8.3.1

[PATCH v3 04/15] block: Add chain helper functions

2020-05-12 Thread Andrey Shinkevich

From: Max Reitz 

Add some helper functions for skipping filters in a chain of block
nodes.

Signed-off-by: Max Reitz 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 block.c   | 55 +++
 include/block/block_int.h |  3 +++
 2 files changed, 58 insertions(+)

diff --git a/block.c b/block.c
index b2aae2e..5b4ebfe 100644
--- a/block.c
+++ b/block.c
@@ -6863,3 +6863,58 @@ BdrvChild *bdrv_primary_child(BlockDriverState *bs)
 {
 return bdrv_filtered_rw_child(bs) ?: bs->file;
 }
+
+static BlockDriverState *bdrv_skip_filters(BlockDriverState *bs,
+   bool stop_on_explicit_filter)
+{
+BdrvChild *filtered;
+
+if (!bs) {
+return NULL;
+}
+
+while (!(stop_on_explicit_filter && !bs->implicit)) {
+filtered = bdrv_filtered_rw_child(bs);
+if (!filtered) {
+break;
+}
+bs = filtered->bs;
+}
+/*
+ * Note that this treats nodes with bs->drv == NULL as not being
+ * R/W filters (bs->drv == NULL should be replaced by something
+ * else anyway).
+ * The advantage of this behavior is that this function will thus
+ * always return a non-NULL value (given a non-NULL @bs).
+ */
+
+return bs;
+}
+
+/*
+ * Return the first BDS that has not been added implicitly or that
+ * does not have an RW-filtered child down the chain starting from @bs
+ * (including @bs itself).
+ */
+BlockDriverState *bdrv_skip_implicit_filters(BlockDriverState *bs)
+{
+return bdrv_skip_filters(bs, true);
+}
+
+/*
+ * Return the first BDS that does not have an RW-filtered child down
+ * the chain starting from @bs (including @bs itself).
+ */
+BlockDriverState *bdrv_skip_rw_filters(BlockDriverState *bs)
+{
+return bdrv_skip_filters(bs, false);
+}
+
+/*
+ * For a backing chain, return the first non-filter backing image of
+ * the first non-filter image.
+ */
+BlockDriverState *bdrv_backing_chain_next(BlockDriverState *bs)
+{
+return 
bdrv_skip_rw_filters(bdrv_filtered_cow_bs(bdrv_skip_rw_filters(bs)));
+}
diff --git a/include/block/block_int.h b/include/block/block_int.h
index dca59e9..86f7666 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -1347,6 +1347,9 @@ BdrvChild *bdrv_filtered_child(BlockDriverState *bs);
 BdrvChild *bdrv_metadata_child(BlockDriverState *bs);
 BdrvChild *bdrv_storage_child(BlockDriverState *bs);
 BdrvChild *bdrv_primary_child(BlockDriverState *bs);
+BlockDriverState *bdrv_skip_implicit_filters(BlockDriverState *bs);
+BlockDriverState *bdrv_skip_rw_filters(BlockDriverState *bs);
+BlockDriverState *bdrv_backing_chain_next(BlockDriverState *bs);
 
 static inline BlockDriverState *child_bs(BdrvChild *child)
 {
-- 
1.8.3.1

[PATCH v3 08/15] block: Use CAFs when working with backing chains

2020-05-12 Thread Andrey Shinkevich

From: Max Reitz 

Use child access functions when iterating through backing chains so
filters do not break the chain.

Signed-off-by: Max Reitz 
---
 block.c | 40 
 1 file changed, 28 insertions(+), 12 deletions(-)

diff --git a/block.c b/block.c
index 459412e..69e9152 100644
--- a/block.c
+++ b/block.c
@@ -4593,7 +4593,8 @@ int bdrv_change_backing_file(BlockDriverState *bs,
 }
 
 /*
- * Finds the image layer in the chain that has 'bs' as its backing file.
+ * Finds the image layer in the chain that has 'bs' (or a filter on
+ * top of it) as its backing file.
  *
  * active is the current topmost image.
  *
@@ -4605,11 +4606,18 @@ int bdrv_change_backing_file(BlockDriverState *bs,
 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
 BlockDriverState *bs)
 {
-while (active && bs != backing_bs(active)) {
-active = backing_bs(active);
+bs = bdrv_skip_rw_filters(bs);
+active = bdrv_skip_rw_filters(active);
+
+while (active) {
+BlockDriverState *next = bdrv_backing_chain_next(active);
+if (bs == next) {
+return active;
+}
+active = next;
 }
 
-return active;
+return NULL;
 }
 
 /* Given a BDS, searches for the base layer. */
@@ -4761,9 +4769,7 @@ int bdrv_drop_intermediate(BlockDriverState *top, 
BlockDriverState *base,
  * other intermediate nodes have been dropped.
  * If 'top' is an implicit node (e.g. "commit_top") we should skip
  * it because no one inherits from it. We use explicit_top for that. */
-while (explicit_top && explicit_top->implicit) {
-explicit_top = backing_bs(explicit_top);
-}
+explicit_top = bdrv_skip_implicit_filters(explicit_top);
 update_inherits_from = bdrv_inherits_from_recursive(base, explicit_top);
 
 /* success - we can delete the intermediate states, and link top->base */
@@ -5205,7 +5211,7 @@ BlockDriverState *bdrv_lookup_bs(const char *device,
 bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base)
 {
 while (top && top != base) {
-top = backing_bs(top);
+top = bdrv_filtered_bs(top);
 }
 
 return top != NULL;
@@ -5466,7 +5472,17 @@ BlockDriverState 
*bdrv_find_backing_image(BlockDriverState *bs,
 
 is_protocol = path_has_protocol(backing_file);
 
-for (curr_bs = bs; curr_bs->backing; curr_bs = curr_bs->backing->bs) {
+/*
+ * Being largely a legacy function, skip any filters here
+ * (because filters do not have normal filenames, so they cannot
+ * match anyway; and allowing json:{} filenames is a bit out of
+ * scope).
+ */
+for (curr_bs = bdrv_skip_rw_filters(bs);
+ bdrv_filtered_cow_child(curr_bs) != NULL;
+ curr_bs = bdrv_backing_chain_next(curr_bs))
+{
+BlockDriverState *bs_below = bdrv_backing_chain_next(curr_bs);
 
 /* If either of the filename paths is actually a protocol, then
  * compare unmodified paths; otherwise make paths relative */
@@ -5474,7 +5490,7 @@ BlockDriverState 
*bdrv_find_backing_image(BlockDriverState *bs,
 char *backing_file_full_ret;
 
 if (strcmp(backing_file, curr_bs->backing_file) == 0) {
-retval = curr_bs->backing->bs;
+retval = bs_below;
 break;
 }
 /* Also check against the full backing filename for the image */
@@ -5484,7 +5500,7 @@ BlockDriverState 
*bdrv_find_backing_image(BlockDriverState *bs,
 bool equal = strcmp(backing_file, backing_file_full_ret) == 0;
 g_free(backing_file_full_ret);
 if (equal) {
-retval = curr_bs->backing->bs;
+retval = bs_below;
 break;
 }
 }
@@ -5510,7 +5526,7 @@ BlockDriverState 
*bdrv_find_backing_image(BlockDriverState *bs,
 g_free(filename_tmp);
 
 if (strcmp(backing_file_full, filename_full) == 0) {
-retval = curr_bs->backing->bs;
+retval = bs_below;
 break;
 }
 }
-- 
1.8.3.1

[PATCH v3 07/15] commit: Deal with filters when blocking intermediate nodes

2020-05-12 Thread Andrey Shinkevich

From: Max Reitz 

This includes some permission limiting (for example, we only need to
take the RESIZE permission if the base is smaller than the top).

Signed-off-by: Max Reitz 
Signed-off-by: Andrey Shinkevich 
---
 block/commit.c | 75 ++
 1 file changed, 60 insertions(+), 15 deletions(-)

diff --git a/block/commit.c b/block/commit.c
index e1f45a4..4df145d 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -37,6 +37,7 @@ typedef struct CommitBlockJob {
 BlockBackend *top;
 BlockBackend *base;
 BlockDriverState *base_bs;
+BlockDriverState *above_base;
 BlockdevOnError on_error;
 bool base_read_only;
 bool chain_frozen;
@@ -153,7 +154,7 @@ static int coroutine_fn commit_run(Job *job, Error **errp)
 break;
 }
 /* Copy if allocated above the base */
-ret = bdrv_is_allocated_above(blk_bs(s->top), blk_bs(s->base), false,
+ret = bdrv_is_allocated_above(blk_bs(s->top), s->above_base, true,
   offset, COMMIT_BUFFER_SIZE, &n);
 copy = (ret == 1);
 trace_commit_one_iteration(s, offset, n, ret);
@@ -253,15 +254,35 @@ void commit_start(const char *job_id, BlockDriverState 
*bs,
 CommitBlockJob *s;
 BlockDriverState *iter;
 BlockDriverState *commit_top_bs = NULL;
+BlockDriverState *filtered_base;
 Error *local_err = NULL;
+int64_t base_size, top_size;
+uint64_t perms, iter_shared_perms;
 int ret;
 
 assert(top != bs);
-if (top == base) {
+if (bdrv_skip_rw_filters(top) == bdrv_skip_rw_filters(base)) {
 error_setg(errp, "Invalid files for merge: top and base are the same");
 return;
 }
 
+base_size = bdrv_getlength(base);
+if (base_size < 0) {
+error_setg_errno(errp, -base_size, "Could not inquire base image 
size");
+return;
+}
+
+top_size = bdrv_getlength(top);
+if (top_size < 0) {
+error_setg_errno(errp, -top_size, "Could not inquire top image size");
+return;
+}
+
+perms = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE;
+if (base_size < top_size) {
+perms |= BLK_PERM_RESIZE;
+}
+
 s = block_job_create(job_id, &commit_job_driver, NULL, bs, 0, BLK_PERM_ALL,
  speed, creation_flags, NULL, NULL, errp);
 if (!s) {
@@ -301,17 +322,43 @@ void commit_start(const char *job_id, BlockDriverState 
*bs,
 
 s->commit_top_bs = commit_top_bs;
 
-/* Block all nodes between top and base, because they will
- * disappear from the chain after this operation. */
-assert(bdrv_chain_contains(top, base));
-for (iter = top; iter != base; iter = backing_bs(iter)) {
-/* XXX BLK_PERM_WRITE needs to be allowed so we don't block ourselves
- * at s->base (if writes are blocked for a node, they are also blocked
- * for its backing file). The other options would be a second filter
- * driver above s->base. */
+/*
+ * Block all nodes between top and base, because they will
+ * disappear from the chain after this operation.
+ * Note that this assumes that the user is fine with removing all
+ * nodes (including R/W filters) between top and base.  Assuring
+ * this is the responsibility of the interface (i.e. whoever calls
+ * commit_start()).
+ */
+s->above_base = bdrv_find_overlay(top, base);
+assert(s->above_base);
+
+/*
+ * The topmost node with
+ * bdrv_skip_rw_filters(filtered_base) == bdrv_skip_rw_filters(base)
+ */
+filtered_base = bdrv_filtered_cow_bs(s->above_base);
+assert(bdrv_skip_rw_filters(filtered_base) == bdrv_skip_rw_filters(base));
+
+/*
+ * XXX BLK_PERM_WRITE needs to be allowed so we don't block ourselves
+ * at s->base (if writes are blocked for a node, they are also blocked
+ * for its backing file). The other options would be a second filter
+ * driver above s->base.
+ */
+iter_shared_perms = BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE;
+
+for (iter = top; iter != base; iter = bdrv_filtered_bs(iter)) {
+if (iter == filtered_base) {
+/*
+ * From here on, all nodes are filters on the base.  This
+ * allows us to share BLK_PERM_CONSISTENT_READ.
+ */
+iter_shared_perms |= BLK_PERM_CONSISTENT_READ;
+}
+
 ret = block_job_add_bdrv(&s->common, "intermediate node", iter, 0,
- BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE,
- errp);
+ iter_shared_perms, errp);
 if (ret < 0) {
 goto fail;
 }
@@ -328,9 +375,7 @@ void commit_start(const char *job_id, BlockDriverState *bs,
 }
 
 s->base = blk_new(s->common.job.aio_context,
-  BLK_PERM_CONSISTENT_READ
-  | BLK_PERM_WRITE
-  | BLK_PERM_RESIZE,
+

[PATCH v3 14/15] iotests: prepare 245 for using filter in block-stream

2020-05-12 Thread Andrey Shinkevich

The preliminary patch modifies the test 245 to prepare the block-stream
job for using COR-filter. The filter breaks the backing chain being
connected to the underlying node by file child link.

Signed-off-by: Andrey Shinkevich 
---
 tests/qemu-iotests/245 | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/tests/qemu-iotests/245 b/tests/qemu-iotests/245
index 4f5f0bb..5bcddc4 100755
--- a/tests/qemu-iotests/245
+++ b/tests/qemu-iotests/245
@@ -896,15 +896,19 @@ class TestBlockdevReopen(iotests.QMPTestCase):
 # make hd1 read-only and block-stream requires it to be read-write
 # (Which error message appears depends on whether the stream job is
 # already done with copying at this point.)
-self.reopen(opts, {},
+opts = hd_opts(1)
+opts['backing'] = hd_opts(2)
+opts['backing']['backing'] = None
+self.reopen(opts, {'read-only': True},
 ["Can't set node 'hd1' to r/o with copy-on-read enabled",
  "Cannot make block node read-only, there is a writer on it"])
 
 # We can't remove hd2 while the stream job is ongoing
-opts['backing']['backing'] = None
-self.reopen(opts, {'backing.read-only': False}, "Cannot change 
'backing' link from 'hd1' to 'hd2'")
+opts['backing'] = None
+self.reopen(opts, {'read-only': False}, "Cannot change 'backing' link 
from 'hd1' to 'hd2'")
 
 # We can detach hd1 from hd0 because it doesn't affect the stream job
+opts = hd_opts(0)
 opts['backing'] = None
 self.reopen(opts)
 
-- 
1.8.3.1

[PATCH v3 10/15] copy-on-read: Support change filename functions

2020-05-12 Thread Andrey Shinkevich

The COR-filter driver should support a redirecting function to refresh
filenames. Otherwise, a file name of the filter will be copied
instead of the one of a data node. It is also true for the function
bdrv_change_backing_file().

Signed-off-by: Andrey Shinkevich 
---
 block/copy-on-read.c | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/block/copy-on-read.c b/block/copy-on-read.c
index c4fa468..74e01ee 100644
--- a/block/copy-on-read.c
+++ b/block/copy-on-read.c
@@ -21,6 +21,7 @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu/cutils.h"
 #include "block/block_int.h"
 #include "qemu/module.h"
 
@@ -128,6 +129,21 @@ static void cor_lock_medium(BlockDriverState *bs, bool 
locked)
 }
 
 
+static void cor_refresh_filename(BlockDriverState *bs)
+{
+pstrcpy(bs->exact_filename, sizeof(bs->exact_filename),
+bs->file->bs->filename);
+}
+
+
+static int cor_change_backing_file(BlockDriverState *bs,
+   const char *backing_file,
+   const char *backing_fmt)
+{
+return bdrv_change_backing_file(bs->file->bs, backing_file, backing_fmt);
+}
+
+
 static BlockDriver bdrv_copy_on_read = {
 .format_name= "copy-on-read",
 
@@ -146,6 +162,8 @@ static BlockDriver bdrv_copy_on_read = {
 .bdrv_lock_medium   = cor_lock_medium,
 
 .bdrv_co_block_status   = bdrv_co_block_status_from_file,
+.bdrv_refresh_filename  = cor_refresh_filename,
+.bdrv_change_backing_file   = cor_change_backing_file,
 
 .has_variable_length= true,
 .is_filter  = true,
-- 
1.8.3.1

[PATCH v3 03/15] block: Add child access functions

2020-05-12 Thread Andrey Shinkevich

From: Max Reitz 

There are BDS children that the general block layer code can access,
namely bs->file and bs->backing.  Since the introduction of filters and
external data files, their meaning is not quite clear.  bs->backing can
be a COW source, or it can be an R/W-filtered child; bs->file can be an
R/W-filtered child, it can be data and metadata storage, or it can be
just metadata storage.

This overloading really is not helpful.  This patch adds function that
retrieve the correct child for each exact purpose.  Later patches in
this series will make use of them.  Doing so will allow us to handle
filter nodes and external data files in a meaningful way.

Signed-off-by: Max Reitz 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
Signed-off-by: Andrey Shinkevich 
---
 block.c   | 99 +++
 include/block/block_int.h | 57 +--
 2 files changed, 153 insertions(+), 3 deletions(-)

diff --git a/block.c b/block.c
index 0653ccb..b2aae2e 100644
--- a/block.c
+++ b/block.c
@@ -6764,3 +6764,102 @@ void bdrv_del_child(BlockDriverState *parent_bs, 
BdrvChild *child, Error **errp)
 
 parent_bs->drv->bdrv_del_child(parent_bs, child, errp);
 }
+
+/*
+ * Return the child that @bs acts as an overlay for, and from which data may be
+ * copied in COW or COR operations.  Usually this is the backing file.
+ */
+BdrvChild *bdrv_filtered_cow_child(BlockDriverState *bs)
+{
+if (!bs || !bs->drv) {
+return NULL;
+}
+
+if (bs->drv->is_filter) {
+return NULL;
+}
+
+return bs->backing;
+}
+
+/*
+ * If @bs acts as a pass-through filter for one of its children,
+ * return that child.  "Pass-through" means that write operations to
+ * @bs are forwarded to that child instead of triggering COW.
+ */
+BdrvChild *bdrv_filtered_rw_child(BlockDriverState *bs)
+{
+if (!bs || !bs->drv) {
+return NULL;
+}
+
+if (!bs->drv->is_filter) {
+return NULL;
+}
+
+/* Only one of @backing or @file may be used */
+assert(!(bs->backing && bs->file));
+
+return bs->backing ?: bs->file;
+}
+
+/*
+ * Return any filtered child, independently of how it reacts to write
+ * accesses and whether data is copied onto this BDS through COR.
+ */
+BdrvChild *bdrv_filtered_child(BlockDriverState *bs)
+{
+BdrvChild *cow_child = bdrv_filtered_cow_child(bs);
+BdrvChild *rw_child = bdrv_filtered_rw_child(bs);
+
+/* There can only be one filtered child at a time */
+assert(!(cow_child && rw_child));
+
+return cow_child ?: rw_child;
+}
+
+/*
+ * Return the child that stores the metadata for this node.
+ */
+BdrvChild *bdrv_metadata_child(BlockDriverState *bs)
+{
+if (!bs || !bs->drv) {
+return NULL;
+}
+
+/* Filters do not have metadata */
+if (bs->drv->is_filter) {
+return NULL;
+}
+
+return bs->file;
+}
+
+/*
+ * Return the child that stores the data that is allocated on this
+ * node.  This may or may not include metadata.
+ */
+BdrvChild *bdrv_storage_child(BlockDriverState *bs)
+{
+if (!bs || !bs->drv) {
+return NULL;
+}
+
+if (bs->drv->bdrv_storage_child) {
+return bs->drv->bdrv_storage_child(bs);
+}
+
+return bdrv_filtered_rw_child(bs) ?: bs->file;
+}
+
+/*
+ * Return the primary child of this node: For filters, that is the
+ * filtered child.  For other nodes, that is usually the child storing
+ * metadata.
+ * (A generally more helpful description is that this is (usually) the
+ * child that has the same filename as @bs.)
+ */
+BdrvChild *bdrv_primary_child(BlockDriverState *bs)
+{
+return bdrv_filtered_rw_child(bs) ?: bs->file;
+}
diff --git a/include/block/block_int.h b/include/block/block_int.h
index df6d027..dca59e9 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -89,9 +89,11 @@ struct BlockDriver {
 int instance_size;
 
 /* set to true if the BlockDriver is a block filter. Block filters pass
- * certain callbacks that refer to data (see block.c) to their bs->file if
- * the driver doesn't implement them. Drivers that do not wish to forward
- * must implement them and return -ENOTSUP.
+ * certain callbacks that refer to data (see block.c) to their bs->file
+ * or bs->backing (whichever one exists) if the driver doesn't implement
+ * them. Drivers that do not wish to forward must implement them and return
+ * -ENOTSUP.
+ * Note that filters are not allowed to modify data.
  */
 bool is_filter;
 /*
@@ -585,6 +587,13 @@ struct BlockDriver {
  * If this pointer is NULL, the array is considered empty.
  * "filename" and "driver" are always considered strong. */
 const char *const *strong_runtime_opts;
+
+/**
+ * Return the data storage child, if there is exactly one.  If
+ * this function is not implemented, the block layer will assume
+ * bs->file to be this child.
+ */
+BdrvChild *(*bdrv_storage_child)(

[PATCH v3 06/15] block: Use CAFs in block status functions

2020-05-12 Thread Andrey Shinkevich

From: Max Reitz 

Use the child access functions in the block status inquiry functions as
appropriate.

Signed-off-by: Max Reitz 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 block/io.c | 19 ++-
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/block/io.c b/block/io.c
index 7d30e61..47d8096 100644
--- a/block/io.c
+++ b/block/io.c
@@ -2387,11 +2387,12 @@ static int coroutine_fn 
bdrv_co_block_status(BlockDriverState *bs,
 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
 ret |= BDRV_BLOCK_ALLOCATED;
 } else if (want_zero) {
+BlockDriverState *cow_bs = bdrv_filtered_cow_bs(bs);
+
 if (bdrv_unallocated_blocks_are_zero(bs)) {
 ret |= BDRV_BLOCK_ZERO;
-} else if (bs->backing) {
-BlockDriverState *bs2 = bs->backing->bs;
-int64_t size2 = bdrv_getlength(bs2);
+} else if (cow_bs) {
+int64_t size2 = bdrv_getlength(cow_bs);
 
 if (size2 >= 0 && offset >= size2) {
 ret |= BDRV_BLOCK_ZERO;
@@ -2457,7 +2458,7 @@ static int coroutine_fn 
bdrv_co_block_status_above(BlockDriverState *bs,
 bool first = true;
 
 assert(bs != base);
-for (p = bs; p != base; p = backing_bs(p)) {
+for (p = bs; p != base; p = bdrv_filtered_bs(p)) {
 ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map,
file);
 if (ret < 0) {
@@ -2543,7 +2544,7 @@ int bdrv_block_status_above(BlockDriverState *bs, 
BlockDriverState *base,
 int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes,
   int64_t *pnum, int64_t *map, BlockDriverState **file)
 {
-return bdrv_block_status_above(bs, backing_bs(bs),
+return bdrv_block_status_above(bs, bdrv_filtered_bs(bs),
offset, bytes, pnum, map, file);
 }
 
@@ -2553,9 +2554,9 @@ int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, 
int64_t offset,
 int ret;
 int64_t dummy;
 
-ret = bdrv_common_block_status_above(bs, backing_bs(bs), false, offset,
- bytes, pnum ? pnum : &dummy, NULL,
- NULL);
+ret = bdrv_common_block_status_above(bs, bdrv_filtered_bs(bs), false,
+ offset, bytes, pnum ? pnum : &dummy,
+ NULL, NULL);
 if (ret < 0) {
 return ret;
 }
@@ -2618,7 +2619,7 @@ int bdrv_is_allocated_above(BlockDriverState *top,
 break;
 }
 
-intermediate = backing_bs(intermediate);
+intermediate = bdrv_filtered_bs(intermediate);
 }
 
 *pnum = n;
-- 
1.8.3.1

[PATCH v3 05/15] block: Include filters when freezing backing chain

2020-05-12 Thread Andrey Shinkevich

From: Max Reitz 

In order to make filters work in backing chains, the associated
functions must be able to deal with them and freeze all filter links, be
they COW or R/W filter links.

In the process, rename these functions to reflect that they now act on
generalized chains of filter nodes instead of backing chains alone.

While at it, add some comments that note which functions require their
caller to ensure that a given child link is not frozen, and how the
callers do so.

Signed-off-by: Max Reitz 
Signed-off-by: Andrey Shinkevich 
---
 block.c   | 81 ++-
 block/commit.c|  8 ++---
 block/mirror.c|  4 +--
 block/stream.c|  8 ++---
 include/block/block.h |  6 ++--
 5 files changed, 60 insertions(+), 47 deletions(-)

diff --git a/block.c b/block.c
index 5b4ebfe..459412e 100644
--- a/block.c
+++ b/block.c
@@ -2513,12 +2513,15 @@ static void bdrv_replace_child_noperm(BdrvChild *child,
  * If @new_bs is not NULL, bdrv_check_perm() must be called beforehand, as this
  * function uses bdrv_set_perm() to update the permissions according to the new
  * reference that @new_bs gets.
+ *
+ * Callers must ensure that child->frozen is false.
  */
 static void bdrv_replace_child(BdrvChild *child, BlockDriverState *new_bs)
 {
 BlockDriverState *old_bs = child->bs;
 uint64_t perm, shared_perm;
 
+/* Asserts that child->frozen == false */
 bdrv_replace_child_noperm(child, new_bs);
 
 /*
@@ -2676,6 +2679,7 @@ static void bdrv_detach_child(BdrvChild *child)
 g_free(child);
 }
 
+/* Callers must ensure that child->frozen is false. */
 void bdrv_root_unref_child(BdrvChild *child)
 {
 BlockDriverState *child_bs;
@@ -2685,10 +2689,6 @@ void bdrv_root_unref_child(BdrvChild *child)
 bdrv_unref(child_bs);
 }
 
-/**
- * Clear all inherits_from pointers from children and grandchildren of
- * @root that point to @root, where necessary.
- */
 static void bdrv_unset_inherits_from(BlockDriverState *root, BdrvChild *child)
 {
 BdrvChild *c;
@@ -2713,6 +2713,7 @@ static void bdrv_unset_inherits_from(BlockDriverState 
*root, BdrvChild *child)
 }
 }
 
+/* Callers must ensure that child->frozen is false. */
 void bdrv_unref_child(BlockDriverState *parent, BdrvChild *child)
 {
 if (child == NULL) {
@@ -2756,7 +2757,7 @@ void bdrv_set_backing_hd(BlockDriverState *bs, 
BlockDriverState *backing_hd,
 bool update_inherits_from = bdrv_chain_contains(bs, backing_hd) &&
 bdrv_inherits_from_recursive(backing_hd, bs);
 
-if (bdrv_is_backing_chain_frozen(bs, backing_bs(bs), errp)) {
+if (bdrv_is_chain_frozen(bs, child_bs(bs->backing), errp)) {
 return;
 }
 
@@ -2765,6 +2766,7 @@ void bdrv_set_backing_hd(BlockDriverState *bs, 
BlockDriverState *backing_hd,
 }
 
 if (bs->backing) {
+/* Cannot be frozen, we checked that above */
 bdrv_unref_child(bs, bs->backing);
 bs->backing = NULL;
 }
@@ -3912,8 +3914,7 @@ static int bdrv_reopen_parse_backing(BDRVReopenState 
*reopen_state,
 return -EPERM;
 }
 /* Check if the backing link that we want to replace is frozen */
-if (bdrv_is_backing_chain_frozen(overlay_bs, backing_bs(overlay_bs),
- errp)) {
+if (bdrv_is_chain_frozen(overlay_bs, backing_bs(overlay_bs), errp)) {
 return -EPERM;
 }
 reopen_state->replace_backing_bs = true;
@@ -4248,6 +4249,7 @@ static void bdrv_close(BlockDriverState *bs)
 
 if (bs->drv) {
 if (bs->drv->bdrv_close) {
+/* Must unfreeze all children, so bdrv_unref_child() works */
 bs->drv->bdrv_close(bs);
 }
 bs->drv = NULL;
@@ -4617,20 +4619,22 @@ BlockDriverState *bdrv_find_base(BlockDriverState *bs)
 }
 
 /*
- * Return true if at least one of the backing links between @bs and
- * @base is frozen. @errp is set if that's the case.
+ * Return true if at least one of the (COW and R/W) filter links
+ * between @bs and @base is frozen. @errp is set if that's the case.
  * @base must be reachable from @bs, or NULL.
  */
-bool bdrv_is_backing_chain_frozen(BlockDriverState *bs, BlockDriverState *base,
-  Error **errp)
+bool bdrv_is_chain_frozen(BlockDriverState *bs, BlockDriverState *base,
+  Error **errp)
 {
 BlockDriverState *i;
+BdrvChild *child;
+
+for (i = bs; i != base; i = child_bs(child)) {
+child = bdrv_filtered_child(i);
 
-for (i = bs; i != base; i = backing_bs(i)) {
-if (i->backing && i->backing->frozen) {
+if (child && child->frozen) {
 error_setg(errp, "Cannot change '%s' link from '%s' to '%s'",
-   i->backing->name, i->node_name,
-   backing_bs(i)->node_name);
+   child->name, i->node_name, child->bs->node_name);
 return true;
 }
 }
@@ -4639

[PATCH v3 15/15] block: apply COR-filter to block-stream jobs

2020-05-12 Thread Andrey Shinkevich

The patch completes the series with the COR-filter insertion to any
block-stream operation. It also makes changes to the iotests 030 and
141.out.

Signed-off-by: Andrey Shinkevich 
---
 block/stream.c | 83 --
 tests/qemu-iotests/030 |  8 +++--
 tests/qemu-iotests/141.out |  2 +-
 3 files changed, 64 insertions(+), 29 deletions(-)

diff --git a/block/stream.c b/block/stream.c
index e0b35f8..a74a07b 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -19,6 +19,7 @@
 #include "qapi/qmp/qerror.h"
 #include "qemu/ratelimit.h"
 #include "sysemu/block-backend.h"
+#include "block/copy-on-read.h"
 
 enum {
 /*
@@ -32,8 +33,11 @@ enum {
 typedef struct StreamBlockJob {
 BlockJob common;
 BlockDriverState *bottom;
+BlockDriverState *cor_filter_bs;
+BlockDriverState *target_bs;
 BlockdevOnError on_error;
 char *backing_file_str;
+char *base_fmt;
 bool bs_read_only;
 bool chain_frozen;
 } StreamBlockJob;
@@ -52,33 +56,25 @@ static void stream_abort(Job *job)
 StreamBlockJob *s = container_of(job, StreamBlockJob, common.job);
 
 if (s->chain_frozen) {
-BlockJob *bjob = &s->common;
-bdrv_unfreeze_chain(blk_bs(bjob->blk), s->bottom);
+bdrv_unfreeze_chain(s->cor_filter_bs, s->bottom);
 }
 }
 
 static int stream_prepare(Job *job)
 {
 StreamBlockJob *s = container_of(job, StreamBlockJob, common.job);
-BlockJob *bjob = &s->common;
-BlockDriverState *bs = blk_bs(bjob->blk);
+BlockDriverState *bs = s->target_bs;
 BlockDriverState *base = backing_bs(s->bottom);
 Error *local_err = NULL;
 int ret = 0;
 
-bdrv_unfreeze_chain(bs, s->bottom);
+bdrv_unfreeze_chain(s->cor_filter_bs, s->bottom);
 s->chain_frozen = false;
 
 if (bs->backing) {
-const char *base_id = NULL, *base_fmt = NULL;
-if (base) {
-base_id = s->backing_file_str;
-if (base->drv) {
-base_fmt = base->drv->format_name;
-}
-}
 bdrv_set_backing_hd(bs, base, &local_err);
-ret = bdrv_change_backing_file(bs, base_id, base_fmt);
+ret = bdrv_change_backing_file(bs, s->backing_file_str,
+   s->base_fmt);
 if (local_err) {
 error_report_err(local_err);
 return -EPERM;
@@ -92,7 +88,9 @@ static void stream_clean(Job *job)
 {
 StreamBlockJob *s = container_of(job, StreamBlockJob, common.job);
 BlockJob *bjob = &s->common;
-BlockDriverState *bs = blk_bs(bjob->blk);
+BlockDriverState *bs = s->target_bs;
+
+bdrv_cor_filter_drop(s->cor_filter_bs);
 
 /* Reopen the image back in read-only mode if necessary */
 if (s->bs_read_only) {
@@ -102,13 +100,14 @@ static void stream_clean(Job *job)
 }
 
 g_free(s->backing_file_str);
+g_free(s->base_fmt);
 }
 
 static int coroutine_fn stream_run(Job *job, Error **errp)
 {
 StreamBlockJob *s = container_of(job, StreamBlockJob, common.job);
 BlockBackend *blk = s->common.blk;
-BlockDriverState *bs = blk_bs(blk);
+BlockDriverState *bs = s->target_bs;
 bool enable_cor = !backing_bs(s->bottom);
 int64_t len;
 int64_t offset = 0;
@@ -156,8 +155,8 @@ static int coroutine_fn stream_run(Job *job, Error **errp)
 } else if (ret >= 0) {
 /* Copy if allocated in the intermediate images.  Limit to the
  * known-unallocated area [offset, offset+n*BDRV_SECTOR_SIZE).  */
-ret = bdrv_is_allocated_above(backing_bs(bs), s->bottom, true,
-  offset, n, &n);
+ret = bdrv_is_allocated_above(bdrv_filtered_cow_bs(bs), s->bottom,
+  true, offset, n, &n);
 /* Finish early if end of backing file has been reached */
 if (ret == 0 && n == 0) {
 n = len - offset;
@@ -225,7 +224,13 @@ void stream_start(const char *job_id, BlockDriverState *bs,
 BlockDriverState *iter;
 bool bs_read_only;
 int basic_flags = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED;
+BlockDriverState *cor_filter_bs = NULL;
 BlockDriverState *bottom = bdrv_find_overlay(bs, base);
+char *base_fmt = NULL;
+
+if (base && base->drv) {
+base_fmt = g_strdup(base->drv->format_name);
+}
 
 if (bdrv_freeze_chain(bs, bottom, errp) < 0) {
 return;
@@ -240,17 +245,35 @@ void stream_start(const char *job_id, BlockDriverState 
*bs,
 }
 }
 
-/* Prevent concurrent jobs trying to modify the graph structure here, we
- * already have our own plans. Also don't allow resize as the image size is
- * queried only at the job start and then cached. */
-s = block_job_create(job_id, &stream_job_driver, NULL, bs,
- basic_flags | BLK_PERM_GRAPH_MOD,
- basic_flags | BLK_PERM_WRITE,
+cor_filter_bs = bdrv_cor_filter_append(bs,

[PATCH v3 01/15] block: Mark commit and mirror as filter drivers

2020-05-12 Thread Andrey Shinkevich

From: Max Reitz 

The commit and mirror block nodes are filters, so they should be marked
as such.  (Strictly speaking, BDS.is_filter's documentation states that
a filter's child must be bs->file.  The following patch will relax this
restriction, however.)

Signed-off-by: Max Reitz 
Reviewed-by: Alberto Garcia 
Reviewed-by: Eric Blake 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
Signed-off-by: Andrey Shinkevich 
---
 block/commit.c | 2 ++
 block/mirror.c | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/block/commit.c b/block/commit.c
index 87f6096..445a280 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -240,6 +240,8 @@ static BlockDriver bdrv_commit_top = {
 .bdrv_co_block_status   = bdrv_co_block_status_from_backing,
 .bdrv_refresh_filename  = bdrv_commit_top_refresh_filename,
 .bdrv_child_perm= bdrv_commit_top_child_perm,
+
+.is_filter  = true,
 };
 
 void commit_start(const char *job_id, BlockDriverState *bs,
diff --git a/block/mirror.c b/block/mirror.c
index aca95c9..b6de24b 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -1527,6 +1527,8 @@ static BlockDriver bdrv_mirror_top = {
 .bdrv_co_block_status   = bdrv_co_block_status_from_backing,
 .bdrv_refresh_filename  = bdrv_mirror_top_refresh_filename,
 .bdrv_child_perm= bdrv_mirror_top_child_perm,
+
+.is_filter  = true,
 };
 
 static BlockJob *mirror_start_job(
-- 
1.8.3.1

[PATCH v3 13/15] qapi: add filter-node-name to block-stream

2020-05-12 Thread Andrey Shinkevich

Provide the possibility to pass the 'filter-node-name' parameter to the
block-stream job as it is done for the commit block job. That will be
needed for further iotests implementations.

Signed-off-by: Andrey Shinkevich 
---
 block/monitor/block-hmp-cmds.c | 4 ++--
 block/stream.c | 4 +++-
 blockdev.c | 8 +++-
 include/block/block_int.h  | 7 ++-
 qapi/block-core.json   | 6 ++
 5 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c
index 4c8c375..6f6636b 100644
--- a/block/monitor/block-hmp-cmds.c
+++ b/block/monitor/block-hmp-cmds.c
@@ -507,8 +507,8 @@ void hmp_block_stream(Monitor *mon, const QDict *qdict)
 
 qmp_block_stream(true, device, device, base != NULL, base, false, NULL,
  false, NULL, qdict_haskey(qdict, "speed"), speed, true,
- BLOCKDEV_ON_ERROR_REPORT, false, false, false, false,
- &error);
+ BLOCKDEV_ON_ERROR_REPORT, false, NULL, false, false, 
false,
+ false, &error);
 
 hmp_handle_error(mon, error);
 }
diff --git a/block/stream.c b/block/stream.c
index 3cca1f9..e0b35f8 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -217,7 +217,9 @@ static const BlockJobDriver stream_job_driver = {
 void stream_start(const char *job_id, BlockDriverState *bs,
   BlockDriverState *base, const char *backing_file_str,
   int creation_flags, int64_t speed,
-  BlockdevOnError on_error, Error **errp)
+  BlockdevOnError on_error,
+  const char *filter_node_name,
+  Error **errp)
 {
 StreamBlockJob *s;
 BlockDriverState *iter;
diff --git a/blockdev.c b/blockdev.c
index 97c2ba2..67918fc 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -2753,6 +2753,7 @@ void qmp_block_stream(bool has_job_id, const char 
*job_id, const char *device,
   bool has_backing_file, const char *backing_file,
   bool has_speed, int64_t speed,
   bool has_on_error, BlockdevOnError on_error,
+  bool has_filter_node_name, const char *filter_node_name,
   bool has_auto_finalize, bool auto_finalize,
   bool has_auto_dismiss, bool auto_dismiss,
   Error **errp)
@@ -2769,6 +2770,10 @@ void qmp_block_stream(bool has_job_id, const char 
*job_id, const char *device,
 on_error = BLOCKDEV_ON_ERROR_REPORT;
 }
 
+if (!has_filter_node_name) {
+filter_node_name = NULL;
+}
+
 bs = bdrv_lookup_bs(device, device, errp);
 if (!bs) {
 return;
@@ -2840,7 +2845,8 @@ void qmp_block_stream(bool has_job_id, const char 
*job_id, const char *device,
 }
 
 stream_start(has_job_id ? job_id : NULL, bs, base_bs, base_name,
- job_flags, has_speed ? speed : 0, on_error, &local_err);
+ job_flags, has_speed ? speed : 0, on_error,
+ filter_node_name, &local_err);
 if (local_err) {
 error_propagate(errp, local_err);
 goto out;
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 86f7666..0fdfa9c 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -1094,6 +1094,9 @@ int is_windows_drive(const char *filename);
  *  See @BlockJobCreateFlags
  * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
  * @on_error: The action to take upon error.
+ * @filter_node_name: The node name that should be assigned to the filter
+ * driver that the commit job inserts into the graph above @bs. NULL means
+ * that a node name should be autogenerated.
  * @errp: Error object.
  *
  * Start a streaming operation on @bs.  Clusters that are unallocated
@@ -1106,7 +1109,9 @@ int is_windows_drive(const char *filename);
 void stream_start(const char *job_id, BlockDriverState *bs,
   BlockDriverState *base, const char *backing_file_str,
   int creation_flags, int64_t speed,
-  BlockdevOnError on_error, Error **errp);
+  BlockdevOnError on_error,
+  const char *filter_node_name,
+  Error **errp);
 
 /**
  * commit_start:
diff --git a/qapi/block-core.json b/qapi/block-core.json
index 943df19..df6981e 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -2509,6 +2509,11 @@
 #'stop' and 'enospc' can only be used if the block device
 #supports io-status (see BlockInfo).  Since 1.3.
 #
+# @filter-node-name: the node name that should be assigned to the
+#filter driver that the stream job inserts into the graph
+#above @device. If this option is not given, a node name is
+#autogenerated. (Since: 5.1)
+#
 # @auto-finalize: When false, this job will wait in a PENDIN

Re: [PATCH v2 1/4] iotests/109: Don't mirror with mismatched size

2020-05-12 Thread Vladimir Sementsov-Ogievskiy


11.05.2020 16:58, Kevin Wolf wrote:

This patch makes the raw image the same size as the file in a different
format that is mirrored as raw to it to avoid errors when mirror starts
to enforce that source and target are the same size.

We check only that the first 512 bytes are zeroed (instead of 64k)
because some image formats create image files that are smaller than 64k,
so trying to read 64k would result in I/O errors. Apart from this, 512
is more appropriate anyway because the raw format driver protects
specifically the first 512 bytes.

Signed-off-by: Kevin Wolf


Reviewed-by: Vladimir Sementsov-Ogievskiy 

--
Best regards,
Vladimir

Re: [PATCH] hw: Use QEMU_IS_ALIGNED() on parallel flash block size

2020-05-12 Thread Alistair Francis

On Mon, May 11, 2020 at 1:54 PM Philippe Mathieu-Daudé
 wrote:
>
> Use the QEMU_IS_ALIGNED() macro to verify the flash block size
> is properly aligned. It is quicker to process when reviewing.
>
> Signed-off-by: Philippe Mathieu-Daudé 

Reviewed-by: Alistair Francis 

Alistair

> ---
>  hw/arm/sbsa-ref.c   | 2 +-
>  hw/arm/virt.c   | 2 +-
>  hw/block/pflash_cfi01.c | 2 +-
>  hw/block/pflash_cfi02.c | 2 +-
>  hw/i386/pc_sysfw.c  | 2 +-
>  hw/riscv/virt.c | 2 +-
>  6 files changed, 6 insertions(+), 6 deletions(-)
>
> diff --git a/hw/arm/sbsa-ref.c b/hw/arm/sbsa-ref.c
> index 8409ba853d..b379e4a76a 100644
> --- a/hw/arm/sbsa-ref.c
> +++ b/hw/arm/sbsa-ref.c
> @@ -241,7 +241,7 @@ static void sbsa_flash_map1(PFlashCFI01 *flash,
>  {
>  DeviceState *dev = DEVICE(flash);
>
> -assert(size % SBSA_FLASH_SECTOR_SIZE == 0);
> +assert(QEMU_IS_ALIGNED(size, SBSA_FLASH_SECTOR_SIZE));
>  assert(size / SBSA_FLASH_SECTOR_SIZE <= UINT32_MAX);
>  qdev_prop_set_uint32(dev, "num-blocks", size / SBSA_FLASH_SECTOR_SIZE);
>  qdev_init_nofail(dev);
> diff --git a/hw/arm/virt.c b/hw/arm/virt.c
> index 634db0cfe9..0a99fddb3d 100644
> --- a/hw/arm/virt.c
> +++ b/hw/arm/virt.c
> @@ -978,7 +978,7 @@ static void virt_flash_map1(PFlashCFI01 *flash,
>  {
>  DeviceState *dev = DEVICE(flash);
>
> -assert(size % VIRT_FLASH_SECTOR_SIZE == 0);
> +assert(QEMU_IS_ALIGNED(size, VIRT_FLASH_SECTOR_SIZE));
>  assert(size / VIRT_FLASH_SECTOR_SIZE <= UINT32_MAX);
>  qdev_prop_set_uint32(dev, "num-blocks", size / VIRT_FLASH_SECTOR_SIZE);
>  qdev_init_nofail(dev);
> diff --git a/hw/block/pflash_cfi01.c b/hw/block/pflash_cfi01.c
> index f586bac269..11922c0f96 100644
> --- a/hw/block/pflash_cfi01.c
> +++ b/hw/block/pflash_cfi01.c
> @@ -964,7 +964,7 @@ PFlashCFI01 *pflash_cfi01_register(hwaddr base,
>  if (blk) {
>  qdev_prop_set_drive(dev, "drive", blk, &error_abort);
>  }
> -assert(size % sector_len == 0);
> +assert(QEMU_IS_ALIGNED(size, sector_len));
>  qdev_prop_set_uint32(dev, "num-blocks", size / sector_len);
>  qdev_prop_set_uint64(dev, "sector-length", sector_len);
>  qdev_prop_set_uint8(dev, "width", bank_width);
> diff --git a/hw/block/pflash_cfi02.c b/hw/block/pflash_cfi02.c
> index c6b6f2d082..895f7daee3 100644
> --- a/hw/block/pflash_cfi02.c
> +++ b/hw/block/pflash_cfi02.c
> @@ -1003,7 +1003,7 @@ PFlashCFI02 *pflash_cfi02_register(hwaddr base,
>  if (blk) {
>  qdev_prop_set_drive(dev, "drive", blk, &error_abort);
>  }
> -assert(size % sector_len == 0);
> +assert(QEMU_IS_ALIGNED(size, sector_len));
>  qdev_prop_set_uint32(dev, "num-blocks", size / sector_len);
>  qdev_prop_set_uint32(dev, "sector-length", sector_len);
>  qdev_prop_set_uint8(dev, "width", width);
> diff --git a/hw/i386/pc_sysfw.c b/hw/i386/pc_sysfw.c
> index f5f3f466b0..fad41f0e73 100644
> --- a/hw/i386/pc_sysfw.c
> +++ b/hw/i386/pc_sysfw.c
> @@ -168,7 +168,7 @@ static void pc_system_flash_map(PCMachineState *pcms,
>   blk_name(blk), strerror(-size));
>  exit(1);
>  }
> -if (size == 0 || size % FLASH_SECTOR_SIZE != 0) {
> +if (size == 0 || !QEMU_IS_ALIGNED(size, FLASH_SECTOR_SIZE)) {
>  error_report("system firmware block device %s has invalid size "
>   "%" PRId64,
>   blk_name(blk), size);
> diff --git a/hw/riscv/virt.c b/hw/riscv/virt.c
> index daae3ebdbb..71481d59c2 100644
> --- a/hw/riscv/virt.c
> +++ b/hw/riscv/virt.c
> @@ -112,7 +112,7 @@ static void virt_flash_map1(PFlashCFI01 *flash,
>  {
>  DeviceState *dev = DEVICE(flash);
>
> -assert(size % VIRT_FLASH_SECTOR_SIZE == 0);
> +assert(QEMU_IS_ALIGNED(size, VIRT_FLASH_SECTOR_SIZE));
>  assert(size / VIRT_FLASH_SECTOR_SIZE <= UINT32_MAX);
>  qdev_prop_set_uint32(dev, "num-blocks", size / VIRT_FLASH_SECTOR_SIZE);
>  qdev_init_nofail(dev);
> --
> 2.21.3
>
>

Re: [PATCH v5 13/15] acpi: drop build_piix4_pm()

2020-05-12 Thread Igor Mammedov

On Tue, 12 May 2020 13:16:05 +0200
Gerd Hoffmann  wrote:

> On Mon, May 11, 2020 at 09:37:32PM +0200, Igor Mammedov wrote:
> > On Thu,  7 May 2020 15:16:38 +0200
> > Gerd Hoffmann  wrote:
> >   
> > > The _SB.PCI0.PX13.P13C opregion (holds isa device enable bits)
> > > is not used any more, remove it from DSDT.
> > > 
> > > Signed-off-by: Gerd Hoffmann 
> > > ---
> > >  hw/i386/acpi-build.c | 16 
> > >  1 file changed, 16 deletions(-)
> > > 
> > > diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
> > > index 765409a90eb6..c1e63cce5e8e 100644
> > > --- a/hw/i386/acpi-build.c
> > > +++ b/hw/i386/acpi-build.c
> > > @@ -1441,21 +1441,6 @@ static void build_q35_isa_bridge(Aml *table)
> > >  aml_append(table, scope);
> > >  }
> > >  
> > > -static void build_piix4_pm(Aml *table)
> > > -{
> > > -Aml *dev;
> > > -Aml *scope;
> > > -
> > > -scope =  aml_scope("_SB.PCI0");
> > > -dev = aml_device("PX13");
> > > -aml_append(dev, aml_name_decl("_ADR", aml_int(0x00010003)));  
> > I agree about removing P13C but I'm not sure if it's safe to remove
> > whole isa bridge  
> 
> The isa bridge is _SB0.PCI0.ISA (piix4 function 0).
> _SB.PCI0.PX13 is the acpi pm device (piix4 function 3).
> 
> So this does _not_ remove the isa bridge.  And given that PX13 has
> nothing declared beside the opregion we are removing and I have not
> found any other references to the PX13 device I assumed it is ok to drop
> that one too.  Didn't notice any odd side effects in testing.
I don't see HID or some other way to tell guest that it should load a driver
for it so probably we can remove it.
If it breaks some legacy OS we can always add it back.


> 
> take care,
>   Gerd
>

Re: [RFC PATCH 1/3] block: Factor out bdrv_run_co()

2020-05-12 Thread Eric Blake


On 5/12/20 9:43 AM, Kevin Wolf wrote:

We have a few bdrv_*() functions that can either spawn a new coroutine
and wait for it with BDRV_POLL_WHILE() or use a fastpath if they are
alreeady running in a coroutine. All of them duplicate basically the


already


same code.

Factor the common code into a new function bdrv_run_co().

Signed-off-by: Kevin Wolf 
---
  block/io.c | 104 +++--
  1 file changed, 28 insertions(+), 76 deletions(-)

diff --git a/block/io.c b/block/io.c
index 7d30e61edc..c1badaadc9 100644
--- a/block/io.c
+++ b/block/io.c
@@ -891,6 +891,22 @@ static int bdrv_check_byte_request(BlockDriverState *bs, 
int64_t offset,
  return 0;
  }
  
+static int bdrv_run_co(BlockDriverState *bs, CoroutineEntry *entry,

+   void *opaque, int *ret)
+{
+if (qemu_in_coroutine()) {
+/* Fast-path if already in coroutine context */
+entry(opaque);
+} else {
+Coroutine *co = qemu_coroutine_create(entry, opaque);
+*ret = NOT_DONE;
+bdrv_coroutine_enter(bs, co);
+BDRV_POLL_WHILE(bs, *ret == NOT_DONE);


For my reference, NOT_DONE is defined as INT_MAX, which does not seem to 
be used as a return value in other situations.



@@ -923,25 +939,15 @@ static int bdrv_prwv_co(BdrvChild *child, int64_t offset,
  QEMUIOVector *qiov, bool is_write,
  BdrvRequestFlags flags)
  {
-Coroutine *co;
  RwCo rwco = {
  .child = child,
  .offset = offset,
  .qiov = qiov,
  .is_write = is_write,
-.ret = NOT_DONE,
  .flags = flags,
  };
  
-if (qemu_in_coroutine()) {

-/* Fast-path if already in coroutine context */
-bdrv_rw_co_entry(&rwco);
-} else {
-co = qemu_coroutine_create(bdrv_rw_co_entry, &rwco);
-bdrv_coroutine_enter(child->bs, co);
-BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE);
-}
-return rwco.ret;
+return bdrv_run_co(child->bs, bdrv_rw_co_entry, &rwco, &rwco.ret);


So code that previously looped on NOT_DONE is obviously safe, while...


  }
  
  int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,

@@ -2230,7 +2236,6 @@ typedef struct BdrvCoBlockStatusData {
  int64_t *map;
  BlockDriverState **file;
  int ret;
-bool done;
  } BdrvCoBlockStatusData;
  
  int coroutine_fn bdrv_co_block_status_from_file(BlockDriverState *bs,

@@ -2492,7 +2497,6 @@ static void coroutine_fn 
bdrv_block_status_above_co_entry(void *opaque)
 data->want_zero,
 data->offset, data->bytes,
 data->pnum, data->map, data->file);
-data->done = true;
  aio_wait_kick();


...code that looped on something else now has to be checked that 
data->ret is still being set to something useful.  Fortunately that is 
true here.



@@ -2669,22 +2663,13 @@ static inline int
  bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
  bool is_read)
  {
-if (qemu_in_coroutine()) {
-return bdrv_co_rw_vmstate(bs, qiov, pos, is_read);
-} else {
-BdrvVmstateCo data = {
-.bs = bs,
-.qiov   = qiov,
-.pos= pos,
-.is_read= is_read,
-.ret= -EINPROGRESS,
-};
-Coroutine *co = qemu_coroutine_create(bdrv_co_rw_vmstate_entry, &data);
-
-bdrv_coroutine_enter(bs, co);
-BDRV_POLL_WHILE(bs, data.ret == -EINPROGRESS);
-return data.ret;


It's a little harder to see whether -EINPROGRESS might ever be returned 
by a driver, but again this looks safe.


Here, it's a little less obvious whether any driver might return 
-EINPROGRESS, but it looks like if they did that,


Reviewed-by: Eric Blake 

Conflicts with Vladimir's patches which try to add more coroutine 
wrappers (but those need a rebase anyway):

https://lists.gnu.org/archive/html/qemu-devel/2020-04/msg04559.html

--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org

Re: [PATCH v2 2/4] iotests/229: Use blkdebug to inject an error

2020-05-12 Thread Vladimir Sementsov-Ogievskiy


11.05.2020 16:58, Kevin Wolf wrote:

229 relies on the mirror running into an I/O error when the target is
smaller than the source. After changing mirror to catch this condition
while starting the job, this test case won't get a job that is paused
for an I/O error any more. Use blkdebug instead to inject an error.

Signed-off-by: Kevin Wolf
Message-Id:<20200507145228.323412-2-kw...@redhat.com>
Reviewed-by: Eric Blake
Signed-off-by: Kevin Wolf



Reviewed-by: Vladimir Sementsov-Ogievskiy 

--
Best regards,
Vladimir

Re: [RFC PATCH 2/3] block: Allow bdrv_run_co() from different AioContext

2020-05-12 Thread Thomas Lamprecht

On 5/12/20 4:43 PM, Kevin Wolf wrote:
> Coroutine functions that are entered through bdrv_run_co() are already
> safe to call from synchronous code in a different AioContext because
> bdrv_coroutine_enter() will schedule them in the context of the node.
> 
> However, the coroutine fastpath still requires that we're already in the
> right AioContext when called in coroutine context.
> 
> In order to make the behaviour more consistent and to make life a bit
> easier for callers, let's check the AioContext and automatically move
> the current coroutine around if we're not in the right context yet.
> 
> Signed-off-by: Kevin Wolf 
> ---
>  block/io.c | 15 ++-
>  1 file changed, 14 insertions(+), 1 deletion(-)
> 
> diff --git a/block/io.c b/block/io.c
> index c1badaadc9..7808e8bdc0 100644
> --- a/block/io.c
> +++ b/block/io.c
> @@ -895,8 +895,21 @@ static int bdrv_run_co(BlockDriverState *bs, 
> CoroutineEntry *entry,
> void *opaque, int *ret)
>  {
>  if (qemu_in_coroutine()) {
> -/* Fast-path if already in coroutine context */
> +Coroutine *self = qemu_coroutine_self();
> +AioContext *bs_ctx = bdrv_get_aio_context(bs);
> +AioContext *co_ctx = qemu_coroutine_get_aio_context(self);
> +
> +if (bs_ctx != co_ctx) {
> +/* Move to the iothread of the node */
> +aio_co_schedule(bs_ctx, self);
> +qemu_coroutine_yield();
> +}
>  entry(opaque);
> +if (bs_ctx != co_ctx) {
> +/* Move back to the original AioContext */
> +aio_co_schedule(bs_ctx, self);

shouldn't it use co_ctx here, as else it's just scheduled again on the one from 
bs?

Looks OK for me besides that.

> +qemu_coroutine_yield();
> +}
>  } else {
>  Coroutine *co = qemu_coroutine_create(entry, opaque);
>  *ret = NOT_DONE;
>

[PATCH v4 12/15] copy-on-read: add filter append/drop functions

2020-05-12 Thread Andrey Shinkevich

Provide API for the COR-filter insertion/removal.
Also, drop the filter child permissions for an inactive state when the
filter node is being removed.

Signed-off-by: Andrey Shinkevich 
---
 block/copy-on-read.c | 102 +++
 1 file changed, 102 insertions(+)

diff --git a/block/copy-on-read.c b/block/copy-on-read.c
index 4030d56..229a97c 100644
--- a/block/copy-on-read.c
+++ b/block/copy-on-read.c
@@ -24,11 +24,20 @@
 #include "qemu/cutils.h"
 #include "block/block_int.h"
 #include "qemu/module.h"
+#include "qapi/error.h"
+#include "qapi/qmp/qdict.h"
+#include "block/copy-on-read.h"
 
 
+typedef struct BDRVStateCOR {
+bool active;
+} BDRVStateCOR;
+
 static int cor_open(BlockDriverState *bs, QDict *options, int flags,
 Error **errp)
 {
+BDRVStateCOR *state = bs->opaque;
+
 bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file, false,
errp);
 if (!bs->file) {
@@ -42,6 +51,8 @@ static int cor_open(BlockDriverState *bs, QDict *options, int 
flags,
 ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) &
 bs->file->bs->supported_zero_flags);
 
+state->active = true;
+
 return 0;
 }
 
@@ -57,6 +68,17 @@ static void cor_child_perm(BlockDriverState *bs, BdrvChild 
*c,
uint64_t perm, uint64_t shared,
uint64_t *nperm, uint64_t *nshared)
 {
+BDRVStateCOR *s = bs->opaque;
+
+if (!s->active) {
+/*
+ * While the filter is being removed
+ */
+*nperm = 0;
+*nshared = BLK_PERM_ALL;
+return;
+}
+
 *nperm = perm & PERM_PASSTHROUGH;
 *nshared = (shared & PERM_PASSTHROUGH) | PERM_UNCHANGED;
 
@@ -150,6 +172,7 @@ static int cor_change_backing_file(BlockDriverState *bs,
 
 static BlockDriver bdrv_copy_on_read = {
 .format_name= "copy-on-read",
+.instance_size  = sizeof(BDRVStateCOR),
 
 .bdrv_open  = cor_open,
 .bdrv_child_perm= cor_child_perm,
@@ -178,4 +201,83 @@ static void bdrv_copy_on_read_init(void)
 bdrv_register(&bdrv_copy_on_read);
 }
 
+
+static BlockDriverState *create_filter_node(BlockDriverState *bs,
+const char *filter_node_name,
+Error **errp)
+{
+QDict *opts = qdict_new();
+
+qdict_put_str(opts, "driver", "copy-on-read");
+qdict_put_str(opts, "file", bdrv_get_node_name(bs));
+if (filter_node_name) {
+qdict_put_str(opts, "node-name", filter_node_name);
+}
+
+return bdrv_open(NULL, NULL, opts, BDRV_O_RDWR, errp);
+}
+
+
+BlockDriverState *bdrv_cor_filter_append(BlockDriverState *bs,
+ const char *filter_node_name,
+ Error **errp)
+{
+BlockDriverState *cor_filter_bs;
+BDRVStateCOR *state;
+Error *local_err = NULL;
+
+cor_filter_bs = create_filter_node(bs, filter_node_name, errp);
+if (cor_filter_bs == NULL) {
+error_prepend(errp, "Could not create filter node: ");
+return NULL;
+}
+
+if (!filter_node_name) {
+cor_filter_bs->implicit = true;
+}
+
+bdrv_drained_begin(bs);
+bdrv_replace_node(bs, cor_filter_bs, &local_err);
+bdrv_drained_end(bs);
+
+if (local_err) {
+bdrv_unref(cor_filter_bs);
+error_propagate(errp, local_err);
+return NULL;
+}
+
+state = cor_filter_bs->opaque;
+state->active = true;
+
+return cor_filter_bs;
+}
+
+
+void bdrv_cor_filter_drop(BlockDriverState *cor_filter_bs)
+{
+BdrvChild *child;
+BlockDriverState *bs;
+BDRVStateCOR *s = cor_filter_bs->opaque;
+
+child = bdrv_filtered_rw_child(cor_filter_bs);
+if (!child) {
+return;
+}
+bs = child->bs;
+
+/* Retain the BDS until we complete the graph change. */
+bdrv_ref(bs);
+/* Hold a guest back from writing while permissions are being reset. */
+bdrv_drained_begin(bs);
+/* Drop permissions before the graph change. */
+s->active = false;
+bdrv_child_refresh_perms(cor_filter_bs, child, &error_abort);
+bdrv_replace_node(cor_filter_bs, bs, &error_abort);
+
+bdrv_drained_end(bs);
+bdrv_unref(bs);
+bdrv_unref(cor_filter_bs);
+}
+
+
 block_init(bdrv_copy_on_read_init);
-- 
1.8.3.1

[PATCH v4 02/15] copy-on-read: Support compressed writes

2020-05-12 Thread Andrey Shinkevich

From: Max Reitz 

Signed-off-by: Max Reitz 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 block/copy-on-read.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/block/copy-on-read.c b/block/copy-on-read.c
index 242d3ff..c4fa468 100644
--- a/block/copy-on-read.c
+++ b/block/copy-on-read.c
@@ -106,6 +106,16 @@ static int coroutine_fn cor_co_pdiscard(BlockDriverState 
*bs,
 }
 
 
+static int coroutine_fn cor_co_pwritev_compressed(BlockDriverState *bs,
+  uint64_t offset,
+  uint64_t bytes,
+  QEMUIOVector *qiov)
+{
+return bdrv_co_pwritev(bs->file, offset, bytes, qiov,
+   BDRV_REQ_WRITE_COMPRESSED);
+}
+
+
 static void cor_eject(BlockDriverState *bs, bool eject_flag)
 {
 bdrv_eject(bs->file->bs, eject_flag);
@@ -130,6 +140,7 @@ static BlockDriver bdrv_copy_on_read = {
 .bdrv_co_pwritev= cor_co_pwritev,
 .bdrv_co_pwrite_zeroes  = cor_co_pwrite_zeroes,
 .bdrv_co_pdiscard   = cor_co_pdiscard,
+.bdrv_co_pwritev_compressed = cor_co_pwritev_compressed,
 
 .bdrv_eject = cor_eject,
 .bdrv_lock_medium   = cor_lock_medium,
-- 
1.8.3.1

[PATCH v4 11/15] copy-on-read: Support preadv/pwritev_part functions

2020-05-12 Thread Andrey Shinkevich

Add support for the recently introduced functions
bdrv_co_preadv_part()
and
bdrv_co_pwritev_part()
to the COR-filter driver.

Signed-off-by: Andrey Shinkevich 
---
 block/copy-on-read.c | 28 
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/block/copy-on-read.c b/block/copy-on-read.c
index 74e01ee..4030d56 100644
--- a/block/copy-on-read.c
+++ b/block/copy-on-read.c
@@ -74,21 +74,25 @@ static int64_t cor_getlength(BlockDriverState *bs)
 }
 
 
-static int coroutine_fn cor_co_preadv(BlockDriverState *bs,
-  uint64_t offset, uint64_t bytes,
-  QEMUIOVector *qiov, int flags)
+static int coroutine_fn cor_co_preadv_part(BlockDriverState *bs,
+   uint64_t offset, uint64_t bytes,
+   QEMUIOVector *qiov,
+   size_t qiov_offset,
+   int flags)
 {
-return bdrv_co_preadv(bs->file, offset, bytes, qiov,
-  flags | BDRV_REQ_COPY_ON_READ);
+return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset,
+   flags | BDRV_REQ_COPY_ON_READ);
 }
 
 
-static int coroutine_fn cor_co_pwritev(BlockDriverState *bs,
-   uint64_t offset, uint64_t bytes,
-   QEMUIOVector *qiov, int flags)
+static int coroutine_fn cor_co_pwritev_part(BlockDriverState *bs,
+uint64_t offset,
+uint64_t bytes,
+QEMUIOVector *qiov,
+size_t qiov_offset, int flags)
 {
-
-return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags);
+return bdrv_co_pwritev_part(bs->file, offset, bytes, qiov, qiov_offset,
+flags);
 }
 
 
@@ -152,8 +156,8 @@ static BlockDriver bdrv_copy_on_read = {
 
 .bdrv_getlength = cor_getlength,
 
-.bdrv_co_preadv = cor_co_preadv,
-.bdrv_co_pwritev= cor_co_pwritev,
+.bdrv_co_preadv_part= cor_co_preadv_part,
+.bdrv_co_pwritev_part   = cor_co_pwritev_part,
 .bdrv_co_pwrite_zeroes  = cor_co_pwrite_zeroes,
 .bdrv_co_pdiscard   = cor_co_pdiscard,
 .bdrv_co_pwritev_compressed = cor_co_pwritev_compressed,
-- 
1.8.3.1

[PATCH v4 13/15] qapi: add filter-node-name to block-stream

2020-05-12 Thread Andrey Shinkevich

Provide the possibility to pass the 'filter-node-name' parameter to the
block-stream job as it is done for the commit block job. That will be
needed for further iotests implementations.

Signed-off-by: Andrey Shinkevich 
---
 block/monitor/block-hmp-cmds.c | 4 ++--
 block/stream.c | 4 +++-
 blockdev.c | 8 +++-
 include/block/block_int.h  | 7 ++-
 qapi/block-core.json   | 6 ++
 5 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c
index 4c8c375..6f6636b 100644
--- a/block/monitor/block-hmp-cmds.c
+++ b/block/monitor/block-hmp-cmds.c
@@ -507,8 +507,8 @@ void hmp_block_stream(Monitor *mon, const QDict *qdict)
 
 qmp_block_stream(true, device, device, base != NULL, base, false, NULL,
  false, NULL, qdict_haskey(qdict, "speed"), speed, true,
- BLOCKDEV_ON_ERROR_REPORT, false, false, false, false,
- &error);
+ BLOCKDEV_ON_ERROR_REPORT, false, NULL, false, false, 
false,
+ false, &error);
 
 hmp_handle_error(mon, error);
 }
diff --git a/block/stream.c b/block/stream.c
index 3cca1f9..e0b35f8 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -217,7 +217,9 @@ static const BlockJobDriver stream_job_driver = {
 void stream_start(const char *job_id, BlockDriverState *bs,
   BlockDriverState *base, const char *backing_file_str,
   int creation_flags, int64_t speed,
-  BlockdevOnError on_error, Error **errp)
+  BlockdevOnError on_error,
+  const char *filter_node_name,
+  Error **errp)
 {
 StreamBlockJob *s;
 BlockDriverState *iter;
diff --git a/blockdev.c b/blockdev.c
index 97c2ba2..67918fc 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -2753,6 +2753,7 @@ void qmp_block_stream(bool has_job_id, const char 
*job_id, const char *device,
   bool has_backing_file, const char *backing_file,
   bool has_speed, int64_t speed,
   bool has_on_error, BlockdevOnError on_error,
+  bool has_filter_node_name, const char *filter_node_name,
   bool has_auto_finalize, bool auto_finalize,
   bool has_auto_dismiss, bool auto_dismiss,
   Error **errp)
@@ -2769,6 +2770,10 @@ void qmp_block_stream(bool has_job_id, const char 
*job_id, const char *device,
 on_error = BLOCKDEV_ON_ERROR_REPORT;
 }
 
+if (!has_filter_node_name) {
+filter_node_name = NULL;
+}
+
 bs = bdrv_lookup_bs(device, device, errp);
 if (!bs) {
 return;
@@ -2840,7 +2845,8 @@ void qmp_block_stream(bool has_job_id, const char 
*job_id, const char *device,
 }
 
 stream_start(has_job_id ? job_id : NULL, bs, base_bs, base_name,
- job_flags, has_speed ? speed : 0, on_error, &local_err);
+ job_flags, has_speed ? speed : 0, on_error,
+ filter_node_name, &local_err);
 if (local_err) {
 error_propagate(errp, local_err);
 goto out;
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 86f7666..0fdfa9c 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -1094,6 +1094,9 @@ int is_windows_drive(const char *filename);
  *  See @BlockJobCreateFlags
  * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
  * @on_error: The action to take upon error.
+ * @filter_node_name: The node name that should be assigned to the filter
+ * driver that the commit job inserts into the graph above @bs. NULL means
+ * that a node name should be autogenerated.
  * @errp: Error object.
  *
  * Start a streaming operation on @bs.  Clusters that are unallocated
@@ -1106,7 +1109,9 @@ int is_windows_drive(const char *filename);
 void stream_start(const char *job_id, BlockDriverState *bs,
   BlockDriverState *base, const char *backing_file_str,
   int creation_flags, int64_t speed,
-  BlockdevOnError on_error, Error **errp);
+  BlockdevOnError on_error,
+  const char *filter_node_name,
+  Error **errp);
 
 /**
  * commit_start:
diff --git a/qapi/block-core.json b/qapi/block-core.json
index 943df19..df6981e 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -2509,6 +2509,11 @@
 #'stop' and 'enospc' can only be used if the block device
 #supports io-status (see BlockInfo).  Since 1.3.
 #
+# @filter-node-name: the node name that should be assigned to the
+#filter driver that the stream job inserts into the graph
+#above @device. If this option is not given, a node name is
+#autogenerated. (Since: 5.1)
+#
 # @auto-finalize: When false, this job will wait in a PENDIN

[PATCH v4 10/15] copy-on-read: Support change filename functions

2020-05-12 Thread Andrey Shinkevich

The COR-filter driver should support a redirecting function to refresh
filenames. Otherwise, a file name of the filter will be copied
instead of the one of a data node. It is also true for the function
bdrv_change_backing_file().

Signed-off-by: Andrey Shinkevich 
---
 block/copy-on-read.c | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/block/copy-on-read.c b/block/copy-on-read.c
index c4fa468..74e01ee 100644
--- a/block/copy-on-read.c
+++ b/block/copy-on-read.c
@@ -21,6 +21,7 @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu/cutils.h"
 #include "block/block_int.h"
 #include "qemu/module.h"
 
@@ -128,6 +129,21 @@ static void cor_lock_medium(BlockDriverState *bs, bool 
locked)
 }
 
 
+static void cor_refresh_filename(BlockDriverState *bs)
+{
+pstrcpy(bs->exact_filename, sizeof(bs->exact_filename),
+bs->file->bs->filename);
+}
+
+
+static int cor_change_backing_file(BlockDriverState *bs,
+   const char *backing_file,
+   const char *backing_fmt)
+{
+return bdrv_change_backing_file(bs->file->bs, backing_file, backing_fmt);
+}
+
+
 static BlockDriver bdrv_copy_on_read = {
 .format_name= "copy-on-read",
 
@@ -146,6 +162,8 @@ static BlockDriver bdrv_copy_on_read = {
 .bdrv_lock_medium   = cor_lock_medium,
 
 .bdrv_co_block_status   = bdrv_co_block_status_from_file,
+.bdrv_refresh_filename  = cor_refresh_filename,
+.bdrv_change_backing_file   = cor_change_backing_file,
 
 .has_variable_length= true,
 .is_filter  = true,
-- 
1.8.3.1

[PATCH v4 00/15] Apply COR-filter to the block-stream permanently

2020-05-12 Thread Andrey Shinkevich

With this series, all the block-stream COR operations pass through
the COR-filter. The patches 01-08/15 are taken from the series
"block: Deal with filters" by Max Reitz, the full version of that
can be found in the branches:

  https://git.xanclic.moe/XanClic/qemu child-access-functions-v6
  https://github.com/XanClic/qemu child-access-functions-v6

  When running iotests, apply "char-socket: Fix race condition"
  to avoid sporadic segmentation faults.
v4:
  01: Initialization of the COR-filter BDRVStateCOR member.

v3:
  01: The COR filter insert/remove functions moved to block/copy-on-read.c
  to be a part of API.
  02: block/stream.c code refactoring.
  03: The separate call to block_job_add_bdrv() is used to block operations
  on the active node after the filter inserted and the job created.
  04: The iotests case 030::test_overlapping_4 was modified to unbound
  the block-stream job from the base node.
  05: The COR driver functions preadv/pwritev replaced with their analogous
  preadv/pwritev_part.

v2:
  01: No more skipping filters while checking for operation blockers.
  However, we exclude filters between the bottom node and base
  because we do not set the operation blockers for filters anymore.
  02: As stated above, we do not set the operation blockers for filters
  anymore. So, skip filters when we block operations for the target
  node.
  03: The comment added for the patch 4/7.
  04: The QAPI target version changed to 5.1.
  05: The 'filter-node-name' now excluded from using in the test #030.
  If we need it no more in a final version of the series, the patch
  5/7 may be removed.
  06: The COR-filter included into the frozen chain of a block-stream job.
  The 'above_base' node pointer is left because it is essential for
  finding the base node in case of filters above.


Andrey Shinkevich (7):
  block: prepare block-stream for using COR-filter
  copy-on-read: Support change filename functions
  copy-on-read: Support preadv/pwritev_part functions
  copy-on-read: add filter append/drop functions
  qapi: add filter-node-name to block-stream
  iotests: prepare 245 for using filter in block-stream
  block: apply COR-filter to block-stream jobs

Max Reitz (8):
  block: Mark commit and mirror as filter drivers
  copy-on-read: Support compressed writes
  block: Add child access functions
  block: Add chain helper functions
  block: Include filters when freezing backing chain
  block: Use CAFs in block status functions
  commit: Deal with filters when blocking intermediate nodes
  block: Use CAFs when working with backing chains

 block.c| 275 ++---
 block/commit.c |  85 ++---
 block/copy-on-read.c   | 159 ++--
 block/io.c |  19 +--
 block/mirror.c |   6 +-
 block/monitor/block-hmp-cmds.c |   4 +-
 block/stream.c |  89 +
 blockdev.c |  19 ++-
 include/block/block.h  |   6 +-
 include/block/block_int.h  |  67 +-
 qapi/block-core.json   |   6 +
 tests/qemu-iotests/030 |  17 +--
 tests/qemu-iotests/141.out |   2 +-
 tests/qemu-iotests/245 |  10 +-
 14 files changed, 625 insertions(+), 139 deletions(-)

-- 
1.8.3.1

[PATCH v4 14/15] iotests: prepare 245 for using filter in block-stream

2020-05-12 Thread Andrey Shinkevich

The preliminary patch modifies the test 245 to prepare the block-stream
job for using COR-filter. The filter breaks the backing chain being
connected to the underlying node by file child link.

Signed-off-by: Andrey Shinkevich 
---
 tests/qemu-iotests/245 | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/tests/qemu-iotests/245 b/tests/qemu-iotests/245
index 4f5f0bb..5bcddc4 100755
--- a/tests/qemu-iotests/245
+++ b/tests/qemu-iotests/245
@@ -896,15 +896,19 @@ class TestBlockdevReopen(iotests.QMPTestCase):
 # make hd1 read-only and block-stream requires it to be read-write
 # (Which error message appears depends on whether the stream job is
 # already done with copying at this point.)
-self.reopen(opts, {},
+opts = hd_opts(1)
+opts['backing'] = hd_opts(2)
+opts['backing']['backing'] = None
+self.reopen(opts, {'read-only': True},
 ["Can't set node 'hd1' to r/o with copy-on-read enabled",
  "Cannot make block node read-only, there is a writer on it"])
 
 # We can't remove hd2 while the stream job is ongoing
-opts['backing']['backing'] = None
-self.reopen(opts, {'backing.read-only': False}, "Cannot change 
'backing' link from 'hd1' to 'hd2'")
+opts['backing'] = None
+self.reopen(opts, {'read-only': False}, "Cannot change 'backing' link 
from 'hd1' to 'hd2'")
 
 # We can detach hd1 from hd0 because it doesn't affect the stream job
+opts = hd_opts(0)
 opts['backing'] = None
 self.reopen(opts)
 
-- 
1.8.3.1

[PATCH v4 06/15] block: Use CAFs in block status functions

2020-05-12 Thread Andrey Shinkevich

From: Max Reitz 

Use the child access functions in the block status inquiry functions as
appropriate.

Signed-off-by: Max Reitz 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 block/io.c | 19 ++-
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/block/io.c b/block/io.c
index 7d30e61..47d8096 100644
--- a/block/io.c
+++ b/block/io.c
@@ -2387,11 +2387,12 @@ static int coroutine_fn 
bdrv_co_block_status(BlockDriverState *bs,
 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
 ret |= BDRV_BLOCK_ALLOCATED;
 } else if (want_zero) {
+BlockDriverState *cow_bs = bdrv_filtered_cow_bs(bs);
+
 if (bdrv_unallocated_blocks_are_zero(bs)) {
 ret |= BDRV_BLOCK_ZERO;
-} else if (bs->backing) {
-BlockDriverState *bs2 = bs->backing->bs;
-int64_t size2 = bdrv_getlength(bs2);
+} else if (cow_bs) {
+int64_t size2 = bdrv_getlength(cow_bs);
 
 if (size2 >= 0 && offset >= size2) {
 ret |= BDRV_BLOCK_ZERO;
@@ -2457,7 +2458,7 @@ static int coroutine_fn 
bdrv_co_block_status_above(BlockDriverState *bs,
 bool first = true;
 
 assert(bs != base);
-for (p = bs; p != base; p = backing_bs(p)) {
+for (p = bs; p != base; p = bdrv_filtered_bs(p)) {
 ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map,
file);
 if (ret < 0) {
@@ -2543,7 +2544,7 @@ int bdrv_block_status_above(BlockDriverState *bs, 
BlockDriverState *base,
 int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes,
   int64_t *pnum, int64_t *map, BlockDriverState **file)
 {
-return bdrv_block_status_above(bs, backing_bs(bs),
+return bdrv_block_status_above(bs, bdrv_filtered_bs(bs),
offset, bytes, pnum, map, file);
 }
 
@@ -2553,9 +2554,9 @@ int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, 
int64_t offset,
 int ret;
 int64_t dummy;
 
-ret = bdrv_common_block_status_above(bs, backing_bs(bs), false, offset,
- bytes, pnum ? pnum : &dummy, NULL,
- NULL);
+ret = bdrv_common_block_status_above(bs, bdrv_filtered_bs(bs), false,
+ offset, bytes, pnum ? pnum : &dummy,
+ NULL, NULL);
 if (ret < 0) {
 return ret;
 }
@@ -2618,7 +2619,7 @@ int bdrv_is_allocated_above(BlockDriverState *top,
 break;
 }
 
-intermediate = backing_bs(intermediate);
+intermediate = bdrv_filtered_bs(intermediate);
 }
 
 *pnum = n;
-- 
1.8.3.1

[PATCH v4 15/15] block: apply COR-filter to block-stream jobs

2020-05-12 Thread Andrey Shinkevich

The patch completes the series with the COR-filter insertion to any
block-stream operation. It also makes changes to the iotests 030 and
141.out.

Signed-off-by: Andrey Shinkevich 
---
 block/stream.c | 83 --
 tests/qemu-iotests/030 |  8 +++--
 tests/qemu-iotests/141.out |  2 +-
 3 files changed, 64 insertions(+), 29 deletions(-)

diff --git a/block/stream.c b/block/stream.c
index e0b35f8..a74a07b 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -19,6 +19,7 @@
 #include "qapi/qmp/qerror.h"
 #include "qemu/ratelimit.h"
 #include "sysemu/block-backend.h"
+#include "block/copy-on-read.h"
 
 enum {
 /*
@@ -32,8 +33,11 @@ enum {
 typedef struct StreamBlockJob {
 BlockJob common;
 BlockDriverState *bottom;
+BlockDriverState *cor_filter_bs;
+BlockDriverState *target_bs;
 BlockdevOnError on_error;
 char *backing_file_str;
+char *base_fmt;
 bool bs_read_only;
 bool chain_frozen;
 } StreamBlockJob;
@@ -52,33 +56,25 @@ static void stream_abort(Job *job)
 StreamBlockJob *s = container_of(job, StreamBlockJob, common.job);
 
 if (s->chain_frozen) {
-BlockJob *bjob = &s->common;
-bdrv_unfreeze_chain(blk_bs(bjob->blk), s->bottom);
+bdrv_unfreeze_chain(s->cor_filter_bs, s->bottom);
 }
 }
 
 static int stream_prepare(Job *job)
 {
 StreamBlockJob *s = container_of(job, StreamBlockJob, common.job);
-BlockJob *bjob = &s->common;
-BlockDriverState *bs = blk_bs(bjob->blk);
+BlockDriverState *bs = s->target_bs;
 BlockDriverState *base = backing_bs(s->bottom);
 Error *local_err = NULL;
 int ret = 0;
 
-bdrv_unfreeze_chain(bs, s->bottom);
+bdrv_unfreeze_chain(s->cor_filter_bs, s->bottom);
 s->chain_frozen = false;
 
 if (bs->backing) {
-const char *base_id = NULL, *base_fmt = NULL;
-if (base) {
-base_id = s->backing_file_str;
-if (base->drv) {
-base_fmt = base->drv->format_name;
-}
-}
 bdrv_set_backing_hd(bs, base, &local_err);
-ret = bdrv_change_backing_file(bs, base_id, base_fmt);
+ret = bdrv_change_backing_file(bs, s->backing_file_str,
+   s->base_fmt);
 if (local_err) {
 error_report_err(local_err);
 return -EPERM;
@@ -92,7 +88,9 @@ static void stream_clean(Job *job)
 {
 StreamBlockJob *s = container_of(job, StreamBlockJob, common.job);
 BlockJob *bjob = &s->common;
-BlockDriverState *bs = blk_bs(bjob->blk);
+BlockDriverState *bs = s->target_bs;
+
+bdrv_cor_filter_drop(s->cor_filter_bs);
 
 /* Reopen the image back in read-only mode if necessary */
 if (s->bs_read_only) {
@@ -102,13 +100,14 @@ static void stream_clean(Job *job)
 }
 
 g_free(s->backing_file_str);
+g_free(s->base_fmt);
 }
 
 static int coroutine_fn stream_run(Job *job, Error **errp)
 {
 StreamBlockJob *s = container_of(job, StreamBlockJob, common.job);
 BlockBackend *blk = s->common.blk;
-BlockDriverState *bs = blk_bs(blk);
+BlockDriverState *bs = s->target_bs;
 bool enable_cor = !backing_bs(s->bottom);
 int64_t len;
 int64_t offset = 0;
@@ -156,8 +155,8 @@ static int coroutine_fn stream_run(Job *job, Error **errp)
 } else if (ret >= 0) {
 /* Copy if allocated in the intermediate images.  Limit to the
  * known-unallocated area [offset, offset+n*BDRV_SECTOR_SIZE).  */
-ret = bdrv_is_allocated_above(backing_bs(bs), s->bottom, true,
-  offset, n, &n);
+ret = bdrv_is_allocated_above(bdrv_filtered_cow_bs(bs), s->bottom,
+  true, offset, n, &n);
 /* Finish early if end of backing file has been reached */
 if (ret == 0 && n == 0) {
 n = len - offset;
@@ -225,7 +224,13 @@ void stream_start(const char *job_id, BlockDriverState *bs,
 BlockDriverState *iter;
 bool bs_read_only;
 int basic_flags = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED;
+BlockDriverState *cor_filter_bs = NULL;
 BlockDriverState *bottom = bdrv_find_overlay(bs, base);
+char *base_fmt = NULL;
+
+if (base && base->drv) {
+base_fmt = g_strdup(base->drv->format_name);
+}
 
 if (bdrv_freeze_chain(bs, bottom, errp) < 0) {
 return;
@@ -240,17 +245,35 @@ void stream_start(const char *job_id, BlockDriverState 
*bs,
 }
 }
 
-/* Prevent concurrent jobs trying to modify the graph structure here, we
- * already have our own plans. Also don't allow resize as the image size is
- * queried only at the job start and then cached. */
-s = block_job_create(job_id, &stream_job_driver, NULL, bs,
- basic_flags | BLK_PERM_GRAPH_MOD,
- basic_flags | BLK_PERM_WRITE,
+cor_filter_bs = bdrv_cor_filter_append(bs,

[PATCH v4 09/15] block: prepare block-stream for using COR-filter

2020-05-12 Thread Andrey Shinkevich

This patch is the first one in the series where the COR-filter node
will be hard-coded for using in the block-stream job. The block jobs
may be run in parallel. Exclude conflicts with filter nodes used for
a concurrent job while checking for the blocked operations. It incurs
changes in the iotests 030::test_overlapping_4 that now passes with no
conflict because the stream job does not have a real dependency on its
base and on a filter above it.

Signed-off-by: Andrey Shinkevich 
---
 blockdev.c | 11 +--
 tests/qemu-iotests/030 |  9 -
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index b3c840e..97c2ba2 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -2763,6 +2763,7 @@ void qmp_block_stream(bool has_job_id, const char 
*job_id, const char *device,
 Error *local_err = NULL;
 const char *base_name = NULL;
 int job_flags = JOB_DEFAULT;
+BlockDriverState *bottom_cow_node;
 
 if (!has_on_error) {
 on_error = BLOCKDEV_ON_ERROR_REPORT;
@@ -2807,8 +2808,14 @@ void qmp_block_stream(bool has_job_id, const char 
*job_id, const char *device,
 base_name = base_bs->filename;
 }
 
-/* Check for op blockers in the whole chain between bs and base */
-for (iter = bs; iter && iter != base_bs; iter = backing_bs(iter)) {
+bottom_cow_node = bdrv_find_overlay(bs, base_bs);
+if (!bottom_cow_node) {
+error_setg(errp, "bottom node is not found, nothing to stream");
+goto out;
+}
+/* Check for op blockers in the whole chain between bs and bottom */
+for (iter = bs; iter && iter != bdrv_filtered_bs(bottom_cow_node);
+ iter = bdrv_filtered_bs(iter)) {
 if (bdrv_op_is_blocked(iter, BLOCK_OP_TYPE_STREAM, errp)) {
 goto out;
 }
diff --git a/tests/qemu-iotests/030 b/tests/qemu-iotests/030
index 104e3ce..d7638cd 100755
--- a/tests/qemu-iotests/030
+++ b/tests/qemu-iotests/030
@@ -368,8 +368,7 @@ class TestParallelOps(iotests.QMPTestCase):
 self.wait_until_completed(drive='commit-drive0')
 
 # In this case the base node of the stream job is the same as the
-# top node of commit job. Since this results in the commit filter
-# node being part of the stream chain, this is not allowed.
+# top node of commit job.
 def test_overlapping_4(self):
 self.assert_no_active_block_jobs()
 
@@ -381,13 +380,13 @@ class TestParallelOps(iotests.QMPTestCase):
 
 # Stream from node2 into node4
 result = self.vm.qmp('block-stream', device='node4', 
base_node='node2', job_id='node4')
-self.assert_qmp(result, 'error/desc',
-"Cannot freeze 'backing' link to 'commit-filter'")
+self.assert_qmp(result, 'return', {})
 
 result = self.vm.qmp('block-job-set-speed', device='drive0', speed=0)
 self.assert_qmp(result, 'return', {})
 
-self.wait_until_completed()
+self.vm.run_job(job='drive0', auto_dismiss=True)
+self.vm.run_job(job='node4', auto_dismiss=True)
 self.assert_no_active_block_jobs()
 
 # In this case the base node of the stream job is the commit job's
-- 
1.8.3.1

[PATCH v4 08/15] block: Use CAFs when working with backing chains

2020-05-12 Thread Andrey Shinkevich

From: Max Reitz 

Use child access functions when iterating through backing chains so
filters do not break the chain.

Signed-off-by: Max Reitz 
---
 block.c | 40 
 1 file changed, 28 insertions(+), 12 deletions(-)

diff --git a/block.c b/block.c
index 459412e..69e9152 100644
--- a/block.c
+++ b/block.c
@@ -4593,7 +4593,8 @@ int bdrv_change_backing_file(BlockDriverState *bs,
 }
 
 /*
- * Finds the image layer in the chain that has 'bs' as its backing file.
+ * Finds the image layer in the chain that has 'bs' (or a filter on
+ * top of it) as its backing file.
  *
  * active is the current topmost image.
  *
@@ -4605,11 +4606,18 @@ int bdrv_change_backing_file(BlockDriverState *bs,
 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
 BlockDriverState *bs)
 {
-while (active && bs != backing_bs(active)) {
-active = backing_bs(active);
+bs = bdrv_skip_rw_filters(bs);
+active = bdrv_skip_rw_filters(active);
+
+while (active) {
+BlockDriverState *next = bdrv_backing_chain_next(active);
+if (bs == next) {
+return active;
+}
+active = next;
 }
 
-return active;
+return NULL;
 }
 
 /* Given a BDS, searches for the base layer. */
@@ -4761,9 +4769,7 @@ int bdrv_drop_intermediate(BlockDriverState *top, 
BlockDriverState *base,
  * other intermediate nodes have been dropped.
  * If 'top' is an implicit node (e.g. "commit_top") we should skip
  * it because no one inherits from it. We use explicit_top for that. */
-while (explicit_top && explicit_top->implicit) {
-explicit_top = backing_bs(explicit_top);
-}
+explicit_top = bdrv_skip_implicit_filters(explicit_top);
 update_inherits_from = bdrv_inherits_from_recursive(base, explicit_top);
 
 /* success - we can delete the intermediate states, and link top->base */
@@ -5205,7 +5211,7 @@ BlockDriverState *bdrv_lookup_bs(const char *device,
 bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base)
 {
 while (top && top != base) {
-top = backing_bs(top);
+top = bdrv_filtered_bs(top);
 }
 
 return top != NULL;
@@ -5466,7 +5472,17 @@ BlockDriverState 
*bdrv_find_backing_image(BlockDriverState *bs,
 
 is_protocol = path_has_protocol(backing_file);
 
-for (curr_bs = bs; curr_bs->backing; curr_bs = curr_bs->backing->bs) {
+/*
+ * Being largely a legacy function, skip any filters here
+ * (because filters do not have normal filenames, so they cannot
+ * match anyway; and allowing json:{} filenames is a bit out of
+ * scope).
+ */
+for (curr_bs = bdrv_skip_rw_filters(bs);
+ bdrv_filtered_cow_child(curr_bs) != NULL;
+ curr_bs = bdrv_backing_chain_next(curr_bs))
+{
+BlockDriverState *bs_below = bdrv_backing_chain_next(curr_bs);
 
 /* If either of the filename paths is actually a protocol, then
  * compare unmodified paths; otherwise make paths relative */
@@ -5474,7 +5490,7 @@ BlockDriverState 
*bdrv_find_backing_image(BlockDriverState *bs,
 char *backing_file_full_ret;
 
 if (strcmp(backing_file, curr_bs->backing_file) == 0) {
-retval = curr_bs->backing->bs;
+retval = bs_below;
 break;
 }
 /* Also check against the full backing filename for the image */
@@ -5484,7 +5500,7 @@ BlockDriverState 
*bdrv_find_backing_image(BlockDriverState *bs,
 bool equal = strcmp(backing_file, backing_file_full_ret) == 0;
 g_free(backing_file_full_ret);
 if (equal) {
-retval = curr_bs->backing->bs;
+retval = bs_below;
 break;
 }
 }
@@ -5510,7 +5526,7 @@ BlockDriverState 
*bdrv_find_backing_image(BlockDriverState *bs,
 g_free(filename_tmp);
 
 if (strcmp(backing_file_full, filename_full) == 0) {
-retval = curr_bs->backing->bs;
+retval = bs_below;
 break;
 }
 }
-- 
1.8.3.1

[PATCH v4 05/15] block: Include filters when freezing backing chain

2020-05-12 Thread Andrey Shinkevich

From: Max Reitz 

In order to make filters work in backing chains, the associated
functions must be able to deal with them and freeze all filter links, be
they COW or R/W filter links.

In the process, rename these functions to reflect that they now act on
generalized chains of filter nodes instead of backing chains alone.

While at it, add some comments that note which functions require their
caller to ensure that a given child link is not frozen, and how the
callers do so.

Signed-off-by: Max Reitz 
Signed-off-by: Andrey Shinkevich 
---
 block.c   | 81 ++-
 block/commit.c|  8 ++---
 block/mirror.c|  4 +--
 block/stream.c|  8 ++---
 include/block/block.h |  6 ++--
 5 files changed, 60 insertions(+), 47 deletions(-)

diff --git a/block.c b/block.c
index 5b4ebfe..459412e 100644
--- a/block.c
+++ b/block.c
@@ -2513,12 +2513,15 @@ static void bdrv_replace_child_noperm(BdrvChild *child,
  * If @new_bs is not NULL, bdrv_check_perm() must be called beforehand, as this
  * function uses bdrv_set_perm() to update the permissions according to the new
  * reference that @new_bs gets.
+ *
+ * Callers must ensure that child->frozen is false.
  */
 static void bdrv_replace_child(BdrvChild *child, BlockDriverState *new_bs)
 {
 BlockDriverState *old_bs = child->bs;
 uint64_t perm, shared_perm;
 
+/* Asserts that child->frozen == false */
 bdrv_replace_child_noperm(child, new_bs);
 
 /*
@@ -2676,6 +2679,7 @@ static void bdrv_detach_child(BdrvChild *child)
 g_free(child);
 }
 
+/* Callers must ensure that child->frozen is false. */
 void bdrv_root_unref_child(BdrvChild *child)
 {
 BlockDriverState *child_bs;
@@ -2685,10 +2689,6 @@ void bdrv_root_unref_child(BdrvChild *child)
 bdrv_unref(child_bs);
 }
 
-/**
- * Clear all inherits_from pointers from children and grandchildren of
- * @root that point to @root, where necessary.
- */
 static void bdrv_unset_inherits_from(BlockDriverState *root, BdrvChild *child)
 {
 BdrvChild *c;
@@ -2713,6 +2713,7 @@ static void bdrv_unset_inherits_from(BlockDriverState 
*root, BdrvChild *child)
 }
 }
 
+/* Callers must ensure that child->frozen is false. */
 void bdrv_unref_child(BlockDriverState *parent, BdrvChild *child)
 {
 if (child == NULL) {
@@ -2756,7 +2757,7 @@ void bdrv_set_backing_hd(BlockDriverState *bs, 
BlockDriverState *backing_hd,
 bool update_inherits_from = bdrv_chain_contains(bs, backing_hd) &&
 bdrv_inherits_from_recursive(backing_hd, bs);
 
-if (bdrv_is_backing_chain_frozen(bs, backing_bs(bs), errp)) {
+if (bdrv_is_chain_frozen(bs, child_bs(bs->backing), errp)) {
 return;
 }
 
@@ -2765,6 +2766,7 @@ void bdrv_set_backing_hd(BlockDriverState *bs, 
BlockDriverState *backing_hd,
 }
 
 if (bs->backing) {
+/* Cannot be frozen, we checked that above */
 bdrv_unref_child(bs, bs->backing);
 bs->backing = NULL;
 }
@@ -3912,8 +3914,7 @@ static int bdrv_reopen_parse_backing(BDRVReopenState 
*reopen_state,
 return -EPERM;
 }
 /* Check if the backing link that we want to replace is frozen */
-if (bdrv_is_backing_chain_frozen(overlay_bs, backing_bs(overlay_bs),
- errp)) {
+if (bdrv_is_chain_frozen(overlay_bs, backing_bs(overlay_bs), errp)) {
 return -EPERM;
 }
 reopen_state->replace_backing_bs = true;
@@ -4248,6 +4249,7 @@ static void bdrv_close(BlockDriverState *bs)
 
 if (bs->drv) {
 if (bs->drv->bdrv_close) {
+/* Must unfreeze all children, so bdrv_unref_child() works */
 bs->drv->bdrv_close(bs);
 }
 bs->drv = NULL;
@@ -4617,20 +4619,22 @@ BlockDriverState *bdrv_find_base(BlockDriverState *bs)
 }
 
 /*
- * Return true if at least one of the backing links between @bs and
- * @base is frozen. @errp is set if that's the case.
+ * Return true if at least one of the (COW and R/W) filter links
+ * between @bs and @base is frozen. @errp is set if that's the case.
  * @base must be reachable from @bs, or NULL.
  */
-bool bdrv_is_backing_chain_frozen(BlockDriverState *bs, BlockDriverState *base,
-  Error **errp)
+bool bdrv_is_chain_frozen(BlockDriverState *bs, BlockDriverState *base,
+  Error **errp)
 {
 BlockDriverState *i;
+BdrvChild *child;
+
+for (i = bs; i != base; i = child_bs(child)) {
+child = bdrv_filtered_child(i);
 
-for (i = bs; i != base; i = backing_bs(i)) {
-if (i->backing && i->backing->frozen) {
+if (child && child->frozen) {
 error_setg(errp, "Cannot change '%s' link from '%s' to '%s'",
-   i->backing->name, i->node_name,
-   backing_bs(i)->node_name);
+   child->name, i->node_name, child->bs->node_name);
 return true;
 }
 }
@@ -4639

[PATCH v4 07/15] commit: Deal with filters when blocking intermediate nodes

2020-05-12 Thread Andrey Shinkevich

From: Max Reitz 

This includes some permission limiting (for example, we only need to
take the RESIZE permission if the base is smaller than the top).

Signed-off-by: Max Reitz 
Signed-off-by: Andrey Shinkevich 
---
 block/commit.c | 75 ++
 1 file changed, 60 insertions(+), 15 deletions(-)

diff --git a/block/commit.c b/block/commit.c
index e1f45a4..4df145d 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -37,6 +37,7 @@ typedef struct CommitBlockJob {
 BlockBackend *top;
 BlockBackend *base;
 BlockDriverState *base_bs;
+BlockDriverState *above_base;
 BlockdevOnError on_error;
 bool base_read_only;
 bool chain_frozen;
@@ -153,7 +154,7 @@ static int coroutine_fn commit_run(Job *job, Error **errp)
 break;
 }
 /* Copy if allocated above the base */
-ret = bdrv_is_allocated_above(blk_bs(s->top), blk_bs(s->base), false,
+ret = bdrv_is_allocated_above(blk_bs(s->top), s->above_base, true,
   offset, COMMIT_BUFFER_SIZE, &n);
 copy = (ret == 1);
 trace_commit_one_iteration(s, offset, n, ret);
@@ -253,15 +254,35 @@ void commit_start(const char *job_id, BlockDriverState 
*bs,
 CommitBlockJob *s;
 BlockDriverState *iter;
 BlockDriverState *commit_top_bs = NULL;
+BlockDriverState *filtered_base;
 Error *local_err = NULL;
+int64_t base_size, top_size;
+uint64_t perms, iter_shared_perms;
 int ret;
 
 assert(top != bs);
-if (top == base) {
+if (bdrv_skip_rw_filters(top) == bdrv_skip_rw_filters(base)) {
 error_setg(errp, "Invalid files for merge: top and base are the same");
 return;
 }
 
+base_size = bdrv_getlength(base);
+if (base_size < 0) {
+error_setg_errno(errp, -base_size, "Could not inquire base image 
size");
+return;
+}
+
+top_size = bdrv_getlength(top);
+if (top_size < 0) {
+error_setg_errno(errp, -top_size, "Could not inquire top image size");
+return;
+}
+
+perms = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE;
+if (base_size < top_size) {
+perms |= BLK_PERM_RESIZE;
+}
+
 s = block_job_create(job_id, &commit_job_driver, NULL, bs, 0, BLK_PERM_ALL,
  speed, creation_flags, NULL, NULL, errp);
 if (!s) {
@@ -301,17 +322,43 @@ void commit_start(const char *job_id, BlockDriverState 
*bs,
 
 s->commit_top_bs = commit_top_bs;
 
-/* Block all nodes between top and base, because they will
- * disappear from the chain after this operation. */
-assert(bdrv_chain_contains(top, base));
-for (iter = top; iter != base; iter = backing_bs(iter)) {
-/* XXX BLK_PERM_WRITE needs to be allowed so we don't block ourselves
- * at s->base (if writes are blocked for a node, they are also blocked
- * for its backing file). The other options would be a second filter
- * driver above s->base. */
+/*
+ * Block all nodes between top and base, because they will
+ * disappear from the chain after this operation.
+ * Note that this assumes that the user is fine with removing all
+ * nodes (including R/W filters) between top and base.  Assuring
+ * this is the responsibility of the interface (i.e. whoever calls
+ * commit_start()).
+ */
+s->above_base = bdrv_find_overlay(top, base);
+assert(s->above_base);
+
+/*
+ * The topmost node with
+ * bdrv_skip_rw_filters(filtered_base) == bdrv_skip_rw_filters(base)
+ */
+filtered_base = bdrv_filtered_cow_bs(s->above_base);
+assert(bdrv_skip_rw_filters(filtered_base) == bdrv_skip_rw_filters(base));
+
+/*
+ * XXX BLK_PERM_WRITE needs to be allowed so we don't block ourselves
+ * at s->base (if writes are blocked for a node, they are also blocked
+ * for its backing file). The other options would be a second filter
+ * driver above s->base.
+ */
+iter_shared_perms = BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE;
+
+for (iter = top; iter != base; iter = bdrv_filtered_bs(iter)) {
+if (iter == filtered_base) {
+/*
+ * From here on, all nodes are filters on the base.  This
+ * allows us to share BLK_PERM_CONSISTENT_READ.
+ */
+iter_shared_perms |= BLK_PERM_CONSISTENT_READ;
+}
+
 ret = block_job_add_bdrv(&s->common, "intermediate node", iter, 0,
- BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE,
- errp);
+ iter_shared_perms, errp);
 if (ret < 0) {
 goto fail;
 }
@@ -328,9 +375,7 @@ void commit_start(const char *job_id, BlockDriverState *bs,
 }
 
 s->base = blk_new(s->common.job.aio_context,
-  BLK_PERM_CONSISTENT_READ
-  | BLK_PERM_WRITE
-  | BLK_PERM_RESIZE,
+

[PATCH v4 01/15] block: Mark commit and mirror as filter drivers

2020-05-12 Thread Andrey Shinkevich

From: Max Reitz 

The commit and mirror block nodes are filters, so they should be marked
as such.  (Strictly speaking, BDS.is_filter's documentation states that
a filter's child must be bs->file.  The following patch will relax this
restriction, however.)

Signed-off-by: Max Reitz 
Reviewed-by: Alberto Garcia 
Reviewed-by: Eric Blake 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
Signed-off-by: Andrey Shinkevich 
---
 block/commit.c | 2 ++
 block/mirror.c | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/block/commit.c b/block/commit.c
index 87f6096..445a280 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -240,6 +240,8 @@ static BlockDriver bdrv_commit_top = {
 .bdrv_co_block_status   = bdrv_co_block_status_from_backing,
 .bdrv_refresh_filename  = bdrv_commit_top_refresh_filename,
 .bdrv_child_perm= bdrv_commit_top_child_perm,
+
+.is_filter  = true,
 };
 
 void commit_start(const char *job_id, BlockDriverState *bs,
diff --git a/block/mirror.c b/block/mirror.c
index aca95c9..b6de24b 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -1527,6 +1527,8 @@ static BlockDriver bdrv_mirror_top = {
 .bdrv_co_block_status   = bdrv_co_block_status_from_backing,
 .bdrv_refresh_filename  = bdrv_mirror_top_refresh_filename,
 .bdrv_child_perm= bdrv_mirror_top_child_perm,
+
+.is_filter  = true,
 };
 
 static BlockJob *mirror_start_job(
-- 
1.8.3.1

[PATCH v4 04/15] block: Add chain helper functions

2020-05-12 Thread Andrey Shinkevich

From: Max Reitz 

Add some helper functions for skipping filters in a chain of block
nodes.

Signed-off-by: Max Reitz 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 block.c   | 55 +++
 include/block/block_int.h |  3 +++
 2 files changed, 58 insertions(+)

diff --git a/block.c b/block.c
index b2aae2e..5b4ebfe 100644
--- a/block.c
+++ b/block.c
@@ -6863,3 +6863,58 @@ BdrvChild *bdrv_primary_child(BlockDriverState *bs)
 {
 return bdrv_filtered_rw_child(bs) ?: bs->file;
 }
+
+static BlockDriverState *bdrv_skip_filters(BlockDriverState *bs,
+   bool stop_on_explicit_filter)
+{
+BdrvChild *filtered;
+
+if (!bs) {
+return NULL;
+}
+
+while (!(stop_on_explicit_filter && !bs->implicit)) {
+filtered = bdrv_filtered_rw_child(bs);
+if (!filtered) {
+break;
+}
+bs = filtered->bs;
+}
+/*
+ * Note that this treats nodes with bs->drv == NULL as not being
+ * R/W filters (bs->drv == NULL should be replaced by something
+ * else anyway).
+ * The advantage of this behavior is that this function will thus
+ * always return a non-NULL value (given a non-NULL @bs).
+ */
+
+return bs;
+}
+
+/*
+ * Return the first BDS that has not been added implicitly or that
+ * does not have an RW-filtered child down the chain starting from @bs
+ * (including @bs itself).
+ */
+BlockDriverState *bdrv_skip_implicit_filters(BlockDriverState *bs)
+{
+return bdrv_skip_filters(bs, true);
+}
+
+/*
+ * Return the first BDS that does not have an RW-filtered child down
+ * the chain starting from @bs (including @bs itself).
+ */
+BlockDriverState *bdrv_skip_rw_filters(BlockDriverState *bs)
+{
+return bdrv_skip_filters(bs, false);
+}
+
+/*
+ * For a backing chain, return the first non-filter backing image of
+ * the first non-filter image.
+ */
+BlockDriverState *bdrv_backing_chain_next(BlockDriverState *bs)
+{
+return 
bdrv_skip_rw_filters(bdrv_filtered_cow_bs(bdrv_skip_rw_filters(bs)));
+}
diff --git a/include/block/block_int.h b/include/block/block_int.h
index dca59e9..86f7666 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -1347,6 +1347,9 @@ BdrvChild *bdrv_filtered_child(BlockDriverState *bs);
 BdrvChild *bdrv_metadata_child(BlockDriverState *bs);
 BdrvChild *bdrv_storage_child(BlockDriverState *bs);
 BdrvChild *bdrv_primary_child(BlockDriverState *bs);
+BlockDriverState *bdrv_skip_implicit_filters(BlockDriverState *bs);
+BlockDriverState *bdrv_skip_rw_filters(BlockDriverState *bs);
+BlockDriverState *bdrv_backing_chain_next(BlockDriverState *bs);
 
 static inline BlockDriverState *child_bs(BdrvChild *child)
 {
-- 
1.8.3.1

[PATCH v4 03/15] block: Add child access functions

2020-05-12 Thread Andrey Shinkevich

From: Max Reitz 

There are BDS children that the general block layer code can access,
namely bs->file and bs->backing.  Since the introduction of filters and
external data files, their meaning is not quite clear.  bs->backing can
be a COW source, or it can be an R/W-filtered child; bs->file can be an
R/W-filtered child, it can be data and metadata storage, or it can be
just metadata storage.

This overloading really is not helpful.  This patch adds function that
retrieve the correct child for each exact purpose.  Later patches in
this series will make use of them.  Doing so will allow us to handle
filter nodes and external data files in a meaningful way.

Signed-off-by: Max Reitz 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
Signed-off-by: Andrey Shinkevich 
---
 block.c   | 99 +++
 include/block/block_int.h | 57 +--
 2 files changed, 153 insertions(+), 3 deletions(-)

diff --git a/block.c b/block.c
index 0653ccb..b2aae2e 100644
--- a/block.c
+++ b/block.c
@@ -6764,3 +6764,102 @@ void bdrv_del_child(BlockDriverState *parent_bs, 
BdrvChild *child, Error **errp)
 
 parent_bs->drv->bdrv_del_child(parent_bs, child, errp);
 }
+
+/*
+ * Return the child that @bs acts as an overlay for, and from which data may be
+ * copied in COW or COR operations.  Usually this is the backing file.
+ */
+BdrvChild *bdrv_filtered_cow_child(BlockDriverState *bs)
+{
+if (!bs || !bs->drv) {
+return NULL;
+}
+
+if (bs->drv->is_filter) {
+return NULL;
+}
+
+return bs->backing;
+}
+
+/*
+ * If @bs acts as a pass-through filter for one of its children,
+ * return that child.  "Pass-through" means that write operations to
+ * @bs are forwarded to that child instead of triggering COW.
+ */
+BdrvChild *bdrv_filtered_rw_child(BlockDriverState *bs)
+{
+if (!bs || !bs->drv) {
+return NULL;
+}
+
+if (!bs->drv->is_filter) {
+return NULL;
+}
+
+/* Only one of @backing or @file may be used */
+assert(!(bs->backing && bs->file));
+
+return bs->backing ?: bs->file;
+}
+
+/*
+ * Return any filtered child, independently of how it reacts to write
+ * accesses and whether data is copied onto this BDS through COR.
+ */
+BdrvChild *bdrv_filtered_child(BlockDriverState *bs)
+{
+BdrvChild *cow_child = bdrv_filtered_cow_child(bs);
+BdrvChild *rw_child = bdrv_filtered_rw_child(bs);
+
+/* There can only be one filtered child at a time */
+assert(!(cow_child && rw_child));
+
+return cow_child ?: rw_child;
+}
+
+/*
+ * Return the child that stores the metadata for this node.
+ */
+BdrvChild *bdrv_metadata_child(BlockDriverState *bs)
+{
+if (!bs || !bs->drv) {
+return NULL;
+}
+
+/* Filters do not have metadata */
+if (bs->drv->is_filter) {
+return NULL;
+}
+
+return bs->file;
+}
+
+/*
+ * Return the child that stores the data that is allocated on this
+ * node.  This may or may not include metadata.
+ */
+BdrvChild *bdrv_storage_child(BlockDriverState *bs)
+{
+if (!bs || !bs->drv) {
+return NULL;
+}
+
+if (bs->drv->bdrv_storage_child) {
+return bs->drv->bdrv_storage_child(bs);
+}
+
+return bdrv_filtered_rw_child(bs) ?: bs->file;
+}
+
+/*
+ * Return the primary child of this node: For filters, that is the
+ * filtered child.  For other nodes, that is usually the child storing
+ * metadata.
+ * (A generally more helpful description is that this is (usually) the
+ * child that has the same filename as @bs.)
+ */
+BdrvChild *bdrv_primary_child(BlockDriverState *bs)
+{
+return bdrv_filtered_rw_child(bs) ?: bs->file;
+}
diff --git a/include/block/block_int.h b/include/block/block_int.h
index df6d027..dca59e9 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -89,9 +89,11 @@ struct BlockDriver {
 int instance_size;
 
 /* set to true if the BlockDriver is a block filter. Block filters pass
- * certain callbacks that refer to data (see block.c) to their bs->file if
- * the driver doesn't implement them. Drivers that do not wish to forward
- * must implement them and return -ENOTSUP.
+ * certain callbacks that refer to data (see block.c) to their bs->file
+ * or bs->backing (whichever one exists) if the driver doesn't implement
+ * them. Drivers that do not wish to forward must implement them and return
+ * -ENOTSUP.
+ * Note that filters are not allowed to modify data.
  */
 bool is_filter;
 /*
@@ -585,6 +587,13 @@ struct BlockDriver {
  * If this pointer is NULL, the array is considered empty.
  * "filename" and "driver" are always considered strong. */
 const char *const *strong_runtime_opts;
+
+/**
+ * Return the data storage child, if there is exactly one.  If
+ * this function is not implemented, the block layer will assume
+ * bs->file to be this child.
+ */
+BdrvChild *(*bdrv_storage_child)(

Re: [PATCH v2 3/4] mirror: Make sure that source and target size match

2020-05-12 Thread Vladimir Sementsov-Ogievskiy


11.05.2020 16:58, Kevin Wolf wrote:

If the target is shorter than the source, mirror would copy data until
it reaches the end of the target and then fail with an I/O error when
trying to write past the end.

If the target is longer than the source, the mirror job would complete
successfully, but the target wouldn't actually be an accurate copy of
the source image (it would contain some additional garbage at the end).

Fix this by checking that both images have the same size when the job
starts.

Signed-off-by: Kevin Wolf 
Message-Id: <20200507145228.323412-3-kw...@redhat.com>
Reviewed-by: Eric Blake 
Signed-off-by: Kevin Wolf 
---
  block/mirror.c | 21 -
  1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/block/mirror.c b/block/mirror.c
index aca95c9bc9..201ffa26f9 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -872,6 +872,7 @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
  BlockDriverState *target_bs = blk_bs(s->target);
  bool need_drain = true;
  int64_t length;
+int64_t target_length;
  BlockDriverInfo bdi;
  char backing_filename[2]; /* we only need 2 characters because we are only
   checking for a NULL string */
@@ -887,24 +888,26 @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
  goto immediate_exit;
  }
  
+target_length = blk_getlength(s->target);

+if (target_length < 0) {
+ret = target_length;
+goto immediate_exit;
+}
+
  /* Active commit must resize the base image if its size differs from the
   * active layer. */
  if (s->base == blk_bs(s->target)) {
-int64_t base_length;
-
-base_length = blk_getlength(s->target);
-if (base_length < 0) {
-ret = base_length;
-goto immediate_exit;
-}
-
-if (s->bdev_length > base_length) {
+if (s->bdev_length > target_length) {
  ret = blk_truncate(s->target, s->bdev_length, false,
 PREALLOC_MODE_OFF, 0, NULL);
  if (ret < 0) {
  goto immediate_exit;
  }
  }


Hmm, interesting, if base is larger, is our behavior correct? Blockdev becomes 
larger after commit and old data becomes available? I think we should zero the 
tail after old EOF or shrink the target..


+} else if (s->bdev_length != target_length) {
+error_setg(errp, "Source and target image have different sizes");
+ret = -EINVAL;


Seems, the only case, when mirror_run() sets errp. And, therefore, the only 
correct one..


+goto immediate_exit;
  }
  
  if (s->bdev_length == 0) {





--
Best regards,
Vladimir

Re: [PATCH v2 3/4] mirror: Make sure that source and target size match

2020-05-12 Thread Vladimir Sementsov-Ogievskiy


12.05.2020 20:15, Vladimir Sementsov-Ogievskiy wrote:

+    } else if (s->bdev_length != target_length) {
+    error_setg(errp, "Source and target image have different sizes");
+    ret = -EINVAL;


Seems, the only case, when mirror_run() sets errp. And, therefore, the only 
correct one..


the only one failure case I mean, of course.

--
Best regards,
Vladimir

Re: [PATCH v4 00/15] Apply COR-filter to the block-stream permanently

2020-05-12 Thread Vladimir Sementsov-Ogievskiy


12.05.2020 19:50, Andrey Shinkevich wrote:

With this series, all the block-stream COR operations pass through
the COR-filter. The patches 01-08/15 are taken from the series
"block: Deal with filters" by Max Reitz, the full version of that
can be found in the branches:

   https://git.xanclic.moe/XanClic/qemu child-access-functions-v6
   https://github.com/XanClic/qemu child-access-functions-v6

   When running iotests, apply "char-socket: Fix race condition"
   to avoid sporadic segmentation faults.
v4:
   01: Initialization of the COR-filter BDRVStateCOR member.


Hmm... but 01 doesn't touch COR-filter



v3:
   01: The COR filter insert/remove functions moved to block/copy-on-read.c
   to be a part of API.
   02: block/stream.c code refactoring.
   03: The separate call to block_job_add_bdrv() is used to block operations
   on the active node after the filter inserted and the job created.
   04: The iotests case 030::test_overlapping_4 was modified to unbound
   the block-stream job from the base node.
   05: The COR driver functions preadv/pwritev replaced with their analogous
   preadv/pwritev_part.


I assume, these changes are about your patches, which are 09-15, and Max's 
patches
are unchanged, right?



v2:
   01: No more skipping filters while checking for operation blockers.
   However, we exclude filters between the bottom node and base
   because we do not set the operation blockers for filters anymore.
   02: As stated above, we do not set the operation blockers for filters
   anymore. So, skip filters when we block operations for the target
   node.
   03: The comment added for the patch 4/7.
   04: The QAPI target version changed to 5.1.
   05: The 'filter-node-name' now excluded from using in the test #030.
   If we need it no more in a final version of the series, the patch
   5/7 may be removed.
   06: The COR-filter included into the frozen chain of a block-stream job.
   The 'above_base' node pointer is left because it is essential for
   finding the base node in case of filters above.


Andrey Shinkevich (7):
   block: prepare block-stream for using COR-filter
   copy-on-read: Support change filename functions
   copy-on-read: Support preadv/pwritev_part functions
   copy-on-read: add filter append/drop functions
   qapi: add filter-node-name to block-stream
   iotests: prepare 245 for using filter in block-stream
   block: apply COR-filter to block-stream jobs

Max Reitz (8):
   block: Mark commit and mirror as filter drivers


this is for commit


   copy-on-read: Support compressed writes


for stream


   block: Add child access functions


I do think, that for these series we need only filtered child and nothing more


   block: Add chain helper functions
   block: Include filters when freezing backing chain
   block: Use CAFs in block status functions
   commit: Deal with filters when blocking intermediate nodes
   block: Use CAFs when working with backing chains


So, fix stream, commit and some thing used in it.


Hi Max! Could you take a brief look and say, could we proceed in this way, 
taking part of your old series? How much it conflicts with your plans?

Let me clarify. This all is needed, as we have old proposed feature (and 
patches): discarding blocks from intermediate images during block-stream. It 
helps to save disk space during stream process. And the correct way to get 
access to intermediate nodes (to be able to discard from them) is to append a 
filter. Firstly we proposed our own filter, but that was proposed on list to 
use existing COR filter for stream and it seemed a correct way. So we are 
trying to insert this COR filter.

And the problem with it that without your series it breaks iotest 30, which 
does different magic with parallel stream and commit on the same backing chain.

So, it was my proposal to extract something from your series, to make this test 
work. And the result is here. I thought that the necessary part of your series 
for stream/commit is smaller.. But still, 8 patches is not too much. The 
feature for stream is being postponed already for more than a year due to this 
trouble. We need to proceed somehow. And the feature is useful.


--
Best regards,
Vladimir

Re: [PATCH v3 00/15] Apply COR-filter to the block-stream permanently

2020-05-12 Thread no-reply

Patchew URL: 
https://patchew.org/QEMU/1589295196-773454-1-git-send-email-andrey.shinkev...@virtuozzo.com/



Hi,

This series failed the docker-quick@centos7 build test. Please find the testing 
commands and
their output below. If you have Docker installed, you can probably reproduce it
locally.

=== TEST SCRIPT BEGIN ===
#!/bin/bash
make docker-image-centos7 V=1 NETWORK=1
time make docker-test-quick@centos7 SHOW_ENV=1 J=14 NETWORK=1
=== TEST SCRIPT END ===

  CC  block/copy-on-read.o
  CC  block/block-copy.o
  CC  block/crypto.o
/tmp/qemu-test/src/block/copy-on-read.c:29:32: fatal error: 
block/copy-on-read.h: No such file or directory
 #include "block/copy-on-read.h"
^
compilation terminated.
---
  CC  scsi/pr-manager-helper.o
  CC  block/dmg-bz2.o
  CC  crypto/hash.o
/tmp/qemu-test/src/block/stream.c:22:32: fatal error: block/copy-on-read.h: No 
such file or directory
 #include "block/copy-on-read.h"
^
compilation terminated.
  CC  crypto/hash-nettle.o
  CC  crypto/hmac.o
make: *** [block/copy-on-read.o] Error 1
make: *** Waiting for unfinished jobs
make: *** [block/stream.o] Error 1
Traceback (most recent call last):
  File "./tests/docker/docker.py", line 664, in 
sys.exit(main())
---
raise CalledProcessError(retcode, cmd)
subprocess.CalledProcessError: Command '['sudo', '-n', 'docker', 'run', 
'--label', 'com.qemu.instance.uuid=09ff0b4367d645d083a08750a8b4b4c7', '-u', 
'1003', '--security-opt', 'seccomp=unconfined', '--rm', '-e', 'TARGET_LIST=', 
'-e', 'EXTRA_CONFIGURE_OPTS=', '-e', 'V=', '-e', 'J=14', '-e', 'DEBUG=', '-e', 
'SHOW_ENV=1', '-e', 'CCACHE_DIR=/var/tmp/ccache', '-v', 
'/home/patchew2/.cache/qemu-docker-ccache:/var/tmp/ccache:z', '-v', 
'/var/tmp/patchew-tester-tmp-rfy6u1nc/src/docker-src.2020-05-12-14.18.19.3751:/var/tmp/qemu:z,ro',
 'qemu:centos7', '/var/tmp/qemu/run', 'test-quick']' returned non-zero exit 
status 2.
filter=--filter=label=com.qemu.instance.uuid=09ff0b4367d645d083a08750a8b4b4c7
make[1]: *** [docker-run] Error 1
make[1]: Leaving directory `/var/tmp/patchew-tester-tmp-rfy6u1nc/src'
make: *** [docker-run-test-quick@centos7] Error 2

real2m14.644s
user0m8.133s


The full log is available at
http://patchew.org/logs/1589295196-773454-1-git-send-email-andrey.shinkev...@virtuozzo.com/testing.docker-quick@centos7/?type=message.
---
Email generated automatically by Patchew [https://patchew.org/].
Please send your feedback to patchew-de...@redhat.com

Re: [PATCH v3 00/15] Apply COR-filter to the block-stream permanently

2020-05-12 Thread no-reply

Patchew URL: 
https://patchew.org/QEMU/1589295196-773454-1-git-send-email-andrey.shinkev...@virtuozzo.com/



Hi,

This series failed the asan build test. Please find the testing commands and
their output below. If you have Docker installed, you can probably reproduce it
locally.

=== TEST SCRIPT BEGIN ===
#!/bin/bash
export ARCH=x86_64
make docker-image-fedora V=1 NETWORK=1
time make docker-test-debug@fedora TARGET_LIST=x86_64-softmmu J=14 NETWORK=1
=== TEST SCRIPT END ===

  CC  scsi/utils.o
  CC  scsi/pr-manager.o
  CC  scsi/pr-manager-helper.o
/tmp/qemu-test/src/block/copy-on-read.c:29:10: fatal error: 
'block/copy-on-read.h' file not found
#include "block/copy-on-read.h"
 ^~
1 error generated.
make: *** [/tmp/qemu-test/src/rules.mak:69: block/copy-on-read.o] Error 1
make: *** Waiting for unfinished jobs
/tmp/qemu-test/src/block/stream.c:22:10: fatal error: 'block/copy-on-read.h' 
file not found
#include "block/copy-on-read.h"
 ^~
1 error generated.
make: *** [/tmp/qemu-test/src/rules.mak:69: block/stream.o] Error 1
Traceback (most recent call last):
  File "./tests/docker/docker.py", line 664, in 
sys.exit(main())
---
raise CalledProcessError(retcode, cmd)
subprocess.CalledProcessError: Command '['sudo', '-n', 'docker', 'run', 
'--label', 'com.qemu.instance.uuid=0116cd1f21484d809ce1bbf09d8f2d13', '-u', 
'1003', '--security-opt', 'seccomp=unconfined', '--rm', '-e', 
'TARGET_LIST=x86_64-softmmu', '-e', 'EXTRA_CONFIGURE_OPTS=', '-e', 'V=', '-e', 
'J=14', '-e', 'DEBUG=', '-e', 'SHOW_ENV=', '-e', 'CCACHE_DIR=/var/tmp/ccache', 
'-v', '/home/patchew2/.cache/qemu-docker-ccache:/var/tmp/ccache:z', '-v', 
'/var/tmp/patchew-tester-tmp-980fkf38/src/docker-src.2020-05-12-14.21.09.10733:/var/tmp/qemu:z,ro',
 'qemu:fedora', '/var/tmp/qemu/run', 'test-debug']' returned non-zero exit 
status 2.
filter=--filter=label=com.qemu.instance.uuid=0116cd1f21484d809ce1bbf09d8f2d13
make[1]: *** [docker-run] Error 1
make[1]: Leaving directory `/var/tmp/patchew-tester-tmp-980fkf38/src'
make: *** [docker-run-test-debug@fedora] Error 2

real3m5.236s
user0m8.401s


The full log is available at
http://patchew.org/logs/1589295196-773454-1-git-send-email-andrey.shinkev...@virtuozzo.com/testing.asan/?type=message.
---
Email generated automatically by Patchew [https://patchew.org/].
Please send your feedback to patchew-de...@redhat.com

Re: [PATCH v3 00/15] Apply COR-filter to the block-stream permanently

2020-05-12 Thread no-reply

Patchew URL: 
https://patchew.org/QEMU/1589295196-773454-1-git-send-email-andrey.shinkev...@virtuozzo.com/



Hi,

This series failed the docker-mingw@fedora build test. Please find the testing 
commands and
their output below. If you have Docker installed, you can probably reproduce it
locally.

=== TEST SCRIPT BEGIN ===
#! /bin/bash
export ARCH=x86_64
make docker-image-fedora V=1 NETWORK=1
time make docker-test-mingw@fedora J=14 NETWORK=1
=== TEST SCRIPT END ===

  CC  nbd/server.o
  CC  nbd/client.o
  CC  nbd/common.o
/tmp/qemu-test/src/block/copy-on-read.c:29:10: fatal error: 
block/copy-on-read.h: No such file or directory
 #include "block/copy-on-read.h"
  ^~
compilation terminated.
make: *** [/tmp/qemu-test/src/rules.mak:69: block/copy-on-read.o] Error 1
make: *** Waiting for unfinished jobs
/tmp/qemu-test/src/block/stream.c:22:10: fatal error: block/copy-on-read.h: No 
such file or directory
 #include "block/copy-on-read.h"
  ^~
compilation terminated.
make: *** [/tmp/qemu-test/src/rules.mak:69: block/stream.o] Error 1
Traceback (most recent call last):
  File "./tests/docker/docker.py", line 664, in 
sys.exit(main())
---
raise CalledProcessError(retcode, cmd)
subprocess.CalledProcessError: Command '['sudo', '-n', 'docker', 'run', 
'--label', 'com.qemu.instance.uuid=e588e99e0e78491a81fad3d9b8bea6b1', '-u', 
'1003', '--security-opt', 'seccomp=unconfined', '--rm', '-e', 'TARGET_LIST=', 
'-e', 'EXTRA_CONFIGURE_OPTS=', '-e', 'V=', '-e', 'J=14', '-e', 'DEBUG=', '-e', 
'SHOW_ENV=', '-e', 'CCACHE_DIR=/var/tmp/ccache', '-v', 
'/home/patchew2/.cache/qemu-docker-ccache:/var/tmp/ccache:z', '-v', 
'/var/tmp/patchew-tester-tmp-wfo11d6n/src/docker-src.2020-05-12-14.25.07.16522:/var/tmp/qemu:z,ro',
 'qemu:fedora', '/var/tmp/qemu/run', 'test-mingw']' returned non-zero exit 
status 2.
filter=--filter=label=com.qemu.instance.uuid=e588e99e0e78491a81fad3d9b8bea6b1
make[1]: *** [docker-run] Error 1
make[1]: Leaving directory `/var/tmp/patchew-tester-tmp-wfo11d6n/src'
make: *** [docker-run-test-mingw@fedora] Error 2

real1m57.227s
user0m8.202s


The full log is available at
http://patchew.org/logs/1589295196-773454-1-git-send-email-andrey.shinkev...@virtuozzo.com/testing.docker-mingw@fedora/?type=message.
---
Email generated automatically by Patchew [https://patchew.org/].
Please send your feedback to patchew-de...@redhat.com

Re: [PATCH v2 3/4] mirror: Make sure that source and target size match

2020-05-12 Thread Kevin Wolf

Am 12.05.2020 um 19:15 hat Vladimir Sementsov-Ogievskiy geschrieben:
> 11.05.2020 16:58, Kevin Wolf wrote:
> > If the target is shorter than the source, mirror would copy data until
> > it reaches the end of the target and then fail with an I/O error when
> > trying to write past the end.
> > 
> > If the target is longer than the source, the mirror job would complete
> > successfully, but the target wouldn't actually be an accurate copy of
> > the source image (it would contain some additional garbage at the end).
> > 
> > Fix this by checking that both images have the same size when the job
> > starts.
> > 
> > Signed-off-by: Kevin Wolf 
> > Message-Id: <20200507145228.323412-3-kw...@redhat.com>
> > Reviewed-by: Eric Blake 
> > Signed-off-by: Kevin Wolf 
> > ---
> >   block/mirror.c | 21 -
> >   1 file changed, 12 insertions(+), 9 deletions(-)
> > 
> > diff --git a/block/mirror.c b/block/mirror.c
> > index aca95c9bc9..201ffa26f9 100644
> > --- a/block/mirror.c
> > +++ b/block/mirror.c
> > @@ -872,6 +872,7 @@ static int coroutine_fn mirror_run(Job *job, Error 
> > **errp)
> >   BlockDriverState *target_bs = blk_bs(s->target);
> >   bool need_drain = true;
> >   int64_t length;
> > +int64_t target_length;
> >   BlockDriverInfo bdi;
> >   char backing_filename[2]; /* we only need 2 characters because we are 
> > only
> >checking for a NULL string */
> > @@ -887,24 +888,26 @@ static int coroutine_fn mirror_run(Job *job, Error 
> > **errp)
> >   goto immediate_exit;
> >   }
> > +target_length = blk_getlength(s->target);
> > +if (target_length < 0) {
> > +ret = target_length;
> > +goto immediate_exit;
> > +}
> > +
> >   /* Active commit must resize the base image if its size differs from 
> > the
> >* active layer. */
> >   if (s->base == blk_bs(s->target)) {
> > -int64_t base_length;
> > -
> > -base_length = blk_getlength(s->target);
> > -if (base_length < 0) {
> > -ret = base_length;
> > -goto immediate_exit;
> > -}
> > -
> > -if (s->bdev_length > base_length) {
> > +if (s->bdev_length > target_length) {
> >   ret = blk_truncate(s->target, s->bdev_length, false,
> >  PREALLOC_MODE_OFF, 0, NULL);
> >   if (ret < 0) {
> >   goto immediate_exit;
> >   }
> >   }
> 
> Hmm, interesting, if base is larger, is our behavior correct? Blockdev
> becomes larger after commit and old data becomes available? I think we
> should zero the tail after old EOF or shrink the target..

Yes, I agree, we should shrink it. But active commit is a different case
than what I'm fixing in this patch, so I left it unmodified. We'll
probably need a third series for commit after backup and mirror.

> > +} else if (s->bdev_length != target_length) {
> > +error_setg(errp, "Source and target image have different sizes");
> > +ret = -EINVAL;
> 
> Seems, the only case, when mirror_run() sets errp. And, therefore, the
> only correct one..

job_update_rc() takes care to fill job->err (with strerror()) if it
hasn't been set yet, so the other places aren't strictly wrong, but
probably provide suboptimal error messages.

Kevin

Re: [PATCH 0/5] Introduce 'yank' oob qmp command to recover from hanging qemu

2020-05-12 Thread Dr. David Alan Gilbert

* Daniel P. BerrangÃ© (berra...@redhat.com) wrote:
> On Tue, May 12, 2020 at 11:32:06AM +0200, Lukas Straub wrote:
> > On Mon, 11 May 2020 16:46:45 +0100
> > "Dr. David Alan Gilbert"  wrote:
> > 
> > > * Daniel P. BerrangÃÂ© (berra...@redhat.com) wrote: 
> > > > ...
> > > > That way if QEMU does get stuck, you can start by tearing down the
> > > > least distruptive channel. eg try tearing down the migration connection
> > > > first (which shouldn't negatively impact the guest), and only if that
> > > > doesn't work then, move on to tear down the NBD connection (which risks
> > > > data loss)  
> > > 
> > > I wonder if a different way would be to make all network connections
> > > register with yank, but then make yank take a list of connections to
> > > shutdown(2).
> > 
> > Good Idea. We could name the connections (/yank callbacks) in the
> > form "nbd:", "chardev:" and "migration"
> > (and add "netdev:...", etc. in the future). Then make yank take a
> > list of connection names as you suggest and silently ignore connections
> > that don't exist. And maybe even add a 'query-yank' oob command returning
> > a list of registered connections so the management application can do
> > pattern matching if it wants.
> 
> Yes, that would make the yank command much more flexible in how it can
> be used.
> 
> As an alternative to using formatted strings like this, it could be
> modelled more explicitly in QAPI
> 
>   { 'struct':  'YankChannels',
> 'data': { 'chardev': [ 'string' ],
>   'nbd': ['string'],
> 'migration': bool } }
> 
> In this example, 'chardev' would accept a list of chardev IDs which
> have it enabled, 'nbd' would accept a list of block node IDs which
> have it enabled, and migration is a singleton on/off.

Do we already have a QOM object name for each of these things?
Is that nbd/blockdevice unique - i.e. can you have multiple nbd clients
on the same node?

> The benefit of this modelling is that you can introspect QEMU
> to discover what classes of channels support being yanked by
> this QEMU build, as well as what instances are configured to
> be yanked. ie you can distinguish between a QEMU that doesn't
> support yanking network devices, from a QEMU that does support
> yanking network devices, but doesn't have it enabled for any
> network device instances.

What do we need to make it introspectable like that?

Dave

> Regards,
> Daniel
> -- 
> |: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
> |: https://libvirt.org -o-https://fstop138.berrange.com :|
> |: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|
--
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK

Re: [RFC PATCH 2/3] block: Allow bdrv_run_co() from different AioContext

2020-05-12 Thread Kevin Wolf

Am 12.05.2020 um 18:02 hat Thomas Lamprecht geschrieben:
> On 5/12/20 4:43 PM, Kevin Wolf wrote:
> > Coroutine functions that are entered through bdrv_run_co() are already
> > safe to call from synchronous code in a different AioContext because
> > bdrv_coroutine_enter() will schedule them in the context of the node.
> > 
> > However, the coroutine fastpath still requires that we're already in the
> > right AioContext when called in coroutine context.
> > 
> > In order to make the behaviour more consistent and to make life a bit
> > easier for callers, let's check the AioContext and automatically move
> > the current coroutine around if we're not in the right context yet.
> > 
> > Signed-off-by: Kevin Wolf 
> > ---
> >  block/io.c | 15 ++-
> >  1 file changed, 14 insertions(+), 1 deletion(-)
> > 
> > diff --git a/block/io.c b/block/io.c
> > index c1badaadc9..7808e8bdc0 100644
> > --- a/block/io.c
> > +++ b/block/io.c
> > @@ -895,8 +895,21 @@ static int bdrv_run_co(BlockDriverState *bs, 
> > CoroutineEntry *entry,
> > void *opaque, int *ret)
> >  {
> >  if (qemu_in_coroutine()) {
> > -/* Fast-path if already in coroutine context */
> > +Coroutine *self = qemu_coroutine_self();
> > +AioContext *bs_ctx = bdrv_get_aio_context(bs);
> > +AioContext *co_ctx = qemu_coroutine_get_aio_context(self);
> > +
> > +if (bs_ctx != co_ctx) {
> > +/* Move to the iothread of the node */
> > +aio_co_schedule(bs_ctx, self);
> > +qemu_coroutine_yield();
> > +}
> >  entry(opaque);
> > +if (bs_ctx != co_ctx) {
> > +/* Move back to the original AioContext */
> > +aio_co_schedule(bs_ctx, self);
> 
> shouldn't it use co_ctx here, as else it's just scheduled again on the
> one from bs?

Oops, you're right, of course.

> Looks OK for me besides that.

Thanks!

Kevin

Re: [PATCH v3 7/9] qcow2: Expose bitmaps' size during measure

2020-05-12 Thread Eric Blake


On 5/12/20 6:10 AM, Max Reitz wrote:



This does not break old code since previously we always reported only
guest visible content
here, but it changes the semantics, and now you cannot allocate
"required" size, you need
to allocate "required" size with "bitmaps" size.


Only if you copy the bitmaps, though, which requires using the --bitmaps
switch for convert.


First, a usage question: would you rather that 'qemu-img convert 
--bitmaps' silently succeeds even when the image being converted has no 
bitmaps, or would you rather that it fails loudly when there are no 
bitmaps to be copied over?  As implemented in this patch series, patch 8 
currently silently succeeds.  But in order to make patch 7 and 8 
consistent with one another, I need to know which behavior is easier to 
use: failing convert if the source lacks bitmaps (and thus measure would 
suppress the bitmaps:0 output), or treating lack of bitmaps as nothing 
additional to copy and thereby succeeding (and thus measure should 
output bitmaps:0 to show that no additional space is needed because 
nothing else will be copied, successfully).



If we add a new
extension all users will have to
change the calculation again.


It was my impression that existing users won’t have to do that, because
they don’t use --bitmaps yet.

In contrast, if we included the bitmap size in @required or
@fully-allocated, then previous users that didn’t copy bitmaps to the
destination (which they couldn’t) would allocate too much space.

...revisiting this after reading more of your mail: With a --bitmaps
switch, existing users wouldn’t suffer from such compatibility problems.
  However, then users (that now want to copy bitmaps) will have to pass
the new --bitmaps flag in the command line, and I don’t see how that’s
less complicated than just adding @bitmaps to @required.


More concretely, suppose down the road that we add the ability to copy 
internal snapshots (not that you want to mix internal and external 
snapshots, but that such information already exists and therefore can be 
used as an example).  Which is easier:


$ qemu-img measure -f qcow2 image.qcow2
required size: 8716288
fully allocated size: 8716288
bitmaps size: 1024
internal snapshot size: 2048

where you now have to add three numbers prior to creating dest.qcow2 and 
calling:


$ qemu-img convert image.qcow2 -f dest.qcow2 --bitmaps --snapshots

or using:

$ qemu-img measure --bitmaps --snapshots -f qcow2 image.qcow2
required size: 8719360
fully allocated size: 8719360

where you then call:

$ qemu-img convert image.qcow2 -f dest.qcow2 --bitmaps --snapshots

with a single size that matches the same arguments you pass to qemu-img 
convert?  What about failure cases?  What happens when qemu-img doesn't 
understand --snapshots but does understand --bitmaps?  Do you have to 
try a series of up to three calls to find how much information is supported:


$ qemu-img measure -f qcow2 image.qcow2 --bitmaps --snapshots
error: unknown argument
$ qemu-img measure -f qcow2 image.qcow2 --bitmaps
error: unknown argument
$ qemu-img measure -f qcow2 image.qcow2
data given, now you know that neither --bitmaps nor --snapshots will work

or is it nicer to issue just one measure without options, getting 
separate output lines, and seeing which output lines exist to know which 
convert options are supported, at the minor expense of having to add 
values yourself?


And then back to my question: should 'measure --bitmaps' fail if there 
are no bitmaps to be measured, or silently succeed and not change the 
output size?




With the current way, to measure an image we need to do:

qemu-img info --output json ...
check if image contains bitmaps
qemu-img measure --output json ...
check if bitmaps are reported.


Why do you have to check 'qemu-img info' first?  If measure reports 
bitmaps, then you know bitmaps can be copied; if it doesn't, then you 
can check info as a fallback path to compute size yourself - but 
computing the size yourself doesn't help unless you also have fallbacks 
to copy the bitmaps via QMP commands, because that qemu-img will also 
lack 'qemu-img convert --bitmaps' or 'qemu-img bitmaps' to do it via 
qemu-img.




If image has bitmaps and bitmaps are not reported, we know that we have an old
version of qemu-img that does not know how to measure bitmaps.


Well, but you’ll also see that convert --bitmaps will fail.  But I can
see that you probably want to know about that before you create the
target image.


If bitmaps are reported in both commands we will use the value when creating
block devices.


And otherwise?  Do you error out, or do you just omit --bitmaps from
convert?


If we always report bitmaps even when they are zero, we don't need to
run "qemu-img info".

But  my preferred interface is:

qemu-img measure --bitmaps ...

And report bitmaps only if the user asked to get the value. In this
case the required and
fully-allocated values will include bitmaps.


Well, it would be consistent wit

Re: [PATCH 0/5] Introduce 'yank' oob qmp command to recover from hanging qemu

2020-05-12 Thread Lukas Straub

On Tue, 12 May 2020 10:43:37 +0100
Daniel P. Berrangé  wrote:

> On Tue, May 12, 2020 at 11:32:06AM +0200, Lukas Straub wrote:
> > ... 
> > 
> > Good Idea. We could name the connections (/yank callbacks) in the
> > form "nbd:", "chardev:" and "migration"
> > (and add "netdev:...", etc. in the future). Then make yank take a
> > list of connection names as you suggest and silently ignore connections
> > that don't exist. And maybe even add a 'query-yank' oob command returning
> > a list of registered connections so the management application can do
> > pattern matching if it wants.  
> 
> Yes, that would make the yank command much more flexible in how it can
> be used.
> 
> As an alternative to using formatted strings like this, it could be
> modelled more explicitly in QAPI
> 
>   { 'struct':  'YankChannels',
> 'data': { 'chardev': [ 'string' ],
>   'nbd': ['string'],
> 'migration': bool } }
> 
> In this example, 'chardev' would accept a list of chardev IDs which
> have it enabled, 'nbd' would accept a list of block node IDs which
> have it enabled, and migration is a singleton on/off.

With the new command, the yank feature can then be enabled unconditionally on 
the instances.

> The benefit of this modelling is that you can introspect QEMU
> to discover what classes of channels support being yanked by
> this QEMU build, as well as what instances are configured to
> be yanked. ie you can distinguish between a QEMU that doesn't
> support yanking network devices, from a QEMU that does support
> yanking network devices, but doesn't have it enabled for any
> network device instances.
> 
> Regards,
> Daniel



pgp4JEcSFZzlS.pgp
Description: OpenPGP digital signature

[PATCH] hw/ide: Make IDEDMAOps handlers take a const IDEDMA pointer

2020-05-12 Thread Philippe Mathieu-Daudé

Handlers don't need to modify the IDEDMA structure.
Make it const.

Signed-off-by: Philippe Mathieu-Daudé 
---
 include/hw/ide/internal.h | 12 ++--
 hw/ide/ahci.c | 18 +-
 hw/ide/core.c |  6 +++---
 hw/ide/macio.c|  6 +++---
 hw/ide/pci.c  | 12 ++--
 5 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/include/hw/ide/internal.h b/include/hw/ide/internal.h
index 55da35d768..1a7869e85d 100644
--- a/include/hw/ide/internal.h
+++ b/include/hw/ide/internal.h
@@ -322,12 +322,12 @@ typedef enum { IDE_HD, IDE_CD, IDE_CFATA } IDEDriveKind;
 
 typedef void EndTransferFunc(IDEState *);
 
-typedef void DMAStartFunc(IDEDMA *, IDEState *, BlockCompletionFunc *);
-typedef void DMAVoidFunc(IDEDMA *);
-typedef int DMAIntFunc(IDEDMA *, bool);
-typedef int32_t DMAInt32Func(IDEDMA *, int32_t len);
-typedef void DMAu32Func(IDEDMA *, uint32_t);
-typedef void DMAStopFunc(IDEDMA *, bool);
+typedef void DMAStartFunc(const IDEDMA *, IDEState *, BlockCompletionFunc *);
+typedef void DMAVoidFunc(const IDEDMA *);
+typedef int DMAIntFunc(const IDEDMA *, bool);
+typedef int32_t DMAInt32Func(const IDEDMA *, int32_t len);
+typedef void DMAu32Func(const IDEDMA *, uint32_t);
+typedef void DMAStopFunc(const IDEDMA *, bool);
 
 struct unreported_events {
 bool eject_request;
diff --git a/hw/ide/ahci.c b/hw/ide/ahci.c
index 13d91e109a..168d34e9f2 100644
--- a/hw/ide/ahci.c
+++ b/hw/ide/ahci.c
@@ -44,7 +44,7 @@ static int handle_cmd(AHCIState *s, int port, uint8_t slot);
 static void ahci_reset_port(AHCIState *s, int port);
 static bool ahci_write_fis_d2h(AHCIDevice *ad);
 static void ahci_init_d2h(AHCIDevice *ad);
-static int ahci_dma_prepare_buf(IDEDMA *dma, int32_t limit);
+static int ahci_dma_prepare_buf(const IDEDMA *dma, int32_t limit);
 static bool ahci_map_clb_address(AHCIDevice *ad);
 static bool ahci_map_fis_address(AHCIDevice *ad);
 static void ahci_unmap_clb_address(AHCIDevice *ad);
@@ -1338,7 +1338,7 @@ out:
 }
 
 /* Transfer PIO data between RAM and device */
-static void ahci_pio_transfer(IDEDMA *dma)
+static void ahci_pio_transfer(const IDEDMA *dma)
 {
 AHCIDevice *ad = DO_UPCAST(AHCIDevice, dma, dma);
 IDEState *s = &ad->port.ifs[0];
@@ -1397,7 +1397,7 @@ out:
 }
 }
 
-static void ahci_start_dma(IDEDMA *dma, IDEState *s,
+static void ahci_start_dma(const IDEDMA *dma, IDEState *s,
BlockCompletionFunc *dma_cb)
 {
 AHCIDevice *ad = DO_UPCAST(AHCIDevice, dma, dma);
@@ -1406,7 +1406,7 @@ static void ahci_start_dma(IDEDMA *dma, IDEState *s,
 dma_cb(s, 0);
 }
 
-static void ahci_restart_dma(IDEDMA *dma)
+static void ahci_restart_dma(const IDEDMA *dma)
 {
 /* Nothing to do, ahci_start_dma already resets s->io_buffer_offset.  */
 }
@@ -1415,7 +1415,7 @@ static void ahci_restart_dma(IDEDMA *dma)
  * IDE/PIO restarts are handled by the core layer, but NCQ commands
  * need an extra kick from the AHCI HBA.
  */
-static void ahci_restart(IDEDMA *dma)
+static void ahci_restart(const IDEDMA *dma)
 {
 AHCIDevice *ad = DO_UPCAST(AHCIDevice, dma, dma);
 int i;
@@ -1432,7 +1432,7 @@ static void ahci_restart(IDEDMA *dma)
  * Called in DMA and PIO R/W chains to read the PRDT.
  * Not shared with NCQ pathways.
  */
-static int32_t ahci_dma_prepare_buf(IDEDMA *dma, int32_t limit)
+static int32_t ahci_dma_prepare_buf(const IDEDMA *dma, int32_t limit)
 {
 AHCIDevice *ad = DO_UPCAST(AHCIDevice, dma, dma);
 IDEState *s = &ad->port.ifs[0];
@@ -1453,7 +1453,7 @@ static int32_t ahci_dma_prepare_buf(IDEDMA *dma, int32_t 
limit)
  * Called via dma_buf_commit, for both DMA and PIO paths.
  * sglist destruction is handled within dma_buf_commit.
  */
-static void ahci_commit_buf(IDEDMA *dma, uint32_t tx_bytes)
+static void ahci_commit_buf(const IDEDMA *dma, uint32_t tx_bytes)
 {
 AHCIDevice *ad = DO_UPCAST(AHCIDevice, dma, dma);
 
@@ -1461,7 +1461,7 @@ static void ahci_commit_buf(IDEDMA *dma, uint32_t 
tx_bytes)
 ad->cur_cmd->status = cpu_to_le32(tx_bytes);
 }
 
-static int ahci_dma_rw_buf(IDEDMA *dma, bool is_write)
+static int ahci_dma_rw_buf(const IDEDMA *dma, bool is_write)
 {
 AHCIDevice *ad = DO_UPCAST(AHCIDevice, dma, dma);
 IDEState *s = &ad->port.ifs[0];
@@ -1486,7 +1486,7 @@ static int ahci_dma_rw_buf(IDEDMA *dma, bool is_write)
 return 1;
 }
 
-static void ahci_cmd_done(IDEDMA *dma)
+static void ahci_cmd_done(const IDEDMA *dma)
 {
 AHCIDevice *ad = DO_UPCAST(AHCIDevice, dma, dma);
 
diff --git a/hw/ide/core.c b/hw/ide/core.c
index 689bb36409..d997a78e47 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -2570,16 +2570,16 @@ static void ide_init1(IDEBus *bus, int unit)
ide_sector_write_timer_cb, s);
 }
 
-static int ide_nop_int(IDEDMA *dma, bool is_write)
+static int ide_nop_int(const IDEDMA *dma, bool is_write)
 {
 return 0;
 }
 
-static void ide_nop(IDEDMA *dma)
+static void ide_nop(const IDEDMA *dma)
 {
 }
 
-static in

1 2 >

1 - 100 of 119 matches

Mail list logo