[PATCH v3 1/3] target: Remove unnecessary CPU() cast
The CPU() macro is defined as: #define CPU(obj) ((CPUState *)(obj)) i.e. a simple pointer cast that performs no runtime type checking. Remove the unnecessary CPU() casts when we already know the pointer is of CPUState type. Patch created mechanically using spatch with this script: @@ typedef CPUState; CPUState *s; @@ - CPU(s) + s Acked-by: David Gibson Reviewed-by: Cédric Le Goater Reviewed-by: Richard Henderson Reviewed-by: Markus Armbruster Signed-off-by: Philippe Mathieu-Daudé --- target/ppc/mmu_helper.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target/ppc/mmu_helper.c b/target/ppc/mmu_helper.c index 86c667b094..8972714775 100644 --- a/target/ppc/mmu_helper.c +++ b/target/ppc/mmu_helper.c @@ -1820,7 +1820,7 @@ static inline void do_invalidate_BAT(CPUPPCState *env, target_ulong BATu, if (((end - base) >> TARGET_PAGE_BITS) > 1024) { /* Flushing 1024 4K pages is slower than a complete flush */ LOG_BATS("Flush all BATs\n"); -tlb_flush(CPU(cs)); +tlb_flush(cs); LOG_BATS("Flush done\n"); return; } -- 2.21.3
[PATCH v3 0/3] various: Remove unnecessary casts
Remove unnecessary casts using coccinelle scripts. The CPU()/OBJECT() patches don't introduce logical change, The DEVICE() one removes various OBJECT_CHECK() calls. Since v3: - Fixed patch #2 description (Markus) - Add A-b/R-b tags Since v2: - Reword description (Markus) - Add A-b/R-b tags Philippe Mathieu-Daudé (3): target: Remove unnecessary CPU() cast various: Remove unnecessary OBJECT() cast hw: Remove unnecessary DEVICE() cast hw/core/bus.c | 2 +- hw/display/artist.c | 2 +- hw/display/cg3.c| 2 +- hw/display/sm501.c | 2 +- hw/display/tcx.c| 4 ++-- hw/display/vga-isa.c| 2 +- hw/i2c/imx_i2c.c| 2 +- hw/i2c/mpc_i2c.c| 2 +- hw/ide/ahci-allwinner.c | 2 +- hw/ide/piix.c | 2 +- hw/ipmi/smbus_ipmi.c| 2 +- hw/microblaze/petalogix_ml605_mmu.c | 8 hw/misc/macio/pmu.c | 2 +- hw/net/ftgmac100.c | 3 +-- hw/net/imx_fec.c| 2 +- hw/nubus/nubus-device.c | 2 +- hw/pci-host/bonito.c| 2 +- hw/ppc/spapr.c | 2 +- hw/s390x/sclp.c | 2 +- hw/sh4/sh_pci.c | 2 +- hw/xen/xen-legacy-backend.c | 2 +- monitor/misc.c | 3 +-- qom/object.c| 4 ++-- target/ppc/mmu_helper.c | 2 +- 24 files changed, 29 insertions(+), 31 deletions(-) -- 2.21.3
[PATCH v3 2/3] various: Remove unnecessary OBJECT() cast
The OBJECT() macro is defined as: #define OBJECT(obj) ((Object *)(obj)) Remove the unnecessary OBJECT() casts when we already know the pointer is of Object type. Patch created mechanically using spatch with this script: @@ typedef Object; Object *o; @@ - OBJECT(o) + o Acked-by: Cornelia Huck Acked-by: Corey Minyard Acked-by: John Snow Reviewed-by: Richard Henderson Signed-off-by: Philippe Mathieu-Daudé --- hw/core/bus.c | 2 +- hw/ide/ahci-allwinner.c | 2 +- hw/ipmi/smbus_ipmi.c| 2 +- hw/microblaze/petalogix_ml605_mmu.c | 8 hw/s390x/sclp.c | 2 +- monitor/misc.c | 3 +-- qom/object.c| 4 ++-- 7 files changed, 11 insertions(+), 12 deletions(-) diff --git a/hw/core/bus.c b/hw/core/bus.c index 3dc0a825f0..4ea5870de8 100644 --- a/hw/core/bus.c +++ b/hw/core/bus.c @@ -25,7 +25,7 @@ void qbus_set_hotplug_handler(BusState *bus, Object *handler, Error **errp) { -object_property_set_link(OBJECT(bus), OBJECT(handler), +object_property_set_link(OBJECT(bus), handler, QDEV_HOTPLUG_HANDLER_PROPERTY, errp); } diff --git a/hw/ide/ahci-allwinner.c b/hw/ide/ahci-allwinner.c index bb8393d2b6..8536b9eb5a 100644 --- a/hw/ide/ahci-allwinner.c +++ b/hw/ide/ahci-allwinner.c @@ -90,7 +90,7 @@ static void allwinner_ahci_init(Object *obj) SysbusAHCIState *s = SYSBUS_AHCI(obj); AllwinnerAHCIState *a = ALLWINNER_AHCI(obj); -memory_region_init_io(&a->mmio, OBJECT(obj), &allwinner_ahci_mem_ops, a, +memory_region_init_io(&a->mmio, obj, &allwinner_ahci_mem_ops, a, "allwinner-ahci", ALLWINNER_AHCI_MMIO_SIZE); memory_region_add_subregion(&s->ahci.mem, ALLWINNER_AHCI_MMIO_OFF, &a->mmio); diff --git a/hw/ipmi/smbus_ipmi.c b/hw/ipmi/smbus_ipmi.c index 2a9470d9df..f1a0148755 100644 --- a/hw/ipmi/smbus_ipmi.c +++ b/hw/ipmi/smbus_ipmi.c @@ -329,7 +329,7 @@ static void smbus_ipmi_init(Object *obj) { SMBusIPMIDevice *sid = SMBUS_IPMI(obj); -ipmi_bmc_find_and_link(OBJECT(obj), (Object **) &sid->bmc); +ipmi_bmc_find_and_link(obj, (Object **) &sid->bmc); } static void smbus_ipmi_get_fwinfo(struct IPMIInterface 
*ii, IPMIFwInfo *info) diff --git a/hw/microblaze/petalogix_ml605_mmu.c b/hw/microblaze/petalogix_ml605_mmu.c index 0a2640c40b..52dcea9abd 100644 --- a/hw/microblaze/petalogix_ml605_mmu.c +++ b/hw/microblaze/petalogix_ml605_mmu.c @@ -150,9 +150,9 @@ petalogix_ml605_init(MachineState *machine) qdev_set_nic_properties(eth0, &nd_table[0]); qdev_prop_set_uint32(eth0, "rxmem", 0x1000); qdev_prop_set_uint32(eth0, "txmem", 0x1000); -object_property_set_link(OBJECT(eth0), OBJECT(ds), +object_property_set_link(OBJECT(eth0), ds, "axistream-connected", &error_abort); -object_property_set_link(OBJECT(eth0), OBJECT(cs), +object_property_set_link(OBJECT(eth0), cs, "axistream-control-connected", &error_abort); qdev_init_nofail(eth0); sysbus_mmio_map(SYS_BUS_DEVICE(eth0), 0, AXIENET_BASEADDR); @@ -163,9 +163,9 @@ petalogix_ml605_init(MachineState *machine) cs = object_property_get_link(OBJECT(eth0), "axistream-control-connected-target", NULL); qdev_prop_set_uint32(dma, "freqhz", 100 * 100); -object_property_set_link(OBJECT(dma), OBJECT(ds), +object_property_set_link(OBJECT(dma), ds, "axistream-connected", &error_abort); -object_property_set_link(OBJECT(dma), OBJECT(cs), +object_property_set_link(OBJECT(dma), cs, "axistream-control-connected", &error_abort); qdev_init_nofail(dma); sysbus_mmio_map(SYS_BUS_DEVICE(dma), 0, AXIDMA_BASEADDR); diff --git a/hw/s390x/sclp.c b/hw/s390x/sclp.c index ede056b3ef..4132286db7 100644 --- a/hw/s390x/sclp.c +++ b/hw/s390x/sclp.c @@ -322,7 +322,7 @@ void s390_sclp_init(void) object_property_add_child(qdev_get_machine(), TYPE_SCLP, new, NULL); -object_unref(OBJECT(new)); +object_unref(new); qdev_init_nofail(DEVICE(new)); } diff --git a/monitor/misc.c b/monitor/misc.c index 9723b466cd..f5207cd242 100644 --- a/monitor/misc.c +++ b/monitor/misc.c @@ -1837,8 +1837,7 @@ void object_add_completion(ReadLineState *rs, int nb_args, const char *str) static int qdev_add_hotpluggable_device(Object *obj, void *opaque) { GSList **list = opaque; -DeviceState *dev = 
(DeviceState *)object_dynamic_cast(OBJECT(obj), - TYPE_DEVICE); +DeviceState *dev = (DeviceState *)object_dynamic_cast(obj, TYPE_DEVICE); if (dev == NULL) { return 0; diff --git a/qom/object.c b/qom/object.c index be700e831f..07c1443d0e 100644 --- a/qom/object.c +++ b/qom/object.c @@ -762,7 +762,7 @@ Object *ob
[PATCH v3 3/3] hw: Remove unnecessary DEVICE() cast
The DEVICE() macro is defined as: #define DEVICE(obj) OBJECT_CHECK(DeviceState, (obj), TYPE_DEVICE) which expands to: ((DeviceState *)object_dynamic_cast_assert((Object *)(obj), (name), __FILE__, __LINE__, __func__)) This assertion can only fail when @obj points to something other than its stated type, i.e. when we're in undefined behavior country. Remove the unnecessary DEVICE() casts when we already know the pointer is of DeviceState type. Patch created mechanically using spatch with this script: @@ typedef DeviceState; DeviceState *s; @@ - DEVICE(s) + s Acked-by: David Gibson Acked-by: Paul Durrant Reviewed-by: Markus Armbruster Reviewed-by: Cédric Le Goater Acked-by: John Snow Reviewed-by: Richard Henderson Reviewed-by: Markus Armbruster Signed-off-by: Philippe Mathieu-Daudé --- hw/display/artist.c | 2 +- hw/display/cg3.c| 2 +- hw/display/sm501.c | 2 +- hw/display/tcx.c| 4 ++-- hw/display/vga-isa.c| 2 +- hw/i2c/imx_i2c.c| 2 +- hw/i2c/mpc_i2c.c| 2 +- hw/ide/piix.c | 2 +- hw/misc/macio/pmu.c | 2 +- hw/net/ftgmac100.c | 3 +-- hw/net/imx_fec.c| 2 +- hw/nubus/nubus-device.c | 2 +- hw/pci-host/bonito.c| 2 +- hw/ppc/spapr.c | 2 +- hw/sh4/sh_pci.c | 2 +- hw/xen/xen-legacy-backend.c | 2 +- 16 files changed, 17 insertions(+), 18 deletions(-) diff --git a/hw/display/artist.c b/hw/display/artist.c index 753dbb9a77..7e2a4556bd 100644 --- a/hw/display/artist.c +++ b/hw/display/artist.c @@ -1353,7 +1353,7 @@ static void artist_realizefn(DeviceState *dev, Error **errp) s->cursor_height = 32; s->cursor_width = 32; -s->con = graphic_console_init(DEVICE(dev), 0, &artist_ops, s); +s->con = graphic_console_init(dev, 0, &artist_ops, s); qemu_console_resize(s->con, s->width, s->height); } diff --git a/hw/display/cg3.c b/hw/display/cg3.c index a1ede10394..f7f1c199ce 100644 --- a/hw/display/cg3.c +++ b/hw/display/cg3.c @@ -321,7 +321,7 @@ static void cg3_realizefn(DeviceState *dev, Error **errp) sysbus_init_irq(sbd, &s->irq); -s->con = graphic_console_init(DEVICE(dev), 0, &cg3_ops, s); 
+s->con = graphic_console_init(dev, 0, &cg3_ops, s); qemu_console_resize(s->con, s->width, s->height); } diff --git a/hw/display/sm501.c b/hw/display/sm501.c index de0ab9d977..2a564889bd 100644 --- a/hw/display/sm501.c +++ b/hw/display/sm501.c @@ -1839,7 +1839,7 @@ static void sm501_init(SM501State *s, DeviceState *dev, &s->twoD_engine_region); /* create qemu graphic console */ -s->con = graphic_console_init(DEVICE(dev), 0, &sm501_ops, s); +s->con = graphic_console_init(dev, 0, &sm501_ops, s); } static const VMStateDescription vmstate_sm501_state = { diff --git a/hw/display/tcx.c b/hw/display/tcx.c index 76de16e8ea..1fb45b1aab 100644 --- a/hw/display/tcx.c +++ b/hw/display/tcx.c @@ -868,9 +868,9 @@ static void tcx_realizefn(DeviceState *dev, Error **errp) sysbus_init_irq(sbd, &s->irq); if (s->depth == 8) { -s->con = graphic_console_init(DEVICE(dev), 0, &tcx_ops, s); +s->con = graphic_console_init(dev, 0, &tcx_ops, s); } else { -s->con = graphic_console_init(DEVICE(dev), 0, &tcx24_ops, s); +s->con = graphic_console_init(dev, 0, &tcx24_ops, s); } s->thcmisc = 0; diff --git a/hw/display/vga-isa.c b/hw/display/vga-isa.c index 0633ed382c..3aaeeeca1e 100644 --- a/hw/display/vga-isa.c +++ b/hw/display/vga-isa.c @@ -74,7 +74,7 @@ static void vga_isa_realizefn(DeviceState *dev, Error **errp) 0x000a, vga_io_memory, 1); memory_region_set_coalescing(vga_io_memory); -s->con = graphic_console_init(DEVICE(dev), 0, s->hw_ops, s); +s->con = graphic_console_init(dev, 0, s->hw_ops, s); memory_region_add_subregion(isa_address_space(isadev), VBE_DISPI_LFB_PHYSICAL_ADDRESS, diff --git a/hw/i2c/imx_i2c.c b/hw/i2c/imx_i2c.c index 30b9aea247..2e02e1c4fa 100644 --- a/hw/i2c/imx_i2c.c +++ b/hw/i2c/imx_i2c.c @@ -305,7 +305,7 @@ static void imx_i2c_realize(DeviceState *dev, Error **errp) IMX_I2C_MEM_SIZE); sysbus_init_mmio(SYS_BUS_DEVICE(dev), &s->iomem); sysbus_init_irq(SYS_BUS_DEVICE(dev), &s->irq); -s->bus = i2c_init_bus(DEVICE(dev), NULL); +s->bus = i2c_init_bus(dev, NULL); } static void 
imx_i2c_class_init(ObjectClass *klass, void *data) diff --git a/hw/i2c/mpc_i2c.c b/hw/i2c/mpc_i2c.c index 0aa1be3ce7..9a724f3a3e 100644 --- a/hw/i2c/mpc_i2c.c +++ b/hw/i2c/mpc_i2c.c @@ -332,7 +332,7 @@ static void mpc_i2c_realize(DeviceState *dev, Error **errp) memory_region_init_io(&i2c->iomem, OBJECT(i2c), &i2c_ops, i2c,
[PATCH v3 1/2] scripts/qemugdb: Remove shebang header
These scripts are loaded as plugin by GDB (and they don't have any __main__ entry point). Remove the shebang header. Acked-by: Alex Bennée Signed-off-by: Philippe Mathieu-Daudé --- scripts/qemugdb/__init__.py | 3 +-- scripts/qemugdb/aio.py | 3 +-- scripts/qemugdb/coroutine.py | 3 +-- scripts/qemugdb/mtree.py | 4 +--- scripts/qemugdb/tcg.py | 1 - 5 files changed, 4 insertions(+), 10 deletions(-) diff --git a/scripts/qemugdb/__init__.py b/scripts/qemugdb/__init__.py index 969f552b26..da8ff612e5 100644 --- a/scripts/qemugdb/__init__.py +++ b/scripts/qemugdb/__init__.py @@ -1,5 +1,4 @@ -#!/usr/bin/python - +# # GDB debugging support # # Copyright (c) 2015 Linaro Ltd diff --git a/scripts/qemugdb/aio.py b/scripts/qemugdb/aio.py index 2ba00c..d7c1ba0c28 100644 --- a/scripts/qemugdb/aio.py +++ b/scripts/qemugdb/aio.py @@ -1,5 +1,4 @@ -#!/usr/bin/python - +# # GDB debugging support: aio/iohandler debug # # Copyright (c) 2015 Red Hat, Inc. diff --git a/scripts/qemugdb/coroutine.py b/scripts/qemugdb/coroutine.py index 41e079d0e2..db61389022 100644 --- a/scripts/qemugdb/coroutine.py +++ b/scripts/qemugdb/coroutine.py @@ -1,5 +1,4 @@ -#!/usr/bin/python - +# # GDB debugging support # # Copyright 2012 Red Hat, Inc. and/or its affiliates diff --git a/scripts/qemugdb/mtree.py b/scripts/qemugdb/mtree.py index 3030a60d3f..8fe42c3c12 100644 --- a/scripts/qemugdb/mtree.py +++ b/scripts/qemugdb/mtree.py @@ -1,5 +1,4 @@ -#!/usr/bin/python - +# # GDB debugging support # # Copyright 2012 Red Hat, Inc. and/or its affiliates @@ -84,4 +83,3 @@ def print_item(self, ptr, offset = gdb.Value(0), level = 0): while not isnull(subregion): self.print_item(subregion, addr, level) subregion = subregion['subregions_link']['tqe_next'] - diff --git a/scripts/qemugdb/tcg.py b/scripts/qemugdb/tcg.py index 18880fc9a7..16c03c06a9 100644 --- a/scripts/qemugdb/tcg.py +++ b/scripts/qemugdb/tcg.py @@ -1,4 +1,3 @@ -#!/usr/bin/python # -*- coding: utf-8 -*- # # GDB debugging support, TCG status -- 2.21.3
[PATCH v3 0/2] scripts: More Python fixes
Trivial Python3 fixes, again... Since v2: - Remove patch updating MAINTAINERS Since v1: - Added Alex Bennée A-b tags - Addressed John Snow review comments - Use /usr/bin/env - Do not modify os.path (dropped last patch) Philippe Mathieu-Daudé (2): scripts/qemugdb: Remove shebang header scripts/qmp: Use Python 3 interpreter scripts/qemugdb/__init__.py | 3 +-- scripts/qemugdb/aio.py | 3 +-- scripts/qemugdb/coroutine.py | 3 +-- scripts/qemugdb/mtree.py | 4 +--- scripts/qemugdb/tcg.py | 1 - scripts/qmp/qom-get | 2 +- scripts/qmp/qom-list | 2 +- scripts/qmp/qom-set | 2 +- scripts/qmp/qom-tree | 2 +- 9 files changed, 8 insertions(+), 14 deletions(-) -- 2.21.3
[PATCH v3 2/2] scripts/qmp: Use Python 3 interpreter
Signed-off-by: Philippe Mathieu-Daudé --- scripts/qmp/qom-get | 2 +- scripts/qmp/qom-list | 2 +- scripts/qmp/qom-set | 2 +- scripts/qmp/qom-tree | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/qmp/qom-get b/scripts/qmp/qom-get index 007b4cd442..7c5ede91bb 100755 --- a/scripts/qmp/qom-get +++ b/scripts/qmp/qom-get @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python3 ## # QEMU Object Model test tools # diff --git a/scripts/qmp/qom-list b/scripts/qmp/qom-list index 03bda3446b..bb68fd65d4 100755 --- a/scripts/qmp/qom-list +++ b/scripts/qmp/qom-list @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python3 ## # QEMU Object Model test tools # diff --git a/scripts/qmp/qom-set b/scripts/qmp/qom-set index c37fe78b00..19881d85e9 100755 --- a/scripts/qmp/qom-set +++ b/scripts/qmp/qom-set @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python3 ## # QEMU Object Model test tools # diff --git a/scripts/qmp/qom-tree b/scripts/qmp/qom-tree index 1c8acf61e7..fa91147a03 100755 --- a/scripts/qmp/qom-tree +++ b/scripts/qmp/qom-tree @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python3 ## # QEMU Object Model test tools # -- 2.21.3
[PATCH v2 1/3] hw/ide/ahci: Use qdev gpio rather than qemu_allocate_irqs()
When plugging/unplugging devices on a AHCI bus, we might leak the memory returned by qemu_allocate_irqs(). We can avoid this leak by switching to using qdev_init_gpio_in(); the base class finalize will free the irqs that this allocates under the hood. Patch created mechanically using spatch with this script inspired from commit d6ef883d9d7: @@ typedef qemu_irq; identifier irqs, handler; expression opaque, count, i; @@ - qemu_irq *irqs; ... - irqs = qemu_allocate_irqs(handler, opaque, count); + qdev_init_gpio_in(DEVICE(opaque), handler, count); <+... - irqs[i] + qdev_get_gpio_in(DEVICE(opaque), i) ...+> ?- g_free(irqs); Inspired-by: Peter Maydell Reviewed-by: Alistair Francis Reviewed-by: John Snow Acked-by: John Snow Signed-off-by: Philippe Mathieu-Daudé --- hw/ide/ahci.c | 6 ++ 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/hw/ide/ahci.c b/hw/ide/ahci.c index 13d91e109a..991bb7c9c8 100644 --- a/hw/ide/ahci.c +++ b/hw/ide/ahci.c @@ -1534,19 +1534,18 @@ void ahci_init(AHCIState *s, DeviceState *qdev) void ahci_realize(AHCIState *s, DeviceState *qdev, AddressSpace *as, int ports) { -qemu_irq *irqs; int i; s->as = as; s->ports = ports; s->dev = g_new0(AHCIDevice, ports); ahci_reg_init(s); -irqs = qemu_allocate_irqs(ahci_irq_set, s, s->ports); +qdev_init_gpio_in(qdev, ahci_irq_set, s->ports); for (i = 0; i < s->ports; i++) { AHCIDevice *ad = &s->dev[i]; ide_bus_new(&ad->port, sizeof(ad->port), qdev, i, 1); -ide_init2(&ad->port, irqs[i]); +ide_init2(&ad->port, qdev_get_gpio_in(qdev, i)); ad->hba = s; ad->port_no = i; @@ -1554,7 +1553,6 @@ void ahci_realize(AHCIState *s, DeviceState *qdev, AddressSpace *as, int ports) ad->port.dma->ops = &ahci_dma_ops; ide_register_restart_cb(&ad->port); } -g_free(irqs); } void ahci_uninit(AHCIState *s) -- 2.21.3
[PATCH v2 0/3] hw: Use qdev gpio rather than qemu_allocate_irqs()
Use a coccinelle script to convert a few qemu_allocate_irqs() calls to the qdev gpio API. One memory leak removed in hw/openrisc/pic_cpu.c Since v1: - Referenced Coverity CID (Stafford) - Reword AHCI description (Zoltan) Philippe Mathieu-Daudé (3): hw/ide/ahci: Use qdev gpio rather than qemu_allocate_irqs() hw/mips/mips_int: Use qdev gpio rather than qemu_allocate_irqs() hw/openrisc/pic_cpu: Use qdev gpio rather than qemu_allocate_irqs() hw/ide/ahci.c | 6 ++ hw/mips/mips_int.c| 6 ++ hw/openrisc/pic_cpu.c | 5 ++--- 3 files changed, 6 insertions(+), 11 deletions(-) -- 2.21.3
[PATCH v2 3/3] hw/openrisc/pic_cpu: Use qdev gpio rather than qemu_allocate_irqs()
Coverity points out (CID 1421934) that we are leaking the memory returned by qemu_allocate_irqs(). We can avoid this leak by switching to using qdev_init_gpio_in(); the base class finalize will free the irqs that this allocates under the hood. Patch created mechanically using spatch with this script inspired from commit d6ef883d9d7: @@ typedef qemu_irq; identifier irqs, handler; expression opaque, count, i; @@ - qemu_irq *irqs; ... - irqs = qemu_allocate_irqs(handler, opaque, count); + qdev_init_gpio_in(DEVICE(opaque), handler, count); <+... - irqs[i] + qdev_get_gpio_in(DEVICE(opaque), i) ...+> ?- g_free(irqs); Fixes: Coverity CID 1421934 RESOURCE_LEAK Inspired-by: Peter Maydell Signed-off-by: Philippe Mathieu-Daudé --- hw/openrisc/pic_cpu.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/hw/openrisc/pic_cpu.c b/hw/openrisc/pic_cpu.c index 36f9350830..4b0c92f842 100644 --- a/hw/openrisc/pic_cpu.c +++ b/hw/openrisc/pic_cpu.c @@ -52,10 +52,9 @@ static void openrisc_pic_cpu_handler(void *opaque, int irq, int level) void cpu_openrisc_pic_init(OpenRISCCPU *cpu) { int i; -qemu_irq *qi; -qi = qemu_allocate_irqs(openrisc_pic_cpu_handler, cpu, NR_IRQS); +qdev_init_gpio_in(DEVICE(cpu), openrisc_pic_cpu_handler, NR_IRQS); for (i = 0; i < NR_IRQS; i++) { -cpu->env.irq[i] = qi[i]; +cpu->env.irq[i] = qdev_get_gpio_in(DEVICE(cpu), i); } } -- 2.21.3
[PATCH v2 2/3] hw/mips/mips_int: Use qdev gpio rather than qemu_allocate_irqs()
Switch to using the qdev gpio API which is preferred over qemu_allocate_irqs(). One step to eventually deprecate and remove qemu_allocate_irqs() one day. Patch created mechanically using spatch with this script inspired from commit d6ef883d9d7: @@ typedef qemu_irq; identifier irqs, handler; expression opaque, count, i; @@ - qemu_irq *irqs; ... - irqs = qemu_allocate_irqs(handler, opaque, count); + qdev_init_gpio_in(DEVICE(opaque), handler, count); <+... - irqs[i] + qdev_get_gpio_in(DEVICE(opaque), i) ...+> ?- g_free(irqs); Inspired-by: Peter Maydell Signed-off-by: Philippe Mathieu-Daudé --- hw/mips/mips_int.c | 6 ++ 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/hw/mips/mips_int.c b/hw/mips/mips_int.c index 796730b11d..91788c51a9 100644 --- a/hw/mips/mips_int.c +++ b/hw/mips/mips_int.c @@ -74,14 +74,12 @@ static void cpu_mips_irq_request(void *opaque, int irq, int level) void cpu_mips_irq_init_cpu(MIPSCPU *cpu) { CPUMIPSState *env = &cpu->env; -qemu_irq *qi; int i; -qi = qemu_allocate_irqs(cpu_mips_irq_request, cpu, 8); +qdev_init_gpio_in(DEVICE(cpu), cpu_mips_irq_request, 8); for (i = 0; i < 8; i++) { -env->irq[i] = qi[i]; +env->irq[i] = qdev_get_gpio_in(DEVICE(cpu), i); } -g_free(qi); } void cpu_mips_soft_irq(CPUMIPSState *env, int irq, int level) -- 2.21.3
Re: [PATCH 2/5] io/channel.c,io/channel-socket.c: Add yank feature
On Mon, May 11, 2020 at 10:00:42PM +0200, Lukas Straub wrote: > On Mon, 11 May 2020 12:51:46 +0100 > Daniel P. Berrangé wrote: > > > On Mon, May 11, 2020 at 01:14:41PM +0200, Lukas Straub wrote: > > > Add qio_channel_set_yank function to channel and to channel-socket, > > > which will register a yank function. The yank function calls > > > shutdown() on the socket. > > > > > > Signed-off-by: Lukas Straub > > > --- > > > Makefile.objs | 1 + > > > include/io/channel-socket.h | 1 + > > > include/io/channel.h| 12 > > > io/channel-socket.c | 29 + > > > io/channel.c| 9 + > > > 5 files changed, 52 insertions(+) > > > > Assuming we want the yank feature (which I'm not entirely convinced > > of), then I don't think any of this addition should exist. The > > QIOChannel class already provides a "qio_channel_shutdown" method > > which can be invoked. The layer above which is using the QIOChannel > > should be calling this existing qio_channel_shutdown method in > > response to any yank request. The I/O layer shouldn't have any > > direct dependancy on the yank feature. > > Having the code here simplifys the code in the other places. This introduces a depedancy from the IO channel code into the system emulator infra which is not desired. The goal is to keep the io/ module isolated and as self-contained as possible. I don't think it makes a significant difference to the yank code to keep it out of the io layer, and simply call the QIOChannel objects via their existing public API. Regards, Daniel -- |: https://berrange.com -o-https://www.flickr.com/photos/dberrange :| |: https://libvirt.org -o-https://fstop138.berrange.com :| |: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|
Re: [PATCH 3/5] block/nbd.c: Add yank feature
On Mon, May 11, 2020 at 07:05:24PM +0200, Lukas Straub wrote: > On Mon, 11 May 2020 17:19:09 +0100 > "Dr. David Alan Gilbert" wrote: > > > * Lukas Straub (lukasstra...@web.de) wrote: > > > Add yank option, pass it to the socket-channel and register a yank > > > function which sets s->state = NBD_CLIENT_QUIT. This is the same > > > behaviour as if an error occured. > > > > > > Signed-off-by: Lukas Straub > > > > > +static void nbd_yank(void *opaque) > > > +{ > > > +BlockDriverState *bs = opaque; > > > +BDRVNBDState *s = (BDRVNBDState *)bs->opaque; > > > + > > > +atomic_set(&s->state, NBD_CLIENT_QUIT); > > > > I think I was expecting a shutdown on the socket here - why doesn't it > > have one? > > For nbd, we register two yank functions: This one and we enable > the yank feature on the qio channel (see function > nbd_establish_connection below). As mentioned on the earlier patch, I don't want to see any yank code in the QIOChannel object directly. This nbd_yank function can simply call the qio_channel_shutdown() function directly and avoid need for modifying the QIOChannel object with yank support. This will make the NBD code clearer too, as we can see exactly what actions are performed at NBD layer when a yank happens, which is what David was not seeing clearly here. Regards, Daniel -- |: https://berrange.com -o-https://www.flickr.com/photos/dberrange :| |: https://libvirt.org -o-https://fstop138.berrange.com :| |: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|
Re: [PATCH v3 1/2] scripts/qemugdb: Remove shebang header
Am 12.05.2020 um 09:06 hat Philippe Mathieu-Daudé geschrieben: > These scripts are loaded as plugin by GDB (and they don't > have any __main__ entry point). Remove the shebang header. > > Acked-by: Alex Bennée > Signed-off-by: Philippe Mathieu-Daudé > --- > scripts/qemugdb/__init__.py | 3 +-- > scripts/qemugdb/aio.py | 3 +-- > scripts/qemugdb/coroutine.py | 3 +-- > scripts/qemugdb/mtree.py | 4 +--- > scripts/qemugdb/tcg.py | 1 - There is still a shebang line left in scripts/qemugdb/timers.py. Kevin
Re: [PATCH 4/5] chardev/char-socket.c: Add yank feature
On Mon, May 11, 2020 at 01:14:47PM +0200, Lukas Straub wrote: > Add yank option which is passed to the socket-channel. > > Signed-off-by: Lukas Straub > --- > chardev/char-socket.c | 8 > chardev/char.c| 3 +++ > qapi/char.json| 5 - > 3 files changed, 15 insertions(+), 1 deletion(-) > > diff --git a/chardev/char-socket.c b/chardev/char-socket.c > index 185fe38dda..e476358941 100644 > --- a/chardev/char-socket.c > +++ b/chardev/char-socket.c > @@ -65,6 +65,7 @@ typedef struct { > int max_size; > int do_telnetopt; > int do_nodelay; > +int do_yank; > int *read_msgfds; > size_t read_msgfds_num; > int *write_msgfds; > @@ -877,6 +878,9 @@ static int tcp_chr_new_client(Chardev *chr, > QIOChannelSocket *sioc) > if (s->do_nodelay) { > qio_channel_set_delay(s->ioc, false); > } > +if (s->do_yank) { > +qio_channel_set_yank(s->ioc, true); > +} This should call yank_register_function() to register a local callback, which then invokes qio_channel_shutdown() upon acting, avoiding the need to add this qio_channel_set_yank method. Likewise for the migration code in the next patch. Regards, Daniel -- |: https://berrange.com -o-https://www.flickr.com/photos/dberrange :| |: https://libvirt.org -o-https://fstop138.berrange.com :| |: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|
Re: [PATCH v3 2/2] scripts/qmp: Use Python 3 interpreter
Am 12.05.2020 um 09:06 hat Philippe Mathieu-Daudé geschrieben: > Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Kevin Wolf
Re: [PATCH 0/5] Introduce 'yank' oob qmp command to recover from hanging qemu
On Mon, May 11, 2020 at 08:12:18PM +0200, Lukas Straub wrote: > On Mon, 11 May 2020 12:49:47 +0100 > Daniel P. Berrangé wrote: > > > On Mon, May 11, 2020 at 01:14:34PM +0200, Lukas Straub wrote: > > > Hello Everyone, > > > In many cases, if qemu has a network connection (qmp, migration, chardev, > > > etc.) > > > to some other server and that server dies or hangs, qemu hangs too. > > > > If qemu as a whole hangs due to a stalled network connection, that is a > > bug in QEMU that we should be fixing IMHO. QEMU should be doing non-blocking > > I/O in general, such that if the network connection or remote server stalls, > > we simply stop sending I/O - we shouldn't ever hang the QEMU process or main > > loop. > > > > There are places in QEMU code which are not well behaved in this respect, > > but many are, and others are getting fixed where found to be important. > > > > Arguably any place in QEMU code which can result in a hang of QEMU in the > > event of a stalled network should be considered a security flaw, because > > the network is untrusted in general. > > The fact that out-of-band qmp commands exist at all shows that we have to > make tradeoffs of developer time vs. doing things right. Sure, the migration > code can be rewritten to use non-blocking i/o and finegrained locks. But as a > hobbyist I don't have time to fix this. > > > > These patches introduce the new 'yank' out-of-band qmp command to recover > > > from > > > these kinds of hangs. The different subsystems register callbacks which > > > get > > > executed with the yank command. For example the callback can shutdown() a > > > socket. This is intended for the colo use-case, but it can be used for > > > other > > > things too of course. > > > > IIUC, invoking the "yank" command unconditionally kills every single > > network connection in QEMU that has registered with the "yank" subsystem. 
> > IMHO this is way too big of a hammer, even if we accept there are bugs in > > QEMU not handling stalled networking well. > > > > eg if a chardev hangs QEMU, and we tear down everything, killing the NBD > > connection used for the guest disk, we needlessly break I/O. > > Yeah, these patches are intended to solve the problems with the colo > use-case where all external connections (migration, chardevs, nbd) > are just for replication. In other use-cases you'd enable the yank > feature only on the non-essential connections. That is a pretty inflexible design for other use cases though, as "non-essential" is not a black & white list in general. There are varying levels of importance to the different channels. We can afford to loose migration without any user visible effects. If that doesn't solve it, a serial device chardev, or VNC connection can be dropped at the inconvenience of loosing interactive console which is end user visible impact, so may only be want to be yanked if the migration yank didn't fix it. Regards, Daniel -- |: https://berrange.com -o-https://www.flickr.com/photos/dberrange :| |: https://libvirt.org -o-https://fstop138.berrange.com :| |: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|
Re: [PATCH 0/5] Introduce 'yank' oob qmp command to recover from hanging qemu
* Daniel P. Berrangé (berra...@redhat.com) wrote: > On Mon, May 11, 2020 at 08:12:18PM +0200, Lukas Straub wrote: > > On Mon, 11 May 2020 12:49:47 +0100 > > Daniel P. Berrangé wrote: > > > > > On Mon, May 11, 2020 at 01:14:34PM +0200, Lukas Straub wrote: > > > > Hello Everyone, > > > > In many cases, if qemu has a network connection (qmp, migration, > > > > chardev, etc.) > > > > to some other server and that server dies or hangs, qemu hangs too. > > > > > > If qemu as a whole hangs due to a stalled network connection, that is a > > > bug in QEMU that we should be fixing IMHO. QEMU should be doing > > > non-blocking > > > I/O in general, such that if the network connection or remote server > > > stalls, > > > we simply stop sending I/O - we shouldn't ever hang the QEMU process or > > > main > > > loop. > > > > > > There are places in QEMU code which are not well behaved in this respect, > > > but many are, and others are getting fixed where found to be important. > > > > > > Arguably any place in QEMU code which can result in a hang of QEMU in the > > > event of a stalled network should be considered a security flaw, because > > > the network is untrusted in general. > > > > The fact that out-of-band qmp commands exist at all shows that we have to > > make tradeoffs of developer time vs. doing things right. Sure, the > > migration code can be rewritten to use non-blocking i/o and finegrained > > locks. But as a hobbyist I don't have time to fix this. > > > > > > These patches introduce the new 'yank' out-of-band qmp command to > > > > recover from > > > > these kinds of hangs. The different subsystems register callbacks which > > > > get > > > > executed with the yank command. For example the callback can shutdown() > > > > a > > > > socket. This is intended for the colo use-case, but it can be used for > > > > other > > > > things too of course. 
> > > > > > IIUC, invoking the "yank" command unconditionally kills every single > > > network connection in QEMU that has registered with the "yank" subsystem. > > > IMHO this is way too big of a hammer, even if we accept there are bugs in > > > QEMU not handling stalled networking well. > > > > > > eg if a chardev hangs QEMU, and we tear down everything, killing the NBD > > > connection used for the guest disk, we needlessly break I/O. > > > > Yeah, these patches are intended to solve the problems with the colo > > use-case where all external connections (migration, chardevs, nbd) > > are just for replication. In other use-cases you'd enable the yank > > feature only on the non-essential connections. > > That is a pretty inflexible design for other use cases though, > as "non-essential" is not a black & white list in general. There > are varying levels of importance to the different channels. We > can afford to loose migration without any user visible effects. > If that doesn't solve it, a serial device chardev, or VNC connection > can be dropped at the inconvenience of loosing interactive console > which is end user visible impact, so may only be want to be yanked > if the migration yank didn't fix it. In the case of COLO that's not the case though - here we explicitly want to kill the migration to be able to ensure that we can recover - and we're under time pressure to get the other member of the pair running again. Dave > Regards, > Daniel > -- > |: https://berrange.com -o-https://www.flickr.com/photos/dberrange :| > |: https://libvirt.org -o-https://fstop138.berrange.com :| > |: https://entangle-photo.org-o-https://www.instagram.com/dberrange :| -- Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
Re: [PATCH v2 4/5] vhost: check vring address before calling unmap
On Tue, May 12, 2020 at 11:26:11AM +0800, Jason Wang wrote: > > On 2020/5/11 下午5:11, Dima Stepanov wrote: > >On Mon, May 11, 2020 at 11:05:58AM +0800, Jason Wang wrote: > >>On 2020/4/30 下午9:36, Dima Stepanov wrote: > >>>Since disconnect can happen at any time during initialization not all > >>>vring buffers (for instance used vring) can be intialized successfully. > >>>If the buffer was not initialized then vhost_memory_unmap call will lead > >>>to SIGSEGV. Add checks for the vring address value before calling unmap. > >>>Also add assert() in the vhost_memory_unmap() routine. > >>> > >>>Signed-off-by: Dima Stepanov > >>>--- > >>> hw/virtio/vhost.c | 27 +-- > >>> 1 file changed, 21 insertions(+), 6 deletions(-) > >>> > >>>diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c > >>>index ddbdc53..3ee50c4 100644 > >>>--- a/hw/virtio/vhost.c > >>>+++ b/hw/virtio/vhost.c > >>>@@ -314,6 +314,8 @@ static void vhost_memory_unmap(struct vhost_dev *dev, > >>>void *buffer, > >>> hwaddr len, int is_write, > >>> hwaddr access_len) > >>> { > >>>+assert(buffer); > >>>+ > >>> if (!vhost_dev_has_iommu(dev)) { > >>> cpu_physical_memory_unmap(buffer, len, is_write, access_len); > >>> } > >>>@@ -1132,12 +1134,25 @@ static void vhost_virtqueue_stop(struct vhost_dev > >>>*dev, > >>> vhost_vq_index); > >>> } > >>>-vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, > >>>idx), > >>>- 1, virtio_queue_get_used_size(vdev, idx)); > >>>-vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, > >>>idx), > >>>- 0, virtio_queue_get_avail_size(vdev, idx)); > >>>-vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, > >>>idx), > >>>- 0, virtio_queue_get_desc_size(vdev, idx)); > >>>+/* > >>>+ * Since the vhost-user disconnect can happen during initialization > >>>+ * check if vring was initialized, before making unmap. 
> >>>+ */ > >>>+if (vq->used) { > >>>+vhost_memory_unmap(dev, vq->used, > >>>+ virtio_queue_get_used_size(vdev, idx), > >>>+ 1, virtio_queue_get_used_size(vdev, idx)); > >>>+} > >>>+if (vq->avail) { > >>>+vhost_memory_unmap(dev, vq->avail, > >>>+ virtio_queue_get_avail_size(vdev, idx), > >>>+ 0, virtio_queue_get_avail_size(vdev, idx)); > >>>+} > >>>+if (vq->desc) { > >>>+vhost_memory_unmap(dev, vq->desc, > >>>+ virtio_queue_get_desc_size(vdev, idx), > >>>+ 0, virtio_queue_get_desc_size(vdev, idx)); > >>>+} > >> > >>Any reason not checking hdev->started instead? vhost_dev_start() will set it > >>to true if virtqueues were correctly mapped. > >> > >>Thanks > >Well i see it a little bit different: > > - vhost_dev_start() sets hdev->started to true before starting > >virtqueues > > - vhost_virtqueue_start() maps all the memory > >If we hit the vhost disconnect at the start of the > >vhost_virtqueue_start(), for instance for this call: > > r = dev->vhost_ops->vhost_set_vring_base(dev, &state); > >Then we will call vhost_user_blk_disconnect: > > vhost_user_blk_disconnect()-> > > vhost_user_blk_stop()-> > > vhost_dev_stop()-> > > vhost_virtqueue_stop() > >As a result we will come in this routine with the hdev->started still > >set to true, but if used/avail/desc fields still uninitialized and set > >to 0. > > > I may miss something, but consider both vhost_dev_start() and > vhost_user_blk_disconnect() were serialized in main loop. Can this really > happen? 
Yes, consider the case when we start the vhost-user-blk device: vhost_dev_start-> vhost_virtqueue_start And we got a disconnect in the middle of vhost_virtqueue_start() routine, for instance: 1000 vq->num = state.num = virtio_queue_get_num(vdev, idx); 1001 r = dev->vhost_ops->vhost_set_vring_num(dev, &state); 1002 if (r) { 1003 VHOST_OPS_DEBUG("vhost_set_vring_num failed"); 1004 return -errno; 1005 } --> Here we got a disconnect <-- 1006 1007 state.num = virtio_queue_get_last_avail_idx(vdev, idx); 1008 r = dev->vhost_ops->vhost_set_vring_base(dev, &state); 1009 if (r) { 1010 VHOST_OPS_DEBUG("vhost_set_vring_base failed"); 1011 return -errno; 1012 } As a result call to vhost_set_vring_base will call the disconnect routine. The backtrace log for SIGSEGV is as follows: Thread 4 "qemu-system-x86" received signal SIGSEGV, Segmentation fault. [Switching to Thread 0x72ea9700 (LWP 183150)] 0x74d60840 in ?? () from /lib/x86_64-linux-gnu/libc.so.6 (gdb) bt #0 0x74d60840 in ?? () from /lib/x86_64-linux-gnu/libc.so.6 #1 0x55
Re: [PATCH 0/5] Introduce 'yank' oob qmp command to recover from hanging qemu
* Lukas Straub (lukasstra...@web.de) wrote: > On Mon, 11 May 2020 12:49:47 +0100 > Daniel P. Berrangé wrote: > > > On Mon, May 11, 2020 at 01:14:34PM +0200, Lukas Straub wrote: > > > Hello Everyone, > > > In many cases, if qemu has a network connection (qmp, migration, chardev, > > > etc.) > > > to some other server and that server dies or hangs, qemu hangs too. > > > > If qemu as a whole hangs due to a stalled network connection, that is a > > bug in QEMU that we should be fixing IMHO. QEMU should be doing non-blocking > > I/O in general, such that if the network connection or remote server stalls, > > we simply stop sending I/O - we shouldn't ever hang the QEMU process or main > > loop. > > > > There are places in QEMU code which are not well behaved in this respect, > > but many are, and others are getting fixed where found to be important. > > > > Arguably any place in QEMU code which can result in a hang of QEMU in the > > event of a stalled network should be considered a security flaw, because > > the network is untrusted in general. > > The fact that out-of-band qmp commands exist at all shows that we have to > make tradeoffs of developer time vs. doing things right. Sure, the migration > code can be rewritten to use non-blocking i/o and finegrained locks. But as a > hobbyist I don't have time to fix this. If it was just an hobbyist vs fulltime thing then I'd say that was a bad way to make the decision; however the reality is that even those who are paid to watch this code don't have the feeling we can make it fail quickly/non-blocking - and for COLO you need to be absolutely sure you nail every case, so you'd some how have to audit the whole lot, and keep watching it. (and thank you for taking your time to do this!) Dave > > > These patches introduce the new 'yank' out-of-band qmp command to recover > > > from > > > these kinds of hangs. The different subsystems register callbacks which > > > get > > > executed with the yank command. 
For example the callback can shutdown() a > > > socket. This is intended for the colo use-case, but it can be used for > > > other > > > things too of course. > > > > IIUC, invoking the "yank" command unconditionally kills every single > > network connection in QEMU that has registered with the "yank" subsystem. > > IMHO this is way too big of a hammer, even if we accept there are bugs in > > QEMU not handling stalled networking well. > > > > eg if a chardev hangs QEMU, and we tear down everything, killing the NBD > > connection used for the guest disk, we needlessly break I/O. > > Yeah, these patches are intended to solve the problems with the colo use-case > where all external connections (migration, chardevs, nbd) are just for > replication. In other use-cases you'd enable the yank feature only on the > non-essential connections. > > > eg doing this in the chardev backend is not desirable, because the bugs > > with hanging QEMU are typically caused by the way the frontend device > > uses the chardev blocking I/O calls, instead of non-blocking I/O calls. > > > > > > Regards, > > Daniel > -- Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
Re: [PATCH v2 5/5] vhost: add device started check in migration set log
On Tue, May 12, 2020 at 11:47:34AM +0800, Li Feng wrote: > Hi, Dima. > > If vhost_migration_log return < 0, then vhost_log_global_start will > trigger a crash. > Does your patch have process this abort? > If a disconnect happens in the migration stage, the correct operation > is to stop the migration, right? > > 841 static void vhost_log_global_start(MemoryListener *listener) > 842 { > 843 int r; > 844 > 845 r = vhost_migration_log(listener, true); > 846 if (r < 0) { > 847 abort(); > 848 } > 849 } Yes, my patch process it by not returning an error ). That is one of the point we've talked about with Raphael and Michael in this thread. First of all in my patches i'm still following the same logic which has been already in upstream ./hw/virtio/vhost.c:vhost_migration_log(): ... 820 if (!dev->started) { 821 dev->log_enabled = enable; 822 return 0; 823 } ... It means, that if device not started, then continue migration without returning any error. So i followed the same logic, if we got a disconnect, then it will mean that device isn't started and we can continue migration. As a result no error is returned and assert() isn't hit. Also there is a question from Raphael to Michael about it you can find it in this thread, by i will add it also: > Subject: Re: [PATCH v2 5/5] vhost: add device started check in > migration set log > On Wed, May 06, 2020 at 06:08:34PM -0400, Raphael Norwitz wrote: >> In particular, we need to decide whether a migration should be >> allowed to continue if a device disconnects durning the migration >> stage. >> >> mst, any thoughts? > Why not? It can't change state while disconnected, so it just makes > things easier. So it looks like a correct way to handle it. Also our internal tests passed. 
Some words about our tests: - run src VM with vhost-usr-blk daemon used - run fio inside it - perform reconnect every X seconds (just kill and restart daemon), X is random - run dst VM - perform migration - fio should complete in dst VM And we cycle this test like forever. At least for now we see no new issues. No other comments mixed in below. > > Thanks, > > Feng Li > > Jason Wang 于2020年5月12日周二 上午11:33写道: > > > > > > On 2020/5/11 下午5:25, Dima Stepanov wrote: > > > On Mon, May 11, 2020 at 11:15:53AM +0800, Jason Wang wrote: > > >> On 2020/4/30 下午9:36, Dima Stepanov wrote: > > >>> If vhost-user daemon is used as a backend for the vhost device, then we > > >>> should consider a possibility of disconnect at any moment. If such > > >>> disconnect happened in the vhost_migration_log() routine the vhost > > >>> device structure will be clean up. > > >>> At the start of the vhost_migration_log() function there is a check: > > >>>if (!dev->started) { > > >>>dev->log_enabled = enable; > > >>>return 0; > > >>>} > > >>> To be consistent with this check add the same check after calling the > > >>> vhost_dev_set_log() routine. This in general help not to break a > > >>> migration due the assert() message. But it looks like that this code > > >>> should be revised to handle these errors more carefully. > > >>> > > >>> In case of vhost-user device backend the fail paths should consider the > > >>> state of the device. In this case we should skip some function calls > > >>> during rollback on the error paths, so not to get the NULL dereference > > >>> errors. 
> > >>> > > >>> Signed-off-by: Dima Stepanov > > >>> --- > > >>> hw/virtio/vhost.c | 39 +++ > > >>> 1 file changed, 35 insertions(+), 4 deletions(-) > > >>> > > >>> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c > > >>> index 3ee50c4..d5ab96d 100644 > > >>> --- a/hw/virtio/vhost.c > > >>> +++ b/hw/virtio/vhost.c > > >>> @@ -787,6 +787,17 @@ static int vhost_dev_set_features(struct vhost_dev > > >>> *dev, > > >>> static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log) > > >>> { > > >>> int r, i, idx; > > >>> + > > >>> +if (!dev->started) { > > >>> +/* > > >>> + * If vhost-user daemon is used as a backend for the > > >>> + * device and the connection is broken, then the vhost_dev > > >>> + * structure will be reset all its values to 0. > > >>> + * Add additional check for the device state. > > >>> + */ > > >>> +return -1; > > >>> +} > > >>> + > > >>> r = vhost_dev_set_features(dev, enable_log); > > >>> if (r < 0) { > > >>> goto err_features; > > >>> @@ -801,12 +812,19 @@ static int vhost_dev_set_log(struct vhost_dev > > >>> *dev, bool enable_log) > > >>> } > > >>> return 0; > > >>> err_vq: > > >>> -for (; i >= 0; --i) { > > >>> +/* > > >>> + * Disconnect with the vhost-user daemon can lead to the > > >>> + * vhost_dev_cleanup() call which will clean up vhost_dev > > >>> + * structure. > > >>> + */ > > >>> +for (; dev->started &&
[PATCH] qemu-nbd: Close inherited stderr
Hello, after e6df58a5, the inherited stderr 'old_stderr' won't get closed anymore if 'fork_process' is false. This causes other processes relying on EOF to infinitely block or crash. From 47ab9b517038d13117876a8bb3ef45c53d7f2f9e Mon Sep 17 00:00:00 2001 From: "Raphael Pour" Date: Tue, 12 May 2020 10:18:44 +0200 Subject: [PATCH] qemu-nbd: Close inherited stderr Close inherited stderr of the parent if fork_process is false. Otherwise no one will close it. (introduced by e6df58a5) Signed-off-by: Raphael Pour --- qemu-nbd.c | 9 - 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/qemu-nbd.c b/qemu-nbd.c index 108a51f7e..f2981e18a 100644 --- a/qemu-nbd.c +++ b/qemu-nbd.c @@ -1032,8 +1032,15 @@ int main(int argc, char **argv) exit(EXIT_FAILURE); } -/* ... close the descriptor we inherited and go on. */ +/* ... close the descriptor we inherited and ... */ close(stderr_fd[1]); + +/* ... also close the old_stderr IF fork_process is false otherwise + * it will never get closed. + */ +if (!fork_process) { + close(old_stderr); +} } else { bool errors = false; char *buf; -- 2.25.4 -- Hetzner Online GmbH Am Datacenter-Park 1 08223 Falkenstein/Vogtland raphael.p...@hetzner.com www.hetzner.com Registergericht Ansbach, HRB 6089 Geschäftsführer: Martin Hetzner, Stephan Konvickova, Günther Müller signature.asc Description: OpenPGP digital signature
Re: [PATCH 0/5] Introduce 'yank' oob qmp command to recover from hanging qemu
On Mon, 11 May 2020 16:46:45 +0100 "Dr. David Alan Gilbert" wrote: > * Daniel P. Berrangé (berra...@redhat.com) wrote: > > ... > > That way if QEMU does get stuck, you can start by tearing down the > > least distruptive channel. eg try tearing down the migration connection > > first (which shouldn't negatively impact the guest), and only if that > > doesn't work then, move on to tear down the NBD connection (which risks > > data loss) > > I wonder if a different way would be to make all network connections > register with yank, but then make yank take a list of connections to > shutdown(2). Good Idea. We could name the connections (/yank callbacks) in the form "nbd:", "chardev:" and "migration" (and add "netdev:...", etc. in the future). Then make yank take a list of connection names as you suggest and silently ignore connections that don't exist. And maybe even add a 'query-yank' oob command returning a list of registered connections so the management application can do pattern matching if it wants. Comments? Regards, Lukas Straub > Dave > > > Regards, > > Daniel > > -- > > |: https://berrange.com -o-https://www.flickr.com/photos/dberrange > > :| > > |: https://libvirt.org -o-https://fstop138.berrange.com > > :| > > |: https://entangle-photo.org-o-https://www.instagram.com/dberrange > > :| > -- > Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK > pgpYTKir4lhNS.pgp Description: OpenPGP digital signature
Re: [PATCH v2 5/5] vhost: add device started check in migration set log
On Tue, May 12, 2020 at 11:32:50AM +0800, Jason Wang wrote: > > On 2020/5/11 下午5:25, Dima Stepanov wrote: > >On Mon, May 11, 2020 at 11:15:53AM +0800, Jason Wang wrote: > >>On 2020/4/30 下午9:36, Dima Stepanov wrote: > >>>If vhost-user daemon is used as a backend for the vhost device, then we > >>>should consider a possibility of disconnect at any moment. If such > >>>disconnect happened in the vhost_migration_log() routine the vhost > >>>device structure will be clean up. > >>>At the start of the vhost_migration_log() function there is a check: > >>> if (!dev->started) { > >>> dev->log_enabled = enable; > >>> return 0; > >>> } > >>>To be consistent with this check add the same check after calling the > >>>vhost_dev_set_log() routine. This in general help not to break a > >>>migration due the assert() message. But it looks like that this code > >>>should be revised to handle these errors more carefully. > >>> > >>>In case of vhost-user device backend the fail paths should consider the > >>>state of the device. In this case we should skip some function calls > >>>during rollback on the error paths, so not to get the NULL dereference > >>>errors. > >>> > >>>Signed-off-by: Dima Stepanov > >>>--- > >>> hw/virtio/vhost.c | 39 +++ > >>> 1 file changed, 35 insertions(+), 4 deletions(-) > >>> > >>>diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c > >>>index 3ee50c4..d5ab96d 100644 > >>>--- a/hw/virtio/vhost.c > >>>+++ b/hw/virtio/vhost.c > >>>@@ -787,6 +787,17 @@ static int vhost_dev_set_features(struct vhost_dev > >>>*dev, > >>> static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log) > >>> { > >>> int r, i, idx; > >>>+ > >>>+if (!dev->started) { > >>>+/* > >>>+ * If vhost-user daemon is used as a backend for the > >>>+ * device and the connection is broken, then the vhost_dev > >>>+ * structure will be reset all its values to 0. > >>>+ * Add additional check for the device state. 
> >>>+ */ > >>>+return -1; > >>>+} > >>>+ > >>> r = vhost_dev_set_features(dev, enable_log); > >>> if (r < 0) { > >>> goto err_features; > >>>@@ -801,12 +812,19 @@ static int vhost_dev_set_log(struct vhost_dev *dev, > >>>bool enable_log) > >>> } > >>> return 0; > >>> err_vq: > >>>-for (; i >= 0; --i) { > >>>+/* > >>>+ * Disconnect with the vhost-user daemon can lead to the > >>>+ * vhost_dev_cleanup() call which will clean up vhost_dev > >>>+ * structure. > >>>+ */ > >>>+for (; dev->started && (i >= 0); --i) { > >>> idx = dev->vhost_ops->vhost_get_vq_index( > >> > >>Why need the check of dev->started here, can started be modified outside > >>mainloop? If yes, I don't get the check of !dev->started in the beginning of > >>this function. > >> > >No dev->started can't change outside the mainloop. The main problem is > >only for the vhost_user_blk daemon. Consider the case when we > >successfully pass the dev->started check at the beginning of the > >function, but after it we hit the disconnect on the next call on the > >second or third iteration: > > r = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx, enable_log); > >The unix socket backend device will call the disconnect routine for this > >device and reset the structure. So the structure will be reset (and > >dev->started set to false) inside this set_addr() call. > > > I still don't get here. I think the disconnect can not happen in the middle > of vhost_dev_set_log() since both of them were running in mainloop. And even > if it can, we probably need other synchronization mechanism other than > simple check here. Disconnect isn't happened in the separate thread it is happened in this routine inside vhost_dev_set_log. 
When for instance vhost_user_write() call failed: vhost_user_set_log_base() vhost_user_write() vhost_user_blk_disconnect() vhost_dev_cleanup() vhost_user_backend_cleanup() So the point is that if we somehow got a disconnect with the vhost-user-blk daemon before the vhost_user_write() call then it will continue clean up by running vhost_user_blk_disconnect() function. I wrote a more detailed backtrace stack in the separate thread, which is pretty similar to what we have here: Re: [PATCH v2 4/5] vhost: check vring address before calling unmap The places are different but the problem is pretty similar. So if vhost-user commands handshake then everything is fine and reconnect will work as expected. The only problem is how to handle reconnect properly between vhost-user command send/receive. As i wrote we have a test: - run src VM with vhost-usr-blk daemon used - run fio inside it - perform reconnect every X seconds (just kill and restart daemon), X is random - run dst VM - perform migration - fio should complete in dst VM And we cycle this test like forever. So it fails once per ~25 iteration. By adding some delays in
Re: [PATCH v2 1/5] qemu-img: remove check that cvtnum value > MAX_INT
On 07/05/2020 0:49, Eric Blake wrote: On 5/6/20 4:34 PM, Eyal Moscovici wrote: Following commit f46bfdbfc8f95cf65d7818ef68a801e063c40332 (util/cutils: Change qemu_strtosz*() from int64_t to uint64_t) which added a similar check to cvtnum. As a result there is no need to check it separately outside of cvtnum. Acked-by: Mark Kanda Signed-off-by: Eyal Moscovici --- qemu-img.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/qemu-img.c b/qemu-img.c index 6a4327aaba..116a9c6349 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -4307,7 +4307,7 @@ static int img_bench(int argc, char **argv) int64_t sval; sval = cvtnum(optarg); - if (sval < 0 || sval > INT_MAX) { + if (sval < 0) { error_report("Invalid buffer size specified"); INT_MAX is smaller than cvtnum's check for INT64_MAX. This code change allows larger buffer sizes, which is probably not a good idea. I was the most hesitant about this patch because of the size difference. I decided to submit it because the type is int64 which pairs better with the MAX_INT64 check and I couldn't find a concrete reason to cap the variable at MAX_INT. Do you have a concrete reason? Because the max size should really come into effect on very fringe cases and if you are asking for a really big buffer you should know the risks. return 1; } @@ -4320,7 +4320,7 @@ static int img_bench(int argc, char **argv) int64_t sval; sval = cvtnum(optarg); - if (sval < 0 || sval > INT_MAX) { + if (sval < 0) { error_report("Invalid step size specified"); return 1; } @@ -4493,7 +4493,7 @@ static int img_dd_bs(const char *arg, res = cvtnum(arg); - if (res <= 0 || res > INT_MAX) { + if (res <= 0) { error_report("invalid number: '%s'", arg); return 1; } NACK.
Re: [PATCH 0/5] Introduce 'yank' oob qmp command to recover from hanging qemu
On Tue, May 12, 2020 at 11:32:06AM +0200, Lukas Straub wrote: > On Mon, 11 May 2020 16:46:45 +0100 > "Dr. David Alan Gilbert" wrote: > > > * Daniel P. Berrangé (berra...@redhat.com) wrote: > > > ... > > > That way if QEMU does get stuck, you can start by tearing down the > > > least distruptive channel. eg try tearing down the migration connection > > > first (which shouldn't negatively impact the guest), and only if that > > > doesn't work then, move on to tear down the NBD connection (which risks > > > data loss) > > > > I wonder if a different way would be to make all network connections > > register with yank, but then make yank take a list of connections to > > shutdown(2). > > Good Idea. We could name the connections (/yank callbacks) in the > form "nbd:", "chardev:" and "migration" > (and add "netdev:...", etc. in the future). Then make yank take a > list of connection names as you suggest and silently ignore connections > that don't exist. And maybe even add a 'query-yank' oob command returning > a list of registered connections so the management application can do > pattern matching if it wants. Yes, that would make the yank command much more flexible in how it can be used. As an alternative to using formatted strings like this, it could be modelled more explicitly in QAPI { 'struct': 'YankChannels', 'data': { 'chardev': [ 'string' ], 'nbd': ['string'], 'migration': bool } } In this example, 'chardev' would accept a list of chardev IDs which have it enabled, 'nbd' would accept a list of block node IDs which have it enabled, and migration is a singleton on/off. The benefit of this modelling is that you can introspect QEMU to discover what classes of channels support being yanked by this QEMU build, as well as what instances are configured to be yanked. ie you can distinguish between a QEMU that doesn't support yanking network devices, from a QEMU that does support yanking network devices, but doesn't have it enabled for any network device instances. 
Regards, Daniel -- |: https://berrange.com -o-https://www.flickr.com/photos/dberrange :| |: https://libvirt.org -o-https://fstop138.berrange.com :| |: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|
Re: [PATCH v2 2/5] qemu_img: add error report to cvtnum
On 07/05/2020 0:59, Eric Blake wrote: On 5/6/20 4:34 PM, Eyal Moscovici wrote: All calls to cvtnum check the return value and print the same error message more or less. And so error reporting moved to cvtnum to reduce duplicate code and provide a single error message. Acked-by: Mark Kanda Signed-off-by: Eyal Moscovici --- qemu-img.c | 63 -- tests/qemu-iotests/049.out | 4 +-- 2 files changed, 28 insertions(+), 39 deletions(-) - err = qemu_strtosz(s, NULL, &value); - if (err < 0) { + err = qemu_strtosz(arg_value, NULL, &value); + if (err < 0 && err != -ERANGE) { + error_report("Invalid %s specified! You may use " + "k, M, G, T, P or E suffixes for ", arg_name); + error_report("kilobytes, megabytes, gigabytes, terabytes, " + "petabytes and exabytes."); return err; } - if (value > INT64_MAX) { + if (err == -ERANGE || value > INT64_MAX) { + error_report("Invalid %s specified! Must be less than 8 EiB!", Copied from our pre-existing errors, but why are we shouting at our user? This would be a good time to s/!/./ to tone it down a bit. Sure, will fix. @@ -4491,10 +4488,12 @@ static int img_dd_bs(const char *arg, { int64_t res; - res = cvtnum(arg); + res = cvtnum("bs", arg); - if (res <= 0) { - error_report("invalid number: '%s'", arg); + if (res < 0) { + return 1; + } else if (res == 0) { + error_report("Invalid bs specified! It cannot be 0."); Maybe it's worth two functions: int64_t cvtnum_full(const char *name, const char *value, int64_t min, int64_t max) and then a common helper: int64_t cvtnum(const char *name, const char *value) { return cvtnum_full(name, value, 0, INT64_MAX); } many existing callers remain with cvtnum(), but callers like this could be cvtnum("bs", arg, 1, INT64_MAX). You'd still have to special-case other restrictions, such as whether a number must a power-of-2, but that's fewer places. Great idea, I will create two functions. 
+++ b/tests/qemu-iotests/049.out @@ -92,13 +92,13 @@ Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 size=1649267441664 cluster_size=65536 l == 3. Invalid sizes == qemu-img create -f qcow2 TEST_DIR/t.qcow2 -- -1024 -qemu-img: Image size must be less than 8 EiB! +qemu-img: Invalid image size specified! Must be less than 8 EiB! Nice that you checked for iotest fallout. Is this really the only impacted test? Thanks, yes.
Re: [PATCH v2 5/5] qemu-img: Add --start-offset and --max-length to map
On 07/05/2020 1:04, Eric Blake wrote: On 5/6/20 4:34 PM, Eyal Moscovici wrote: The mapping operation of large disks especially ones stored over a long chain of QCOW2 files can take a long time to finish. Additionally when mapping fails there was no way recover by restarting the mapping from the failed location. The new options, --start-offset and --max-length allows the user to divide these type of map operations into shorter independent tasks. Reviewed-by: Eric Blake This patch has some changes from v1. Among others,... @@ -3041,6 +3045,18 @@ static int img_map(int argc, char **argv) case OPTION_OUTPUT: output = optarg; break; + case 's': + start_offset = cvtnum("start offset", optarg); + if (start_offset < 0) { + return 1; + } + break; the new semantics of cvtnum() in this series is enough of a difference that I would have removed R-b to make sure the updated patch gets re-reviewed, if it had been me as author. But in this case, it does look like the changes are all addressed to comments I suggested in v1, so I'm fine that you left my R-b. Ok, got it.
Re: [PATCH v3 7/9] qcow2: Expose bitmaps' size during measure
On Fri, May 8, 2020 at 9:03 PM Eric Blake wrote: > > It's useful to know how much space can be occupied by qcow2 persistent > bitmaps, even though such metadata is unrelated to the guest-visible > data. Report this value as an additional field, present when > measuring an existing image and the output format supports bitmaps. > Update iotest 178 and 190 to updated output, as well as new coverage > in 190 demonstrating non-zero values made possible with the > recently-added qemu-img bitmap command. > > The addition of a new field demonstrates why we should always > zero-initialize qapi C structs; while the qcow2 driver still fully > populates all fields, the raw and crypto drivers had to be tweaked to > avoid uninitialized data. > > See also: https://bugzilla.redhat.com/1779904 > > Reported-by: Nir Soffer > Signed-off-by: Eric Blake > --- > qapi/block-core.json | 15 +++ > block/crypto.c | 2 +- > block/qcow2.c| 37 --- > block/raw-format.c | 2 +- > qemu-img.c | 3 +++ > tests/qemu-iotests/178.out.qcow2 | 16 > tests/qemu-iotests/190 | 43 ++-- > tests/qemu-iotests/190.out | 23 - > 8 files changed, 128 insertions(+), 13 deletions(-) > > diff --git a/qapi/block-core.json b/qapi/block-core.json > index 943df1926a91..9a7a388c7ad3 100644 > --- a/qapi/block-core.json > +++ b/qapi/block-core.json > @@ -633,18 +633,23 @@ > # efficiently so file size may be smaller than virtual disk size. > # > # The values are upper bounds that are guaranteed to fit the new image file. > -# Subsequent modification, such as internal snapshot or bitmap creation, may > -# require additional space and is not covered here. > +# Subsequent modification, such as internal snapshot or further bitmap > +# creation, may require additional space and is not covered here. > # > -# @required: Size required for a new image file, in bytes. > +# @required: Size required for a new image file, in bytes, when copying just > +#guest-visible contents. 
> # > # @fully-allocated: Image file size, in bytes, once data has been written > -# to all sectors. > +# to all sectors, when copying just guest-visible contents. This does not break old code since previously we always reported only guest visible content here, but it changes the semantics, and now you cannot allocate "required" size, you need to allocate "required" size with "bitmaps" size. If we add a new extension all users will have to change the calculation again. I think keeping the semantics that "required" and "fully-allocated" are the size you need based on the parameters of the command is easier to use and more consistent. Current the measure command contract is that invoking it with similar parameters as used in convert will give the right measurement needed for the convert command. > +# > +# @bitmaps: Additional size required for bitmap metadata in a source image, > +# if that bitmap metadata can be copied in addition to guest > +# contents. (since 5.1) Reporting bitmaps without specifying that bitmaps are needed is less consistent with "qemu-img convert", but has the advantage that we don't need to check if measure supports bitmaps. But this will work only if new versions always report "bitmaps" even when the value is zero. With the current way, to measure an image we need to do: qemu-img info --output json ... check if image contains bitmaps qemu-img measure --output json ... check if bitmaps are reported. If image has bitmaps and bitmaps are not reported, we know that we have an old version of qemu-img that does not know how to measure bitmaps. If bitmaps are reported in both commands we will use the value when creating block devices. If we always report bitmaps even when they are zero, we don't need to run "qemu-img info". But my preferred interface is: qemu-img measure --bitmaps ... And report bitmaps only if the user asked to get the value. In this case the required and fully-allocated values will include bitmaps. 
To learn if qemu-img measure understands bitmaps we can check --help output, like we did in the past until we can require the version on all platforms. An advantage is not having to change old tests. Nir > # > # Since: 2.10 > ## > { 'struct': 'BlockMeasureInfo', > - 'data': {'required': 'int', 'fully-allocated': 'int'} } > + 'data': {'required': 'int', 'fully-allocated': 'int', '*bitmaps': 'int'} } > > ## > # @query-block: > diff --git a/block/crypto.c b/block/crypto.c > index 6b21d6bf6c01..eadbcb248563 100644 > --- a/block/crypto.c > +++ b/block/crypto.c > @@ -552,7 +552,7 @@ static BlockMeasureInfo *block_crypto_measure(QemuOpts > *opts, > * Unallocated blocks are still encrypted so allocation status makes no > * difference to the file size. > */ > -info = g
[PATCH v4 0/6] scripts: More Python fixes
Trivial Python3 fixes, again... Since v3: - Fixed missing scripts/qemugdb/timers.py (kwolf) - Cover more scripts - Check for __main__ in a few scripts Since v2: - Remove patch updating MAINTAINERS Since v1: - Added Alex Bennée A-b tags - Addressed John Snow review comments - Use /usr/bin/env - Do not modify os.path (dropped last patch) Philippe Mathieu-Daudé (6): scripts/qemugdb: Remove shebang header scripts/qemu-gdb: Use Python 3 interpreter scripts/qmp: Use Python 3 interpreter scripts/kvm/vmxcap: Use Python 3 interpreter and add pseudo-main() scripts/modules/module_block: Use Python 3 interpreter & add pseudo-main tests/migration/guestperf: Use Python 3 interpreter scripts/kvm/vmxcap | 7 --- scripts/modules/module_block.py| 31 +++--- scripts/qemu-gdb.py| 4 ++-- scripts/qemugdb/__init__.py| 3 +-- scripts/qemugdb/aio.py | 3 +-- scripts/qemugdb/coroutine.py | 3 +-- scripts/qemugdb/mtree.py | 4 +--- scripts/qemugdb/tcg.py | 1 - scripts/qemugdb/timers.py | 1 - scripts/qmp/qom-get| 2 +- scripts/qmp/qom-list | 2 +- scripts/qmp/qom-set| 2 +- scripts/qmp/qom-tree | 2 +- tests/migration/guestperf-batch.py | 2 +- tests/migration/guestperf-plot.py | 2 +- tests/migration/guestperf.py | 2 +- 16 files changed, 33 insertions(+), 38 deletions(-) -- 2.21.3
[PATCH v4 3/6] scripts/qmp: Use Python 3 interpreter
From: Philippe Mathieu-Daudé Signed-off-by: Philippe Mathieu-Daudé --- scripts/qmp/qom-get | 2 +- scripts/qmp/qom-list | 2 +- scripts/qmp/qom-set | 2 +- scripts/qmp/qom-tree | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/qmp/qom-get b/scripts/qmp/qom-get index 007b4cd442..7c5ede91bb 100755 --- a/scripts/qmp/qom-get +++ b/scripts/qmp/qom-get @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python3 ## # QEMU Object Model test tools # diff --git a/scripts/qmp/qom-list b/scripts/qmp/qom-list index 03bda3446b..bb68fd65d4 100755 --- a/scripts/qmp/qom-list +++ b/scripts/qmp/qom-list @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python3 ## # QEMU Object Model test tools # diff --git a/scripts/qmp/qom-set b/scripts/qmp/qom-set index c37fe78b00..19881d85e9 100755 --- a/scripts/qmp/qom-set +++ b/scripts/qmp/qom-set @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python3 ## # QEMU Object Model test tools # diff --git a/scripts/qmp/qom-tree b/scripts/qmp/qom-tree index 1c8acf61e7..fa91147a03 100755 --- a/scripts/qmp/qom-tree +++ b/scripts/qmp/qom-tree @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python3 ## # QEMU Object Model test tools # -- 2.21.3
[PATCH v4 4/6] scripts/kvm/vmxcap: Use Python 3 interpreter and add pseudo-main()
Signed-off-by: Philippe Mathieu-Daudé --- scripts/kvm/vmxcap | 7 --- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/kvm/vmxcap b/scripts/kvm/vmxcap index 971ed0e721..6fe66d5f57 100755 --- a/scripts/kvm/vmxcap +++ b/scripts/kvm/vmxcap @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python3 # # tool for querying VMX capabilities # @@ -275,5 +275,6 @@ controls = [ ), ] -for c in controls: -c.show() +if __name__ == '__main__': +for c in controls: +c.show() -- 2.21.3
[PATCH v4 5/6] scripts/modules/module_block: Use Python 3 interpreter & add pseudo-main
Signed-off-by: Philippe Mathieu-Daudé --- scripts/modules/module_block.py | 31 --- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/scripts/modules/module_block.py b/scripts/modules/module_block.py index f23191fac1..2e7021b952 100644 --- a/scripts/modules/module_block.py +++ b/scripts/modules/module_block.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python3 # # Module information generator # @@ -10,7 +10,6 @@ # This work is licensed under the terms of the GNU GPL, version 2. # See the COPYING file in the top-level directory. -import sys import os def get_string_struct(line): @@ -80,19 +79,21 @@ def print_bottom(fheader): #endif ''') -# First argument: output file -# All other arguments: modules source files (.c) -output_file = sys.argv[1] -with open(output_file, 'w') as fheader: -print_top(fheader) +if __name__ == '__main__': +import sys +# First argument: output file +# All other arguments: modules source files (.c) +output_file = sys.argv[1] +with open(output_file, 'w') as fheader: +print_top(fheader) -for filename in sys.argv[2:]: -if os.path.isfile(filename): -process_file(fheader, filename) -else: -print("File " + filename + " does not exist.", file=sys.stderr) -sys.exit(1) +for filename in sys.argv[2:]: +if os.path.isfile(filename): +process_file(fheader, filename) +else: +print("File " + filename + " does not exist.", file=sys.stderr) +sys.exit(1) -print_bottom(fheader) +print_bottom(fheader) -sys.exit(0) +sys.exit(0) -- 2.21.3
[PATCH v4 2/6] scripts/qemu-gdb: Use Python 3 interpreter
From: Philippe Mathieu-Daudé Signed-off-by: Philippe Mathieu-Daudé --- scripts/qemu-gdb.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/qemu-gdb.py b/scripts/qemu-gdb.py index f2a305c42e..e0bfa7b5a4 100644 --- a/scripts/qemu-gdb.py +++ b/scripts/qemu-gdb.py @@ -1,5 +1,5 @@ -#!/usr/bin/python - +#!/usr/bin/env python3 +# # GDB debugging support # # Copyright 2012 Red Hat, Inc. and/or its affiliates -- 2.21.3
[PATCH v4 1/6] scripts/qemugdb: Remove shebang header
From: Philippe Mathieu-Daudé These scripts are loaded as plugin by GDB (and they don't have any __main__ entry point). Remove the shebang header. Acked-by: Alex Bennée Signed-off-by: Philippe Mathieu-Daudé --- scripts/qemugdb/__init__.py | 3 +-- scripts/qemugdb/aio.py | 3 +-- scripts/qemugdb/coroutine.py | 3 +-- scripts/qemugdb/mtree.py | 4 +--- scripts/qemugdb/tcg.py | 1 - scripts/qemugdb/timers.py| 1 - 6 files changed, 4 insertions(+), 11 deletions(-) diff --git a/scripts/qemugdb/__init__.py b/scripts/qemugdb/__init__.py index 969f552b26..da8ff612e5 100644 --- a/scripts/qemugdb/__init__.py +++ b/scripts/qemugdb/__init__.py @@ -1,5 +1,4 @@ -#!/usr/bin/python - +# # GDB debugging support # # Copyright (c) 2015 Linaro Ltd diff --git a/scripts/qemugdb/aio.py b/scripts/qemugdb/aio.py index 2ba00c..d7c1ba0c28 100644 --- a/scripts/qemugdb/aio.py +++ b/scripts/qemugdb/aio.py @@ -1,5 +1,4 @@ -#!/usr/bin/python - +# # GDB debugging support: aio/iohandler debug # # Copyright (c) 2015 Red Hat, Inc. diff --git a/scripts/qemugdb/coroutine.py b/scripts/qemugdb/coroutine.py index 41e079d0e2..db61389022 100644 --- a/scripts/qemugdb/coroutine.py +++ b/scripts/qemugdb/coroutine.py @@ -1,5 +1,4 @@ -#!/usr/bin/python - +# # GDB debugging support # # Copyright 2012 Red Hat, Inc. and/or its affiliates diff --git a/scripts/qemugdb/mtree.py b/scripts/qemugdb/mtree.py index 3030a60d3f..8fe42c3c12 100644 --- a/scripts/qemugdb/mtree.py +++ b/scripts/qemugdb/mtree.py @@ -1,5 +1,4 @@ -#!/usr/bin/python - +# # GDB debugging support # # Copyright 2012 Red Hat, Inc. 
and/or its affiliates @@ -84,4 +83,3 @@ def print_item(self, ptr, offset = gdb.Value(0), level = 0): while not isnull(subregion): self.print_item(subregion, addr, level) subregion = subregion['subregions_link']['tqe_next'] - diff --git a/scripts/qemugdb/tcg.py b/scripts/qemugdb/tcg.py index 18880fc9a7..16c03c06a9 100644 --- a/scripts/qemugdb/tcg.py +++ b/scripts/qemugdb/tcg.py @@ -1,4 +1,3 @@ -#!/usr/bin/python # -*- coding: utf-8 -*- # # GDB debugging support, TCG status diff --git a/scripts/qemugdb/timers.py b/scripts/qemugdb/timers.py index f0e132d27a..46537b27cf 100644 --- a/scripts/qemugdb/timers.py +++ b/scripts/qemugdb/timers.py @@ -1,4 +1,3 @@ -#!/usr/bin/python # -*- coding: utf-8 -*- # GDB debugging support # -- 2.21.3
[PATCH v4 6/6] tests/migration/guestperf: Use Python 3 interpreter
Signed-off-by: Philippe Mathieu-Daudé --- tests/migration/guestperf-batch.py | 2 +- tests/migration/guestperf-plot.py | 2 +- tests/migration/guestperf.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/migration/guestperf-batch.py b/tests/migration/guestperf-batch.py index cb150ce804..f1e900908d 100755 --- a/tests/migration/guestperf-batch.py +++ b/tests/migration/guestperf-batch.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python3 # # Migration test batch comparison invokation # diff --git a/tests/migration/guestperf-plot.py b/tests/migration/guestperf-plot.py index d70bb7a557..907151011a 100755 --- a/tests/migration/guestperf-plot.py +++ b/tests/migration/guestperf-plot.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python3 # # Migration test graph plotting command # diff --git a/tests/migration/guestperf.py b/tests/migration/guestperf.py index 99b027e8ba..ba1c4bc4ca 100755 --- a/tests/migration/guestperf.py +++ b/tests/migration/guestperf.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python3 # # Migration test direct invokation command # -- 2.21.3
Re: [PATCH v3 1/2] scripts/qemugdb: Remove shebang header
On 5/12/20 10:55 AM, Kevin Wolf wrote: Am 12.05.2020 um 09:06 hat Philippe Mathieu-Daudé geschrieben: These scripts are loaded as plugin by GDB (and they don't have any __main__ entry point). Remove the shebang header. Acked-by: Alex Bennée Signed-off-by: Philippe Mathieu-Daudé --- scripts/qemugdb/__init__.py | 3 +-- scripts/qemugdb/aio.py | 3 +-- scripts/qemugdb/coroutine.py | 3 +-- scripts/qemugdb/mtree.py | 4 +--- scripts/qemugdb/tcg.py | 1 - There is still a shebang line left in scripts/qemugdb/timers.py. Oops, thanks for catching this. Kevin
Re: [RFC] bdrv_flush: only use fast path when in owned AioContext
Am 11.05.2020 um 18:50 hat Stefan Reiter geschrieben: > Just because we're in a coroutine doesn't imply ownership of the context > of the flushed drive. In such a case use the slow path which explicitly > enters bdrv_flush_co_entry in the correct AioContext. > > Signed-off-by: Stefan Reiter > --- > > We've experienced some lockups in this codepath when taking snapshots of VMs > with drives that have IO-Threads enabled (we have an async 'savevm' > implementation running from a coroutine). > > Currently no reproducer for upstream versions I could find, but in testing > this > patch fixes all issues we're seeing and I think the logic checks out. > > The fast path pattern is repeated a few times in this file, so if this change > makes sense, it's probably worth evaluating the other occurences as well. What do you mean by "owning" the context? If it's about taking the AioContext lock, isn't the problem more with calling bdrv_flush() from code that doesn't take the locks? Though I think we have some code that doesn't only rely on holding the AioContext locks, but that actually depends on running in the right thread, so the change looks right anyway. Kevin > block/io.c | 5 +++-- > 1 file changed, 3 insertions(+), 2 deletions(-) > > diff --git a/block/io.c b/block/io.c > index aba67f66b9..ee7310fa13 100644 > --- a/block/io.c > +++ b/block/io.c > @@ -2895,8 +2895,9 @@ int bdrv_flush(BlockDriverState *bs) > .ret = NOT_DONE, > }; > > -if (qemu_in_coroutine()) { > -/* Fast-path if already in coroutine context */ > +if (qemu_in_coroutine() && > +bdrv_get_aio_context(bs) == qemu_get_current_aio_context()) { > +/* Fast-path if already in coroutine and we own the drive's context > */ > bdrv_flush_co_entry(&flush_co); > } else { > co = qemu_coroutine_create(bdrv_flush_co_entry, &flush_co); > -- > 2.20.1 > >
Re: [PATCH v3 7/9] qcow2: Expose bitmaps' size during measure
On 12.05.20 12:17, Nir Soffer wrote: > On Fri, May 8, 2020 at 9:03 PM Eric Blake wrote: >> >> It's useful to know how much space can be occupied by qcow2 persistent >> bitmaps, even though such metadata is unrelated to the guest-visible >> data. Report this value as an additional field, present when >> measuring an existing image and the output format supports bitmaps. >> Update iotest 178 and 190 to updated output, as well as new coverage >> in 190 demonstrating non-zero values made possible with the >> recently-added qemu-img bitmap command. >> >> The addition of a new field demonstrates why we should always >> zero-initialize qapi C structs; while the qcow2 driver still fully >> populates all fields, the raw and crypto drivers had to be tweaked to >> avoid uninitialized data. >> >> See also: https://bugzilla.redhat.com/1779904 >> >> Reported-by: Nir Soffer >> Signed-off-by: Eric Blake >> --- >> qapi/block-core.json | 15 +++ >> block/crypto.c | 2 +- >> block/qcow2.c| 37 --- >> block/raw-format.c | 2 +- >> qemu-img.c | 3 +++ >> tests/qemu-iotests/178.out.qcow2 | 16 >> tests/qemu-iotests/190 | 43 ++-- >> tests/qemu-iotests/190.out | 23 - >> 8 files changed, 128 insertions(+), 13 deletions(-) >> >> diff --git a/qapi/block-core.json b/qapi/block-core.json >> index 943df1926a91..9a7a388c7ad3 100644 >> --- a/qapi/block-core.json >> +++ b/qapi/block-core.json >> @@ -633,18 +633,23 @@ >> # efficiently so file size may be smaller than virtual disk size. >> # >> # The values are upper bounds that are guaranteed to fit the new image file. >> -# Subsequent modification, such as internal snapshot or bitmap creation, may >> -# require additional space and is not covered here. >> +# Subsequent modification, such as internal snapshot or further bitmap >> +# creation, may require additional space and is not covered here. >> # >> -# @required: Size required for a new image file, in bytes. 
>> +# @required: Size required for a new image file, in bytes, when copying just >> +#guest-visible contents. >> # >> # @fully-allocated: Image file size, in bytes, once data has been written >> -# to all sectors. >> +# to all sectors, when copying just guest-visible >> contents. > > This does not break old code since previously we always reported only > guest visible content > here, but it changes the semantics, and now you cannot allocate > "required" size, you need > to allocate "required" size with "bitmaps" size. Only if you copy the bitmaps, though, which requires using the --bitmaps switch for convert. > If we add a new > extension all users will have to > change the calculation again. It was my impression that existing users won’t have to do that, because they don’t use --bitmaps yet. In contrast, if we included the bitmap size in @required or @fully-allocated, then previous users that didn’t copy bitmaps to the destination (which they couldn’t) would allocate too much space. ...revisiting this after reading more of your mail: With a --bitmaps switch, existing users wouldn’t suffer from such compatibility problems. However, then users (that now want to copy bitmaps) will have to pass the new --bitmaps flag in the command line, and I don’t see how that’s less complicated than just adding @bitmaps to @required. > I think keeping the semantics that "required" and "fully-allocated" > are the size you need based > on the parameters of the command is easier to use and more consistent. > Current the measure > command contract is that invoking it with similar parameters as used > in convert will give > the right measurement needed for the convert command. > >> +# >> +# @bitmaps: Additional size required for bitmap metadata in a source image, >> +# if that bitmap metadata can be copied in addition to guest >> +# contents. 
(since 5.1) > > Reporting bitmaps without specifying that bitmaps are needed is less > consistent > with "qemu-img convert", but has the advantage that we don't need to > check if measure > supports bitmaps. But this will work only if new versions always > report "bitmaps" even when > the value is zero. > > With the current way, to measure an image we need to do: > > qemu-img info --output json ... > check if image contains bitmaps > qemu-img measure --output json ... > check if bitmaps are reported. > > If image has bitmaps and bitmaps are not reported, we know that we have an old > version of qemu-img that does not know how to measure bitmaps. Well, but you’ll also see that convert --bitmaps will fail. But I can see that you probably want to know about that before you create the target image. > If bitmaps are reported in both commands we will use the value when creating > block devices. And otherwise? Do you error out, or d
Re: [PATCH v5 13/15] acpi: drop build_piix4_pm()
On Mon, May 11, 2020 at 09:37:32PM +0200, Igor Mammedov wrote: > On Thu, 7 May 2020 15:16:38 +0200 > Gerd Hoffmann wrote: > > > The _SB.PCI0.PX13.P13C opregion (holds isa device enable bits) > > is not used any more, remove it from DSDT. > > > > Signed-off-by: Gerd Hoffmann > > --- > > hw/i386/acpi-build.c | 16 > > 1 file changed, 16 deletions(-) > > > > diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c > > index 765409a90eb6..c1e63cce5e8e 100644 > > --- a/hw/i386/acpi-build.c > > +++ b/hw/i386/acpi-build.c > > @@ -1441,21 +1441,6 @@ static void build_q35_isa_bridge(Aml *table) > > aml_append(table, scope); > > } > > > > -static void build_piix4_pm(Aml *table) > > -{ > > -Aml *dev; > > -Aml *scope; > > - > > -scope = aml_scope("_SB.PCI0"); > > -dev = aml_device("PX13"); > > -aml_append(dev, aml_name_decl("_ADR", aml_int(0x00010003))); > I agree about removing P13C but I'm not sure if it's safe to remove > whole isa bridge The isa bridge is _SB0.PCI0.ISA (piix4 function 0). _SB.PCI0.PX13 is the acpi pm device (piix4 function 3). So this does _not_ remove the isa bridge. And given that PX13 has nothing declared beside the opregion we are removing and I have not found any other references to the PX13 device I assumed it is ok to drop that one too. Didn't notice any odd side effects in testing. take care, Gerd
Re: [PATCH v4 0/6] scripts: More Python fixes
Am 12.05.2020 um 12:32 hat Philippe Mathieu-Daudé geschrieben: > Trivial Python3 fixes, again... > > Since v3: > - Fixed missing scripts/qemugdb/timers.py (kwolf) > - Cover more scripts > - Check for __main__ in few scripts I'm not sure if the __main__ check actually provides anything useful in source files of standalone tools that aren't supposed to be imported from somewhere else. But of course, it's not wrong either. Reviewed-by: Kevin Wolf
Re: [RFC] bdrv_flush: only use fast path when in owned AioContext
Am 12.05.2020 um 12:57 hat Kevin Wolf geschrieben: > Am 11.05.2020 um 18:50 hat Stefan Reiter geschrieben: > > Just because we're in a coroutine doesn't imply ownership of the context > > of the flushed drive. In such a case use the slow path which explicitly > > enters bdrv_flush_co_entry in the correct AioContext. > > > > Signed-off-by: Stefan Reiter > > --- > > > > We've experienced some lockups in this codepath when taking snapshots of VMs > > with drives that have IO-Threads enabled (we have an async 'savevm' > > implementation running from a coroutine). > > > > Currently no reproducer for upstream versions I could find, but in testing > > this > > patch fixes all issues we're seeing and I think the logic checks out. > > > > The fast path pattern is repeated a few times in this file, so if this > > change > > makes sense, it's probably worth evaluating the other occurences as well. > > What do you mean by "owning" the context? If it's about taking the > AioContext lock, isn't the problem more with calling bdrv_flush() from > code that doesn't take the locks? > > Though I think we have some code that doesn't only rely on holding the > AioContext locks, but that actually depends on running in the right > thread, so the change looks right anyway. Well, the idea is right, but the change itself isn't, of course. If we're already in coroutine context, we must not busy wait with BDRV_POLL_WHILE(). I'll see if I can put something together after lunch. Kevin
Re: [PATCH v4 4/6] scripts/kvm/vmxcap: Use Python 3 interpreter and add pseudo-main()
On 12/05/20 12:32, Philippe Mathieu-Daudé wrote: > Signed-off-by: Philippe Mathieu-Daudé > --- > scripts/kvm/vmxcap | 7 --- > 1 file changed, 4 insertions(+), 3 deletions(-) > > diff --git a/scripts/kvm/vmxcap b/scripts/kvm/vmxcap > index 971ed0e721..6fe66d5f57 100755 > --- a/scripts/kvm/vmxcap > +++ b/scripts/kvm/vmxcap > @@ -1,4 +1,4 @@ > -#!/usr/bin/python > +#!/usr/bin/env python3 > # > # tool for querying VMX capabilities > # > @@ -275,5 +275,6 @@ controls = [ > ), > ] > > -for c in controls: > -c.show() > +if __name__ == '__main__': > +for c in controls: > +c.show() > Acked-by: Paolo Bonzini
Re: [RFC] bdrv_flush: only use fast path when in owned AioContext
On 5/12/20 1:32 PM, Kevin Wolf wrote: Am 12.05.2020 um 12:57 hat Kevin Wolf geschrieben: Am 11.05.2020 um 18:50 hat Stefan Reiter geschrieben: Just because we're in a coroutine doesn't imply ownership of the context of the flushed drive. In such a case use the slow path which explicitly enters bdrv_flush_co_entry in the correct AioContext. Signed-off-by: Stefan Reiter --- We've experienced some lockups in this codepath when taking snapshots of VMs with drives that have IO-Threads enabled (we have an async 'savevm' implementation running from a coroutine). Currently no reproducer for upstream versions I could find, but in testing this patch fixes all issues we're seeing and I think the logic checks out. The fast path pattern is repeated a few times in this file, so if this change makes sense, it's probably worth evaluating the other occurences as well. What do you mean by "owning" the context? If it's about taking the AioContext lock, isn't the problem more with calling bdrv_flush() from code that doesn't take the locks? Though I think we have some code that doesn't only rely on holding the AioContext locks, but that actually depends on running in the right thread, so the change looks right anyway. "Owning" as in it only works (doesn't hang) when bdrv_flush_co_entry runs on the same AioContext that the BlockDriverState it's flushing belongs to. We hold the locks for all AioContexts we want to flush in our code (in this case called from do_vm_stop/bdrv_flush_all so we're even in a drained section). Well, the idea is right, but the change itself isn't, of course. If we're already in coroutine context, we must not busy wait with BDRV_POLL_WHILE(). I'll see if I can put something together after lunch. Kevin Thanks for taking a look!
Re: [PATCH] qemu-nbd: Close inherited stderr
On 5/12/20 3:56 AM, Raphael Pour wrote: Hello, after e6df58a5, the inherited stderr 'old_stderr' won't get closed anymore if 'fork_process' is false. This causes other processes relying on EOF to infinitely block or crash. From 47ab9b517038d13117876a8bb3ef45c53d7f2f9e Mon Sep 17 00:00:00 2001 From: "Raphael Pour" Date: Tue, 12 May 2020 10:18:44 +0200 Subject: [PATCH] qemu-nbd: Close inherited stderr Close inherited stderr of the parent if fork_process is false. Otherwise no one will close it. (introduced by e6df58a5) Signed-off-by: Raphael Pour --- qemu-nbd.c | 9 - 1 file changed, 8 insertions(+), 1 deletion(-) Wouldn't it just be simpler to not dup in the first place? diff --git i/qemu-nbd.c w/qemu-nbd.c index 4aa005004ebd..6ba2544feb3a 100644 --- i/qemu-nbd.c +++ w/qemu-nbd.c @@ -916,7 +916,9 @@ int main(int argc, char **argv) } else if (pid == 0) { close(stderr_fd[0]); -old_stderr = dup(STDERR_FILENO); +if (fork_process) { +old_stderr = dup(STDERR_FILENO); +} ret = qemu_daemon(1, 0); /* Temporarily redirect stderr to the parent's pipe... */ -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
Re: [PATCH v2 1/5] qemu-img: remove check that cvtnum value > MAX_INT
On 5/12/20 4:39 AM, Eyal Moscovici wrote: +++ b/qemu-img.c @@ -4307,7 +4307,7 @@ static int img_bench(int argc, char **argv)              int64_t sval;               sval = cvtnum(optarg); -           if (sval < 0 || sval > INT_MAX) { +           if (sval < 0) {                  error_report("Invalid buffer size specified"); INT_MAX is smaller than cvtnum's check for INT64_MAX. This code change allows larger buffer sizes, which is probably not a good idea. I was the most hesitant about this patch because of the size difference. I decided to submit it because the type is int64 which pairs better with the MAX_INT64 check and I couldn't find a concrete reason to cap the variable at MAX_INT. Do you a concrete reason? Because the max size should rerally come into effect on very fringe cases and if you are asking for a really big buffer you should know the risks. The commit message does not call out a change in behavior. If you are sure that you want a change in behavior, then justify that: mention that your patch specifically changes the behavior to allow larger buffers, and why that is okay. Otherwise, it looks like your patch is accidental in its behavior change, which costs reviewer time. In this particular case, I see no reason to allow larger buffers. That is, the existing limits made sense (benchmarking anything larger than the maximum qcow2 cluster size of 2M is unlikely to produce useful results, let alone something as large as 2G, so allowing > 2G is not helping the user). So even if your commit message did a good job of explaining that the change was intentional, it is a tough sell why we need it. If all your commit is intended to do is refactoring, then it should not be changing behavior. -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
[RFC PATCH 0/3] block: Synchronous bdrv_*() from coroutine in different AioContext
Stefan (Reiter), after looking a bit closer at this, I think there is no bug in QEMU, but the bug is in your coroutine code that calls block layer functions without moving into the right AioContext first. I've written this series anyway as it potentially makes the life of callers easier and would probably make your buggy code correct. However, it doesn't feel right to commit something like patch 2 without having a user for it. Is there a reason why you can't upstream your async snapshot code? The series would also happen to fix a bug in my recent patch to convert qmp_block_resize() to coroutines, but I feel it's not how I would naturally fix it. Switching the thread already in the QMP handler before calling bdrv_truncate() would feel more appropriate. I wonder if it wouldn't actually be the same for your snapshot code. Kevin Wolf (3): block: Factor out bdrv_run_co() block: Allow bdrv_run_co() from different AioContext block: Assert we're running in the right thread block/io.c | 122 - 1 file changed, 45 insertions(+), 77 deletions(-) -- 2.25.3
[RFC PATCH 1/3] block: Factor out bdrv_run_co()
We have a few bdrv_*() functions that can either spawn a new coroutine and wait for it with BDRV_POLL_WHILE() or use a fastpath if they are alreeady running in a coroutine. All of them duplicate basically the same code. Factor the common code into a new function bdrv_run_co(). Signed-off-by: Kevin Wolf --- block/io.c | 104 +++-- 1 file changed, 28 insertions(+), 76 deletions(-) diff --git a/block/io.c b/block/io.c index 7d30e61edc..c1badaadc9 100644 --- a/block/io.c +++ b/block/io.c @@ -891,6 +891,22 @@ static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset, return 0; } +static int bdrv_run_co(BlockDriverState *bs, CoroutineEntry *entry, + void *opaque, int *ret) +{ +if (qemu_in_coroutine()) { +/* Fast-path if already in coroutine context */ +entry(opaque); +} else { +Coroutine *co = qemu_coroutine_create(entry, opaque); +*ret = NOT_DONE; +bdrv_coroutine_enter(bs, co); +BDRV_POLL_WHILE(bs, *ret == NOT_DONE); +} + +return *ret; +} + typedef struct RwCo { BdrvChild *child; int64_t offset; @@ -923,25 +939,15 @@ static int bdrv_prwv_co(BdrvChild *child, int64_t offset, QEMUIOVector *qiov, bool is_write, BdrvRequestFlags flags) { -Coroutine *co; RwCo rwco = { .child = child, .offset = offset, .qiov = qiov, .is_write = is_write, -.ret = NOT_DONE, .flags = flags, }; -if (qemu_in_coroutine()) { -/* Fast-path if already in coroutine context */ -bdrv_rw_co_entry(&rwco); -} else { -co = qemu_coroutine_create(bdrv_rw_co_entry, &rwco); -bdrv_coroutine_enter(child->bs, co); -BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE); -} -return rwco.ret; +return bdrv_run_co(child->bs, bdrv_rw_co_entry, &rwco, &rwco.ret); } int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset, @@ -2230,7 +2236,6 @@ typedef struct BdrvCoBlockStatusData { int64_t *map; BlockDriverState **file; int ret; -bool done; } BdrvCoBlockStatusData; int coroutine_fn bdrv_co_block_status_from_file(BlockDriverState *bs, @@ -2492,7 +2497,6 @@ static void coroutine_fn 
bdrv_block_status_above_co_entry(void *opaque) data->want_zero, data->offset, data->bytes, data->pnum, data->map, data->file); -data->done = true; aio_wait_kick(); } @@ -2508,7 +2512,6 @@ static int bdrv_common_block_status_above(BlockDriverState *bs, int64_t *map, BlockDriverState **file) { -Coroutine *co; BdrvCoBlockStatusData data = { .bs = bs, .base = base, @@ -2518,18 +2521,9 @@ static int bdrv_common_block_status_above(BlockDriverState *bs, .pnum = pnum, .map = map, .file = file, -.done = false, }; -if (qemu_in_coroutine()) { -/* Fast-path if already in coroutine context */ -bdrv_block_status_above_co_entry(&data); -} else { -co = qemu_coroutine_create(bdrv_block_status_above_co_entry, &data); -bdrv_coroutine_enter(bs, co); -BDRV_POLL_WHILE(bs, !data.done); -} -return data.ret; +return bdrv_run_co(bs, bdrv_block_status_above_co_entry, &data, &data.ret); } int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base, @@ -2669,22 +2663,13 @@ static inline int bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos, bool is_read) { -if (qemu_in_coroutine()) { -return bdrv_co_rw_vmstate(bs, qiov, pos, is_read); -} else { -BdrvVmstateCo data = { -.bs = bs, -.qiov = qiov, -.pos= pos, -.is_read= is_read, -.ret= -EINPROGRESS, -}; -Coroutine *co = qemu_coroutine_create(bdrv_co_rw_vmstate_entry, &data); - -bdrv_coroutine_enter(bs, co); -BDRV_POLL_WHILE(bs, data.ret == -EINPROGRESS); -return data.ret; -} +BdrvVmstateCo data = { +.bs = bs, +.qiov = qiov, +.pos= pos, +.is_read= is_read, +}; +return bdrv_run_co(bs, bdrv_co_rw_vmstate_entry, &data, &data.ret); } int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, @@ -2890,22 +2875,11 @@ early_exit: int bdrv_flush(BlockDriverState *bs) { -Coroutine *co; FlushCo flush_co = { .bs = bs, -.ret = NOT_DONE, }; -if (qemu_in_coroutine()) { -/* Fast-path if already in coroutine context */ -bdrv_flush_co_entry(&flush_co); -} else { -co = qemu_coroutine_create(bdrv_flush_co_en
[RFC PATCH 2/3] block: Allow bdrv_run_co() from different AioContext
Coroutine functions that are entered through bdrv_run_co() are already safe to call from synchronous code in a different AioContext because bdrv_coroutine_enter() will schedule them in the context of the node. However, the coroutine fastpath still requires that we're already in the right AioContext when called in coroutine context. In order to make the behaviour more consistent and to make life a bit easier for callers, let's check the AioContext and automatically move the current coroutine around if we're not in the right context yet. Signed-off-by: Kevin Wolf --- block/io.c | 15 ++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/block/io.c b/block/io.c index c1badaadc9..7808e8bdc0 100644 --- a/block/io.c +++ b/block/io.c @@ -895,8 +895,21 @@ static int bdrv_run_co(BlockDriverState *bs, CoroutineEntry *entry, void *opaque, int *ret) { if (qemu_in_coroutine()) { -/* Fast-path if already in coroutine context */ +Coroutine *self = qemu_coroutine_self(); +AioContext *bs_ctx = bdrv_get_aio_context(bs); +AioContext *co_ctx = qemu_coroutine_get_aio_context(self); + +if (bs_ctx != co_ctx) { +/* Move to the iothread of the node */ +aio_co_schedule(bs_ctx, self); +qemu_coroutine_yield(); +} entry(opaque); +if (bs_ctx != co_ctx) { +/* Move back to the original AioContext */ +aio_co_schedule(bs_ctx, self); +qemu_coroutine_yield(); +} } else { Coroutine *co = qemu_coroutine_create(entry, opaque); *ret = NOT_DONE; -- 2.25.3
[RFC PATCH 3/3] block: Assert we're running in the right thread
tracked_request_begin() is called for most I/O operations, so it's a good place to assert that we're indeed running in the home thread of the node's AioContext. Signed-off-by: Kevin Wolf --- block/io.c | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/block/io.c b/block/io.c index 7808e8bdc0..924bf5ba46 100644 --- a/block/io.c +++ b/block/io.c @@ -695,14 +695,17 @@ static void tracked_request_begin(BdrvTrackedRequest *req, uint64_t bytes, enum BdrvTrackedRequestType type) { +Coroutine *self = qemu_coroutine_self(); + assert(bytes <= INT64_MAX && offset <= INT64_MAX - bytes); +assert(bs->aio_context == qemu_coroutine_get_aio_context(self)); *req = (BdrvTrackedRequest){ .bs = bs, .offset = offset, .bytes = bytes, .type = type, -.co = qemu_coroutine_self(), +.co = self, .serialising= false, .overlap_offset = offset, .overlap_bytes = bytes, -- 2.25.3
[PATCH v3 11/15] copy-on-read: Support preadv/pwritev_part functions
Add support for the recently introduced functions bdrv_co_preadv_part() and bdrv_co_pwritev_part() to the COR-filter driver. Signed-off-by: Andrey Shinkevich --- block/copy-on-read.c | 28 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/block/copy-on-read.c b/block/copy-on-read.c index 74e01ee..4030d56 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -74,21 +74,25 @@ static int64_t cor_getlength(BlockDriverState *bs) } -static int coroutine_fn cor_co_preadv(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags) +static int coroutine_fn cor_co_preadv_part(BlockDriverState *bs, + uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, + size_t qiov_offset, + int flags) { -return bdrv_co_preadv(bs->file, offset, bytes, qiov, - flags | BDRV_REQ_COPY_ON_READ); +return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset, + flags | BDRV_REQ_COPY_ON_READ); } -static int coroutine_fn cor_co_pwritev(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags) +static int coroutine_fn cor_co_pwritev_part(BlockDriverState *bs, +uint64_t offset, +uint64_t bytes, +QEMUIOVector *qiov, +size_t qiov_offset, int flags) { - -return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags); +return bdrv_co_pwritev_part(bs->file, offset, bytes, qiov, qiov_offset, +flags); } @@ -152,8 +156,8 @@ static BlockDriver bdrv_copy_on_read = { .bdrv_getlength = cor_getlength, -.bdrv_co_preadv = cor_co_preadv, -.bdrv_co_pwritev= cor_co_pwritev, +.bdrv_co_preadv_part= cor_co_preadv_part, +.bdrv_co_pwritev_part = cor_co_pwritev_part, .bdrv_co_pwrite_zeroes = cor_co_pwrite_zeroes, .bdrv_co_pdiscard = cor_co_pdiscard, .bdrv_co_pwritev_compressed = cor_co_pwritev_compressed, -- 1.8.3.1
[PATCH v3 02/15] copy-on-read: Support compressed writes
From: Max Reitz Signed-off-by: Max Reitz Reviewed-by: Vladimir Sementsov-Ogievskiy --- block/copy-on-read.c | 11 +++ 1 file changed, 11 insertions(+) diff --git a/block/copy-on-read.c b/block/copy-on-read.c index 242d3ff..c4fa468 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -106,6 +106,16 @@ static int coroutine_fn cor_co_pdiscard(BlockDriverState *bs, } +static int coroutine_fn cor_co_pwritev_compressed(BlockDriverState *bs, + uint64_t offset, + uint64_t bytes, + QEMUIOVector *qiov) +{ +return bdrv_co_pwritev(bs->file, offset, bytes, qiov, + BDRV_REQ_WRITE_COMPRESSED); +} + + static void cor_eject(BlockDriverState *bs, bool eject_flag) { bdrv_eject(bs->file->bs, eject_flag); @@ -130,6 +140,7 @@ static BlockDriver bdrv_copy_on_read = { .bdrv_co_pwritev= cor_co_pwritev, .bdrv_co_pwrite_zeroes = cor_co_pwrite_zeroes, .bdrv_co_pdiscard = cor_co_pdiscard, +.bdrv_co_pwritev_compressed = cor_co_pwritev_compressed, .bdrv_eject = cor_eject, .bdrv_lock_medium = cor_lock_medium, -- 1.8.3.1
[PATCH v3 00/15] Apply COR-filter to the block-stream permanently
With this series, all the block-stream COR operations pass through the COR-filter. The patches 01-08/15 are taken from the series "block: Deal with filters" by Max Reitz, the full version of that can be found in the branches: https://git.xanclic.moe/XanClic/qemu child-access-functions-v6 https://github.com/XanClic/qemu child-access-functions-v6 When running iotests, apply "char-socket: Fix race condition" to avoid sporadic segmentation faults. v3: 01: The COR filter insert/remove functions moved to block/copy-on-read.c to be a part of the API. 02: block/stream.c code refactoring. 03: The separate call to block_job_add_bdrv() is used to block operations on the active node after the filter is inserted and the job is created. 04: The iotests case 030::test_overlapping_4 was modified to unbind the block-stream job from the base node. 05: The COR driver functions preadv/pwritev replaced with their analogous preadv/pwritev_part. v2: 01: No more skipping filters while checking for operation blockers. However, we exclude filters between the bottom node and base because we do not set the operation blockers for filters anymore. 02: As stated above, we do not set the operation blockers for filters anymore. So, skip filters when we block operations for the target node. 03: The comment added for the patch 4/7. 04: The QAPI target version changed to 5.1. 05: The 'filter-node-name' now excluded from using in the test #030. If we need it no more in a final version of the series, the patch 5/7 may be removed. 06: The COR-filter included into the frozen chain of a block-stream job. The 'above_base' node pointer is left because it is essential for finding the base node in case of filters above. 
Andrey Shinkevich (7): block: prepare block-stream for using COR-filter copy-on-read: Support change filename functions copy-on-read: Support preadv/pwritev_part functions copy-on-read: add filter append/drop functions qapi: add filter-node-name to block-stream iotests: prepare 245 for using filter in block-stream block: apply COR-filter to block-stream jobs Max Reitz (8): block: Mark commit and mirror as filter drivers copy-on-read: Support compressed writes block: Add child access functions block: Add chain helper functions block: Include filters when freezing backing chain block: Use CAFs in block status functions commit: Deal with filters when blocking intermediate nodes block: Use CAFs when working with backing chains block.c| 275 ++--- block/commit.c | 85 ++--- block/copy-on-read.c | 155 +-- block/io.c | 19 +-- block/mirror.c | 6 +- block/monitor/block-hmp-cmds.c | 4 +- block/stream.c | 89 + blockdev.c | 19 ++- include/block/block.h | 6 +- include/block/block_int.h | 67 +- qapi/block-core.json | 6 + tests/qemu-iotests/030 | 17 +-- tests/qemu-iotests/141.out | 2 +- tests/qemu-iotests/245 | 10 +- 14 files changed, 621 insertions(+), 139 deletions(-) -- 1.8.3.1
[PATCH v3 12/15] copy-on-read: add filter append/drop functions
Provide API for the COR-filter insertion/removal. Also, drop the filter child permissions for an inactive state when the filter node is being removed. Signed-off-by: Andrey Shinkevich --- block/copy-on-read.c | 98 1 file changed, 98 insertions(+) diff --git a/block/copy-on-read.c b/block/copy-on-read.c index 4030d56..31e1619 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -24,8 +24,15 @@ #include "qemu/cutils.h" #include "block/block_int.h" #include "qemu/module.h" +#include "qapi/error.h" +#include "qapi/qmp/qdict.h" +#include "block/copy-on-read.h" +typedef struct BDRVStateCOR { +bool active; +} BDRVStateCOR; + static int cor_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { @@ -57,6 +64,17 @@ static void cor_child_perm(BlockDriverState *bs, BdrvChild *c, uint64_t perm, uint64_t shared, uint64_t *nperm, uint64_t *nshared) { +BDRVStateCOR *s = bs->opaque; + +if (!s->active) { +/* + * While the filter is being removed + */ +*nperm = 0; +*nshared = BLK_PERM_ALL; +return; +} + *nperm = perm & PERM_PASSTHROUGH; *nshared = (shared & PERM_PASSTHROUGH) | PERM_UNCHANGED; @@ -150,6 +168,7 @@ static int cor_change_backing_file(BlockDriverState *bs, static BlockDriver bdrv_copy_on_read = { .format_name= "copy-on-read", +.instance_size = sizeof(BDRVStateCOR), .bdrv_open = cor_open, .bdrv_child_perm= cor_child_perm, @@ -178,4 +197,83 @@ static void bdrv_copy_on_read_init(void) bdrv_register(&bdrv_copy_on_read); } + +static BlockDriverState *create_filter_node(BlockDriverState *bs, +const char *filter_node_name, +Error **errp) +{ +QDict *opts = qdict_new(); + +qdict_put_str(opts, "driver", "copy-on-read"); +qdict_put_str(opts, "file", bdrv_get_node_name(bs)); +if (filter_node_name) { +qdict_put_str(opts, "node-name", filter_node_name); +} + +return bdrv_open(NULL, NULL, opts, BDRV_O_RDWR, errp); +} + + +BlockDriverState *bdrv_cor_filter_append(BlockDriverState *bs, + const char *filter_node_name, + Error **errp) +{ +BlockDriverState 
*cor_filter_bs; +BDRVStateCOR *state; +Error *local_err = NULL; + +cor_filter_bs = create_filter_node(bs, filter_node_name, errp); +if (cor_filter_bs == NULL) { +error_prepend(errp, "Could not create filter node: "); +return NULL; +} + +if (!filter_node_name) { +cor_filter_bs->implicit = true; +} + +bdrv_drained_begin(bs); +bdrv_replace_node(bs, cor_filter_bs, &local_err); +bdrv_drained_end(bs); + +if (local_err) { +bdrv_unref(cor_filter_bs); +error_propagate(errp, local_err); +return NULL; +} + +state = cor_filter_bs->opaque; +state->active = true; + +return cor_filter_bs; +} + + +void bdrv_cor_filter_drop(BlockDriverState *cor_filter_bs) +{ +BdrvChild *child; +BlockDriverState *bs; +BDRVStateCOR *s = cor_filter_bs->opaque; + +child = bdrv_filtered_rw_child(cor_filter_bs); +if (!child) { +return; +} +bs = child->bs; + +/* Retain the BDS until we complete the graph change. */ +bdrv_ref(bs); +/* Hold a guest back from writing while permissions are being reset. */ +bdrv_drained_begin(bs); +/* Drop permissions before the graph change. */ +s->active = false; +bdrv_child_refresh_perms(cor_filter_bs, child, &error_abort); +bdrv_replace_node(cor_filter_bs, bs, &error_abort); + +bdrv_drained_end(bs); +bdrv_unref(bs); +bdrv_unref(cor_filter_bs); +} + + block_init(bdrv_copy_on_read_init); -- 1.8.3.1
[PATCH v3 09/15] block: prepare block-stream for using COR-filter
This patch is the first one in the series where the COR-filter node will be hard-coded for using in the block-stream job. The block jobs may be run in parallel. Exclude conflicts with filter nodes used for a concurrent job while checking for the blocked operations. It incurs changes in the iotests 030::test_overlapping_4 that now passes with no conflict because the stream job does not have a real dependency on its base and on a filter above it. Signed-off-by: Andrey Shinkevich --- blockdev.c | 11 +-- tests/qemu-iotests/030 | 9 - 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/blockdev.c b/blockdev.c index b3c840e..97c2ba2 100644 --- a/blockdev.c +++ b/blockdev.c @@ -2763,6 +2763,7 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, Error *local_err = NULL; const char *base_name = NULL; int job_flags = JOB_DEFAULT; +BlockDriverState *bottom_cow_node; if (!has_on_error) { on_error = BLOCKDEV_ON_ERROR_REPORT; @@ -2807,8 +2808,14 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, base_name = base_bs->filename; } -/* Check for op blockers in the whole chain between bs and base */ -for (iter = bs; iter && iter != base_bs; iter = backing_bs(iter)) { +bottom_cow_node = bdrv_find_overlay(bs, base_bs); +if (!bottom_cow_node) { +error_setg(errp, "bottom node is not found, nothing to stream"); +goto out; +} +/* Check for op blockers in the whole chain between bs and bottom */ +for (iter = bs; iter && iter != bdrv_filtered_bs(bottom_cow_node); + iter = bdrv_filtered_bs(iter)) { if (bdrv_op_is_blocked(iter, BLOCK_OP_TYPE_STREAM, errp)) { goto out; } diff --git a/tests/qemu-iotests/030 b/tests/qemu-iotests/030 index 104e3ce..d7638cd 100755 --- a/tests/qemu-iotests/030 +++ b/tests/qemu-iotests/030 @@ -368,8 +368,7 @@ class TestParallelOps(iotests.QMPTestCase): self.wait_until_completed(drive='commit-drive0') # In this case the base node of the stream job is the same as the -# top node of commit job. 
Since this results in the commit filter -# node being part of the stream chain, this is not allowed. +# top node of commit job. def test_overlapping_4(self): self.assert_no_active_block_jobs() @@ -381,13 +380,13 @@ class TestParallelOps(iotests.QMPTestCase): # Stream from node2 into node4 result = self.vm.qmp('block-stream', device='node4', base_node='node2', job_id='node4') -self.assert_qmp(result, 'error/desc', -"Cannot freeze 'backing' link to 'commit-filter'") +self.assert_qmp(result, 'return', {}) result = self.vm.qmp('block-job-set-speed', device='drive0', speed=0) self.assert_qmp(result, 'return', {}) -self.wait_until_completed() +self.vm.run_job(job='drive0', auto_dismiss=True) +self.vm.run_job(job='node4', auto_dismiss=True) self.assert_no_active_block_jobs() # In this case the base node of the stream job is the commit job's -- 1.8.3.1
[PATCH v3 04/15] block: Add chain helper functions
From: Max Reitz Add some helper functions for skipping filters in a chain of block nodes. Signed-off-by: Max Reitz Reviewed-by: Vladimir Sementsov-Ogievskiy --- block.c | 55 +++ include/block/block_int.h | 3 +++ 2 files changed, 58 insertions(+) diff --git a/block.c b/block.c index b2aae2e..5b4ebfe 100644 --- a/block.c +++ b/block.c @@ -6863,3 +6863,58 @@ BdrvChild *bdrv_primary_child(BlockDriverState *bs) { return bdrv_filtered_rw_child(bs) ?: bs->file; } + +static BlockDriverState *bdrv_skip_filters(BlockDriverState *bs, + bool stop_on_explicit_filter) +{ +BdrvChild *filtered; + +if (!bs) { +return NULL; +} + +while (!(stop_on_explicit_filter && !bs->implicit)) { +filtered = bdrv_filtered_rw_child(bs); +if (!filtered) { +break; +} +bs = filtered->bs; +} +/* + * Note that this treats nodes with bs->drv == NULL as not being + * R/W filters (bs->drv == NULL should be replaced by something + * else anyway). + * The advantage of this behavior is that this function will thus + * always return a non-NULL value (given a non-NULL @bs). + */ + +return bs; +} + +/* + * Return the first BDS that has not been added implicitly or that + * does not have an RW-filtered child down the chain starting from @bs + * (including @bs itself). + */ +BlockDriverState *bdrv_skip_implicit_filters(BlockDriverState *bs) +{ +return bdrv_skip_filters(bs, true); +} + +/* + * Return the first BDS that does not have an RW-filtered child down + * the chain starting from @bs (including @bs itself). + */ +BlockDriverState *bdrv_skip_rw_filters(BlockDriverState *bs) +{ +return bdrv_skip_filters(bs, false); +} + +/* + * For a backing chain, return the first non-filter backing image of + * the first non-filter image. 
+ */ +BlockDriverState *bdrv_backing_chain_next(BlockDriverState *bs) +{ +return bdrv_skip_rw_filters(bdrv_filtered_cow_bs(bdrv_skip_rw_filters(bs))); +} diff --git a/include/block/block_int.h b/include/block/block_int.h index dca59e9..86f7666 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -1347,6 +1347,9 @@ BdrvChild *bdrv_filtered_child(BlockDriverState *bs); BdrvChild *bdrv_metadata_child(BlockDriverState *bs); BdrvChild *bdrv_storage_child(BlockDriverState *bs); BdrvChild *bdrv_primary_child(BlockDriverState *bs); +BlockDriverState *bdrv_skip_implicit_filters(BlockDriverState *bs); +BlockDriverState *bdrv_skip_rw_filters(BlockDriverState *bs); +BlockDriverState *bdrv_backing_chain_next(BlockDriverState *bs); static inline BlockDriverState *child_bs(BdrvChild *child) { -- 1.8.3.1
[PATCH v3 08/15] block: Use CAFs when working with backing chains
From: Max Reitz Use child access functions when iterating through backing chains so filters do not break the chain. Signed-off-by: Max Reitz --- block.c | 40 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/block.c b/block.c index 459412e..69e9152 100644 --- a/block.c +++ b/block.c @@ -4593,7 +4593,8 @@ int bdrv_change_backing_file(BlockDriverState *bs, } /* - * Finds the image layer in the chain that has 'bs' as its backing file. + * Finds the image layer in the chain that has 'bs' (or a filter on + * top of it) as its backing file. * * active is the current topmost image. * @@ -4605,11 +4606,18 @@ int bdrv_change_backing_file(BlockDriverState *bs, BlockDriverState *bdrv_find_overlay(BlockDriverState *active, BlockDriverState *bs) { -while (active && bs != backing_bs(active)) { -active = backing_bs(active); +bs = bdrv_skip_rw_filters(bs); +active = bdrv_skip_rw_filters(active); + +while (active) { +BlockDriverState *next = bdrv_backing_chain_next(active); +if (bs == next) { +return active; +} +active = next; } -return active; +return NULL; } /* Given a BDS, searches for the base layer. */ @@ -4761,9 +4769,7 @@ int bdrv_drop_intermediate(BlockDriverState *top, BlockDriverState *base, * other intermediate nodes have been dropped. * If 'top' is an implicit node (e.g. "commit_top") we should skip * it because no one inherits from it. We use explicit_top for that. 
*/ -while (explicit_top && explicit_top->implicit) { -explicit_top = backing_bs(explicit_top); -} +explicit_top = bdrv_skip_implicit_filters(explicit_top); update_inherits_from = bdrv_inherits_from_recursive(base, explicit_top); /* success - we can delete the intermediate states, and link top->base */ @@ -5205,7 +5211,7 @@ BlockDriverState *bdrv_lookup_bs(const char *device, bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base) { while (top && top != base) { -top = backing_bs(top); +top = bdrv_filtered_bs(top); } return top != NULL; @@ -5466,7 +5472,17 @@ BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs, is_protocol = path_has_protocol(backing_file); -for (curr_bs = bs; curr_bs->backing; curr_bs = curr_bs->backing->bs) { +/* + * Being largely a legacy function, skip any filters here + * (because filters do not have normal filenames, so they cannot + * match anyway; and allowing json:{} filenames is a bit out of + * scope). + */ +for (curr_bs = bdrv_skip_rw_filters(bs); + bdrv_filtered_cow_child(curr_bs) != NULL; + curr_bs = bdrv_backing_chain_next(curr_bs)) +{ +BlockDriverState *bs_below = bdrv_backing_chain_next(curr_bs); /* If either of the filename paths is actually a protocol, then * compare unmodified paths; otherwise make paths relative */ @@ -5474,7 +5490,7 @@ BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs, char *backing_file_full_ret; if (strcmp(backing_file, curr_bs->backing_file) == 0) { -retval = curr_bs->backing->bs; +retval = bs_below; break; } /* Also check against the full backing filename for the image */ @@ -5484,7 +5500,7 @@ BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs, bool equal = strcmp(backing_file, backing_file_full_ret) == 0; g_free(backing_file_full_ret); if (equal) { -retval = curr_bs->backing->bs; +retval = bs_below; break; } } @@ -5510,7 +5526,7 @@ BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs, g_free(filename_tmp); if (strcmp(backing_file_full, 
filename_full) == 0) { -retval = curr_bs->backing->bs; +retval = bs_below; break; } } -- 1.8.3.1
[PATCH v3 07/15] commit: Deal with filters when blocking intermediate nodes
From: Max Reitz This includes some permission limiting (for example, we only need to take the RESIZE permission if the base is smaller than the top). Signed-off-by: Max Reitz Signed-off-by: Andrey Shinkevich --- block/commit.c | 75 ++ 1 file changed, 60 insertions(+), 15 deletions(-) diff --git a/block/commit.c b/block/commit.c index e1f45a4..4df145d 100644 --- a/block/commit.c +++ b/block/commit.c @@ -37,6 +37,7 @@ typedef struct CommitBlockJob { BlockBackend *top; BlockBackend *base; BlockDriverState *base_bs; +BlockDriverState *above_base; BlockdevOnError on_error; bool base_read_only; bool chain_frozen; @@ -153,7 +154,7 @@ static int coroutine_fn commit_run(Job *job, Error **errp) break; } /* Copy if allocated above the base */ -ret = bdrv_is_allocated_above(blk_bs(s->top), blk_bs(s->base), false, +ret = bdrv_is_allocated_above(blk_bs(s->top), s->above_base, true, offset, COMMIT_BUFFER_SIZE, &n); copy = (ret == 1); trace_commit_one_iteration(s, offset, n, ret); @@ -253,15 +254,35 @@ void commit_start(const char *job_id, BlockDriverState *bs, CommitBlockJob *s; BlockDriverState *iter; BlockDriverState *commit_top_bs = NULL; +BlockDriverState *filtered_base; Error *local_err = NULL; +int64_t base_size, top_size; +uint64_t perms, iter_shared_perms; int ret; assert(top != bs); -if (top == base) { +if (bdrv_skip_rw_filters(top) == bdrv_skip_rw_filters(base)) { error_setg(errp, "Invalid files for merge: top and base are the same"); return; } +base_size = bdrv_getlength(base); +if (base_size < 0) { +error_setg_errno(errp, -base_size, "Could not inquire base image size"); +return; +} + +top_size = bdrv_getlength(top); +if (top_size < 0) { +error_setg_errno(errp, -top_size, "Could not inquire top image size"); +return; +} + +perms = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE; +if (base_size < top_size) { +perms |= BLK_PERM_RESIZE; +} + s = block_job_create(job_id, &commit_job_driver, NULL, bs, 0, BLK_PERM_ALL, speed, creation_flags, NULL, NULL, errp); if (!s) { @@ 
-301,17 +322,43 @@ void commit_start(const char *job_id, BlockDriverState *bs, s->commit_top_bs = commit_top_bs; -/* Block all nodes between top and base, because they will - * disappear from the chain after this operation. */ -assert(bdrv_chain_contains(top, base)); -for (iter = top; iter != base; iter = backing_bs(iter)) { -/* XXX BLK_PERM_WRITE needs to be allowed so we don't block ourselves - * at s->base (if writes are blocked for a node, they are also blocked - * for its backing file). The other options would be a second filter - * driver above s->base. */ +/* + * Block all nodes between top and base, because they will + * disappear from the chain after this operation. + * Note that this assumes that the user is fine with removing all + * nodes (including R/W filters) between top and base. Assuring + * this is the responsibility of the interface (i.e. whoever calls + * commit_start()). + */ +s->above_base = bdrv_find_overlay(top, base); +assert(s->above_base); + +/* + * The topmost node with + * bdrv_skip_rw_filters(filtered_base) == bdrv_skip_rw_filters(base) + */ +filtered_base = bdrv_filtered_cow_bs(s->above_base); +assert(bdrv_skip_rw_filters(filtered_base) == bdrv_skip_rw_filters(base)); + +/* + * XXX BLK_PERM_WRITE needs to be allowed so we don't block ourselves + * at s->base (if writes are blocked for a node, they are also blocked + * for its backing file). The other options would be a second filter + * driver above s->base. + */ +iter_shared_perms = BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE; + +for (iter = top; iter != base; iter = bdrv_filtered_bs(iter)) { +if (iter == filtered_base) { +/* + * From here on, all nodes are filters on the base. This + * allows us to share BLK_PERM_CONSISTENT_READ. 
+ */ +iter_shared_perms |= BLK_PERM_CONSISTENT_READ; +} + ret = block_job_add_bdrv(&s->common, "intermediate node", iter, 0, - BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE, - errp); + iter_shared_perms, errp); if (ret < 0) { goto fail; } @@ -328,9 +375,7 @@ void commit_start(const char *job_id, BlockDriverState *bs, } s->base = blk_new(s->common.job.aio_context, - BLK_PERM_CONSISTENT_READ - | BLK_PERM_WRITE - | BLK_PERM_RESIZE, +
[PATCH v3 14/15] iotests: prepare 245 for using filter in block-stream
The preliminary patch modifies the test 245 to prepare the block-stream job for using COR-filter. The filter breaks the backing chain being connected to the underlying node by file child link. Signed-off-by: Andrey Shinkevich --- tests/qemu-iotests/245 | 10 +++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/qemu-iotests/245 b/tests/qemu-iotests/245 index 4f5f0bb..5bcddc4 100755 --- a/tests/qemu-iotests/245 +++ b/tests/qemu-iotests/245 @@ -896,15 +896,19 @@ class TestBlockdevReopen(iotests.QMPTestCase): # make hd1 read-only and block-stream requires it to be read-write # (Which error message appears depends on whether the stream job is # already done with copying at this point.) -self.reopen(opts, {}, +opts = hd_opts(1) +opts['backing'] = hd_opts(2) +opts['backing']['backing'] = None +self.reopen(opts, {'read-only': True}, ["Can't set node 'hd1' to r/o with copy-on-read enabled", "Cannot make block node read-only, there is a writer on it"]) # We can't remove hd2 while the stream job is ongoing -opts['backing']['backing'] = None -self.reopen(opts, {'backing.read-only': False}, "Cannot change 'backing' link from 'hd1' to 'hd2'") +opts['backing'] = None +self.reopen(opts, {'read-only': False}, "Cannot change 'backing' link from 'hd1' to 'hd2'") # We can detach hd1 from hd0 because it doesn't affect the stream job +opts = hd_opts(0) opts['backing'] = None self.reopen(opts) -- 1.8.3.1
[PATCH v3 10/15] copy-on-read: Support change filename functions
The COR-filter driver should support a redirecting function to refresh filenames. Otherwise, a file name of the filter will be copied instead of the one of a data node. It is also true for the function bdrv_change_backing_file(). Signed-off-by: Andrey Shinkevich --- block/copy-on-read.c | 18 ++ 1 file changed, 18 insertions(+) diff --git a/block/copy-on-read.c b/block/copy-on-read.c index c4fa468..74e01ee 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -21,6 +21,7 @@ */ #include "qemu/osdep.h" +#include "qemu/cutils.h" #include "block/block_int.h" #include "qemu/module.h" @@ -128,6 +129,21 @@ static void cor_lock_medium(BlockDriverState *bs, bool locked) } +static void cor_refresh_filename(BlockDriverState *bs) +{ +pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), +bs->file->bs->filename); +} + + +static int cor_change_backing_file(BlockDriverState *bs, + const char *backing_file, + const char *backing_fmt) +{ +return bdrv_change_backing_file(bs->file->bs, backing_file, backing_fmt); +} + + static BlockDriver bdrv_copy_on_read = { .format_name= "copy-on-read", @@ -146,6 +162,8 @@ static BlockDriver bdrv_copy_on_read = { .bdrv_lock_medium = cor_lock_medium, .bdrv_co_block_status = bdrv_co_block_status_from_file, +.bdrv_refresh_filename = cor_refresh_filename, +.bdrv_change_backing_file = cor_change_backing_file, .has_variable_length= true, .is_filter = true, -- 1.8.3.1
[PATCH v3 03/15] block: Add child access functions
From: Max Reitz There are BDS children that the general block layer code can access, namely bs->file and bs->backing. Since the introduction of filters and external data files, their meaning is not quite clear. bs->backing can be a COW source, or it can be an R/W-filtered child; bs->file can be an R/W-filtered child, it can be data and metadata storage, or it can be just metadata storage. This overloading really is not helpful. This patch adds function that retrieve the correct child for each exact purpose. Later patches in this series will make use of them. Doing so will allow us to handle filter nodes and external data files in a meaningful way. Signed-off-by: Max Reitz Reviewed-by: Vladimir Sementsov-Ogievskiy Signed-off-by: Andrey Shinkevich --- block.c | 99 +++ include/block/block_int.h | 57 +-- 2 files changed, 153 insertions(+), 3 deletions(-) diff --git a/block.c b/block.c index 0653ccb..b2aae2e 100644 --- a/block.c +++ b/block.c @@ -6764,3 +6764,102 @@ void bdrv_del_child(BlockDriverState *parent_bs, BdrvChild *child, Error **errp) parent_bs->drv->bdrv_del_child(parent_bs, child, errp); } + +/* + * Return the child that @bs acts as an overlay for, and from which data may be + * copied in COW or COR operations. Usually this is the backing file. + */ +BdrvChild *bdrv_filtered_cow_child(BlockDriverState *bs) +{ +if (!bs || !bs->drv) { +return NULL; +} + +if (bs->drv->is_filter) { +return NULL; +} + +return bs->backing; +} + +/* + * If @bs acts as a pass-through filter for one of its children, + * return that child. "Pass-through" means that write operations to + * @bs are forwarded to that child instead of triggering COW. 
+ */ +BdrvChild *bdrv_filtered_rw_child(BlockDriverState *bs) +{ +if (!bs || !bs->drv) { +return NULL; +} + +if (!bs->drv->is_filter) { +return NULL; +} + +/* Only one of @backing or @file may be used */ +assert(!(bs->backing && bs->file)); + +return bs->backing ?: bs->file; +} + +/* + * Return any filtered child, independently of how it reacts to write + * accesses and whether data is copied onto this BDS through COR. + */ +BdrvChild *bdrv_filtered_child(BlockDriverState *bs) +{ +BdrvChild *cow_child = bdrv_filtered_cow_child(bs); +BdrvChild *rw_child = bdrv_filtered_rw_child(bs); + +/* There can only be one filtered child at a time */ +assert(!(cow_child && rw_child)); + +return cow_child ?: rw_child; +} + +/* + * Return the child that stores the metadata for this node. + */ +BdrvChild *bdrv_metadata_child(BlockDriverState *bs) +{ +if (!bs || !bs->drv) { +return NULL; +} + +/* Filters do not have metadata */ +if (bs->drv->is_filter) { +return NULL; +} + +return bs->file; +} + +/* + * Return the child that stores the data that is allocated on this + * node. This may or may not include metadata. + */ +BdrvChild *bdrv_storage_child(BlockDriverState *bs) +{ +if (!bs || !bs->drv) { +return NULL; +} + +if (bs->drv->bdrv_storage_child) { +return bs->drv->bdrv_storage_child(bs); +} + +return bdrv_filtered_rw_child(bs) ?: bs->file; +} + +/* + * Return the primary child of this node: For filters, that is the + * filtered child. For other nodes, that is usually the child storing + * metadata. + * (A generally more helpful description is that this is (usually) the + * child that has the same filename as @bs.) 
+ */ +BdrvChild *bdrv_primary_child(BlockDriverState *bs) +{ +return bdrv_filtered_rw_child(bs) ?: bs->file; +} diff --git a/include/block/block_int.h b/include/block/block_int.h index df6d027..dca59e9 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -89,9 +89,11 @@ struct BlockDriver { int instance_size; /* set to true if the BlockDriver is a block filter. Block filters pass - * certain callbacks that refer to data (see block.c) to their bs->file if - * the driver doesn't implement them. Drivers that do not wish to forward - * must implement them and return -ENOTSUP. + * certain callbacks that refer to data (see block.c) to their bs->file + * or bs->backing (whichever one exists) if the driver doesn't implement + * them. Drivers that do not wish to forward must implement them and return + * -ENOTSUP. + * Note that filters are not allowed to modify data. */ bool is_filter; /* @@ -585,6 +587,13 @@ struct BlockDriver { * If this pointer is NULL, the array is considered empty. * "filename" and "driver" are always considered strong. */ const char *const *strong_runtime_opts; + +/** + * Return the data storage child, if there is exactly one. If + * this function is not implemented, the block layer will assume + * bs->file to be this child. + */ +BdrvChild *(*bdrv_storage_child)(
[PATCH v3 06/15] block: Use CAFs in block status functions
From: Max Reitz Use the child access functions in the block status inquiry functions as appropriate. Signed-off-by: Max Reitz Reviewed-by: Vladimir Sementsov-Ogievskiy --- block/io.c | 19 ++- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/block/io.c b/block/io.c index 7d30e61..47d8096 100644 --- a/block/io.c +++ b/block/io.c @@ -2387,11 +2387,12 @@ static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs, if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { ret |= BDRV_BLOCK_ALLOCATED; } else if (want_zero) { +BlockDriverState *cow_bs = bdrv_filtered_cow_bs(bs); + if (bdrv_unallocated_blocks_are_zero(bs)) { ret |= BDRV_BLOCK_ZERO; -} else if (bs->backing) { -BlockDriverState *bs2 = bs->backing->bs; -int64_t size2 = bdrv_getlength(bs2); +} else if (cow_bs) { +int64_t size2 = bdrv_getlength(cow_bs); if (size2 >= 0 && offset >= size2) { ret |= BDRV_BLOCK_ZERO; @@ -2457,7 +2458,7 @@ static int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs, bool first = true; assert(bs != base); -for (p = bs; p != base; p = backing_bs(p)) { +for (p = bs; p != base; p = bdrv_filtered_bs(p)) { ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map, file); if (ret < 0) { @@ -2543,7 +2544,7 @@ int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base, int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes, int64_t *pnum, int64_t *map, BlockDriverState **file) { -return bdrv_block_status_above(bs, backing_bs(bs), +return bdrv_block_status_above(bs, bdrv_filtered_bs(bs), offset, bytes, pnum, map, file); } @@ -2553,9 +2554,9 @@ int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset, int ret; int64_t dummy; -ret = bdrv_common_block_status_above(bs, backing_bs(bs), false, offset, - bytes, pnum ? pnum : &dummy, NULL, - NULL); +ret = bdrv_common_block_status_above(bs, bdrv_filtered_bs(bs), false, + offset, bytes, pnum ? 
pnum : &dummy, + NULL, NULL); if (ret < 0) { return ret; } @@ -2618,7 +2619,7 @@ int bdrv_is_allocated_above(BlockDriverState *top, break; } -intermediate = backing_bs(intermediate); +intermediate = bdrv_filtered_bs(intermediate); } *pnum = n; -- 1.8.3.1
[PATCH v3 05/15] block: Include filters when freezing backing chain
From: Max Reitz In order to make filters work in backing chains, the associated functions must be able to deal with them and freeze all filter links, be they COW or R/W filter links. In the process, rename these functions to reflect that they now act on generalized chains of filter nodes instead of backing chains alone. While at it, add some comments that note which functions require their caller to ensure that a given child link is not frozen, and how the callers do so. Signed-off-by: Max Reitz Signed-off-by: Andrey Shinkevich --- block.c | 81 ++- block/commit.c| 8 ++--- block/mirror.c| 4 +-- block/stream.c| 8 ++--- include/block/block.h | 6 ++-- 5 files changed, 60 insertions(+), 47 deletions(-) diff --git a/block.c b/block.c index 5b4ebfe..459412e 100644 --- a/block.c +++ b/block.c @@ -2513,12 +2513,15 @@ static void bdrv_replace_child_noperm(BdrvChild *child, * If @new_bs is not NULL, bdrv_check_perm() must be called beforehand, as this * function uses bdrv_set_perm() to update the permissions according to the new * reference that @new_bs gets. + * + * Callers must ensure that child->frozen is false. */ static void bdrv_replace_child(BdrvChild *child, BlockDriverState *new_bs) { BlockDriverState *old_bs = child->bs; uint64_t perm, shared_perm; +/* Asserts that child->frozen == false */ bdrv_replace_child_noperm(child, new_bs); /* @@ -2676,6 +2679,7 @@ static void bdrv_detach_child(BdrvChild *child) g_free(child); } +/* Callers must ensure that child->frozen is false. */ void bdrv_root_unref_child(BdrvChild *child) { BlockDriverState *child_bs; @@ -2685,10 +2689,6 @@ void bdrv_root_unref_child(BdrvChild *child) bdrv_unref(child_bs); } -/** - * Clear all inherits_from pointers from children and grandchildren of - * @root that point to @root, where necessary. 
- */ static void bdrv_unset_inherits_from(BlockDriverState *root, BdrvChild *child) { BdrvChild *c; @@ -2713,6 +2713,7 @@ static void bdrv_unset_inherits_from(BlockDriverState *root, BdrvChild *child) } } +/* Callers must ensure that child->frozen is false. */ void bdrv_unref_child(BlockDriverState *parent, BdrvChild *child) { if (child == NULL) { @@ -2756,7 +2757,7 @@ void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd, bool update_inherits_from = bdrv_chain_contains(bs, backing_hd) && bdrv_inherits_from_recursive(backing_hd, bs); -if (bdrv_is_backing_chain_frozen(bs, backing_bs(bs), errp)) { +if (bdrv_is_chain_frozen(bs, child_bs(bs->backing), errp)) { return; } @@ -2765,6 +2766,7 @@ void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd, } if (bs->backing) { +/* Cannot be frozen, we checked that above */ bdrv_unref_child(bs, bs->backing); bs->backing = NULL; } @@ -3912,8 +3914,7 @@ static int bdrv_reopen_parse_backing(BDRVReopenState *reopen_state, return -EPERM; } /* Check if the backing link that we want to replace is frozen */ -if (bdrv_is_backing_chain_frozen(overlay_bs, backing_bs(overlay_bs), - errp)) { +if (bdrv_is_chain_frozen(overlay_bs, backing_bs(overlay_bs), errp)) { return -EPERM; } reopen_state->replace_backing_bs = true; @@ -4248,6 +4249,7 @@ static void bdrv_close(BlockDriverState *bs) if (bs->drv) { if (bs->drv->bdrv_close) { +/* Must unfreeze all children, so bdrv_unref_child() works */ bs->drv->bdrv_close(bs); } bs->drv = NULL; @@ -4617,20 +4619,22 @@ BlockDriverState *bdrv_find_base(BlockDriverState *bs) } /* - * Return true if at least one of the backing links between @bs and - * @base is frozen. @errp is set if that's the case. + * Return true if at least one of the (COW and R/W) filter links + * between @bs and @base is frozen. @errp is set if that's the case. * @base must be reachable from @bs, or NULL. 
*/ -bool bdrv_is_backing_chain_frozen(BlockDriverState *bs, BlockDriverState *base, - Error **errp) +bool bdrv_is_chain_frozen(BlockDriverState *bs, BlockDriverState *base, + Error **errp) { BlockDriverState *i; +BdrvChild *child; + +for (i = bs; i != base; i = child_bs(child)) { +child = bdrv_filtered_child(i); -for (i = bs; i != base; i = backing_bs(i)) { -if (i->backing && i->backing->frozen) { +if (child && child->frozen) { error_setg(errp, "Cannot change '%s' link from '%s' to '%s'", - i->backing->name, i->node_name, - backing_bs(i)->node_name); + child->name, i->node_name, child->bs->node_name); return true; } } @@ -4639
[PATCH v3 15/15] block: apply COR-filter to block-stream jobs
The patch completes the series with the COR-filter insertion to any block-stream operation. It also makes changes to the iotests 030 and 141.out. Signed-off-by: Andrey Shinkevich --- block/stream.c | 83 -- tests/qemu-iotests/030 | 8 +++-- tests/qemu-iotests/141.out | 2 +- 3 files changed, 64 insertions(+), 29 deletions(-) diff --git a/block/stream.c b/block/stream.c index e0b35f8..a74a07b 100644 --- a/block/stream.c +++ b/block/stream.c @@ -19,6 +19,7 @@ #include "qapi/qmp/qerror.h" #include "qemu/ratelimit.h" #include "sysemu/block-backend.h" +#include "block/copy-on-read.h" enum { /* @@ -32,8 +33,11 @@ enum { typedef struct StreamBlockJob { BlockJob common; BlockDriverState *bottom; +BlockDriverState *cor_filter_bs; +BlockDriverState *target_bs; BlockdevOnError on_error; char *backing_file_str; +char *base_fmt; bool bs_read_only; bool chain_frozen; } StreamBlockJob; @@ -52,33 +56,25 @@ static void stream_abort(Job *job) StreamBlockJob *s = container_of(job, StreamBlockJob, common.job); if (s->chain_frozen) { -BlockJob *bjob = &s->common; -bdrv_unfreeze_chain(blk_bs(bjob->blk), s->bottom); +bdrv_unfreeze_chain(s->cor_filter_bs, s->bottom); } } static int stream_prepare(Job *job) { StreamBlockJob *s = container_of(job, StreamBlockJob, common.job); -BlockJob *bjob = &s->common; -BlockDriverState *bs = blk_bs(bjob->blk); +BlockDriverState *bs = s->target_bs; BlockDriverState *base = backing_bs(s->bottom); Error *local_err = NULL; int ret = 0; -bdrv_unfreeze_chain(bs, s->bottom); +bdrv_unfreeze_chain(s->cor_filter_bs, s->bottom); s->chain_frozen = false; if (bs->backing) { -const char *base_id = NULL, *base_fmt = NULL; -if (base) { -base_id = s->backing_file_str; -if (base->drv) { -base_fmt = base->drv->format_name; -} -} bdrv_set_backing_hd(bs, base, &local_err); -ret = bdrv_change_backing_file(bs, base_id, base_fmt); +ret = bdrv_change_backing_file(bs, s->backing_file_str, + s->base_fmt); if (local_err) { error_report_err(local_err); return -EPERM; @@ -92,7 +88,9 @@ 
static void stream_clean(Job *job) { StreamBlockJob *s = container_of(job, StreamBlockJob, common.job); BlockJob *bjob = &s->common; -BlockDriverState *bs = blk_bs(bjob->blk); +BlockDriverState *bs = s->target_bs; + +bdrv_cor_filter_drop(s->cor_filter_bs); /* Reopen the image back in read-only mode if necessary */ if (s->bs_read_only) { @@ -102,13 +100,14 @@ static void stream_clean(Job *job) } g_free(s->backing_file_str); +g_free(s->base_fmt); } static int coroutine_fn stream_run(Job *job, Error **errp) { StreamBlockJob *s = container_of(job, StreamBlockJob, common.job); BlockBackend *blk = s->common.blk; -BlockDriverState *bs = blk_bs(blk); +BlockDriverState *bs = s->target_bs; bool enable_cor = !backing_bs(s->bottom); int64_t len; int64_t offset = 0; @@ -156,8 +155,8 @@ static int coroutine_fn stream_run(Job *job, Error **errp) } else if (ret >= 0) { /* Copy if allocated in the intermediate images. Limit to the * known-unallocated area [offset, offset+n*BDRV_SECTOR_SIZE). */ -ret = bdrv_is_allocated_above(backing_bs(bs), s->bottom, true, - offset, n, &n); +ret = bdrv_is_allocated_above(bdrv_filtered_cow_bs(bs), s->bottom, + true, offset, n, &n); /* Finish early if end of backing file has been reached */ if (ret == 0 && n == 0) { n = len - offset; @@ -225,7 +224,13 @@ void stream_start(const char *job_id, BlockDriverState *bs, BlockDriverState *iter; bool bs_read_only; int basic_flags = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED; +BlockDriverState *cor_filter_bs = NULL; BlockDriverState *bottom = bdrv_find_overlay(bs, base); +char *base_fmt = NULL; + +if (base && base->drv) { +base_fmt = g_strdup(base->drv->format_name); +} if (bdrv_freeze_chain(bs, bottom, errp) < 0) { return; @@ -240,17 +245,35 @@ void stream_start(const char *job_id, BlockDriverState *bs, } } -/* Prevent concurrent jobs trying to modify the graph structure here, we - * already have our own plans. 
Also don't allow resize as the image size is - * queried only at the job start and then cached. */ -s = block_job_create(job_id, &stream_job_driver, NULL, bs, - basic_flags | BLK_PERM_GRAPH_MOD, - basic_flags | BLK_PERM_WRITE, +cor_filter_bs = bdrv_cor_filter_append(bs,
[PATCH v3 01/15] block: Mark commit and mirror as filter drivers
From: Max Reitz The commit and mirror block nodes are filters, so they should be marked as such. (Strictly speaking, BDS.is_filter's documentation states that a filter's child must be bs->file. The following patch will relax this restriction, however.) Signed-off-by: Max Reitz Reviewed-by: Alberto Garcia Reviewed-by: Eric Blake Reviewed-by: Vladimir Sementsov-Ogievskiy Signed-off-by: Andrey Shinkevich --- block/commit.c | 2 ++ block/mirror.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/block/commit.c b/block/commit.c index 87f6096..445a280 100644 --- a/block/commit.c +++ b/block/commit.c @@ -240,6 +240,8 @@ static BlockDriver bdrv_commit_top = { .bdrv_co_block_status = bdrv_co_block_status_from_backing, .bdrv_refresh_filename = bdrv_commit_top_refresh_filename, .bdrv_child_perm= bdrv_commit_top_child_perm, + +.is_filter = true, }; void commit_start(const char *job_id, BlockDriverState *bs, diff --git a/block/mirror.c b/block/mirror.c index aca95c9..b6de24b 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -1527,6 +1527,8 @@ static BlockDriver bdrv_mirror_top = { .bdrv_co_block_status = bdrv_co_block_status_from_backing, .bdrv_refresh_filename = bdrv_mirror_top_refresh_filename, .bdrv_child_perm= bdrv_mirror_top_child_perm, + +.is_filter = true, }; static BlockJob *mirror_start_job( -- 1.8.3.1
[PATCH v3 13/15] qapi: add filter-node-name to block-stream
Provide the possibility to pass the 'filter-node-name' parameter to the block-stream job as it is done for the commit block job. That will be needed for further iotests implementations. Signed-off-by: Andrey Shinkevich --- block/monitor/block-hmp-cmds.c | 4 ++-- block/stream.c | 4 +++- blockdev.c | 8 +++- include/block/block_int.h | 7 ++- qapi/block-core.json | 6 ++ 5 files changed, 24 insertions(+), 5 deletions(-) diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c index 4c8c375..6f6636b 100644 --- a/block/monitor/block-hmp-cmds.c +++ b/block/monitor/block-hmp-cmds.c @@ -507,8 +507,8 @@ void hmp_block_stream(Monitor *mon, const QDict *qdict) qmp_block_stream(true, device, device, base != NULL, base, false, NULL, false, NULL, qdict_haskey(qdict, "speed"), speed, true, - BLOCKDEV_ON_ERROR_REPORT, false, false, false, false, - &error); + BLOCKDEV_ON_ERROR_REPORT, false, NULL, false, false, false, + false, &error); hmp_handle_error(mon, error); } diff --git a/block/stream.c b/block/stream.c index 3cca1f9..e0b35f8 100644 --- a/block/stream.c +++ b/block/stream.c @@ -217,7 +217,9 @@ static const BlockJobDriver stream_job_driver = { void stream_start(const char *job_id, BlockDriverState *bs, BlockDriverState *base, const char *backing_file_str, int creation_flags, int64_t speed, - BlockdevOnError on_error, Error **errp) + BlockdevOnError on_error, + const char *filter_node_name, + Error **errp) { StreamBlockJob *s; BlockDriverState *iter; diff --git a/blockdev.c b/blockdev.c index 97c2ba2..67918fc 100644 --- a/blockdev.c +++ b/blockdev.c @@ -2753,6 +2753,7 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, bool has_backing_file, const char *backing_file, bool has_speed, int64_t speed, bool has_on_error, BlockdevOnError on_error, + bool has_filter_node_name, const char *filter_node_name, bool has_auto_finalize, bool auto_finalize, bool has_auto_dismiss, bool auto_dismiss, Error **errp) @@ -2769,6 +2770,10 @@ void 
qmp_block_stream(bool has_job_id, const char *job_id, const char *device, on_error = BLOCKDEV_ON_ERROR_REPORT; } +if (!has_filter_node_name) { +filter_node_name = NULL; +} + bs = bdrv_lookup_bs(device, device, errp); if (!bs) { return; @@ -2840,7 +2845,8 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, } stream_start(has_job_id ? job_id : NULL, bs, base_bs, base_name, - job_flags, has_speed ? speed : 0, on_error, &local_err); + job_flags, has_speed ? speed : 0, on_error, + filter_node_name, &local_err); if (local_err) { error_propagate(errp, local_err); goto out; diff --git a/include/block/block_int.h b/include/block/block_int.h index 86f7666..0fdfa9c 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -1094,6 +1094,9 @@ int is_windows_drive(const char *filename); * See @BlockJobCreateFlags * @speed: The maximum speed, in bytes per second, or 0 for unlimited. * @on_error: The action to take upon error. + * @filter_node_name: The node name that should be assigned to the filter + * driver that the commit job inserts into the graph above @bs. NULL means + * that a node name should be autogenerated. * @errp: Error object. * * Start a streaming operation on @bs. Clusters that are unallocated @@ -1106,7 +1109,9 @@ int is_windows_drive(const char *filename); void stream_start(const char *job_id, BlockDriverState *bs, BlockDriverState *base, const char *backing_file_str, int creation_flags, int64_t speed, - BlockdevOnError on_error, Error **errp); + BlockdevOnError on_error, + const char *filter_node_name, + Error **errp); /** * commit_start: diff --git a/qapi/block-core.json b/qapi/block-core.json index 943df19..df6981e 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -2509,6 +2509,11 @@ #'stop' and 'enospc' can only be used if the block device #supports io-status (see BlockInfo). Since 1.3. 
# +# @filter-node-name: the node name that should be assigned to the +#filter driver that the stream job inserts into the graph +#above @device. If this option is not given, a node name is +#autogenerated. (Since: 5.1) +# # @auto-finalize: When false, this job will wait in a PENDIN
Re: [PATCH v2 1/4] iotests/109: Don't mirror with mismatched size
11.05.2020 16:58, Kevin Wolf wrote: This patch makes the raw image the same size as the file in a different format that is mirrored as raw to it to avoid errors when mirror starts to enforce that source and target are the same size. We check only that the first 512 bytes are zeroed (instead of 64k) because some image formats create image files that are smaller than 64k, so trying to read 64k would result in I/O errors. Apart from this, 512 is more appropriate anyway because the raw format driver protects specifically the first 512 bytes. Signed-off-by: Kevin Wolf Reviewed-by: Vladimir Sementsov-Ogievskiy -- Best regards, Vladimir
Re: [PATCH] hw: Use QEMU_IS_ALIGNED() on parallel flash block size
On Mon, May 11, 2020 at 1:54 PM Philippe Mathieu-Daudé wrote: > > Use the QEMU_IS_ALIGNED() macro to verify the flash block size > is properly aligned. It is quicker to process when reviewing. > > Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Alistair Francis Alistair > --- > hw/arm/sbsa-ref.c | 2 +- > hw/arm/virt.c | 2 +- > hw/block/pflash_cfi01.c | 2 +- > hw/block/pflash_cfi02.c | 2 +- > hw/i386/pc_sysfw.c | 2 +- > hw/riscv/virt.c | 2 +- > 6 files changed, 6 insertions(+), 6 deletions(-) > > diff --git a/hw/arm/sbsa-ref.c b/hw/arm/sbsa-ref.c > index 8409ba853d..b379e4a76a 100644 > --- a/hw/arm/sbsa-ref.c > +++ b/hw/arm/sbsa-ref.c > @@ -241,7 +241,7 @@ static void sbsa_flash_map1(PFlashCFI01 *flash, > { > DeviceState *dev = DEVICE(flash); > > -assert(size % SBSA_FLASH_SECTOR_SIZE == 0); > +assert(QEMU_IS_ALIGNED(size, SBSA_FLASH_SECTOR_SIZE)); > assert(size / SBSA_FLASH_SECTOR_SIZE <= UINT32_MAX); > qdev_prop_set_uint32(dev, "num-blocks", size / SBSA_FLASH_SECTOR_SIZE); > qdev_init_nofail(dev); > diff --git a/hw/arm/virt.c b/hw/arm/virt.c > index 634db0cfe9..0a99fddb3d 100644 > --- a/hw/arm/virt.c > +++ b/hw/arm/virt.c > @@ -978,7 +978,7 @@ static void virt_flash_map1(PFlashCFI01 *flash, > { > DeviceState *dev = DEVICE(flash); > > -assert(size % VIRT_FLASH_SECTOR_SIZE == 0); > +assert(QEMU_IS_ALIGNED(size, VIRT_FLASH_SECTOR_SIZE)); > assert(size / VIRT_FLASH_SECTOR_SIZE <= UINT32_MAX); > qdev_prop_set_uint32(dev, "num-blocks", size / VIRT_FLASH_SECTOR_SIZE); > qdev_init_nofail(dev); > diff --git a/hw/block/pflash_cfi01.c b/hw/block/pflash_cfi01.c > index f586bac269..11922c0f96 100644 > --- a/hw/block/pflash_cfi01.c > +++ b/hw/block/pflash_cfi01.c > @@ -964,7 +964,7 @@ PFlashCFI01 *pflash_cfi01_register(hwaddr base, > if (blk) { > qdev_prop_set_drive(dev, "drive", blk, &error_abort); > } > -assert(size % sector_len == 0); > +assert(QEMU_IS_ALIGNED(size, sector_len)); > qdev_prop_set_uint32(dev, "num-blocks", size / sector_len); > qdev_prop_set_uint64(dev, 
"sector-length", sector_len); > qdev_prop_set_uint8(dev, "width", bank_width); > diff --git a/hw/block/pflash_cfi02.c b/hw/block/pflash_cfi02.c > index c6b6f2d082..895f7daee3 100644 > --- a/hw/block/pflash_cfi02.c > +++ b/hw/block/pflash_cfi02.c > @@ -1003,7 +1003,7 @@ PFlashCFI02 *pflash_cfi02_register(hwaddr base, > if (blk) { > qdev_prop_set_drive(dev, "drive", blk, &error_abort); > } > -assert(size % sector_len == 0); > +assert(QEMU_IS_ALIGNED(size, sector_len)); > qdev_prop_set_uint32(dev, "num-blocks", size / sector_len); > qdev_prop_set_uint32(dev, "sector-length", sector_len); > qdev_prop_set_uint8(dev, "width", width); > diff --git a/hw/i386/pc_sysfw.c b/hw/i386/pc_sysfw.c > index f5f3f466b0..fad41f0e73 100644 > --- a/hw/i386/pc_sysfw.c > +++ b/hw/i386/pc_sysfw.c > @@ -168,7 +168,7 @@ static void pc_system_flash_map(PCMachineState *pcms, > blk_name(blk), strerror(-size)); > exit(1); > } > -if (size == 0 || size % FLASH_SECTOR_SIZE != 0) { > +if (size == 0 || !QEMU_IS_ALIGNED(size, FLASH_SECTOR_SIZE)) { > error_report("system firmware block device %s has invalid size " > "%" PRId64, > blk_name(blk), size); > diff --git a/hw/riscv/virt.c b/hw/riscv/virt.c > index daae3ebdbb..71481d59c2 100644 > --- a/hw/riscv/virt.c > +++ b/hw/riscv/virt.c > @@ -112,7 +112,7 @@ static void virt_flash_map1(PFlashCFI01 *flash, > { > DeviceState *dev = DEVICE(flash); > > -assert(size % VIRT_FLASH_SECTOR_SIZE == 0); > +assert(QEMU_IS_ALIGNED(size, VIRT_FLASH_SECTOR_SIZE)); > assert(size / VIRT_FLASH_SECTOR_SIZE <= UINT32_MAX); > qdev_prop_set_uint32(dev, "num-blocks", size / VIRT_FLASH_SECTOR_SIZE); > qdev_init_nofail(dev); > -- > 2.21.3 > >
Re: [PATCH v5 13/15] acpi: drop build_piix4_pm()
On Tue, 12 May 2020 13:16:05 +0200 Gerd Hoffmann wrote: > On Mon, May 11, 2020 at 09:37:32PM +0200, Igor Mammedov wrote: > > On Thu, 7 May 2020 15:16:38 +0200 > > Gerd Hoffmann wrote: > > > > > The _SB.PCI0.PX13.P13C opregion (holds isa device enable bits) > > > is not used any more, remove it from DSDT. > > > > > > Signed-off-by: Gerd Hoffmann > > > --- > > > hw/i386/acpi-build.c | 16 > > > 1 file changed, 16 deletions(-) > > > > > > diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c > > > index 765409a90eb6..c1e63cce5e8e 100644 > > > --- a/hw/i386/acpi-build.c > > > +++ b/hw/i386/acpi-build.c > > > @@ -1441,21 +1441,6 @@ static void build_q35_isa_bridge(Aml *table) > > > aml_append(table, scope); > > > } > > > > > > -static void build_piix4_pm(Aml *table) > > > -{ > > > -Aml *dev; > > > -Aml *scope; > > > - > > > -scope = aml_scope("_SB.PCI0"); > > > -dev = aml_device("PX13"); > > > -aml_append(dev, aml_name_decl("_ADR", aml_int(0x00010003))); > > I agree about removing P13C but I'm not sure if it's safe to remove > > whole isa bridge > > The isa bridge is _SB0.PCI0.ISA (piix4 function 0). > _SB.PCI0.PX13 is the acpi pm device (piix4 function 3). > > So this does _not_ remove the isa bridge. And given that PX13 has > nothing declared beside the opregion we are removing and I have not > found any other references to the PX13 device I assumed it is ok to drop > that one too. Didn't notice any odd side effects in testing. I don't see HID or some other way to tell guest that it should load a driver for it so probably we can remove it. If it breaks some legacy OS we can always add it back. > > take care, > Gerd >
Re: [RFC PATCH 1/3] block: Factor out bdrv_run_co()
On 5/12/20 9:43 AM, Kevin Wolf wrote: We have a few bdrv_*() functions that can either spawn a new coroutine and wait for it with BDRV_POLL_WHILE() or use a fastpath if they are already running in a coroutine. All of them basically duplicate the same code. Factor the common code into a new function bdrv_run_co(). Signed-off-by: Kevin Wolf --- block/io.c | 104 +++-- 1 file changed, 28 insertions(+), 76 deletions(-) diff --git a/block/io.c b/block/io.c index 7d30e61edc..c1badaadc9 100644 --- a/block/io.c +++ b/block/io.c @@ -891,6 +891,22 @@ static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset, return 0; } +static int bdrv_run_co(BlockDriverState *bs, CoroutineEntry *entry, + void *opaque, int *ret) +{ +if (qemu_in_coroutine()) { +/* Fast-path if already in coroutine context */ +entry(opaque); +} else { +Coroutine *co = qemu_coroutine_create(entry, opaque); +*ret = NOT_DONE; +bdrv_coroutine_enter(bs, co); +BDRV_POLL_WHILE(bs, *ret == NOT_DONE); For my reference, NOT_DONE is defined as INT_MAX, which does not seem to be used as a return value in other situations. @@ -923,25 +939,15 @@ static int bdrv_prwv_co(BdrvChild *child, int64_t offset, QEMUIOVector *qiov, bool is_write, BdrvRequestFlags flags) { -Coroutine *co; RwCo rwco = { .child = child, .offset = offset, .qiov = qiov, .is_write = is_write, -.ret = NOT_DONE, .flags = flags, }; -if (qemu_in_coroutine()) { -/* Fast-path if already in coroutine context */ -bdrv_rw_co_entry(&rwco); -} else { -co = qemu_coroutine_create(bdrv_rw_co_entry, &rwco); -bdrv_coroutine_enter(child->bs, co); -BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE); -} -return rwco.ret; +return bdrv_run_co(child->bs, bdrv_rw_co_entry, &rwco, &rwco.ret); So code that previously looped on NOT_DONE is obviously safe, while...
} int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset, @@ -2230,7 +2236,6 @@ typedef struct BdrvCoBlockStatusData { int64_t *map; BlockDriverState **file; int ret; -bool done; } BdrvCoBlockStatusData; int coroutine_fn bdrv_co_block_status_from_file(BlockDriverState *bs, @@ -2492,7 +2497,6 @@ static void coroutine_fn bdrv_block_status_above_co_entry(void *opaque) data->want_zero, data->offset, data->bytes, data->pnum, data->map, data->file); -data->done = true; aio_wait_kick(); ...code that looped on something else now has to be checked that data->ret is still being set to something useful. Fortunately that is true here. @@ -2669,22 +2663,13 @@ static inline int bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos, bool is_read) { -if (qemu_in_coroutine()) { -return bdrv_co_rw_vmstate(bs, qiov, pos, is_read); -} else { -BdrvVmstateCo data = { -.bs = bs, -.qiov = qiov, -.pos= pos, -.is_read= is_read, -.ret= -EINPROGRESS, -}; -Coroutine *co = qemu_coroutine_create(bdrv_co_rw_vmstate_entry, &data); - -bdrv_coroutine_enter(bs, co); -BDRV_POLL_WHILE(bs, data.ret == -EINPROGRESS); -return data.ret; It's a little harder to see whether -EINPROGRESS might ever be returned by a driver, but again this looks safe. Here, it's a little less obvious whether any driver might return -EINPROGRESS, but it looks like if they did that, Reviewed-by: Eric Blake Conflicts with Vladimir's patches which try to add more coroutine wrappers (but those need a rebase anyway): https://lists.gnu.org/archive/html/qemu-devel/2020-04/msg04559.html -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
Re: [PATCH v2 2/4] iotests/229: Use blkdebug to inject an error
11.05.2020 16:58, Kevin Wolf wrote: 229 relies on the mirror running into an I/O error when the target is smaller than the source. After changing mirror to catch this condition while starting the job, this test case won't get a job that is paused for an I/O error any more. Use blkdebug instead to inject an error. Signed-off-by: Kevin Wolf Message-Id:<20200507145228.323412-2-kw...@redhat.com> Reviewed-by: Eric Blake Signed-off-by: Kevin Wolf Reviewed-by: Vladimir Sementsov-Ogievskiy -- Best regards, Vladimir
Re: [RFC PATCH 2/3] block: Allow bdrv_run_co() from different AioContext
On 5/12/20 4:43 PM, Kevin Wolf wrote: > Coroutine functions that are entered through bdrv_run_co() are already > safe to call from synchronous code in a different AioContext because > bdrv_coroutine_enter() will schedule them in the context of the node. > > However, the coroutine fastpath still requires that we're already in the > right AioContext when called in coroutine context. > > In order to make the behaviour more consistent and to make life a bit > easier for callers, let's check the AioContext and automatically move > the current coroutine around if we're not in the right context yet. > > Signed-off-by: Kevin Wolf > --- > block/io.c | 15 ++- > 1 file changed, 14 insertions(+), 1 deletion(-) > > diff --git a/block/io.c b/block/io.c > index c1badaadc9..7808e8bdc0 100644 > --- a/block/io.c > +++ b/block/io.c > @@ -895,8 +895,21 @@ static int bdrv_run_co(BlockDriverState *bs, > CoroutineEntry *entry, > void *opaque, int *ret) > { > if (qemu_in_coroutine()) { > -/* Fast-path if already in coroutine context */ > +Coroutine *self = qemu_coroutine_self(); > +AioContext *bs_ctx = bdrv_get_aio_context(bs); > +AioContext *co_ctx = qemu_coroutine_get_aio_context(self); > + > +if (bs_ctx != co_ctx) { > +/* Move to the iothread of the node */ > +aio_co_schedule(bs_ctx, self); > +qemu_coroutine_yield(); > +} > entry(opaque); > +if (bs_ctx != co_ctx) { > +/* Move back to the original AioContext */ > +aio_co_schedule(bs_ctx, self); shouldn't it use co_ctx here, as else it's just scheduled again on the one from bs? Looks OK for me besides that. > +qemu_coroutine_yield(); > +} > } else { > Coroutine *co = qemu_coroutine_create(entry, opaque); > *ret = NOT_DONE; >
[PATCH v4 12/15] copy-on-read: add filter append/drop functions
Provide API for the COR-filter insertion/removal. Also, drop the filter child permissions for an inactive state when the filter node is being removed. Signed-off-by: Andrey Shinkevich --- block/copy-on-read.c | 102 +++ 1 file changed, 102 insertions(+) diff --git a/block/copy-on-read.c b/block/copy-on-read.c index 4030d56..229a97c 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -24,11 +24,20 @@ #include "qemu/cutils.h" #include "block/block_int.h" #include "qemu/module.h" +#include "qapi/error.h" +#include "qapi/qmp/qdict.h" +#include "block/copy-on-read.h" +typedef struct BDRVStateCOR { +bool active; +} BDRVStateCOR; + static int cor_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { +BDRVStateCOR *state = bs->opaque; + bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file, false, errp); if (!bs->file) { @@ -42,6 +51,8 @@ static int cor_open(BlockDriverState *bs, QDict *options, int flags, ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) & bs->file->bs->supported_zero_flags); +state->active = true; + return 0; } @@ -57,6 +68,17 @@ static void cor_child_perm(BlockDriverState *bs, BdrvChild *c, uint64_t perm, uint64_t shared, uint64_t *nperm, uint64_t *nshared) { +BDRVStateCOR *s = bs->opaque; + +if (!s->active) { +/* + * While the filter is being removed + */ +*nperm = 0; +*nshared = BLK_PERM_ALL; +return; +} + *nperm = perm & PERM_PASSTHROUGH; *nshared = (shared & PERM_PASSTHROUGH) | PERM_UNCHANGED; @@ -150,6 +172,7 @@ static int cor_change_backing_file(BlockDriverState *bs, static BlockDriver bdrv_copy_on_read = { .format_name= "copy-on-read", +.instance_size = sizeof(BDRVStateCOR), .bdrv_open = cor_open, .bdrv_child_perm= cor_child_perm, @@ -178,4 +201,83 @@ static void bdrv_copy_on_read_init(void) bdrv_register(&bdrv_copy_on_read); } + +static BlockDriverState *create_filter_node(BlockDriverState *bs, +const char *filter_node_name, +Error **errp) +{ +QDict *opts = qdict_new(); + +qdict_put_str(opts, 
"driver", "copy-on-read"); +qdict_put_str(opts, "file", bdrv_get_node_name(bs)); +if (filter_node_name) { +qdict_put_str(opts, "node-name", filter_node_name); +} + +return bdrv_open(NULL, NULL, opts, BDRV_O_RDWR, errp); +} + + +BlockDriverState *bdrv_cor_filter_append(BlockDriverState *bs, + const char *filter_node_name, + Error **errp) +{ +BlockDriverState *cor_filter_bs; +BDRVStateCOR *state; +Error *local_err = NULL; + +cor_filter_bs = create_filter_node(bs, filter_node_name, errp); +if (cor_filter_bs == NULL) { +error_prepend(errp, "Could not create filter node: "); +return NULL; +} + +if (!filter_node_name) { +cor_filter_bs->implicit = true; +} + +bdrv_drained_begin(bs); +bdrv_replace_node(bs, cor_filter_bs, &local_err); +bdrv_drained_end(bs); + +if (local_err) { +bdrv_unref(cor_filter_bs); +error_propagate(errp, local_err); +return NULL; +} + +state = cor_filter_bs->opaque; +state->active = true; + +return cor_filter_bs; +} + + +void bdrv_cor_filter_drop(BlockDriverState *cor_filter_bs) +{ +BdrvChild *child; +BlockDriverState *bs; +BDRVStateCOR *s = cor_filter_bs->opaque; + +child = bdrv_filtered_rw_child(cor_filter_bs); +if (!child) { +return; +} +bs = child->bs; + +/* Retain the BDS until we complete the graph change. */ +bdrv_ref(bs); +/* Hold a guest back from writing while permissions are being reset. */ +bdrv_drained_begin(bs); +/* Drop permissions before the graph change. */ +s->active = false; +bdrv_child_refresh_perms(cor_filter_bs, child, &error_abort); +bdrv_replace_node(cor_filter_bs, bs, &error_abort); + +bdrv_drained_end(bs); +bdrv_unref(bs); +bdrv_unref(cor_filter_bs); +} + + block_init(bdrv_copy_on_read_init); -- 1.8.3.1
[PATCH v4 02/15] copy-on-read: Support compressed writes
From: Max Reitz Signed-off-by: Max Reitz Reviewed-by: Vladimir Sementsov-Ogievskiy --- block/copy-on-read.c | 11 +++ 1 file changed, 11 insertions(+) diff --git a/block/copy-on-read.c b/block/copy-on-read.c index 242d3ff..c4fa468 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -106,6 +106,16 @@ static int coroutine_fn cor_co_pdiscard(BlockDriverState *bs, } +static int coroutine_fn cor_co_pwritev_compressed(BlockDriverState *bs, + uint64_t offset, + uint64_t bytes, + QEMUIOVector *qiov) +{ +return bdrv_co_pwritev(bs->file, offset, bytes, qiov, + BDRV_REQ_WRITE_COMPRESSED); +} + + static void cor_eject(BlockDriverState *bs, bool eject_flag) { bdrv_eject(bs->file->bs, eject_flag); @@ -130,6 +140,7 @@ static BlockDriver bdrv_copy_on_read = { .bdrv_co_pwritev= cor_co_pwritev, .bdrv_co_pwrite_zeroes = cor_co_pwrite_zeroes, .bdrv_co_pdiscard = cor_co_pdiscard, +.bdrv_co_pwritev_compressed = cor_co_pwritev_compressed, .bdrv_eject = cor_eject, .bdrv_lock_medium = cor_lock_medium, -- 1.8.3.1
[PATCH v4 11/15] copy-on-read: Support preadv/pwritev_part functions
Add support for the recently introduced functions bdrv_co_preadv_part() and bdrv_co_pwritev_part() to the COR-filter driver. Signed-off-by: Andrey Shinkevich --- block/copy-on-read.c | 28 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/block/copy-on-read.c b/block/copy-on-read.c index 74e01ee..4030d56 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -74,21 +74,25 @@ static int64_t cor_getlength(BlockDriverState *bs) } -static int coroutine_fn cor_co_preadv(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags) +static int coroutine_fn cor_co_preadv_part(BlockDriverState *bs, + uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, + size_t qiov_offset, + int flags) { -return bdrv_co_preadv(bs->file, offset, bytes, qiov, - flags | BDRV_REQ_COPY_ON_READ); +return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset, + flags | BDRV_REQ_COPY_ON_READ); } -static int coroutine_fn cor_co_pwritev(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags) +static int coroutine_fn cor_co_pwritev_part(BlockDriverState *bs, +uint64_t offset, +uint64_t bytes, +QEMUIOVector *qiov, +size_t qiov_offset, int flags) { - -return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags); +return bdrv_co_pwritev_part(bs->file, offset, bytes, qiov, qiov_offset, +flags); } @@ -152,8 +156,8 @@ static BlockDriver bdrv_copy_on_read = { .bdrv_getlength = cor_getlength, -.bdrv_co_preadv = cor_co_preadv, -.bdrv_co_pwritev= cor_co_pwritev, +.bdrv_co_preadv_part= cor_co_preadv_part, +.bdrv_co_pwritev_part = cor_co_pwritev_part, .bdrv_co_pwrite_zeroes = cor_co_pwrite_zeroes, .bdrv_co_pdiscard = cor_co_pdiscard, .bdrv_co_pwritev_compressed = cor_co_pwritev_compressed, -- 1.8.3.1
[PATCH v4 13/15] qapi: add filter-node-name to block-stream
Provide the possibility to pass the 'filter-node-name' parameter to the block-stream job as it is done for the commit block job. That will be needed for further iotests implementations. Signed-off-by: Andrey Shinkevich --- block/monitor/block-hmp-cmds.c | 4 ++-- block/stream.c | 4 +++- blockdev.c | 8 +++- include/block/block_int.h | 7 ++- qapi/block-core.json | 6 ++ 5 files changed, 24 insertions(+), 5 deletions(-) diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c index 4c8c375..6f6636b 100644 --- a/block/monitor/block-hmp-cmds.c +++ b/block/monitor/block-hmp-cmds.c @@ -507,8 +507,8 @@ void hmp_block_stream(Monitor *mon, const QDict *qdict) qmp_block_stream(true, device, device, base != NULL, base, false, NULL, false, NULL, qdict_haskey(qdict, "speed"), speed, true, - BLOCKDEV_ON_ERROR_REPORT, false, false, false, false, - &error); + BLOCKDEV_ON_ERROR_REPORT, false, NULL, false, false, false, + false, &error); hmp_handle_error(mon, error); } diff --git a/block/stream.c b/block/stream.c index 3cca1f9..e0b35f8 100644 --- a/block/stream.c +++ b/block/stream.c @@ -217,7 +217,9 @@ static const BlockJobDriver stream_job_driver = { void stream_start(const char *job_id, BlockDriverState *bs, BlockDriverState *base, const char *backing_file_str, int creation_flags, int64_t speed, - BlockdevOnError on_error, Error **errp) + BlockdevOnError on_error, + const char *filter_node_name, + Error **errp) { StreamBlockJob *s; BlockDriverState *iter; diff --git a/blockdev.c b/blockdev.c index 97c2ba2..67918fc 100644 --- a/blockdev.c +++ b/blockdev.c @@ -2753,6 +2753,7 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, bool has_backing_file, const char *backing_file, bool has_speed, int64_t speed, bool has_on_error, BlockdevOnError on_error, + bool has_filter_node_name, const char *filter_node_name, bool has_auto_finalize, bool auto_finalize, bool has_auto_dismiss, bool auto_dismiss, Error **errp) @@ -2769,6 +2770,10 @@ void 
qmp_block_stream(bool has_job_id, const char *job_id, const char *device, on_error = BLOCKDEV_ON_ERROR_REPORT; } +if (!has_filter_node_name) { +filter_node_name = NULL; +} + bs = bdrv_lookup_bs(device, device, errp); if (!bs) { return; } @@ -2840,7 +2845,8 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, } stream_start(has_job_id ? job_id : NULL, bs, base_bs, base_name, - job_flags, has_speed ? speed : 0, on_error, &local_err); + job_flags, has_speed ? speed : 0, on_error, + filter_node_name, &local_err); if (local_err) { error_propagate(errp, local_err); goto out; } diff --git a/include/block/block_int.h b/include/block/block_int.h index 86f7666..0fdfa9c 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -1094,6 +1094,9 @@ int is_windows_drive(const char *filename); * See @BlockJobCreateFlags * @speed: The maximum speed, in bytes per second, or 0 for unlimited. * @on_error: The action to take upon error. + * @filter-node-name: The node name that should be assigned to the filter + * driver that the stream job inserts into the graph above @bs. NULL means + * that a node name should be autogenerated. + * @errp: Error object. * * Start a streaming operation on @bs. Clusters that are unallocated @@ -1106,7 +1109,9 @@ int is_windows_drive(const char *filename); void stream_start(const char *job_id, BlockDriverState *bs, BlockDriverState *base, const char *backing_file_str, int creation_flags, int64_t speed, - BlockdevOnError on_error, Error **errp); + BlockdevOnError on_error, + const char *filter_node_name, + Error **errp); /** * commit_start: diff --git a/qapi/block-core.json b/qapi/block-core.json index 943df19..df6981e 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -2509,6 +2509,11 @@ #'stop' and 'enospc' can only be used if the block device #supports io-status (see BlockInfo). Since 1.3. 
# +# @filter-node-name: the node name that should be assigned to the +#filter driver that the stream job inserts into the graph +#above @device. If this option is not given, a node name is +#autogenerated. (Since: 5.1) +# # @auto-finalize: When false, this job will wait in a PENDIN
[PATCH v4 10/15] copy-on-read: Support change filename functions
The COR-filter driver should support a redirecting function to refresh filenames. Otherwise, a file name of the filter will be copied instead of the one of a data node. It is also true for the function bdrv_change_backing_file(). Signed-off-by: Andrey Shinkevich --- block/copy-on-read.c | 18 ++ 1 file changed, 18 insertions(+) diff --git a/block/copy-on-read.c b/block/copy-on-read.c index c4fa468..74e01ee 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -21,6 +21,7 @@ */ #include "qemu/osdep.h" +#include "qemu/cutils.h" #include "block/block_int.h" #include "qemu/module.h" @@ -128,6 +129,21 @@ static void cor_lock_medium(BlockDriverState *bs, bool locked) } +static void cor_refresh_filename(BlockDriverState *bs) +{ +pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), +bs->file->bs->filename); +} + + +static int cor_change_backing_file(BlockDriverState *bs, + const char *backing_file, + const char *backing_fmt) +{ +return bdrv_change_backing_file(bs->file->bs, backing_file, backing_fmt); +} + + static BlockDriver bdrv_copy_on_read = { .format_name= "copy-on-read", @@ -146,6 +162,8 @@ static BlockDriver bdrv_copy_on_read = { .bdrv_lock_medium = cor_lock_medium, .bdrv_co_block_status = bdrv_co_block_status_from_file, +.bdrv_refresh_filename = cor_refresh_filename, +.bdrv_change_backing_file = cor_change_backing_file, .has_variable_length= true, .is_filter = true, -- 1.8.3.1
[PATCH v4 00/15] Apply COR-filter to the block-stream permanently
With this series, all the block-stream COR operations pass through the COR-filter. The patches 01-08/15 are taken from the series "block: Deal with filters" by Max Reitz, the full version of which can be found in the branches: https://git.xanclic.moe/XanClic/qemu child-access-functions-v6 https://github.com/XanClic/qemu child-access-functions-v6 When running iotests, apply "char-socket: Fix race condition" to avoid sporadic segmentation faults. v4: 01: Initialization of the COR-filter BDRVStateCOR member. v3: 01: The COR filter insert/remove functions moved to block/copy-on-read.c to be a part of API. 02: block/stream.c code refactoring. 03: The separate call to block_job_add_bdrv() is used to block operations on the active node after the filter inserted and the job created. 04: The iotests case 030::test_overlapping_4 was modified to unbind the block-stream job from the base node. 05: The COR driver functions preadv/pwritev replaced with their analogous preadv/pwritev_part. v2: 01: No more skipping filters while checking for operation blockers. However, we exclude filters between the bottom node and base because we do not set the operation blockers for filters anymore. 02: As stated above, we do not set the operation blockers for filters anymore. So, skip filters when we block operations for the target node. 03: The comment added for the patch 4/7. 04: The QAPI target version changed to 5.1. 05: The 'filter-node-name' now excluded from using in the test #030. If we need it no more in a final version of the series, the patch 5/7 may be removed. 06: The COR-filter included into the frozen chain of a block-stream job. The 'above_base' node pointer is left because it is essential for finding the base node in case of filters above. 
Andrey Shinkevich (7): block: prepare block-stream for using COR-filter copy-on-read: Support change filename functions copy-on-read: Support preadv/pwritev_part functions copy-on-read: add filter append/drop functions qapi: add filter-node-name to block-stream iotests: prepare 245 for using filter in block-stream block: apply COR-filter to block-stream jobs Max Reitz (8): block: Mark commit and mirror as filter drivers copy-on-read: Support compressed writes block: Add child access functions block: Add chain helper functions block: Include filters when freezing backing chain block: Use CAFs in block status functions commit: Deal with filters when blocking intermediate nodes block: Use CAFs when working with backing chains block.c| 275 ++--- block/commit.c | 85 ++--- block/copy-on-read.c | 159 ++-- block/io.c | 19 +-- block/mirror.c | 6 +- block/monitor/block-hmp-cmds.c | 4 +- block/stream.c | 89 + blockdev.c | 19 ++- include/block/block.h | 6 +- include/block/block_int.h | 67 +- qapi/block-core.json | 6 + tests/qemu-iotests/030 | 17 +-- tests/qemu-iotests/141.out | 2 +- tests/qemu-iotests/245 | 10 +- 14 files changed, 625 insertions(+), 139 deletions(-) -- 1.8.3.1
[PATCH v4 14/15] iotests: prepare 245 for using filter in block-stream
The preliminary patch modifies the test 245 to prepare the block-stream job for using the COR-filter. The filter breaks the backing chain, as it is connected to the underlying node via a 'file' child link rather than a 'backing' one. Signed-off-by: Andrey Shinkevich --- tests/qemu-iotests/245 | 10 +++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/qemu-iotests/245 b/tests/qemu-iotests/245 index 4f5f0bb..5bcddc4 100755 --- a/tests/qemu-iotests/245 +++ b/tests/qemu-iotests/245 @@ -896,15 +896,19 @@ class TestBlockdevReopen(iotests.QMPTestCase): # make hd1 read-only and block-stream requires it to be read-write # (Which error message appears depends on whether the stream job is # already done with copying at this point.) -self.reopen(opts, {}, +opts = hd_opts(1) +opts['backing'] = hd_opts(2) +opts['backing']['backing'] = None +self.reopen(opts, {'read-only': True}, ["Can't set node 'hd1' to r/o with copy-on-read enabled", "Cannot make block node read-only, there is a writer on it"]) # We can't remove hd2 while the stream job is ongoing -opts['backing']['backing'] = None -self.reopen(opts, {'backing.read-only': False}, "Cannot change 'backing' link from 'hd1' to 'hd2'") +opts['backing'] = None +self.reopen(opts, {'read-only': False}, "Cannot change 'backing' link from 'hd1' to 'hd2'") # We can detach hd1 from hd0 because it doesn't affect the stream job +opts = hd_opts(0) opts['backing'] = None self.reopen(opts) -- 1.8.3.1
[PATCH v4 06/15] block: Use CAFs in block status functions
From: Max Reitz Use the child access functions in the block status inquiry functions as appropriate. Signed-off-by: Max Reitz Reviewed-by: Vladimir Sementsov-Ogievskiy --- block/io.c | 19 ++- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/block/io.c b/block/io.c index 7d30e61..47d8096 100644 --- a/block/io.c +++ b/block/io.c @@ -2387,11 +2387,12 @@ static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs, if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { ret |= BDRV_BLOCK_ALLOCATED; } else if (want_zero) { +BlockDriverState *cow_bs = bdrv_filtered_cow_bs(bs); + if (bdrv_unallocated_blocks_are_zero(bs)) { ret |= BDRV_BLOCK_ZERO; -} else if (bs->backing) { -BlockDriverState *bs2 = bs->backing->bs; -int64_t size2 = bdrv_getlength(bs2); +} else if (cow_bs) { +int64_t size2 = bdrv_getlength(cow_bs); if (size2 >= 0 && offset >= size2) { ret |= BDRV_BLOCK_ZERO; @@ -2457,7 +2458,7 @@ static int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs, bool first = true; assert(bs != base); -for (p = bs; p != base; p = backing_bs(p)) { +for (p = bs; p != base; p = bdrv_filtered_bs(p)) { ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map, file); if (ret < 0) { @@ -2543,7 +2544,7 @@ int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base, int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes, int64_t *pnum, int64_t *map, BlockDriverState **file) { -return bdrv_block_status_above(bs, backing_bs(bs), +return bdrv_block_status_above(bs, bdrv_filtered_bs(bs), offset, bytes, pnum, map, file); } @@ -2553,9 +2554,9 @@ int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset, int ret; int64_t dummy; -ret = bdrv_common_block_status_above(bs, backing_bs(bs), false, offset, - bytes, pnum ? pnum : &dummy, NULL, - NULL); +ret = bdrv_common_block_status_above(bs, bdrv_filtered_bs(bs), false, + offset, bytes, pnum ? 
pnum : &dummy, + NULL, NULL); if (ret < 0) { return ret; } @@ -2618,7 +2619,7 @@ int bdrv_is_allocated_above(BlockDriverState *top, break; } -intermediate = backing_bs(intermediate); +intermediate = bdrv_filtered_bs(intermediate); } *pnum = n; -- 1.8.3.1
[PATCH v4 15/15] block: apply COR-filter to block-stream jobs
The patch completes the series with the COR-filter insertion to any block-stream operation. It also makes changes to the iotests 030 and 141.out. Signed-off-by: Andrey Shinkevich --- block/stream.c | 83 -- tests/qemu-iotests/030 | 8 +++-- tests/qemu-iotests/141.out | 2 +- 3 files changed, 64 insertions(+), 29 deletions(-) diff --git a/block/stream.c b/block/stream.c index e0b35f8..a74a07b 100644 --- a/block/stream.c +++ b/block/stream.c @@ -19,6 +19,7 @@ #include "qapi/qmp/qerror.h" #include "qemu/ratelimit.h" #include "sysemu/block-backend.h" +#include "block/copy-on-read.h" enum { /* @@ -32,8 +33,11 @@ enum { typedef struct StreamBlockJob { BlockJob common; BlockDriverState *bottom; +BlockDriverState *cor_filter_bs; +BlockDriverState *target_bs; BlockdevOnError on_error; char *backing_file_str; +char *base_fmt; bool bs_read_only; bool chain_frozen; } StreamBlockJob; @@ -52,33 +56,25 @@ static void stream_abort(Job *job) StreamBlockJob *s = container_of(job, StreamBlockJob, common.job); if (s->chain_frozen) { -BlockJob *bjob = &s->common; -bdrv_unfreeze_chain(blk_bs(bjob->blk), s->bottom); +bdrv_unfreeze_chain(s->cor_filter_bs, s->bottom); } } static int stream_prepare(Job *job) { StreamBlockJob *s = container_of(job, StreamBlockJob, common.job); -BlockJob *bjob = &s->common; -BlockDriverState *bs = blk_bs(bjob->blk); +BlockDriverState *bs = s->target_bs; BlockDriverState *base = backing_bs(s->bottom); Error *local_err = NULL; int ret = 0; -bdrv_unfreeze_chain(bs, s->bottom); +bdrv_unfreeze_chain(s->cor_filter_bs, s->bottom); s->chain_frozen = false; if (bs->backing) { -const char *base_id = NULL, *base_fmt = NULL; -if (base) { -base_id = s->backing_file_str; -if (base->drv) { -base_fmt = base->drv->format_name; -} -} bdrv_set_backing_hd(bs, base, &local_err); -ret = bdrv_change_backing_file(bs, base_id, base_fmt); +ret = bdrv_change_backing_file(bs, s->backing_file_str, + s->base_fmt); if (local_err) { error_report_err(local_err); return -EPERM; @@ -92,7 +88,9 @@ 
static void stream_clean(Job *job) { StreamBlockJob *s = container_of(job, StreamBlockJob, common.job); BlockJob *bjob = &s->common; -BlockDriverState *bs = blk_bs(bjob->blk); +BlockDriverState *bs = s->target_bs; + +bdrv_cor_filter_drop(s->cor_filter_bs); /* Reopen the image back in read-only mode if necessary */ if (s->bs_read_only) { @@ -102,13 +100,14 @@ static void stream_clean(Job *job) } g_free(s->backing_file_str); +g_free(s->base_fmt); } static int coroutine_fn stream_run(Job *job, Error **errp) { StreamBlockJob *s = container_of(job, StreamBlockJob, common.job); BlockBackend *blk = s->common.blk; -BlockDriverState *bs = blk_bs(blk); +BlockDriverState *bs = s->target_bs; bool enable_cor = !backing_bs(s->bottom); int64_t len; int64_t offset = 0; @@ -156,8 +155,8 @@ static int coroutine_fn stream_run(Job *job, Error **errp) } else if (ret >= 0) { /* Copy if allocated in the intermediate images. Limit to the * known-unallocated area [offset, offset+n*BDRV_SECTOR_SIZE). */ -ret = bdrv_is_allocated_above(backing_bs(bs), s->bottom, true, - offset, n, &n); +ret = bdrv_is_allocated_above(bdrv_filtered_cow_bs(bs), s->bottom, + true, offset, n, &n); /* Finish early if end of backing file has been reached */ if (ret == 0 && n == 0) { n = len - offset; @@ -225,7 +224,13 @@ void stream_start(const char *job_id, BlockDriverState *bs, BlockDriverState *iter; bool bs_read_only; int basic_flags = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED; +BlockDriverState *cor_filter_bs = NULL; BlockDriverState *bottom = bdrv_find_overlay(bs, base); +char *base_fmt = NULL; + +if (base && base->drv) { +base_fmt = g_strdup(base->drv->format_name); +} if (bdrv_freeze_chain(bs, bottom, errp) < 0) { return; @@ -240,17 +245,35 @@ void stream_start(const char *job_id, BlockDriverState *bs, } } -/* Prevent concurrent jobs trying to modify the graph structure here, we - * already have our own plans. 
Also don't allow resize as the image size is - * queried only at the job start and then cached. */ -s = block_job_create(job_id, &stream_job_driver, NULL, bs, - basic_flags | BLK_PERM_GRAPH_MOD, - basic_flags | BLK_PERM_WRITE, +cor_filter_bs = bdrv_cor_filter_append(bs,
[PATCH v4 09/15] block: prepare block-stream for using COR-filter
This patch is the first one in the series where the COR-filter node will be hard-coded for using in the block-stream job. The block jobs may be run in parallel. Exclude conflicts with filter nodes used for a concurrent job while checking for the blocked operations. It incurs changes in the iotests 030::test_overlapping_4 that now passes with no conflict because the stream job does not have a real dependency on its base and on a filter above it. Signed-off-by: Andrey Shinkevich --- blockdev.c | 11 +-- tests/qemu-iotests/030 | 9 - 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/blockdev.c b/blockdev.c index b3c840e..97c2ba2 100644 --- a/blockdev.c +++ b/blockdev.c @@ -2763,6 +2763,7 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, Error *local_err = NULL; const char *base_name = NULL; int job_flags = JOB_DEFAULT; +BlockDriverState *bottom_cow_node; if (!has_on_error) { on_error = BLOCKDEV_ON_ERROR_REPORT; @@ -2807,8 +2808,14 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device, base_name = base_bs->filename; } -/* Check for op blockers in the whole chain between bs and base */ -for (iter = bs; iter && iter != base_bs; iter = backing_bs(iter)) { +bottom_cow_node = bdrv_find_overlay(bs, base_bs); +if (!bottom_cow_node) { +error_setg(errp, "bottom node is not found, nothing to stream"); +goto out; +} +/* Check for op blockers in the whole chain between bs and bottom */ +for (iter = bs; iter && iter != bdrv_filtered_bs(bottom_cow_node); + iter = bdrv_filtered_bs(iter)) { if (bdrv_op_is_blocked(iter, BLOCK_OP_TYPE_STREAM, errp)) { goto out; } diff --git a/tests/qemu-iotests/030 b/tests/qemu-iotests/030 index 104e3ce..d7638cd 100755 --- a/tests/qemu-iotests/030 +++ b/tests/qemu-iotests/030 @@ -368,8 +368,7 @@ class TestParallelOps(iotests.QMPTestCase): self.wait_until_completed(drive='commit-drive0') # In this case the base node of the stream job is the same as the -# top node of commit job. 
Since this results in the commit filter -# node being part of the stream chain, this is not allowed. +# top node of commit job. def test_overlapping_4(self): self.assert_no_active_block_jobs() @@ -381,13 +380,13 @@ class TestParallelOps(iotests.QMPTestCase): # Stream from node2 into node4 result = self.vm.qmp('block-stream', device='node4', base_node='node2', job_id='node4') -self.assert_qmp(result, 'error/desc', -"Cannot freeze 'backing' link to 'commit-filter'") +self.assert_qmp(result, 'return', {}) result = self.vm.qmp('block-job-set-speed', device='drive0', speed=0) self.assert_qmp(result, 'return', {}) -self.wait_until_completed() +self.vm.run_job(job='drive0', auto_dismiss=True) +self.vm.run_job(job='node4', auto_dismiss=True) self.assert_no_active_block_jobs() # In this case the base node of the stream job is the commit job's -- 1.8.3.1
[PATCH v4 08/15] block: Use CAFs when working with backing chains
From: Max Reitz Use child access functions when iterating through backing chains so filters do not break the chain. Signed-off-by: Max Reitz --- block.c | 40 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/block.c b/block.c index 459412e..69e9152 100644 --- a/block.c +++ b/block.c @@ -4593,7 +4593,8 @@ int bdrv_change_backing_file(BlockDriverState *bs, } /* - * Finds the image layer in the chain that has 'bs' as its backing file. + * Finds the image layer in the chain that has 'bs' (or a filter on + * top of it) as its backing file. * * active is the current topmost image. * @@ -4605,11 +4606,18 @@ int bdrv_change_backing_file(BlockDriverState *bs, BlockDriverState *bdrv_find_overlay(BlockDriverState *active, BlockDriverState *bs) { -while (active && bs != backing_bs(active)) { -active = backing_bs(active); +bs = bdrv_skip_rw_filters(bs); +active = bdrv_skip_rw_filters(active); + +while (active) { +BlockDriverState *next = bdrv_backing_chain_next(active); +if (bs == next) { +return active; +} +active = next; } -return active; +return NULL; } /* Given a BDS, searches for the base layer. */ @@ -4761,9 +4769,7 @@ int bdrv_drop_intermediate(BlockDriverState *top, BlockDriverState *base, * other intermediate nodes have been dropped. * If 'top' is an implicit node (e.g. "commit_top") we should skip * it because no one inherits from it. We use explicit_top for that. 
*/ -while (explicit_top && explicit_top->implicit) { -explicit_top = backing_bs(explicit_top); -} +explicit_top = bdrv_skip_implicit_filters(explicit_top); update_inherits_from = bdrv_inherits_from_recursive(base, explicit_top); /* success - we can delete the intermediate states, and link top->base */ @@ -5205,7 +5211,7 @@ BlockDriverState *bdrv_lookup_bs(const char *device, bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base) { while (top && top != base) { -top = backing_bs(top); +top = bdrv_filtered_bs(top); } return top != NULL; @@ -5466,7 +5472,17 @@ BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs, is_protocol = path_has_protocol(backing_file); -for (curr_bs = bs; curr_bs->backing; curr_bs = curr_bs->backing->bs) { +/* + * Being largely a legacy function, skip any filters here + * (because filters do not have normal filenames, so they cannot + * match anyway; and allowing json:{} filenames is a bit out of + * scope). + */ +for (curr_bs = bdrv_skip_rw_filters(bs); + bdrv_filtered_cow_child(curr_bs) != NULL; + curr_bs = bdrv_backing_chain_next(curr_bs)) +{ +BlockDriverState *bs_below = bdrv_backing_chain_next(curr_bs); /* If either of the filename paths is actually a protocol, then * compare unmodified paths; otherwise make paths relative */ @@ -5474,7 +5490,7 @@ BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs, char *backing_file_full_ret; if (strcmp(backing_file, curr_bs->backing_file) == 0) { -retval = curr_bs->backing->bs; +retval = bs_below; break; } /* Also check against the full backing filename for the image */ @@ -5484,7 +5500,7 @@ BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs, bool equal = strcmp(backing_file, backing_file_full_ret) == 0; g_free(backing_file_full_ret); if (equal) { -retval = curr_bs->backing->bs; +retval = bs_below; break; } } @@ -5510,7 +5526,7 @@ BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs, g_free(filename_tmp); if (strcmp(backing_file_full, 
filename_full) == 0) { -retval = curr_bs->backing->bs; +retval = bs_below; break; } } -- 1.8.3.1
[PATCH v4 05/15] block: Include filters when freezing backing chain
From: Max Reitz In order to make filters work in backing chains, the associated functions must be able to deal with them and freeze all filter links, be they COW or R/W filter links. In the process, rename these functions to reflect that they now act on generalized chains of filter nodes instead of backing chains alone. While at it, add some comments that note which functions require their caller to ensure that a given child link is not frozen, and how the callers do so. Signed-off-by: Max Reitz Signed-off-by: Andrey Shinkevich --- block.c | 81 ++- block/commit.c| 8 ++--- block/mirror.c| 4 +-- block/stream.c| 8 ++--- include/block/block.h | 6 ++-- 5 files changed, 60 insertions(+), 47 deletions(-) diff --git a/block.c b/block.c index 5b4ebfe..459412e 100644 --- a/block.c +++ b/block.c @@ -2513,12 +2513,15 @@ static void bdrv_replace_child_noperm(BdrvChild *child, * If @new_bs is not NULL, bdrv_check_perm() must be called beforehand, as this * function uses bdrv_set_perm() to update the permissions according to the new * reference that @new_bs gets. + * + * Callers must ensure that child->frozen is false. */ static void bdrv_replace_child(BdrvChild *child, BlockDriverState *new_bs) { BlockDriverState *old_bs = child->bs; uint64_t perm, shared_perm; +/* Asserts that child->frozen == false */ bdrv_replace_child_noperm(child, new_bs); /* @@ -2676,6 +2679,7 @@ static void bdrv_detach_child(BdrvChild *child) g_free(child); } +/* Callers must ensure that child->frozen is false. */ void bdrv_root_unref_child(BdrvChild *child) { BlockDriverState *child_bs; @@ -2685,10 +2689,6 @@ void bdrv_root_unref_child(BdrvChild *child) bdrv_unref(child_bs); } -/** - * Clear all inherits_from pointers from children and grandchildren of - * @root that point to @root, where necessary. 
- */ static void bdrv_unset_inherits_from(BlockDriverState *root, BdrvChild *child) { BdrvChild *c; @@ -2713,6 +2713,7 @@ static void bdrv_unset_inherits_from(BlockDriverState *root, BdrvChild *child) } } +/* Callers must ensure that child->frozen is false. */ void bdrv_unref_child(BlockDriverState *parent, BdrvChild *child) { if (child == NULL) { @@ -2756,7 +2757,7 @@ void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd, bool update_inherits_from = bdrv_chain_contains(bs, backing_hd) && bdrv_inherits_from_recursive(backing_hd, bs); -if (bdrv_is_backing_chain_frozen(bs, backing_bs(bs), errp)) { +if (bdrv_is_chain_frozen(bs, child_bs(bs->backing), errp)) { return; } @@ -2765,6 +2766,7 @@ void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd, } if (bs->backing) { +/* Cannot be frozen, we checked that above */ bdrv_unref_child(bs, bs->backing); bs->backing = NULL; } @@ -3912,8 +3914,7 @@ static int bdrv_reopen_parse_backing(BDRVReopenState *reopen_state, return -EPERM; } /* Check if the backing link that we want to replace is frozen */ -if (bdrv_is_backing_chain_frozen(overlay_bs, backing_bs(overlay_bs), - errp)) { +if (bdrv_is_chain_frozen(overlay_bs, backing_bs(overlay_bs), errp)) { return -EPERM; } reopen_state->replace_backing_bs = true; @@ -4248,6 +4249,7 @@ static void bdrv_close(BlockDriverState *bs) if (bs->drv) { if (bs->drv->bdrv_close) { +/* Must unfreeze all children, so bdrv_unref_child() works */ bs->drv->bdrv_close(bs); } bs->drv = NULL; @@ -4617,20 +4619,22 @@ BlockDriverState *bdrv_find_base(BlockDriverState *bs) } /* - * Return true if at least one of the backing links between @bs and - * @base is frozen. @errp is set if that's the case. + * Return true if at least one of the (COW and R/W) filter links + * between @bs and @base is frozen. @errp is set if that's the case. * @base must be reachable from @bs, or NULL. 
*/ -bool bdrv_is_backing_chain_frozen(BlockDriverState *bs, BlockDriverState *base, - Error **errp) +bool bdrv_is_chain_frozen(BlockDriverState *bs, BlockDriverState *base, + Error **errp) { BlockDriverState *i; +BdrvChild *child; + +for (i = bs; i != base; i = child_bs(child)) { +child = bdrv_filtered_child(i); -for (i = bs; i != base; i = backing_bs(i)) { -if (i->backing && i->backing->frozen) { +if (child && child->frozen) { error_setg(errp, "Cannot change '%s' link from '%s' to '%s'", - i->backing->name, i->node_name, - backing_bs(i)->node_name); + child->name, i->node_name, child->bs->node_name); return true; } } @@ -4639
[PATCH v4 07/15] commit: Deal with filters when blocking intermediate nodes
From: Max Reitz This includes some permission limiting (for example, we only need to take the RESIZE permission if the base is smaller than the top). Signed-off-by: Max Reitz Signed-off-by: Andrey Shinkevich --- block/commit.c | 75 ++ 1 file changed, 60 insertions(+), 15 deletions(-) diff --git a/block/commit.c b/block/commit.c index e1f45a4..4df145d 100644 --- a/block/commit.c +++ b/block/commit.c @@ -37,6 +37,7 @@ typedef struct CommitBlockJob { BlockBackend *top; BlockBackend *base; BlockDriverState *base_bs; +BlockDriverState *above_base; BlockdevOnError on_error; bool base_read_only; bool chain_frozen; @@ -153,7 +154,7 @@ static int coroutine_fn commit_run(Job *job, Error **errp) break; } /* Copy if allocated above the base */ -ret = bdrv_is_allocated_above(blk_bs(s->top), blk_bs(s->base), false, +ret = bdrv_is_allocated_above(blk_bs(s->top), s->above_base, true, offset, COMMIT_BUFFER_SIZE, &n); copy = (ret == 1); trace_commit_one_iteration(s, offset, n, ret); @@ -253,15 +254,35 @@ void commit_start(const char *job_id, BlockDriverState *bs, CommitBlockJob *s; BlockDriverState *iter; BlockDriverState *commit_top_bs = NULL; +BlockDriverState *filtered_base; Error *local_err = NULL; +int64_t base_size, top_size; +uint64_t perms, iter_shared_perms; int ret; assert(top != bs); -if (top == base) { +if (bdrv_skip_rw_filters(top) == bdrv_skip_rw_filters(base)) { error_setg(errp, "Invalid files for merge: top and base are the same"); return; } +base_size = bdrv_getlength(base); +if (base_size < 0) { +error_setg_errno(errp, -base_size, "Could not inquire base image size"); +return; +} + +top_size = bdrv_getlength(top); +if (top_size < 0) { +error_setg_errno(errp, -top_size, "Could not inquire top image size"); +return; +} + +perms = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE; +if (base_size < top_size) { +perms |= BLK_PERM_RESIZE; +} + s = block_job_create(job_id, &commit_job_driver, NULL, bs, 0, BLK_PERM_ALL, speed, creation_flags, NULL, NULL, errp); if (!s) { @@ 
-301,17 +322,43 @@ void commit_start(const char *job_id, BlockDriverState *bs, s->commit_top_bs = commit_top_bs; -/* Block all nodes between top and base, because they will - * disappear from the chain after this operation. */ -assert(bdrv_chain_contains(top, base)); -for (iter = top; iter != base; iter = backing_bs(iter)) { -/* XXX BLK_PERM_WRITE needs to be allowed so we don't block ourselves - * at s->base (if writes are blocked for a node, they are also blocked - * for its backing file). The other options would be a second filter - * driver above s->base. */ +/* + * Block all nodes between top and base, because they will + * disappear from the chain after this operation. + * Note that this assumes that the user is fine with removing all + * nodes (including R/W filters) between top and base. Assuring + * this is the responsibility of the interface (i.e. whoever calls + * commit_start()). + */ +s->above_base = bdrv_find_overlay(top, base); +assert(s->above_base); + +/* + * The topmost node with + * bdrv_skip_rw_filters(filtered_base) == bdrv_skip_rw_filters(base) + */ +filtered_base = bdrv_filtered_cow_bs(s->above_base); +assert(bdrv_skip_rw_filters(filtered_base) == bdrv_skip_rw_filters(base)); + +/* + * XXX BLK_PERM_WRITE needs to be allowed so we don't block ourselves + * at s->base (if writes are blocked for a node, they are also blocked + * for its backing file). The other options would be a second filter + * driver above s->base. + */ +iter_shared_perms = BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE; + +for (iter = top; iter != base; iter = bdrv_filtered_bs(iter)) { +if (iter == filtered_base) { +/* + * From here on, all nodes are filters on the base. This + * allows us to share BLK_PERM_CONSISTENT_READ. 
+ */ +iter_shared_perms |= BLK_PERM_CONSISTENT_READ; +} + ret = block_job_add_bdrv(&s->common, "intermediate node", iter, 0, - BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE, - errp); + iter_shared_perms, errp); if (ret < 0) { goto fail; } @@ -328,9 +375,7 @@ void commit_start(const char *job_id, BlockDriverState *bs, } s->base = blk_new(s->common.job.aio_context, - BLK_PERM_CONSISTENT_READ - | BLK_PERM_WRITE - | BLK_PERM_RESIZE, +
[PATCH v4 01/15] block: Mark commit and mirror as filter drivers
From: Max Reitz The commit and mirror block nodes are filters, so they should be marked as such. (Strictly speaking, BDS.is_filter's documentation states that a filter's child must be bs->file. The following patch will relax this restriction, however.) Signed-off-by: Max Reitz Reviewed-by: Alberto Garcia Reviewed-by: Eric Blake Reviewed-by: Vladimir Sementsov-Ogievskiy Signed-off-by: Andrey Shinkevich --- block/commit.c | 2 ++ block/mirror.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/block/commit.c b/block/commit.c index 87f6096..445a280 100644 --- a/block/commit.c +++ b/block/commit.c @@ -240,6 +240,8 @@ static BlockDriver bdrv_commit_top = { .bdrv_co_block_status = bdrv_co_block_status_from_backing, .bdrv_refresh_filename = bdrv_commit_top_refresh_filename, .bdrv_child_perm= bdrv_commit_top_child_perm, + +.is_filter = true, }; void commit_start(const char *job_id, BlockDriverState *bs, diff --git a/block/mirror.c b/block/mirror.c index aca95c9..b6de24b 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -1527,6 +1527,8 @@ static BlockDriver bdrv_mirror_top = { .bdrv_co_block_status = bdrv_co_block_status_from_backing, .bdrv_refresh_filename = bdrv_mirror_top_refresh_filename, .bdrv_child_perm= bdrv_mirror_top_child_perm, + +.is_filter = true, }; static BlockJob *mirror_start_job( -- 1.8.3.1
[PATCH v4 04/15] block: Add chain helper functions
From: Max Reitz Add some helper functions for skipping filters in a chain of block nodes. Signed-off-by: Max Reitz Reviewed-by: Vladimir Sementsov-Ogievskiy --- block.c | 55 +++ include/block/block_int.h | 3 +++ 2 files changed, 58 insertions(+) diff --git a/block.c b/block.c index b2aae2e..5b4ebfe 100644 --- a/block.c +++ b/block.c @@ -6863,3 +6863,58 @@ BdrvChild *bdrv_primary_child(BlockDriverState *bs) { return bdrv_filtered_rw_child(bs) ?: bs->file; } + +static BlockDriverState *bdrv_skip_filters(BlockDriverState *bs, + bool stop_on_explicit_filter) +{ +BdrvChild *filtered; + +if (!bs) { +return NULL; +} + +while (!(stop_on_explicit_filter && !bs->implicit)) { +filtered = bdrv_filtered_rw_child(bs); +if (!filtered) { +break; +} +bs = filtered->bs; +} +/* + * Note that this treats nodes with bs->drv == NULL as not being + * R/W filters (bs->drv == NULL should be replaced by something + * else anyway). + * The advantage of this behavior is that this function will thus + * always return a non-NULL value (given a non-NULL @bs). + */ + +return bs; +} + +/* + * Return the first BDS that has not been added implicitly or that + * does not have an RW-filtered child down the chain starting from @bs + * (including @bs itself). + */ +BlockDriverState *bdrv_skip_implicit_filters(BlockDriverState *bs) +{ +return bdrv_skip_filters(bs, true); +} + +/* + * Return the first BDS that does not have an RW-filtered child down + * the chain starting from @bs (including @bs itself). + */ +BlockDriverState *bdrv_skip_rw_filters(BlockDriverState *bs) +{ +return bdrv_skip_filters(bs, false); +} + +/* + * For a backing chain, return the first non-filter backing image of + * the first non-filter image. 
+ */ +BlockDriverState *bdrv_backing_chain_next(BlockDriverState *bs) +{ +return bdrv_skip_rw_filters(bdrv_filtered_cow_bs(bdrv_skip_rw_filters(bs))); +} diff --git a/include/block/block_int.h b/include/block/block_int.h index dca59e9..86f7666 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -1347,6 +1347,9 @@ BdrvChild *bdrv_filtered_child(BlockDriverState *bs); BdrvChild *bdrv_metadata_child(BlockDriverState *bs); BdrvChild *bdrv_storage_child(BlockDriverState *bs); BdrvChild *bdrv_primary_child(BlockDriverState *bs); +BlockDriverState *bdrv_skip_implicit_filters(BlockDriverState *bs); +BlockDriverState *bdrv_skip_rw_filters(BlockDriverState *bs); +BlockDriverState *bdrv_backing_chain_next(BlockDriverState *bs); static inline BlockDriverState *child_bs(BdrvChild *child) { -- 1.8.3.1
[PATCH v4 03/15] block: Add child access functions
From: Max Reitz There are BDS children that the general block layer code can access, namely bs->file and bs->backing. Since the introduction of filters and external data files, their meaning is not quite clear. bs->backing can be a COW source, or it can be an R/W-filtered child; bs->file can be an R/W-filtered child, it can be data and metadata storage, or it can be just metadata storage. This overloading really is not helpful. This patch adds function that retrieve the correct child for each exact purpose. Later patches in this series will make use of them. Doing so will allow us to handle filter nodes and external data files in a meaningful way. Signed-off-by: Max Reitz Reviewed-by: Vladimir Sementsov-Ogievskiy Signed-off-by: Andrey Shinkevich --- block.c | 99 +++ include/block/block_int.h | 57 +-- 2 files changed, 153 insertions(+), 3 deletions(-) diff --git a/block.c b/block.c index 0653ccb..b2aae2e 100644 --- a/block.c +++ b/block.c @@ -6764,3 +6764,102 @@ void bdrv_del_child(BlockDriverState *parent_bs, BdrvChild *child, Error **errp) parent_bs->drv->bdrv_del_child(parent_bs, child, errp); } + +/* + * Return the child that @bs acts as an overlay for, and from which data may be + * copied in COW or COR operations. Usually this is the backing file. + */ +BdrvChild *bdrv_filtered_cow_child(BlockDriverState *bs) +{ +if (!bs || !bs->drv) { +return NULL; +} + +if (bs->drv->is_filter) { +return NULL; +} + +return bs->backing; +} + +/* + * If @bs acts as a pass-through filter for one of its children, + * return that child. "Pass-through" means that write operations to + * @bs are forwarded to that child instead of triggering COW. 
+ */ +BdrvChild *bdrv_filtered_rw_child(BlockDriverState *bs) +{ +if (!bs || !bs->drv) { +return NULL; +} + +if (!bs->drv->is_filter) { +return NULL; +} + +/* Only one of @backing or @file may be used */ +assert(!(bs->backing && bs->file)); + +return bs->backing ?: bs->file; +} + +/* + * Return any filtered child, independently of how it reacts to write + * accesses and whether data is copied onto this BDS through COR. + */ +BdrvChild *bdrv_filtered_child(BlockDriverState *bs) +{ +BdrvChild *cow_child = bdrv_filtered_cow_child(bs); +BdrvChild *rw_child = bdrv_filtered_rw_child(bs); + +/* There can only be one filtered child at a time */ +assert(!(cow_child && rw_child)); + +return cow_child ?: rw_child; +} + +/* + * Return the child that stores the metadata for this node. + */ +BdrvChild *bdrv_metadata_child(BlockDriverState *bs) +{ +if (!bs || !bs->drv) { +return NULL; +} + +/* Filters do not have metadata */ +if (bs->drv->is_filter) { +return NULL; +} + +return bs->file; +} + +/* + * Return the child that stores the data that is allocated on this + * node. This may or may not include metadata. + */ +BdrvChild *bdrv_storage_child(BlockDriverState *bs) +{ +if (!bs || !bs->drv) { +return NULL; +} + +if (bs->drv->bdrv_storage_child) { +return bs->drv->bdrv_storage_child(bs); +} + +return bdrv_filtered_rw_child(bs) ?: bs->file; +} + +/* + * Return the primary child of this node: For filters, that is the + * filtered child. For other nodes, that is usually the child storing + * metadata. + * (A generally more helpful description is that this is (usually) the + * child that has the same filename as @bs.) 
+ */ +BdrvChild *bdrv_primary_child(BlockDriverState *bs) +{ +return bdrv_filtered_rw_child(bs) ?: bs->file; +} diff --git a/include/block/block_int.h b/include/block/block_int.h index df6d027..dca59e9 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -89,9 +89,11 @@ struct BlockDriver { int instance_size; /* set to true if the BlockDriver is a block filter. Block filters pass - * certain callbacks that refer to data (see block.c) to their bs->file if - * the driver doesn't implement them. Drivers that do not wish to forward - * must implement them and return -ENOTSUP. + * certain callbacks that refer to data (see block.c) to their bs->file + * or bs->backing (whichever one exists) if the driver doesn't implement + * them. Drivers that do not wish to forward must implement them and return + * -ENOTSUP. + * Note that filters are not allowed to modify data. */ bool is_filter; /* @@ -585,6 +587,13 @@ struct BlockDriver { * If this pointer is NULL, the array is considered empty. * "filename" and "driver" are always considered strong. */ const char *const *strong_runtime_opts; + +/** + * Return the data storage child, if there is exactly one. If + * this function is not implemented, the block layer will assume + * bs->file to be this child. + */ +BdrvChild *(*bdrv_storage_child)(
Re: [PATCH v2 3/4] mirror: Make sure that source and target size match
11.05.2020 16:58, Kevin Wolf wrote: If the target is shorter than the source, mirror would copy data until it reaches the end of the target and then fail with an I/O error when trying to write past the end. If the target is longer than the source, the mirror job would complete successfully, but the target wouldn't actually be an accurate copy of the source image (it would contain some additional garbage at the end). Fix this by checking that both images have the same size when the job starts. Signed-off-by: Kevin Wolf Message-Id: <20200507145228.323412-3-kw...@redhat.com> Reviewed-by: Eric Blake Signed-off-by: Kevin Wolf --- block/mirror.c | 21 - 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/block/mirror.c b/block/mirror.c index aca95c9bc9..201ffa26f9 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -872,6 +872,7 @@ static int coroutine_fn mirror_run(Job *job, Error **errp) BlockDriverState *target_bs = blk_bs(s->target); bool need_drain = true; int64_t length; +int64_t target_length; BlockDriverInfo bdi; char backing_filename[2]; /* we only need 2 characters because we are only checking for a NULL string */ @@ -887,24 +888,26 @@ static int coroutine_fn mirror_run(Job *job, Error **errp) goto immediate_exit; } +target_length = blk_getlength(s->target); +if (target_length < 0) { +ret = target_length; +goto immediate_exit; +} + /* Active commit must resize the base image if its size differs from the * active layer. */ if (s->base == blk_bs(s->target)) { -int64_t base_length; - -base_length = blk_getlength(s->target); -if (base_length < 0) { -ret = base_length; -goto immediate_exit; -} - -if (s->bdev_length > base_length) { +if (s->bdev_length > target_length) { ret = blk_truncate(s->target, s->bdev_length, false, PREALLOC_MODE_OFF, 0, NULL); if (ret < 0) { goto immediate_exit; } } Hmm, interesting, if base is larger, is our behavior correct? Blockdev becomes larger after commit and old data becomes available? 
I think we should zero the tail after old EOF or shrink the target.. +} else if (s->bdev_length != target_length) { +error_setg(errp, "Source and target image have different sizes"); +ret = -EINVAL; Seems, the only case, when mirror_run() sets errp. And, therefore, the only correct one.. +goto immediate_exit; } if (s->bdev_length == 0) { -- Best regards, Vladimir
Re: [PATCH v2 3/4] mirror: Make sure that source and target size match
12.05.2020 20:15, Vladimir Sementsov-Ogievskiy wrote: + } else if (s->bdev_length != target_length) { + error_setg(errp, "Source and target image have different sizes"); + ret = -EINVAL; This seems to be the only case where mirror_run() sets errp, and, therefore, the only correct one.. the only failure case, I mean, of course. -- Best regards, Vladimir
Re: [PATCH v4 00/15] Apply COR-filter to the block-stream permanently
12.05.2020 19:50, Andrey Shinkevich wrote: With this series, all the block-stream COR operations pass through the COR-filter. The patches 01-08/15 are taken from the series "block: Deal with filters" by Max Reitz, the full version of that can be found in the branches: https://git.xanclic.moe/XanClic/qemu child-access-functions-v6 https://github.com/XanClic/qemu child-access-functions-v6 When running iotests, apply "char-socket: Fix race condition" to avoid sporadic segmentation faults. v4: 01: Initialization of the COR-filter BDRVStateCOR member. Hmm... but 01 doesn't touch COR-filter v3: 01: The COR filter insert/remove functions moved to block/copy-on-read.c to be a part of API. 02: block/stream.c code refactoring. 03: The separate call to block_job_add_bdrv() is used to block operations on the active node after the filter inserted and the job created. 04: The iotests case 030::test_overlapping_4 was modified to unbound the block-stream job from the base node. 05: The COR driver functions preadv/pwritev replaced with their analogous preadv/pwritev_part. I assume, these changes are about your patches, which are 09-15, and Max's patches are unchanged, right? v2: 01: No more skipping filters while checking for operation blockers. However, we exclude filters between the bottom node and base because we do not set the operation blockers for filters anymore. 02: As stated above, we do not set the operation blockers for filters anymore. So, skip filters when we block operations for the target node. 03: The comment added for the patch 4/7. 04: The QAPI target version changed to 5.1. 05: The 'filter-node-name' now excluded from using in the test #030. If we need it no more in a final version of the series, the patch 5/7 may be removed. 06: The COR-filter included into the frozen chain of a block-stream job. The 'above_base' node pointer is left because it is essential for finding the base node in case of filters above. 
Andrey Shinkevich (7): block: prepare block-stream for using COR-filter copy-on-read: Support change filename functions copy-on-read: Support preadv/pwritev_part functions copy-on-read: add filter append/drop functions qapi: add filter-node-name to block-stream iotests: prepare 245 for using filter in block-stream block: apply COR-filter to block-stream jobs Max Reitz (8): block: Mark commit and mirror as filter drivers this is for commit copy-on-read: Support compressed writes for stream block: Add child access functions I do think, that for these series we need only filtered child and nothing more block: Add chain helper functions block: Include filters when freezing backing chain block: Use CAFs in block status functions commit: Deal with filters when blocking intermediate nodes block: Use CAFs when working with backing chains So, fix stream, commit and some thing used in it. Hi Max! Could you take a brief look and say, could we proceed in this way, taking part of your old series? How much it conflicts with your plans? Let me clarify. This all is needed, as we have old proposed feature (and patches): discarding blocks from intermediate images during block-stream. It helps to save disk space during stream process. And the correct way to get access to intermediate nodes (to be able to discard from them) is to append a filter. Firstly we proposed our own filter, but that was proposed on list to use existing COR filter for stream and it seemed a correct way. So we are trying to insert this COR filter. And the problem with it that without your series it breaks iotest 30, which does different magic with parallel stream and commit on the same backing chain. So, it was my proposal to extract something from your series, to make this test work. And the result is here. I thought that the necessary part of your series for stream/commit is smaller.. But still, 8 patches is not too much. 
The stream feature has already been postponed for more than a year because of this trouble. We need to proceed somehow, and the feature is useful. -- Best regards, Vladimir
Re: [PATCH v3 00/15] Apply COR-filter to the block-stream permanently
Patchew URL: https://patchew.org/QEMU/1589295196-773454-1-git-send-email-andrey.shinkev...@virtuozzo.com/ Hi, This series failed the docker-quick@centos7 build test. Please find the testing commands and their output below. If you have Docker installed, you can probably reproduce it locally. === TEST SCRIPT BEGIN === #!/bin/bash make docker-image-centos7 V=1 NETWORK=1 time make docker-test-quick@centos7 SHOW_ENV=1 J=14 NETWORK=1 === TEST SCRIPT END === CC block/copy-on-read.o CC block/block-copy.o CC block/crypto.o /tmp/qemu-test/src/block/copy-on-read.c:29:32: fatal error: block/copy-on-read.h: No such file or directory #include "block/copy-on-read.h" ^ compilation terminated. --- CC scsi/pr-manager-helper.o CC block/dmg-bz2.o CC crypto/hash.o /tmp/qemu-test/src/block/stream.c:22:32: fatal error: block/copy-on-read.h: No such file or directory #include "block/copy-on-read.h" ^ compilation terminated. CC crypto/hash-nettle.o CC crypto/hmac.o make: *** [block/copy-on-read.o] Error 1 make: *** Waiting for unfinished jobs make: *** [block/stream.o] Error 1 Traceback (most recent call last): File "./tests/docker/docker.py", line 664, in sys.exit(main()) --- raise CalledProcessError(retcode, cmd) subprocess.CalledProcessError: Command '['sudo', '-n', 'docker', 'run', '--label', 'com.qemu.instance.uuid=09ff0b4367d645d083a08750a8b4b4c7', '-u', '1003', '--security-opt', 'seccomp=unconfined', '--rm', '-e', 'TARGET_LIST=', '-e', 'EXTRA_CONFIGURE_OPTS=', '-e', 'V=', '-e', 'J=14', '-e', 'DEBUG=', '-e', 'SHOW_ENV=1', '-e', 'CCACHE_DIR=/var/tmp/ccache', '-v', '/home/patchew2/.cache/qemu-docker-ccache:/var/tmp/ccache:z', '-v', '/var/tmp/patchew-tester-tmp-rfy6u1nc/src/docker-src.2020-05-12-14.18.19.3751:/var/tmp/qemu:z,ro', 'qemu:centos7', '/var/tmp/qemu/run', 'test-quick']' returned non-zero exit status 2. 
filter=--filter=label=com.qemu.instance.uuid=09ff0b4367d645d083a08750a8b4b4c7 make[1]: *** [docker-run] Error 1 make[1]: Leaving directory `/var/tmp/patchew-tester-tmp-rfy6u1nc/src' make: *** [docker-run-test-quick@centos7] Error 2 real2m14.644s user0m8.133s The full log is available at http://patchew.org/logs/1589295196-773454-1-git-send-email-andrey.shinkev...@virtuozzo.com/testing.docker-quick@centos7/?type=message. --- Email generated automatically by Patchew [https://patchew.org/]. Please send your feedback to patchew-de...@redhat.com
Re: [PATCH v3 00/15] Apply COR-filter to the block-stream permanently
Patchew URL: https://patchew.org/QEMU/1589295196-773454-1-git-send-email-andrey.shinkev...@virtuozzo.com/ Hi, This series failed the asan build test. Please find the testing commands and their output below. If you have Docker installed, you can probably reproduce it locally. === TEST SCRIPT BEGIN === #!/bin/bash export ARCH=x86_64 make docker-image-fedora V=1 NETWORK=1 time make docker-test-debug@fedora TARGET_LIST=x86_64-softmmu J=14 NETWORK=1 === TEST SCRIPT END === CC scsi/utils.o CC scsi/pr-manager.o CC scsi/pr-manager-helper.o /tmp/qemu-test/src/block/copy-on-read.c:29:10: fatal error: 'block/copy-on-read.h' file not found #include "block/copy-on-read.h" ^~ 1 error generated. make: *** [/tmp/qemu-test/src/rules.mak:69: block/copy-on-read.o] Error 1 make: *** Waiting for unfinished jobs /tmp/qemu-test/src/block/stream.c:22:10: fatal error: 'block/copy-on-read.h' file not found #include "block/copy-on-read.h" ^~ 1 error generated. make: *** [/tmp/qemu-test/src/rules.mak:69: block/stream.o] Error 1 Traceback (most recent call last): File "./tests/docker/docker.py", line 664, in sys.exit(main()) --- raise CalledProcessError(retcode, cmd) subprocess.CalledProcessError: Command '['sudo', '-n', 'docker', 'run', '--label', 'com.qemu.instance.uuid=0116cd1f21484d809ce1bbf09d8f2d13', '-u', '1003', '--security-opt', 'seccomp=unconfined', '--rm', '-e', 'TARGET_LIST=x86_64-softmmu', '-e', 'EXTRA_CONFIGURE_OPTS=', '-e', 'V=', '-e', 'J=14', '-e', 'DEBUG=', '-e', 'SHOW_ENV=', '-e', 'CCACHE_DIR=/var/tmp/ccache', '-v', '/home/patchew2/.cache/qemu-docker-ccache:/var/tmp/ccache:z', '-v', '/var/tmp/patchew-tester-tmp-980fkf38/src/docker-src.2020-05-12-14.21.09.10733:/var/tmp/qemu:z,ro', 'qemu:fedora', '/var/tmp/qemu/run', 'test-debug']' returned non-zero exit status 2. 
filter=--filter=label=com.qemu.instance.uuid=0116cd1f21484d809ce1bbf09d8f2d13 make[1]: *** [docker-run] Error 1 make[1]: Leaving directory `/var/tmp/patchew-tester-tmp-980fkf38/src' make: *** [docker-run-test-debug@fedora] Error 2 real3m5.236s user0m8.401s The full log is available at http://patchew.org/logs/1589295196-773454-1-git-send-email-andrey.shinkev...@virtuozzo.com/testing.asan/?type=message. --- Email generated automatically by Patchew [https://patchew.org/]. Please send your feedback to patchew-de...@redhat.com
Re: [PATCH v3 00/15] Apply COR-filter to the block-stream permanently
Patchew URL: https://patchew.org/QEMU/1589295196-773454-1-git-send-email-andrey.shinkev...@virtuozzo.com/ Hi, This series failed the docker-mingw@fedora build test. Please find the testing commands and their output below. If you have Docker installed, you can probably reproduce it locally. === TEST SCRIPT BEGIN === #! /bin/bash export ARCH=x86_64 make docker-image-fedora V=1 NETWORK=1 time make docker-test-mingw@fedora J=14 NETWORK=1 === TEST SCRIPT END === CC nbd/server.o CC nbd/client.o CC nbd/common.o /tmp/qemu-test/src/block/copy-on-read.c:29:10: fatal error: block/copy-on-read.h: No such file or directory #include "block/copy-on-read.h" ^~ compilation terminated. make: *** [/tmp/qemu-test/src/rules.mak:69: block/copy-on-read.o] Error 1 make: *** Waiting for unfinished jobs /tmp/qemu-test/src/block/stream.c:22:10: fatal error: block/copy-on-read.h: No such file or directory #include "block/copy-on-read.h" ^~ compilation terminated. make: *** [/tmp/qemu-test/src/rules.mak:69: block/stream.o] Error 1 Traceback (most recent call last): File "./tests/docker/docker.py", line 664, in sys.exit(main()) --- raise CalledProcessError(retcode, cmd) subprocess.CalledProcessError: Command '['sudo', '-n', 'docker', 'run', '--label', 'com.qemu.instance.uuid=e588e99e0e78491a81fad3d9b8bea6b1', '-u', '1003', '--security-opt', 'seccomp=unconfined', '--rm', '-e', 'TARGET_LIST=', '-e', 'EXTRA_CONFIGURE_OPTS=', '-e', 'V=', '-e', 'J=14', '-e', 'DEBUG=', '-e', 'SHOW_ENV=', '-e', 'CCACHE_DIR=/var/tmp/ccache', '-v', '/home/patchew2/.cache/qemu-docker-ccache:/var/tmp/ccache:z', '-v', '/var/tmp/patchew-tester-tmp-wfo11d6n/src/docker-src.2020-05-12-14.25.07.16522:/var/tmp/qemu:z,ro', 'qemu:fedora', '/var/tmp/qemu/run', 'test-mingw']' returned non-zero exit status 2. 
filter=--filter=label=com.qemu.instance.uuid=e588e99e0e78491a81fad3d9b8bea6b1 make[1]: *** [docker-run] Error 1 make[1]: Leaving directory `/var/tmp/patchew-tester-tmp-wfo11d6n/src' make: *** [docker-run-test-mingw@fedora] Error 2 real1m57.227s user0m8.202s The full log is available at http://patchew.org/logs/1589295196-773454-1-git-send-email-andrey.shinkev...@virtuozzo.com/testing.docker-mingw@fedora/?type=message. --- Email generated automatically by Patchew [https://patchew.org/]. Please send your feedback to patchew-de...@redhat.com
Re: [PATCH v2 3/4] mirror: Make sure that source and target size match
Am 12.05.2020 um 19:15 hat Vladimir Sementsov-Ogievskiy geschrieben: > 11.05.2020 16:58, Kevin Wolf wrote: > > If the target is shorter than the source, mirror would copy data until > > it reaches the end of the target and then fail with an I/O error when > > trying to write past the end. > > > > If the target is longer than the source, the mirror job would complete > > successfully, but the target wouldn't actually be an accurate copy of > > the source image (it would contain some additional garbage at the end). > > > > Fix this by checking that both images have the same size when the job > > starts. > > > > Signed-off-by: Kevin Wolf > > Message-Id: <20200507145228.323412-3-kw...@redhat.com> > > Reviewed-by: Eric Blake > > Signed-off-by: Kevin Wolf > > --- > > block/mirror.c | 21 - > > 1 file changed, 12 insertions(+), 9 deletions(-) > > > > diff --git a/block/mirror.c b/block/mirror.c > > index aca95c9bc9..201ffa26f9 100644 > > --- a/block/mirror.c > > +++ b/block/mirror.c > > @@ -872,6 +872,7 @@ static int coroutine_fn mirror_run(Job *job, Error > > **errp) > > BlockDriverState *target_bs = blk_bs(s->target); > > bool need_drain = true; > > int64_t length; > > +int64_t target_length; > > BlockDriverInfo bdi; > > char backing_filename[2]; /* we only need 2 characters because we are > > only > >checking for a NULL string */ > > @@ -887,24 +888,26 @@ static int coroutine_fn mirror_run(Job *job, Error > > **errp) > > goto immediate_exit; > > } > > +target_length = blk_getlength(s->target); > > +if (target_length < 0) { > > +ret = target_length; > > +goto immediate_exit; > > +} > > + > > /* Active commit must resize the base image if its size differs from > > the > >* active layer. 
*/ > > if (s->base == blk_bs(s->target)) { > > -int64_t base_length; > > - > > -base_length = blk_getlength(s->target); > > -if (base_length < 0) { > > -ret = base_length; > > -goto immediate_exit; > > -} > > - > > -if (s->bdev_length > base_length) { > > +if (s->bdev_length > target_length) { > > ret = blk_truncate(s->target, s->bdev_length, false, > > PREALLOC_MODE_OFF, 0, NULL); > > if (ret < 0) { > > goto immediate_exit; > > } > > } > > Hmm, interesting, if base is larger, is our behavior correct? Blockdev > becomes larger after commit and old data becomes available? I think we > should zero the tail after old EOF or shrink the target.. Yes, I agree, we should shrink it. But active commit is a different case than what I'm fixing in this patch, so I left it unmodified. We'll probably need a third series for commit after backup and mirror. > > +} else if (s->bdev_length != target_length) { > > +error_setg(errp, "Source and target image have different sizes"); > > +ret = -EINVAL; > > Seems, the only case, when mirror_run() sets errp. And, therefore, the > only correct one.. job_update_rc() takes care to fill job->err (with strerror()) if it hasn't been set yet, so the other places aren't strictly wrong, but probably provide suboptimal error messages. Kevin
Re: [PATCH 0/5] Introduce 'yank' oob qmp command to recover from hanging qemu
* Daniel P. Berrangé (berra...@redhat.com) wrote: > On Tue, May 12, 2020 at 11:32:06AM +0200, Lukas Straub wrote: > > On Mon, 11 May 2020 16:46:45 +0100 > > "Dr. David Alan Gilbert" wrote: > > > > > * Daniel P. Berrangé (berra...@redhat.com) wrote: > > > > ... > > > > That way if QEMU does get stuck, you can start by tearing down the > > > > least distruptive channel. eg try tearing down the migration connection > > > > first (which shouldn't negatively impact the guest), and only if that > > > > doesn't work then, move on to tear down the NBD connection (which risks > > > > data loss) > > > > > > I wonder if a different way would be to make all network connections > > > register with yank, but then make yank take a list of connections to > > > shutdown(2). > > > > Good Idea. We could name the connections (/yank callbacks) in the > > form "nbd:", "chardev:" and "migration" > > (and add "netdev:...", etc. in the future). Then make yank take a > > list of connection names as you suggest and silently ignore connections > > that don't exist. And maybe even add a 'query-yank' oob command returning > > a list of registered connections so the management application can do > > pattern matching if it wants. > > Yes, that would make the yank command much more flexible in how it can > be used. > > As an alternative to using formatted strings like this, it could be > modelled more explicitly in QAPI > > { 'struct': 'YankChannels', > 'data': { 'chardev': [ 'string' ], > 'nbd': ['string'], > 'migration': bool } } > > In this example, 'chardev' would accept a list of chardev IDs which > have it enabled, 'nbd' would accept a list of block node IDs which > have it enabled, and migration is a singleton on/off. Do we already have a QOM object name for each of these things? Is that nbd/blockdevice unique - i.e. can you have multiple nbd clients on the same node? 
> The benefit of this modelling is that you can introspect QEMU > to discover what classes of channels support being yanked by > this QEMU build, as well as what instances are configured to > be yanked. ie you can distinguish between a QEMU that doesn't > support yanking network devices, from a QEMU that does support > yanking network devices, but doesn't have it enabled for any > network device instances. What do we need to make it introspectable like that? Dave > Regards, > Daniel > -- > |: https://berrange.com -o-https://www.flickr.com/photos/dberrange :| > |: https://libvirt.org -o-https://fstop138.berrange.com :| > |: https://entangle-photo.org-o-https://www.instagram.com/dberrange :| -- Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
Re: [RFC PATCH 2/3] block: Allow bdrv_run_co() from different AioContext
Am 12.05.2020 um 18:02 hat Thomas Lamprecht geschrieben: > On 5/12/20 4:43 PM, Kevin Wolf wrote: > > Coroutine functions that are entered through bdrv_run_co() are already > > safe to call from synchronous code in a different AioContext because > > bdrv_coroutine_enter() will schedule them in the context of the node. > > > > However, the coroutine fastpath still requires that we're already in the > > right AioContext when called in coroutine context. > > > > In order to make the behaviour more consistent and to make life a bit > > easier for callers, let's check the AioContext and automatically move > > the current coroutine around if we're not in the right context yet. > > > > Signed-off-by: Kevin Wolf > > --- > > block/io.c | 15 ++- > > 1 file changed, 14 insertions(+), 1 deletion(-) > > > > diff --git a/block/io.c b/block/io.c > > index c1badaadc9..7808e8bdc0 100644 > > --- a/block/io.c > > +++ b/block/io.c > > @@ -895,8 +895,21 @@ static int bdrv_run_co(BlockDriverState *bs, > > CoroutineEntry *entry, > > void *opaque, int *ret) > > { > > if (qemu_in_coroutine()) { > > -/* Fast-path if already in coroutine context */ > > +Coroutine *self = qemu_coroutine_self(); > > +AioContext *bs_ctx = bdrv_get_aio_context(bs); > > +AioContext *co_ctx = qemu_coroutine_get_aio_context(self); > > + > > +if (bs_ctx != co_ctx) { > > +/* Move to the iothread of the node */ > > +aio_co_schedule(bs_ctx, self); > > +qemu_coroutine_yield(); > > +} > > entry(opaque); > > +if (bs_ctx != co_ctx) { > > +/* Move back to the original AioContext */ > > +aio_co_schedule(bs_ctx, self); > > shouldn't it use co_ctx here, as else it's just scheduled again on the > one from bs? Oops, you're right, of course. > Looks OK for me besides that. Thanks! Kevin
Re: [PATCH v3 7/9] qcow2: Expose bitmaps' size during measure
On 5/12/20 6:10 AM, Max Reitz wrote: This does not break old code since previously we always reported only guest visible content here, but it changes the semantics, and now you cannot allocate "required" size, you need to allocate "required" size with "bitmaps" size. Only if you copy the bitmaps, though, which requires using the --bitmaps switch for convert. First, a usage question: would you rather that 'qemu-img convert --bitmaps' silently succeeds even when the image being converted has no bitmaps, or would you rather that it fails loudly when there are no bitmaps to be copied over? As implemented in this patch series, patch 8 currently silently succeeds. But in order to make patch 7 and 8 consistent with one another, I need to know which behavior is easier to use: failing convert if the source lacks bitmaps (and thus measure would suppress the bitmaps:0 output), or treating lack of bitmaps as nothing additional to copy and thereby succeeding (and thus measure should output bitmaps:0 to show that no additional space is needed because nothing else will be copied, successfully). If we add a new extension all users will have to change the calculation again. It was my impression that existing users won’t have to do that, because they don’t use --bitmaps yet. In contrast, if we included the bitmap size in @required or @fully-allocated, then previous users that didn’t copy bitmaps to the destination (which they couldn’t) would allocate too much space. ...revisiting this after reading more of your mail: With a --bitmaps switch, existing users wouldn’t suffer from such compatibility problems. However, then users (that now want to copy bitmaps) will have to pass the new --bitmaps flag in the command line, and I don’t see how that’s less complicated than just adding @bitmaps to @required. 
More concretely, suppose down the road that we add the ability to copy internal snapshots (not that you want to mix internal and external snapshots, but that such information already exists and therefore can be used as an example). Which is easier: $ qemu-img measure -f qcow2 image.qcow2 required size: 8716288 fully allocated size: 8716288 bitmaps size: 1024 internal snapshot size: 2048 where you now have to add three numbers prior to creating dest.qcow2 and calling: $ qemu-img convert image.qcow2 -f dest.qcow2 --bitmaps --snapshots or using: $ qemu-img measure --bitmaps --snapshots -f qcow2 image.qcow2 required size: 8719360 fully allocated size: 8719360 where you then call: $ qemu-img convert image.qcow2 -f dest.qcow2 --bitmaps --snapshots with a single size that matches the same arguments you pass to qemu-img convert? What about failure cases? What happens when qemu-img doesn't understand --snapshots but does understand --bitmaps? Do you have to try a series of up to three calls to find how much information is supported: $ qemu-img measure -f qcow2 image.qcow2 --bitmaps --snapshots error: unknown argument $ qemu-img measure -f qcow2 image.qcow2 --bitmaps error: unknown argument $ qemu-img measure -f qcow2 image.qcow2 data given, now you know that neither --bitmaps nor --snapshots will work or is it nicer to issue just one measure without options, getting separate output lines, and seeing which output lines exist to know which convert options are supported, at the minor expense of having to add values yourself? And then back to my question: should 'measure --bitmaps' fail if there are no bitmaps to be measured, or silently succeed and not change the output size? With the current way, to measure an image we need to do: qemu-img info --output json ... check if image contains bitmaps qemu-img measure --output json ... check if bitmaps are reported. Why do you have to check 'qemu-img info' first? 
If measure reports bitmaps, then you know bitmaps can be copied; if it doesn't, then you can check info as a fallback path to compute size yourself - but computing the size yourself doesn't help unless you also have fallbacks to copy the bitmaps via QMP commands, because that qemu-img will also lack 'qemu-img convert --bitmaps' or 'qemu-img bitmaps' to do it via qemu-img. If image has bitmaps and bitmaps are not reported, we know that we have an old version of qemu-img that does not know how to measure bitmaps. Well, but you’ll also see that convert --bitmaps will fail. But I can see that you probably want to know about that before you create the target image. If bitmaps are reported in both commands we will use the value when creating block devices. And otherwise? Do you error out, or do you just omit --bitmaps from convert? If we always report bitmaps even when they are zero, we don't need to run "qemu-img info". But my preferred interface is: qemu-img measure --bitmaps ... And report bitmaps only if the user asked to get the value. In this case the required and fully-allocated values will include bitmaps. Well, it would be consistent with
Re: [PATCH 0/5] Introduce 'yank' oob qmp command to recover from hanging qemu
On Tue, 12 May 2020 10:43:37 +0100 Daniel P. Berrangé wrote: > On Tue, May 12, 2020 at 11:32:06AM +0200, Lukas Straub wrote: > > ... > > > > Good Idea. We could name the connections (/yank callbacks) in the > > form "nbd:", "chardev:" and "migration" > > (and add "netdev:...", etc. in the future). Then make yank take a > > list of connection names as you suggest and silently ignore connections > > that don't exist. And maybe even add a 'query-yank' oob command returning > > a list of registered connections so the management application can do > > pattern matching if it wants. > > Yes, that would make the yank command much more flexible in how it can > be used. > > As an alternative to using formatted strings like this, it could be > modelled more explicitly in QAPI > > { 'struct': 'YankChannels', > 'data': { 'chardev': [ 'string' ], > 'nbd': ['string'], > 'migration': bool } } > > In this example, 'chardev' would accept a list of chardev IDs which > have it enabled, 'nbd' would accept a list of block node IDs which > have it enabled, and migration is a singleton on/off. With the new command, the yank feature can then be enabled unconditionally on the instances. > The benefit of this modelling is that you can introspect QEMU > to discover what classes of channels support being yanked by > this QEMU build, as well as what instances are configured to > be yanked. ie you can distinguish between a QEMU that doesn't > support yanking network devices, from a QEMU that does support > yanking network devices, but doesn't have it enabled for any > network device instances. > > Regards, > Daniel pgp4JEcSFZzlS.pgp Description: OpenPGP digital signature
[PATCH] hw/ide: Make IDEDMAOps handlers take a const IDEDMA pointer
Handlers don't need to modify the IDEDMA structure. Make it const. Signed-off-by: Philippe Mathieu-Daudé --- include/hw/ide/internal.h | 12 ++-- hw/ide/ahci.c | 18 +- hw/ide/core.c | 6 +++--- hw/ide/macio.c| 6 +++--- hw/ide/pci.c | 12 ++-- 5 files changed, 27 insertions(+), 27 deletions(-) diff --git a/include/hw/ide/internal.h b/include/hw/ide/internal.h index 55da35d768..1a7869e85d 100644 --- a/include/hw/ide/internal.h +++ b/include/hw/ide/internal.h @@ -322,12 +322,12 @@ typedef enum { IDE_HD, IDE_CD, IDE_CFATA } IDEDriveKind; typedef void EndTransferFunc(IDEState *); -typedef void DMAStartFunc(IDEDMA *, IDEState *, BlockCompletionFunc *); -typedef void DMAVoidFunc(IDEDMA *); -typedef int DMAIntFunc(IDEDMA *, bool); -typedef int32_t DMAInt32Func(IDEDMA *, int32_t len); -typedef void DMAu32Func(IDEDMA *, uint32_t); -typedef void DMAStopFunc(IDEDMA *, bool); +typedef void DMAStartFunc(const IDEDMA *, IDEState *, BlockCompletionFunc *); +typedef void DMAVoidFunc(const IDEDMA *); +typedef int DMAIntFunc(const IDEDMA *, bool); +typedef int32_t DMAInt32Func(const IDEDMA *, int32_t len); +typedef void DMAu32Func(const IDEDMA *, uint32_t); +typedef void DMAStopFunc(const IDEDMA *, bool); struct unreported_events { bool eject_request; diff --git a/hw/ide/ahci.c b/hw/ide/ahci.c index 13d91e109a..168d34e9f2 100644 --- a/hw/ide/ahci.c +++ b/hw/ide/ahci.c @@ -44,7 +44,7 @@ static int handle_cmd(AHCIState *s, int port, uint8_t slot); static void ahci_reset_port(AHCIState *s, int port); static bool ahci_write_fis_d2h(AHCIDevice *ad); static void ahci_init_d2h(AHCIDevice *ad); -static int ahci_dma_prepare_buf(IDEDMA *dma, int32_t limit); +static int ahci_dma_prepare_buf(const IDEDMA *dma, int32_t limit); static bool ahci_map_clb_address(AHCIDevice *ad); static bool ahci_map_fis_address(AHCIDevice *ad); static void ahci_unmap_clb_address(AHCIDevice *ad); @@ -1338,7 +1338,7 @@ out: } /* Transfer PIO data between RAM and device */ -static void ahci_pio_transfer(IDEDMA *dma) 
+static void ahci_pio_transfer(const IDEDMA *dma) { AHCIDevice *ad = DO_UPCAST(AHCIDevice, dma, dma); IDEState *s = &ad->port.ifs[0]; @@ -1397,7 +1397,7 @@ out: } } -static void ahci_start_dma(IDEDMA *dma, IDEState *s, +static void ahci_start_dma(const IDEDMA *dma, IDEState *s, BlockCompletionFunc *dma_cb) { AHCIDevice *ad = DO_UPCAST(AHCIDevice, dma, dma); @@ -1406,7 +1406,7 @@ static void ahci_start_dma(IDEDMA *dma, IDEState *s, dma_cb(s, 0); } -static void ahci_restart_dma(IDEDMA *dma) +static void ahci_restart_dma(const IDEDMA *dma) { /* Nothing to do, ahci_start_dma already resets s->io_buffer_offset. */ } @@ -1415,7 +1415,7 @@ static void ahci_restart_dma(IDEDMA *dma) * IDE/PIO restarts are handled by the core layer, but NCQ commands * need an extra kick from the AHCI HBA. */ -static void ahci_restart(IDEDMA *dma) +static void ahci_restart(const IDEDMA *dma) { AHCIDevice *ad = DO_UPCAST(AHCIDevice, dma, dma); int i; @@ -1432,7 +1432,7 @@ static void ahci_restart(IDEDMA *dma) * Called in DMA and PIO R/W chains to read the PRDT. * Not shared with NCQ pathways. */ -static int32_t ahci_dma_prepare_buf(IDEDMA *dma, int32_t limit) +static int32_t ahci_dma_prepare_buf(const IDEDMA *dma, int32_t limit) { AHCIDevice *ad = DO_UPCAST(AHCIDevice, dma, dma); IDEState *s = &ad->port.ifs[0]; @@ -1453,7 +1453,7 @@ static int32_t ahci_dma_prepare_buf(IDEDMA *dma, int32_t limit) * Called via dma_buf_commit, for both DMA and PIO paths. * sglist destruction is handled within dma_buf_commit. 
*/ -static void ahci_commit_buf(IDEDMA *dma, uint32_t tx_bytes) +static void ahci_commit_buf(const IDEDMA *dma, uint32_t tx_bytes) { AHCIDevice *ad = DO_UPCAST(AHCIDevice, dma, dma); @@ -1461,7 +1461,7 @@ static void ahci_commit_buf(IDEDMA *dma, uint32_t tx_bytes) ad->cur_cmd->status = cpu_to_le32(tx_bytes); } -static int ahci_dma_rw_buf(IDEDMA *dma, bool is_write) +static int ahci_dma_rw_buf(const IDEDMA *dma, bool is_write) { AHCIDevice *ad = DO_UPCAST(AHCIDevice, dma, dma); IDEState *s = &ad->port.ifs[0]; @@ -1486,7 +1486,7 @@ static int ahci_dma_rw_buf(IDEDMA *dma, bool is_write) return 1; } -static void ahci_cmd_done(IDEDMA *dma) +static void ahci_cmd_done(const IDEDMA *dma) { AHCIDevice *ad = DO_UPCAST(AHCIDevice, dma, dma); diff --git a/hw/ide/core.c b/hw/ide/core.c index 689bb36409..d997a78e47 100644 --- a/hw/ide/core.c +++ b/hw/ide/core.c @@ -2570,16 +2570,16 @@ static void ide_init1(IDEBus *bus, int unit) ide_sector_write_timer_cb, s); } -static int ide_nop_int(IDEDMA *dma, bool is_write) +static int ide_nop_int(const IDEDMA *dma, bool is_write) { return 0; } -static void ide_nop(IDEDMA *dma) +static void ide_nop(const IDEDMA *dma) { } -static in