Re: [PATCH v2 14/22] qemu-iotests/199: better catch postcopy time
19.02.2020 16:16, Andrey Shinkevich wrote: On 17/02/2020 18:02, Vladimir Sementsov-Ogievskiy wrote: The test aims to test _postcopy_ migration, and wants to do some write operations during postcopy time. Test considers migrate status=complete event on source as start of postcopy. This is completely wrong, completion is completion of the whole migration process. Let's instead consider destination start as start of postcopy, and use RESUME event for it. Next, as migration finish, let's use migration status=complete event on target, as such method is closer to what libvirt or another user will do, than tracking number of dirty-bitmaps. Finally, add a possibility to dump events for debug. And if set debug to True, we see, that actual postcopy period is very small relatively to the whole test duration time (~0.2 seconds to >40 seconds for me). This means, that test is very inefficient in what it supposed to do. Let's improve it in following commits. Signed-off-by: Vladimir Sementsov-Ogievskiy --- tests/qemu-iotests/199 | 72 +- 1 file changed, 57 insertions(+), 15 deletions(-) diff --git a/tests/qemu-iotests/199 b/tests/qemu-iotests/199 index dda918450a..6599fc6fb4 100755 --- a/tests/qemu-iotests/199 +++ b/tests/qemu-iotests/199 @@ -20,17 +20,43 @@ import os import iotests -import time from iotests import qemu_img +debug = False + disk_a = os.path.join(iotests.test_dir, 'disk_a') disk_b = os.path.join(iotests.test_dir, 'disk_b') size = '256G' fifo = os.path.join(iotests.test_dir, 'mig_fifo') +def event_seconds(event): + return event['timestamp']['seconds'] + \ + event['timestamp']['microseconds'] / 100.0 + + +def event_dist(e1, e2): + return event_seconds(e2) - event_seconds(e1) + + class TestDirtyBitmapPostcopyMigration(iotests.QMPTestCase): def tearDown(self): It's common to put the definition of setUp() ahead Preexisting, I don't want to update it in this patch + if debug: + self.vm_a_events += self.vm_a.get_qmp_events() + self.vm_b_events += 
self.vm_b.get_qmp_events() + for e in self.vm_a_events: + e['vm'] = 'SRC' + for e in self.vm_b_events: + e['vm'] = 'DST' + events = (self.vm_a_events + self.vm_b_events) + events = [(e['timestamp']['seconds'], + e['timestamp']['microseconds'], + e['vm'], + e['event'], + e.get('data', '')) for e in events] + for e in sorted(events): + print('{}.{:06} {} {} {}'.format(*e)) + self.vm_a.shutdown() self.vm_b.shutdown() os.remove(disk_a) @@ -47,6 +73,10 @@ class TestDirtyBitmapPostcopyMigration(iotests.QMPTestCase): self.vm_a.launch() self.vm_b.launch() + # collect received events for debug + self.vm_a_events = [] + self.vm_b_events = [] + def test_postcopy(self): write_size = 0x4000 granularity = 512 @@ -77,15 +107,13 @@ class TestDirtyBitmapPostcopyMigration(iotests.QMPTestCase): self.vm_a.hmp_qemu_io('drive0', 'write %d %d' % (s, chunk)) s += 0x1 - bitmaps_cap = {'capability': 'dirty-bitmaps', 'state': True} - events_cap = {'capability': 'events', 'state': True} + caps = [{'capability': 'dirty-bitmaps', 'state': True}, The name "capabilities" would be an appropriate identifier. This will result in following lines growing and not fit into one line. I'll leave "caps". Also, they are called "caps" in iotest 169 and in migration.c. And here in the context always used together with full word ('capability': or capabilities=). 
+ {'capability': 'events', 'state': True}] - result = self.vm_a.qmp('migrate-set-capabilities', - capabilities=[bitmaps_cap, events_cap]) + result = self.vm_a.qmp('migrate-set-capabilities', capabilities=caps) self.assert_qmp(result, 'return', {}) - result = self.vm_b.qmp('migrate-set-capabilities', - capabilities=[bitmaps_cap]) + result = self.vm_b.qmp('migrate-set-capabilities', capabilities=caps) self.assert_qmp(result, 'return', {}) result = self.vm_a.qmp('migrate', uri='exec:cat>' + fifo) @@ -94,24 +122,38 @@ class TestDirtyBitmapPostcopyMigration(iotests.QMPTestCase): result = self.vm_a.qmp('migrate-start-postcopy') self.assert_qmp(result, 'return', {}) - while True: - event = self.vm_a.event_wait('MIGRATION') - if event['data']['status'] == 'completed': - break + e_resume = self.vm_b.event_wait('RESUME') "event_resume" gives a faster understanding OK, no problem + self.vm_b_events.append(e_resume) s = 0x8000 while s < write_size: se
Re: [PATCH v2 12/22] qemu-iotests/199: fix style
24.07.2020 01:03, Eric Blake wrote: On 2/17/20 9:02 AM, Vladimir Sementsov-Ogievskiy wrote: Mostly, satisfy pep8 complains. Signed-off-by: Vladimir Sementsov-Ogievskiy --- tests/qemu-iotests/199 | 13 +++-- 1 file changed, 7 insertions(+), 6 deletions(-) With none of your series applied, I get: $ ./check -qcow2 199 ... 199 not run [16:52:34] [16:52:34] not suitable for this cache mode: writeback Not run: 199 Passed all 0 iotests 199 fail [16:53:37] [16:53:37] output mismatch (see 199.out.bad) --- /home/eblake/qemu/tests/qemu-iotests/199.out 2020-07-23 16:48:56.275529368 -0500 +++ /home/eblake/qemu/build/tests/qemu-iotests/199.out.bad 2020-07-23 16:53:37.728416207 -0500 @@ -1,5 +1,13 @@ -. +E +== +ERROR: test_postcopy (__main__.TestDirtyBitmapPostcopyMigration) +-- +Traceback (most recent call last): + File "199", line 41, in setUp + os.mkfifo(fifo) +FileExistsError: [Errno 17] File exists + -- Ran 1 tests -OK +FAILED (errors=1) Failures: 199 Failed 1 of 1 iotests Ah, 'scratch/mig_fifo' was left over from some other aborted run of the test. I removed that file (which implies it might be nice if the test handled that automatically, instead of making me do it), and tried again; now I got the desired: 199 pass [17:00:34] [17:01:48] 74s Passed all 1 iotests After trying to rebase your series, I once again got failures, but that could mean I botched the rebase (since quite a few of the code patches earlier in the series were non-trivially changed).> If you send a v3 (which would be really nice!), I'd hoist this and 13/22 first in the series, to get to a point where testing 199 works, to then make it easier to demonstrate what the rest of the 199 enhancements do in relation to the non-iotest patches. But I like that you separated the 199 improvements from the code - testing-wise, it's easy to apply the iotests patches first, make sure it fails, then apply the code patches, and make sure it passes, to prove that the enhanced test now covers what the code fixes did. 
A bit off-topic: Yes, that's our usual scheme: test goes after bug-fix, so careful reviewers always have to apply patches in different order to check whether there is a real bug-fix. And the only benefit of such scheme - not to break git bisect. Still, I think we can do better: - apply test first, just put it into "bug" group - check script should ignore "bug" group (unless it is explicitly specified, or direct test run) - bug-fix patch removes test from "bug" group So bisect is not broken and we achieve native ordering: problem exists (test fails) -> fixing -> no problem (test passes). I think, I'll add "bug" group as a follow-up to my "[PATCH v4 0/9] Rework iotests/check", which I really hope will land one day. PS: I've successfully rebased the series, and the test passes. I'll now fix all review notes and resend soon. -- Best regards, Vladimir
Re: [PATCH 4/7] ide: reorder set/get sector functions
On 7/24/20 7:22 AM, John Snow wrote: > Reorder these just a pinch to make them more obvious at a glance what > the addressing mode is. > > Signed-off-by: John Snow > --- > hw/ide/core.c | 26 +++--- > 1 file changed, 15 insertions(+), 11 deletions(-) Reviewed-by: Philippe Mathieu-Daudé
Re: [PATCH 1/7] ide: rename cmd_write to ctrl_write
On 7/24/20 7:22 AM, John Snow wrote: > It's the Control register, part of the Control block -- Command is > misleading here. Rename all related functions and constants. > > Signed-off-by: John Snow > --- > include/hw/ide/internal.h | 9 + > hw/ide/core.c | 12 ++-- > hw/ide/ioport.c | 2 +- > hw/ide/macio.c| 2 +- > hw/ide/mmio.c | 8 > hw/ide/pci.c | 12 ++-- > hw/ide/trace-events | 2 +- > 7 files changed, 24 insertions(+), 23 deletions(-) Reviewed-by: Philippe Mathieu-Daudé
[PATCH 7/7] ide: cancel pending callbacks on SRST
The SRST implementation did not keep up with the rest of IDE; it is possible to perform a weak reset on an IDE device to remove the BSY/DRQ bits, and then issue writes to the control/device registers which can cause chaos with the state machine. Fix that by actually performing a real reset. Reported-by: Alexander Bulekov Fixes: https://bugs.launchpad.net/qemu/+bug/1878253 Fixes: https://bugs.launchpad.net/qemu/+bug/1887303 Fixes: https://bugs.launchpad.net/qemu/+bug/1887309 Signed-off-by: John Snow --- hw/ide/core.c | 58 +++ 1 file changed, 40 insertions(+), 18 deletions(-) diff --git a/hw/ide/core.c b/hw/ide/core.c index e4c69a7fde..4da689abdf 100644 --- a/hw/ide/core.c +++ b/hw/ide/core.c @@ -2241,6 +2241,37 @@ uint32_t ide_status_read(void *opaque, uint32_t addr) return ret; } +static void ide_perform_srst(IDEState *s) +{ +s->status |= BUSY_STAT; + +/* Halt PIO (Via register state); PIO BH remains scheduled. */ +ide_transfer_halt(s); + +/* Cancel DMA -- may drain block device and invoke callbacks */ +ide_cancel_dma_sync(s); + +/* Cancel PIO callback, reset registers/signature, etc */ +ide_reset(s); + +if (s->drive_kind == IDE_CD) { +/* ATAPI drives do not set READY or SEEK */ +s->status = 0x00; +} +} + +static void ide_bus_perform_srst(void *opaque) +{ +IDEBus *bus = opaque; +IDEState *s; +int i; + +for (i = 0; i < 2; i++) { +s = &bus->ifs[i]; +ide_perform_srst(s); +} +} + void ide_ctrl_write(void *opaque, uint32_t addr, uint32_t val) { IDEBus *bus = opaque; @@ -2249,26 +2280,17 @@ void ide_ctrl_write(void *opaque, uint32_t addr, uint32_t val) trace_ide_ctrl_write(addr, val, bus); -/* common for both drives */ -if (!(bus->cmd & IDE_CTRL_RESET) && -(val & IDE_CTRL_RESET)) { -/* reset low to high */ -for(i = 0;i < 2; i++) { +/* Device0 and Device1 each have their own control register, + * but QEMU models it as just one register in the controller. 
*/ +if ((bus->cmd & IDE_CTRL_RESET) && +!(val & IDE_CTRL_RESET)) { +/* SRST triggers on falling edge */ +for (i = 0; i < 2; i++) { s = &bus->ifs[i]; -s->status = BUSY_STAT | SEEK_STAT; -s->error = 0x01; -} -} else if ((bus->cmd & IDE_CTRL_RESET) && - !(val & IDE_CTRL_RESET)) { -/* high to low */ -for(i = 0;i < 2; i++) { -s = &bus->ifs[i]; -if (s->drive_kind == IDE_CD) -s->status = 0x00; /* NOTE: READY is _not_ set */ -else -s->status = READY_STAT | SEEK_STAT; -ide_set_signature(s); +s->status |= BUSY_STAT; } +aio_bh_schedule_oneshot(qemu_get_aio_context(), +ide_bus_perform_srst, bus); } bus->cmd = val; -- 2.26.2
[PATCH 5/7] ide: remove magic constants from the device register
(In QEMU, we call this the "select" register.) My memory isn't good enough to memorize what these magic runes do. Label them to prevent mixups from happening in the future. Side note: I assume it's safe to always set 0xA0 even though ATA2 claims these bits are reserved, because ATA3 immediately reinstated that these bits should be always on. ATA4 and subsequent specs only claim that the fields are obsolete, so I assume it's safe to leave these set and that it should work with the widest array of guests. Signed-off-by: John Snow --- include/hw/ide/internal.h | 11 +++ hw/ide/core.c | 26 ++ 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/include/hw/ide/internal.h b/include/hw/ide/internal.h index 16d806e0cf..d5a6ba1056 100644 --- a/include/hw/ide/internal.h +++ b/include/hw/ide/internal.h @@ -29,6 +29,17 @@ typedef struct IDEDMAOps IDEDMAOps; #define MAX_IDE_DEVS 2 +/* Device/Head ("select") Register */ +#define ATA_DEV_SELECT 0x10 +/* ATA1,3: Defined as '1'. + * ATA2: Reserved. + * ATA3-7: obsolete. 
*/ +#define ATA_DEV_ALWAYS_ON 0xA0 +#define ATA_DEV_LBA 0x40 +#define ATA_DEV_LBA_MSB 0x0F /* LBA 24:27 */ +#define ATA_DEV_HS 0x0F /* HS 3:0 */ + + /* Bits of HD_STATUS */ #define ERR_STAT 0x01 #define INDEX_STAT 0x02 diff --git a/hw/ide/core.c b/hw/ide/core.c index f35864070b..5f4f004312 100644 --- a/hw/ide/core.c +++ b/hw/ide/core.c @@ -367,7 +367,7 @@ fill_buffer: static void ide_set_signature(IDEState *s) { -s->select &= 0xf0; /* clear head */ +s->select &= ~(ATA_DEV_HS); /* clear head */ /* put signature */ s->nsector = 1; s->sector = 1; @@ -586,7 +586,7 @@ void ide_transfer_stop(IDEState *s) int64_t ide_get_sector(IDEState *s) { int64_t sector_num; -if (s->select & 0x40) { +if (s->select & (ATA_DEV_LBA)) { if (s->lba48) { sector_num = ((int64_t)s->hob_hcyl << 40) | ((int64_t) s->hob_lcyl << 32) | @@ -595,13 +595,13 @@ int64_t ide_get_sector(IDEState *s) ((int64_t) s->lcyl << 8) | s->sector; } else { /* LBA28 */ -sector_num = ((s->select & 0x0f) << 24) | (s->hcyl << 16) | -(s->lcyl << 8) | s->sector; +sector_num = ((s->select & (ATA_DEV_LBA_MSB)) << 24) | +(s->hcyl << 16) | (s->lcyl << 8) | s->sector; } } else { /* CHS */ sector_num = ((s->hcyl << 8) | s->lcyl) * s->heads * s->sectors + -(s->select & 0x0f) * s->sectors + (s->sector - 1); +(s->select & (ATA_DEV_HS)) * s->sectors + (s->sector - 1); } return sector_num; @@ -610,7 +610,7 @@ int64_t ide_get_sector(IDEState *s) void ide_set_sector(IDEState *s, int64_t sector_num) { unsigned int cyl, r; -if (s->select & 0x40) { +if (s->select & (ATA_DEV_LBA)) { if (s->lba48) { s->sector = sector_num; s->lcyl = sector_num >> 8; @@ -620,7 +620,8 @@ void ide_set_sector(IDEState *s, int64_t sector_num) s->hob_hcyl = sector_num >> 40; } else { /* LBA28 */ -s->select = (s->select & 0xf0) | (sector_num >> 24); +s->select = (s->select & ~(ATA_DEV_LBA_MSB)) | +((sector_num >> 24) & (ATA_DEV_LBA_MSB)); s->hcyl = (sector_num >> 16); s->lcyl = (sector_num >> 8); s->sector = (sector_num); @@ -631,7 +632,8 @@ void 
ide_set_sector(IDEState *s, int64_t sector_num) r = sector_num % (s->heads * s->sectors); s->hcyl = cyl >> 8; s->lcyl = cyl; -s->select = (s->select & 0xf0) | ((r / s->sectors) & 0x0f); +s->select = (s->select & ~(ATA_DEV_HS)) | +((r / s->sectors) & (ATA_DEV_HS)); s->sector = (r % s->sectors) + 1; } } @@ -1302,10 +1304,10 @@ void ide_ioport_write(void *opaque, uint32_t addr, uint32_t val) break; case ATA_IOPORT_WR_DEVICE_HEAD: ide_clear_hob(bus); -bus->ifs[0].select = val | 0xa0; -bus->ifs[1].select = val | 0xa0; +bus->ifs[0].select = val | (ATA_DEV_ALWAYS_ON); +bus->ifs[1].select = val | (ATA_DEV_ALWAYS_ON); /* select drive */ -bus->unit = (val >> 4) & 1; +bus->unit = (val & (ATA_DEV_SELECT)) ? 1 : 0; break; default: case ATA_IOPORT_WR_COMMAND: @@ -1343,7 +1345,7 @@ static void ide_reset(IDEState *s) s->hob_lcyl = 0; s->hob_hcyl = 0; -s->select = 0xa0; +s->select = (ATA_DEV_ALWAYS_ON); s->status = READY_STAT | SEEK_STAT; s->lba48 = 0; -- 2.26.2
[PATCH 4/7] ide: reorder set/get sector functions
Reorder these just a pinch to make them more obvious at a glance what the addressing mode is. Signed-off-by: John Snow --- hw/ide/core.c | 26 +++--- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/hw/ide/core.c b/hw/ide/core.c index a880b91b47..f35864070b 100644 --- a/hw/ide/core.c +++ b/hw/ide/core.c @@ -587,21 +587,23 @@ int64_t ide_get_sector(IDEState *s) { int64_t sector_num; if (s->select & 0x40) { -/* lba */ -if (!s->lba48) { -sector_num = ((s->select & 0x0f) << 24) | (s->hcyl << 16) | -(s->lcyl << 8) | s->sector; -} else { +if (s->lba48) { sector_num = ((int64_t)s->hob_hcyl << 40) | ((int64_t) s->hob_lcyl << 32) | ((int64_t) s->hob_sector << 24) | ((int64_t) s->hcyl << 16) | ((int64_t) s->lcyl << 8) | s->sector; +} else { +/* LBA28 */ +sector_num = ((s->select & 0x0f) << 24) | (s->hcyl << 16) | +(s->lcyl << 8) | s->sector; } } else { +/* CHS */ sector_num = ((s->hcyl << 8) | s->lcyl) * s->heads * s->sectors + (s->select & 0x0f) * s->sectors + (s->sector - 1); } + return sector_num; } @@ -609,20 +611,22 @@ void ide_set_sector(IDEState *s, int64_t sector_num) { unsigned int cyl, r; if (s->select & 0x40) { -if (!s->lba48) { -s->select = (s->select & 0xf0) | (sector_num >> 24); -s->hcyl = (sector_num >> 16); -s->lcyl = (sector_num >> 8); -s->sector = (sector_num); -} else { +if (s->lba48) { s->sector = sector_num; s->lcyl = sector_num >> 8; s->hcyl = sector_num >> 16; s->hob_sector = sector_num >> 24; s->hob_lcyl = sector_num >> 32; s->hob_hcyl = sector_num >> 40; +} else { +/* LBA28 */ +s->select = (s->select & 0xf0) | (sector_num >> 24); +s->hcyl = (sector_num >> 16); +s->lcyl = (sector_num >> 8); +s->sector = (sector_num); } } else { +/* CHS */ cyl = sector_num / (s->heads * s->sectors); r = sector_num % (s->heads * s->sectors); s->hcyl = cyl >> 8; -- 2.26.2
[PATCH 1/7] ide: rename cmd_write to ctrl_write
It's the Control register, part of the Control block -- Command is misleading here. Rename all related functions and constants. Signed-off-by: John Snow --- include/hw/ide/internal.h | 9 + hw/ide/core.c | 12 ++-- hw/ide/ioport.c | 2 +- hw/ide/macio.c| 2 +- hw/ide/mmio.c | 8 hw/ide/pci.c | 12 ++-- hw/ide/trace-events | 2 +- 7 files changed, 24 insertions(+), 23 deletions(-) diff --git a/include/hw/ide/internal.h b/include/hw/ide/internal.h index 1a7869e85d..10ea6e1e23 100644 --- a/include/hw/ide/internal.h +++ b/include/hw/ide/internal.h @@ -57,8 +57,9 @@ typedef struct IDEDMAOps IDEDMAOps; #define REL0x04 #define TAG_MASK 0xf8 -#define IDE_CMD_RESET 0x04 -#define IDE_CMD_DISABLE_IRQ 0x02 +/* Bits of Device Control register */ +#define IDE_CTRL_RESET 0x04 +#define IDE_CTRL_DISABLE_IRQ0x02 /* ACS-2 T13/2015-D Table B.2 Command codes */ #define WIN_NOP0x00 @@ -564,7 +565,7 @@ static inline IDEState *idebus_active_if(IDEBus *bus) static inline void ide_set_irq(IDEBus *bus) { -if (!(bus->cmd & IDE_CMD_DISABLE_IRQ)) { +if (!(bus->cmd & IDE_CTRL_DISABLE_IRQ)) { qemu_irq_raise(bus->irq); } } @@ -603,7 +604,7 @@ void ide_atapi_io_error(IDEState *s, int ret); void ide_ioport_write(void *opaque, uint32_t addr, uint32_t val); uint32_t ide_ioport_read(void *opaque, uint32_t addr1); uint32_t ide_status_read(void *opaque, uint32_t addr); -void ide_cmd_write(void *opaque, uint32_t addr, uint32_t val); +void ide_ctrl_write(void *opaque, uint32_t addr, uint32_t val); void ide_data_writew(void *opaque, uint32_t addr, uint32_t val); uint32_t ide_data_readw(void *opaque, uint32_t addr); void ide_data_writel(void *opaque, uint32_t addr, uint32_t val); diff --git a/hw/ide/core.c b/hw/ide/core.c index d997a78e47..b472220d65 100644 --- a/hw/ide/core.c +++ b/hw/ide/core.c @@ -2235,25 +2235,25 @@ uint32_t ide_status_read(void *opaque, uint32_t addr) return ret; } -void ide_cmd_write(void *opaque, uint32_t addr, uint32_t val) +void ide_ctrl_write(void *opaque, uint32_t addr, uint32_t val) { 
IDEBus *bus = opaque; IDEState *s; int i; -trace_ide_cmd_write(addr, val, bus); +trace_ide_ctrl_write(addr, val, bus); /* common for both drives */ -if (!(bus->cmd & IDE_CMD_RESET) && -(val & IDE_CMD_RESET)) { +if (!(bus->cmd & IDE_CTRL_RESET) && +(val & IDE_CTRL_RESET)) { /* reset low to high */ for(i = 0;i < 2; i++) { s = &bus->ifs[i]; s->status = BUSY_STAT | SEEK_STAT; s->error = 0x01; } -} else if ((bus->cmd & IDE_CMD_RESET) && - !(val & IDE_CMD_RESET)) { +} else if ((bus->cmd & IDE_CTRL_RESET) && + !(val & IDE_CTRL_RESET)) { /* high to low */ for(i = 0;i < 2; i++) { s = &bus->ifs[i]; diff --git a/hw/ide/ioport.c b/hw/ide/ioport.c index ab1f4e5d9c..b613ff3bba 100644 --- a/hw/ide/ioport.c +++ b/hw/ide/ioport.c @@ -46,7 +46,7 @@ static const MemoryRegionPortio ide_portio_list[] = { }; static const MemoryRegionPortio ide_portio2_list[] = { -{ 0, 1, 1, .read = ide_status_read, .write = ide_cmd_write }, +{ 0, 1, 1, .read = ide_status_read, .write = ide_ctrl_write }, PORTIO_END_OF_LIST(), }; diff --git a/hw/ide/macio.c b/hw/ide/macio.c index 62a599a075..b270a10163 100644 --- a/hw/ide/macio.c +++ b/hw/ide/macio.c @@ -329,7 +329,7 @@ static void pmac_ide_write(void *opaque, hwaddr addr, uint64_t val, case 0x8: case 0x16: if (size == 1) { -ide_cmd_write(&d->bus, 0, val); +ide_ctrl_write(&d->bus, 0, val); } break; case 0x20: diff --git a/hw/ide/mmio.c b/hw/ide/mmio.c index d233bd8c01..80b8a9eb09 100644 --- a/hw/ide/mmio.c +++ b/hw/ide/mmio.c @@ -95,16 +95,16 @@ static uint64_t mmio_ide_status_read(void *opaque, hwaddr addr, return ide_status_read(&s->bus, 0); } -static void mmio_ide_cmd_write(void *opaque, hwaddr addr, - uint64_t val, unsigned size) +static void mmio_ide_ctrl_write(void *opaque, hwaddr addr, +uint64_t val, unsigned size) { MMIOState *s = opaque; -ide_cmd_write(&s->bus, 0, val); +ide_ctrl_write(&s->bus, 0, val); } static const MemoryRegionOps mmio_ide_cs_ops = { .read = mmio_ide_status_read, -.write = mmio_ide_cmd_write, +.write = mmio_ide_ctrl_write, 
.endianness = DEVICE_LITTLE_ENDIAN, }; diff --git a/hw/ide/pci.c b/hw/ide/pci.c index 5e85c4ad17..59726ae453 100644 --- a/hw/ide/pci.c +++ b/hw/ide/pci.c @@ -38,7 +38,7 @@ (IDE_RETRY_DMA | IDE_RETRY_PIO | \ IDE_RETRY_READ | IDE_RETRY_FLUSH) -static uint64_t pci_ide_cmd_read(void *opaque, hwaddr addr, unsigned size) +static uint64_t pci_ide_
[PATCH 6/7] ide: clear interrupt on command write
Not known to fix any bug, but I couldn't help but notice that ATA specifies that writing to this register should clear an interrupt. ATA7: Section 5.3.3 (Command register - Effect) ATA6: Section 7.4.4 (Command register - Effect) ATA5: Section 7.4.4 (Command register - Effect) ATA4: Section 7.4.4 (Command register - Effect) ATA3: Section 5.2.2 (Command register) Other editions: try searching for the phrase "Writing this register". Signed-off-by: John Snow --- hw/ide/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/ide/core.c b/hw/ide/core.c index 5f4f004312..e4c69a7fde 100644 --- a/hw/ide/core.c +++ b/hw/ide/core.c @@ -1312,6 +1312,7 @@ void ide_ioport_write(void *opaque, uint32_t addr, uint32_t val) default: case ATA_IOPORT_WR_COMMAND: ide_clear_hob(bus); +qemu_irq_lower(bus->irq); ide_exec_cmd(bus, val); break; } -- 2.26.2
[PATCH 0/7] IDE: SRST and other fixes
The goal of this series is to fix the Software Reset (SRST) routine. That said, the first six patches are almost entirely unrelated... Patches 2, 3, and 6 fix extremely minor deviations from the spec I noticed while researching SRST. (One of them gets rid of a FIXME from 2003.) Patches 1, 4, and 5 are very small code cleanups that don't cause any functional changes that should make patches 2, 3, and 6 more obvious to review. Patch 7 fixes SRST; it depends on the other patches only for a changed constant name. With a small rebase, it could be suitable for 5.1. John Snow (7): ide: rename cmd_write to ctrl_write ide: don't tamper with the device register ide: model HOB correctly ide: reorder set/get sector functions ide: remove magic constants from the device register ide: clear interrupt on command write ide: cancel pending callbacks on SRST include/hw/ide/internal.h | 21 +-- hw/ide/core.c | 124 +++--- hw/ide/ioport.c | 2 +- hw/ide/macio.c| 2 +- hw/ide/mmio.c | 8 +-- hw/ide/pci.c | 12 ++-- hw/ide/trace-events | 2 +- 7 files changed, 106 insertions(+), 65 deletions(-) -- 2.26.2
[PATCH 2/7] ide: don't tamper with the device register
In real ISA operation, register writes go out to an entire bus channel and all listening devices receive the write. The devices do not toggle the DEV bit based on their own configuration, nor does the HBA intermediate or tamper with that value. The reality of the matter is that DEV0/DEV1 accordingly will react to command register writes based on whether or not the device was selected. This does not fix a known bug, but it makes the code slightly simpler and more obvious. Signed-off-by: John Snow --- hw/ide/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/ide/core.c b/hw/ide/core.c index b472220d65..5cedebc408 100644 --- a/hw/ide/core.c +++ b/hw/ide/core.c @@ -1297,8 +1297,8 @@ void ide_ioport_write(void *opaque, uint32_t addr, uint32_t val) break; case ATA_IOPORT_WR_DEVICE_HEAD: /* FIXME: HOB readback uses bit 7 */ -bus->ifs[0].select = (val & ~0x10) | 0xa0; -bus->ifs[1].select = (val | 0x10) | 0xa0; +bus->ifs[0].select = val | 0xa0; +bus->ifs[1].select = val | 0xa0; /* select drive */ bus->unit = (val >> 4) & 1; break; -- 2.26.2
[PATCH 3/7] ide: model HOB correctly
I have been staring at this FIXME for years and I never knew what it meant. I finally stumbled across it! When writing to the command registers, the old value is shifted into a HOB copy of the register and the new value is written into the primary register. When reading registers, the value retrieved is dependent on the HOB bit in the CONTROL register. By setting bit 7 (0x80) in CONTROL, any register read will, if it has one, yield the HOB value for that register instead. Our code has a problem: We were using bit 7 of the DEVICE register to model this. We use bus->cmd roughly as the control register already, as it stores the value from ide_ctrl_write. Lastly, all command register writes reset the HOB, so fix that, too. Signed-off-by: John Snow --- include/hw/ide/internal.h | 1 + hw/ide/core.c | 15 +++ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/hw/ide/internal.h b/include/hw/ide/internal.h index 10ea6e1e23..16d806e0cf 100644 --- a/include/hw/ide/internal.h +++ b/include/hw/ide/internal.h @@ -58,6 +58,7 @@ typedef struct IDEDMAOps IDEDMAOps; #define TAG_MASK 0xf8 /* Bits of Device Control register */ +#define IDE_CTRL_HOB0x80 #define IDE_CTRL_RESET 0x04 #define IDE_CTRL_DISABLE_IRQ0x02 diff --git a/hw/ide/core.c b/hw/ide/core.c index 5cedebc408..a880b91b47 100644 --- a/hw/ide/core.c +++ b/hw/ide/core.c @@ -1215,8 +1215,7 @@ static void ide_cmd_lba48_transform(IDEState *s, int lba48) static void ide_clear_hob(IDEBus *bus) { /* any write clears HOB high bit of device control register */ -bus->ifs[0].select &= ~(1 << 7); -bus->ifs[1].select &= ~(1 << 7); +bus->cmd &= ~(IDE_CTRL_HOB); } /* IOport [W]rite [R]egisters */ @@ -1256,12 +1255,14 @@ void ide_ioport_write(void *opaque, uint32_t addr, uint32_t val) return; } +/* NOTE: Device0 and Device1 both receive incoming register writes. + * (They're on the same bus! They have to!) 
*/ + switch (reg_num) { case 0: break; case ATA_IOPORT_WR_FEATURES: ide_clear_hob(bus); -/* NOTE: data is written to the two drives */ bus->ifs[0].hob_feature = bus->ifs[0].feature; bus->ifs[1].hob_feature = bus->ifs[1].feature; bus->ifs[0].feature = val; @@ -1296,7 +1297,7 @@ void ide_ioport_write(void *opaque, uint32_t addr, uint32_t val) bus->ifs[1].hcyl = val; break; case ATA_IOPORT_WR_DEVICE_HEAD: -/* FIXME: HOB readback uses bit 7 */ +ide_clear_hob(bus); bus->ifs[0].select = val | 0xa0; bus->ifs[1].select = val | 0xa0; /* select drive */ @@ -1304,7 +1305,7 @@ void ide_ioport_write(void *opaque, uint32_t addr, uint32_t val) break; default: case ATA_IOPORT_WR_COMMAND: -/* command */ +ide_clear_hob(bus); ide_exec_cmd(bus, val); break; } @@ -2142,9 +2143,7 @@ uint32_t ide_ioport_read(void *opaque, uint32_t addr) int ret, hob; reg_num = addr & 7; -/* FIXME: HOB readback uses bit 7, but it's always set right now */ -//hob = s->select & (1 << 7); -hob = 0; +hob = bus->cmd & (IDE_CTRL_HOB); switch (reg_num) { case ATA_IOPORT_RR_DATA: ret = 0xff; -- 2.26.2
Re: [PATCH v2 08/22] migration/block-dirty-bitmap: keep bitmap state for all bitmaps
24.07.2020 00:30, Eric Blake wrote: On 2/17/20 9:02 AM, Vladimir Sementsov-Ogievskiy wrote: Keep bitmap state for disabled bitmaps too. Keep the state until the end of the process. It's needed for the following commit to implement bitmap postcopy canceling. To clean-up the new list the following logic is used: We need two events to consider bitmap migration finished: 1. chunk with DIRTY_BITMAP_MIG_FLAG_COMPLETE flag should be received 2. dirty_bitmap_mig_before_vm_start should be called These two events may come in any order, so we understand which one is last, and on the last of them we remove bitmap migration state from the list. Signed-off-by: Vladimir Sementsov-Ogievskiy --- migration/block-dirty-bitmap.c | 64 +++--- 1 file changed, 43 insertions(+), 21 deletions(-) @@ -484,45 +488,59 @@ static int dirty_bitmap_load_start(QEMUFile *f, DBMLoadState *s) bdrv_disable_dirty_bitmap(s->bitmap); if (flags & DIRTY_BITMAP_MIG_START_FLAG_ENABLED) { - LoadBitmapState *b; - bdrv_dirty_bitmap_create_successor(s->bitmap, &local_err); if (local_err) { error_report_err(local_err); return -EINVAL; } - - b = g_new(LoadBitmapState, 1); - b->bs = s->bs; - b->bitmap = s->bitmap; - b->migrated = false; - s->enabled_bitmaps = g_slist_prepend(s->enabled_bitmaps, b); } + b = g_new(LoadBitmapState, 1); + b->bs = s->bs; + b->bitmap = s->bitmap; + b->migrated = false; + b->enabled = flags & DIRTY_BITMAP_MIG_START_FLAG_ENABLED, + + s->bitmaps = g_slist_prepend(s->bitmaps, b); Did you really mean to use a comma operator there, or should that be ';'? Of course, it should be ';':) -- Best regards, Vladimir
Re: [PATCH v2 16/22] qemu-iotests/199: change discard patterns
On 2/17/20 9:02 AM, Vladimir Sementsov-Ogievskiy wrote: iotest 40 works too long because of many discard opertion. On the same I'm assuming you meant s/40/199/ here, as well as the typo fixes pointed out by Andrey. time, postcopy period is very short, in spite of all these efforts. So, let's use less discards (and with more interesting patterns) to reduce test timing. In the next commit we'll increase postcopy period. Signed-off-by: Vladimir Sementsov-Ogievskiy --- tests/qemu-iotests/199 | 44 +- 1 file changed, 26 insertions(+), 18 deletions(-) -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
Re: [PATCH v2 17/22] qemu-iotests/199: increase postcopy period
On 2/17/20 9:02 AM, Vladimir Sementsov-Ogievskiy wrote: Test wants force bitmap postcopy. Still, resulting postcopy period is The test wants to force a bitmap postcopy. Still, the resulting postcopy period is very small. very small. Let's increase it by adding more bitmaps to migrate. Also, test disabled bitmaps migration. Signed-off-by: Vladimir Sementsov-Ogievskiy --- tests/qemu-iotests/199 | 58 -- 1 file changed, 39 insertions(+), 19 deletions(-) Patches 12-17: Tested-by: Eric Blake As they all work without any other patches in this series, and DO make a dramatic difference (cutting the test from over 70 seconds to just 7, on my machine), I'm inclined to stage them now, even while waiting for you to rebase the rest of the series. And 18 is already in the tree. -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
Re: [PATCH v2 12/22] qemu-iotests/199: fix style
On 2/17/20 9:02 AM, Vladimir Sementsov-Ogievskiy wrote: Mostly, satisfy pep8 complains. Signed-off-by: Vladimir Sementsov-Ogievskiy --- tests/qemu-iotests/199 | 13 +++-- 1 file changed, 7 insertions(+), 6 deletions(-) With none of your series applied, I get: $ ./check -qcow2 199 ... 199 not run[16:52:34] [16:52:34]not suitable for this cache mode: writeback Not run: 199 Passed all 0 iotests 199 fail [16:53:37] [16:53:37]output mismatch (see 199.out.bad) --- /home/eblake/qemu/tests/qemu-iotests/199.out 2020-07-23 16:48:56.275529368 -0500 +++ /home/eblake/qemu/build/tests/qemu-iotests/199.out.bad 2020-07-23 16:53:37.728416207 -0500 @@ -1,5 +1,13 @@ -. +E +== +ERROR: test_postcopy (__main__.TestDirtyBitmapPostcopyMigration) +-- +Traceback (most recent call last): + File "199", line 41, in setUp +os.mkfifo(fifo) +FileExistsError: [Errno 17] File exists + -- Ran 1 tests -OK +FAILED (errors=1) Failures: 199 Failed 1 of 1 iotests Ah, 'scratch/mig_fifo' was left over from some other aborted run of the test. I removed that file (which implies it might be nice if the test handled that automatically, instead of making me do it), and tried again; now I got the desired: 199 pass [17:00:34] [17:01:48] 74s Passed all 1 iotests After trying to rebase your series, I once again got failures, but that could mean I botched the rebase (since quite a few of the code patches earlier in the series were non-trivially changed). If you send a v3 (which would be really nice!), I'd hoist this and 13/22 first in the series, to get to a point where testing 199 works, to then make it easier to demonstrate what the rest of the 199 enhancements do in relation to the non-iotest patches. But I like that you separated the 199 improvements from the code - testing-wise, it's easy to apply the iotests patches first, make sure it fails, then apply the code patches, and make sure it passes, to prove that the enhanced test now covers what the code fixes did. 
-- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
Re: [PATCH v2 08/22] migration/block-dirty-bitmap: keep bitmap state for all bitmaps
On 2/17/20 9:02 AM, Vladimir Sementsov-Ogievskiy wrote: Keep bitmap state for disabled bitmaps too. Keep the state until the end of the process. It's needed for the following commit to implement bitmap postcopy canceling. To clean-up the new list the following logic is used: We need two events to consider bitmap migration finished: 1. chunk with DIRTY_BITMAP_MIG_FLAG_COMPLETE flag should be received 2. dirty_bitmap_mig_before_vm_start should be called These two events may come in any order, so we understand which one is last, and on the last of them we remove bitmap migration state from the list. Signed-off-by: Vladimir Sementsov-Ogievskiy --- migration/block-dirty-bitmap.c | 64 +++--- 1 file changed, 43 insertions(+), 21 deletions(-) @@ -484,45 +488,59 @@ static int dirty_bitmap_load_start(QEMUFile *f, DBMLoadState *s) bdrv_disable_dirty_bitmap(s->bitmap); if (flags & DIRTY_BITMAP_MIG_START_FLAG_ENABLED) { -LoadBitmapState *b; - bdrv_dirty_bitmap_create_successor(s->bitmap, &local_err); if (local_err) { error_report_err(local_err); return -EINVAL; } - -b = g_new(LoadBitmapState, 1); -b->bs = s->bs; -b->bitmap = s->bitmap; -b->migrated = false; -s->enabled_bitmaps = g_slist_prepend(s->enabled_bitmaps, b); } +b = g_new(LoadBitmapState, 1); +b->bs = s->bs; +b->bitmap = s->bitmap; +b->migrated = false; +b->enabled = flags & DIRTY_BITMAP_MIG_START_FLAG_ENABLED, + +s->bitmaps = g_slist_prepend(s->bitmaps, b); Did you really mean to use a comma operator there, or should that be ';'? -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
Re: [PATCH v2 10/22] migration/block-dirty-bitmap: cancel migration on shutdown
On 2/17/20 9:02 AM, Vladimir Sementsov-Ogievskiy wrote: If target is turned of prior to postcopy finished, target crashes s/of/off/ because busy bitmaps are found at shutdown. Canceling incoming migration helps, as it removes all unfinished (and therefore busy) bitmaps. Similarly on source we crash in bdrv_close_all which asserts that all bdrv states are removed, because bdrv states involved into dirty bitmap migration are referenced by it. So, we need to cancel outgoing migration as well. Signed-off-by: Vladimir Sementsov-Ogievskiy --- -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
Re: [PATCH v2 03/22] migration/block-dirty-bitmap: rename dirty_bitmap_mig_cleanup
On 2/19/20 8:20 AM, Vladimir Sementsov-Ogievskiy wrote: 18.02.2020 14:00, Andrey Shinkevich wrote: On 17/02/2020 18:02, Vladimir Sementsov-Ogievskiy wrote: Rename dirty_bitmap_mig_cleanup to dirty_bitmap_do_save_cleanup, to stress that it is on save part. Signed-off-by: Vladimir Sementsov-Ogievskiy --- migration/block-dirty-bitmap.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) At the next opportunity, I would suggest the name like "dirty_bitmap_do_clean_after_saving()" and similar for dirty_bitmap_save_cleanup() "dirty_bitmap_clean_after_saving()". I'd keep my naming, it corresponds to .save_cleanup handler name. I'm fine with that explanation, so no need to rename again. Reviewed-by: Eric Blake Reviewed-by: Andrey Shinkevich -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
Re: [PATCH v2 02/22] migration/block-dirty-bitmap: rename state structure types
On 2/17/20 9:02 AM, Vladimir Sementsov-Ogievskiy wrote: Rename types to be symmetrical for load/save part and shorter. Signed-off-by: Vladimir Sementsov-Ogievskiy --- migration/block-dirty-bitmap.c | 68 ++ 1 file changed, 36 insertions(+), 32 deletions(-) No longer applies to master, but the mechanical aspect of the change makes sense. If you rebase the series to make review easier, Reviewed-by: Eric Blake -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
Re: [PATCH v2 00/22] Fix error handling during bitmap postcopy
On 5/29/20 7:16 AM, Vladimir Sementsov-Ogievskiy wrote: 29.05.2020 14:58, Eric Blake wrote: On 4/2/20 2:42 AM, Vladimir Sementsov-Ogievskiy wrote: Ping! It's a fix, but not a degradation and I'm afraid too big for 5.0. Still, I think I should ping it anyway. John, I'm afraid, that this all is for your branch :) Just noticing this thread, now that we've shuffled bitmaps maintainers. Is there anything here that we still need to include in 5.1? Yes, we need the whole series. I'm starting to go through it now, to see what is still worth getting in to 5.1-rc2, but no promises as it is a long series and I don't want to introduce last-minute regressions (the fact that this missed 5.0 says that 5.1 will be no worse than 5.0 if we don't get this in until 5.2). -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
Re: [PATCH v4 2/4] block/nbd: define new max_write_zero_fast limit
On 6/11/20 11:26 AM, Vladimir Sementsov-Ogievskiy wrote: The NBD spec was recently updated to clarify that max_block doesn't relate to NBD_CMD_WRITE_ZEROES with NBD_CMD_FLAG_FAST_ZERO (which mirrors Qemu flag BDRV_REQ_NO_FALLBACK). bs->bl.max_write_zero_fast is zero by default which means using max_pwrite_zeroes. Update nbd driver to allow larger requests with BDRV_REQ_NO_FALLBACK. Signed-off-by: Vladimir Sementsov-Ogievskiy --- block/nbd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/nbd.c b/block/nbd.c index 4ac23c8f62..b0584cf68d 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -1956,6 +1956,7 @@ static void nbd_refresh_limits(BlockDriverState *bs, Error **errp) bs->bl.request_alignment = min; bs->bl.max_pdiscard = QEMU_ALIGN_DOWN(INT_MAX, min); +bs->bl.max_pwrite_zeroes_fast = bs->bl.max_pdiscard; bs->bl.max_pwrite_zeroes = max; Do we even need max_pwrite_zeroes_fast? Doesn't qemu behave correctly if we just blindly assign max_pdiscard and max_pwrite_zeroes to the same value near 2G? bs->bl.max_transfer = max; -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
Re: [PATCH v4 1/4] block: add max_pwrite_zeroes_fast to BlockLimits
On 6/11/20 11:26 AM, Vladimir Sementsov-Ogievskiy wrote: The NBD spec was recently updated to clarify that max_block doesn't relate to NBD_CMD_WRITE_ZEROES with NBD_CMD_FLAG_FAST_ZERO (which mirrors Qemu flag BDRV_REQ_NO_FALLBACK). To drop the restriction we need new max_pwrite_zeroes_fast. Default value of new max_pwrite_zeroes_fast is zero and it means use max_pwrite_zeroes. So this commit semantically changes nothing. Signed-off-by: Vladimir Sementsov-Ogievskiy --- include/block/block_int.h | 8 block/io.c| 17 - 2 files changed, 20 insertions(+), 5 deletions(-) Hmm, this is an optimization, rather than a correctness issue. I'm sorry I didn't review it sooner, but at this point, I think it is better as 5.2 material. diff --git a/include/block/block_int.h b/include/block/block_int.h index 791de6a59c..277e32fe31 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -626,6 +626,14 @@ typedef struct BlockLimits { * pwrite_zeroes_alignment. May be 0 if no inherent 32-bit limit */ int32_t max_pwrite_zeroes; +/* + * Maximum number of bytes that can zeroed at once if flag + * BDRV_REQ_NO_FALLBACK specified. Must be multiple of + * pwrite_zeroes_alignment. + * If 0, max_pwrite_zeroes is used for no-fallback case. + */ +int64_t max_pwrite_zeroes_fast; Nice that this is 64-bit off the bat (I know you have another series about converting more stuff to 64-bit). + /* Optimal alignment for write zeroes requests in bytes. A power * of 2 is best but not mandatory. Must be a multiple of * bl.request_alignment, and must be less than max_pwrite_zeroes diff --git a/block/io.c b/block/io.c index df8f2a98d4..0af62a53fd 100644 --- a/block/io.c +++ b/block/io.c @@ -1774,12 +1774,13 @@ static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, bool need_flush = false; int head = 0; int tail = 0; - -int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX); +int max_write_zeroes; 32-bit... 
int alignment = MAX(bs->bl.pwrite_zeroes_alignment, bs->bl.request_alignment); int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_BOUNCE_BUFFER); +assert(alignment % bs->bl.request_alignment == 0); Would this look any better using the QEMU_IS_ALIGNED macro? + if (!drv) { return -ENOMEDIUM; } @@ -1788,12 +1789,18 @@ static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, return -ENOTSUP; } -assert(alignment % bs->bl.request_alignment == 0); -head = offset % alignment; -tail = (offset + bytes) % alignment; +if ((flags & BDRV_REQ_NO_FALLBACK) && bs->bl.max_pwrite_zeroes_fast) { +max_write_zeroes = bs->bl.max_pwrite_zeroes_fast; ...but you try to assign something that may be 64-bit into it. Risk of overflow. Maybe we should get your 64-bit cleanup series in first. +} else { +max_write_zeroes = bs->bl.max_pwrite_zeroes; +} +max_write_zeroes = MIN_NON_ZERO(max_write_zeroes, INT_MAX); max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment); assert(max_write_zeroes >= bs->bl.request_alignment); +head = offset % alignment; +tail = (offset + bytes) % alignment; + while (bytes > 0 && !ret) { int num = bytes; -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
Re: [PATCH v11 00/11] iotests: Dump QCOW2 dirty bitmaps metadata
On 7/23/20 2:42 PM, Eric Blake wrote: On 7/17/20 3:14 AM, Andrey Shinkevich wrote: Add dirty bitmap information to QCOW2 metadata dump in the qcow2_format.py. block/qcow2.c | 2 +- docs/interop/qcow2.txt | 2 +- tests/qemu-iotests/qcow2.py | 18 ++- tests/qemu-iotests/qcow2_format.py | 221 ++--- 4 files changed, 220 insertions(+), 23 deletions(-) I still don't see any obvious coverage of the new output, which makes it harder to test (I have to manually run qcow2.py on a file rather than seeing what changes in a ???.out file). I know we said back in v9 that test 291 is not the right test, but that does not stop you from adding a new test just for that purpose. The bulk of this series is touching a non-installed utility. At this point, I feel safer deferring it to 5.2 (it is a feature addition for testsuite use only, and we missed soft freeze), even though it has no negative impact to installed binaries. -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
Re: [PATCH v11 01/11] qcow2: Fix capitalization of header extension constant.
On 7/17/20 3:14 AM, Andrey Shinkevich wrote: Make the capitalization of the hexadecimal numbers consistent for the QCOW2 header extension constants in docs/interop/qcow2.txt. Suggested-by: Eric Blake Signed-off-by: Andrey Shinkevich Reviewed-by: Vladimir Sementsov-Ogievskiy --- block/qcow2.c | 2 +- docs/interop/qcow2.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) Reviewed-by: Eric Blake This one is trivial; if I have a reason to do a bitmaps pull request for the next 5.1 -rc build, I'll include this too; if not, it doesn't hurt to wait for 5.2. -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
Re: [PATCH v11 00/11] iotests: Dump QCOW2 dirty bitmaps metadata
On 7/17/20 3:14 AM, Andrey Shinkevich wrote: Add dirty bitmap information to QCOW2 metadata dump in the qcow2_format.py. v10: 01: Fixing of issues in QCOW2 extension classes noted by Vladimir. 02: Reading bitmap tables was moved into Qcow2BitmapTable class. 03: Handling '-j' key was moved into "if __name__" section. 04: Making copy of __dict__ was replaced with the method to_dict(). 05: Qcow2HeaderExtensionsDoc is introduced in the separate patch. Andrey Shinkevich (11): qcow2: Fix capitalization of header extension constant. qcow2_format.py: make printable data an extension class member qcow2_format.py: change Qcow2BitmapExt initialization method qcow2_format.py: dump bitmap flags in human readable way. qcow2_format.py: Dump bitmap directory information qcow2_format.py: pass cluster size to substructures qcow2_format.py: Dump bitmap table serialized entries qcow2.py: Introduce '-j' key to dump in JSON format qcow2_format.py: collect fields to dump in JSON format qcow2_format.py: introduce Qcow2HeaderExtensionsDoc class qcow2_format.py: support dumping metadata in JSON format block/qcow2.c | 2 +- docs/interop/qcow2.txt | 2 +- tests/qemu-iotests/qcow2.py| 18 ++- tests/qemu-iotests/qcow2_format.py | 221 ++--- 4 files changed, 220 insertions(+), 23 deletions(-) I still don't see any obvious coverage of the new output, which makes it harder to test (I have to manually run qcow2.py on a file rather than seeing what changes in a ???.out file). I know we said back in v9 that test 291 is not the right test, but that does not stop you from adding a new test just for that purpose. -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
Re: [PATCH for-5.1? 0/4] non-blocking connect
On 7/20/20 1:07 PM, Vladimir Sementsov-Ogievskiy wrote: Hi! This fixes real problem (see 04). On the other hand it may be too much for 5.1, and it's not a degradation. So, up to you. Given the concerns raised on 3, I think I'll wait for v2 of the series, and defer it to 5.2. It's based on "[PATCH for-5.1? 0/3] Fix nbd reconnect dead-locks", or in other words Based-on: <20200720090024.18186-1-vsement...@virtuozzo.com> Vladimir Sementsov-Ogievskiy (4): qemu-sockets: refactor inet_connect_addr qemu-sockets: implement non-blocking connect interface io/channel-socket: implement non-blocking connect block/nbd: use non-blocking connect: fix vm hang on connect() include/io/channel-socket.h | 14 +++ include/qemu/sockets.h | 6 +++ block/nbd.c | 11 +++--- io/channel-socket.c | 74 util/qemu-sockets.c | 76 ++--- 5 files changed, 153 insertions(+), 28 deletions(-) -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
Re: [PATCH 3/3] block/nbd: nbd_co_reconnect_loop(): don't sleep if drained
On 7/20/20 4:00 AM, Vladimir Sementsov-Ogievskiy wrote: We try to go to wakeable sleep, so that, if drain begins it will break the sleep. But what if nbd_client_co_drain_begin() already called and s->drained is already true? We'll go to sleep, and drain will have to wait for the whole timeout. Let's improve it. Signed-off-by: Vladimir Sementsov-Ogievskiy --- block/nbd.c | 11 ++- 1 file changed, 6 insertions(+), 5 deletions(-) How frequently did you hit this case? At any rate, the optimization looks sane, and I'm happy to include it in 5.1. Reviewed-by: Eric Blake -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
Re: [PATCH 2/3] block/nbd: on shutdown terminate connection attempt
On 7/20/20 4:00 AM, Vladimir Sementsov-Ogievskiy wrote: On shutdown nbd driver may be in a connecting state. We should shutdown it as well, otherwise we may hang in nbd_teardown_connection, waiting for connection_co to finish in BDRV_POLL_WHILE(bs, s->connection_co) loop if remote server is down. How to reproduce the dead lock: Same reproducer as in the previous patch (again, where a temporary sleep or well-placed gdb breakpoint may be more reliable than running two process loops). Signed-off-by: Vladimir Sementsov-Ogievskiy --- block/nbd.c | 14 ++ 1 file changed, 10 insertions(+), 4 deletions(-) Reviewed-by: Eric Blake -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
Re: [PATCH 1/3] block/nbd: allow drain during reconnect attempt
On 7/20/20 4:00 AM, Vladimir Sementsov-Ogievskiy wrote: It should be safe to reenter qio_channel_yield() on io/channel read/write path, so it's safe to reduce in_flight and allow attaching new aio context. And no problem to allow drain itself: connection attempt is not a guest request. Moreover, if remote server is down, we can hang in negotiation, blocking drain section and provoking a dead lock. How to reproduce the dead lock: I tried to reproduce this; but in the several minutes it has taken me to write this email, it still has not hung. Still, your stack trace is fairly good evidence of the problem, where adding a temporary sleep or running it under gdb with a breakpoint can probably make reproduction easier. 1. Create nbd-fault-injector.conf with the following contents: [inject-error "mega1"] event=data io=readwrite when=before 2. In one terminal run nbd-fault-injector in a loop, like this: n=1; while true; do echo $n; ((n++)); Bashism, but not a problem for the commit message. ./nbd-fault-injector.py 127.0.0.1:1 nbd-fault-injector.conf; done 3. In another terminal run qemu-io in a loop, like this: n=1; while true; do echo $n; ((n++)); ./qemu-io -c 'read 0 512' nbd+tcp://127.0.0.1:1; I prefer the spelling nbd:// for TCP connections, but also inconsequential. Note, that the hang may be triggered by another bug, so the whole case is fixed only together with commit "block/nbd: on shutdown terminate connection attempt". 
Signed-off-by: Vladimir Sementsov-Ogievskiy --- block/nbd.c | 11 +++ 1 file changed, 11 insertions(+) diff --git a/block/nbd.c b/block/nbd.c index 65a4f56924..49254f1c3c 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -280,7 +280,18 @@ static coroutine_fn void nbd_reconnect_attempt(BDRVNBDState *s) s->ioc = NULL; } +bdrv_dec_in_flight(s->bs); s->connect_status = nbd_client_connect(s->bs, &local_err); +s->wait_drained_end = true; +while (s->drained) { +/* + * We may be entered once from nbd_client_attach_aio_context_bh + * and then from nbd_client_co_drain_end. So here is a loop. + */ +qemu_coroutine_yield(); +} +bdrv_inc_in_flight(s->bs); + This is very similar to the code in nbd_co_reconnect_loop. Does that function still need to wait on drained, since it calls nbd_reconnect_attempt which is now doing the same loop? But off-hand, I'm not seeing a problem with keeping both places. Reviewed-by: Eric Blake As a bug fix, I'll be including this in my NBD pull request for the next -rc build. -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
Re: [PATCH] block/amend: Check whether the node exists
On Fri, 10 Jul 2020 at 10:51, Max Reitz wrote: > > We should check whether the user-specified node-name actually refers to > a node. The simplest way to do that is to use bdrv_lookup_bs() instead > of bdrv_find_node() (the former wraps the latter, and produces an error > message if necessary). > > Reported-by: Coverity (CID 1430268) > Fixes: ced914d0ab9fb2c900f873f6349a0b8eecd1fdbe > Signed-off-by: Max Reitz Hi; has this patch got lost? (I'm just running through the Coverity issues marked as fix-submitted to check the patches made it into master, and it looks like this one hasn't yet.) thanks -- PMM
Re: [PATCH v7 36/47] nbd: Use CAF when looking for dirty bitmap
On 25.06.2020 18:22, Max Reitz wrote: When looking for a dirty bitmap to share, we should handle filters by just including them in the search (so they do not break backing chains). Signed-off-by: Max Reitz --- nbd/server.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nbd/server.c b/nbd/server.c index 20754e9ebc..b504a79435 100644 --- a/nbd/server.c +++ b/nbd/server.c @@ -1561,13 +1561,13 @@ NBDExport *nbd_export_new(BlockDriverState *bs, uint64_t dev_offset, if (bitmap) { BdrvDirtyBitmap *bm = NULL; -while (true) { +while (bs) { bm = bdrv_find_dirty_bitmap(bs, bitmap); -if (bm != NULL || bs->backing == NULL) { +if (bm != NULL) { break; } -bs = bs->backing->bs; +bs = bdrv_filter_or_cow_bs(bs); } if (bm == NULL) { Reviewed-by: Andrey Shinkevich
Re: [PATCH v7 35/47] commit: Deal with filters
On 25.06.2020 18:22, Max Reitz wrote: This includes some permission limiting (for example, we only need to take the RESIZE permission if the base is smaller than the top). Signed-off-by: Max Reitz --- block/block-backend.c | 9 +++- block/commit.c | 96 +- block/monitor/block-hmp-cmds.c | 2 +- blockdev.c | 4 +- 4 files changed, 81 insertions(+), 30 deletions(-) ... +/* + * Block all nodes between top and base, because they will + * disappear from the chain after this operation. + * Note that this assumes that the user is fine with removing all + * nodes (including R/W filters) between top and base. Assuring + * this is the responsibility of the interface (i.e. whoever calls + * commit_start()). + */ +s->base_overlay = bdrv_find_overlay(top, base); +assert(s->base_overlay); + +/* + * The topmost node with + * bdrv_skip_filters(filtered_base) == bdrv_skip_filters(base) + */ +filtered_base = bdrv_cow_bs(s->base_overlay); +assert(bdrv_skip_filters(filtered_base) == bdrv_skip_filters(base)); + +/* + * XXX BLK_PERM_WRITE needs to be allowed so we don't block ourselves + * at s->base (if writes are blocked for a node, they are also blocked + * for its backing file). The other options would be a second filter + * driver above s->base. + */ +iter_shared_perms = BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE; + +for (iter = top; iter != base; iter = bdrv_filter_or_cow_bs(iter)) { +if (iter == filtered_base) { The question same to mirroring: in case of multiple filters, one above another, the permission is extended for the filtered_base only. Andrey +/* + * From here on, all nodes are filters on the base. This + * allows us to share BLK_PERM_CONSISTENT_READ. + */ +iter_shared_perms |= BLK_PERM_CONSISTENT_READ; +} + ret = block_job_add_bdrv(&s->common, "intermediate node", iter, 0, - BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE, - errp); + iter_shared_perms, errp); if (ret < 0) { goto fail; } ... Reviewed-by: Andrey Shinkevich
[PATCH v5 2/3] nvme: indicate CMB support through controller capabilities register
This patch sets CMBS bit in controller capabilities register when user configures NVMe driver with CMB support, so capabilites are correctly reported to guest OS. Signed-off-by: Andrzej Jakowski Reviewed-by: Klaus Jensen Reviewed-by: Maxim Levitsky --- hw/block/nvme.c | 1 + include/block/nvme.h | 10 +++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 841c18920c..43866b744f 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -2198,6 +2198,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) NVME_CAP_SET_TO(n->bar.cap, 0xf); NVME_CAP_SET_CSS(n->bar.cap, 1); NVME_CAP_SET_MPSMAX(n->bar.cap, 4); +NVME_CAP_SET_CMBS(n->bar.cap, n->params.cmb_size_mb ? 1 : 0); n->bar.vs = NVME_SPEC_VER; n->bar.intmc = n->bar.intms = 0; diff --git a/include/block/nvme.h b/include/block/nvme.h index 370df7fc05..d641ca6649 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -36,6 +36,7 @@ enum NvmeCapShift { CAP_MPSMIN_SHIFT = 48, CAP_MPSMAX_SHIFT = 52, CAP_PMR_SHIFT = 56, +CAP_CMB_SHIFT = 57, }; enum NvmeCapMask { @@ -49,6 +50,7 @@ enum NvmeCapMask { CAP_MPSMIN_MASK= 0xf, CAP_MPSMAX_MASK= 0xf, CAP_PMR_MASK = 0x1, +CAP_CMB_MASK = 0x1, }; #define NVME_CAP_MQES(cap) (((cap) >> CAP_MQES_SHIFT) & CAP_MQES_MASK) @@ -78,9 +80,11 @@ enum NvmeCapMask { #define NVME_CAP_SET_MPSMIN(cap, val) (cap |= (uint64_t)(val & CAP_MPSMIN_MASK)\ << CAP_MPSMIN_SHIFT) #define NVME_CAP_SET_MPSMAX(cap, val) (cap |= (uint64_t)(val & CAP_MPSMAX_MASK)\ -<< CAP_MPSMAX_SHIFT) -#define NVME_CAP_SET_PMRS(cap, val) (cap |= (uint64_t)(val & CAP_PMR_MASK)\ -<< CAP_PMR_SHIFT) + << CAP_MPSMAX_SHIFT) +#define NVME_CAP_SET_PMRS(cap, val) (cap |= (uint64_t)(val & CAP_PMR_MASK) \ + << CAP_PMR_SHIFT) +#define NVME_CAP_SET_CMBS(cap, val) (cap |= (uint64_t)(val & CAP_CMB_MASK) \ + << CAP_CMB_SHIFT) enum NvmeCcShift { CC_EN_SHIFT = 0, -- 2.21.1
[PATCH v5 3/3] nvme: allow cmb and pmr to be enabled on same device
So far it was not possible to have CMB and PMR emulated on the same device, because BAR2 was used exclusively either of PMR or CMB. This patch places CMB at BAR4 offset so it not conflicts with MSI-X vectors. Signed-off-by: Andrzej Jakowski --- hw/block/nvme.c | 120 +-- hw/block/nvme.h | 1 + include/block/nvme.h | 4 +- 3 files changed, 85 insertions(+), 40 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 43866b744f..d55a71a346 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -22,12 +22,13 @@ * [pmrdev=,] \ * max_ioqpairs= * - * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at - * offset 0 in BAR2 and supports only WDS, RDS and SQS for now. + * Note cmb_size_mb denotes size of CMB in MB. CMB when configured is assumed + * to be resident in BAR4 at offset that is 2MiB aligned. When CMB is emulated + * on Linux guest it is recommended to make cmb_size_mb multiple of 2. Both + * size and alignment restrictions are imposed by Linux guest. * - * cmb_size_mb= and pmrdev= options are mutually exclusive due to limitation - * in available BAR's. cmb_size_mb= will take precedence over pmrdev= when - * both provided. + * pmrdev is assumed to be resident in BAR2/BAR3. When configured it consumes + * whole BAR2/BAR3 exclusively. * Enabling pmr emulation can be achieved by pointing to memory-backend-file. 
* For example: * -object memory-backend-file,id=,share=on,mem-path=, \ @@ -57,8 +58,8 @@ #define NVME_MAX_IOQPAIRS 0x #define NVME_DB_SIZE 4 #define NVME_SPEC_VER 0x00010300 -#define NVME_CMB_BIR 2 #define NVME_PMR_BIR 2 +#define NVME_MSIX_BIR 4 #define NVME_TEMPERATURE 0x143 #define NVME_TEMPERATURE_WARNING 0x157 #define NVME_TEMPERATURE_CRITICAL 0x175 @@ -111,16 +112,18 @@ static uint16_t nvme_sqid(NvmeRequest *req) static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr) { -hwaddr low = n->ctrl_mem.addr; -hwaddr hi = n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size); +hwaddr low = memory_region_to_absolute_addr(&n->ctrl_mem, 0); +hwaddr hi = low + int128_get64(n->ctrl_mem.size); return addr >= low && addr < hi; } static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size) { +hwaddr cmb_addr = memory_region_to_absolute_addr(&n->ctrl_mem, 0); + if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr)) { -memcpy(buf, (void *)&n->cmbuf[addr - n->ctrl_mem.addr], size); +memcpy(buf, (void *)&n->cmbuf[addr - cmb_addr], size); return; } @@ -207,17 +210,18 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1, uint64_t prp2, uint32_t len, NvmeCtrl *n) { hwaddr trans_len = n->page_size - (prp1 % n->page_size); +hwaddr cmb_addr = memory_region_to_absolute_addr(&n->ctrl_mem, 0); trans_len = MIN(len, trans_len); int num_prps = (len >> n->page_bits) + 1; if (unlikely(!prp1)) { trace_pci_nvme_err_invalid_prp(); return NVME_INVALID_FIELD | NVME_DNR; -} else if (n->bar.cmbsz && prp1 >= n->ctrl_mem.addr && - prp1 < n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size)) { +} else if (n->bar.cmbsz && prp1 >= cmb_addr && + prp1 < cmb_addr + int128_get64(n->ctrl_mem.size)) { qsg->nsg = 0; qemu_iovec_init(iov, num_prps); -qemu_iovec_add(iov, (void *)&n->cmbuf[prp1 - n->ctrl_mem.addr], trans_len); +qemu_iovec_add(iov, (void *)&n->cmbuf[prp1 - cmb_addr], trans_len); } else { pci_dma_sglist_init(qsg, &n->parent_obj, num_prps); qemu_sglist_add(qsg, prp1, 
trans_len); @@ -262,7 +266,7 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1, if (qsg->nsg){ qemu_sglist_add(qsg, prp_ent, trans_len); } else { -qemu_iovec_add(iov, (void *)&n->cmbuf[prp_ent - n->ctrl_mem.addr], trans_len); +qemu_iovec_add(iov, (void *)&n->cmbuf[prp_ent - cmb_addr], trans_len); } len -= trans_len; i++; @@ -275,7 +279,7 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1, if (qsg->nsg) { qemu_sglist_add(qsg, prp2, len); } else { -qemu_iovec_add(iov, (void *)&n->cmbuf[prp2 - n->ctrl_mem.addr], trans_len); +qemu_iovec_add(iov, (void *)&n->cmbuf[prp2 - cmb_addr], trans_len); } } } @@ -1980,7 +1984,7 @@ static void nvme_check_constraints(NvmeCtrl *n, Error **errp) return; } -if (!n->params.cmb_size_mb && n->pmrdev) { +if (n->pmrdev) { if (host_memory_backend_is_mapped(n->pmrdev)) { char *path = object_get_canonical_path_component(OBJECT(n->pmrdev)); error_setg(errp, "can't use already busy memd
[PATCH v5] nvme: allow cmb and pmr emulation on same device
Resending series recently posted on mailing list related to nvme device extension with couple of fixes after review. This patch series does following: - Exports memory_region_to_absolute_addr() function so it is available for use by drivers. This function is needed by 3rd patch in this series - Fixes problem where CMBS bit was not set in controller capabilities register, so support for CMB was not correctly advertised to guest. This is resend of patch that has been submitted and reviewed by Klaus [1] - Introduces BAR4 sharing between MSI-X vectors and CMB. This allows to have CMB and PMR emulated on the same device. This extension was indicated by Keith [2] v5: - fixed problem introduced in v4 where CMB buffer was represented as subregion of BAR4 memory region. In that case CMB address was used incorrectly as it was relative to BAR4 and not absolute. Appropriate changes were added to v5 to calculate CMB address properly ([6]) v4: - modified BAR4 initialization, so now it consists of CMB, MSIX and PBA memory regions overlapping on top of it. 
This reduces patch complexity significantly (Klaus [5]) v3: - code style fixes including: removal of spurious line break, moving define into define section and code alignment (Klaus [4]) - removed unintended code reintroduction (Klaus [4]) v2: - rebase on Kevin's latest block branch (Klaus [3]) - improved comments section (Klaus [3]) - simplified calculation of BAR4 size (Klaus [3]) v1: - initial push of the patch [1]: https://lore.kernel.org/qemu-devel/20200408055607.g2ii7gwqbnv6cd3w@apples.localdomain/ [2]: https://lore.kernel.org/qemu-devel/20200330165518.ga8...@redsun51.ssa.fujisawa.hgst.com/ [3]: https://lore.kernel.org/qemu-devel/20200605181043.28782-1-andrzej.jakow...@linux.intel.com/ [4]: https://lore.kernel.org/qemu-devel/20200618092524.posxi5mysb3jjtpn@apples.localdomain/ [5]: https://lore.kernel.org/qemu-devel/20200626055033.6vxqvi4s5pll7som@apples.localdomain/ [6]: https://lore.kernel.org/qemu-devel/9143a543-d32d-f3e7-c37b-b3df7f853...@linux.intel.com/
[PATCH v5 1/3] memory: export memory_region_to_absolute_addr() function
This change exports memory_region_to_absolute_addr() function so it can be used by drivers requiring to calculate absolute address for memory subregions when memory hierarchy is used. Signed-off-by: Andrzej Jakowski --- include/exec/memory.h | 9 + softmmu/memory.c | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/include/exec/memory.h b/include/exec/memory.h index 307e527835..6e5bba602e 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -2017,6 +2017,15 @@ MemTxResult memory_region_dispatch_write(MemoryRegion *mr, MemOp op, MemTxAttrs attrs); +/** + * memory_region_to_absolute_addr: walk through memory hierarchy to retrieve + * absolute address for given MemoryRegion. + * + * @mr: #MemoryRegion to scan through + * @offset: starting offset within mr + */ +hwaddr memory_region_to_absolute_addr(MemoryRegion *mr, hwaddr offset); + /** * address_space_init: initializes an address space * diff --git a/softmmu/memory.c b/softmmu/memory.c index 9200b20130..deff3739ff 100644 --- a/softmmu/memory.c +++ b/softmmu/memory.c @@ -399,7 +399,7 @@ static inline uint64_t memory_region_shift_write_access(uint64_t *value, return tmp; } -static hwaddr memory_region_to_absolute_addr(MemoryRegion *mr, hwaddr offset) +hwaddr memory_region_to_absolute_addr(MemoryRegion *mr, hwaddr offset) { MemoryRegion *root; hwaddr abs_addr = offset; -- 2.21.1
Re: [PATCH v7 34/47] backup: Deal with filters
On 25.06.2020 18:22, Max Reitz wrote: Signed-off-by: Max Reitz --- block/backup-top.c | 2 +- block/backup.c | 9 + blockdev.c | 19 +++ 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/block/backup.c b/block/backup.c index 4f13bb20a5..9afa0bf3b4 100644 --- a/block/backup.c +++ b/block/backup.c @@ -297,6 +297,7 @@ static int64_t backup_calculate_cluster_size(BlockDriverState *target, { int ret; BlockDriverInfo bdi; +bool target_does_cow = bdrv_backing_chain_next(target); Wouldn't it better to make the explicit type conversion or use "!!" approach? Andrey /* * If there is no backing file on the target, we cannot rely on COW if our @@ -304,7 +305,7 @@ static int64_t backup_calculate_cluster_size(BlockDriverState *target, * targets with a backing file, try to avoid COW if possible. */ ret = bdrv_get_info(target, &bdi); -if (ret == -ENOTSUP && !target->backing) { +if (ret == -ENOTSUP && !target_does_cow) { /* Cluster size is not defined */ ... Reviewed-by: Andrey Shinkevich
Re: [ovirt-users] very very bad iscsi performance
Getting meaningful results is more important than getting good results. If the benchmark is not meaningful, it is not useful towards fixing the issue. Did you try virtio-blk with direct LUN? Paolo Il gio 23 lug 2020, 16:35 Philip Brown ha scritto: > Im in the middle of a priority issue right now, so cant take time out to > rerun the bench, but... > Usually in that kind of situation, if you dont turn on sync-to-disk on > every write, you get benchmarks that are artificially HIGH. > Forcing O_DIRECT slows throughput down. > Dont you think the results are bad enough already? :-} > > - Original Message - > From: "Stefan Hajnoczi" > To: "Philip Brown" > Cc: "Nir Soffer" , "users" , > "qemu-block" , "Paolo Bonzini" , > "Sergio Lopez Pascual" , "Mordechai Lehrer" < > mleh...@redhat.com>, "Kevin Wolf" > Sent: Thursday, July 23, 2020 6:09:39 AM > Subject: Re: [BULK] Re: [ovirt-users] very very bad iscsi performance > > > Hi, > At first glance it appears that the filebench OLTP workload does not use > O_DIRECT, so this isn't a measurement of pure disk I/O performance: > https://github.com/filebench/filebench/blob/master/workloads/oltp.f > > If you suspect that disk performance is the issue please run a benchmark > that bypasses the page cache using O_DIRECT. > > The fio setting is direct=1. > > Here is an example fio job for 70% read/30% write 4KB random I/O: > > [global] > filename=/path/to/device > runtime=120 > ioengine=libaio > direct=1 > ramp_time=10# start measuring after warm-up time > > [read] > readwrite=randrw > rwmixread=70 > rwmixwrite=30 > iodepth=64 > blocksize=4k > > (Based on > https://blog.vmsplice.net/2017/11/common-disk-benchmarking-mistakes.html) > > Stefan > >
Re: [ovirt-users] very very bad iscsi performance
Im in the middle of a priority issue right now, so cant take time out to rerun the bench, but... Usually in that kind of situation, if you dont turn on sync-to-disk on every write, you get benchmarks that are artificially HIGH. Forcing O_DIRECT slows throughput down. Dont you think the results are bad enough already? :-} - Original Message - From: "Stefan Hajnoczi" To: "Philip Brown" Cc: "Nir Soffer" , "users" , "qemu-block" , "Paolo Bonzini" , "Sergio Lopez Pascual" , "Mordechai Lehrer" , "Kevin Wolf" Sent: Thursday, July 23, 2020 6:09:39 AM Subject: Re: [BULK] Re: [ovirt-users] very very bad iscsi performance Hi, At first glance it appears that the filebench OLTP workload does not use O_DIRECT, so this isn't a measurement of pure disk I/O performance: https://github.com/filebench/filebench/blob/master/workloads/oltp.f If you suspect that disk performance is the issue please run a benchmark that bypasses the page cache using O_DIRECT. The fio setting is direct=1. Here is an example fio job for 70% read/30% write 4KB random I/O: [global] filename=/path/to/device runtime=120 ioengine=libaio direct=1 ramp_time=10# start measuring after warm-up time [read] readwrite=randrw rwmixread=70 rwmixwrite=30 iodepth=64 blocksize=4k (Based on https://blog.vmsplice.net/2017/11/common-disk-benchmarking-mistakes.html) Stefan
Re: [PATCH v2 20/20] simplebench: add bench-backup.py
On 01.06.20 20:11, Vladimir Sementsov-Ogievskiy wrote: > Add script to benchmark new backup architecture. > > Signed-off-by: Vladimir Sementsov-Ogievskiy > --- > scripts/simplebench/bench-backup.py | 132 > 1 file changed, 132 insertions(+) > create mode 100755 scripts/simplebench/bench-backup.py Looks reasonable. Not sure whether I can give an R-b, given my non-existing background on the simplebench scripts, and also that it’s probably not really necessary. The only thing that sprung to my eye is that you give x-use-copy-range even when the “new” label isn’t given, which may be a problem. (Because AFAIU, the “new” label means that qemu is sufficiently new to support x-max-workers; and that’s basically equivalent to supporting x-use-copy-range.) Max signature.asc Description: OpenPGP digital signature
Re: [PATCH v2 19/20] simplebench: bench_block_job: add cmd_options argument
On 01.06.20 20:11, Vladimir Sementsov-Ogievskiy wrote: > Add argument to allow additional block-job options. > > Signed-off-by: Vladimir Sementsov-Ogievskiy > --- > scripts/simplebench/bench-example.py | 2 +- > scripts/simplebench/bench_block_job.py | 13 - > 2 files changed, 9 insertions(+), 6 deletions(-) [...] > diff --git a/scripts/simplebench/bench_block_job.py > b/scripts/simplebench/bench_block_job.py > index 9808d696cf..7332845c1c 100755 > --- a/scripts/simplebench/bench_block_job.py > +++ b/scripts/simplebench/bench_block_job.py > @@ -1,4 +1,4 @@ > -#!/usr/bin/env python > +#!/usr/bin/env python3 Looks a bit unrelated. Apart from that: Reviewed-by: Max Reitz signature.asc Description: OpenPGP digital signature
Re: [PATCH v2 18/20] block/block-copy: drop unused argument of block_copy()
On 01.06.20 20:11, Vladimir Sementsov-Ogievskiy wrote: > Signed-off-by: Vladimir Sementsov-Ogievskiy > --- > include/block/block-copy.h | 3 +-- > block/backup-top.c | 2 +- > block/block-copy.c | 11 ++- > 3 files changed, 4 insertions(+), 12 deletions(-) Reviewed-by: Max Reitz signature.asc Description: OpenPGP digital signature
Re: [BULK] Re: [ovirt-users] very very bad iscsi performance
On Tue, Jul 21, 2020 at 07:14:53AM -0700, Philip Brown wrote: > Thank you for the analysis. I have some further comments: > > First off, filebench pre-writes the files before doing oltp benchmarks, so I > dont think the thin provisioning is at play here. > I will double check this, but if you dont hear otherwise, please presume that > is the case :) > > Secondly, I am surprised at your recommendation to use virtio instead of > virtio-scsi. since the writeup for virtio-scsi claims it has equivalent > performance in general, and adds better scaling > https://www.ovirt.org/develop/release-management/features/storage/virtio-scsi.html > > As far as your suggestion for using multiple disks for scaling higher: > We are using an SSD. Isnt the whole advantage of using SSD drives, that you > can get the IOP/s performance of 10 drives, out of a single drive? > We certainly get that using it natively, outside of a VM. > SO it would be nice to see performance approaching that within an ovirt VM. Hi, At first glance it appears that the filebench OLTP workload does not use O_DIRECT, so this isn't a measurement of pure disk I/O performance: https://github.com/filebench/filebench/blob/master/workloads/oltp.f If you suspect that disk performance is the issue please run a benchmark that bypasses the page cache using O_DIRECT. The fio setting is direct=1. Here is an example fio job for 70% read/30% write 4KB random I/O: [global] filename=/path/to/device runtime=120 ioengine=libaio direct=1 ramp_time=10# start measuring after warm-up time [read] readwrite=randrw rwmixread=70 rwmixwrite=30 iodepth=64 blocksize=4k (Based on https://blog.vmsplice.net/2017/11/common-disk-benchmarking-mistakes.html) Stefan signature.asc Description: PGP signature
Re: [PATCH for-5.1] nbd: Fix large trim/zero requests
23.07.2020 14:47, Eric Blake wrote: On 7/23/20 2:23 AM, Vladimir Sementsov-Ogievskiy wrote: 23.07.2020 00:22, Eric Blake wrote: Although qemu as NBD client limits requests to <2G, the NBD protocol allows clients to send requests almost all the way up to 4G. But because our block layer is not yet 64-bit clean, we accidentally wrap such requests into a negative size, and fail with EIO instead of performing the intended operation. @@ -2378,8 +2378,17 @@ static coroutine_fn int nbd_handle_request(NBDClient *client, if (request->flags & NBD_CMD_FLAG_FAST_ZERO) { flags |= BDRV_REQ_NO_FALLBACK; } - ret = blk_pwrite_zeroes(exp->blk, request->from + exp->dev_offset, - request->len, flags); + ret = 0; + /* FIXME simplify this when blk_pwrite_zeroes switches to 64-bit */ + while (ret == 0 && request->len) { + int align = client->check_align ?: 1; + int len = MIN(request->len, QEMU_ALIGN_DOWN(BDRV_REQUEST_MAX_BYTES, + align)); + ret = blk_pwrite_zeroes(exp->blk, request->from + exp->dev_offset, + len, flags); + request->len -= len; + request->from += len; + } return nbd_send_generic_reply(client, request->handle, ret, "writing to file failed", errp); It's easy enough to audit that blk_pwrite_zeroes returns 0 (and not arbitrary positive) on success. Yes, that's why I've posted my comment to blk_co_pdiscard :) @@ -2393,8 +2402,17 @@ static coroutine_fn int nbd_handle_request(NBDClient *client, "flush failed", errp); case NBD_CMD_TRIM: - ret = blk_co_pdiscard(exp->blk, request->from + exp->dev_offset, - request->len); + ret = 0; + /* FIXME simplify this when blk_co_pdiscard switches to 64-bit */ + while (ret == 0 && request->len) { Did you check all the paths not to return positive ret on success? I'd prefer to compare ret >= 0 for this temporary code to not care of it And for blk_co_pdiscard, the audit is already right here in the patch: pre-patch, ret = blk_co_pdiscard, followed by... 
+ int align = client->check_align ?: 1; + int len = MIN(request->len, QEMU_ALIGN_DOWN(BDRV_REQUEST_MAX_BYTES, + align)); + ret = blk_co_pdiscard(exp->blk, request->from + exp->dev_offset, + len); + request->len -= len; + request->from += len; Hmm.. Modifying the function parameter. Safe now, as nbd_handle_request() call is the last usage of request in nbd_trip(). + } if (ret == 0 && request->flags & NBD_CMD_FLAG_FUA) { ...a check for ret == 0. Hmm. Is it a bug or not? Anyway, bdrv_co_pdiscard() does "if (ret && .." as well, so if some driver return positive ret, it may fail earlier.. ret = blk_co_flush(exp->blk); } Yes, I can respin the patch to use >= 0 as the check for success in both functions, but it doesn't change the behavior. OK. Anyway: Reviewed-by: Vladimir Sementsov-Ogievskiy -- Best regards, Vladimir
Re: [PATCH 13/13] ssh: add GUri-based URI parsing
On Thu, Jul 23, 2020 at 04:58:48PM +0400, Marc-André Lureau wrote: > Hi > > On Thu, Jul 23, 2020 at 4:33 PM Richard W.M. Jones > wrote: > > > On Thu, Jul 09, 2020 at 11:42:34PM +0400, Marc-André Lureau wrote: > > > Signed-off-by: Marc-André Lureau > > > --- > > > block/ssh.c | 75 + > > > 1 file changed, 58 insertions(+), 17 deletions(-) > > > > > > diff --git a/block/ssh.c b/block/ssh.c > > > index c8f6ad79e3c..d2bc6277613 100644 > > > --- a/block/ssh.c > > > +++ b/block/ssh.c > > > @@ -180,9 +180,37 @@ static void sftp_error_trace(BDRVSSHState *s, const > > char *op) > > > > > > static int parse_uri(const char *filename, QDict *options, Error **errp) > > > { > > > +g_autofree char *port_str = NULL; > > > +const char *scheme, *server, *path, *user, *key, *value; > > > +gint port; > > > + > > > +#ifdef HAVE_GLIB_GURI > > > +g_autoptr(GUri) uri = NULL; > > > +g_autoptr(GHashTable) params = NULL; > > > +g_autoptr(GError) err = NULL; > > > +GHashTableIter iter; > > > + > > > +uri = g_uri_parse(filename, G_URI_FLAGS_ENCODED_QUERY, &err); > > > +if (!uri) { > > > +error_setg(errp, "Failed to parse SSH URI: %s", err->message); > > > +return -EINVAL; > > > +} > > > + > > > +params = g_uri_parse_params(g_uri_get_query(uri), -1, > > > +"&;", G_URI_PARAMS_NONE, &err); > > > +if (err) { > > > +error_report("Failed to parse SSH URI query: %s", err->message); > > > +return -EINVAL; > > > +} > > > + > > > +scheme = g_uri_get_scheme(uri); > > > +user = g_uri_get_user(uri); > > > +server = g_uri_get_host(uri); > > > +path = g_uri_get_path(uri); > > > +port = g_uri_get_port(uri); > > > +#else > > > g_autoptr(URI) uri = NULL; > > > g_autoptr(QueryParams) qp = NULL; > > > -g_autofree char *port_str = NULL; > > > int i; > > > > As Dan said already, this conditional code looks horrible and is going > > to be a maintenance burden. Was there a later version of this patch > > series that resolved this? I don't think I saw one. > > > > The patch is quite experimental. 
glib didn't even yet receive a release > with GUri! But since I am working on the glib side, I wanted to make sure > it covers qemu needs. > > I will revisit the series once GUri is frozen & released (in > mid-september),and use a copy version fallback. > > Although, as I said in the cover, this is a bit riskier than having a > transition period with both the old libxml-based parser and glib-based one > for very recent distro. The risk here is that the GUri has different semantics/behaviour to the libxml one. If that risk is large, then we shouldn't use GUri at all. If the risk is low enough that we consider GUri a viable option, then we should use it exclusively IMHO. This guarantees that all deployments of QEMU will have identical behaviour, which will lower our support burden in the event that bugs do appear, as we'll only have one codepath to worry about. The test suite you provided for GUri looks pretty comprehensive, and QEMU's iotests are pretty decent too, so I think between the two we have enough coverage that we should have confidence in using GUri exclusively. If we hit obscure edge cases, we can just deal with it as it arises. Regards, Daniel -- |: https://berrange.com -o-https://www.flickr.com/photos/dberrange :| |: https://libvirt.org -o-https://fstop138.berrange.com :| |: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|
Re: [PATCH 13/13] ssh: add GUri-based URI parsing
Hi On Thu, Jul 23, 2020 at 4:33 PM Richard W.M. Jones wrote: > On Thu, Jul 09, 2020 at 11:42:34PM +0400, Marc-André Lureau wrote: > > Signed-off-by: Marc-André Lureau > > --- > > block/ssh.c | 75 + > > 1 file changed, 58 insertions(+), 17 deletions(-) > > > > diff --git a/block/ssh.c b/block/ssh.c > > index c8f6ad79e3c..d2bc6277613 100644 > > --- a/block/ssh.c > > +++ b/block/ssh.c > > @@ -180,9 +180,37 @@ static void sftp_error_trace(BDRVSSHState *s, const > char *op) > > > > static int parse_uri(const char *filename, QDict *options, Error **errp) > > { > > +g_autofree char *port_str = NULL; > > +const char *scheme, *server, *path, *user, *key, *value; > > +gint port; > > + > > +#ifdef HAVE_GLIB_GURI > > +g_autoptr(GUri) uri = NULL; > > +g_autoptr(GHashTable) params = NULL; > > +g_autoptr(GError) err = NULL; > > +GHashTableIter iter; > > + > > +uri = g_uri_parse(filename, G_URI_FLAGS_ENCODED_QUERY, &err); > > +if (!uri) { > > +error_setg(errp, "Failed to parse SSH URI: %s", err->message); > > +return -EINVAL; > > +} > > + > > +params = g_uri_parse_params(g_uri_get_query(uri), -1, > > +"&;", G_URI_PARAMS_NONE, &err); > > +if (err) { > > +error_report("Failed to parse SSH URI query: %s", err->message); > > +return -EINVAL; > > +} > > + > > +scheme = g_uri_get_scheme(uri); > > +user = g_uri_get_user(uri); > > +server = g_uri_get_host(uri); > > +path = g_uri_get_path(uri); > > +port = g_uri_get_port(uri); > > +#else > > g_autoptr(URI) uri = NULL; > > g_autoptr(QueryParams) qp = NULL; > > -g_autofree char *port_str = NULL; > > int i; > > As Dan said already, this conditional code looks horrible and is going > to be a maintenance burden. Was there a later version of this patch > series that resolved this? I don't think I saw one. > The patch is quite experimental. glib didn't even yet receive a release with GUri! But since I am working on the glib side, I wanted to make sure it covers qemu needs. 
I will revisit the series once GUri is frozen & released (in mid-september),and use a copy version fallback. Although, as I said in the cover, this is a bit riskier than having a transition period with both the old libxml-based parser and glib-based one for very recent distro. Tbh, I think having both is not a big burden, because there is very low activity around those functions. Iow, we are not spreading qemu with a lot of extra conditionals, but only in very limited mostly static places. > Rich. > > > > uri = uri_parse(filename); > > @@ -190,44 +218,57 @@ static int parse_uri(const char *filename, QDict > *options, Error **errp) > > return -EINVAL; > > } > > > > -if (g_strcmp0(uri->scheme, "ssh") != 0) { > > -error_setg(errp, "URI scheme must be 'ssh'"); > > +qp = query_params_parse(uri->query); > > +if (!qp) { > > +error_setg(errp, "could not parse query parameters"); > > return -EINVAL; > > } > > > > -if (!uri->server || strcmp(uri->server, "") == 0) { > > -error_setg(errp, "missing hostname in URI"); > > +scheme = uri->scheme; > > +user = uri->user; > > +server = uri->server; > > +path = uri->path; > > +port = uri->port; > > +#endif > > +if (g_strcmp0(scheme, "ssh") != 0) { > > +error_setg(errp, "URI scheme must be 'ssh'"); > > return -EINVAL; > > } > > > > -if (!uri->path || strcmp(uri->path, "") == 0) { > > -error_setg(errp, "missing remote path in URI"); > > +if (!server || strcmp(server, "") == 0) { > > +error_setg(errp, "missing hostname in URI"); > > return -EINVAL; > > } > > > > -qp = query_params_parse(uri->query); > > -if (!qp) { > > -error_setg(errp, "could not parse query parameters"); > > +if (!path || strcmp(path, "") == 0) { > > +error_setg(errp, "missing remote path in URI"); > > return -EINVAL; > > } > > > > -if(uri->user && strcmp(uri->user, "") != 0) { > > -qdict_put_str(options, "user", uri->user); > > +if (user && strcmp(user, "") != 0) { > > +qdict_put_str(options, "user", user); > > } > > > > -qdict_put_str(options, "server.host", 
uri->server); > > +qdict_put_str(options, "server.host", server); > > > > -port_str = g_strdup_printf("%d", uri->port ?: 22); > > +port_str = g_strdup_printf("%d", port ?: 22); > > qdict_put_str(options, "server.port", port_str); > > > > -qdict_put_str(options, "path", uri->path); > > +qdict_put_str(options, "path", path); > > > > /* Pick out any query parameters that we understand, and ignore > > * the rest. > > */ > > +#ifdef HAVE_GLIB_GURI > > +g_hash_table_iter_init(&iter, params); > > +while (g_hash_table_iter_next(&iter, (void **)&key, (void > **)&value)) { > > +#e
Re: [PATCH 05/13] block/ssh: auto-ify URI parsing variables
On Thu, Jul 09, 2020 at 11:42:26PM +0400, Marc-André Lureau wrote: > Signed-off-by: Marc-André Lureau > --- > block/ssh.c | 23 +++ > 1 file changed, 7 insertions(+), 16 deletions(-) > > diff --git a/block/ssh.c b/block/ssh.c > index 098dbe03c15..c8f6ad79e3c 100644 > --- a/block/ssh.c > +++ b/block/ssh.c > @@ -180,9 +180,9 @@ static void sftp_error_trace(BDRVSSHState *s, const char > *op) > > static int parse_uri(const char *filename, QDict *options, Error **errp) > { > -URI *uri = NULL; > -QueryParams *qp; > -char *port_str; > +g_autoptr(URI) uri = NULL; > +g_autoptr(QueryParams) qp = NULL; > +g_autofree char *port_str = NULL; > int i; > > uri = uri_parse(filename); > @@ -192,23 +192,23 @@ static int parse_uri(const char *filename, QDict > *options, Error **errp) > > if (g_strcmp0(uri->scheme, "ssh") != 0) { > error_setg(errp, "URI scheme must be 'ssh'"); > -goto err; > +return -EINVAL; > } > > if (!uri->server || strcmp(uri->server, "") == 0) { > error_setg(errp, "missing hostname in URI"); > -goto err; > +return -EINVAL; > } > > if (!uri->path || strcmp(uri->path, "") == 0) { > error_setg(errp, "missing remote path in URI"); > -goto err; > +return -EINVAL; > } > > qp = query_params_parse(uri->query); > if (!qp) { > error_setg(errp, "could not parse query parameters"); > -goto err; > +return -EINVAL; > } > > if(uri->user && strcmp(uri->user, "") != 0) { > @@ -219,7 +219,6 @@ static int parse_uri(const char *filename, QDict > *options, Error **errp) > > port_str = g_strdup_printf("%d", uri->port ?: 22); > qdict_put_str(options, "server.port", port_str); > -g_free(port_str); > > qdict_put_str(options, "path", uri->path); > > @@ -232,15 +231,7 @@ static int parse_uri(const char *filename, QDict > *options, Error **errp) > } > } > > -query_params_free(qp); > -uri_free(uri); > return 0; > - > - err: > -if (uri) { > - uri_free(uri); > -} > -return -EINVAL; > } > I had to look up the definition of g_autoptr, and it seems fine since there's a corresponding URI macro added 
in the first commit. ACK Rich. -- Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones Read my programming and virtualization blog: http://rwmj.wordpress.com libguestfs lets you edit virtual machines. Supports shell scripting, bindings from many languages. http://libguestfs.org
Re: [PATCH 13/13] ssh: add GUri-based URI parsing
On Thu, Jul 09, 2020 at 11:42:34PM +0400, Marc-André Lureau wrote: > Signed-off-by: Marc-André Lureau > --- > block/ssh.c | 75 + > 1 file changed, 58 insertions(+), 17 deletions(-) > > diff --git a/block/ssh.c b/block/ssh.c > index c8f6ad79e3c..d2bc6277613 100644 > --- a/block/ssh.c > +++ b/block/ssh.c > @@ -180,9 +180,37 @@ static void sftp_error_trace(BDRVSSHState *s, const char > *op) > > static int parse_uri(const char *filename, QDict *options, Error **errp) > { > +g_autofree char *port_str = NULL; > +const char *scheme, *server, *path, *user, *key, *value; > +gint port; > + > +#ifdef HAVE_GLIB_GURI > +g_autoptr(GUri) uri = NULL; > +g_autoptr(GHashTable) params = NULL; > +g_autoptr(GError) err = NULL; > +GHashTableIter iter; > + > +uri = g_uri_parse(filename, G_URI_FLAGS_ENCODED_QUERY, &err); > +if (!uri) { > +error_setg(errp, "Failed to parse SSH URI: %s", err->message); > +return -EINVAL; > +} > + > +params = g_uri_parse_params(g_uri_get_query(uri), -1, > +"&;", G_URI_PARAMS_NONE, &err); > +if (err) { > +error_report("Failed to parse SSH URI query: %s", err->message); > +return -EINVAL; > +} > + > +scheme = g_uri_get_scheme(uri); > +user = g_uri_get_user(uri); > +server = g_uri_get_host(uri); > +path = g_uri_get_path(uri); > +port = g_uri_get_port(uri); > +#else > g_autoptr(URI) uri = NULL; > g_autoptr(QueryParams) qp = NULL; > -g_autofree char *port_str = NULL; > int i; As Dan said already, this conditional code looks horrible and is going to be a maintenance burden. Was there a later version of this patch series that resolved this? I don't think I saw one. Rich. 
> uri = uri_parse(filename); > @@ -190,44 +218,57 @@ static int parse_uri(const char *filename, QDict > *options, Error **errp) > return -EINVAL; > } > > -if (g_strcmp0(uri->scheme, "ssh") != 0) { > -error_setg(errp, "URI scheme must be 'ssh'"); > +qp = query_params_parse(uri->query); > +if (!qp) { > +error_setg(errp, "could not parse query parameters"); > return -EINVAL; > } > > -if (!uri->server || strcmp(uri->server, "") == 0) { > -error_setg(errp, "missing hostname in URI"); > +scheme = uri->scheme; > +user = uri->user; > +server = uri->server; > +path = uri->path; > +port = uri->port; > +#endif > +if (g_strcmp0(scheme, "ssh") != 0) { > +error_setg(errp, "URI scheme must be 'ssh'"); > return -EINVAL; > } > > -if (!uri->path || strcmp(uri->path, "") == 0) { > -error_setg(errp, "missing remote path in URI"); > +if (!server || strcmp(server, "") == 0) { > +error_setg(errp, "missing hostname in URI"); > return -EINVAL; > } > > -qp = query_params_parse(uri->query); > -if (!qp) { > -error_setg(errp, "could not parse query parameters"); > +if (!path || strcmp(path, "") == 0) { > +error_setg(errp, "missing remote path in URI"); > return -EINVAL; > } > > -if(uri->user && strcmp(uri->user, "") != 0) { > -qdict_put_str(options, "user", uri->user); > +if (user && strcmp(user, "") != 0) { > +qdict_put_str(options, "user", user); > } > > -qdict_put_str(options, "server.host", uri->server); > +qdict_put_str(options, "server.host", server); > > -port_str = g_strdup_printf("%d", uri->port ?: 22); > +port_str = g_strdup_printf("%d", port ?: 22); > qdict_put_str(options, "server.port", port_str); > > -qdict_put_str(options, "path", uri->path); > +qdict_put_str(options, "path", path); > > /* Pick out any query parameters that we understand, and ignore > * the rest. 
> */ > +#ifdef HAVE_GLIB_GURI > +g_hash_table_iter_init(&iter, params); > +while (g_hash_table_iter_next(&iter, (void **)&key, (void **)&value)) { > +#else > for (i = 0; i < qp->n; ++i) { > -if (strcmp(qp->p[i].name, "host_key_check") == 0) { > -qdict_put_str(options, "host_key_check", qp->p[i].value); > +key = qp->p[i].name; > +value = qp->p[i].value; > +#endif > +if (g_strcmp0(key, "host_key_check") == 0) { > +qdict_put_str(options, "host_key_check", value); > } > } > > -- > 2.27.0.221.ga08a83db2b -- Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones Read my programming and virtualization blog: http://rwmj.wordpress.com libguestfs lets you edit virtual machines. Supports shell scripting, bindings from many languages. http://libguestfs.org
Re: [PATCH for-5.1] nbd: Fix large trim/zero requests
On 7/23/20 2:23 AM, Vladimir Sementsov-Ogievskiy wrote: 23.07.2020 00:22, Eric Blake wrote: Although qemu as NBD client limits requests to <2G, the NBD protocol allows clients to send requests almost all the way up to 4G. But because our block layer is not yet 64-bit clean, we accidentally wrap such requests into a negative size, and fail with EIO instead of performing the intended operation. @@ -2378,8 +2378,17 @@ static coroutine_fn int nbd_handle_request(NBDClient *client, if (request->flags & NBD_CMD_FLAG_FAST_ZERO) { flags |= BDRV_REQ_NO_FALLBACK; } - ret = blk_pwrite_zeroes(exp->blk, request->from + exp->dev_offset, - request->len, flags); + ret = 0; + /* FIXME simplify this when blk_pwrite_zeroes switches to 64-bit */ + while (ret == 0 && request->len) { + int align = client->check_align ?: 1; + int len = MIN(request->len, QEMU_ALIGN_DOWN(BDRV_REQUEST_MAX_BYTES, + align)); + ret = blk_pwrite_zeroes(exp->blk, request->from + exp->dev_offset, + len, flags); + request->len -= len; + request->from += len; + } return nbd_send_generic_reply(client, request->handle, ret, "writing to file failed", errp); It's easy enough to audit that blk_pwrite_zeroes returns 0 (and not arbitrary positive) on success. @@ -2393,8 +2402,17 @@ static coroutine_fn int nbd_handle_request(NBDClient *client, "flush failed", errp); case NBD_CMD_TRIM: - ret = blk_co_pdiscard(exp->blk, request->from + exp->dev_offset, - request->len); + ret = 0; + /* FIXME simplify this when blk_co_pdiscard switches to 64-bit */ + while (ret == 0 && request->len) { Did you check all the paths not to return positive ret on success? I'd prefer to compare ret >= 0 for this temporary code to not care of it And for blk_co_pdiscard, the audit is already right here in the patch: pre-patch, ret = blk_co_pdiscard, followed by... 
+ int align = client->check_align ?: 1; + int len = MIN(request->len, QEMU_ALIGN_DOWN(BDRV_REQUEST_MAX_BYTES, + align)); + ret = blk_co_pdiscard(exp->blk, request->from + exp->dev_offset, + len); + request->len -= len; + request->from += len; Hmm.. Modifying the function parameter. Safe now, as nbd_handle_request() call is the last usage of request in nbd_trip(). + } if (ret == 0 && request->flags & NBD_CMD_FLAG_FUA) { ...a check for ret == 0. ret = blk_co_flush(exp->blk); } Yes, I can respin the patch to use >= 0 as the check for success in both functions, but it doesn't change the behavior. -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
Re: [PATCH v2 17/20] backup: move to block-copy
On 01.06.20 20:11, Vladimir Sementsov-Ogievskiy wrote: > This brings async request handling and block-status driven chunk sizes > to backup out of the box, which improves backup performance. > > Signed-off-by: Vladimir Sementsov-Ogievskiy > --- > include/block/block-copy.h | 9 +-- > block/backup.c | 145 +++-- > block/block-copy.c | 21 ++ > 3 files changed, 83 insertions(+), 92 deletions(-) This patch feels like it should be multiple ones. I don’t see why a patch that lets backup use block-copy (more) should have to modify the block-copy code. More specifically: I think that block_copy_set_progress_callback() should be removed in a separate patch. Also, moving @cb_opaque/@progress_opaque from BlockCopyState into BlockCopyCallState seems like a separate patch to me, too. (I presume if the cb had had its own opaque object from patch 5 on, there wouldn’t be this problem now. We’d just stop using the progress callback in backup, and remove it in one separate patch.) [...] > diff --git a/block/backup.c b/block/backup.c > index ec2676abc2..59c00f5293 100644 > --- a/block/backup.c > +++ b/block/backup.c > @@ -44,42 +44,19 @@ typedef struct BackupBlockJob { > BlockdevOnError on_source_error; > BlockdevOnError on_target_error; > uint64_t len; > -uint64_t bytes_read; > int64_t cluster_size; > int max_workers; > int64_t max_chunk; > > BlockCopyState *bcs; > + > +BlockCopyCallState *bcs_call; Can this be more descriptive? E.g. background_bcs? bg_bcs_call? bg_bcsc? > +int ret; > +bool error_is_read; > } BackupBlockJob; > > static const BlockJobDriver backup_job_driver; > [...] 
> static int coroutine_fn backup_loop(BackupBlockJob *job) > { > -bool error_is_read; > -int64_t offset; > -BdrvDirtyBitmapIter *bdbi; > -int ret = 0; > +while (true) { /* retry loop */ > +assert(!job->bcs_call); > +job->bcs_call = block_copy_async(job->bcs, 0, > + QEMU_ALIGN_UP(job->len, > + job->cluster_size), > + true, job->max_workers, > job->max_chunk, > + backup_block_copy_callback, job); > > -bdbi = bdrv_dirty_iter_new(block_copy_dirty_bitmap(job->bcs)); > -while ((offset = bdrv_dirty_iter_next(bdbi)) != -1) { > -do { > -if (yield_and_check(job)) { > -goto out; > +while (job->bcs_call && !job->common.job.cancelled) { > +/* wait and handle pauses */ Doesn’t someone need to reset BlockCopyCallState.cancelled when the job is unpaused? I can’t see anyone doing that. Well, or even just reentering the block-copy operation after backup_pause() has cancelled it. Is there some magic I’m missing? Does pausing (which leads to cancelling the block-copy operation) just yield to the CB being invoked, so the copy operation is considered done, and then we just enter the next iteration of the loop and try again? But cancelling the block-copy operation leads to it returning 0, AFAIR, so... > + > +job_pause_point(&job->common.job); > + > +if (job->bcs_call && !job->common.job.cancelled) { > +job_yield(&job->common.job); > } > -ret = backup_do_cow(job, offset, job->cluster_size, > &error_is_read); > -if (ret < 0 && backup_error_action(job, error_is_read, -ret) == > - BLOCK_ERROR_ACTION_REPORT) > -{ > -goto out; > +} > + > +if (!job->bcs_call && job->ret == 0) { > +/* Success */ > +return 0; ...I would assume we return here when the job is paused. > +} > + > +if (job->common.job.cancelled) { > +if (job->bcs_call) { > +block_copy_cancel(job->bcs_call); > } > -} while (ret < 0); > +return 0; > +} > + > +if (!job->bcs_call && job->ret < 0 && Is it even possible for bcs_call to be non-NULL here? 
> +(backup_error_action(job, job->error_is_read, -job->ret) == > + BLOCK_ERROR_ACTION_REPORT)) > +{ > +return job->ret; > +} So if we get an error, and the error action isn’t REPORT, we just try the whole operation again? That doesn’t sound very IGNORE-y to me. > } > > - out: > -bdrv_dirty_iter_free(bdbi); > -return ret; > +g_assert_not_reached(); > } > > static void backup_init_bcs_bitmap(BackupBlockJob *job) > @@ -246,9 +227,14 @@ static int coroutine_fn backup_run(Job *job, Error > **errp) > int64_t count; > > for (offset = 0; offset < s->len; ) { > -if (yield_and_check(s)) { > -ret = -ECANCELED; > -goto out; > +if (job_is_cancelled(job))
Re: [PATCH v2 16/20] iotests: 257: prepare for backup over block-copy
On 01.06.20 20:11, Vladimir Sementsov-Ogievskiy wrote: > Iotest 257 dumps a lot of in-progress information of backup job, such > as offset and bitmap dirtiness. Further commit will move backup to be > one block-copy call, which will introduce async parallel requests > instead of plain cluster-by-cluster copying. To keep things > deterministic, allow only one worker (only one copy request at a time) > for this test. > > Signed-off-by: Vladimir Sementsov-Ogievskiy > --- > tests/qemu-iotests/257 | 1 + > tests/qemu-iotests/257.out | 306 ++--- > 2 files changed, 154 insertions(+), 153 deletions(-) It’s a shame that we don’t run this test with the default configuration then, but I suppose that’s how it is. For now, at least. Reviewed-by: Max Reitz signature.asc Description: OpenPGP digital signature
Re: [PATCH v2 15/20] iotests: 219: prepare for backup over block-copy
On 01.06.20 20:11, Vladimir Sementsov-Ogievskiy wrote: > The further change of moving backup to be a on block-copy call will -on? > make copying chunk-size and cluster-size a separate things. So, even s/a/two/ > with 64k cluster sized qcow2 image, default chunk would be 1M. > Test 219 depends on specified chunk-size. Update it for explicit > chunk-size for backup as for mirror. > > Signed-off-by: Vladimir Sementsov-Ogievskiy > --- > tests/qemu-iotests/219 | 13 +++-- > 1 file changed, 7 insertions(+), 6 deletions(-) > > diff --git a/tests/qemu-iotests/219 b/tests/qemu-iotests/219 > index db272c5249..2bbed28f39 100755 > --- a/tests/qemu-iotests/219 > +++ b/tests/qemu-iotests/219 > @@ -203,13 +203,13 @@ with iotests.FilePath('disk.img') as disk_path, \ > # but related to this also automatic state transitions like job > # completion), but still get pause points often enough to avoid making > this > # test very slow, it's important to have the right ratio between speed > and > -# buf_size. > +# copy-chunk-size. > # > -# For backup, buf_size is hard-coded to the source image cluster size > (64k), > -# so we'll pick the same for mirror. The slice time, i.e. the granularity > -# of the rate limiting is 100ms. With a speed of 256k per second, we can > -# get four pause points per second. This gives us 250ms per iteration, > -# which should be enough to stay deterministic. > +# Chose 64k copy-chunk-size both for mirror (by buf_size) and backup (by > +# x-max-chunk). The slice time, i.e. the granularity of the rate limiting > +# is 100ms. With a speed of 256k per second, we can get four pause points > +# per second. This gives us 250ms per iteration, which should be enough > to > +# stay deterministic. Don’t we also have to limit the number of workers to 1 so we actually keep 250 ms per iteration instead of just finishing four requests immediately, then pausing for a second? 
> test_job_lifecycle(vm, 'drive-mirror', has_ready=True, job_args={ > 'device': 'drive0-node', > @@ -226,6 +226,7 @@ with iotests.FilePath('disk.img') as disk_path, \ > 'target': copy_path, > 'sync': 'full', > 'speed': 262144, > +'x-max-chunk': 65536, > 'auto-finalize': auto_finalize, > 'auto-dismiss': auto_dismiss, > }) > signature.asc Description: OpenPGP digital signature
Re: [PATCH v2 14/20] iotests: 185: prepare for backup over block-copy
On 01.06.20 20:11, Vladimir Sementsov-Ogievskiy wrote: > The further change of moving backup to be a on block-copy call will -on? > make copying chunk-size and cluster-size a separate things. So, even s/a/two/ > with 64k cluster sized qcow2 image, default chunk would be 1M. > 185 test however assumes, that with speed limited to 64K, one iteration > would result in offset=64K. It will change, as first iteration would > result in offset=1M independently of speed. > > So, let's explicitly specify, what test wants: set max-chunk to 64K, so > that one iteration is 64K. Note, that we don't need to limit > max-workers, as block-copy rate limitator will handle the situation and *limitator > wouldn't start new workers when speed limit is obviously reached. > > Signed-off-by: Vladimir Sementsov-Ogievskiy > --- > tests/qemu-iotests/185 | 3 ++- > tests/qemu-iotests/185.out | 2 +- > 2 files changed, 3 insertions(+), 2 deletions(-) > > diff --git a/tests/qemu-iotests/185 b/tests/qemu-iotests/185 > index fd5e6ebe11..6afb3fc82f 100755 > --- a/tests/qemu-iotests/185 > +++ b/tests/qemu-iotests/185 > @@ -182,7 +182,8 @@ _send_qemu_cmd $h \ >'target': '$TEST_IMG.copy', >'format': '$IMGFMT', >'sync': 'full', > - 'speed': 65536 } }" \ > + 'speed': 65536, > + 'x-max-chunk': 65536 } }" \ Out of curiosity, would it also suffice to disable copy offloading? 
But anyway: Reviewed-by: Max Reitz > "return" > > # If we don't sleep here 'quit' command races with disk I/O > diff --git a/tests/qemu-iotests/185.out b/tests/qemu-iotests/185.out > index ac5ab16bc8..5232647972 100644 > --- a/tests/qemu-iotests/185.out > +++ b/tests/qemu-iotests/185.out > @@ -61,7 +61,7 @@ Formatting 'TEST_DIR/t.qcow2.copy', fmt=qcow2 size=67108864 > cluster_size=65536 l > > { 'execute': 'qmp_capabilities' } > {"return": {}} > -{ 'execute': 'drive-backup', 'arguments': { 'device': 'disk', 'target': > 'TEST_DIR/t.IMGFMT.copy', 'format': 'IMGFMT', 'sync': 'full', 'speed': 65536 > } } > +{ 'execute': 'drive-backup', 'arguments': { 'device': 'disk', 'target': > 'TEST_DIR/t.IMGFMT.copy', 'format': 'IMGFMT', 'sync': 'full', 'speed': 65536, > 'x-max-chunk': 65536 } } > Formatting 'TEST_DIR/t.qcow2.copy', fmt=qcow2 size=67108864 > cluster_size=65536 lazy_refcounts=off refcount_bits=16 compression_type=zlib > {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": > "JOB_STATUS_CHANGE", "data": {"status": "created", "id": "disk"}} > {"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": > "JOB_STATUS_CHANGE", "data": {"status": "running", "id": "disk"}} > signature.asc Description: OpenPGP digital signature
Re: [PATCH v2 13/20] iotests: 129: prepare for backup over block-copy
On 01.06.20 20:11, Vladimir Sementsov-Ogievskiy wrote: > After introducing parallel async copy requests instead of plain > cluster-by-cluster copying loop, backup job may finish earlier than > final assertion in do_test_stop. Let's require slow backup explicitly > by specifying speed parameter. Isn’t the problem really that block_set_io_throttle does absolutely nothing? (Which is a long-standing problem with 129. I personally just never run it, honestly.) > Signed-off-by: Vladimir Sementsov-Ogievskiy > --- > tests/qemu-iotests/129 | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/tests/qemu-iotests/129 b/tests/qemu-iotests/129 > index 4db5eca441..bca56b589d 100755 > --- a/tests/qemu-iotests/129 > +++ b/tests/qemu-iotests/129 > @@ -76,7 +76,7 @@ class TestStopWithBlockJob(iotests.QMPTestCase): > def test_drive_backup(self): > self.do_test_stop("drive-backup", device="drive0", >target=self.target_img, > - sync="full") > + sync="full", speed=1024) > > def test_block_commit(self): > self.do_test_stop("block-commit", device="drive0") > signature.asc Description: OpenPGP digital signature
Re: [PATCH v2 12/20] iotests: 56: prepare for backup over block-copy
On 01.06.20 20:11, Vladimir Sementsov-Ogievskiy wrote: > After introducing parallel async copy requests instead of plain > cluster-by-cluster copying loop, we'll have to wait for paused status, > as we need to wait for several parallel request. So, let's gently wait > instead of just asserting that job already paused. > > Signed-off-by: Vladimir Sementsov-Ogievskiy > --- > tests/qemu-iotests/056 | 8 ++-- > 1 file changed, 6 insertions(+), 2 deletions(-) > > diff --git a/tests/qemu-iotests/056 b/tests/qemu-iotests/056 > index f73fc74457..2ced356a43 100755 > --- a/tests/qemu-iotests/056 > +++ b/tests/qemu-iotests/056 > @@ -306,8 +306,12 @@ class BackupTest(iotests.QMPTestCase): > event = self.vm.event_wait(name="BLOCK_JOB_ERROR", > match={'data': {'device': 'drive0'}}) > self.assertNotEqual(event, None) > -# OK, job should be wedged > -res = self.vm.qmp('query-block-jobs') > +# OK, job should pause, but it can't do it immediately, as it can't > +# cancel other parallel requests (which didn't fail) > +while True: > +res = self.vm.qmp('query-block-jobs') > +if res['return'][0]['status'] == 'paused': > +break A timeout around this would be nice, I think. > self.assert_qmp(res, 'return[0]/status', 'paused') > res = self.vm.qmp('block-job-dismiss', id='drive0') > self.assert_qmp(res, 'error/desc', > signature.asc Description: OpenPGP digital signature
Re: [PATCH v2 11/20] qapi: backup: add x-max-chunk and x-max-workers parameters
On 22.07.20 14:22, Max Reitz wrote: > On 01.06.20 20:11, Vladimir Sementsov-Ogievskiy wrote: >> Add new parameters to configure future backup features. The patch >> doesn't introduce aio backup requests (so we actually have only one >> worker) neither requests larger than one cluster. Still, formally we >> satisfy these maximums anyway, so add the parameters now, to facilitate >> further patch which will really change backup job behavior. >> >> Options are added with x- prefixes, as the only use for them are some >> very conservative iotests which will be updated soon. >> >> Signed-off-by: Vladimir Sementsov-Ogievskiy >> --- >> qapi/block-core.json | 9 - >> include/block/block_int.h | 7 +++ >> block/backup.c| 21 + >> block/replication.c | 2 +- >> blockdev.c| 5 + >> 5 files changed, 42 insertions(+), 2 deletions(-) [...] >> diff --git a/block/replication.c b/block/replication.c >> index 25987eab0f..a9ee82db80 100644 >> --- a/block/replication.c >> +++ b/block/replication.c >> @@ -563,7 +563,7 @@ static void replication_start(ReplicationState *rs, >> ReplicationMode mode, >> s->backup_job = backup_job_create( >> NULL, s->secondary_disk->bs, >> s->hidden_disk->bs, >> 0, MIRROR_SYNC_MODE_NONE, NULL, 0, false, >> NULL, >> -true, >> +true, 0, 0, > > Passing 0 for max_workers will error out immediately, won’t it? Ah, oops. Saw your own reply only now. Yep, 1 worker would be nice. :) signature.asc Description: OpenPGP digital signature
Re: [PATCH for-5.1] nbd: Fix large trim/zero requests
23.07.2020 00:22, Eric Blake wrote: Although qemu as NBD client limits requests to <2G, the NBD protocol allows clients to send requests almost all the way up to 4G. But because our block layer is not yet 64-bit clean, we accidentally wrap such requests into a negative size, and fail with EIO instead of performing the intended operation. The bug is visible in modern systems with something as simple as: $ qemu-img create -f qcow2 /tmp/image.img 5G $ sudo qemu-nbd --connect=/dev/nbd0 /tmp/image.img $ sudo blkdiscard /dev/nbd0 or with user-space only: $ truncate --size=3G file $ qemu-nbd -f raw file $ nbdsh -u nbd://localhost:10809 -c 'h.trim(3*1024*1024*1024,0)' Alas, our iotests do not currently make it easy to add external dependencies on blkdiscard or nbdsh, so we have to rely on manual testing for now. This patch can be reverted when we later improve the overall block layer to be 64-bit clean, but for now, a minimal fix was deemed less risky prior to release. CC: qemu-sta...@nongnu.org Fixes: 1f4d6d18ed Fixes: 1c6c4bb7f0 Fixes: https://github.com/systemd/systemd/issues/16242 Signed-off-by: Eric Blake --- nbd/server.c | 26 ++ 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/nbd/server.c b/nbd/server.c index 4752a6c8bc07..029618017c90 100644 --- a/nbd/server.c +++ b/nbd/server.c @@ -2378,8 +2378,17 @@ static coroutine_fn int nbd_handle_request(NBDClient *client, if (request->flags & NBD_CMD_FLAG_FAST_ZERO) { flags |= BDRV_REQ_NO_FALLBACK; } -ret = blk_pwrite_zeroes(exp->blk, request->from + exp->dev_offset, -request->len, flags); +ret = 0; +/* FIXME simplify this when blk_pwrite_zeroes switches to 64-bit */ +while (ret == 0 && request->len) { +int align = client->check_align ?: 1; +int len = MIN(request->len, QEMU_ALIGN_DOWN(BDRV_REQUEST_MAX_BYTES, +align)); +ret = blk_pwrite_zeroes(exp->blk, request->from + exp->dev_offset, +len, flags); +request->len -= len; +request->from += len; +} return nbd_send_generic_reply(client, request->handle, ret, 
"writing to file failed", errp); @@ -2393,8 +2402,17 @@ static coroutine_fn int nbd_handle_request(NBDClient *client, "flush failed", errp); case NBD_CMD_TRIM: -ret = blk_co_pdiscard(exp->blk, request->from + exp->dev_offset, - request->len); +ret = 0; +/* FIXME simplify this when blk_co_pdiscard switches to 64-bit */ +while (ret == 0 && request->len) { Did you check all the paths not to return positive ret on success? I'd prefer to compare ret >= 0 for this temporary code to not care of it +int align = client->check_align ?: 1; +int len = MIN(request->len, QEMU_ALIGN_DOWN(BDRV_REQUEST_MAX_BYTES, +align)); +ret = blk_co_pdiscard(exp->blk, request->from + exp->dev_offset, + len); +request->len -= len; +request->from += len; Hmm.. Modifying the function parameter. Safe now, as nbd_handle_request() call is the last usage of request in nbd_trip(). +} if (ret == 0 && request->flags & NBD_CMD_FLAG_FUA) { ret = blk_co_flush(exp->blk); } -- Best regards, Vladimir