Re: [PATCH v2 1/2] file-posix: add the tracking of the zones wp
On 10/5/22 10:44, Damien Le Moal wrote: > On 9/29/22 18:31, Sam Li wrote: >> Since Linux doesn't have a user API to issue zone append operations to >> zoned devices from user space, the file-posix driver is modified to add >> zone append emulation using regular writes. To do this, the file-posix >> driver tracks the wp location of all zones of the device. It uses an >> array of uint64_t. The most significant bit of each wp location indicates >> if the zone type is sequential write required. >> >> The zones wp can be changed due to the following operations issued: >> - zone reset: change the wp to the start offset of that zone >> - zone finish: change to the end location of that zone >> - write to a zone >> - zone append >> >> Signed-off-by: Sam Li >> --- >> block/file-posix.c | 138 ++- >> include/block/block-common.h | 16 >> include/block/block_int-common.h | 5 ++ >> include/block/raw-aio.h | 4 +- >> 4 files changed, 159 insertions(+), 4 deletions(-) >> >> diff --git a/block/file-posix.c b/block/file-posix.c >> index 73656d87f2..33e81ac112 100755 >> --- a/block/file-posix.c >> +++ b/block/file-posix.c >> @@ -206,6 +206,8 @@ typedef struct RawPosixAIOData { >> struct { >> struct iovec *iov; >> int niov; >> +int64_t *append_sector; >> +BlockZoneWps *wps; >> } io; >> struct { >> uint64_t cmd; >> @@ -1332,6 +1334,59 @@ static int hdev_get_max_segments(int fd, struct stat >> *st) { >> #endif >> } >> >> +#if defined(CONFIG_BLKZONED) >> +static int report_zone_wp(int64_t offset, int fd, BlockZoneWps *wps, >> + unsigned int nrz) { > > Maybe rename this to get_zones_wp() ? > >> +struct blk_zone *blkz; >> +int64_t rep_size; >> +int64_t sector = offset >> BDRV_SECTOR_BITS; >> +int ret, n = 0, i = 0; >> + >> +rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct >> blk_zone); >> +g_autofree struct blk_zone_report *rep = NULL; > > To be cleaner, move this declaration above with the others ? > >> +rep = g_malloc(rep_size); >> + >> +blkz = (struct blk_zone *)(rep + 1); >> +while (n < nrz) { >> +memset(rep, 0, rep_size); >> +rep->sector = sector; >> +rep->nr_zones = nrz - n; >> + >> +do { >> +ret = ioctl(fd, BLKREPORTZONE, rep); >> +} while (ret != 0 && errno == EINTR); >> +if (ret != 0) { >> +error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d", >> +fd, offset, errno); >> +return -errno; >> +} >> + >> +if (!rep->nr_zones) { >> +break; >> +} >> + >> +for (i = 0; i < rep->nr_zones; i++, n++) { >> +wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS; >> +sector = blkz[i].start + blkz[i].len; >> + >> +/* >> + * In the wp tracking, it only cares if the zone type is >> sequential >> + * writes required so that the wp can advance to the right >> location. > > Or sequential write preferred (host aware case) > >> + * Instead of the type of zone_type which is an 8-bit unsigned >> + * integer, use the first most significant bits of the wp >> location >> + * to indicate the zone type: 0 for SWR zones and 1 for the >> + * others. >> + */ >> +if (!(blkz[i].type & BLK_ZONE_TYPE_SEQWRITE_REQ)) { > > This should be: > > if (blkz[i].type != BLK_ZONE_TYPE_CONVENTIONAL) { > > Note that the type field is not a bit-field. So you must compare values > instead of doing bit operations. > >> +wps->wp[i] += (uint64_t)1 << 63; > > You can simplify this: > > wps->wp[i] |= 1ULL << 63; > > Overall, I would rewrite this like this: > > for (i = 0; i < rep->nr_zones; i++, n++) { > /* > * The wp tracking cares only about sequential write required > * and sequential write preferred zones so that the wp can > * advance to the right location. > * Use the most significant bit of the wp location > * to indicate the zone type: 0 for SWR zones and 1 for > * conventional zones. > */ > if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) { > wps->wp[i] = 1ULL << 63; > else > wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS; > } > sector = blkz[i - 1].start + blkz[i - 1].len; > > Which I think is a lot simpler. > >> +} >> +} >> +} >> + >> +return 0; >> +} >> +#endif >> + >> static void raw_refresh_limits(BlockDriverState *bs, Error **errp) >> { >> BDRVRawState *s = bs->opaque; >> @@ -1415,6 +1470,20 @@ static void raw_refresh_limits(BlockDriverState *bs, >> Error **errp) >> error_report("Invalid device capacity %" PRId64 " bytes ", >> bs->bl.capacity); >> return; >>
Re: [PATCH v2 2/2] block: introduce zone append write for zoned devices
On 9/29/22 18:31, Sam Li wrote: > A zone append command is a write operation that specifies the first > logical block of a zone as the write position. When writing to a zoned > block device using zone append, the byte offset of the write is pointing > to the write pointer of that zone. Upon completion the device will > respond with the position the data has been written in the zone. > > Signed-off-by: Sam Li > --- > block/block-backend.c | 65 ++ > block/file-posix.c | 51 +++ > block/io.c | 21 ++ > block/raw-format.c | 7 > include/block/block-io.h | 3 ++ > include/block/block_int-common.h | 3 ++ > include/sysemu/block-backend-io.h | 9 + > qemu-io-cmds.c | 62 > tests/qemu-iotests/tests/zoned.out | 7 > tests/qemu-iotests/tests/zoned.sh | 9 + > 10 files changed, 237 insertions(+) > > diff --git a/block/block-backend.c b/block/block-backend.c > index f7f7acd6f4..07a8632af1 100644 > --- a/block/block-backend.c > +++ b/block/block-backend.c > @@ -1439,6 +1439,9 @@ typedef struct BlkRwCo { > struct { > BlockZoneOp op; > } zone_mgmt; > +struct { > +int64_t *append_sector; > +} zone_append; > }; > } BlkRwCo; > > @@ -1869,6 +1872,47 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, > BlockZoneOp op, > return &acb->common; > } > > +static void blk_aio_zone_append_entry(void *opaque) { > +BlkAioEmAIOCB *acb = opaque; > +BlkRwCo *rwco = &acb->rwco; > + > +rwco->ret = blk_co_zone_append(rwco->blk, > rwco->zone_append.append_sector, > + rwco->iobuf, rwco->flags); > +blk_aio_complete(acb); > +} > + > +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset, > +QEMUIOVector *qiov, BdrvRequestFlags flags, > +BlockCompletionFunc *cb, void *opaque) { > +BlkAioEmAIOCB *acb; > +Coroutine *co; > +IO_CODE(); > + > +blk_inc_in_flight(blk); > +acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque); > +acb->rwco = (BlkRwCo) { > +.blk= blk, > +.ret= NOT_DONE, > +.flags = flags, > +.iobuf = qiov, > +.zone_append = { > +.append_sector = offset, > +}, > +}; > +acb->has_returned = false; > + > +co = qemu_coroutine_create(blk_aio_zone_append_entry, acb); > +bdrv_coroutine_enter(blk_bs(blk), co); > + > +acb->has_returned = true; > +if (acb->rwco.ret != NOT_DONE) { > +replay_bh_schedule_oneshot_event(blk_get_aio_context(blk), > + blk_aio_complete_bh, acb); > +} > + > +return &acb->common; > +} > + > /* > * Send a zone_report command. > * offset is a byte offset from the start of the device. No alignment > @@ -1921,6 +1965,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, > BlockZoneOp op, > return ret; > } > > +/* > + * Send a zone_append command. > + */ > +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset, > +QEMUIOVector *qiov, BdrvRequestFlags flags) > +{ > +int ret; > +IO_CODE(); > + > +blk_inc_in_flight(blk); > +blk_wait_while_drained(blk); > +if (!blk_is_available(blk)) { > +blk_dec_in_flight(blk); > +return -ENOMEDIUM; > +} > + > +ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags); > +blk_dec_in_flight(blk); > +return ret; > +} > + > void blk_drain(BlockBackend *blk) > { > BlockDriverState *bs = blk_bs(blk); > diff --git a/block/file-posix.c b/block/file-posix.c > index 33e81ac112..24b70f1afe 100755 > --- a/block/file-posix.c > +++ b/block/file-posix.c > @@ -3454,6 +3454,56 @@ static int coroutine_fn > raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, > #endif > } > > + whiteline change. > +static int coroutine_fn raw_co_zone_append(BlockDriverState *bs, > + int64_t *offset, > + QEMUIOVector *qiov, > + BdrvRequestFlags flags) { > +#if defined(CONFIG_BLKZONED) > +BDRVRawState *s = bs->opaque; > +int64_t zone_size_mask = bs->bl.zone_size - 1; > +int64_t iov_len = 0; > +int64_t len = 0; > +RawPosixAIOData acb; > + > +if (*offset & zone_size_mask) { > +error_report("sector offset %" PRId64 " is not aligned to zone size " > + "%" PRId32 "", *offset / 512, bs->bl.zone_size / 512); > +return -EINVAL; > +} > + > +int64_t wg = bs->bl.write_granularity; > +int64_t wg_mask = wg - 1; > +for (int i = 0; i < qiov->niov; i++) { > + iov_len = qiov->iov[i].iov_len; > + if (iov_len & wg_mask) { > + erro
Re: [PATCH v2 1/2] file-posix: add the tracking of the zones wp
On 9/29/22 18:31, Sam Li wrote: > Since Linux doesn't have a user API to issue zone append operations to > zoned devices from user space, the file-posix driver is modified to add > zone append emulation using regular writes. To do this, the file-posix > driver tracks the wp location of all zones of the device. It uses an > array of uint64_t. The most significant bit of each wp location indicates > if the zone type is sequential write required. > > The zones wp can be changed due to the following operations issued: > - zone reset: change the wp to the start offset of that zone > - zone finish: change to the end location of that zone > - write to a zone > - zone append > > Signed-off-by: Sam Li > --- > block/file-posix.c | 138 ++- > include/block/block-common.h | 16 > include/block/block_int-common.h | 5 ++ > include/block/raw-aio.h | 4 +- > 4 files changed, 159 insertions(+), 4 deletions(-) > > diff --git a/block/file-posix.c b/block/file-posix.c > index 73656d87f2..33e81ac112 100755 > --- a/block/file-posix.c > +++ b/block/file-posix.c > @@ -206,6 +206,8 @@ typedef struct RawPosixAIOData { > struct { > struct iovec *iov; > int niov; > +int64_t *append_sector; > +BlockZoneWps *wps; > } io; > struct { > uint64_t cmd; > @@ -1332,6 +1334,59 @@ static int hdev_get_max_segments(int fd, struct stat > *st) { > #endif > } > > +#if defined(CONFIG_BLKZONED) > +static int report_zone_wp(int64_t offset, int fd, BlockZoneWps *wps, > + unsigned int nrz) { Maybe rename this to get_zones_wp() ? > +struct blk_zone *blkz; > +int64_t rep_size; > +int64_t sector = offset >> BDRV_SECTOR_BITS; > +int ret, n = 0, i = 0; > + > +rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct > blk_zone); > +g_autofree struct blk_zone_report *rep = NULL; To be cleaner, move this declaration above with the others ? > +rep = g_malloc(rep_size); > + > +blkz = (struct blk_zone *)(rep + 1); > +while (n < nrz) { > +memset(rep, 0, rep_size); > +rep->sector = sector; > +rep->nr_zones = nrz - n; > + > +do { > +ret = ioctl(fd, BLKREPORTZONE, rep); > +} while (ret != 0 && errno == EINTR); > +if (ret != 0) { > +error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d", > +fd, offset, errno); > +return -errno; > +} > + > +if (!rep->nr_zones) { > +break; > +} > + > +for (i = 0; i < rep->nr_zones; i++, n++) { > +wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS; > +sector = blkz[i].start + blkz[i].len; > + > +/* > + * In the wp tracking, it only cares if the zone type is > sequential > + * writes required so that the wp can advance to the right > location. Or sequential write preferred (host aware case) > + * Instead of the type of zone_type which is an 8-bit unsigned > + * integer, use the first most significant bits of the wp > location > + * to indicate the zone type: 0 for SWR zones and 1 for the > + * others. > + */ > +if (!(blkz[i].type & BLK_ZONE_TYPE_SEQWRITE_REQ)) { This should be: if (blkz[i].type != BLK_ZONE_TYPE_CONVENTIONAL) { Note that the type field is not a bit-field. So you must compare values instead of doing bit operations. > +wps->wp[i] += (uint64_t)1 << 63; You can simplify this: wps->wp[i] |= 1ULL << 63; Overall, I would rewrite this like this: for (i = 0; i < rep->nr_zones; i++, n++) { /* * The wp tracking cares only about sequential write required * and sequential write preferred zones so that the wp can * advance to the right location. * Use the most significant bit of the wp location * to indicate the zone type: 0 for SWR zones and 1 for * conventional zones. */ if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) { wps->wp[i] = 1ULL << 63; else wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS; } sector = blkz[i - 1].start + blkz[i - 1].len; Which I think is a lot simpler. > +} > +} > +} > + > +return 0; > +} > +#endif > + > static void raw_refresh_limits(BlockDriverState *bs, Error **errp) > { > BDRVRawState *s = bs->opaque; > @@ -1415,6 +1470,20 @@ static void raw_refresh_limits(BlockDriverState *bs, > Error **errp) > error_report("Invalid device capacity %" PRId64 " bytes ", > bs->bl.capacity); > return; > } > + > +ret = get_sysfs_long_val(&st, "physical_block_size"); > +if (ret >= 0) { > +bs->bl.write_granularity = ret; > +} This change seems unrelated to the wp tracking. Should this be mo
Re: [PATCH v2 13/13] hw/ppc/e500: Add Freescale eSDHC to e500 boards
Am 3. Oktober 2022 21:06:57 UTC schrieb "Philippe Mathieu-Daudé" : >On 3/10/22 22:31, Bernhard Beschow wrote: >> Adds missing functionality to emulated e500 SOCs which increases the >> chance of given "real" firmware images to access SD cards. >> >> Signed-off-by: Bernhard Beschow >> --- >> docs/system/ppc/ppce500.rst | 13 + >> hw/ppc/Kconfig | 1 + >> hw/ppc/e500.c | 31 ++- >> 3 files changed, 44 insertions(+), 1 deletion(-) > >> +static void dt_sdhc_create(void *fdt, const char *parent, const char *mpic) >> +{ >> +hwaddr mmio = MPC85XX_ESDHC_REGS_OFFSET; >> +hwaddr size = MPC85XX_ESDHC_REGS_SIZE; >> +int irq = MPC85XX_ESDHC_IRQ; > >Why not pass these 3 variable as argument? In anticipation of data-driven board creation, I'd ideally infer those from the device's QOM properties. This seems similar to what Mark suggested in the BoF at KVM Forum [1], where -- IIUC -- he stated that QOM properties could be the foundation of all wiring representations. And device tree seems just like one specialized representation to me. (Note that I'm slightly hijacking the review here because I don't know where and how to express these thoughts elsewhere). Does it make sense to add the missing properties here? Best regards, Bernhard [1] https://etherpad.opendev.org/p/qemu-emulation-bof%40kvmforum2022 > >> +g_autofree char *name = NULL; >> + >> +name = g_strdup_printf("%s/sdhc@%" PRIx64, parent, mmio); >> +qemu_fdt_add_subnode(fdt, name); >> +qemu_fdt_setprop(fdt, name, "sdhci,auto-cmd12", NULL, 0); >> +qemu_fdt_setprop_phandle(fdt, name, "interrupt-parent", mpic); >> +qemu_fdt_setprop_cells(fdt, name, "bus-width", 4); >> +qemu_fdt_setprop_cells(fdt, name, "interrupts", irq, 0x2); >> +qemu_fdt_setprop_cells(fdt, name, "reg", mmio, size); >> +qemu_fdt_setprop_string(fdt, name, "compatible", "fsl,esdhc"); >> +} >> typedef struct PlatformDevtreeData { >> void *fdt; >> @@ -553,6 +573,8 @@ static int ppce500_load_device_tree(PPCE500MachineState >> *pms, >> dt_rtc_create(fdt, "i2c", "rtc"); >> +/* sdhc */ >> +dt_sdhc_create(fdt, soc, mpic); >>
Re: [PATCH v2 09/13] hw/ppc/e500: Implement pflash handling
Am 3. Oktober 2022 21:21:15 UTC schrieb "Philippe Mathieu-Daudé" : >On 3/10/22 22:31, Bernhard Beschow wrote: >> Allows e500 boards to have their root file system reside on flash using >> only builtin devices located in the eLBC memory region. >> >> Note that the flash memory area is only created when a -pflash argument is >> given, and that the size is determined by the given file. The idea is to >> put users into control. >> >> Signed-off-by: Bernhard Beschow >> --- >> docs/system/ppc/ppce500.rst | 12 ++ >> hw/ppc/Kconfig | 1 + >> hw/ppc/e500.c | 76 + >> 3 files changed, 89 insertions(+) > >> @@ -856,6 +892,7 @@ void ppce500_init(MachineState *machine) >> unsigned int pci_irq_nrs[PCI_NUM_PINS] = {1, 2, 3, 4}; >> IrqLines *irqs; >> DeviceState *dev, *mpicdev; >> +DriveInfo *dinfo; >> CPUPPCState *firstenv = NULL; >> MemoryRegion *ccsr_addr_space; >> SysBusDevice *s; >> @@ -1024,6 +1061,45 @@ void ppce500_init(MachineState *machine) >> pmc->platform_bus_base, >> &pms->pbus_dev->mmio); >> +dinfo = drive_get(IF_PFLASH, 0, 0); >> +if (dinfo) { >> +BlockBackend *blk = blk_by_legacy_dinfo(dinfo); >> +BlockDriverState *bs = blk_bs(blk); >> +uint64_t size = bdrv_getlength(bs); >> +uint64_t mmio_size = pms->pbus_dev->mmio.size; >> +uint32_t sector_len = 64 * KiB; >> + >> +if (ctpop64(size) != 1) { >> +error_report("Size of pflash file must be a power of two."); > >This is a PFLASH restriction (which you already fixed in the previous >patch), not a board one. I agree that this check seems redundant to the one in cfi01. I added this one for clearer error messages since cfi01 only complains about the "device size" not being a power of two while this message at least gives a hint towards the source of the problem (the file given in the pflash option). Usually the size of the pflash area is hardcoded in the board while I choose to derive it from the size of the backing file in order to avoid hardcoding it. My idea is to put users into control by offering more flexibility. > >> +exit(1); >> +} >> + >> +if (size > mmio_size) { >> +error_report("Size of pflash file must not be bigger than %" >> PRIu64 >> + " bytes.", mmio_size); > >There is no hardware limitation here, you can wire flash bigger than the >memory aperture. What is above the aperture will simply be ignored. > >Should we display a warning here instead of a fatal error? While this is technically possible, is that what users would expect? Couldn't we just require users to truncate their files if they really want the "aperture" behavior? > >> +exit(1); >> +} >> + >> +assert(QEMU_IS_ALIGNED(size, sector_len)); > >Similarly, this doesn't seem a problem the board code should worry >about: better to defer it to PFLASH realize(). The reason for the assert() here is that size isn't stored directly in the cfi01 device. Instead, it must be calculated by the properties "num-blocks" times "sector-length". For this to work, size must be divisible by sector_len without remainder, which is checked by the assertion. We could theoretically add a "size" property which would violate the single point of truth principle, though. Do you see a different solution? Best regards, Bernhard > >> +dev = qdev_new(TYPE_PFLASH_CFI01); >> +qdev_prop_set_drive(dev, "drive", blk); >> +qdev_prop_set_uint32(dev, "num-blocks", size / sector_len); >> +qdev_prop_set_uint64(dev, "sector-length", sector_len); >> +qdev_prop_set_uint8(dev, "width", 2); >> +qdev_prop_set_bit(dev, "big-endian", true); >> +qdev_prop_set_uint16(dev, "id0", 0x89); >> +qdev_prop_set_uint16(dev, "id1", 0x18); >> +qdev_prop_set_uint16(dev, "id2", 0x); >> +qdev_prop_set_uint16(dev, "id3", 0x0); >> +qdev_prop_set_string(dev, "name", "e500.flash"); >> +s = SYS_BUS_DEVICE(dev); >> +sysbus_realize_and_unref(s, &error_fatal); >> + >> +memory_region_add_subregion(&pms->pbus_dev->mmio, 0, >> +sysbus_mmio_get_region(s, 0)); >> +} >> + >> /* >>* Smart firmware defaults ahead! >>* >
Re: [PATCH] spec: Add NBD_OPT_EXTENDED_HEADERS
On Fri, Dec 03, 2021 at 05:14:34PM -0600, Eric Blake wrote: > Add a new negotiation feature where the client and server agree to use > larger packet headers on every packet sent during transmission phase. > This has two purposes: first, it makes it possible to perform > operations like trim, write zeroes, and block status on more than 2^32 > bytes in a single command; this in turn requires that some structured > replies from the server also be extended to match. The wording chosen > here is careful to permit a server to use either flavor in its reply > (that is, a request less than 32-bits can trigger an extended reply, > and conversely a request larger than 32-bits can trigger a compact > reply). Following up on this original proposal with something that came out of KVM Forum this year. > +* `NBD_REPLY_TYPE_BLOCK_STATUS_EXT` (6) > + > + This chunk type is in the status chunk category. *length* MUST be > + 4 + (a positive multiple of 16). The semantics of this chunk mirror > + those of `NBD_REPLY_TYPE_BLOCK_STATUS`, other than the use of a > + larger *extent length* field, as well as added padding to ease > + alignment. This chunk type MUST NOT be used unless extended headers > + were negotiated with `NBD_OPT_EXTENDED_HEADERS`. > + > + The payload starts with: > + > + 32 bits, metadata context ID > + > + and is followed by a list of one or more descriptors, each with this > + layout: > + > + 64 bits, length of the extent to which the status below > + applies (unsigned, MUST be nonzero) > + 32 bits, status flags > + 32 bits, padding (MUST be zero) During KVM Forum, I had several conversations about Zoned Block Devices (https://zonedstorage.io/docs/linux/zbd-api), and what it would take to expose ZBD information over NBD. In particular, NBD_CMD_BLOCK_STATUS sounds like a great way for advertising information about zones (by adding several metadata contexts that can be negotiated during NBD_OPT_SET_META_CONTEXT), except for the fact that a zone might be larger than 32 bits in size. So Rich Jones asked me the question of whether my work on 64-bit extensions to the NBD protocol could also allow for a server to advertise a metadata context only to clients that support 64-bit extensions, at which point it can report 64-bit offsets or lengths as needed, rather than being limited to 32-bit status flags. The idea definitely has merit, so I'm working on incorporating that into my next revision for 64-bit extensions in NBD. -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3266 Virtualization: qemu.org | libvirt.org
Re: [RFC patch 0/1] block: vhost-blk backend
On Mon, Jul 25, 2022 at 11:55:26PM +0300, Andrey Zhadchenko wrote: > Although QEMU virtio-blk is quite fast, there is still some room for > improvements. Disk latency can be reduced if we handle virito-blk requests > in host kernel so we avoid a lot of syscalls and context switches. > > The biggest disadvantage of this vhost-blk flavor is raw format. > Luckily Kirill Thai proposed device mapper driver for QCOW2 format to attach > files as block devices: https://www.spinics.net/lists/kernel/msg4292965.html > > Also by using kernel modules we can bypass iothread limitation and finaly > scale > block requests with cpus for high-performance devices. This is planned to be > implemented in next version. > > Linux kernel module part: > https://lore.kernel.org/kvm/20220725202753.298725-1-andrey.zhadche...@virtuozzo.com/ > > test setups and results: > fio --direct=1 --rw=randread --bs=4k --ioengine=libaio --iodepth=128 > QEMU drive options: cache=none > filesystem: xfs Please post the full QEMU command-line so it's clear exactly what this is benchmarking. A preallocated raw image file is a good baseline with: --object iothread,id=iothread0 \ --blockdev file,filename=test.img,cache.direct=on,aio=native,node-name=drive0 \ --device virtio-blk-pci,drive=drive0,iothread=iothread0 (BTW QEMU's default vq size is 256 descriptors and the number of vqs is the number of vCPUs.) > > SSD: >| randread, IOPS | randwrite, IOPS | > Host | 95.8k| 85.3k | > QEMU virtio| 57.5k| 79.4k | > QEMU vhost-blk | 95.6k| 84.3k | > > RAMDISK (vq == vcpu): With fio numjobs=vcpu here? > | randread, IOPS | randwrite, IOPS | > virtio, 1vcpu|123k | 129k | > virtio, 2vcpu|253k (??) | 250k (??) | QEMU's aio=threads (default) gets around the single IOThread. It beats aio=native for this reason in some cases. Were you using aio=native or aio=threads? > virtio, 4vcpu|158k | 154k | > vhost-blk, 1vcpu |110k | 113k | > vhost-blk, 2vcpu |247k | 252k | signature.asc Description: PGP signature
Re: [RFC PATCH 1/1] block: add vhost-blk backend
On Mon, Jul 25, 2022 at 11:55:27PM +0300, Andrey Zhadchenko wrote: > Although QEMU virtio is quite fast, there is still some room for > improvements. Disk latency can be reduced if we handle virito-blk requests > in host kernel istead of passing them to QEMU. The patch adds vhost-blk > backend which sets up vhost-blk kernel module to process requests. > > test setup and results: > fio --direct=1 --rw=randread --bs=4k --ioengine=libaio --iodepth=128 > QEMU drive options: cache=none > filesystem: xfs > > SSD: >| randread, IOPS | randwrite, IOPS | > Host | 95.8k| 85.3k | > QEMU virtio| 57.5k| 79.4k | > QEMU vhost-blk | 95.6k| 84.3k | > > RAMDISK (vq == vcpu): > | randread, IOPS | randwrite, IOPS | > virtio, 1vcpu|123k | 129k | > virtio, 2vcpu|253k (??) | 250k (??) | > virtio, 4vcpu|158k | 154k | > vhost-blk, 1vcpu |110k | 113k | > vhost-blk, 2vcpu |247k | 252k | > vhost-blk, 4vcpu |576k | 567k | > > Signed-off-by: Andrey Zhadchenko > --- > hw/block/Kconfig | 5 + > hw/block/meson.build | 4 + > hw/block/vhost-blk.c | 394 ++ > hw/virtio/meson.build | 3 + > hw/virtio/vhost-blk-pci.c | 102 + > include/hw/virtio/vhost-blk.h | 50 + > meson.build | 5 + > 7 files changed, 563 insertions(+) > create mode 100644 hw/block/vhost-blk.c > create mode 100644 hw/virtio/vhost-blk-pci.c > create mode 100644 include/hw/virtio/vhost-blk.h > > diff --git a/hw/block/Kconfig b/hw/block/Kconfig > index 9e8f28f982..b4286ad10e 100644 > --- a/hw/block/Kconfig > +++ b/hw/block/Kconfig > @@ -36,6 +36,11 @@ config VIRTIO_BLK > default y > depends on VIRTIO > > +config VHOST_BLK > +bool > +default n Feel free to enable it by default. That way it gets more CI/build coverage. > +depends on VIRTIO && LINUX > + > config VHOST_USER_BLK > bool > # Only PCI devices are provided for now > diff --git a/hw/block/meson.build b/hw/block/meson.build > index 2389326112..caf9bedff3 100644 > --- a/hw/block/meson.build > +++ b/hw/block/meson.build > @@ -19,4 +19,8 @@ softmmu_ss.add(when: 'CONFIG_TC58128', if_true: > files('tc58128.c')) > specific_ss.add(when: 'CONFIG_VIRTIO_BLK', if_true: files('virtio-blk.c')) > specific_ss.add(when: 'CONFIG_VHOST_USER_BLK', if_true: > files('vhost-user-blk.c')) > > +if have_vhost_blk > + specific_ss.add(files('vhost-blk.c')) > +endif Can this use the same add(when: 'CONFIG_VHOST_BLK', ...) syntax as the other conditional builds above? > + > subdir('dataplane') > diff --git a/hw/block/vhost-blk.c b/hw/block/vhost-blk.c > new file mode 100644 > index 00..33d90af270 > --- /dev/null > +++ b/hw/block/vhost-blk.c > @@ -0,0 +1,394 @@ > +/* > + * Copyright (c) 2022 Virtuozzo International GmbH. > + * Author: Andrey Zhadchenko > + * > + * vhost-blk is host kernel accelerator for virtio-blk. > + * > + * This work is licensed under the terms of the GNU LGPL, version 2 or later. > + * See the COPYING.LIB file in the top-level directory. > + */ > + > +#include "qemu/osdep.h" > +#include "qapi/error.h" > +#include "qemu/error-report.h" > +#include "qom/object.h" > +#include "hw/qdev-core.h" > +#include "hw/boards.h" > +#include "hw/virtio/vhost.h" > +#include "hw/virtio/vhost-blk.h" > +#include "hw/virtio/virtio.h" > +#include "hw/virtio/virtio-blk.h" > +#include "hw/virtio/virtio-bus.h" > +#include "hw/virtio/virtio-access.h" > +#include "hw/virtio/virtio-pci.h" > +#include "sysemu/sysemu.h" > +#include "linux-headers/linux/vhost.h" > +#include > +#include > + > +static int vhost_blk_start(VirtIODevice *vdev) > +{ > +VHostBlk *s = VHOST_BLK(vdev); > +struct vhost_vring_file backend; > +int ret, i; > +int *fd = blk_bs(s->conf.conf.blk)->file->bs->opaque; This needs a clean API so vhost-blk.c doesn't make assumptions about the file-posix BlockDriver's internal state memory layout. > +BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); > +VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); > + > +if (!k->set_guest_notifiers) { > +error_report("vhost-blk: binding does not support guest notifiers"); > +return -ENOSYS; > +} > + > +if (s->vhost_started) { > +return 0; > +} > + > +if (ioctl(s->vhostfd, VHOST_SET_OWNER, NULL)) { > +error_report("vhost-blk: unable to set owner"); > +return -ENOSYS; > +} > + > +ret = vhost_dev_enable_notifiers(&s->dev, vdev); > +if (ret < 0) { > +error_report("vhost-blk: unable to enable dev notifiers", errno); > +return ret; > +} > + > +s->dev.acked_features = vdev->guest_features & s->dev.backend_features; > + > +ret = vhost_dev_start(&s->dev, vdev); > +if (ret < 0) { > +
Re: [RFC patch 0/1] block: vhost-blk backend
On Mon, Jul 25, 2022 at 11:55:26PM +0300, Andrey Zhadchenko wrote: > Although QEMU virtio-blk is quite fast, there is still some room for > improvements. Disk latency can be reduced if we handle virito-blk requests > in host kernel so we avoid a lot of syscalls and context switches. > > The biggest disadvantage of this vhost-blk flavor is raw format. > Luckily Kirill Thai proposed device mapper driver for QCOW2 format to attach > files as block devices: https://www.spinics.net/lists/kernel/msg4292965.html > > Also by using kernel modules we can bypass iothread limitation and finaly > scale > block requests with cpus for high-performance devices. This is planned to be > implemented in next version. > > Linux kernel module part: > https://lore.kernel.org/kvm/20220725202753.298725-1-andrey.zhadche...@virtuozzo.com/ > > test setups and results: > fio --direct=1 --rw=randread --bs=4k --ioengine=libaio --iodepth=128 > QEMU drive options: cache=none > filesystem: xfs > > SSD: >| randread, IOPS | randwrite, IOPS | > Host | 95.8k| 85.3k | > QEMU virtio| 57.5k| 79.4k | > QEMU vhost-blk | 95.6k| 84.3k | > > RAMDISK (vq == vcpu): > | randread, IOPS | randwrite, IOPS | > virtio, 1vcpu|123k | 129k | > virtio, 2vcpu|253k (??) | 250k (??) | > virtio, 4vcpu|158k | 154k | > vhost-blk, 1vcpu |110k | 113k | > vhost-blk, 2vcpu |247k | 252k | > vhost-blk, 4vcpu |576k | 567k | > > Andrey Zhadchenko (1): > block: add vhost-blk backend > > configure | 13 ++ > hw/block/Kconfig | 5 + > hw/block/meson.build | 1 + > hw/block/vhost-blk.c | 395 ++ > hw/virtio/meson.build | 1 + > hw/virtio/vhost-blk-pci.c | 102 + > include/hw/virtio/vhost-blk.h | 44 > linux-headers/linux/vhost.h | 3 + > 8 files changed, 564 insertions(+) > create mode 100644 hw/block/vhost-blk.c > create mode 100644 hw/virtio/vhost-blk-pci.c > create mode 100644 include/hw/virtio/vhost-blk.h vhost-blk has been tried several times in the past. That doesn't mean it cannot be merged this time, but past arguments should be addressed: - What makes it necessary to move the code into the kernel? In the past the performance results were not very convincing. The fastest implementations actually tend to be userspace NVMe PCI drivers that bypass the kernel! Bypassing the VFS and submitting block requests directly was not a huge boost. The syscall/context switch argument sounds okay but the numbers didn't really show that kernel block I/O is much faster than userspace block I/O. I've asked for more details on the QEMU command-line to understand what your numbers show. Maybe something has changed since previous times when vhost-blk has been tried. The only argument I see is QEMU's current 1 IOThread per virtio-blk device limitation, which is currently being worked on. If that's the only reason for vhost-blk then is it worth doing all the work of getting vhost-blk shipped (kernel, QEMU, and libvirt changes)? It seems like a short-term solution. - The security impact of bugs in kernel vhost-blk code is more serious than bugs in a QEMU userspace process. - The management stack needs to be changed to use vhost-blk whereas QEMU can be optimized without affecting other layers. Stefan signature.asc Description: PGP signature
Re: [RFC patch 0/1] block: vhost-blk backend
On Mon, Jul 25, 2022 at 11:55:26PM +0300, Andrey Zhadchenko wrote: > Although QEMU virtio-blk is quite fast, there is still some room for > improvements. Disk latency can be reduced if we handle virito-blk requests > in host kernel so we avoid a lot of syscalls and context switches. > > The biggest disadvantage of this vhost-blk flavor is raw format. > Luckily Kirill Thai proposed device mapper driver for QCOW2 format to attach > files as block devices: https://www.spinics.net/lists/kernel/msg4292965.html > > Also by using kernel modules we can bypass iothread limitation and finaly > scale > block requests with cpus for high-performance devices. This is planned to be > implemented in next version. Hi Andrey, Do you have a new version of this patch series that uses multiple threads? I have been playing with vq-IOThread mapping in QEMU and would like to benchmark vhost-blk vs QEMU virtio-blk mq IOThreads: https://gitlab.com/stefanha/qemu/-/tree/virtio-blk-mq-iothread-prototype Thanks, Stefan signature.asc Description: PGP signature
[PULL 32/54] monitor: expose monitor_puts to rest of code
This helps us construct strings elsewhere before echoing to the monitor. It avoids having to jump through hoops like: monitor_printf(mon, "%s", s->str); It will be useful in following patches but for now convert all existing plain "%s" printfs to use the _puts api. Signed-off-by: Alex Bennée Reviewed-by: Richard Henderson Reviewed-by: Kevin Wolf Reviewed-by: Markus Armbruster Reviewed-by: Philippe Mathieu-Daudé Message-Id: <20220929114231.583801-33-alex.ben...@linaro.org> diff --git a/docs/devel/writing-monitor-commands.rst b/docs/devel/writing-monitor-commands.rst index 4aa2bb904d..2fefedcd98 100644 --- a/docs/devel/writing-monitor-commands.rst +++ b/docs/devel/writing-monitor-commands.rst @@ -716,7 +716,7 @@ message. Here's the implementation of the "info roms" HMP command:: if (hmp_handle_error(mon, err)) { return; } - monitor_printf(mon, "%s", info->human_readable_text); + monitor_puts(mon, info->human_readable_text); } Also, you have to add the function's prototype to the hmp.h file. diff --git a/include/monitor/monitor.h b/include/monitor/monitor.h index a4b40e8391..737e750670 100644 --- a/include/monitor/monitor.h +++ b/include/monitor/monitor.h @@ -31,6 +31,7 @@ void monitor_resume(Monitor *mon); int monitor_get_fd(Monitor *mon, const char *fdname, Error **errp); int monitor_fd_param(Monitor *mon, const char *fdname, Error **errp); +int monitor_puts(Monitor *mon, const char *str); int monitor_vprintf(Monitor *mon, const char *fmt, va_list ap) G_GNUC_PRINTF(2, 0); int monitor_printf(Monitor *mon, const char *fmt, ...) G_GNUC_PRINTF(2, 3); diff --git a/monitor/monitor-internal.h b/monitor/monitor-internal.h index caa2e90ef2..a2cdbbf646 100644 --- a/monitor/monitor-internal.h +++ b/monitor/monitor-internal.h @@ -174,7 +174,6 @@ extern int mon_refcount; extern HMPCommand hmp_cmds[]; -int monitor_puts(Monitor *mon, const char *str); void monitor_data_init(Monitor *mon, bool is_qmp, bool skip_flush, bool use_io_thread); void monitor_data_destroy(Monitor *mon); diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c index bfb3c043a0..939a520d17 100644 --- a/block/monitor/block-hmp-cmds.c +++ b/block/monitor/block-hmp-cmds.c @@ -638,16 +638,16 @@ static void print_block_info(Monitor *mon, BlockInfo *info, assert(!info || !info->has_inserted || info->inserted == inserted); if (info && *info->device) { -monitor_printf(mon, "%s", info->device); +monitor_puts(mon, info->device); if (inserted && inserted->has_node_name) { monitor_printf(mon, " (%s)", inserted->node_name); } } else { assert(info || inserted); -monitor_printf(mon, "%s", - inserted && inserted->has_node_name ? inserted->node_name - : info && info->has_qdev ? info->qdev - : ""); +monitor_puts(mon, + inserted && inserted->has_node_name ? inserted->node_name + : info && info->has_qdev ? info->qdev + : ""); } if (inserted) { diff --git a/hw/misc/mos6522.c b/hw/misc/mos6522.c index f9e646350e..fe38c44426 100644 --- a/hw/misc/mos6522.c +++ b/hw/misc/mos6522.c @@ -595,7 +595,7 @@ void hmp_info_via(Monitor *mon, const QDict *qdict) if (hmp_handle_error(mon, err)) { return; } -monitor_printf(mon, "%s", info->human_readable_text); +monitor_puts(mon, info->human_readable_text); } static const MemoryRegionOps mos6522_ops = { diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c index c6cd6f91dd..f90eea8d01 100644 --- a/monitor/hmp-cmds.c +++ b/monitor/hmp-cmds.c @@ -730,7 +730,7 @@ static void hmp_info_pci_device(Monitor *mon, const PciDeviceInfo *dev) monitor_printf(mon, ""); if (dev->class_info->has_desc) { -monitor_printf(mon, "%s", dev->class_info->desc); +monitor_puts(mon, dev->class_info->desc); } else { monitor_printf(mon, "Class %04" PRId64, dev->class_info->q_class); } @@ -2258,12 +2258,12 @@ static void print_stats_schema_value(Monitor *mon, StatsSchemaValue *value) if (unit && value->base == 10 && value->exponent >= -18 && value->exponent <= 18 && value->exponent % 3 == 0) { -monitor_printf(mon, "%s", si_prefix(value->exponent)); +monitor_puts(mon, si_prefix(value->exponent)); } else if (unit && value->base == 2 && value->exponent >= 0 && value->exponent <= 60 && value->exponent % 10 == 0) { -monitor_printf(mon, "%s", iec_binary_prefix(value->exponent)); +monitor_puts(mon, iec_binary_prefix(value->exponent)); } else if (value->exponent) { /* Use exponential notation and write the unit's English name */ monitor_printf(mon, "* %d^%d%s", @@ -2273,7 +2273,7 @@ static void print_stats_schema_value(Monitor *mon, StatsSchema
Re: [PATCH v2 00/13] ppc/e500: Add support for two types of flash, cleanup
Hey, On 10/3/22 18:27, Philippe Mathieu-Daudé wrote: Hi Daniel, On 3/10/22 22:31, Bernhard Beschow wrote: Cover letter: ~ This series adds support for -pflash and direct SD card access to the PPC e500 boards. The idea is to increase compatibility with "real" firmware images where only the bare minimum of drivers is compiled in. Bernhard Beschow (13): hw/ppc/meson: Allow e500 boards to be enabled separately hw/gpio/meson: Introduce dedicated config switch for hw/gpio/mpc8xxx docs/system/ppc/ppce500: Add heading for networking chapter hw/ppc/e500: Reduce usage of sysbus API hw/ppc/mpc8544ds: Rename wrongly named method hw/ppc/mpc8544ds: Add platform bus hw/ppc/e500: Remove if statement which is now always true This first part is mostly reviewed and can already go via your ppc-next queue. We're missing an ACK in patch 6/13: hw/ppc/mpc8544ds: Add platform bus I'll need some time to understand what's been doing there to provide my own R-b. Or you can toss a R-b there :D Thanks, Daniel hw/block/pflash_cfi01: Error out if device length isn't a power of two hw/ppc/e500: Implement pflash handling hw/sd/sdhci-internal: Unexport ESDHC defines hw/sd/sdhci: Rename ESDHC_* defines to USDHC_* hw/sd/sdhci: Implement Freescale eSDHC device model hw/ppc/e500: Add Freescale eSDHC to e500 boards This second part still need work. I can take it via the sdmmc-next queue. Regards, Phil.
Re: [PATCH v6] 9pfs: use GHashTable for fid table
On Dienstag, 4. Oktober 2022 12:41:21 CEST Linus Heckemann wrote: > The previous implementation would iterate over the fid table for > lookup operations, resulting in an operation with O(n) complexity on > the number of open files and poor cache locality -- for every open, > stat, read, write, etc operation. > > This change uses a hashtable for this instead, significantly improving > the performance of the 9p filesystem. The runtime of NixOS's simple > installer test, which copies ~122k files totalling ~1.8GiB from 9p, > decreased by a factor of about 10. > > Signed-off-by: Linus Heckemann > Reviewed-by: Philippe Mathieu-Daudé > Reviewed-by: Greg Kurz > Message-Id: <20220908112353.289267-1-...@sphalerite.org> > [CS: - Retain BUG_ON(f->clunked) in get_fid(). > - Add TODO comment in clunk_fid(). ] > Signed-off-by: Christian Schoenebeck > --- In general: LGTM now, but I will definitely go for some longer test runs before queuing this patch. Some minor side notes below ... > Christian Schoenebeck writes: > > Not a big deal, but I start thinking whether to keep BUG_ON() here as > > well. > > That would require using g_hash_table_lookup() here instead of > > g_hash_table_contains(). Not that I would insist. > > Done. > > > checkpatch.pl > > Fixed. > > > OK, I get it that you squashed your previous patch and that you ended up > > with this comment instead. But if you read the old comment version here, > > you'll notice that it describes the actual problem more accurately: > > especially that v9fs_reopen_fid() and put_fid() are the 2 functions that > > may cause the coroutine to "yield" here. That's an important information > > to be retained in this comment here as it's not obvious. > > Reworded to mention these functions explicitly. > > > I would not move this to the 2nd loop. I would still set the > > > > FID_NON_RECLAIMABLE flag here, for the same reasons that you are bumping > > the refcount already in the first loop below. > > Good point! Fixed. > > > ... that's not the same behaviour as before, is it? Old behaviour was to > > put> > > the respective (last) fid only on error. And this would now put all fids. > > Indeed it isn't the old behaviour, but I believe it's correct: since > before we only incremented the reference counter for each one when we > started reopening it. Now, we increment _all_ their refcounts, so we > need to put all of them back as well. Yeah, commented in v9fs_mark_fids_unreclaim() changes below ... > > Wasn't there something like GLIST_FOREACH()? Then you wouldn't need to > > add > > > > that variable. > > glib does provide g_list_foreach, but that requires a function > pointer. I'm not aware of a macro for this. I could change this to use > a QLIST (for which we do have a macro) instead of a GList if you > insist, but I'd default to leaving this as is. That's better handled by a future cleanup patch. Right now I find it prioritized to get this performance fix merged ASAP, as it provides a significant improvement for a lot of people. I have seen GLIST_FOREACH macros (without function pointer) in other projects, but I guess that macro was defined locally by those projects. > Thanks for your repeated reviews and patience! > > Linus > > > hw/9pfs/9p.c | 195 +-- > hw/9pfs/9p.h | 2 +- > 2 files changed, 113 insertions(+), 84 deletions(-) > > diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c > index aebadeaa03..729d3181e0 100644 > --- a/hw/9pfs/9p.c > +++ b/hw/9pfs/9p.c > @@ -256,7 +256,8 @@ static size_t v9fs_string_size(V9fsString *str) > } > > /* > - * returns 0 if fid got re-opened, 1 if not, < 0 on error */ > + * returns 0 if fid got re-opened, 1 if not, < 0 on error > + */ > static int coroutine_fn v9fs_reopen_fid(V9fsPDU *pdu, V9fsFidState *f) > { > int err = 1; > @@ -282,33 +283,32 @@ static V9fsFidState *coroutine_fn get_fid(V9fsPDU > *pdu, int32_t fid) V9fsFidState *f; > V9fsState *s = pdu->s; > > -QSIMPLEQ_FOREACH(f, &s->fid_list, next) { > +f = g_hash_table_lookup(s->fids, GINT_TO_POINTER(fid)); > +if (f) { > BUG_ON(f->clunked); > -if (f->fid == fid) { > -/* > - * Update the fid ref upfront so that > - * we don't get reclaimed when we yield > - * in open later. > - */ > -f->ref++; > -/* > - * check whether we need to reopen the > - * file. We might have closed the fd > - * while trying to free up some file > - * descriptors. > - */ > -err = v9fs_reopen_fid(pdu, f); > -if (err < 0) { > -f->ref--; > -return NULL; > -} > -/* > - * Mark the fid as referenced so that the LRU > - * reclaim won't close the file descriptor > - */ > -f->flags |= FID_REFERENCED; > -return f; > +
[PATCH v6] 9pfs: use GHashTable for fid table
The previous implementation would iterate over the fid table for lookup operations, resulting in an operation with O(n) complexity on the number of open files and poor cache locality -- for every open, stat, read, write, etc operation. This change uses a hashtable for this instead, significantly improving the performance of the 9p filesystem. The runtime of NixOS's simple installer test, which copies ~122k files totalling ~1.8GiB from 9p, decreased by a factor of about 10. Signed-off-by: Linus Heckemann Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Greg Kurz Message-Id: <20220908112353.289267-1-...@sphalerite.org> [CS: - Retain BUG_ON(f->clunked) in get_fid(). - Add TODO comment in clunk_fid(). ] Signed-off-by: Christian Schoenebeck --- Christian Schoenebeck writes: > Not a big deal, but I start thinking whether to keep BUG_ON() here as well. > That would require using g_hash_table_lookup() here instead of > g_hash_table_contains(). Not that I would insist. Done. > checkpatch.pl Fixed. > OK, I get it that you squashed your previous patch and that you ended up with > this comment instead. But if you read the old comment version here, you'll > notice that it describes the actual problem more accurately: especially that > v9fs_reopen_fid() and put_fid() are the 2 functions that may cause the > coroutine to "yield" here. That's an important information to be retained in > this comment here as it's not obvious. Reworded to mention these functions explicitly. > I would not move this to the 2nd loop. I would still set the > FID_NON_RECLAIMABLE flag here, for the same reasons that you are bumping the > refcount already in the first loop below. Good point! Fixed. > ... that's not the same behaviour as before, is it? Old behaviour was to put > the respective (last) fid only on error. And this would now put all fids. Indeed it isn't the old behaviour, but I believe it's correct: since before we only incremented the reference counter for each one when we started reopening it. Now, we increment _all_ their refcounts, so we need to put all of them back as well. > Wasn't there something like GLIST_FOREACH()? Then you wouldn't need to add > that variable. glib does provide g_list_foreach, but that requires a function pointer. I'm not aware of a macro for this. I could change this to use a QLIST (for which we do have a macro) instead of a GList if you insist, but I'd default to leaving this as is. Thanks for your repeated reviews and patience! Linus hw/9pfs/9p.c | 195 +-- hw/9pfs/9p.h | 2 +- 2 files changed, 113 insertions(+), 84 deletions(-) diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c index aebadeaa03..729d3181e0 100644 --- a/hw/9pfs/9p.c +++ b/hw/9pfs/9p.c @@ -256,7 +256,8 @@ static size_t v9fs_string_size(V9fsString *str) } /* - * returns 0 if fid got re-opened, 1 if not, < 0 on error */ + * returns 0 if fid got re-opened, 1 if not, < 0 on error + */ static int coroutine_fn v9fs_reopen_fid(V9fsPDU *pdu, V9fsFidState *f) { int err = 1; @@ -282,33 +283,32 @@ static V9fsFidState *coroutine_fn get_fid(V9fsPDU *pdu, int32_t fid) V9fsFidState *f; V9fsState *s = pdu->s; -QSIMPLEQ_FOREACH(f, &s->fid_list, next) { +f = g_hash_table_lookup(s->fids, GINT_TO_POINTER(fid)); +if (f) { BUG_ON(f->clunked); -if (f->fid == fid) { -/* - * Update the fid ref upfront so that - * we don't get reclaimed when we yield - * in open later. - */ -f->ref++; -/* - * check whether we need to reopen the - * file. We might have closed the fd - * while trying to free up some file - * descriptors. - */ -err = v9fs_reopen_fid(pdu, f); -if (err < 0) { -f->ref--; -return NULL; -} -/* - * Mark the fid as referenced so that the LRU - * reclaim won't close the file descriptor - */ -f->flags |= FID_REFERENCED; -return f; +/* + * Update the fid ref upfront so that + * we don't get reclaimed when we yield + * in open later. + */ +f->ref++; +/* + * check whether we need to reopen the + * file. We might have closed the fd + * while trying to free up some file + * descriptors. + */ +err = v9fs_reopen_fid(pdu, f); +if (err < 0) { +f->ref--; +return NULL; } +/* + * Mark the fid as referenced so that the LRU + * reclaim won't close the file descriptor + */ +f->flags |= FID_REFERENCED; +return f; } return NULL; } @@ -317,12 +317,11 @@ static V9fsFidState *alloc_fid(V9fsState *s, int32_t fid) { V9fsFidState *f; -QSIMPLEQ_FOREACH(f, &s->fid_list, next
Re: [PATCH v5 2/2] block: Refactor get_tmp_filename()
Am 02.10.2022 um 11:38 hat Bin Meng geschrieben: > Hi Kevin, > > On Fri, Sep 30, 2022 at 6:13 PM Kevin Wolf wrote: > > > > Am 28.09.2022 um 16:41 hat Bin Meng geschrieben: > > > From: Bin Meng > > > > > > At present there are two callers of get_tmp_filename() and they are > > > inconsistent. > > > > > > One does: > > > > > > /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */ > > > char *tmp_filename = g_malloc0(PATH_MAX + 1); > > > ... > > > ret = get_tmp_filename(tmp_filename, PATH_MAX + 1); > > > > > > while the other does: > > > > > > s->qcow_filename = g_malloc(PATH_MAX); > > > ret = get_tmp_filename(s->qcow_filename, PATH_MAX); > > > > > > As we can see different 'size' arguments are passed. There are also > > > platform specific implementations inside the function, and the use > > > of snprintf is really undesirable. > > > > > > The function name is also misleading. It creates a temporary file, > > > not just a filename. > > > > > > Refactor this routine by changing its name and signature to: > > > > > > char *create_tmp_file(Error **errp) > > > > > > and use g_file_open_tmp() for a consistent implementation. > > > > > > Signed-off-by: Bin Meng > > > --- > > > > > > Changes in v5: > > > - minor change in the commit message > > > - add some notes in the function comment block > > > - add g_autofree for tmp_filename > > > > > > Changes in v4: > > > - Rename the function to create_tmp_file() and take "Error **errp" as > > > a parameter, so that callers can pass errp all the way down to this > > > routine. > > > - Commit message updated to reflect the latest change > > > > > > Changes in v3: > > > - Do not use errno directly, instead still let get_tmp_filename() return > > > a negative number to indicate error > > > > > > Changes in v2: > > > - Use g_autofree and g_steal_pointer > > > > > > include/block/block_int-common.h | 2 +- > > > block.c | 45 > > > block/vvfat.c| 7 +++-- > > > 3 files changed, 20 insertions(+), 34 deletions(-) > > > > > > diff --git a/include/block/block_int-common.h > > > b/include/block/block_int-common.h > > > index 8947abab76..d7c0a7e96f 100644 > > > --- a/include/block/block_int-common.h > > > +++ b/include/block/block_int-common.h > > > @@ -1230,7 +1230,7 @@ static inline BlockDriverState *child_bs(BdrvChild > > > *child) > > > } > > > > > > int bdrv_check_request(int64_t offset, int64_t bytes, Error **errp); > > > -int get_tmp_filename(char *filename, int size); > > > +char *create_tmp_file(Error **errp); > > > void bdrv_parse_filename_strip_prefix(const char *filename, const char > > > *prefix, > > >QDict *options); > > > > > > diff --git a/block.c b/block.c > > > index 582c205307..bd3006d85d 100644 > > > --- a/block.c > > > +++ b/block.c > > > @@ -860,35 +860,25 @@ int bdrv_probe_geometry(BlockDriverState *bs, > > > HDGeometry *geo) > > > > > > /* > > > * Create a uniquely-named empty temporary file. > > > - * Return 0 upon success, otherwise a negative errno value. > > > + * Return the actual file name used upon success, otherwise NULL. > > > + * This string should be freed with g_free() when not needed any longer. > > > + * > > > + * Note: creating a temporary file for the caller to (re)open is > > > + * inherently racy. Use g_file_open_tmp() instead whenever practical. > > > */ > > > -int get_tmp_filename(char *filename, int size) > > > +char *create_tmp_file(Error **errp) > > > { > > > -#ifdef _WIN32 > > > -char temp_dir[MAX_PATH]; > > > -/* GetTempFileName requires that its output buffer (4th param) > > > - have length MAX_PATH or greater. */ > > > -assert(size >= MAX_PATH); > > > -return (GetTempPath(MAX_PATH, temp_dir) > > > -&& GetTempFileName(temp_dir, "qem", 0, filename) > > > > We're using different prefixes on Windows and on Linux. This patch > > unifies both paths to use the Linux name. Nobody should rely on the name > > of temporary files, so there is hope it won't break anything. > > > > > -? 0 : -GetLastError()); > > > -#else > > > +g_autofree char *name = NULL; > > > +g_autoptr(GError) err = NULL; > > > int fd; > > > -const char *tmpdir; > > > -tmpdir = getenv("TMPDIR"); > > > -if (!tmpdir) { > > > -tmpdir = "/var/tmp"; > > > -} > > > -if (snprintf(filename, size, "%s/vl.XX", tmpdir) >= size) { > > > -return -EOVERFLOW; > > > -} > > > -fd = mkstemp(filename); > > > + > > > +fd = g_file_open_tmp("vl.XX", &name, &err); > > > > This implicitly reverts commit 69bef7931e8, g_file_open_tmp() uses /tmp > > as the default instead of /var/tmp as this function does before the > > patch. > > Oops, thanks for the pointer. Commit message of 69bef7931e8 does not > explicitely explain why to change from /tmp to /var/tmp. Is that > because QEMU block codes write a hu