Re: [PATCH v2 1/2] file-posix: add the tracking of the zones wp

2022-10-04 Thread Damien Le Moal
On 10/5/22 10:44, Damien Le Moal wrote:
> On 9/29/22 18:31, Sam Li wrote:
>> Since Linux doesn't have a user API to issue zone append operations to
>> zoned devices from user space, the file-posix driver is modified to add
>> zone append emulation using regular writes. To do this, the file-posix
>> driver tracks the wp location of all zones of the device. It uses an
>> array of uint64_t. The most significant bit of each wp location indicates
>> if the zone type is sequential write required.
>>
>> The zones wp can be changed due to the following operations issued:
>> - zone reset: change the wp to the start offset of that zone
>> - zone finish: change to the end location of that zone
>> - write to a zone
>> - zone append
>>
>> Signed-off-by: Sam Li 
>> ---
>>  block/file-posix.c   | 138 ++-
>>  include/block/block-common.h |  16 
>>  include/block/block_int-common.h |   5 ++
>>  include/block/raw-aio.h  |   4 +-
>>  4 files changed, 159 insertions(+), 4 deletions(-)
>>
>> diff --git a/block/file-posix.c b/block/file-posix.c
>> index 73656d87f2..33e81ac112 100755
>> --- a/block/file-posix.c
>> +++ b/block/file-posix.c
>> @@ -206,6 +206,8 @@ typedef struct RawPosixAIOData {
>>  struct {
>>  struct iovec *iov;
>>  int niov;
>> +int64_t *append_sector;
>> +BlockZoneWps *wps;
>>  } io;
>>  struct {
>>  uint64_t cmd;
>> @@ -1332,6 +1334,59 @@ static int hdev_get_max_segments(int fd, struct stat 
>> *st) {
>>  #endif
>>  }
>>  
>> +#if defined(CONFIG_BLKZONED)
>> +static int report_zone_wp(int64_t offset, int fd, BlockZoneWps *wps,
>> +  unsigned int nrz) {
> 
> Maybe rename this to get_zones_wp() ?
> 
>> +struct blk_zone *blkz;
>> +int64_t rep_size;
>> +int64_t sector = offset >> BDRV_SECTOR_BITS;
>> +int ret, n = 0, i = 0;
>> +
>> +rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct 
>> blk_zone);
>> +g_autofree struct blk_zone_report *rep = NULL;
> 
> To be cleaner, move this declaration above with the others ?
> 
>> +rep = g_malloc(rep_size);
>> +
>> +blkz = (struct blk_zone *)(rep + 1);
>> +while (n < nrz) {
>> +memset(rep, 0, rep_size);
>> +rep->sector = sector;
>> +rep->nr_zones = nrz - n;
>> +
>> +do {
>> +ret = ioctl(fd, BLKREPORTZONE, rep);
>> +} while (ret != 0 && errno == EINTR);
>> +if (ret != 0) {
>> +error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
>> +fd, offset, errno);
>> +return -errno;
>> +}
>> +
>> +if (!rep->nr_zones) {
>> +break;
>> +}
>> +
>> +for (i = 0; i < rep->nr_zones; i++, n++) {
>> +wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS;
>> +sector = blkz[i].start + blkz[i].len;
>> +
>> +/*
>> + * In the wp tracking, it only cares if the zone type is 
>> sequential
>> + * writes required so that the wp can advance to the right 
>> location.
> 
> Or sequential write preferred (host aware case)
> 
>> + * Instead of the type of zone_type which is an 8-bit unsigned
>> + * integer, use the first most significant bits of the wp 
>> location
>> + * to indicate the zone type: 0 for SWR zones and 1 for the
>> + * others.
>> + */
>> +if (!(blkz[i].type & BLK_ZONE_TYPE_SEQWRITE_REQ)) {
> 
> This should be:
> 
>   if (blkz[i].type != BLK_ZONE_TYPE_CONVENTIONAL) {
> 
> Note that the type field is not a bit-field. So you must compare values
> instead of doing bit operations.
> 
>> +wps->wp[i] += (uint64_t)1 << 63;
> 
> You can simplify this:
> 
>  wps->wp[i] |= 1ULL << 63;
> 
> Overall, I would rewrite this like this:
> 
> for (i = 0; i < rep->nr_zones; i++, n++) {
> /*
>  * The wp tracking cares only about sequential write required
>  * and sequential write preferred zones so that the wp can
>  * advance to the right location.
>  * Use the most significant bit of the wp location
>  * to indicate the zone type: 0 for SWR zones and 1 for
>  * conventional zones.
>  */
> if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) {
> wps->wp[i] = 1ULL << 63;
> else
> wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS;
> }
> sector = blkz[i - 1].start + blkz[i - 1].len;
> 
> Which I think is a lot simpler.
> 
>> +}
>> +}
>> +}
>> +
>> +return 0;
>> +}
>> +#endif
>> +
>>  static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
>>  {
>>  BDRVRawState *s = bs->opaque;
>> @@ -1415,6 +1470,20 @@ static void raw_refresh_limits(BlockDriverState *bs, 
>> Error **errp)
>>  error_report("Invalid device capacity %" PRId64 " bytes ", 
>> bs->bl.capacity);
>>  return;
>>  

Re: [PATCH v2 2/2] block: introduce zone append write for zoned devices

2022-10-04 Thread Damien Le Moal
On 9/29/22 18:31, Sam Li wrote:
> A zone append command is a write operation that specifies the first
> logical block of a zone as the write position. When writing to a zoned
> block device using zone append, the byte offset of the write is pointing
> to the write pointer of that zone. Upon completion the device will
> respond with the position the data has been written in the zone.
> 
> Signed-off-by: Sam Li 
> ---
>  block/block-backend.c  | 65 ++
>  block/file-posix.c | 51 +++
>  block/io.c | 21 ++
>  block/raw-format.c |  7 
>  include/block/block-io.h   |  3 ++
>  include/block/block_int-common.h   |  3 ++
>  include/sysemu/block-backend-io.h  |  9 +
>  qemu-io-cmds.c | 62 
>  tests/qemu-iotests/tests/zoned.out |  7 
>  tests/qemu-iotests/tests/zoned.sh  |  9 +
>  10 files changed, 237 insertions(+)
> 
> diff --git a/block/block-backend.c b/block/block-backend.c
> index f7f7acd6f4..07a8632af1 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -1439,6 +1439,9 @@ typedef struct BlkRwCo {
>  struct {
>  BlockZoneOp op;
>  } zone_mgmt;
> +struct {
> +int64_t *append_sector;
> +} zone_append;
>  };
>  } BlkRwCo;
>  
> @@ -1869,6 +1872,47 @@ BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, 
> BlockZoneOp op,
>  return &acb->common;
>  }
>  
> +static void blk_aio_zone_append_entry(void *opaque) {
> +BlkAioEmAIOCB *acb = opaque;
> +BlkRwCo *rwco = &acb->rwco;
> +
> +rwco->ret = blk_co_zone_append(rwco->blk, 
> rwco->zone_append.append_sector,
> +   rwco->iobuf, rwco->flags);
> +blk_aio_complete(acb);
> +}
> +
> +BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
> +QEMUIOVector *qiov, BdrvRequestFlags flags,
> +BlockCompletionFunc *cb, void *opaque) {
> +BlkAioEmAIOCB *acb;
> +Coroutine *co;
> +IO_CODE();
> +
> +blk_inc_in_flight(blk);
> +acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
> +acb->rwco = (BlkRwCo) {
> +.blk= blk,
> +.ret= NOT_DONE,
> +.flags  = flags,
> +.iobuf  = qiov,
> +.zone_append = {
> +.append_sector = offset,
> +},
> +};
> +acb->has_returned = false;
> +
> +co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
> +bdrv_coroutine_enter(blk_bs(blk), co);
> +
> +acb->has_returned = true;
> +if (acb->rwco.ret != NOT_DONE) {
> +replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
> + blk_aio_complete_bh, acb);
> +}
> +
> +return &acb->common;
> +}
> +
>  /*
>   * Send a zone_report command.
>   * offset is a byte offset from the start of the device. No alignment
> @@ -1921,6 +1965,27 @@ int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, 
> BlockZoneOp op,
>  return ret;
>  }
>  
> +/*
> + * Send a zone_append command.
> + */
> +int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
> +QEMUIOVector *qiov, BdrvRequestFlags flags)
> +{
> +int ret;
> +IO_CODE();
> +
> +blk_inc_in_flight(blk);
> +blk_wait_while_drained(blk);
> +if (!blk_is_available(blk)) {
> +blk_dec_in_flight(blk);
> +return -ENOMEDIUM;
> +}
> +
> +ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
> +blk_dec_in_flight(blk);
> +return ret;
> +}
> +
>  void blk_drain(BlockBackend *blk)
>  {
>  BlockDriverState *bs = blk_bs(blk);
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 33e81ac112..24b70f1afe 100755
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -3454,6 +3454,56 @@ static int coroutine_fn 
> raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
>  #endif
>  }
>  
> +

whiteline change.

> +static int coroutine_fn raw_co_zone_append(BlockDriverState *bs,
> +   int64_t *offset,
> +   QEMUIOVector *qiov,
> +   BdrvRequestFlags flags) {
> +#if defined(CONFIG_BLKZONED)
> +BDRVRawState *s = bs->opaque;
> +int64_t zone_size_mask = bs->bl.zone_size - 1;
> +int64_t iov_len = 0;
> +int64_t len = 0;
> +RawPosixAIOData acb;
> +
> +if (*offset & zone_size_mask) {
> +error_report("sector offset %" PRId64 " is not aligned to zone size "
> + "%" PRId32 "", *offset / 512, bs->bl.zone_size / 512);
> +return -EINVAL;
> +}
> +
> +int64_t wg = bs->bl.write_granularity;
> +int64_t wg_mask = wg - 1;
> +for (int i = 0; i < qiov->niov; i++) {
> +   iov_len = qiov->iov[i].iov_len;
> +   if (iov_len & wg_mask) {
> +   erro

Re: [PATCH v2 1/2] file-posix: add the tracking of the zones wp

2022-10-04 Thread Damien Le Moal
On 9/29/22 18:31, Sam Li wrote:
> Since Linux doesn't have a user API to issue zone append operations to
> zoned devices from user space, the file-posix driver is modified to add
> zone append emulation using regular writes. To do this, the file-posix
> driver tracks the wp location of all zones of the device. It uses an
> array of uint64_t. The most significant bit of each wp location indicates
> if the zone type is sequential write required.
> 
> The zones wp can be changed due to the following operations issued:
> - zone reset: change the wp to the start offset of that zone
> - zone finish: change to the end location of that zone
> - write to a zone
> - zone append
> 
> Signed-off-by: Sam Li 
> ---
>  block/file-posix.c   | 138 ++-
>  include/block/block-common.h |  16 
>  include/block/block_int-common.h |   5 ++
>  include/block/raw-aio.h  |   4 +-
>  4 files changed, 159 insertions(+), 4 deletions(-)
> 
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 73656d87f2..33e81ac112 100755
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -206,6 +206,8 @@ typedef struct RawPosixAIOData {
>  struct {
>  struct iovec *iov;
>  int niov;
> +int64_t *append_sector;
> +BlockZoneWps *wps;
>  } io;
>  struct {
>  uint64_t cmd;
> @@ -1332,6 +1334,59 @@ static int hdev_get_max_segments(int fd, struct stat 
> *st) {
>  #endif
>  }
>  
> +#if defined(CONFIG_BLKZONED)
> +static int report_zone_wp(int64_t offset, int fd, BlockZoneWps *wps,
> +  unsigned int nrz) {

Maybe rename this to get_zones_wp() ?

> +struct blk_zone *blkz;
> +int64_t rep_size;
> +int64_t sector = offset >> BDRV_SECTOR_BITS;
> +int ret, n = 0, i = 0;
> +
> +rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct 
> blk_zone);
> +g_autofree struct blk_zone_report *rep = NULL;

To be cleaner, move this declaration above with the others ?

> +rep = g_malloc(rep_size);
> +
> +blkz = (struct blk_zone *)(rep + 1);
> +while (n < nrz) {
> +memset(rep, 0, rep_size);
> +rep->sector = sector;
> +rep->nr_zones = nrz - n;
> +
> +do {
> +ret = ioctl(fd, BLKREPORTZONE, rep);
> +} while (ret != 0 && errno == EINTR);
> +if (ret != 0) {
> +error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
> +fd, offset, errno);
> +return -errno;
> +}
> +
> +if (!rep->nr_zones) {
> +break;
> +}
> +
> +for (i = 0; i < rep->nr_zones; i++, n++) {
> +wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS;
> +sector = blkz[i].start + blkz[i].len;
> +
> +/*
> + * In the wp tracking, it only cares if the zone type is 
> sequential
> + * writes required so that the wp can advance to the right 
> location.

Or sequential write preferred (host aware case)

> + * Instead of the type of zone_type which is an 8-bit unsigned
> + * integer, use the first most significant bits of the wp 
> location
> + * to indicate the zone type: 0 for SWR zones and 1 for the
> + * others.
> + */
> +if (!(blkz[i].type & BLK_ZONE_TYPE_SEQWRITE_REQ)) {

This should be:

if (blkz[i].type != BLK_ZONE_TYPE_CONVENTIONAL) {

Note that the type field is not a bit-field. So you must compare values
instead of doing bit operations.

> +wps->wp[i] += (uint64_t)1 << 63;

You can simplify this:

   wps->wp[i] |= 1ULL << 63;

Overall, I would rewrite this like this:

for (i = 0; i < rep->nr_zones; i++, n++) {
/*
 * The wp tracking cares only about sequential write required
 * and sequential write preferred zones so that the wp can
 * advance to the right location.
 * Use the most significant bit of the wp location
 * to indicate the zone type: 0 for SWR zones and 1 for
 * conventional zones.
 */
if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) {
wps->wp[i] = 1ULL << 63;
else
wps->wp[i] = blkz[i].wp << BDRV_SECTOR_BITS;
}
sector = blkz[i - 1].start + blkz[i - 1].len;

Which I think is a lot simpler.

> +}
> +}
> +}
> +
> +return 0;
> +}
> +#endif
> +
>  static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
>  {
>  BDRVRawState *s = bs->opaque;
> @@ -1415,6 +1470,20 @@ static void raw_refresh_limits(BlockDriverState *bs, 
> Error **errp)
>  error_report("Invalid device capacity %" PRId64 " bytes ", 
> bs->bl.capacity);
>  return;
>  }
> +
> +ret = get_sysfs_long_val(&st, "physical_block_size");
> +if (ret >= 0) {
> +bs->bl.write_granularity = ret;
> +}

This change seems unrelated to the wp tracking. Should this be mo

Re: [PATCH v2 13/13] hw/ppc/e500: Add Freescale eSDHC to e500 boards

2022-10-04 Thread Bernhard Beschow
Am 3. Oktober 2022 21:06:57 UTC schrieb "Philippe Mathieu-Daudé" 
:
>On 3/10/22 22:31, Bernhard Beschow wrote:
>> Adds missing functionality to emulated e500 SOCs which increases the
>> chance of given "real" firmware images to access SD cards.
>> 
>> Signed-off-by: Bernhard Beschow 
>> ---
>>   docs/system/ppc/ppce500.rst | 13 +
>>   hw/ppc/Kconfig  |  1 +
>>   hw/ppc/e500.c   | 31 ++-
>>   3 files changed, 44 insertions(+), 1 deletion(-)
>
>> +static void dt_sdhc_create(void *fdt, const char *parent, const char *mpic)
>> +{
>> +hwaddr mmio = MPC85XX_ESDHC_REGS_OFFSET;
>> +hwaddr size = MPC85XX_ESDHC_REGS_SIZE;
>> +int irq = MPC85XX_ESDHC_IRQ;
>
>Why not pass these 3 variable as argument?

In anticipation of data-driven board creation, I'd ideally infer those from the 
device's QOM properties. This seems similar to what Mark suggested in the BoF 
at KVM Forum [1], where -- IIUC -- he stated that QOM properties could be the 
foundation of all wiring representations. And device tree seems just like one 
specialized representation to me. (Note that I'm slightly hijacking the review 
here because I don't know where and how to express these thoughts elsewhere).

Does it make sense to add the missing properties here?

Best regards,
Bernhard

[1] https://etherpad.opendev.org/p/qemu-emulation-bof%40kvmforum2022

>
>> +g_autofree char *name = NULL;
>> +
>> +name = g_strdup_printf("%s/sdhc@%" PRIx64, parent, mmio);
>> +qemu_fdt_add_subnode(fdt, name);
>> +qemu_fdt_setprop(fdt, name, "sdhci,auto-cmd12", NULL, 0);
>> +qemu_fdt_setprop_phandle(fdt, name, "interrupt-parent", mpic);
>> +qemu_fdt_setprop_cells(fdt, name, "bus-width", 4);
>> +qemu_fdt_setprop_cells(fdt, name, "interrupts", irq, 0x2);
>> +qemu_fdt_setprop_cells(fdt, name, "reg", mmio, size);
>> +qemu_fdt_setprop_string(fdt, name, "compatible", "fsl,esdhc");
>> +}
>> typedef struct PlatformDevtreeData {
>>   void *fdt;
>> @@ -553,6 +573,8 @@ static int ppce500_load_device_tree(PPCE500MachineState 
>> *pms,
>> dt_rtc_create(fdt, "i2c", "rtc");
>>   +/* sdhc */
>> +dt_sdhc_create(fdt, soc, mpic);
>>   



Re: [PATCH v2 09/13] hw/ppc/e500: Implement pflash handling

2022-10-04 Thread Bernhard Beschow
Am 3. Oktober 2022 21:21:15 UTC schrieb "Philippe Mathieu-Daudé" 
:
>On 3/10/22 22:31, Bernhard Beschow wrote:
>> Allows e500 boards to have their root file system reside on flash using
>> only builtin devices located in the eLBC memory region.
>> 
>> Note that the flash memory area is only created when a -pflash argument is
>> given, and that the size is determined by the given file. The idea is to
>> put users into control.
>> 
>> Signed-off-by: Bernhard Beschow 
>> ---
>>   docs/system/ppc/ppce500.rst | 12 ++
>>   hw/ppc/Kconfig  |  1 +
>>   hw/ppc/e500.c   | 76 +
>>   3 files changed, 89 insertions(+)
>
>> @@ -856,6 +892,7 @@ void ppce500_init(MachineState *machine)
>>   unsigned int pci_irq_nrs[PCI_NUM_PINS] = {1, 2, 3, 4};
>>   IrqLines *irqs;
>>   DeviceState *dev, *mpicdev;
>> +DriveInfo *dinfo;
>>   CPUPPCState *firstenv = NULL;
>>   MemoryRegion *ccsr_addr_space;
>>   SysBusDevice *s;
>> @@ -1024,6 +1061,45 @@ void ppce500_init(MachineState *machine)
>>   pmc->platform_bus_base,
>>   &pms->pbus_dev->mmio);
>>   +dinfo = drive_get(IF_PFLASH, 0, 0);
>> +if (dinfo) {
>> +BlockBackend *blk = blk_by_legacy_dinfo(dinfo);
>> +BlockDriverState *bs = blk_bs(blk);
>> +uint64_t size = bdrv_getlength(bs);
>> +uint64_t mmio_size = pms->pbus_dev->mmio.size;
>> +uint32_t sector_len = 64 * KiB;
>> +
>> +if (ctpop64(size) != 1) {
>> +error_report("Size of pflash file must be a power of two.");
>
>This is a PFLASH restriction (which you already fixed in the previous
>patch), not a board one.

I agree that this check seems redundant to the one in cfi01. I added this one 
for clearer error messages since cfi01 only complains about the "device size" 
not being a power of two while this message at least gives a hint towards the 
source of the problem (the file given in the pflash option).

Usually the size of the pflash area is hardcoded in the board while I choose to 
derive it from the size of the backing file in order to avoid hardcoding it. My 
idea is to put users into control by offering more flexibility.

>
>> +exit(1);
>> +}
>> +
>> +if (size > mmio_size) {
>> +error_report("Size of pflash file must not be bigger than %" 
>> PRIu64
>> + " bytes.", mmio_size);
>
>There is no hardware limitation here, you can wire flash bigger than the
>memory aperture. What is above the aperture will simply be ignored.
>
>Should we display a warning here instead of a fatal error?

While this is technically possible, is that what users would expect? Couldn't 
we just require users to truncate their files if they really want the 
"aperture" behavior?

>
>> +exit(1);
>> +}
>> +
>> +assert(QEMU_IS_ALIGNED(size, sector_len));
>
>Similarly, this doesn't seem a problem the board code should worry
>about: better to defer it to PFLASH realize().

The reason for the assert() here is that size isn't stored directly in the 
cfi01 device. Instead, it must be calculated by the properties "num-blocks" 
times "sector-length". For this to work, size must be divisible by sector_len 
without remainder, which is checked by the assertion.

We could theoretically add a "size" property which would violate the single 
point of truth principle, though. Do you see a different solution?

Best regards,
Bernhard

>
>> +dev = qdev_new(TYPE_PFLASH_CFI01);
>> +qdev_prop_set_drive(dev, "drive", blk);
>> +qdev_prop_set_uint32(dev, "num-blocks", size / sector_len);
>> +qdev_prop_set_uint64(dev, "sector-length", sector_len);
>> +qdev_prop_set_uint8(dev, "width", 2);
>> +qdev_prop_set_bit(dev, "big-endian", true);
>> +qdev_prop_set_uint16(dev, "id0", 0x89);
>> +qdev_prop_set_uint16(dev, "id1", 0x18);
>> +qdev_prop_set_uint16(dev, "id2", 0x);
>> +qdev_prop_set_uint16(dev, "id3", 0x0);
>> +qdev_prop_set_string(dev, "name", "e500.flash");
>> +s = SYS_BUS_DEVICE(dev);
>> +sysbus_realize_and_unref(s, &error_fatal);
>> +
>> +memory_region_add_subregion(&pms->pbus_dev->mmio, 0,
>> +sysbus_mmio_get_region(s, 0));
>> +}
>> +
>>   /*
>>* Smart firmware defaults ahead!
>>*
>




Re: [PATCH] spec: Add NBD_OPT_EXTENDED_HEADERS

2022-10-04 Thread Eric Blake
On Fri, Dec 03, 2021 at 05:14:34PM -0600, Eric Blake wrote:
> Add a new negotiation feature where the client and server agree to use
> larger packet headers on every packet sent during transmission phase.
> This has two purposes: first, it makes it possible to perform
> operations like trim, write zeroes, and block status on more than 2^32
> bytes in a single command; this in turn requires that some structured
> replies from the server also be extended to match.  The wording chosen
> here is careful to permit a server to use either flavor in its reply
> (that is, a request less than 32-bits can trigger an extended reply,
> and conversely a request larger than 32-bits can trigger a compact
> reply).

Following up on this original proposal with something that came out of
KVM Forum this year.

> +* `NBD_REPLY_TYPE_BLOCK_STATUS_EXT` (6)
> +
> +  This chunk type is in the status chunk category.  *length* MUST be
> +  4 + (a positive multiple of 16).  The semantics of this chunk mirror
> +  those of `NBD_REPLY_TYPE_BLOCK_STATUS`, other than the use of a
> +  larger *extent length* field, as well as added padding to ease
> +  alignment.  This chunk type MUST NOT be used unless extended headers
> +  were negotiated with `NBD_OPT_EXTENDED_HEADERS`.
> +
> +  The payload starts with:
> +
> +  32 bits, metadata context ID  
> +
> +  and is followed by a list of one or more descriptors, each with this
> +  layout:
> +
> +  64 bits, length of the extent to which the status below
> + applies (unsigned, MUST be nonzero)  
> +  32 bits, status flags  
> +  32 bits, padding (MUST be zero)

During KVM Forum, I had several conversations about Zoned Block
Devices (https://zonedstorage.io/docs/linux/zbd-api), and what it
would take to expose ZBD information over NBD.  In particular,
NBD_CMD_BLOCK_STATUS sounds like a great way for advertising
information about zones (by adding several metadata contexts that can
be negotiated during NBD_OPT_SET_META_CONTEXT), except for the fact
that a zone might be larger than 32 bits in size.  So Rich Jones asked
me the question of whether my work on 64-bit extensions to the NBD
protocol could also allow for a server to advertise a metadata context
only to clients that support 64-bit extensions, at which point it can
report 64-bit offsets or lengths as needed, rather than being limited
to 32-bit status flags.

The idea definitely has merit, so I'm working on incorporating that
into my next revision for 64-bit extensions in NBD.

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org




Re: [RFC patch 0/1] block: vhost-blk backend

2022-10-04 Thread Stefan Hajnoczi
On Mon, Jul 25, 2022 at 11:55:26PM +0300, Andrey Zhadchenko wrote:
> Although QEMU virtio-blk is quite fast, there is still some room for
> improvements. Disk latency can be reduced if we handle virito-blk requests
> in host kernel so we avoid a lot of syscalls and context switches.
> 
> The biggest disadvantage of this vhost-blk flavor is raw format.
> Luckily Kirill Thai proposed device mapper driver for QCOW2 format to attach
> files as block devices: https://www.spinics.net/lists/kernel/msg4292965.html
> 
> Also by using kernel modules we can bypass iothread limitation and finaly 
> scale
> block requests with cpus for high-performance devices. This is planned to be
> implemented in next version.
> 
> Linux kernel module part:
> https://lore.kernel.org/kvm/20220725202753.298725-1-andrey.zhadche...@virtuozzo.com/
> 
> test setups and results:
> fio --direct=1 --rw=randread  --bs=4k  --ioengine=libaio --iodepth=128

> QEMU drive options: cache=none
> filesystem: xfs

Please post the full QEMU command-line so it's clear exactly what this
is benchmarking.

A preallocated raw image file is a good baseline with:

  --object iothread,id=iothread0 \
  --blockdev file,filename=test.img,cache.direct=on,aio=native,node-name=drive0 
\
  --device virtio-blk-pci,drive=drive0,iothread=iothread0

(BTW QEMU's default vq size is 256 descriptors and the number of vqs is
the number of vCPUs.)

> 
> SSD:
>| randread, IOPS  | randwrite, IOPS |
> Host   |  95.8k|  85.3k  |
> QEMU virtio|  57.5k|  79.4k  |
> QEMU vhost-blk |  95.6k|  84.3k  |
> 
> RAMDISK (vq == vcpu):

With fio numjobs=vcpu here?

>  | randread, IOPS | randwrite, IOPS |
> virtio, 1vcpu|123k  |  129k   |
> virtio, 2vcpu|253k (??) |  250k (??)  |

QEMU's aio=threads (default) gets around the single IOThread. It beats
aio=native for this reason in some cases. Were you using aio=native or
aio=threads?

> virtio, 4vcpu|158k  |  154k   |
> vhost-blk, 1vcpu |110k  |  113k   |
> vhost-blk, 2vcpu |247k  |  252k   |


signature.asc
Description: PGP signature


Re: [RFC PATCH 1/1] block: add vhost-blk backend

2022-10-04 Thread Stefan Hajnoczi
On Mon, Jul 25, 2022 at 11:55:27PM +0300, Andrey Zhadchenko wrote:
> Although QEMU virtio is quite fast, there is still some room for
> improvements. Disk latency can be reduced if we handle virito-blk requests
> in host kernel istead of passing them to QEMU. The patch adds vhost-blk
> backend which sets up vhost-blk kernel module to process requests.
> 
> test setup and results:
> fio --direct=1 --rw=randread  --bs=4k  --ioengine=libaio --iodepth=128
> QEMU drive options: cache=none
> filesystem: xfs
> 
> SSD:
>| randread, IOPS  | randwrite, IOPS |
> Host   |  95.8k|  85.3k  |
> QEMU virtio|  57.5k|  79.4k  |
> QEMU vhost-blk |  95.6k|  84.3k  |
> 
> RAMDISK (vq == vcpu):
>  | randread, IOPS | randwrite, IOPS |
> virtio, 1vcpu|123k  |  129k   |
> virtio, 2vcpu|253k (??) |  250k (??)  |
> virtio, 4vcpu|158k  |  154k   |
> vhost-blk, 1vcpu |110k  |  113k   |
> vhost-blk, 2vcpu |247k  |  252k   |
> vhost-blk, 4vcpu |576k  |  567k   |
> 
> Signed-off-by: Andrey Zhadchenko 
> ---
>  hw/block/Kconfig  |   5 +
>  hw/block/meson.build  |   4 +
>  hw/block/vhost-blk.c  | 394 ++
>  hw/virtio/meson.build |   3 +
>  hw/virtio/vhost-blk-pci.c | 102 +
>  include/hw/virtio/vhost-blk.h |  50 +
>  meson.build   |   5 +
>  7 files changed, 563 insertions(+)
>  create mode 100644 hw/block/vhost-blk.c
>  create mode 100644 hw/virtio/vhost-blk-pci.c
>  create mode 100644 include/hw/virtio/vhost-blk.h
> 
> diff --git a/hw/block/Kconfig b/hw/block/Kconfig
> index 9e8f28f982..b4286ad10e 100644
> --- a/hw/block/Kconfig
> +++ b/hw/block/Kconfig
> @@ -36,6 +36,11 @@ config VIRTIO_BLK
>  default y
>  depends on VIRTIO
>  
> +config VHOST_BLK
> +bool
> +default n

Feel free to enable it by default. That way it gets more CI/build
coverage.

> +depends on VIRTIO && LINUX
> +
>  config VHOST_USER_BLK
>  bool
>  # Only PCI devices are provided for now
> diff --git a/hw/block/meson.build b/hw/block/meson.build
> index 2389326112..caf9bedff3 100644
> --- a/hw/block/meson.build
> +++ b/hw/block/meson.build
> @@ -19,4 +19,8 @@ softmmu_ss.add(when: 'CONFIG_TC58128', if_true: 
> files('tc58128.c'))
>  specific_ss.add(when: 'CONFIG_VIRTIO_BLK', if_true: files('virtio-blk.c'))
>  specific_ss.add(when: 'CONFIG_VHOST_USER_BLK', if_true: 
> files('vhost-user-blk.c'))
>  
> +if have_vhost_blk
> +  specific_ss.add(files('vhost-blk.c'))
> +endif

Can this use the same add(when: 'CONFIG_VHOST_BLK', ...) syntax as the
other conditional builds above?

> +
>  subdir('dataplane')
> diff --git a/hw/block/vhost-blk.c b/hw/block/vhost-blk.c
> new file mode 100644
> index 00..33d90af270
> --- /dev/null
> +++ b/hw/block/vhost-blk.c
> @@ -0,0 +1,394 @@
> +/*
> + * Copyright (c) 2022 Virtuozzo International GmbH.
> + * Author: Andrey Zhadchenko 
> + *
> + * vhost-blk is host kernel accelerator for virtio-blk.
> + *
> + * This work is licensed under the terms of the GNU LGPL, version 2 or later.
> + * See the COPYING.LIB file in the top-level directory.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qapi/error.h"
> +#include "qemu/error-report.h"
> +#include "qom/object.h"
> +#include "hw/qdev-core.h"
> +#include "hw/boards.h"
> +#include "hw/virtio/vhost.h"
> +#include "hw/virtio/vhost-blk.h"
> +#include "hw/virtio/virtio.h"
> +#include "hw/virtio/virtio-blk.h"
> +#include "hw/virtio/virtio-bus.h"
> +#include "hw/virtio/virtio-access.h"
> +#include "hw/virtio/virtio-pci.h"
> +#include "sysemu/sysemu.h"
> +#include "linux-headers/linux/vhost.h"
> +#include 
> +#include 
> +
> +static int vhost_blk_start(VirtIODevice *vdev)
> +{
> +VHostBlk *s = VHOST_BLK(vdev);
> +struct vhost_vring_file backend;
> +int ret, i;
> +int *fd = blk_bs(s->conf.conf.blk)->file->bs->opaque;

This needs a clean API so vhost-blk.c doesn't make assumptions about the
file-posix BlockDriver's internal state memory layout.

> +BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
> +VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
> +
> +if (!k->set_guest_notifiers) {
> +error_report("vhost-blk: binding does not support guest notifiers");
> +return -ENOSYS;
> +}
> +
> +if (s->vhost_started) {
> +return 0;
> +}
> +
> +if (ioctl(s->vhostfd, VHOST_SET_OWNER, NULL)) {
> +error_report("vhost-blk: unable to set owner");
> +return -ENOSYS;
> +}
> +
> +ret = vhost_dev_enable_notifiers(&s->dev, vdev);
> +if (ret < 0) {
> +error_report("vhost-blk: unable to enable dev notifiers", errno);
> +return ret;
> +}
> +
> +s->dev.acked_features = vdev->guest_features & s->dev.backend_features;
> +
> +ret = vhost_dev_start(&s->dev, vdev);
> +if (ret < 0) {
> +

Re: [RFC patch 0/1] block: vhost-blk backend

2022-10-04 Thread Stefan Hajnoczi
On Mon, Jul 25, 2022 at 11:55:26PM +0300, Andrey Zhadchenko wrote:
> Although QEMU virtio-blk is quite fast, there is still some room for
> improvements. Disk latency can be reduced if we handle virito-blk requests
> in host kernel so we avoid a lot of syscalls and context switches.
> 
> The biggest disadvantage of this vhost-blk flavor is raw format.
> Luckily Kirill Thai proposed device mapper driver for QCOW2 format to attach
> files as block devices: https://www.spinics.net/lists/kernel/msg4292965.html
> 
> Also by using kernel modules we can bypass iothread limitation and finaly 
> scale
> block requests with cpus for high-performance devices. This is planned to be
> implemented in next version.
> 
> Linux kernel module part:
> https://lore.kernel.org/kvm/20220725202753.298725-1-andrey.zhadche...@virtuozzo.com/
> 
> test setups and results:
> fio --direct=1 --rw=randread  --bs=4k  --ioengine=libaio --iodepth=128
> QEMU drive options: cache=none
> filesystem: xfs
> 
> SSD:
>| randread, IOPS  | randwrite, IOPS |
> Host   |  95.8k|  85.3k  |
> QEMU virtio|  57.5k|  79.4k  |
> QEMU vhost-blk |  95.6k|  84.3k  |
> 
> RAMDISK (vq == vcpu):
>  | randread, IOPS | randwrite, IOPS |
> virtio, 1vcpu|123k  |  129k   |
> virtio, 2vcpu|253k (??) |  250k (??)  |
> virtio, 4vcpu|158k  |  154k   |
> vhost-blk, 1vcpu |110k  |  113k   |
> vhost-blk, 2vcpu |247k  |  252k   |
> vhost-blk, 4vcpu |576k  |  567k   |
> 
> Andrey Zhadchenko (1):
>   block: add vhost-blk backend
> 
>  configure |  13 ++
>  hw/block/Kconfig  |   5 +
>  hw/block/meson.build  |   1 +
>  hw/block/vhost-blk.c  | 395 ++
>  hw/virtio/meson.build |   1 +
>  hw/virtio/vhost-blk-pci.c | 102 +
>  include/hw/virtio/vhost-blk.h |  44 
>  linux-headers/linux/vhost.h   |   3 +
>  8 files changed, 564 insertions(+)
>  create mode 100644 hw/block/vhost-blk.c
>  create mode 100644 hw/virtio/vhost-blk-pci.c
>  create mode 100644 include/hw/virtio/vhost-blk.h

vhost-blk has been tried several times in the past. That doesn't mean it
cannot be merged this time, but past arguments should be addressed:

- What makes it necessary to move the code into the kernel? In the past
  the performance results were not very convincing. The fastest
  implementations actually tend to be userspace NVMe PCI drivers that
  bypass the kernel! Bypassing the VFS and submitting block requests
  directly was not a huge boost. The syscall/context switch argument
  sounds okay but the numbers didn't really show that kernel block I/O
  is much faster than userspace block I/O.

  I've asked for more details on the QEMU command-line to understand
  what your numbers show. Maybe something has changed since previous
  times when vhost-blk has been tried.

  The only argument I see is QEMU's current 1 IOThread per virtio-blk
  device limitation, which is currently being worked on. If that's the
  only reason for vhost-blk then is it worth doing all the work of
  getting vhost-blk shipped (kernel, QEMU, and libvirt changes)? It
  seems like a short-term solution.

- The security impact of bugs in kernel vhost-blk code is more serious
  than bugs in a QEMU userspace process.

- The management stack needs to be changed to use vhost-blk whereas
  QEMU can be optimized without affecting other layers.

Stefan


signature.asc
Description: PGP signature


Re: [RFC patch 0/1] block: vhost-blk backend

2022-10-04 Thread Stefan Hajnoczi
On Mon, Jul 25, 2022 at 11:55:26PM +0300, Andrey Zhadchenko wrote:
> Although QEMU virtio-blk is quite fast, there is still some room for
> improvements. Disk latency can be reduced if we handle virito-blk requests
> in host kernel so we avoid a lot of syscalls and context switches.
> 
> The biggest disadvantage of this vhost-blk flavor is raw format.
> Luckily Kirill Thai proposed device mapper driver for QCOW2 format to attach
> files as block devices: https://www.spinics.net/lists/kernel/msg4292965.html
> 
> Also by using kernel modules we can bypass iothread limitation and finaly 
> scale
> block requests with cpus for high-performance devices. This is planned to be
> implemented in next version.

Hi Andrey,
Do you have a new version of this patch series that uses multiple
threads?

I have been playing with vq-IOThread mapping in QEMU and would like to
benchmark vhost-blk vs QEMU virtio-blk mq IOThreads:
https://gitlab.com/stefanha/qemu/-/tree/virtio-blk-mq-iothread-prototype

Thanks,
Stefan


signature.asc
Description: PGP signature


[PULL 32/54] monitor: expose monitor_puts to rest of code

2022-10-04 Thread Alex Bennée
This helps us construct strings elsewhere before echoing to the
monitor. It avoids having to jump through hoops like:

  monitor_printf(mon, "%s", s->str);

It will be useful in following patches but for now convert all
existing plain "%s" printfs to use the _puts api.

Signed-off-by: Alex Bennée 
Reviewed-by: Richard Henderson 
Reviewed-by: Kevin Wolf 
Reviewed-by: Markus Armbruster 
Reviewed-by: Philippe Mathieu-Daudé 
Message-Id: <20220929114231.583801-33-alex.ben...@linaro.org>

diff --git a/docs/devel/writing-monitor-commands.rst 
b/docs/devel/writing-monitor-commands.rst
index 4aa2bb904d..2fefedcd98 100644
--- a/docs/devel/writing-monitor-commands.rst
+++ b/docs/devel/writing-monitor-commands.rst
@@ -716,7 +716,7 @@ message. Here's the implementation of the "info roms" HMP 
command::
  if (hmp_handle_error(mon, err)) {
  return;
  }
- monitor_printf(mon, "%s", info->human_readable_text);
+ monitor_puts(mon, info->human_readable_text);
  }
 
 Also, you have to add the function's prototype to the hmp.h file.
diff --git a/include/monitor/monitor.h b/include/monitor/monitor.h
index a4b40e8391..737e750670 100644
--- a/include/monitor/monitor.h
+++ b/include/monitor/monitor.h
@@ -31,6 +31,7 @@ void monitor_resume(Monitor *mon);
 int monitor_get_fd(Monitor *mon, const char *fdname, Error **errp);
 int monitor_fd_param(Monitor *mon, const char *fdname, Error **errp);
 
+int monitor_puts(Monitor *mon, const char *str);
 int monitor_vprintf(Monitor *mon, const char *fmt, va_list ap)
 G_GNUC_PRINTF(2, 0);
 int monitor_printf(Monitor *mon, const char *fmt, ...) G_GNUC_PRINTF(2, 3);
diff --git a/monitor/monitor-internal.h b/monitor/monitor-internal.h
index caa2e90ef2..a2cdbbf646 100644
--- a/monitor/monitor-internal.h
+++ b/monitor/monitor-internal.h
@@ -174,7 +174,6 @@ extern int mon_refcount;
 
 extern HMPCommand hmp_cmds[];
 
-int monitor_puts(Monitor *mon, const char *str);
 void monitor_data_init(Monitor *mon, bool is_qmp, bool skip_flush,
bool use_io_thread);
 void monitor_data_destroy(Monitor *mon);
diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c
index bfb3c043a0..939a520d17 100644
--- a/block/monitor/block-hmp-cmds.c
+++ b/block/monitor/block-hmp-cmds.c
@@ -638,16 +638,16 @@ static void print_block_info(Monitor *mon, BlockInfo 
*info,
 assert(!info || !info->has_inserted || info->inserted == inserted);
 
 if (info && *info->device) {
-monitor_printf(mon, "%s", info->device);
+monitor_puts(mon, info->device);
 if (inserted && inserted->has_node_name) {
 monitor_printf(mon, " (%s)", inserted->node_name);
 }
 } else {
 assert(info || inserted);
-monitor_printf(mon, "%s",
-   inserted && inserted->has_node_name ? 
inserted->node_name
-   : info && info->has_qdev ? info->qdev
-   : "");
+monitor_puts(mon,
+ inserted && inserted->has_node_name ? inserted->node_name
+ : info && info->has_qdev ? info->qdev
+ : "");
 }
 
 if (inserted) {
diff --git a/hw/misc/mos6522.c b/hw/misc/mos6522.c
index f9e646350e..fe38c44426 100644
--- a/hw/misc/mos6522.c
+++ b/hw/misc/mos6522.c
@@ -595,7 +595,7 @@ void hmp_info_via(Monitor *mon, const QDict *qdict)
 if (hmp_handle_error(mon, err)) {
 return;
 }
-monitor_printf(mon, "%s", info->human_readable_text);
+monitor_puts(mon, info->human_readable_text);
 }
 
 static const MemoryRegionOps mos6522_ops = {
diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c
index c6cd6f91dd..f90eea8d01 100644
--- a/monitor/hmp-cmds.c
+++ b/monitor/hmp-cmds.c
@@ -730,7 +730,7 @@ static void hmp_info_pci_device(Monitor *mon, const 
PciDeviceInfo *dev)
 monitor_printf(mon, "");
 
 if (dev->class_info->has_desc) {
-monitor_printf(mon, "%s", dev->class_info->desc);
+monitor_puts(mon, dev->class_info->desc);
 } else {
 monitor_printf(mon, "Class %04" PRId64, dev->class_info->q_class);
 }
@@ -2258,12 +2258,12 @@ static void print_stats_schema_value(Monitor *mon, 
StatsSchemaValue *value)
 if (unit && value->base == 10 &&
 value->exponent >= -18 && value->exponent <= 18 &&
 value->exponent % 3 == 0) {
-monitor_printf(mon, "%s", si_prefix(value->exponent));
+monitor_puts(mon, si_prefix(value->exponent));
 } else if (unit && value->base == 2 &&
value->exponent >= 0 && value->exponent <= 60 &&
value->exponent % 10 == 0) {
 
-monitor_printf(mon, "%s", iec_binary_prefix(value->exponent));
+monitor_puts(mon, iec_binary_prefix(value->exponent));
 } else if (value->exponent) {
 /* Use exponential notation and write the unit's English name */
 monitor_printf(mon, "* %d^%d%s",
@@ -2273,7 +2273,7 @@ static void print_stats_schema_value(Monitor *mon, 
StatsSchema

Re: [PATCH v2 00/13] ppc/e500: Add support for two types of flash, cleanup

2022-10-04 Thread Daniel Henrique Barboza

Hey,

On 10/3/22 18:27, Philippe Mathieu-Daudé wrote:

Hi Daniel,

On 3/10/22 22:31, Bernhard Beschow wrote:

Cover letter:
~

This series adds support for -pflash and direct SD card access to the
PPC e500 boards. The idea is to increase compatibility with "real" firmware
images where only the bare minimum of drivers is compiled in.



Bernhard Beschow (13):
   hw/ppc/meson: Allow e500 boards to be enabled separately
   hw/gpio/meson: Introduce dedicated config switch for hw/gpio/mpc8xxx
   docs/system/ppc/ppce500: Add heading for networking chapter
   hw/ppc/e500: Reduce usage of sysbus API
   hw/ppc/mpc8544ds: Rename wrongly named method
   hw/ppc/mpc8544ds: Add platform bus
   hw/ppc/e500: Remove if statement which is now always true


This first part is mostly reviewed and can already go via your
ppc-next queue.


We're missing an ACK in patch 6/13:

hw/ppc/mpc8544ds: Add platform bus

I'll need some time to understand what's been doing there to provide my own
R-b. Or you can toss a R-b there :D


Thanks,


Daniel






   hw/block/pflash_cfi01: Error out if device length isn't a power of two
   hw/ppc/e500: Implement pflash handling
   hw/sd/sdhci-internal: Unexport ESDHC defines
   hw/sd/sdhci: Rename ESDHC_* defines to USDHC_*
   hw/sd/sdhci: Implement Freescale eSDHC device model
   hw/ppc/e500: Add Freescale eSDHC to e500 boards


This second part still need work. I can take it via the sdmmc-next
queue.

Regards,

Phil.




Re: [PATCH v6] 9pfs: use GHashTable for fid table

2022-10-04 Thread Christian Schoenebeck
On Dienstag, 4. Oktober 2022 12:41:21 CEST Linus Heckemann wrote:
> The previous implementation would iterate over the fid table for
> lookup operations, resulting in an operation with O(n) complexity on
> the number of open files and poor cache locality -- for every open,
> stat, read, write, etc operation.
> 
> This change uses a hashtable for this instead, significantly improving
> the performance of the 9p filesystem. The runtime of NixOS's simple
> installer test, which copies ~122k files totalling ~1.8GiB from 9p,
> decreased by a factor of about 10.
> 
> Signed-off-by: Linus Heckemann 
> Reviewed-by: Philippe Mathieu-Daudé 
> Reviewed-by: Greg Kurz 
> Message-Id: <20220908112353.289267-1-...@sphalerite.org>
> [CS: - Retain BUG_ON(f->clunked) in get_fid().
>  - Add TODO comment in clunk_fid(). ]
> Signed-off-by: Christian Schoenebeck 
> ---

In general: LGTM now, but I will definitely go for some longer test runs 
before queuing this patch. Some minor side notes below ...

> Christian Schoenebeck writes:
> > Not a big deal, but I start thinking whether to keep BUG_ON() here as
> > well.
> > That would require using g_hash_table_lookup() here instead of
> > g_hash_table_contains(). Not that I would insist.
> 
> Done.
> 
> > checkpatch.pl
> 
> Fixed.
> 
> > OK, I get it that you squashed your previous patch and that you ended up
> > with this comment instead. But if you read the old comment version here,
> > you'll notice that it describes the actual problem more accurately:
> > especially that v9fs_reopen_fid() and put_fid() are the 2 functions that
> > may cause the coroutine to "yield" here. That's an important information
> > to be retained in this comment here as it's not obvious.
> 
> Reworded to mention these functions explicitly.
> 
> >  I would not move this to the 2nd loop. I would still set the
> > 
> > FID_NON_RECLAIMABLE flag here, for the same reasons that you are bumping
> > the refcount already in the first loop below.
> 
> Good point! Fixed.
> 
> >  ... that's not the same behaviour as before, is it? Old behaviour was to
> >  put> 
> > the respective (last) fid only on error. And this would now put all fids.
> 
> Indeed it isn't the old behaviour, but I believe it's correct: since
> before we only incremented the reference counter for each one when we
> started reopening it. Now, we increment _all_ their refcounts, so we
> need to put all of them back as well.

Yeah, commented in v9fs_mark_fids_unreclaim() changes below ...

> >  Wasn't there something like GLIST_FOREACH()? Then you wouldn't need to
> >  add
> > 
> > that variable.
> 
> glib does provide g_list_foreach, but that requires a function
> pointer. I'm not aware of a macro for this. I could change this to use
> a QLIST (for which we do have a macro) instead of a GList if you
> insist, but I'd default to leaving this as is.

That's better handled by a future cleanup patch. Right now I find it 
prioritized to get this performance fix merged ASAP, as it provides a 
significant improvement for a lot of people.

I have seen GLIST_FOREACH macros (without function pointer) in other projects, 
but I guess that macro was defined locally by those projects.

> Thanks for your repeated reviews and patience!
> 
> Linus
> 
> 
>  hw/9pfs/9p.c | 195 +--
>  hw/9pfs/9p.h |   2 +-
>  2 files changed, 113 insertions(+), 84 deletions(-)
> 
> diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
> index aebadeaa03..729d3181e0 100644
> --- a/hw/9pfs/9p.c
> +++ b/hw/9pfs/9p.c
> @@ -256,7 +256,8 @@ static size_t v9fs_string_size(V9fsString *str)
>  }
> 
>  /*
> - * returns 0 if fid got re-opened, 1 if not, < 0 on error */
> + * returns 0 if fid got re-opened, 1 if not, < 0 on error
> + */
>  static int coroutine_fn v9fs_reopen_fid(V9fsPDU *pdu, V9fsFidState *f)
>  {
>  int err = 1;
> @@ -282,33 +283,32 @@ static V9fsFidState *coroutine_fn get_fid(V9fsPDU
> *pdu, int32_t fid) V9fsFidState *f;
>  V9fsState *s = pdu->s;
> 
> -QSIMPLEQ_FOREACH(f, &s->fid_list, next) {
> +f = g_hash_table_lookup(s->fids, GINT_TO_POINTER(fid));
> +if (f) {
>  BUG_ON(f->clunked);
> -if (f->fid == fid) {
> -/*
> - * Update the fid ref upfront so that
> - * we don't get reclaimed when we yield
> - * in open later.
> - */
> -f->ref++;
> -/*
> - * check whether we need to reopen the
> - * file. We might have closed the fd
> - * while trying to free up some file
> - * descriptors.
> - */
> -err = v9fs_reopen_fid(pdu, f);
> -if (err < 0) {
> -f->ref--;
> -return NULL;
> -}
> -/*
> - * Mark the fid as referenced so that the LRU
> - * reclaim won't close the file descriptor
> - */
> -f->flags |= FID_REFERENCED;
> -return f;
> + 

[PATCH v6] 9pfs: use GHashTable for fid table

2022-10-04 Thread Linus Heckemann
The previous implementation would iterate over the fid table for
lookup operations, resulting in an operation with O(n) complexity on
the number of open files and poor cache locality -- for every open,
stat, read, write, etc operation.

This change uses a hashtable for this instead, significantly improving
the performance of the 9p filesystem. The runtime of NixOS's simple
installer test, which copies ~122k files totalling ~1.8GiB from 9p,
decreased by a factor of about 10.

Signed-off-by: Linus Heckemann 
Reviewed-by: Philippe Mathieu-Daudé 
Reviewed-by: Greg Kurz 
Message-Id: <20220908112353.289267-1-...@sphalerite.org>
[CS: - Retain BUG_ON(f->clunked) in get_fid().
 - Add TODO comment in clunk_fid(). ]
Signed-off-by: Christian Schoenebeck 
---
Christian Schoenebeck writes:
> Not a big deal, but I start thinking whether to keep BUG_ON() here as well.
> That would require using g_hash_table_lookup() here instead of
> g_hash_table_contains(). Not that I would insist.

Done.

> checkpatch.pl

Fixed.

> OK, I get it that you squashed your previous patch and that you ended up with
> this comment instead. But if you read the old comment version here, you'll
> notice that it describes the actual problem more accurately: especially that
> v9fs_reopen_fid() and put_fid() are the 2 functions that may cause the
> coroutine to "yield" here. That's an important information to be retained in
> this comment here as it's not obvious.

Reworded to mention these functions explicitly.

>  I would not move this to the 2nd loop. I would still set the
> FID_NON_RECLAIMABLE flag here, for the same reasons that you are bumping the
> refcount already in the first loop below.

Good point! Fixed.

>  ... that's not the same behaviour as before, is it? Old behaviour was to put
> the respective (last) fid only on error. And this would now put all fids.

Indeed it isn't the old behaviour, but I believe it's correct: since
before we only incremented the reference counter for each one when we
started reopening it. Now, we increment _all_ their refcounts, so we
need to put all of them back as well.

>  Wasn't there something like GLIST_FOREACH()? Then you wouldn't need to add
> that variable.

glib does provide g_list_foreach, but that requires a function
pointer. I'm not aware of a macro for this. I could change this to use
a QLIST (for which we do have a macro) instead of a GList if you
insist, but I'd default to leaving this as is.

Thanks for your repeated reviews and patience!

Linus


 hw/9pfs/9p.c | 195 +--
 hw/9pfs/9p.h |   2 +-
 2 files changed, 113 insertions(+), 84 deletions(-)

diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
index aebadeaa03..729d3181e0 100644
--- a/hw/9pfs/9p.c
+++ b/hw/9pfs/9p.c
@@ -256,7 +256,8 @@ static size_t v9fs_string_size(V9fsString *str)
 }

 /*
- * returns 0 if fid got re-opened, 1 if not, < 0 on error */
+ * returns 0 if fid got re-opened, 1 if not, < 0 on error
+ */
 static int coroutine_fn v9fs_reopen_fid(V9fsPDU *pdu, V9fsFidState *f)
 {
 int err = 1;
@@ -282,33 +283,32 @@ static V9fsFidState *coroutine_fn get_fid(V9fsPDU *pdu, 
int32_t fid)
 V9fsFidState *f;
 V9fsState *s = pdu->s;

-QSIMPLEQ_FOREACH(f, &s->fid_list, next) {
+f = g_hash_table_lookup(s->fids, GINT_TO_POINTER(fid));
+if (f) {
 BUG_ON(f->clunked);
-if (f->fid == fid) {
-/*
- * Update the fid ref upfront so that
- * we don't get reclaimed when we yield
- * in open later.
- */
-f->ref++;
-/*
- * check whether we need to reopen the
- * file. We might have closed the fd
- * while trying to free up some file
- * descriptors.
- */
-err = v9fs_reopen_fid(pdu, f);
-if (err < 0) {
-f->ref--;
-return NULL;
-}
-/*
- * Mark the fid as referenced so that the LRU
- * reclaim won't close the file descriptor
- */
-f->flags |= FID_REFERENCED;
-return f;
+/*
+ * Update the fid ref upfront so that
+ * we don't get reclaimed when we yield
+ * in open later.
+ */
+f->ref++;
+/*
+ * check whether we need to reopen the
+ * file. We might have closed the fd
+ * while trying to free up some file
+ * descriptors.
+ */
+err = v9fs_reopen_fid(pdu, f);
+if (err < 0) {
+f->ref--;
+return NULL;
 }
+/*
+ * Mark the fid as referenced so that the LRU
+ * reclaim won't close the file descriptor
+ */
+f->flags |= FID_REFERENCED;
+return f;
 }
 return NULL;
 }
@@ -317,12 +317,11 @@ static V9fsFidState *alloc_fid(V9fsState *s, int32_t fid)
 {
 V9fsFidState *f;

-QSIMPLEQ_FOREACH(f, &s->fid_list, next

Re: [PATCH v5 2/2] block: Refactor get_tmp_filename()

2022-10-04 Thread Kevin Wolf
Am 02.10.2022 um 11:38 hat Bin Meng geschrieben:
> Hi Kevin,
> 
> On Fri, Sep 30, 2022 at 6:13 PM Kevin Wolf  wrote:
> >
> > Am 28.09.2022 um 16:41 hat Bin Meng geschrieben:
> > > From: Bin Meng 
> > >
> > > At present there are two callers of get_tmp_filename() and they are
> > > inconsistent.
> > >
> > > One does:
> > >
> > > /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
> > > char *tmp_filename = g_malloc0(PATH_MAX + 1);
> > > ...
> > > ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
> > >
> > > while the other does:
> > >
> > > s->qcow_filename = g_malloc(PATH_MAX);
> > > ret = get_tmp_filename(s->qcow_filename, PATH_MAX);
> > >
> > > As we can see different 'size' arguments are passed. There are also
> > > platform specific implementations inside the function, and the use
> > > of snprintf is really undesirable.
> > >
> > > The function name is also misleading. It creates a temporary file,
> > > not just a filename.
> > >
> > > Refactor this routine by changing its name and signature to:
> > >
> > > char *create_tmp_file(Error **errp)
> > >
> > > and use g_file_open_tmp() for a consistent implementation.
> > >
> > > Signed-off-by: Bin Meng 
> > > ---
> > >
> > > Changes in v5:
> > > - minor change in the commit message
> > > - add some notes in the function comment block
> > > - add g_autofree for tmp_filename
> > >
> > > Changes in v4:
> > > - Rename the function to create_tmp_file() and take "Error **errp" as
> > >   a parameter, so that callers can pass errp all the way down to this
> > >   routine.
> > > - Commit message updated to reflect the latest change
> > >
> > > Changes in v3:
> > > - Do not use errno directly, instead still let get_tmp_filename() return
> > >   a negative number to indicate error
> > >
> > > Changes in v2:
> > > - Use g_autofree and g_steal_pointer
> > >
> > >  include/block/block_int-common.h |  2 +-
> > >  block.c  | 45 
> > >  block/vvfat.c|  7 +++--
> > >  3 files changed, 20 insertions(+), 34 deletions(-)
> > >
> > > diff --git a/include/block/block_int-common.h 
> > > b/include/block/block_int-common.h
> > > index 8947abab76..d7c0a7e96f 100644
> > > --- a/include/block/block_int-common.h
> > > +++ b/include/block/block_int-common.h
> > > @@ -1230,7 +1230,7 @@ static inline BlockDriverState *child_bs(BdrvChild 
> > > *child)
> > >  }
> > >
> > >  int bdrv_check_request(int64_t offset, int64_t bytes, Error **errp);
> > > -int get_tmp_filename(char *filename, int size);
> > > +char *create_tmp_file(Error **errp);
> > >  void bdrv_parse_filename_strip_prefix(const char *filename, const char 
> > > *prefix,
> > >QDict *options);
> > >
> > > diff --git a/block.c b/block.c
> > > index 582c205307..bd3006d85d 100644
> > > --- a/block.c
> > > +++ b/block.c
> > > @@ -860,35 +860,25 @@ int bdrv_probe_geometry(BlockDriverState *bs, 
> > > HDGeometry *geo)
> > >
> > >  /*
> > >   * Create a uniquely-named empty temporary file.
> > > - * Return 0 upon success, otherwise a negative errno value.
> > > + * Return the actual file name used upon success, otherwise NULL.
> > > + * This string should be freed with g_free() when not needed any longer.
> > > + *
> > > + * Note: creating a temporary file for the caller to (re)open is
> > > + * inherently racy. Use g_file_open_tmp() instead whenever practical.
> > >   */
> > > -int get_tmp_filename(char *filename, int size)
> > > +char *create_tmp_file(Error **errp)
> > >  {
> > > -#ifdef _WIN32
> > > -char temp_dir[MAX_PATH];
> > > -/* GetTempFileName requires that its output buffer (4th param)
> > > -   have length MAX_PATH or greater.  */
> > > -assert(size >= MAX_PATH);
> > > -return (GetTempPath(MAX_PATH, temp_dir)
> > > -&& GetTempFileName(temp_dir, "qem", 0, filename)
> >
> > We're using different prefixes on Windows and on Linux. This patch
> > unifies both paths to use the Linux name. Nobody should rely on the name
> > of temporary files, so there is hope it won't break anything.
> >
> > > -? 0 : -GetLastError());
> > > -#else
> > > +g_autofree char *name = NULL;
> > > +g_autoptr(GError) err = NULL;
> > >  int fd;
> > > -const char *tmpdir;
> > > -tmpdir = getenv("TMPDIR");
> > > -if (!tmpdir) {
> > > -tmpdir = "/var/tmp";
> > > -}
> > > -if (snprintf(filename, size, "%s/vl.XX", tmpdir) >= size) {
> > > -return -EOVERFLOW;
> > > -}
> > > -fd = mkstemp(filename);
> > > +
> > > +fd = g_file_open_tmp("vl.XX", &name, &err);
> >
> > This implicitly reverts commit 69bef7931e8, g_file_open_tmp() uses /tmp
> > as the default instead of /var/tmp as this function does before the
> > patch.
> 
> Oops, thanks for the pointer. Commit message of 69bef7931e8 does not
> explicitely explain why to change from /tmp to /var/tmp. Is that
> because QEMU block codes write a hu