On 2022/08/22 21:12, Sam Li wrote: > Stefan Hajnoczi <stefa...@redhat.com> 于2022年8月23日周二 08:49写道: >> >> On Tue, Aug 16, 2022 at 02:25:18PM +0800, Sam Li wrote: >>> By adding zone management operations in BlockDriver, storage controller >>> emulation can use the new block layer APIs including Report Zone and >>> four zone management operations (open, close, finish, reset). >>> >>> Add zoned storage commands of the device: zone_report(zrp), zone_open(zo), >>> zone_close(zc), zone_reset(zrs), zone_finish(zf). >>> >>> For example, to test zone_report, use following command: >>> $ ./build/qemu-io --image-opts driver=zoned_host_device, >>> filename=/dev/nullb0 >>> -c "zrp offset nr_zones" >>> >>> Signed-off-by: Sam Li <faithilike...@gmail.com> >>> Reviewed-by: Hannes Reinecke <h...@suse.de> >>> --- >>> block/block-backend.c | 50 +++++ >>> block/file-posix.c | 341 +++++++++++++++++++++++++++++- >>> block/io.c | 41 ++++ >>> include/block/block-common.h | 1 - >>> include/block/block-io.h | 13 ++ >>> include/block/block_int-common.h | 22 +- >>> include/block/raw-aio.h | 6 +- >>> include/sysemu/block-backend-io.h | 6 + >>> meson.build | 1 + >>> qapi/block-core.json | 8 +- >>> qemu-io-cmds.c | 143 +++++++++++++ >>> 11 files changed, 625 insertions(+), 7 deletions(-) >>> >>> diff --git a/block/block-backend.c b/block/block-backend.c >>> index d4a5df2ac2..fc639b0cd7 100644 >>> --- a/block/block-backend.c >>> +++ b/block/block-backend.c >>> @@ -1775,6 +1775,56 @@ int coroutine_fn blk_co_flush(BlockBackend *blk) >>> return ret; >>> } >>> >>> +/* >>> + * Send a zone_report command. >>> + * offset is a byte offset from the start of the device. No alignment >>> + * required for offset. >>> + * nr_zones represents IN maximum and OUT actual. 
>>> + */ >>> +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset, >>> + unsigned int *nr_zones, >>> + BlockZoneDescriptor *zones) >>> +{ >>> + int ret; >>> + IO_CODE(); >>> + >>> + blk_inc_in_flight(blk); /* increase before waiting */ >>> + blk_wait_while_drained(blk); >>> + if (!blk_is_available(blk)) { >>> + blk_dec_in_flight(blk); >>> + return -ENOMEDIUM; >>> + } >>> + ret = bdrv_co_zone_report(blk_bs(blk), offset, nr_zones, zones); >>> + blk_dec_in_flight(blk); >>> + return ret; >>> +} >>> + >>> +/* >>> + * Send a zone_management command. >>> + * offset is the starting zone specified as a sector offset. >>> + * len is the maximum number of sectors the command should operate on. >>> + */ >>> +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op, >>> + int64_t offset, int64_t len) >>> +{ >>> + int ret; >>> + IO_CODE(); >>> + >>> + ret = blk_check_byte_request(blk, offset, len); >>> + if (ret < 0) { >>> + return ret; >>> + } >> >> blk_check_byte_request() calls blk_is_available() and returns -ENOMEDIUM >> when it fails. You can therefore move this down and replace "if >> (!blk_is_available(blk)) {". 
>> >>> + blk_inc_in_flight(blk); >>> + blk_wait_while_drained(blk); >>> + if (!blk_is_available(blk)) { >>> + blk_dec_in_flight(blk); >>> + return -ENOMEDIUM; >>> + } >>> + ret = bdrv_co_zone_mgmt(blk_bs(blk), op, offset, len); >>> + blk_dec_in_flight(blk); >>> + return ret; >>> +} >>> + >>> void blk_drain(BlockBackend *blk) >>> { >>> BlockDriverState *bs = blk_bs(blk); >>> diff --git a/block/file-posix.c b/block/file-posix.c >>> index 727389488c..29f67082d9 100644 >>> --- a/block/file-posix.c >>> +++ b/block/file-posix.c >>> @@ -67,6 +67,9 @@ >>> #include <sys/param.h> >>> #include <sys/syscall.h> >>> #include <sys/vfs.h> >>> +#if defined(CONFIG_BLKZONED) >>> +#include <linux/blkzoned.h> >>> +#endif >>> #include <linux/cdrom.h> >>> #include <linux/fd.h> >>> #include <linux/fs.h> >>> @@ -216,6 +219,13 @@ typedef struct RawPosixAIOData { >>> PreallocMode prealloc; >>> Error **errp; >>> } truncate; >>> + struct { >>> + unsigned int *nr_zones; >>> + BlockZoneDescriptor *zones; >>> + } zone_report; >>> + struct { >>> + unsigned long ioctl_op; >>> + } zone_mgmt; >>> }; >>> } RawPosixAIOData; >>> >>> @@ -1328,7 +1338,7 @@ static void raw_refresh_limits(BlockDriverState *bs, >>> Error **errp) >>> #endif >>> >>> if (bs->sg || S_ISBLK(st.st_mode)) { >>> - int ret = hdev_get_max_hw_transfer(s->fd, &st); >>> + ret = hdev_get_max_hw_transfer(s->fd, &st); >>> >>> if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) { >>> bs->bl.max_hw_transfer = ret; >>> @@ -1340,11 +1350,32 @@ static void raw_refresh_limits(BlockDriverState >>> *bs, Error **errp) >>> } >>> } >>> >>> - ret = get_sysfs_zoned_model(s->fd, &st, &zoned); >>> + ret = get_sysfs_zoned_model(&st, &zoned); >>> if (ret < 0) { >>> zoned = BLK_Z_NONE; >>> } >>> bs->bl.zoned = zoned; >>> + if (zoned != BLK_Z_NONE) { >>> + ret = get_sysfs_long_val(&st, "chunk_sectors"); >>> + if (ret > 0) { >>> + bs->bl.zone_sectors = ret; >>> + } >>> + >>> + ret = get_sysfs_long_val(&st, "zone_append_max_bytes"); >>> + if (ret > 0) { >>> + 
bs->bl.zone_append_max_bytes = ret; >>> + } >>> + >>> + ret = get_sysfs_long_val(&st, "max_open_zones"); >>> + if (ret > 0) { >>> + bs->bl.max_open_zones = ret; >>> + } >>> + >>> + ret = get_sysfs_long_val(&st, "max_active_zones"); >>> + if (ret > 0) { >>> + bs->bl.max_active_zones = ret; >>> + } >>> + } >>> } >>> >>> static int check_for_dasd(int fd) >>> @@ -1839,6 +1870,134 @@ static off_t copy_file_range(int in_fd, off_t >>> *in_off, int out_fd, >>> } >>> #endif >>> >>> +/* >>> + * parse_zone - Fill a zone descriptor >>> + */ >>> +#if defined(CONFIG_BLKZONED) >>> +static inline void parse_zone(struct BlockZoneDescriptor *zone, >>> + struct blk_zone *blkz) { >> >> Declaring the second argument "const struct blk_zone *blkz" would make >> it clear that this function converts from blk_zone to >> BlockZoneDescriptor. >> >>> + zone->start = blkz->start; >>> + zone->length = blkz->len; >>> + zone->cap = blkz->capacity; >>> + zone->wp = blkz->wp; >>> + >>> + switch (blkz->type) { >>> + case BLK_ZONE_TYPE_SEQWRITE_REQ: >>> + zone->type = BLK_ZT_SWR; >>> + break; >>> + case BLK_ZONE_TYPE_SEQWRITE_PREF: >>> + zone->type = BLK_ZT_SWP; >>> + break; >>> + case BLK_ZONE_TYPE_CONVENTIONAL: >>> + zone->type = BLK_ZT_CONV; >>> + break; >>> + default: >>> + error_report("Invalid zone type: 0x%x", blkz->type); >> >> Or g_assert_not_reached() to indicate that this should never happen. If >> it does happen the process will call abort(3) and it will terminate with >> a coredump file for debugging. 
>> >>> + } >>> + >>> + switch (blkz->cond) { >>> + case BLK_ZONE_COND_NOT_WP: >>> + zone->cond = BLK_ZS_NOT_WP; >>> + break; >>> + case BLK_ZONE_COND_EMPTY: >>> + zone->cond = BLK_ZS_EMPTY; >>> + break; >>> + case BLK_ZONE_COND_IMP_OPEN: >>> + zone->cond =BLK_ZS_IOPEN; >>> + break; >>> + case BLK_ZONE_COND_EXP_OPEN: >>> + zone->cond = BLK_ZS_EOPEN; >>> + break; >>> + case BLK_ZONE_COND_CLOSED: >>> + zone->cond = BLK_ZS_CLOSED; >>> + break; >>> + case BLK_ZONE_COND_READONLY: >>> + zone->cond = BLK_ZS_RDONLY; >>> + break; >>> + case BLK_ZONE_COND_FULL: >>> + zone->cond = BLK_ZS_FULL; >>> + break; >>> + case BLK_ZONE_COND_OFFLINE: >>> + zone->cond = BLK_ZS_OFFLINE; >>> + break; >>> + default: >>> + error_report("Invalid zone condition 0x%x", blkz->cond); >> >> Same here. >> >>> + } >>> +} >>> +#endif >>> + >>> +static int handle_aiocb_zone_report(void *opaque) { >>> +#if defined(CONFIG_BLKZONED) >>> + RawPosixAIOData *aiocb = opaque; >>> + int fd = aiocb->aio_fildes; >>> + unsigned int *nr_zones = aiocb->zone_report.nr_zones; >>> + BlockZoneDescriptor *zones = aiocb->zone_report.zones; >>> + int64_t sector = aiocb->aio_offset; >>> + >>> + struct blk_zone *blkz; >>> + int64_t rep_size; >>> + unsigned int nrz; >>> + int ret, n = 0, i = 0; >>> + >>> + nrz = *nr_zones; >>> + rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct >>> blk_zone); >>> + g_autofree struct blk_zone_report *rep = NULL; >>> + rep = g_malloc(rep_size); >>> + >>> + blkz = (struct blk_zone *)(rep + 1); >>> + while (n < nrz) { >>> + memset(rep, 0, rep_size); >>> + rep->sector = sector; >>> + rep->nr_zones = nrz - n; >>> + >>> + ret = ioctl(fd, BLKREPORTZONE, rep); >> >> Does this ioctl() need "do { ... } while (ret == -1 && errno == EINTR)"? > > No? We discussed this before. I guess even EINTR should be propagated > back to the guest. Maybe Damien can talk more about why.
In the kernel, completion of zone management I/O requests is waited for using wait_for_completion_io(), which uses TASK_UNINTERRUPTIBLE, so a signal will not abort anything. I therefore do not think that the do { } while() loop is necessary. > >> >>> + if (ret != 0) { >>> + error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed >>> %d", >>> + fd, sector, errno); >>> + return -errno; >>> + } >>> + >>> + if (!rep->nr_zones) { >>> + break; >>> + } >>> + >>> + for (i = 0; i < rep->nr_zones; i++, n++) { >>> + parse_zone(&zones[n], &blkz[i]); >>> + /* The next report should start after the last zone reported */ >>> + sector = blkz[i].start + blkz[i].len; >>> + } >>> + } >>> + >>> + *nr_zones = n; >>> + return 0; >>> +#else >>> + return -ENOTSUP; >>> +#endif >>> +} >>> + >>> +static int handle_aiocb_zone_mgmt(void *opaque) { >>> +#if defined(CONFIG_BLKZONED) >>> + RawPosixAIOData *aiocb = opaque; >>> + int fd = aiocb->aio_fildes; >>> + int64_t sector = aiocb->aio_offset; >>> + int64_t nr_sectors = aiocb->aio_nbytes; >>> + unsigned long ioctl_op = aiocb->zone_mgmt.ioctl_op; >>> + struct blk_zone_range range; >>> + int ret; >>> + >>> + /* Execute the operation */ >>> + range.sector = sector; >>> + range.nr_sectors = nr_sectors; >>> + do { >>> + ret = ioctl(fd, ioctl_op, &range); >>> + } while (ret != 0 && errno == EINTR); >>> + >>> + return ret; >> >> if (ret < 0) { >> return -errno; >> } >> return 0; >> >>> +#else >>> + return -ENOTSUP; >>> +#endif >>> +} >>> + >>> static int handle_aiocb_copy_range(void *opaque) >>> { >>> RawPosixAIOData *aiocb = opaque; >>> @@ -3011,6 +3170,124 @@ static void raw_account_discard(BDRVRawState *s, >>> uint64_t nbytes, int ret) >>> } >>> } >>> >>> +/* >>> + * zone report - Get a zone block device's information in the form >>> + * of an array of zone descriptors. 
>>> + * >>> + * @param bs: passing zone block device file descriptor >>> + * @param zones: an array of zone descriptors to hold zone >>> + * information on reply >>> + * @param offset: offset can be any byte within the zone size. >>> + * @param len: (not sure yet. >>> + * @return 0 on success, -1 on failure >>> + */ >>> +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t >>> offset, >>> + unsigned int *nr_zones, >>> + BlockZoneDescriptor *zones) { >>> +#if defined(CONFIG_BLKZONED) >>> + BDRVRawState *s = bs->opaque; >>> + RawPosixAIOData acb; >>> + >>> + acb = (RawPosixAIOData) { >>> + .bs = bs, >>> + .aio_fildes = s->fd, >>> + .aio_type = QEMU_AIO_ZONE_REPORT, >>> + /* zoned block devices use 512-byte sectors */ >>> + .aio_offset = offset / 512, >>> + .zone_report = { >>> + .nr_zones = nr_zones, >>> + .zones = zones, >>> + }, >>> + }; >>> + >>> + return raw_thread_pool_submit(bs, handle_aiocb_zone_report, &acb); >>> +#else >>> + return -ENOTSUP; >>> +#endif >>> +} >>> + >>> +/* >>> + * zone management operations - Execute an operation on a zone >>> + */ >>> +static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp >>> op, >>> + int64_t offset, int64_t len) { >>> +#if defined(CONFIG_BLKZONED) >>> + BDRVRawState *s = bs->opaque; >>> + RawPosixAIOData acb; >>> + int64_t zone_sector, zone_sector_mask; >>> + const char *ioctl_name; >>> + unsigned long ioctl_op; >>> + int ret; >>> + >>> + struct stat st; >>> + if (fstat(s->fd, &st) < 0) { >>> + ret = -errno; >>> + return ret; >>> + } >>> + zone_sector = get_sysfs_long_val(&st, "chunk_sectors"); >>> + if (zone_sector < 0) { >>> + error_report("invalid zone sector size %" PRId64 "", zone_sector); >>> + return -EINVAL; >>> + } >>> + >>> + zone_sector_mask = zone_sector - 1; >>> + if (offset & zone_sector_mask) { >>> + error_report("sector offset %" PRId64 " is not aligned to zone >>> size " >>> + "%" PRId64 "", offset, zone_sector); >>> + return -EINVAL; >>> + } >>> + >>> + if (len & 
zone_sector_mask) { >>> + error_report("number of sectors %" PRId64 " is not aligned to zone >>> size" >>> + " %" PRId64 "", len, zone_sector); >>> + return -EINVAL; >>> + } >>> + >>> + switch (op) { >>> + case BLK_ZO_OPEN: >>> + ioctl_name = "BLKOPENZONE"; >>> + ioctl_op = BLKOPENZONE; >>> + break; >>> + case BLK_ZO_CLOSE: >>> + ioctl_name = "BLKCLOSEZONE"; >>> + ioctl_op = BLKCLOSEZONE; >>> + break; >>> + case BLK_ZO_FINISH: >>> + ioctl_name = "BLKFINISHZONE"; >>> + ioctl_op = BLKFINISHZONE; >>> + break; >>> + case BLK_ZO_RESET: >>> + ioctl_name = "BLKRESETZONE"; >>> + ioctl_op = BLKRESETZONE; >>> + break; >>> + default: >>> + error_report("Invalid zone operation 0x%x", op); >>> + return -EINVAL; >>> + } >>> + >>> + acb = (RawPosixAIOData) { >>> + .bs = bs, >>> + .aio_fildes = s->fd, >>> + .aio_type = QEMU_AIO_ZONE_MGMT, >>> + .aio_offset = offset, >>> + .aio_nbytes = len, >>> + .zone_mgmt = { >>> + .ioctl_op = ioctl_op, >>> + }, >>> + }; >>> + >>> + ret = raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, &acb); >>> + if (ret != 0) { >>> + error_report("ioctl %s failed %d", ioctl_name, errno); >>> + return -errno; >>> + } >>> + >>> + return ret; >>> +#else >>> + return -ENOTSUP; >>> +#endif >>> +} >>> + >>> static coroutine_fn int >>> raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes, >>> bool blkdev) >>> @@ -3511,6 +3788,14 @@ static void hdev_parse_filename(const char >>> *filename, QDict *options, >>> bdrv_parse_filename_strip_prefix(filename, "host_device:", options); >>> } >>> >>> +#if defined(CONFIG_BLKZONED) >>> +static void zoned_host_device_parse_filename(const char *filename, QDict >>> *options, >>> + Error **errp) >>> +{ >>> + bdrv_parse_filename_strip_prefix(filename, "zoned_host_device:", >>> options); >>> +} >>> +#endif >> >> Sorry, I asked you to add this function but I've changed my mind and I >> think it should not be present. .bdrv_parse_filename() helps legacy >> drivers convert arguments into QDict *options. 
But this is a new driver >> that no one expects to work with string filenames. Therefore >> .bdrv_parse_filename can be dropped. >> >>> + >>> static bool hdev_is_sg(BlockDriverState *bs) >>> { >>> >>> @@ -3741,6 +4026,55 @@ static BlockDriver bdrv_host_device = { >>> #endif >>> }; >>> >>> +#if defined(CONFIG_BLKZONED) >>> +static BlockDriver bdrv_zoned_host_device = { >>> + .format_name = "zoned_host_device", >>> + .protocol_name = "zoned_host_device", >>> + .instance_size = sizeof(BDRVRawState), >>> + .bdrv_needs_filename = true, >>> + .bdrv_probe_device = hdev_probe_device, >>> + .bdrv_parse_filename = zoned_host_device_parse_filename, >>> + .bdrv_file_open = hdev_open, >>> + .bdrv_close = raw_close, >>> + .bdrv_reopen_prepare = raw_reopen_prepare, >>> + .bdrv_reopen_commit = raw_reopen_commit, >>> + .bdrv_reopen_abort = raw_reopen_abort, >>> + .bdrv_co_create_opts = bdrv_co_create_opts_simple, >>> + .create_opts = &bdrv_create_opts_simple, >>> + .mutable_opts = mutable_opts, >>> + .bdrv_co_invalidate_cache = raw_co_invalidate_cache, >>> + .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes, >>> + >>> + .bdrv_co_preadv = raw_co_preadv, >>> + .bdrv_co_pwritev = raw_co_pwritev, >>> + .bdrv_co_flush_to_disk = raw_co_flush_to_disk, >>> + .bdrv_co_pdiscard = hdev_co_pdiscard, >>> + .bdrv_co_copy_range_from = raw_co_copy_range_from, >>> + .bdrv_co_copy_range_to = raw_co_copy_range_to, >>> + .bdrv_refresh_limits = raw_refresh_limits, >>> + .bdrv_io_plug = raw_aio_plug, >>> + .bdrv_io_unplug = raw_aio_unplug, >>> + .bdrv_attach_aio_context = raw_aio_attach_aio_context, >>> + >>> + .bdrv_co_truncate = raw_co_truncate, >>> + .bdrv_getlength = raw_getlength, >>> + .bdrv_get_info = raw_get_info, >>> + .bdrv_get_allocated_file_size >>> + = raw_get_allocated_file_size, >>> + .bdrv_get_specific_stats = hdev_get_specific_stats, >>> + .bdrv_check_perm = raw_check_perm, >>> + .bdrv_set_perm = raw_set_perm, >>> + .bdrv_abort_perm_update = raw_abort_perm_update, >>> + 
.bdrv_probe_blocksizes = hdev_probe_blocksizes, >>> + .bdrv_probe_geometry = hdev_probe_geometry, >>> + .bdrv_co_ioctl = hdev_co_ioctl, >>> + >>> + /* zone management operations */ >>> + .bdrv_co_zone_report = raw_co_zone_report, >>> + .bdrv_co_zone_mgmt = raw_co_zone_mgmt, >>> +}; >>> +#endif >>> + >>> #if defined(__linux__) || defined(__FreeBSD__) || >>> defined(__FreeBSD_kernel__) >>> static void cdrom_parse_filename(const char *filename, QDict *options, >>> Error **errp) >>> @@ -4001,6 +4335,9 @@ static void bdrv_file_init(void) >>> bdrv_register(&bdrv_file); >>> #if defined(HAVE_HOST_BLOCK_DEVICE) >>> bdrv_register(&bdrv_host_device); >>> +#if defined(CONFIG_BLKZONED) >>> + bdrv_register(&bdrv_zoned_host_device); >>> +#endif >>> #ifdef __linux__ >>> bdrv_register(&bdrv_host_cdrom); >>> #endif >>> diff --git a/block/io.c b/block/io.c >>> index 0a8cbefe86..de9ec1d740 100644 >>> --- a/block/io.c >>> +++ b/block/io.c >>> @@ -3198,6 +3198,47 @@ out: >>> return co.ret; >>> } >>> >>> +int bdrv_co_zone_report(BlockDriverState *bs, int64_t offset, >>> + unsigned int *nr_zones, >>> + BlockZoneDescriptor *zones) >>> +{ >>> + BlockDriver *drv = bs->drv; >>> + CoroutineIOCompletion co = { >>> + .coroutine = qemu_coroutine_self(), >>> + }; >>> + IO_CODE(); >>> + >>> + bdrv_inc_in_flight(bs); >>> + if (!drv || !drv->bdrv_co_zone_report) { >>> + co.ret = -ENOTSUP; >>> + goto out; >>> + } >>> + co.ret = drv->bdrv_co_zone_report(bs, offset, nr_zones, zones); >>> +out: >>> + bdrv_dec_in_flight(bs); >>> + return co.ret; >>> +} >>> + >>> +int bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, >>> + int64_t offset, int64_t len) >>> +{ >>> + BlockDriver *drv = bs->drv; >>> + CoroutineIOCompletion co = { >>> + .coroutine = qemu_coroutine_self(), >>> + }; >>> + IO_CODE(); >>> + >>> + bdrv_inc_in_flight(bs); >>> + if (!drv || !drv->bdrv_co_zone_mgmt) { >>> + co.ret = -ENOTSUP; >>> + goto out; >>> + } >>> + co.ret = drv->bdrv_co_zone_mgmt(bs, op, offset, len); >>> +out: >>> + 
bdrv_dec_in_flight(bs); >>> + return co.ret; >>> +} >>> + >>> void *qemu_blockalign(BlockDriverState *bs, size_t size) >>> { >>> IO_CODE(); >>> diff --git a/include/block/block-common.h b/include/block/block-common.h >>> index 36bd0e480e..5102fa6858 100644 >>> --- a/include/block/block-common.h >>> +++ b/include/block/block-common.h >>> @@ -23,7 +23,6 @@ >>> */ >>> #ifndef BLOCK_COMMON_H >>> #define BLOCK_COMMON_H >>> - >>> #include "block/aio.h" >>> #include "block/aio-wait.h" >>> #include "qemu/iov.h" >> >> Unrelated whitespace change. Please drop this. >> >>> diff --git a/include/block/block-io.h b/include/block/block-io.h >>> index fd25ffa9be..55ad261e16 100644 >>> --- a/include/block/block-io.h >>> +++ b/include/block/block-io.h >>> @@ -88,6 +88,13 @@ int bdrv_co_ioctl(BlockDriverState *bs, int req, void >>> *buf); >>> /* Ensure contents are flushed to disk. */ >>> int coroutine_fn bdrv_co_flush(BlockDriverState *bs); >>> >>> +/* Report zone information of zone block device. */ >>> +int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset, >>> + unsigned int *nr_zones, >>> + BlockZoneDescriptor *zones); >>> +int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op, >>> + int64_t offset, int64_t len); >>> + >>> int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes); >>> bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs); >>> int bdrv_block_status(BlockDriverState *bs, int64_t offset, >>> @@ -297,6 +304,12 @@ bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector >>> *qiov, int64_t pos); >>> int generated_co_wrapper >>> bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos); >>> >>> +int generated_co_wrapper >>> +blk_zone_report(BlockBackend *blk, int64_t offset, unsigned int *nr_zones, >>> + BlockZoneDescriptor *zones); >>> +int generated_co_wrapper >>> +blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op, int64_t offset, int64_t >>> len); >>> + >>> /** >>> * 
bdrv_parent_drained_begin_single: >>> * >>> diff --git a/include/block/block_int-common.h >>> b/include/block/block_int-common.h >>> index 7f7863cc9e..de44c7b6f4 100644 >>> --- a/include/block/block_int-common.h >>> +++ b/include/block/block_int-common.h >>> @@ -94,7 +94,6 @@ typedef struct BdrvTrackedRequest { >>> struct BdrvTrackedRequest *waiting_for; >>> } BdrvTrackedRequest; >>> >>> - >>> struct BlockDriver { >>> /* >>> * These fields are initialized when this object is created, >> >> Unrelated whitespace change. Please drop this. >> >>> @@ -691,6 +690,12 @@ struct BlockDriver { >>> QEMUIOVector *qiov, >>> int64_t pos); >>> >>> + int coroutine_fn (*bdrv_co_zone_report)(BlockDriverState *bs, >>> + int64_t offset, unsigned int *nr_zones, >>> + BlockZoneDescriptor *zones); >>> + int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, >>> BlockZoneOp op, >>> + int64_t offset, int64_t len); >>> + >>> /* removable device specific */ >>> bool (*bdrv_is_inserted)(BlockDriverState *bs); >>> void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag); >>> @@ -828,6 +833,21 @@ typedef struct BlockLimits { >>> >>> /* device zone model */ >>> BlockZoneModel zoned; >>> + >>> + /* zone size expressed in 512-byte sectors */ >>> + uint32_t zone_sectors; >>> + >>> + /* total number of zones */ >>> + unsigned int nr_zones; >>> + >>> + /* maximum size in bytes of a zone append write operation */ >>> + int64_t zone_append_max_bytes; >>> + >>> + /* maximum number of open zones */ >>> + int64_t max_open_zones; >>> + >>> + /* maximum number of active zones */ >>> + int64_t max_active_zones; >>> } BlockLimits; >>> >>> typedef struct BdrvOpBlocker BdrvOpBlocker; >>> diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h >>> index 21fc10c4c9..3d26929cdd 100644 >>> --- a/include/block/raw-aio.h >>> +++ b/include/block/raw-aio.h >>> @@ -29,6 +29,8 @@ >>> #define QEMU_AIO_WRITE_ZEROES 0x0020 >>> #define QEMU_AIO_COPY_RANGE 0x0040 >>> #define QEMU_AIO_TRUNCATE 0x0080 >>> +#define 
QEMU_AIO_ZONE_REPORT 0x0100 >>> +#define QEMU_AIO_ZONE_MGMT 0x0200 >>> #define QEMU_AIO_TYPE_MASK \ >>> (QEMU_AIO_READ | \ >>> QEMU_AIO_WRITE | \ >>> @@ -37,7 +39,9 @@ >>> QEMU_AIO_DISCARD | \ >>> QEMU_AIO_WRITE_ZEROES | \ >>> QEMU_AIO_COPY_RANGE | \ >>> - QEMU_AIO_TRUNCATE) >>> + QEMU_AIO_TRUNCATE | \ >>> + QEMU_AIO_ZONE_REPORT | \ >>> + QEMU_AIO_ZONE_MGMT) >>> >>> /* AIO flags */ >>> #define QEMU_AIO_MISALIGNED 0x1000 >>> diff --git a/include/sysemu/block-backend-io.h >>> b/include/sysemu/block-backend-io.h >>> index 50f5aa2e07..6e7df1d93b 100644 >>> --- a/include/sysemu/block-backend-io.h >>> +++ b/include/sysemu/block-backend-io.h >>> @@ -156,6 +156,12 @@ int generated_co_wrapper >>> blk_pwrite_zeroes(BlockBackend *blk, int64_t offset, >>> int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset, >>> int64_t bytes, BdrvRequestFlags >>> flags); >>> >>> +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset, >>> + unsigned int *nr_zones, >>> + BlockZoneDescriptor *zones); >>> +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op, >>> + int64_t offset, int64_t len); >>> + >>> int generated_co_wrapper blk_pdiscard(BlockBackend *blk, int64_t offset, >>> int64_t bytes); >>> int coroutine_fn blk_co_pdiscard(BlockBackend *blk, int64_t offset, >>> diff --git a/meson.build b/meson.build >>> index 294e9a8f32..c3219b0e87 100644 >>> --- a/meson.build >>> +++ b/meson.build >>> @@ -1883,6 +1883,7 @@ config_host_data.set('CONFIG_REPLICATION', >>> get_option('live_block_migration').al >>> # has_header >>> config_host_data.set('CONFIG_EPOLL', cc.has_header('sys/epoll.h')) >>> config_host_data.set('CONFIG_LINUX_MAGIC_H', >>> cc.has_header('linux/magic.h')) >>> +config_host_data.set('CONFIG_BLKZONED', cc.has_header('linux/blkzoned.h')) >>> config_host_data.set('CONFIG_VALGRIND_H', >>> cc.has_header('valgrind/valgrind.h')) >>> config_host_data.set('HAVE_BTRFS_H', cc.has_header('linux/btrfs.h')) >>> config_host_data.set('HAVE_DRM_H', 
cc.has_header('libdrm/drm.h')) >>> diff --git a/qapi/block-core.json b/qapi/block-core.json >>> index 2173e7734a..c6bbb7a037 100644 >>> --- a/qapi/block-core.json >>> +++ b/qapi/block-core.json >>> @@ -2942,6 +2942,7 @@ >>> # @compress: Since 5.0 >>> # @copy-before-write: Since 6.2 >>> # @snapshot-access: Since 7.0 >>> +# @zoned_host_device: Since 7.2 >>> # >>> # Since: 2.9 >>> ## >>> @@ -2955,7 +2956,8 @@ >>> 'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', >>> 'parallels', >>> 'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd', >>> { 'name': 'replication', 'if': 'CONFIG_REPLICATION' }, >>> - 'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] } >>> + 'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', >>> + { 'name': 'zoned_host_device', 'if': 'CONFIG_BLKZONED' } ] } >>> >>> ## >>> # @BlockdevOptionsFile: >>> @@ -4329,7 +4331,9 @@ >>> 'vhdx': 'BlockdevOptionsGenericFormat', >>> 'vmdk': 'BlockdevOptionsGenericCOWFormat', >>> 'vpc': 'BlockdevOptionsGenericFormat', >>> - 'vvfat': 'BlockdevOptionsVVFAT' >>> + 'vvfat': 'BlockdevOptionsVVFAT', >>> + 'zoned_host_device': { 'type': 'BlockdevOptionsFile', >>> + 'if': 'CONFIG_BLKZONED' } >>> } } >>> >>> ## >>> diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c >>> index 952dc940f1..687c3a624c 100644 >>> --- a/qemu-io-cmds.c >>> +++ b/qemu-io-cmds.c >>> @@ -1712,6 +1712,144 @@ static const cmdinfo_t flush_cmd = { >>> .oneline = "flush all in-core file state to disk", >>> }; >>> >>> +static int zone_report_f(BlockBackend *blk, int argc, char **argv) >>> +{ >>> + int ret; >>> + int64_t offset; >>> + unsigned int nr_zones; >>> + >>> + ++optind; >>> + offset = cvtnum(argv[optind]); >>> + ++optind; >>> + nr_zones = cvtnum(argv[optind]); >>> + >>> + g_autofree BlockZoneDescriptor *zones = NULL; >>> + zones = g_new(BlockZoneDescriptor, nr_zones); >>> + ret = blk_zone_report(blk, offset, &nr_zones, zones); >>> + if (ret < 0) { >>> + printf("zone report failed: %s\n", strerror(-ret)); >>> + } else { >>> + for 
(int i = 0; i < nr_zones; ++i) { >>> + printf("start: 0x%" PRIx64 ", len 0x%" PRIx64 ", " >>> + "cap"" 0x%" PRIx64 ",wptr 0x%" PRIx64 ", " >>> + "zcond:%u, [type: %u]\n", >>> + zones[i].start, zones[i].length, zones[i].cap, >>> zones[i].wp, >>> + zones[i].cond, zones[i].type); >>> + } >>> + } >>> + return ret; >>> +} >>> + >>> +static const cmdinfo_t zone_report_cmd = { >>> + .name = "zone_report", >>> + .altname = "zrp", >>> + .cfunc = zone_report_f, >>> + .argmin = 2, >>> + .argmax = 2, >>> + .args = "offset number", >>> + .oneline = "report zone information", >>> +}; >>> + >>> +static int zone_open_f(BlockBackend *blk, int argc, char **argv) >>> +{ >>> + int ret; >>> + int64_t offset, len; >>> + ++optind; >>> + offset = cvtnum(argv[optind]); >>> + ++optind; >>> + len = cvtnum(argv[optind]); >>> + ret = blk_zone_mgmt(blk, BLK_ZO_OPEN, offset, len); >>> + if (ret < 0) { >>> + printf("zone open failed: %s\n", strerror(-ret)); >>> + } >>> + return ret; >>> +} >>> + >>> +static const cmdinfo_t zone_open_cmd = { >>> + .name = "zone_open", >>> + .altname = "zo", >>> + .cfunc = zone_open_f, >>> + .argmin = 2, >>> + .argmax = 2, >>> + .args = "offset len", >>> + .oneline = "explicit open a range of zones in zone block device", >>> +}; >>> + >>> +static int zone_close_f(BlockBackend *blk, int argc, char **argv) >>> +{ >>> + int ret; >>> + int64_t offset, len; >>> + ++optind; >>> + offset = cvtnum(argv[optind]); >>> + ++optind; >>> + len = cvtnum(argv[optind]); >>> + ret = blk_zone_mgmt(blk, BLK_ZO_CLOSE, offset, len); >>> + if (ret < 0) { >>> + printf("zone close failed: %s\n", strerror(-ret)); >>> + } >>> + return ret; >>> +} >>> + >>> +static const cmdinfo_t zone_close_cmd = { >>> + .name = "zone_close", >>> + .altname = "zc", >>> + .cfunc = zone_close_f, >>> + .argmin = 2, >>> + .argmax = 2, >>> + .args = "offset len", >>> + .oneline = "close a range of zones in zone block device", >>> +}; >>> + >>> +static int zone_finish_f(BlockBackend *blk, int argc, char **argv) 
>>> +{ >>> + int ret; >>> + int64_t offset, len; >>> + ++optind; >>> + offset = cvtnum(argv[optind]); >>> + ++optind; >>> + len = cvtnum(argv[optind]); >>> + ret = blk_zone_mgmt(blk, BLK_ZO_FINISH, offset, len); >>> + if (ret < 0) { >>> + printf("zone finish failed: %s\n", strerror(-ret)); >>> + } >>> + return ret; >>> +} >>> + >>> +static const cmdinfo_t zone_finish_cmd = { >>> + .name = "zone_finish", >>> + .altname = "zf", >>> + .cfunc = zone_finish_f, >>> + .argmin = 2, >>> + .argmax = 2, >>> + .args = "offset len", >>> + .oneline = "finish a range of zones in zone block device", >>> +}; >>> + >>> +static int zone_reset_f(BlockBackend *blk, int argc, char **argv) >>> +{ >>> + int ret; >>> + int64_t offset, len; >>> + ++optind; >>> + offset = cvtnum(argv[optind]); >>> + ++optind; >>> + len = cvtnum(argv[optind]); >>> + ret = blk_zone_mgmt(blk, BLK_ZO_RESET, offset, len); >>> + if (ret < 0) { >>> + printf("zone reset failed: %s\n", strerror(-ret)); >>> + } >>> + return ret; >>> +} >>> + >>> +static const cmdinfo_t zone_reset_cmd = { >>> + .name = "zone_reset", >>> + .altname = "zrs", >>> + .cfunc = zone_reset_f, >>> + .argmin = 2, >>> + .argmax = 2, >>> + .args = "offset len", >>> + .oneline = "reset a zone write pointer in zone block device", >>> +}; >>> + >>> static int truncate_f(BlockBackend *blk, int argc, char **argv); >>> static const cmdinfo_t truncate_cmd = { >>> .name = "truncate", >>> @@ -2504,6 +2642,11 @@ static void __attribute((constructor)) >>> init_qemuio_commands(void) >>> qemuio_add_command(&aio_write_cmd); >>> qemuio_add_command(&aio_flush_cmd); >>> qemuio_add_command(&flush_cmd); >>> + qemuio_add_command(&zone_report_cmd); >>> + qemuio_add_command(&zone_open_cmd); >>> + qemuio_add_command(&zone_close_cmd); >>> + qemuio_add_command(&zone_finish_cmd); >>> + qemuio_add_command(&zone_reset_cmd); >>> qemuio_add_command(&truncate_cmd); >>> qemuio_add_command(&length_cmd); >>> qemuio_add_command(&info_cmd); >>> -- >>> 2.37.1 >>> -- Damien Le Moal 
Western Digital Research