On Sun, May 10, 2026 at 07:50:58PM +0200, Sam Li wrote: > By adding zone operations and zoned metadata, the zoned emulation > capability enables full emulation support of zoned device using > a qcow2 file. The zoned device metadata includes zone type, > zoned device state and write pointer of each zone, which is stored > to an array of unsigned integers. > > Each zone of a zoned device makes state transitions following > the zone state machine. The zone state machine mainly describes > five states, IMPLICIT OPEN, EXPLICIT OPEN, FULL, EMPTY and CLOSED. > READ ONLY and OFFLINE states will generally be affected by device > internal events. The operations on zones cause corresponding state > changing. > > Zoned devices have limits on zone resources, which put constraints on > write operations on zones. It is managed by active zone queues > following LRU policy. > > Signed-off-by: Sam Li <[email protected]> > --- > block/qcow2.c | 851 +++++++++++++++++++++++++++++++++++++++++- > block/qcow2.h | 13 +- > block/trace-events | 2 + > hw/block/virtio-blk.c | 22 +- > include/qemu/queue.h | 1 + > include/qemu/range.h | 4 + > 6 files changed, 884 insertions(+), 9 deletions(-)
Here are is the feedback I have so far:
>
> diff --git a/block/qcow2.c b/block/qcow2.c
> index b543bcf3e3..c84b67976e 100644
> --- a/block/qcow2.c
> +++ b/block/qcow2.c
> @@ -195,6 +195,293 @@ qcow2_extract_crypto_opts(QemuOpts *opts, const char
> *fmt, Error **errp)
> return cryptoopts_qdict;
> }
>
> +#define QCOW2_ZT_IS_CONV(wp) (wp & 1ULL << 59)
Patch 1 does not document in the file format specification that write
pointer bit 59 indicates a conventional zone.
> +#define QCOW2_GET_WP(wp) ((wp << 5) >> 5)
> +
> +/*
> + * To emulate a real zoned device, closed, empty and full states are
> + * preserved after a power cycle. The open states are in-memory and will
> + * be lost after closing the device. Read-only and offline states are
> + * device-internal events, which are not considered for simplicity.
> + */
> +static inline BlockZoneState qcow2_get_zone_state(BlockDriverState *bs,
> + uint32_t index)
> +{
> + BDRVQcow2State *s = bs->opaque;
> + Qcow2ZoneListEntry *zone_entry = &s->zone_list_entries[index];
> + uint64_t zone_wp = bs->wps->wp[index];
> + uint64_t zone_start;
> +
> + if (QCOW2_ZT_IS_CONV(zone_wp)) {
> + return BLK_ZS_NOT_WP;
> + }
> +
> + if (QTAILQ_IN_USE(zone_entry, exp_open_zone_entry)) {
> + return BLK_ZS_EOPEN;
> + }
> + if (QTAILQ_IN_USE(zone_entry, imp_open_zone_entry)) {
> + return BLK_ZS_IOPEN;
> + }
> +
> + zone_start = index * bs->bl.zone_size;
> + if (zone_wp == zone_start) {
> + return BLK_ZS_EMPTY;
> + }
> + if (zone_wp >= zone_start + bs->bl.zone_capacity) {
> + return BLK_ZS_FULL;
> + }
> + if (zone_wp > zone_start) {
> + if (!QTAILQ_IN_USE(zone_entry, closed_zone_entry)) {
> + /*
> + * The number of closed zones is not always updated in time when
> + * the device is closed. However, it only matters when doing
> + * zone report. Refresh the count and list of closed zones to
> + * provide correct zone states for zone report.
> + */
> + QTAILQ_INSERT_HEAD(&s->closed_zones, zone_entry,
> closed_zone_entry);
> + s->nr_zones_closed++;
> + }
> + return BLK_ZS_CLOSED;
> + }
> + return BLK_ZS_NOT_WP;
> +}
> +
> +static void qcow2_rm_exp_open_zone(BDRVQcow2State *s,
> + uint32_t index)
> +{
> + Qcow2ZoneListEntry *zone_entry = &s->zone_list_entries[index];
> +
> + QTAILQ_REMOVE(&s->exp_open_zones, zone_entry, exp_open_zone_entry);
> + s->nr_zones_exp_open--;
> +}
> +
> +static void qcow2_rm_imp_open_zone(BDRVQcow2State *s,
> + int32_t index)
> +{
> + Qcow2ZoneListEntry *zone_entry;
> + if (index < 0) {
> + /* Apply LRU when the index is not specified. */
> + zone_entry = QTAILQ_LAST(&s->imp_open_zones);
> + } else {
> + zone_entry = &s->zone_list_entries[index];
> + }
> +
> + QTAILQ_REMOVE(&s->imp_open_zones, zone_entry, imp_open_zone_entry);
> + s->nr_zones_imp_open--;
> +}
> +
> +static void qcow2_rm_open_zone(BDRVQcow2State *s,
> + uint32_t index)
> +{
> + Qcow2ZoneListEntry *zone_entry = &s->zone_list_entries[index];
> +
> + if (QTAILQ_IN_USE(zone_entry, exp_open_zone_entry)) {
> + qcow2_rm_exp_open_zone(s, index);
> + } else if (QTAILQ_IN_USE(zone_entry, imp_open_zone_entry)) {
> + qcow2_rm_imp_open_zone(s, index);
> + }
> +}
> +
> +static void qcow2_rm_closed_zone(BDRVQcow2State *s,
> + uint32_t index)
> +{
> + Qcow2ZoneListEntry *zone_entry = &s->zone_list_entries[index];
> +
> + QTAILQ_REMOVE(&s->closed_zones, zone_entry, closed_zone_entry);
> + s->nr_zones_closed--;
> +}
> +
> +static void qcow2_do_imp_open_zone(BDRVQcow2State *s,
> + uint32_t index,
> + BlockZoneState zs)
> +{
> + Qcow2ZoneListEntry *zone_entry = &s->zone_list_entries[index];
> +
> + switch (zs) {
> + case BLK_ZS_EMPTY:
> + break;
> + case BLK_ZS_CLOSED:
> + qcow2_rm_closed_zone(s, index);
> + break;
> + case BLK_ZS_IOPEN:
> + /*
> + * The LRU policy: update the zone that is most recently
> + * used to the head of the zone list
> + */
> + if (zone_entry == QTAILQ_FIRST(&s->imp_open_zones)) {
> + return;
> + }
> + QTAILQ_REMOVE(&s->imp_open_zones, zone_entry, imp_open_zone_entry);
> + s->nr_zones_imp_open--;
> + break;
> + default:
> + return;
> + }
> +
> + QTAILQ_INSERT_HEAD(&s->imp_open_zones, zone_entry, imp_open_zone_entry);
> + s->nr_zones_imp_open++;
> +}
> +
> +static void qcow2_do_exp_open_zone(BDRVQcow2State *s,
> + uint32_t index)
> +{
> + Qcow2ZoneListEntry *zone_entry = &s->zone_list_entries[index];
> +
> + QTAILQ_INSERT_HEAD(&s->exp_open_zones, zone_entry, exp_open_zone_entry);
> + s->nr_zones_exp_open++;
> +}
> +
> +/*
> + * The list of zones is managed using an LRU policy: the last
> + * zone of the list is always the one that was least recently used
> + * for writing and is chosen as the zone to close to be able to
> + * implicitly open another zone.
> + *
> + * We can only close the open zones. The index is not specified
> + * when it is less than 0.
> + */
> +static void qcow2_do_close_zone(BlockDriverState *bs,
> + int32_t index,
> + BlockZoneState zs)
> +{
> + BDRVQcow2State *s = bs->opaque;
> + Qcow2ZoneListEntry *zone_entry;
> +
> + if (index >= 0) {
> + zone_entry = &s->zone_list_entries[index];
> + } else {
> + /* before removal of the last implicitly open zone */
> + zone_entry = QTAILQ_LAST(&s->imp_open_zones);
> + }
> +
> + if (zs == BLK_ZS_IOPEN) {
> + qcow2_rm_imp_open_zone(s, index);
> + goto close_zone;
> + }
> +
> + if (index >= 0 && zs == BLK_ZS_EOPEN) {
> + qcow2_rm_exp_open_zone(s, index);
> + /*
> + * The zone state changes when the zone is removed from the list of
> + * open zones (explicitly open -> empty). The closed zone list is
> + * refreshed during get_zone_state().
> + */
> + qcow2_get_zone_state(bs, index);
> + }
> + return;
> +
> +close_zone:
> + QTAILQ_INSERT_HEAD(&s->closed_zones, zone_entry, closed_zone_entry);
> + s->nr_zones_closed++;
> +}
> +
> +/*
> + * Read/Write the new wp value to the dedicated location of the image file.
> + */
> +static int coroutine_fn GRAPH_RDLOCK
> +qcow2_rw_wp_at(BlockDriverState *bs, uint64_t *wp,
> + int32_t index, bool is_write) {
> + BDRVQcow2State *s = bs->opaque;
> + g_autofree uint64_t *temp = NULL;
> + uint64_t wpv = *wp;
> + int ret;
> +
> + if (is_write) {
> + ret = bdrv_pwrite(bs->file, s->zoned_header.zonedmeta_offset
> + + sizeof(uint64_t) * index, sizeof(uint64_t), wp,
> 0);
> + if (ret < 0) {
> + *wp = wpv;
> + goto exit;
> + }
> + } else {
> + temp = g_new(uint64_t, 1);
> + ret = bdrv_pread(bs->file, s->zoned_header.zonedmeta_offset
> + + sizeof(uint64_t) * index, sizeof(uint64_t), temp,
> 0);
> + if (ret < 0) {
> + goto exit;
> + }
> +
> + *wp = *temp;
> + }
> +
> + trace_qcow2_wp_tracking(index, *wp >> BDRV_SECTOR_BITS);
> + return ret;
> +
> +exit:
> + error_report("Failed to %s metadata to file", is_write ? "write" :
> "read");
> + return ret;
> +}
> +
> +static bool qcow2_can_activate_zone(BlockDriverState *bs)
> +{
> + BDRVQcow2State *s = bs->opaque;
> +
> + /* When the max active zone is zero, there is no limit on active zones */
> + if (!s->zoned_header.max_active_zones) {
> + return true;
> + }
> +
> + /* Active zones are zones that are open or closed */
> + return s->nr_zones_exp_open + s->nr_zones_imp_open + s->nr_zones_closed
> + < s->zoned_header.max_active_zones;
> +}
> +
> +/*
> + * This function manages open zones under active zones limit. It checks
> + * if a zone can transition to open state while maintaining max open and
> + * active zone limits.
> + */
> +static bool qcow2_can_open_zone(BlockDriverState *bs)
> +{
> + BDRVQcow2State *s = bs->opaque;
> +
> + /* When the max open zone is zero, there is no limit on open zones */
> + if (!s->zoned_header.max_open_zones) {
> + return true;
> + }
> +
> + /*
> + * The open zones are zones with the states of explicitly and
> + * implicitly open.
> + */
> + if (s->nr_zones_imp_open + s->nr_zones_exp_open <
> + s->zoned_header.max_open_zones) {
> + return true;
> + }
> +
> + /*
> + * Zones are managed one at a time. Thus, the number of implicitly open
> + * zone can never be over the open zone limit. When the active zone limit
> + * is not reached, close only one implicitly open zone.
> + */
> + if (qcow2_can_activate_zone(bs)) {
> + qcow2_do_close_zone(bs, -1, BLK_ZS_IOPEN);
> + trace_qcow2_imp_open_zones(0x23, s->nr_zones_imp_open);
> + return true;
> + }
> + return false;
> +}
> +
> +static inline int coroutine_fn GRAPH_RDLOCK
> +qcow2_refresh_zonedmeta(BlockDriverState *bs)
> +{
> + int ret;
> + BDRVQcow2State *s = bs->opaque;
> + uint64_t wps_size = s->zoned_header.zonedmeta_size;
> + g_autofree uint64_t *temp = NULL;
> +
> + temp = g_new(uint64_t, s->zoned_header.nr_zones);
> + ret = bdrv_pread(bs->file, s->zoned_header.zonedmeta_offset,
> + wps_size, temp, 0);
> + if (ret < 0) {
> + error_report("Cannot read metadata");
> + return ret;
> + }
> +
> + memcpy(bs->wps->wp, temp, wps_size);
> + return 0;
> +}
> +
> /*
> * Passing by the zoned device configurations by a zoned_header struct, check
> * if the zone device options are under constraints. Return false when some
> @@ -527,7 +814,23 @@ qcow2_read_extensions(BlockDriverState *bs, uint64_t
> start_offset,
> be32_to_cpu(zoned_ext.max_active_zones);
> zoned_ext.max_append_bytes =
> be32_to_cpu(zoned_ext.max_append_bytes);
> + zoned_ext.zonedmeta_offset =
> + be64_to_cpu(zoned_ext.zonedmeta_offset);
> + zoned_ext.zonedmeta_size = be64_to_cpu(zoned_ext.zonedmeta_size);
> s->zoned_header = zoned_ext;
> + bs->wps = g_malloc(sizeof(BlockZoneWps)
> + + s->zoned_header.zonedmeta_size);
> + ret = qcow2_refresh_zonedmeta(bs);
> + if (ret < 0) {
> + return ret;
> + }
> +
> + s->zone_list_entries = g_new0(Qcow2ZoneListEntry,
> + zoned_ext.nr_zones);
> + QTAILQ_INIT(&s->exp_open_zones);
> + QTAILQ_INIT(&s->imp_open_zones);
> + QTAILQ_INIT(&s->closed_zones);
> + qemu_co_mutex_init(&bs->wps->colock);
>
> /* refuse to open broken images */
> if (zoned_ext.nr_zones != DIV_ROUND_UP(bs->total_sectors *
> @@ -2883,21 +3186,119 @@ static coroutine_fn GRAPH_RDLOCK int
> qcow2_co_pwritev_task_entry(AioTask *task)
> t->l2meta);
> }
>
> +/*
> + * If it is an append write request, the offset pointer needs to be updated
> to
> + * the wp value of that zone after the IO completion. The unique pointer is
> + * passed on to this function to prevent the value being changed in
> condition of
> + * multiple concurrent writes.
> + */
> static int coroutine_fn GRAPH_RDLOCK
> -qcow2_co_pwritev_part(BlockDriverState *bs, int64_t offset, int64_t bytes,
> - QEMUIOVector *qiov, size_t qiov_offset,
> - BdrvRequestFlags flags)
> +qcow2_co_pwv_part(BlockDriverState *bs, int64_t *offset_ptr, int64_t bytes,
> + QEMUIOVector *qiov, size_t qiov_offset, bool is_append,
> + BdrvRequestFlags flags)
> {
> BDRVQcow2State *s = bs->opaque;
> int offset_in_cluster;
> int ret;
> unsigned int cur_bytes; /* number of sectors in current iteration */
> uint64_t host_offset;
> + int64_t offset = *offset_ptr;
> QCowL2Meta *l2meta = NULL;
> AioTaskPool *aio = NULL;
> + int64_t start_offset, start_bytes;
> + BlockZoneState zs;
> + int64_t end_zone, end_offset;
> + uint64_t *wp;
> + int64_t zone_size = bs->bl.zone_size;
> + int64_t zone_capacity = bs->bl.zone_capacity;
> + int index;
>
> trace_qcow2_writev_start_req(qemu_coroutine_self(), offset, bytes);
>
> + start_offset = offset;
> + start_bytes = bytes;
> + if (bs->bl.zoned == BLK_Z_HM) {
> + index = start_offset / zone_size;
> + wp = &bs->wps->wp[index];
> + if (!QCOW2_ZT_IS_CONV(*wp)) {
> + if (offset != *wp && !is_append) {
> + /* The write offset must be equal to the zone write pointer
> */
> + error_report("Offset 0x%" PRIx64 " of regular writes must be
> "
> + "equal to the zone write pointer 0x%" PRIx64 "",
> + offset, *wp);
> + return -EINVAL;
> + }
> +
> + if (is_append) {
> + /*
> + * The offset of append write is the write pointer value of
> + * that zone.
> + */
> + start_offset = *wp;
> + }
> +
> + end_offset = start_offset + start_bytes;
> +
> + /* Only allow writes when there are zone resources left */
> + zs = qcow2_get_zone_state(bs, index);
> + if (zs == BLK_ZS_CLOSED || zs == BLK_ZS_EMPTY) {
> + if (!qcow2_can_open_zone(bs)) {
> + error_report("no more open zones available");
> + return -EINVAL;
> + }
> + }
> +
> + /*
> + * Align up (start_offset, zone_size), the start offset is not
> + * necessarily power of two.
> + */
> + end_zone = index * zone_size + zone_capacity;
> + /* Write cannot exceed the zone capacity. */
> + if (end_offset > end_zone) {
> + error_report("write exceeds zone capacity with end_offset:"
> + "0x%lx, end_zone: 0x%lx",
> + end_offset / 512, end_zone / 512);
> + return -EINVAL;
> + }
> +
> + /*
> + * Real drives change states before it can write to the zone. If
> + * the write fails, the zone state may have changed.
> + *
> + * The zone state transitions to implicit open when the original
> + * state is empty or closed. When the wp reaches the end, the
> + * open states (explicit open, implicit open) become full.
> + */
> + zs = qcow2_get_zone_state(bs, index);
> + if (!(end_offset & (zone_capacity - 1))) {
> + /* Being aligned to zone capacity implies full state */
> + qcow2_rm_open_zone(s, index);
> + trace_qcow2_imp_open_zones(0x24,
> + s->nr_zones_imp_open);
> + } else {
> + qcow2_do_imp_open_zone(s, index, zs);
> + trace_qcow2_imp_open_zones(0x24,
> + s->nr_zones_imp_open);
> + }
> +
> + /*
> + * The write pointer is update before IO completion, with the
> + * assumption that the write IO will succeed.
> + */
> + qemu_co_mutex_lock(&bs->wps->colock);
> + if (is_append) {
> + *offset_ptr = *wp;
> + }
> + *wp = end_offset;
> + ret = qcow2_rw_wp_at(bs, wp, index, true);
> + qemu_co_mutex_unlock(&bs->wps->colock);
> + if (ret < 0) {
> + error_report("failed to update write pointer");
> + return -EINVAL;
> + }
> + }
> + }
> +
> while (bytes != 0 && aio_task_pool_status(aio) == 0) {
>
> l2meta = NULL;
> @@ -2943,6 +3344,7 @@ qcow2_co_pwritev_part(BlockDriverState *bs, int64_t
> offset, int64_t bytes,
> qiov_offset += cur_bytes;
> trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_bytes);
> }
> +
> ret = 0;
>
> qemu_co_mutex_lock(&s->lock);
> @@ -2961,11 +3363,32 @@ fail_nometa:
> g_free(aio);
> }
>
> + if (ret < 0 && bs->bl.zoned == BLK_Z_HM) {
> + /* update the wp when write IO failed */
> + qemu_co_mutex_lock(&bs->wps->colock);
> + index = start_offset / zone_size;
> + wp = &bs->wps->wp[index];
> + if (!QCOW2_ZT_IS_CONV(*wp)) {
> + ret = qcow2_rw_wp_at(bs, wp, index, false);
> + }
> + qemu_co_mutex_unlock(&bs->wps->colock);
> + }
> +
> trace_qcow2_writev_done_req(qemu_coroutine_self(), ret);
>
> return ret;
> }
>
> +static int coroutine_fn GRAPH_RDLOCK
> +qcow2_co_pwritev_part(BlockDriverState *bs, int64_t offset, int64_t bytes,
> + QEMUIOVector *qiov, size_t qiov_offset,
> + BdrvRequestFlags flags)
> +{
> + return qcow2_co_pwv_part(bs, &offset, bytes, qiov, qiov_offset, false,
> + flags);
> +}
> +
> +
> static int GRAPH_RDLOCK qcow2_inactivate(BlockDriverState *bs)
> {
> BDRVQcow2State *s = bs->opaque;
> @@ -3001,6 +3424,25 @@ static int GRAPH_RDLOCK
> qcow2_inactivate(BlockDriverState *bs)
> return result;
> }
>
> +static void qcow2_do_close_all_zone(BDRVQcow2State *s)
> +{
> + Qcow2ZoneListEntry *zone_entry, *next;
> +
> + QTAILQ_FOREACH_SAFE(zone_entry, &s->imp_open_zones, imp_open_zone_entry,
> + next) {
> + QTAILQ_REMOVE(&s->imp_open_zones, zone_entry, imp_open_zone_entry);
> + s->nr_zones_imp_open--;
> + }
> +
> + QTAILQ_FOREACH_SAFE(zone_entry, &s->exp_open_zones, exp_open_zone_entry,
> + next) {
> + QTAILQ_REMOVE(&s->exp_open_zones, zone_entry, exp_open_zone_entry);
> + s->nr_zones_exp_open--;
> + }
> +
> + assert(s->nr_zones_imp_open + s->nr_zones_exp_open == 0);
> +}
> +
> static void coroutine_mixed_fn GRAPH_RDLOCK
> qcow2_do_close(BlockDriverState *bs, bool close_data_file)
> {
> @@ -3040,6 +3482,8 @@ qcow2_do_close(BlockDriverState *bs, bool
> close_data_file)
>
> qcow2_refcount_close(bs);
> qcow2_free_snapshots(bs);
> + qcow2_do_close_all_zone(s);
> + g_free(bs->wps);
> }
>
> static void GRAPH_UNLOCKED qcow2_close(BlockDriverState *bs)
> @@ -3357,7 +3801,10 @@ int qcow2_update_header(BlockDriverState *bs)
> .max_active_zones =
> cpu_to_be32(s->zoned_header.max_active_zones),
> .max_append_bytes =
> - cpu_to_be32(s->zoned_header.max_append_bytes)
> + cpu_to_be32(s->zoned_header.max_append_bytes),
> + .zonedmeta_offset =
> + cpu_to_be64(s->zoned_header.zonedmeta_offset),
> + .zonedmeta_size =
> cpu_to_be64(s->zoned_header.zonedmeta_size),
> };
> ret = header_ext_add(buf, QCOW2_EXT_MAGIC_ZONED_FORMAT,
> &zoned_header, sizeof(zoned_header),
> @@ -3766,7 +4213,8 @@ qcow2_co_create(BlockdevCreateOptions *create_options,
> Error **errp)
> int version;
> int refcount_order;
> uint64_t *refcount_table;
> - int ret;
> + uint64_t zoned_meta_size, zoned_clusterlen;
> + int ret, offset, i;
> uint8_t compression_type = QCOW2_COMPRESSION_TYPE_ZLIB;
>
> assert(create_options->driver == BLOCKDEV_DRIVER_QCOW2);
> @@ -4113,6 +4561,42 @@ qcow2_co_create(BlockdevCreateOptions *create_options,
> Error **errp)
> ret = -EINVAL;
> goto unlock;
> }
> +
> + uint32_t nrz = s->zoned_header.nr_zones;
> + zoned_meta_size = sizeof(uint64_t) * nrz;
> + g_autofree uint64_t *meta = NULL;
> + meta = g_new0(uint64_t, nrz);
> +
> + for (i = 0; i < s->zoned_header.conventional_zones; ++i) {
> + meta[i] = i * s->zoned_header.zone_size;
> + meta[i] |= 1ULL << 59;
> + }
> +
> + for (; i < nrz; ++i) {
> + meta[i] = i * s->zoned_header.zone_size;
> + }
> +
> + offset = qcow2_alloc_clusters(blk_bs(blk), zoned_meta_size);
> + if (offset < 0) {
> + ret = offset;
> + error_setg_errno(errp, -ret, "Could not allocate clusters "
> + "for zoned metadata size");
> + goto unlock;
> + }
> + s->zoned_header.zonedmeta_offset = offset;
> + s->zoned_header.zonedmeta_size = zoned_meta_size;
> +
> + zoned_clusterlen = size_to_clusters(s, zoned_meta_size)
> + * s->cluster_size;
> + ret = qcow2_pre_write_overlap_check(blk_bs(blk), 0, offset,
> + zoned_clusterlen, false);
> + assert(ret == 0);
> + ret = bdrv_pwrite(blk_bs(blk)->file, offset, zoned_meta_size, meta,
> 0);
> + if (ret < 0) {
> + error_setg_errno(errp, -ret, "Could not write zoned metadata "
> + "to disk");
> + goto unlock;
> + }
> } else {
> s->zoned_header.zoned = BLK_Z_NONE;
> }
> @@ -4512,6 +4996,359 @@ qcow2_co_pdiscard(BlockDriverState *bs, int64_t
> offset, int64_t bytes)
> return ret;
> }
>
> +static int coroutine_fn
> +qcow2_co_zone_report(BlockDriverState *bs, int64_t offset,
> + unsigned int *nr_zones, BlockZoneDescriptor *zones)
> +{
> + BDRVQcow2State *s = bs->opaque;
> + uint64_t zone_size = s->zoned_header.zone_size;
> + int64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS;
> + int64_t size = bs->bl.nr_zones * zone_size;
> + unsigned int nrz;
> + int i = 0;
> + int si;
> +
> + if (offset >= capacity) {
> + error_report("offset %" PRId64 " is equal to or greater than the "
> + "device capacity %" PRId64 "", offset, capacity);
> + return -EINVAL;
> + }
> +
> + nrz = ((*nr_zones) < bs->bl.nr_zones) ? (*nr_zones) : bs->bl.nr_zones;
> + si = offset / zone_size; /* Zone size cannot be 0 for zoned device */
> + qemu_co_mutex_lock(&bs->wps->colock);
> + for (; i < nrz; ++i) {
> + if (i + si >= bs->bl.nr_zones) {
> + break;
> + }
> +
> + zones[i].start = (si + i) * zone_size;
> +
> + /* The last zone can be smaller than the zone size */
> + if ((si + i + 1) == bs->bl.nr_zones && size > capacity) {
> + uint32_t l = zone_size - (size - capacity);
> + zones[i].length = l;
> + zones[i].cap = l;
> + } else {
> + zones[i].length = zone_size;
> + zones[i].cap = zone_size;
> + }
> +
> + uint64_t wp = bs->wps->wp[si + i];
> + if (QCOW2_ZT_IS_CONV(wp)) {
> + zones[i].type = BLK_ZT_CONV;
> + zones[i].state = BLK_ZS_NOT_WP;
> + /* Clear masking bits */
> + wp = QCOW2_GET_WP(wp);
> + } else {
> + zones[i].type = BLK_ZT_SWR;
> + zones[i].state = qcow2_get_zone_state(bs, si + i);
> + }
> + zones[i].wp = wp;
> + }
> + qemu_co_mutex_unlock(&bs->wps->colock);
> + *nr_zones = i;
> + return 0;
> +}
> +
> +static int coroutine_fn GRAPH_RDLOCK
> +qcow2_open_zone(BlockDriverState *bs, uint32_t index) {
> + BDRVQcow2State *s = bs->opaque;
> + int ret;
> +
> + qemu_co_mutex_lock(&bs->wps->colock);
> + BlockZoneState zs = qcow2_get_zone_state(bs, index);
> + trace_qcow2_imp_open_zones(BLK_ZO_OPEN, s->nr_zones_imp_open);
> +
> + switch (zs) {
> + case BLK_ZS_EMPTY:
> + if (!qcow2_can_activate_zone(bs)) {
> + ret = -EBUSY;
> + goto unlock;
> + }
> + break;
> + case BLK_ZS_IOPEN:
> + qcow2_rm_imp_open_zone(s, index);
> + break;
> + case BLK_ZS_EOPEN:
> + return 0;
> + case BLK_ZS_CLOSED:
> + if (!qcow2_can_open_zone(bs)) {
> + ret = -EINVAL;
> + goto unlock;
> + }
> + qcow2_rm_closed_zone(s, index);
> + break;
> + case BLK_ZS_FULL:
> + break;
> + default:
> + ret = -EINVAL;
> + goto unlock;
> + }
> +
> + qcow2_do_exp_open_zone(s, index);
> + ret = 0;
> +
> +unlock:
> + qemu_co_mutex_unlock(&bs->wps->colock);
> + return ret;
> +}
> +
> +static int qcow2_close_zone(BlockDriverState *bs, uint32_t index)
> +{
> + int ret;
> +
> + qemu_co_mutex_lock(&bs->wps->colock);
> + BlockZoneState zs = qcow2_get_zone_state(bs, index);
> +
> + switch (zs) {
> + case BLK_ZS_EMPTY:
> + break;
> + case BLK_ZS_IOPEN:
> + break;
> + case BLK_ZS_EOPEN:
> + break;
> + case BLK_ZS_CLOSED:
> + /* Closing a closed zone is not an error */
> + ret = 0;
> + goto unlock;
> + case BLK_ZS_FULL:
> + break;
> + default:
> + ret = -EINVAL;
> + goto unlock;
> + }
> + qcow2_do_close_zone(bs, index, zs);
> + ret = 0;
> +
> +unlock:
> + qemu_co_mutex_unlock(&bs->wps->colock);
> + return ret;
> +}
> +
> +static int coroutine_fn GRAPH_RDLOCK
> +qcow2_finish_zone(BlockDriverState *bs, uint32_t index) {
> + BDRVQcow2State *s = bs->opaque;
> + int ret;
> +
> + qemu_co_mutex_lock(&bs->wps->colock);
> + uint64_t *wp = &bs->wps->wp[index];
> + BlockZoneState zs = qcow2_get_zone_state(bs, index);
> +
> + switch (zs) {
> + case BLK_ZS_EMPTY:
> + if (!qcow2_can_activate_zone(bs)) {
> + ret = -EBUSY;
> + goto unlock;
> + }
> + break;
> + case BLK_ZS_IOPEN:
> + qcow2_rm_imp_open_zone(s, index);
> + trace_qcow2_imp_open_zones(BLK_ZO_FINISH, s->nr_zones_imp_open);
> + break;
> + case BLK_ZS_EOPEN:
> + qcow2_rm_exp_open_zone(s, index);
> + break;
> + case BLK_ZS_CLOSED:
> + if (!qcow2_can_open_zone(bs)) {
> + ret = -EINVAL;
> + goto unlock;
> + }
> + qcow2_rm_closed_zone(s, index);
> + break;
> + case BLK_ZS_FULL:
> + ret = 0;
> + goto unlock;
> + default:
> + ret = -EINVAL;
> + goto unlock;
> + }
> +
> + *wp = ((uint64_t)index + 1) * s->zoned_header.zone_size;
> + ret = qcow2_rw_wp_at(bs, wp, index, true);
> +
> +unlock:
> + qemu_co_mutex_unlock(&bs->wps->colock);
> + return ret;
> +}
> +
> +static int coroutine_fn GRAPH_RDLOCK
> +qcow2_reset_zone(BlockDriverState *bs, uint32_t index,
> + int64_t len) {
> + BDRVQcow2State *s = bs->opaque;
> + int nrz = bs->bl.nr_zones;
> + int zone_size = bs->bl.zone_size;
> + int n, ret = 0;
> +
> + qemu_co_mutex_lock(&bs->wps->colock);
> + uint64_t *wp = &bs->wps->wp[index];
> + if (len == bs->total_sectors << BDRV_SECTOR_BITS) {
> + n = nrz;
> + index = 0;
> + } else {
> + n = len / zone_size;
> + }
> +
> + for (int i = 0; i < n; ++i) {
> + uint64_t *wp_i = (uint64_t *)(wp + i);
> + uint64_t wpi_v = *wp_i;
> + if (QCOW2_ZT_IS_CONV(wpi_v)) {
> + continue;
> + }
> +
> + BlockZoneState zs = qcow2_get_zone_state(bs, index + i);
> + switch (zs) {
> + case BLK_ZS_EMPTY:
> + break;
> + case BLK_ZS_IOPEN:
> + qcow2_rm_imp_open_zone(s, index + i);
> + trace_qcow2_imp_open_zones(BLK_ZO_RESET, s->nr_zones_imp_open);
> + break;
> + case BLK_ZS_EOPEN:
> + qcow2_rm_exp_open_zone(s, index + i);
> + break;
> + case BLK_ZS_CLOSED:
> + qcow2_rm_closed_zone(s, index + i);
> + break;
> + case BLK_ZS_FULL:
> + break;
> + default:
> + ret = -EINVAL;
> + goto unlock;
> + }
> +
> + if (zs == BLK_ZS_EMPTY) {
> + continue;
> + }
> +
> + *wp_i = ((uint64_t)index + i) * zone_size;
> + ret = qcow2_rw_wp_at(bs, wp_i, index + i, true);
> + if (ret < 0) {
> + goto unlock;
> + }
> + /* clear data */
> + ret = qcow2_co_pwrite_zeroes(bs, *wp_i, zone_size, 0);
> + if (ret < 0) {
> + error_report("Failed to reset zone at 0x%" PRIx64 "", *wp_i);
> + }
> + }
> +
> +unlock:
> + qemu_co_mutex_unlock(&bs->wps->colock);
> + return ret;
> +}
> +
> +static int coroutine_fn GRAPH_RDLOCK
> +qcow2_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
> + int64_t offset, int64_t len)
> +{
> + BDRVQcow2State *s = bs->opaque;
> + int ret = 0;
> + int64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS;
> + int64_t zone_size = s->zoned_header.zone_size;
> + int64_t zone_size_mask = zone_size - 1;
> + uint32_t index = offset / zone_size;
> + BlockZoneWps *wps = bs->wps;
> +
> + if (offset >= capacity) {
> + error_report("offset %" PRId64 " is equal to or greater than the"
> + "device capacity %" PRId64 "", offset, capacity);
> + return -EINVAL;
> + }
> +
> + if (offset & zone_size_mask) {
> + error_report("sector offset %" PRId64 " is not aligned to zone size"
> + " %" PRId64 "", offset / 512, zone_size / 512);
> + return -EINVAL;
> + }
> +
> + if (((offset + len) < capacity && len & zone_size_mask) ||
> + offset + len > capacity) {
> + error_report("number of sectors %" PRId64 " is not aligned to zone"
> + " size %" PRId64 "", len / 512, zone_size / 512);
> + return -EINVAL;
> + }
> +
> + qemu_co_mutex_lock(&wps->colock);
> + uint64_t wpv = wps->wp[index];
Do we need to check that index < nr_zones?
> + if (QCOW2_ZT_IS_CONV(wpv) && len != capacity) {
Does the len != capacity check cover all cases or is it just for
BLK_ZO_RESET? For example, when BLK_ZO_OPEN is called with len ==
capacity it seems possible to perform an open operation on the zone?
> + error_report("zone mgmt operations are not allowed for "
> + "conventional zones");
> + ret = -EIO;
> + goto unlock;
> + }
> + qemu_co_mutex_unlock(&wps->colock);
> +
> + switch (op) {
> + case BLK_ZO_OPEN:
> + ret = qcow2_open_zone(bs, index);
> + break;
> + case BLK_ZO_CLOSE:
> + ret = qcow2_close_zone(bs, index);
> + break;
> + case BLK_ZO_FINISH:
> + ret = qcow2_finish_zone(bs, index);
> + break;
> + case BLK_ZO_RESET:
> + ret = qcow2_reset_zone(bs, index, len);
> + break;
> + default:
> + error_report("Unsupported zone op: 0x%x", op);
> + ret = -ENOTSUP;
> + break;
> + }
> + return ret;
> +
> +unlock:
> + qemu_co_mutex_unlock(&wps->colock);
> + return ret;
> +}
> +
> +static int coroutine_fn GRAPH_RDLOCK
> +qcow2_co_zone_append(BlockDriverState *bs, int64_t *offset, QEMUIOVector
> *qiov,
> + BdrvRequestFlags flags)
> +{
> + assert(flags == 0);
> + int64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS;
> + int64_t zone_size_mask = bs->bl.zone_size - 1;
> + int64_t iov_len = 0;
> + int64_t len = 0;
> +
> + if (*offset >= capacity) {
> + error_report("*offset %" PRId64 " is equal to or greater than the"
> + "device capacity %" PRId64 "", *offset, capacity);
> + return -EINVAL;
> + }
> +
> + /* offset + len should not pass the end of that zone starting from
> offset */
> + if (*offset & zone_size_mask) {
> + error_report("sector offset %" PRId64 " is not aligned to zone size "
> + "%" PRId64 "", *offset / 512, bs->bl.zone_size / 512);
> + return -EINVAL;
> + }
> +
> + int64_t wg = bs->bl.write_granularity;
> + int64_t wg_mask = wg - 1;
> + for (int i = 0; i < qiov->niov; i++) {
> + iov_len = qiov->iov[i].iov_len;
> + if (iov_len & wg_mask) {
> + error_report("len of IOVector[%d] 0x%" PRIx64 " is not aligned
> to "
> + "block size 0x%" PRIx64 "", i, iov_len, wg);
> + return -EINVAL;
> + }
> + }
> + len = qiov->size;
> +
> + if ((len >> BDRV_SECTOR_BITS) > bs->bl.max_append_sectors) {
> + error_report("len 0x%" PRIx64 " in sectors is greater than "
> + "max_append_sectors 0x%" PRIx32 "",
> + len >> BDRV_SECTOR_BITS, bs->bl.max_append_sectors);
> + return -EINVAL;
> + }
> +
> + return qcow2_co_pwv_part(bs, offset, len, qiov, 0, true, 0);
> +}
> +
> static int coroutine_fn GRAPH_RDLOCK
> qcow2_co_copy_range_from(BlockDriverState *bs,
> BdrvChild *src, int64_t src_offset,
> @@ -6578,6 +7415,10 @@ BlockDriver bdrv_qcow2 = {
> .bdrv_co_pwritev_compressed_part = qcow2_co_pwritev_compressed_part,
> .bdrv_make_empty = qcow2_make_empty,
>
> + .bdrv_co_zone_report = qcow2_co_zone_report,
> + .bdrv_co_zone_mgmt = qcow2_co_zone_mgmt,
This struct seems to align the equals signs so it's nicer to look at.
Please align this equals sign for consistency.
> + .bdrv_co_zone_append = qcow2_co_zone_append,
> +
> .bdrv_snapshot_create = qcow2_snapshot_create,
> .bdrv_snapshot_goto = qcow2_snapshot_goto,
> .bdrv_snapshot_delete = qcow2_snapshot_delete,
> diff --git a/block/qcow2.h b/block/qcow2.h
> index 743f2e3e79..4f72b8d62a 100644
> --- a/block/qcow2.h
> +++ b/block/qcow2.h
> @@ -256,9 +256,9 @@ typedef struct Qcow2ZonedHeaderExtension {
> } QEMU_PACKED Qcow2ZonedHeaderExtension;
>
> typedef struct Qcow2ZoneListEntry {
> - QLIST_ENTRY(Qcow2ZoneListEntry) exp_open_zone_entry;
> - QLIST_ENTRY(Qcow2ZoneListEntry) imp_open_zone_entry;
> - QLIST_ENTRY(Qcow2ZoneListEntry) closed_zone_entry;
> + QTAILQ_ENTRY(Qcow2ZoneListEntry) exp_open_zone_entry;
> + QTAILQ_ENTRY(Qcow2ZoneListEntry) imp_open_zone_entry;
> + QTAILQ_ENTRY(Qcow2ZoneListEntry) closed_zone_entry;
> } Qcow2ZoneListEntry;
Please squash this into the patch that introduced Qcow2ZoneListEntry or,
better yet, remove it from that patch and defer the definition of this
struct until the current patch.
>
> typedef struct Qcow2UnknownHeaderExtension {
> @@ -456,6 +456,13 @@ typedef struct BDRVQcow2State {
>
> /* States of zoned device */
> Qcow2ZonedHeaderExtension zoned_header;
> + QTAILQ_HEAD(, Qcow2ZoneListEntry) exp_open_zones;
> + QTAILQ_HEAD(, Qcow2ZoneListEntry) imp_open_zones;
> + QTAILQ_HEAD(, Qcow2ZoneListEntry) closed_zones;
> + Qcow2ZoneListEntry *zone_list_entries;
> + uint32_t nr_zones_exp_open;
> + uint32_t nr_zones_imp_open;
> + uint32_t nr_zones_closed;
> } BDRVQcow2State;
>
> typedef struct Qcow2COWRegion {
> diff --git a/block/trace-events b/block/trace-events
> index 950c82d4b8..30a3e303ca 100644
> --- a/block/trace-events
> +++ b/block/trace-events
> @@ -76,6 +76,8 @@ qcow2_writev_data(void *co, uint64_t offset) "co %p offset
> 0x%" PRIx64
> qcow2_pwrite_zeroes_start_req(void *co, int64_t offset, int64_t bytes) "co
> %p offset 0x%" PRIx64 " bytes %" PRId64
> qcow2_pwrite_zeroes(void *co, int64_t offset, int64_t bytes) "co %p offset
> 0x%" PRIx64 " bytes %" PRId64
> qcow2_skip_cow(void *co, uint64_t offset, int nb_clusters) "co %p offset
> 0x%" PRIx64 " nb_clusters %d"
> +qcow2_wp_tracking(int index, uint64_t wp) "wps[%d]: 0x%" PRIx64
> +qcow2_imp_open_zones(uint8_t op, int nrz) "nr_imp_open_zones after op 0x%x:
> %d"
>
> # qcow2-cluster.c
> qcow2_alloc_clusters_offset(void *co, uint64_t offset, int bytes) "co %p
> offset 0x%" PRIx64 " bytes %d"
> diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
> index 9cb9f1fb2b..285db19ac7 100644
> --- a/hw/block/virtio-blk.c
> +++ b/hw/block/virtio-blk.c
Please move the virtio-blk changes into a separate patch. It doesn't
seem to be directly related to qcow2 support.
> @@ -288,6 +288,9 @@ static void virtio_blk_submit_multireq(VirtIOBlock *s,
> MultiReqBuffer *mrb)
> int i = 0, start = 0, num_reqs = 0, niov = 0, nb_sectors = 0;
> uint32_t max_transfer;
> int64_t sector_num = 0;
> + BlockDriverState *bs = blk_bs(s->blk);
> + bool zone_cross;
> + int64_t zone_sector, end_sector;
>
> if (mrb->num_reqs == 1) {
> submit_requests(s, mrb, 0, 1, -1);
> @@ -303,17 +306,34 @@ static void virtio_blk_submit_multireq(VirtIOBlock *s,
> MultiReqBuffer *mrb)
> for (i = 0; i < mrb->num_reqs; i++) {
> VirtIOBlockReq *req = mrb->reqs[i];
> if (num_reqs > 0) {
> + zone_cross = false;
> +
> + /*
> + * On zoned backends, a single backend write must not span a zone
> + * boundary. Bail out of merging if combining req into the
> current
> + * batch would straddle a zone.
> + */
> + if (bs && bs->bl.zone_size > 0) {
> + zone_sector = bs->bl.zone_size / BDRV_SECTOR_SIZE;
> + end_sector = req->sector_num
> + + req->qiov.size / BDRV_SECTOR_SIZE - 1;
> + zone_cross = (sector_num / zone_sector) !=
> + (end_sector / zone_sector);
> + }
> +
> /*
> * NOTE: We cannot merge the requests in below situations:
> * 1. requests are not sequential
> * 2. merge would exceed maximum number of IOVs
> * 3. merge would exceed maximum transfer length of backend
> device
> + * 4. merge would cross a zone boundary on a zoned backend
> */
> if (sector_num + nb_sectors != req->sector_num ||
> niov > blk_get_max_iov(s->blk) - req->qiov.niov ||
> req->qiov.size > max_transfer ||
> nb_sectors > (max_transfer -
> - req->qiov.size) / BDRV_SECTOR_SIZE) {
> + req->qiov.size) / BDRV_SECTOR_SIZE ||
> + zone_cross) {
> submit_requests(s, mrb, start, num_reqs, niov);
> num_reqs = 0;
> }
> diff --git a/include/qemu/queue.h b/include/qemu/queue.h
> index e029e7bf66..3f0a48740e 100644
> --- a/include/qemu/queue.h
> +++ b/include/qemu/queue.h
> @@ -179,6 +179,7 @@ struct {
> \
> #define QLIST_EMPTY(head) ((head)->lh_first == NULL)
> #define QLIST_FIRST(head) ((head)->lh_first)
> #define QLIST_NEXT(elm, field) ((elm)->field.le_next)
> +#define QLIST_LAST(head, field) (*(head)->lh_first->field.le_prev)
If QLIST_LAST() is supposed to fetch the last element in the list, then
it won't work:
#define QLIST_INSERT_HEAD(head, elm, field) do { \
if (((elm)->field.le_next = (head)->lh_first) != NULL) \
(head)->lh_first->field.le_prev = &(elm)->field.le_next;\
(head)->lh_first = (elm); \
(elm)->field.le_prev = &(head)->lh_first; \
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The first node's le_prev points to the head's lh_first field, not to the
last node.
>
>
> /*
> diff --git a/include/qemu/range.h b/include/qemu/range.h
> index d446ad885d..d39ba68407 100644
> --- a/include/qemu/range.h
> +++ b/include/qemu/range.h
> @@ -213,6 +213,10 @@ static inline int range_covers_byte(uint64_t offset,
> uint64_t len,
> static inline bool ranges_overlap(uint64_t first1, uint64_t len1,
> uint64_t first2, uint64_t len2)
> {
> + if (first1 + len1 == 0 || first2 + len2 == 0) {
> + return false;
> + }
What is this?
> +
> uint64_t last1 = range_get_last(first1, len1);
> uint64_t last2 = range_get_last(first2, len2);
>
> --
> 2.43.0
>
signature.asc
Description: PGP signature
