On Sun, 2022-10-16 at 23:05 +0800, Sam Li wrote: > This patch extends virtio-blk emulation to handle zoned device commands > by calling the new block layer APIs to perform zoned device I/O on > behalf of the guest. It supports Report Zone, four zone oparations (open, > close, finish, reset), and Append Zone. > > The VIRTIO_BLK_F_ZONED feature bit will only be set if the host does > support zoned block devices. Regular block devices(conventional zones) > will not be set. > > Then the guest os can use blkzone(8) to test those commands on zoned devices. > Furthermore, using zonefs to test zone append write is also supported. > > Signed-off-by: Sam Li <faithilike...@gmail.com> > --- > hw/block/virtio-blk-common.c | 2 + > hw/block/virtio-blk.c | 412 ++++++++++++++++++++++++++++++++- > include/hw/virtio/virtio-blk.h | 11 +- > 3 files changed, 422 insertions(+), 3 deletions(-) > > diff --git a/hw/block/virtio-blk-common.c b/hw/block/virtio-blk-common.c > index ac52d7c176..e2f8e2f6da 100644 > --- a/hw/block/virtio-blk-common.c > +++ b/hw/block/virtio-blk-common.c > @@ -29,6 +29,8 @@ static const VirtIOFeature feature_sizes[] = { > .end = endof(struct virtio_blk_config, discard_sector_alignment)}, > {.flags = 1ULL << VIRTIO_BLK_F_WRITE_ZEROES, > .end = endof(struct virtio_blk_config, write_zeroes_may_unmap)}, > + {.flags = 1ULL << VIRTIO_BLK_F_ZONED, > + .end = endof(struct virtio_blk_config, zoned)}, > {} > }; > > diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c > index 8131ec2dbc..58891aea31 100644 > --- a/hw/block/virtio-blk.c > +++ b/hw/block/virtio-blk.c > @@ -26,6 +26,9 @@ > #include "hw/virtio/virtio-blk.h" > #include "dataplane/virtio-blk.h" > #include "scsi/constants.h" > +#if defined(CONFIG_BLKZONED) > +#include <linux/blkzoned.h> > +#endif > #ifdef __linux__ > # include <scsi/sg.h> > #endif > @@ -55,10 +58,29 @@ static void virtio_blk_req_complete(VirtIOBlockReq *req, > unsigned char status) > { > VirtIOBlock *s = req->dev; > VirtIODevice *vdev = 
VIRTIO_DEVICE(s); > + int64_t inhdr_len, n; > + void *buf; > > trace_virtio_blk_req_complete(vdev, req, status); > > - stb_p(&req->in->status, status); > + iov_discard_undo(&req->inhdr_undo); > + if (virtio_ldl_p(vdev, &req->out.type) == VIRTIO_BLK_T_ZONE_APPEND) { > + inhdr_len = sizeof(struct virtio_blk_zone_append_inhdr); > + req->in.in_hdr->status = status; > + buf = req->in.in_hdr; > + } else { > + inhdr_len = sizeof(struct virtio_blk_inhdr); > + req->in.zone_append_inhdr->status = status; > + buf = req->in.zone_append_inhdr; > + } > + > + n = iov_from_buf(req->elem.in_sg, req->elem.in_num, > + req->in_len - inhdr_len, buf, inhdr_len); > + if (n != inhdr_len) { > + virtio_error(vdev, "Driver provided input buffer less than size of " > + "in header"); > + } > + > iov_discard_undo(&req->inhdr_undo); > iov_discard_undo(&req->outhdr_undo); > virtqueue_push(req->vq, &req->elem, req->in_len); > @@ -592,6 +614,334 @@ err: > return err_status; > } > > +typedef struct ZoneCmdData { > + VirtIOBlockReq *req; > + union { > + struct { > + unsigned int nr_zones; > + BlockZoneDescriptor *zones; > + } zone_report_data; > + struct { > + int64_t offset; > + } zone_append_data; > + }; > +} ZoneCmdData; > + > +/* > + * check zoned_request: error checking before issuing requests. If all checks > + * passed, return true. > + * append: true if only zone append requests issued. 
> + */ > +static bool check_zoned_request(VirtIOBlock *s, int64_t offset, int64_t len, > + bool append, uint8_t *status) { > + BlockDriverState *bs = blk_bs(s->blk); > + int index = offset / bs->bl.zone_size; > + > + if (offset < 0 || len < 0 || offset > bs->bl.capacity - len) { > + *status = VIRTIO_BLK_S_ZONE_INVALID_CMD; > + return false; > + } > + > + if (!virtio_has_feature(s->host_features, VIRTIO_BLK_F_ZONED)) { > + *status = VIRTIO_BLK_S_UNSUPP; > + return false; > + } > + > + if (append) { > + if ((offset % bs->bl.write_granularity) != 0) { > + *status = VIRTIO_BLK_S_ZONE_UNALIGNED_WP; > + return false; > + } > + > + if (BDRV_ZT_IS_CONV(bs->bl.wps->wp[index])) { > + *status = VIRTIO_BLK_S_ZONE_INVALID_CMD; > + return false; > + } > + > + if (len / 512 > bs->bl.max_append_sectors) { > + if (bs->bl.max_append_sectors == 0) { > + *status = VIRTIO_BLK_S_UNSUPP; > + } else { > + *status = VIRTIO_BLK_S_ZONE_INVALID_CMD; > + } > + return false; > + } > + } > + return true; > +} > + > +static void virtio_blk_zone_report_complete(void *opaque, int ret) > +{ > + ZoneCmdData *data = opaque; > + VirtIOBlockReq *req = data->req; > + VirtIOBlock *s = req->dev; > + VirtIODevice *vdev = VIRTIO_DEVICE(req->dev); > + struct iovec *in_iov = req->elem.in_sg; > + unsigned in_num = req->elem.in_num; > + int64_t zrp_size, nz, n, j = 0; > + int8_t err_status = VIRTIO_BLK_S_OK; > + > + if (ret) { > + err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD; > + goto out; > + } > + > + nz = data->zone_report_data.nr_zones; > + struct virtio_blk_zone_report zrp_hdr = (struct virtio_blk_zone_report) { > + .nr_zones = cpu_to_le64(nz), > + }; > + > + zrp_size = sizeof(struct virtio_blk_zone_report) > + + sizeof(struct virtio_blk_zone_descriptor) * nz; > + n = iov_from_buf(in_iov, in_num, 0, &zrp_hdr, sizeof(zrp_hdr)); > + if (n != sizeof(zrp_hdr)) { > + virtio_error(vdev, "Driver provided intput buffer that is too > small!");
s/intput/input > + err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD; > + goto out; > + } > + > + for (size_t i = sizeof(zrp_hdr); i < zrp_size; > + i += sizeof(struct virtio_blk_zone_descriptor), ++j) { > + struct virtio_blk_zone_descriptor desc = > + (struct virtio_blk_zone_descriptor) { > + .z_start = cpu_to_le64(data->zone_report_data.zones[j].start) > + >> BDRV_SECTOR_BITS, > + .z_cap = cpu_to_le64(data->zone_report_data.zones[j].cap) > + >> BDRV_SECTOR_BITS, > + .z_wp = cpu_to_le64(data->zone_report_data.zones[j].wp) > + >> BDRV_SECTOR_BITS, I think these should be + .z_start = cpu_to_le64(data->zone_report_data.zones[j].start + >> BDRV_SECTOR_BITS), + .z_cap = cpu_to_le64(data->zone_report_data.zones[j].cap + >> BDRV_SECTOR_BITS), + .z_wp = cpu_to_le64(data->zone_report_data.zones[j].wp + >> BDRV_SECTOR_BITS), > + }; > + > + switch (data->zone_report_data.zones[j].type) { > + case BLK_ZT_CONV: > + desc.z_type = BLK_ZONE_TYPE_CONVENTIONAL; > + break; > + case BLK_ZT_SWR: > + desc.z_type = BLK_ZONE_TYPE_SEQWRITE_REQ; > + break; > + case BLK_ZT_SWP: > + desc.z_type = BLK_ZONE_TYPE_SEQWRITE_PREF; > + break; > + default: > + g_assert_not_reached(); > + } Why do we need to convert these to system-defined values? The output of this function will be sent to the driver, which uses the values defined in the virtio-zbd spec. 
And the conversion to BLK_ZS_xxx/BLK_ZT_xxx has already been done in the parse_zone() function in file-posix.c > + > + switch (data->zone_report_data.zones[j].cond) { > + case BLK_ZS_RDONLY: > + desc.z_state = BLK_ZONE_COND_READONLY; > + break; > + case BLK_ZS_OFFLINE: > + desc.z_state = BLK_ZONE_COND_OFFLINE; > + break; > + case BLK_ZS_EMPTY: > + desc.z_state = BLK_ZONE_COND_EMPTY; > + break; > + case BLK_ZS_CLOSED: > + desc.z_state = BLK_ZONE_COND_CLOSED; > + break; > + case BLK_ZS_FULL: > + desc.z_state = BLK_ZONE_COND_FULL; > + break; > + case BLK_ZS_EOPEN: > + desc.z_state = BLK_ZONE_COND_EXP_OPEN; > + break; > + case BLK_ZS_IOPEN: > + desc.z_state = BLK_ZONE_COND_IMP_OPEN; > + break; > + case BLK_ZS_NOT_WP: > + desc.z_state = BLK_ZONE_COND_NOT_WP; > + break; > + default: > + g_assert_not_reached(); > + } Same here. I think the code to form the descriptor can be reduced to + struct virtio_blk_zone_descriptor desc = (struct virtio_blk_zone_descriptor) { + .z_start = cpu_to_le64(data->zone_report_data.zones[j].start >> BDRV_SECTOR_BITS), + .z_cap = cpu_to_le64(data->zone_report_data.zones[j].cap >> BDRV_SECTOR_BITS), + .z_wp = cpu_to_le64(data->zone_report_data.zones[j].wp >> BDRV_SECTOR_BITS), + .z_type = data->zone_report_data.zones[j].type, + .z_state = data->zone_report_data.zones[j].state, + }; > + > + /* TODO: it takes O(n^2) time complexity. Optimizations required. */ > + n = iov_from_buf(in_iov, in_num, i, &desc, sizeof(desc)); > + if (n != sizeof(desc)) { > + virtio_error(vdev, "Driver provided input buffer " > + "for descriptors that is too small!"); > + err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD; > + goto out; > + } > + } > + goto out; This goto is not needed. 
> + > +out: > + aio_context_acquire(blk_get_aio_context(s->conf.conf.blk)); > + virtio_blk_req_complete(req, err_status); > + virtio_blk_free_request(req); > + aio_context_release(blk_get_aio_context(s->conf.conf.blk)); > + g_free(data->zone_report_data.zones); > + g_free(data); > +} > + > +static int virtio_blk_handle_zone_report(VirtIOBlockReq *req) > +{ > + VirtIOBlock *s = req->dev; > + VirtIODevice *vdev = VIRTIO_DEVICE(s); > + unsigned int nr_zones; > + ZoneCmdData *data; > + int64_t zone_size, offset; > + uint8_t err_status; > + > + if (req->in_len < sizeof(struct virtio_blk_inhdr) + > + sizeof(struct virtio_blk_zone_report) + > + sizeof(struct virtio_blk_zone_descriptor)) { > + virtio_error(vdev, "in buffer too small for zone report"); > + return -1; > + } > + > + /* start byte offset of the zone report */ > + offset = virtio_ldq_p(vdev, &req->out.sector) * 512; > + if (!check_zoned_request(s, offset, 0, false, &err_status)) { > + goto out; > + } > + > + nr_zones = (req->in_len - sizeof(struct virtio_blk_inhdr) - > + sizeof(struct virtio_blk_zone_report)) / > + sizeof(struct virtio_blk_zone_descriptor); > + > + zone_size = sizeof(BlockZoneDescriptor) * nr_zones; > + data = g_malloc(sizeof(ZoneCmdData)); > + data->req = req; > + data->zone_report_data.nr_zones = nr_zones; > + data->zone_report_data.zones = g_malloc(zone_size), > + > + blk_aio_zone_report(s->blk, offset, &data->zone_report_data.nr_zones, > + data->zone_report_data.zones, > + virtio_blk_zone_report_complete, data); > + return 0; > + > +out: > + aio_context_acquire(blk_get_aio_context(s->conf.conf.blk)); > + virtio_blk_req_complete(req, err_status); > + virtio_blk_free_request(req); > + aio_context_release(blk_get_aio_context(s->conf.conf.blk)); > + return err_status; > +} > + > +static void virtio_blk_zone_mgmt_complete(void *opaque, int ret) > +{ > + ZoneCmdData *data = opaque; > + VirtIOBlockReq *req = data->req; > + VirtIOBlock *s = req->dev; > + int8_t err_status = VIRTIO_BLK_S_OK; > + > + 
if (ret) { > + err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD; > + goto out; > + } > + goto out; > + > +out: > + aio_context_acquire(blk_get_aio_context(s->conf.conf.blk)); > + virtio_blk_req_complete(req, err_status); > + virtio_blk_free_request(req); > + aio_context_release(blk_get_aio_context(s->conf.conf.blk)); > + g_free(data); > +} > + > +static int virtio_blk_handle_zone_mgmt(VirtIOBlockReq *req, BlockZoneOp op) > +{ > + VirtIOBlock *s = req->dev; > + VirtIODevice *vdev = VIRTIO_DEVICE(s); > + BlockDriverState *bs = blk_bs(s->blk); > + int64_t offset = virtio_ldq_p(vdev, &req->out.sector) * 512; > + uint64_t len; > + uint8_t err_status = VIRTIO_BLK_S_OK; > + > + uint32_t type = virtio_ldl_p(vdev, &req->out.type); > + if (type == VIRTIO_BLK_T_ZONE_RESET_ALL) { > + /* Entire drive capacity */ > + offset = 0; > + len = bs->bl.capacity; > + } else { > + if (bs->bl.zone_size > bs->bl.capacity - offset) { > + /* The zoned device allows the last smaller zone. */ > + len = bs->bl.capacity - bs->bl.zone_size * (bs->bl.nr_zones - 1); > + } else { > + len = bs->bl.zone_size; > + } > + } > + > + if (!check_zoned_request(s, offset, 0, false, &err_status)) { > + goto out; > + } > + > + ZoneCmdData *data = g_malloc(sizeof(ZoneCmdData)); > + data->req = req; > + blk_aio_zone_mgmt(s->blk, op, offset, len, > + virtio_blk_zone_mgmt_complete, data); > + > + return 0; > +out: > + aio_context_acquire(blk_get_aio_context(s->conf.conf.blk)); > + virtio_blk_req_complete(req, err_status); > + virtio_blk_free_request(req); > + aio_context_release(blk_get_aio_context(s->conf.conf.blk)); > + return err_status; > +} > + > +static void virtio_blk_zone_append_complete(void *opaque, int ret) > +{ > + ZoneCmdData *data = opaque; > + VirtIOBlockReq *req = data->req; > + VirtIOBlock *s = req->dev; > + struct iovec *in_iov = req->elem.in_sg; > + unsigned in_num = req->elem.in_num; > + uint8_t err_status = VIRTIO_BLK_S_OK; > + > + if (ret) { > + err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD; > + goto 
out; > + } > + > + struct virtio_blk_zone_append_inhdr zap_hdr = > + (struct virtio_blk_zone_append_inhdr) { > + .append_sector = cpu_to_le64(data->zone_append_data.offset > + >> BDRV_SECTOR_BITS), > + }; > + iov_from_buf(in_iov, in_num, 0, &zap_hdr, sizeof(zap_hdr)); > + goto out; > + > +out: > + aio_context_acquire(blk_get_aio_context(s->conf.conf.blk)); > + virtio_blk_req_complete(req, err_status); > + virtio_blk_free_request(req); > + aio_context_release(blk_get_aio_context(s->conf.conf.blk)); > + g_free(data); > +} > + > +static int virtio_blk_handle_zone_append(VirtIOBlockReq *req, > + struct iovec *out_iov, > + uint64_t niov) { > + VirtIOBlock *s = req->dev; > + VirtIODevice *vdev = VIRTIO_DEVICE(s); > + uint8_t err_status = VIRTIO_BLK_S_OK; > + > + int64_t offset = virtio_ldq_p(vdev, &req->out.sector) * 512; > + int64_t len = iov_size(out_iov, niov); > + > + if (!check_zoned_request(s, offset, len, true, &err_status)) { > + goto out; > + } > + > + ZoneCmdData *data = g_malloc(sizeof(ZoneCmdData)); > + data->req = req; > + data->zone_append_data.offset = offset; > + qemu_iovec_init_external(&req->qiov, out_iov, niov); > + blk_aio_zone_append(s->blk, &data->zone_append_data.offset, &req->qiov, > 0, > + virtio_blk_zone_append_complete, data); > + return 0; > + > +out: > + aio_context_acquire(blk_get_aio_context(s->conf.conf.blk)); > + virtio_blk_req_complete(req, err_status); > + virtio_blk_free_request(req); > + aio_context_release(blk_get_aio_context(s->conf.conf.blk)); > + return err_status; > +} > + > static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer > *mrb) > { > uint32_t type; > @@ -624,7 +974,7 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, > MultiReqBuffer *mrb) > > /* We always touch the last byte, so just see how big in_iov is. 
*/ > req->in_len = iov_size(in_iov, in_num); > - req->in = (void *)in_iov[in_num - 1].iov_base > + req->in.in_hdr = (void *)in_iov[in_num - 1].iov_base > + in_iov[in_num - 1].iov_len > - sizeof(struct virtio_blk_inhdr); > iov_discard_back_undoable(in_iov, &in_num, sizeof(struct > virtio_blk_inhdr), > @@ -678,6 +1028,24 @@ static int virtio_blk_handle_request(VirtIOBlockReq > *req, > MultiReqBuffer *mrb) > case VIRTIO_BLK_T_FLUSH: > virtio_blk_handle_flush(req, mrb); > break; > + case VIRTIO_BLK_T_ZONE_REPORT: > + virtio_blk_handle_zone_report(req); > + break; > + case VIRTIO_BLK_T_ZONE_OPEN: > + virtio_blk_handle_zone_mgmt(req, BLK_ZO_OPEN); > + break; > + case VIRTIO_BLK_T_ZONE_CLOSE: > + virtio_blk_handle_zone_mgmt(req, BLK_ZO_CLOSE); > + break; > + case VIRTIO_BLK_T_ZONE_FINISH: > + virtio_blk_handle_zone_mgmt(req, BLK_ZO_FINISH); > + break; > + case VIRTIO_BLK_T_ZONE_RESET: > + virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET); > + break; > + case VIRTIO_BLK_T_ZONE_RESET_ALL: > + virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET); > + break; > case VIRTIO_BLK_T_SCSI_CMD: > virtio_blk_handle_scsi(req); > break; > @@ -696,6 +1064,13 @@ static int virtio_blk_handle_request(VirtIOBlockReq > *req, > MultiReqBuffer *mrb) > virtio_blk_free_request(req); > break; > } > + case VIRTIO_BLK_T_ZONE_APPEND & ~VIRTIO_BLK_T_OUT: > + /* > + * It is not safe to access req->elem.out_sg directly because it > + * may be modified by virtio_blk_handle_request(). > + */ > + virtio_blk_handle_zone_append(req, out_iov, out_num); > + break; > /* > * VIRTIO_BLK_T_DISCARD and VIRTIO_BLK_T_WRITE_ZEROES are defined with > * VIRTIO_BLK_T_OUT flag set. 
We masked this flag in the switch > statement, > @@ -895,6 +1270,7 @@ static void virtio_blk_update_config(VirtIODevice *vdev, > uint8_t *config) > { > VirtIOBlock *s = VIRTIO_BLK(vdev); > BlockConf *conf = &s->conf.conf; > + BlockDriverState *bs = blk_bs(s->blk); > struct virtio_blk_config blkcfg; > uint64_t capacity; > int64_t length; > @@ -954,6 +1330,30 @@ static void virtio_blk_update_config(VirtIODevice *vdev, > uint8_t *config) > blkcfg.write_zeroes_may_unmap = 1; > virtio_stl_p(vdev, &blkcfg.max_write_zeroes_seg, 1); > } > + if (bs->bl.zoned != BLK_Z_NONE) { > + switch (bs->bl.zoned) { > + case BLK_Z_HM: > + blkcfg.zoned.model = VIRTIO_BLK_Z_HM; > + break; > + case BLK_Z_HA: > + blkcfg.zoned.model = VIRTIO_BLK_Z_HA; > + break; > + default: > + g_assert_not_reached(); > + } > + > + virtio_stl_p(vdev, &blkcfg.zoned.zone_sectors, > + bs->bl.zone_size / 512); > + virtio_stl_p(vdev, &blkcfg.zoned.max_active_zones, > + bs->bl.max_active_zones); > + virtio_stl_p(vdev, &blkcfg.zoned.max_open_zones, > + bs->bl.max_open_zones); > + virtio_stl_p(vdev, &blkcfg.zoned.write_granularity, blk_size); > + virtio_stl_p(vdev, &blkcfg.zoned.max_append_sectors, > + bs->bl.max_append_sectors); > + } else { > + blkcfg.zoned.model = VIRTIO_BLK_Z_NONE; > + } > memcpy(config, &blkcfg, s->config_size); > } > > @@ -1118,6 +1518,7 @@ static void virtio_blk_device_realize(DeviceState *dev, > Error **errp) > VirtIODevice *vdev = VIRTIO_DEVICE(dev); > VirtIOBlock *s = VIRTIO_BLK(dev); > VirtIOBlkConf *conf = &s->conf; > + BlockDriverState *bs = blk_bs(conf->conf.blk); > Error *err = NULL; > unsigned i; > > @@ -1163,6 +1564,13 @@ static void virtio_blk_device_realize(DeviceState *dev, > Error **errp) > return; > } > > + if (bs->bl.zoned != BLK_Z_NONE) { > + virtio_add_feature(&s->host_features, VIRTIO_BLK_F_ZONED); > + if (bs->bl.zoned == BLK_Z_HM) { > + virtio_clear_feature(&s->host_features, VIRTIO_BLK_F_DISCARD); > + } > + } > + > if (virtio_has_feature(s->host_features, 
VIRTIO_BLK_F_DISCARD) && > (!conf->max_discard_sectors || > conf->max_discard_sectors > BDRV_REQUEST_MAX_SECTORS)) { > diff --git a/include/hw/virtio/virtio-blk.h b/include/hw/virtio/virtio-blk.h > index d311c57cca..88901a9f55 100644 > --- a/include/hw/virtio/virtio-blk.h > +++ b/include/hw/virtio/virtio-blk.h > @@ -30,6 +30,12 @@ struct virtio_blk_inhdr > unsigned char status; > }; > > +struct virtio_blk_zone_append_inhdr { > + unsigned char status; > + uint8_t reserved[7]; > + int64_t append_sector; > +}; > + > #define VIRTIO_BLK_AUTO_NUM_QUEUES UINT16_MAX > > struct VirtIOBlkConf > @@ -73,7 +79,10 @@ typedef struct VirtIOBlockReq { > VirtQueue *vq; > IOVDiscardUndo inhdr_undo; > IOVDiscardUndo outhdr_undo; > - struct virtio_blk_inhdr *in; > + union { > + struct virtio_blk_inhdr *in_hdr; > + struct virtio_blk_zone_append_inhdr *zone_append_inhdr; > + } in; > struct virtio_blk_outhdr out; > QEMUIOVector qiov; > size_t in_len;