On Mon, Jun 07, 2021 at 09:32:53PM +0800, zhenwei pi wrote: > Add a new qemu block driver which uses libnvmf as userspace NVMe over > fabric initiator. > > Currently QEMU uses 4 NVMF IO-queues by a RR policy, test with a > linux kernel NVMF target, QEMU gets about 220K IOPS. > > Thanks to Famz for several suggestions. > > Signed-off-by: zhenwei pi <pizhen...@bytedance.com> > --- > block/meson.build | 1 + > block/nvmf.c | 425 ++++++++++++++++++++++++++++++++++++++++++++++ > configure | 8 +- > meson.build | 8 + > meson_options.txt | 2 + > 5 files changed, 443 insertions(+), 1 deletion(-) > create mode 100644 block/nvmf.c > > diff --git a/block/meson.build b/block/meson.build > index 01861e1545..adf4e753ae 100644 > --- a/block/meson.build > +++ b/block/meson.build > @@ -77,6 +77,7 @@ foreach m : [ > [libnfs, 'nfs', files('nfs.c')], > [libssh, 'ssh', files('ssh.c')], > [rbd, 'rbd', files('rbd.c')], > + [libnvmf, 'nvmf', files('nvmf.c')], > ] > if m[0].found() > module_ss = ss.source_set() > diff --git a/block/nvmf.c b/block/nvmf.c > new file mode 100644 > index 0000000000..d09ca41b58 > --- /dev/null > +++ b/block/nvmf.c > @@ -0,0 +1,425 @@ > +/* > + * NVMe over fabric block driver based on libnvmf > + * > + * Copyright 2020-2021 zhenwei pi > + * > + * Authors: > + * zhenwei pi <pizhen...@bytedance.com> > + * > + * This work is licensed under the terms of the GNU GPL, version 2 or later. > + * See the COPYING file in the top-level directory. > + */ > + > +#include "qemu/osdep.h" > + > +#include <poll.h>
Why is this system header required? > +#include "qemu/error-report.h" > +#include "block/block_int.h" > +#include "block/nvme.h" > +#include "qemu/iov.h" > +#include "qemu/option.h" > +#include "qemu/module.h" > +#include "qapi/error.h" > +#include "qapi/qmp/qdict.h" > +#include "qapi/qmp/qstring.h" > + > +#include "nvmf/nvmf.h" This should probably be <nvmf/nvmf.h>. > + > +#define DEF_IO_QUEUES 4 /* default IO queues 4 by RR policy > */ Why did you choose the round-robin approach instead of a single queue? Is the idea to allow multi-core NVMeoF targets to process I/O queues in parallel? Or is the goal to avoid TCP connections stalling? Or something else? > +#define NVMF_KATO 30000 /* default keepalive time 30s */ > +#define NVMF_URI "uri" > +#define PROTOCOL_TCP_PREF "nvmf-tcp://" > +#define PROTOCOL_RDMA_PREF "nvmf-rdma://" > + > +typedef struct NvmfQueue { > + int qid; > + CoQueue wait_queue; > + CoMutex wait_lock; > +} NvmfQueue; > + > +typedef struct NvmfHost { > + nvmf_options_t opts; > + nvmf_ctrl_t ctrl; > + AioContext *aio_context; > + QemuSpin lock; > + > + unsigned long requests; > + int nr_ioqueues; > + NvmfQueue *ioqueues; > +} NvmfHost; > + > +typedef struct NvmfReq { > + NvmfHost *host; > + nvmf_req_t req; > + Coroutine *co; > + int qid; > + int retval; > + bool done; > +} NvmfReq; > + > +typedef enum NvmfOp { > + NvmfOpRead, > + NvmfOpWrite, > + NvmfOpDiscard, > + NvmfOpWritezeroes > +} NvmfOp; > + > +static void nvmf_process(void *opaque) > +{ > + NvmfHost *host = opaque; > + > + nvmf_ctrl_process(host->ctrl); > +} > + > +static void nvmf_attach_aio_context(BlockDriverState *bs, > + AioContext *new_context) > +{ > + NvmfHost *host = bs->opaque; > + > + host->aio_context = new_context; > + > + qemu_spin_lock(&host->lock); > + aio_set_fd_handler(new_context, nvmf_ctrl_fd(host->ctrl), > + false, nvmf_process, NULL, NULL, host); > + qemu_spin_unlock(&host->lock); This lock is strange. aio_set_fd_handler() is thread-safe. Why is it necessary? > +} > + > +static QemuOptsList runtime_opts = { > + .name = "nvmf", > + .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), > + .desc = { > + { > + .name = NVMF_URI, > + .type = QEMU_OPT_STRING, > + .help = "NVMe over fabric URI", > + }, > + { /* end of list */ } > + }, > +}; > + > +static void nvmf_parse_filename(const char *filename, QDict *options, > + Error **errp) > +{ > + const char *uri; > + int len; > + > + len = strlen(PROTOCOL_TCP_PREF); > + if (strlen(filename) > len && !strncmp(filename, PROTOCOL_TCP_PREF, > len)) { > + uri = g_strdup(filename); > + qdict_put_str(options, NVMF_URI, uri); > + return; > + } > + > + len = strlen(PROTOCOL_RDMA_PREF); > + if (strlen(filename) > len && !strncmp(filename, PROTOCOL_RDMA_PREF, > len)) { > + uri = g_strdup(filename); > + qdict_put_str(options, NVMF_URI, uri); > + return; > + } > + > + error_setg(errp, "nvmf: invalid filename. Ex, nvmf-tcp/nvmf-rdma"); > +} > + > +static int nvmf_file_open(BlockDriverState *bs, QDict *options, int flags, > + Error **errp) > +{ > + NvmfHost *host = bs->opaque; > + QemuOpts *qopts; > + nvmf_ctrl_t ctrl; > + nvmf_options_t opts; > + const char *uri; > + NvmfQueue *queue; > + unsigned int nsid; > + int i; > + > + qopts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); > + qemu_opts_absorb_qdict(qopts, options, &error_abort); > + uri = qemu_opt_get(qopts, NVMF_URI); > + if (!uri) { > + error_setg(errp, "nvmf: missing URI"); > + goto free_qopts; > + } > + > + host->nr_ioqueues = DEF_IO_QUEUES; > + opts = nvmf_default_options(uri, NULL); > + if (!opts) { > + error_setg(errp, "nvmf: parse uri failed"); > + goto free_uri; > + } > + > + nvmf_options_set_io_queues(opts, host->nr_ioqueues); > + nvmf_options_set_kato(opts, NVMF_KATO); > + ctrl = nvmf_ctrl_create(opts); > + if (!ctrl) { > + error_setg(errp, "nvmf: create ctrl failed"); > + goto free_nvmf_opts; > + } > + > + if (nvmf_ns_count(ctrl) < 1) { > + error_setg(errp, "nvmf: no available namespace"); > + goto release_ctrl; > + } > + > + /* nsid should be specified by command line */ > + nsid = nvmf_ns_id(ctrl); > + if (!nvmf_ns_lbads(ctrl, nsid)) { > + error_setg(errp, "nvmf: invalid LBADS"); > + goto release_ctrl; > + } > + > + if (!nvmf_ns_nsze(ctrl, nsid)) { > + error_setg(errp, "nvmf: invalid size"); > + goto release_ctrl; > + } > + > + host->ioqueues = g_new0(NvmfQueue, host->nr_ioqueues + 1); > + memset(host->ioqueues, 0x00, sizeof(NvmfQueue) * (host->nr_ioqueues + > 1)); g_new0() already zeroes the memory so there is no need to do it again. > + for (i = 0; i < host->nr_ioqueues + 1; i++) { > + queue = host->ioqueues + i; > + queue->qid = i; > + qemu_co_mutex_init(&queue->wait_lock); > + qemu_co_queue_init(&queue->wait_queue); > + } > + > + host->ctrl = ctrl; > + host->opts = opts; > + qemu_spin_init(&host->lock); > + nvmf_attach_aio_context(bs, bdrv_get_aio_context(bs)); > + g_free((void *)uri); If uri is the return value of qemu_opt_get() then it must not be freed. > + > + return 0; > + > +release_ctrl: > + nvmf_ctrl_release(ctrl); > + > +free_nvmf_opts: > + nvmf_options_free(opts); > + > +free_uri: > + g_free((void *)uri); > + > +free_qopts: > + qemu_opts_del(qopts); > + > + return -EINVAL; > +} > + > +static void nvmf_close(BlockDriverState *bs) > +{ > + NvmfHost *host = bs->opaque; > + > + nvmf_ctrl_release(host->ctrl); > + nvmf_options_free(host->opts); > + g_free(host->ioqueues); > +} > + > +static inline bool nvmf_is_lba_aligned(nvmf_ctrl_t ctrl, size_t offset, > + size_t length) > +{ > + unsigned int nsid = nvmf_ns_id(ctrl); > + unsigned char lbads = nvmf_ns_lbads(ctrl, nsid); > + return !(offset & ((1 << lbads) - 1)) && !(length & ((1 << lbads) - 1)); > +} > + > +static inline void nvmf_req_co_init(NvmfHost *host, NvmfReq *req) > +{ > + req->host = host; > + req->req = req; > + req->co = qemu_coroutine_self(); > + req->done = false; > +} > + > +static void nvmf_req_cb(unsigned short status, void *opaque) > +{ > + NvmfReq *req = opaque; > + > + switch (status) { > + case NVME_SUCCESS: > + req->retval = 0; > + break; > + > + case NVME_INVALID_OPCODE: > + req->retval = -ENOSYS; > + break; > + > + case NVME_INVALID_FIELD: > + req->retval = -EINVAL; > + break; > + > + default: > + req->retval = -EIO; > + break; > + } > + > + req->done = true; > + aio_co_wake(req->co); > +} > + > +static coroutine_fn int nvmf_co_io(BlockDriverState *bs, uint64_t offset, > + uint64_t bytes, QEMUIOVector *qiov, > + int flags, NvmfOp op) > +{ > + NvmfHost *host = bs->opaque; > + NvmfQueue *queue; > + NvmfReq req; > + struct iovec iov = {.iov_base = NULL, .iov_len = bytes}; > + int qid; > + > + if (!nvmf_is_lba_aligned(host->ctrl, offset, bytes)) { > + return -EINVAL; > + } > + /* RR policy */ > + qid = (host->requests++ % host->nr_ioqueues) + 1; > + queue = host->ioqueues + qid; > + nvmf_req_co_init(host, &req); > + req.qid = qid; > + > +retry: > + switch (op) { > + case NvmfOpRead: > + req.req = nvmf_read_async(host->ctrl, qid, qiov->iov, qiov->niov, > + offset, 0, nvmf_req_cb, &req); > + break; > + case NvmfOpWrite: > + req.req = nvmf_write_async(host->ctrl, qid, qiov->iov, qiov->niov, > + offset, 0, nvmf_req_cb, &req); > + break; > + case NvmfOpDiscard: > + req.req = nvmf_discard_async(host->ctrl, qid, &iov, 1, offset, 0, > + nvmf_req_cb, &req); > + break; > + case NvmfOpWritezeroes: > + req.req = nvmf_writezeroes_async(host->ctrl, qid, &iov, 1, offset, 0, > + nvmf_req_cb, &req); > + break; > + default: > + return -EOPNOTSUPP; > + } > + > + if (!req.req) { > + /* test queue is full */ > + if (nvmf_queue_nr_inflight(host->ctrl, qid) == > + nvmf_queue_depth(host->ctrl, qid)) { > + qemu_co_mutex_lock(&queue->wait_lock); This works because all requests run in a single AioContext in QEMU. Emanuele Esposito and Paolo Bonzini have been sending multi-queue patches that convert block drivers to support multi-threading. Requests can be submitted from any thread without taking the AioContext lock. It will probably be necessary to move queue->wait_lock to also cover the if expression so there are no race conditions. I think you can leave the code as it is, but there's a chance it will need to be converted before it can be merged since the multi-queue block layer patches are making their way upstream at the same time. > + qemu_co_queue_wait(&queue->wait_queue, &queue->wait_lock); > + qemu_co_mutex_unlock(&queue->wait_lock); > + goto retry; > + } else { > + assert(0); > + } > + } > + > + while (!req.done) { > + qemu_coroutine_yield(); > + } > + > + nvmf_req_free(req.req); > + > + qemu_co_mutex_lock(&queue->wait_lock); > + if (!qemu_co_queue_empty(&queue->wait_queue)) { > + qemu_co_queue_restart_all(&queue->wait_queue); > + } > + qemu_co_mutex_unlock(&queue->wait_lock); > + > + return req.retval; > +} > + > +static coroutine_fn int nvmf_co_preadv(BlockDriverState *bs, uint64_t offset, > + uint64_t bytes, QEMUIOVector *qiov, > + int flags) > +{ > + return nvmf_co_io(bs, offset, bytes, qiov, flags, NvmfOpRead); > +} > + > +static coroutine_fn int nvmf_co_pwritev(BlockDriverState *bs, uint64_t > offset, > + uint64_t bytes, QEMUIOVector *qiov, > + int flags) > +{ > + return nvmf_co_io(bs, offset, bytes, qiov, flags, NvmfOpWrite); > +} > + > +static coroutine_fn int nvmf_co_pwrite_zeroes(BlockDriverState *bs, > + int64_t offset, int bytes, > + BdrvRequestFlags flags) > +{ > + return nvmf_co_io(bs, offset, bytes, NULL, 0, NvmfOpWritezeroes); > +} > + > +static coroutine_fn int nvmf_co_pdiscard(BlockDriverState *bs, int64_t > offset, > + int bytes) > +{ > + return nvmf_co_io(bs, offset, bytes, NULL, 0, NvmfOpDiscard); > +} > + > +static void nvmf_refresh_limits(BlockDriverState *bs, Error **errp) > +{ > + NvmfHost *host = bs->opaque; > + unsigned int nsid = nvmf_ns_id(host->ctrl); > + unsigned char lbads = nvmf_ns_lbads(host->ctrl, nsid); > + > + bs->bl.request_alignment = MAX(BDRV_SECTOR_SIZE, (1 << lbads)); > + bs->bl.max_transfer = nvmf_ctrl_mdts(host->ctrl); > + bs->bl.max_iov = nvmf_max_iov(host->ctrl); > +} > + > +static int64_t nvmf_getlength(BlockDriverState *bs) > +{ > + NvmfHost *host = bs->opaque; > + unsigned int nsid = nvmf_ns_id(host->ctrl); > + unsigned char lbads = nvmf_ns_lbads(host->ctrl, nsid); > + unsigned long nsze = nvmf_ns_nsze(host->ctrl, nsid); > + > + return (1 << lbads) * nsze; > +} > + > +static void nvmf_detach_aio_context(BlockDriverState *bs) > +{ > + NvmfHost *host = bs->opaque; > + > + qemu_spin_lock(&host->lock); > + aio_set_fd_handler(host->aio_context, nvmf_ctrl_fd(host->ctrl), > + false, NULL, NULL, NULL, NULL); > + qemu_spin_unlock(&host->lock); > +} > + > +static BlockDriver bdrv_nvmf_tcp = { > + .format_name = "nvmf-tcp", > + .protocol_name = "nvmf-tcp", > + .instance_size = sizeof(NvmfHost), > + .bdrv_parse_filename = nvmf_parse_filename, > + .bdrv_file_open = nvmf_file_open, > + .bdrv_close = nvmf_close, > + .bdrv_getlength = nvmf_getlength, > + .bdrv_refresh_limits = nvmf_refresh_limits, > + .bdrv_co_pdiscard = nvmf_co_pdiscard, > + .bdrv_co_preadv = nvmf_co_preadv, > + .bdrv_co_pwritev = nvmf_co_pwritev, > + .bdrv_co_pwrite_zeroes = nvmf_co_pwrite_zeroes, > + .bdrv_detach_aio_context = nvmf_detach_aio_context, > + .bdrv_attach_aio_context = nvmf_attach_aio_context, > +}; > + > +static BlockDriver bdrv_nvmf_rdma = { > + .format_name = "nvmf-rdma", > + .protocol_name = "nvmf-rdma", > + .instance_size = sizeof(NvmfHost), > + .bdrv_parse_filename = nvmf_parse_filename, > + .bdrv_file_open = nvmf_file_open, > + .bdrv_close = nvmf_close, > + .bdrv_getlength = nvmf_getlength, > + .bdrv_refresh_limits = nvmf_refresh_limits, > + .bdrv_co_pdiscard = nvmf_co_pdiscard, > + .bdrv_co_preadv = nvmf_co_preadv, > + .bdrv_co_pwritev = nvmf_co_pwritev, > + .bdrv_co_pwrite_zeroes = nvmf_co_pwrite_zeroes, > + .bdrv_detach_aio_context = nvmf_detach_aio_context, > + .bdrv_attach_aio_context = nvmf_attach_aio_context, > +}; > + > +static void nvmf_block_init(void) > +{ > + bdrv_register(&bdrv_nvmf_tcp); > + bdrv_register(&bdrv_nvmf_rdma); > +} > + > +block_init(nvmf_block_init); > diff --git a/configure b/configure > index 8dcb9965b2..608b108062 100755 > --- a/configure > +++ b/configure > @@ -393,6 +393,7 @@ vss_win32_sdk="$default_feature" > win_sdk="no" > want_tools="$default_feature" > libiscsi="auto" > +libnvmf="auto" > libnfs="auto" > coroutine="" > coroutine_pool="$default_feature" > @@ -1127,6 +1128,10 @@ for opt do > ;; > --enable-libiscsi) libiscsi="enabled" > ;; > + --disable-libnvmf) libnvmf="disabled" > + ;; > + --enable-libnvmf) libnvmf="enabled" > + ;; > --disable-libnfs) libnfs="disabled" > ;; > --enable-libnfs) libnfs="enabled" > @@ -1889,6 +1894,7 @@ disabled with --disable-FEATURE, default is enabled if > available > spice-protocol spice-protocol > rbd rados block device (rbd) > libiscsi iscsi support > + libnvmf NVMe over Fabric support > libnfs nfs support > smartcard smartcard support (libcacard) > u2f U2F support (u2f-emu) > @@ -6448,7 +6454,7 @@ if test "$skip_meson" = no; then > -Dvhost_user_blk_server=$vhost_user_blk_server > -Dmultiprocess=$multiprocess \ > -Dfuse=$fuse -Dfuse_lseek=$fuse_lseek > -Dguest_agent_msi=$guest_agent_msi -Dbpf=$bpf\ > $(if test "$default_features" = no; then echo > "-Dauto_features=disabled"; fi) \ > - -Dtcg_interpreter=$tcg_interpreter \ > + -Dtcg_interpreter=$tcg_interpreter -Dlibnvmf=$libnvmf\ > $cross_arg \ > "$PWD" "$source_path" > > diff --git a/meson.build b/meson.build > index 626cf932c1..ccdc36266a 100644 > --- a/meson.build > +++ b/meson.build > @@ -478,6 +478,12 @@ if not get_option('libiscsi').auto() or have_block > required: get_option('libiscsi'), > method: 'pkg-config', kwargs: static_kwargs) > endif > +libnvmf = not_found > +if not get_option('libnvmf').auto() or have_block > + libnvmf = dependency('libnvmf', version: '>=0.1', > + required: get_option('libnvmf'), > + method: 'pkg-config', kwargs: static_kwargs) > +endif > zstd = not_found > if not get_option('zstd').auto() or have_block > zstd = dependency('libzstd', version: '>=1.4.0', > @@ -1150,6 +1156,7 @@ config_host_data.set('CONFIG_LIBATTR', have_old_libattr) > config_host_data.set('CONFIG_LIBCAP_NG', libcap_ng.found()) > config_host_data.set('CONFIG_EBPF', libbpf.found()) > config_host_data.set('CONFIG_LIBISCSI', libiscsi.found()) > +config_host_data.set('CONFIG_LIBNVMF', libnvmf.found()) > config_host_data.set('CONFIG_LIBNFS', libnfs.found()) > config_host_data.set('CONFIG_RBD', rbd.found()) > config_host_data.set('CONFIG_SDL', sdl.found()) > @@ -2736,6 +2743,7 @@ summary_info += {'usb net redir': > config_host.has_key('CONFIG_USB_REDIR')} > summary_info += {'OpenGL support': config_host.has_key('CONFIG_OPENGL')} > summary_info += {'GBM': config_host.has_key('CONFIG_GBM')} > summary_info += {'libiscsi support': libiscsi.found()} > +summary_info += {'libnvmf support': libnvmf.found()} > summary_info += {'libnfs support': libnfs.found()} > if targetos == 'windows' > if config_host.has_key('CONFIG_GUEST_AGENT') > diff --git a/meson_options.txt b/meson_options.txt > index 3d304cac96..102705824f 100644 > --- a/meson_options.txt > +++ b/meson_options.txt > @@ -68,6 +68,8 @@ option('glusterfs', type : 'feature', value : 'auto', > description: 'Glusterfs block device driver') > option('libiscsi', type : 'feature', value : 'auto', > description: 'libiscsi userspace initiator') > +option('libnvmf', type : 'feature', value : 'auto', > + description: 'NVMe over Fabric userspace initiator') > option('libnfs', type : 'feature', value : 'auto', > description: 'libnfs block device driver') > option('mpath', type : 'feature', value : 'auto', > -- > 2.25.1 >
signature.asc
Description: PGP signature