Re: [Qemu-devel] [PATCH v9 07/17] blockdev: adds bdrv_parse_aio to use io_uring
On Wed, 7 Aug, 2019, 17:15 Julia Suvorova, wrote: > On Fri, Aug 2, 2019 at 1:41 AM Aarushi Mehta > wrote: > > +int bdrv_parse_aio(const char *mode, int *flags) > > +{ > > +if (!strcmp(mode, "threads")) { > > +/* do nothing, default */ > > +} else if (!strcmp(mode, "native")) { > > +*flags |= BDRV_O_NATIVE_AIO; > > This 'if' should be covered with CONFIG_LINUX_AIO. > The aio=native definition is shared with Windows hosts' native aio and will break if it was covered. file-posix handles the case. Best regards, Julia Suvorova. > > > +#ifdef CONFIG_LINUX_IO_URING > > +} else if (!strcmp(mode, "io_uring")) { > > +*flags |= BDRV_O_IO_URING; > > +#endif > > +} else { > > +return -1; > > +} > > + > > +return 0; > > +} >
[Qemu-devel] [PATCH v9 17/17] block/io_uring: enable kernel submission polling
Signed-off-by: Aarushi Mehta --- block/io_uring.c | 17 - 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/block/io_uring.c b/block/io_uring.c index 1553cd2e58..2a1d79704a 100644 --- a/block/io_uring.c +++ b/block/io_uring.c @@ -288,6 +288,17 @@ static int ioq_submit(LuringState *s) *sqes = luringcb->sqeq; QSIMPLEQ_REMOVE_HEAD(&s->io_q.submit_queue, next); } +/* + * io_uring_submit() returns sqes in ring for kernel side + * submission polling and sets wakeup flag if needed. + * + * It is not possible for any sqes to have already been + * submitted by the sq_poll as the writes are only made visible + * to the kernel in this function. + * + * For normal I/O, it returns the actual submitted requests + * from io_uring_enter() + */ ret = io_uring_submit(&s->ring); trace_luring_io_uring_submit(s, ret); /* Prevent infinite loop if submission is refused */ @@ -525,7 +536,11 @@ LuringState *luring_init(Error **errp) s = g_new0(LuringState, 1); trace_luring_init_state(s, sizeof(*s)); struct io_uring *ring = &s->ring; -rc = io_uring_queue_init(MAX_EVENTS, ring, 0); + +rc = io_uring_queue_init(MAX_EVENTS, ring, IORING_SETUP_SQPOLL); +if (rc == -EOPNOTSUPP) { +rc = io_uring_queue_init(MAX_EVENTS, ring, 0); +} if (rc < 0) { error_setg_errno(errp, errno, "failed to init linux io_uring ring"); g_free(s); -- 2.21.0
[Qemu-devel] [PATCH v9 14/17] tests/qemu-iotests: enable testing with aio options
Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi --- tests/qemu-iotests/check | 15 ++- tests/qemu-iotests/common.rc | 14 ++ tests/qemu-iotests/iotests.py | 9 - 3 files changed, 36 insertions(+), 2 deletions(-) diff --git a/tests/qemu-iotests/check b/tests/qemu-iotests/check index c24874ff4a..1e398923fd 100755 --- a/tests/qemu-iotests/check +++ b/tests/qemu-iotests/check @@ -132,6 +132,7 @@ sortme=false expunge=true have_test_arg=false cachemode=false +aiomode=false tmp="${TEST_DIR}"/$$ rm -f $tmp.list $tmp.tmp $tmp.sed @@ -141,6 +142,7 @@ export IMGFMT_GENERIC=true export IMGPROTO=file export IMGOPTS="" export CACHEMODE="writeback" +export AIOMODE="threads" export QEMU_IO_OPTIONS="" export QEMU_IO_OPTIONS_NO_FMT="" export CACHEMODE_IS_DEFAULT=true @@ -225,6 +227,11 @@ s/ .*//p CACHEMODE_IS_DEFAULT=false cachemode=false continue +elif $aiomode +then +AIOMODE="$r" +aiomode=false +continue fi xpand=true @@ -269,6 +276,7 @@ other options -n show me, do not run tests -o options -o options to pass to qemu-img create/convert -c mode cache mode +-i mode AIO mode -makecheck pretty print output for make check testlist options @@ -433,10 +441,13 @@ testlist options cachemode=true xpand=false ;; +-i) +aiomode=true +xpand=false +;; -T)# deprecated timestamp option xpand=false ;; - -v) verbose=true xpand=false @@ -515,6 +526,8 @@ done # Set qemu-io cache mode with $CACHEMODE we have QEMU_IO_OPTIONS="$QEMU_IO_OPTIONS --cache $CACHEMODE" +# Set qemu-io aio mode with $AIOMODE we have +QEMU_IO_OPTIONS="$QEMU_IO_OPTIONS --aio $AIOMODE" QEMU_IO_OPTIONS_NO_FMT="$QEMU_IO_OPTIONS" if [ "$IMGOPTSSYNTAX" != "true" ]; then diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc index 5502c3da2f..03f4a1cd7f 100644 --- a/tests/qemu-iotests/common.rc +++ b/tests/qemu-iotests/common.rc @@ -490,6 +490,20 @@ _default_cache_mode() return fi } +_supported_aio_modes() +{ +for mode; do +if [ "$mode" = "$AIOMODE" ]; then +return +fi +done +_notrun "not suitable for aio mode: 
$AIOMODE" +} +_default_aio_mode() +{ +AIOMODE="$1" +QEMU_IO="$QEMU_IO --aio $1" +} _unsupported_imgopts() { diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py index ce74177ab1..76f1ab0945 100644 --- a/tests/qemu-iotests/iotests.py +++ b/tests/qemu-iotests/iotests.py @@ -58,6 +58,7 @@ imgproto = os.environ.get('IMGPROTO', 'file') test_dir = os.environ.get('TEST_DIR') output_dir = os.environ.get('OUTPUT_DIR', '.') cachemode = os.environ.get('CACHEMODE') +aiomode = os.environ.get('AIOMODE') qemu_default_machine = os.environ.get('QEMU_DEFAULT_MACHINE') socket_scm_helper = os.environ.get('SOCKET_SCM_HELPER', 'socket_scm_helper') @@ -457,6 +458,7 @@ class VM(qtest.QEMUQtestMachine): options.append('file=%s' % path) options.append('format=%s' % format) options.append('cache=%s' % cachemode) +options.append('aio=%s' % aiomode) if opts: options.append(opts) @@ -799,6 +801,10 @@ def verify_cache_mode(supported_cache_modes=[]): if supported_cache_modes and (cachemode not in supported_cache_modes): notrun('not suitable for this cache mode: %s' % cachemode) +def verify_aio_mode(supported_aio_modes=[]): +if supported_aio_modes and (aiomode not in supported_aio_modes): +notrun('not suitable for this aio mode: %s' % aiomode) + def supports_quorum(): return 'quorum' in qemu_img_pipe('--help') @@ -843,7 +849,7 @@ def skip_if_unsupported(required_formats=[], read_only=False): return skip_test_decorator def main(supported_fmts=[], supported_oses=['linux'], supported_cache_modes=[], - unsupported_fmts=[]): +supported_aio_modes=[], unsupported_fmts=[]): '''Run tests''' global debug @@ -861,6 +867,7 @@ def main(supported_fmts=[], supported_oses=['linux'], supported_cache_modes=[], verify_image_format(supported_fmts, unsupported_fmts) verify_platform(supported_oses) verify_cache_mode(supported_cache_modes) +verify_aio_mode(supported_aio_modes) if debug: output = sys.stdout -- 2.21.0
[Qemu-devel] [PATCH v9 16/17] block/io_uring: adds fd registration
Signed-off-by: Aarushi Mehta --- block/io_uring.c | 107 - block/trace-events | 1 + 2 files changed, 107 insertions(+), 1 deletion(-) diff --git a/block/io_uring.c b/block/io_uring.c index 86f32e18a1..1553cd2e58 100644 --- a/block/io_uring.c +++ b/block/io_uring.c @@ -45,10 +45,16 @@ typedef struct LuringQueue { QSIMPLEQ_HEAD(, LuringAIOCB) submit_queue; } LuringQueue; +typedef struct LuringFd { +int *fd_array; +GHashTable *fd_lookup; +} LuringFd; + typedef struct LuringState { AioContext *aio_context; struct io_uring ring; +LuringFd fd_reg; /* io queue for submit at batch. Protected by AioContext lock. */ LuringQueue io_q; @@ -306,6 +312,94 @@ static int ioq_submit(LuringState *s) return ret; } +/** + * luring_fd_register: + * + * Register file descriptors, see luring_fd_lookup + */ +static int luring_fd_register(struct io_uring *ring, LuringFd *fd_reg, int fd) +{ +int ret, nr; +GHashTable *lookup = fd_reg->fd_lookup; +nr = g_hash_table_size(lookup); + +/* If adding new, API requires older registrations to be removed */ +if (nr) { +/* + * See linux b19062a56726, register needs the ring mutex, any + * submission in progress will complete before unregistering begins + * and new ones will have to wait. 
+ */ +ret = io_uring_unregister_files(ring); +if (ret < 0) { +return ret; +} +} + +fd_reg->fd_array = g_realloc_n(fd_reg->fd_array, nr + 1, sizeof(int)); +fd_reg->fd_array[nr] = fd; + +g_hash_table_insert(lookup, GINT_TO_POINTER(fd), GINT_TO_POINTER(nr)); +trace_luring_fd_register(fd, nr); +return io_uring_register_files(ring, fd_reg->fd_array, nr + 1); +} +/** + * luring_fd_unregister: + * + * Unregisters file descriptors, TODO: error handling + */ +static void luring_fd_unregister(LuringState *s) +{ +io_uring_unregister_files(>ring); +g_hash_table_unref(s->fd_reg.fd_lookup); +g_free(s->fd_reg.fd_array); +} + +/** + * luring_fd_lookup: + * + * Used to lookup fd index in registered array at submission time + * If the lookup table has not been created or the fd is not in the table, + * the fd is registered. + * + * If registration errors, the hash is cleared and the fd used directly + * + * Unregistering is done at luring_detach_aio_context + */ +static int luring_fd_lookup(LuringState *s, int fd) +{ +int ret; +void *index; +GHashTable *lookup; + +if (!s->fd_reg.fd_lookup) { +s->fd_reg.fd_lookup = g_hash_table_new_full(g_direct_hash, +g_direct_equal, +g_free, g_free); +luring_fd_register(>ring, >fd_reg, fd); +} +lookup = s->fd_reg.fd_lookup; +index = g_hash_table_lookup(lookup, GINT_TO_POINTER(fd)); + +if (index < 0) { +ret = luring_fd_register(>ring, >fd_reg, fd); + +if (ret < 0) { +if (ret == -ENOMEM || ret == -EMFILE || +ret == -ENXIO) { +return ret; +} else { +/* Should not reach here */ +g_hash_table_remove_all(lookup); +g_free(s->fd_reg.fd_array); +return ret; +} +} +index = g_hash_table_lookup(lookup, GINT_TO_POINTER(fd)); +} +return GPOINTER_TO_INT(index); +} + void luring_io_plug(BlockDriverState *bs, LuringState *s) { trace_luring_io_plug(s); @@ -337,9 +431,14 @@ void luring_io_unplug(BlockDriverState *bs, LuringState *s) static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s, uint64_t offset, int type) { -int ret; +int ret, fd_index; 
struct io_uring_sqe *sqes = >sqeq; +fd_index = luring_fd_lookup(s, fd); +if (fd_index >= 0) { +fd = fd_index; +} + switch (type) { case QEMU_AIO_WRITE: io_uring_prep_writev(sqes, fd, luringcb->qiov->iov, @@ -357,7 +456,11 @@ static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s, __func__, type); abort(); } + io_uring_sqe_set_data(sqes, luringcb); +if (fd_index >= 0) { +io_uring_sqe_set_flags(sqes, IOSQE_FIXED_FILE); +} QSIMPLEQ_INSERT_TAIL(>io_q.submit_queue, luringcb, next); s->io_q.in_queue++; @@ -383,6 +486,7 @@ int coroutine_fn luring_co_submit(BlockDriverState *bs, LuringState *s, int fd, .qiov = qiov, .is_read= (type == QEMU_AIO_READ), }; + trace_luring_co_submit(bs, s, , fd, offset, qiov ? qiov->size : 0, type); ret = luring_do_submit(fd, , s, offset, type); @@ -399,6 +503,7 @@ int coroutine_fn luring_co_submit(BlockDriverState *bs, LuringState *s,
[Qemu-devel] [PATCH v9 13/17] qemu-nbd: adds option for aio engines
Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi Acked-by: Eric Blake --- qemu-nbd.c| 12 qemu-nbd.texi | 4 ++-- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/qemu-nbd.c b/qemu-nbd.c index a8cb39e510..7bb479f3c0 100644 --- a/qemu-nbd.c +++ b/qemu-nbd.c @@ -135,7 +135,7 @@ static void usage(const char *name) "'[ID_OR_NAME]'\n" " -n, --nocache disable host cache\n" " --cache=MODE set cache mode (none, writeback, ...)\n" -" --aio=MODEset AIO mode (native or threads)\n" +" --aio=MODEset AIO mode (native, io_uring or threads)\n" " --discard=MODEset discard mode (ignore, unmap)\n" " --detect-zeroes=MODE set detect-zeroes mode (off, on, unmap)\n" " --image-opts treat FILE as a full set of image options\n" @@ -718,13 +718,9 @@ int main(int argc, char **argv) exit(EXIT_FAILURE); } seen_aio = true; -if (!strcmp(optarg, "native")) { -flags |= BDRV_O_NATIVE_AIO; -} else if (!strcmp(optarg, "threads")) { -/* this is the default */ -} else { - error_report("invalid aio mode `%s'", optarg); - exit(EXIT_FAILURE); +if (bdrv_parse_aio(optarg, &flags) < 0) { +error_report("Invalid aio mode '%s'", optarg); +exit(EXIT_FAILURE); } break; case QEMU_NBD_OPT_DISCARD: diff --git a/qemu-nbd.texi b/qemu-nbd.texi index 7f55657722..3ee3e4bdee 100644 --- a/qemu-nbd.texi +++ b/qemu-nbd.texi @@ -77,8 +77,8 @@ as an read-only device, @var{snapshot_param} format is The cache mode to be used with the file. See the documentation of the emulator's @code{-drive cache=...} option for allowed values. @item --aio=@var{aio} -Set the asynchronous I/O mode between @samp{threads} (the default) -and @samp{native} (Linux only). +Set the asynchronous I/O mode between @samp{threads} (the default), +@samp{native} (Linux only) and @samp{io_uring} (Linux 5.1+). @item --discard=@var{discard} Control whether @dfn{discard} (also known as @dfn{trim} or @dfn{unmap}) requests are ignored or passed to the filesystem. @var{discard} is one of -- 2.21.0
[Qemu-devel] [PATCH v9 12/17] qemu-img: adds option to use aio engine for benchmarking
Signed-off-by: Aarushi Mehta --- qemu-img-cmds.hx | 4 ++-- qemu-img.c | 11 ++- qemu-img.texi| 5 - 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx index 1c93e6d185..77b5a8dda8 100644 --- a/qemu-img-cmds.hx +++ b/qemu-img-cmds.hx @@ -20,9 +20,9 @@ STEXI ETEXI DEF("bench", img_bench, -"bench [-c count] [-d depth] [-f fmt] [--flush-interval=flush_interval] [-n] [--no-drain] [-o offset] [--pattern=pattern] [-q] [-s buffer_size] [-S step_size] [-t cache] [-w] [-U] filename") +"bench [-c count] [-d depth] [-f fmt] [--flush-interval=flush_interval] [-n] [--no-drain] [-o offset] [--pattern=pattern] [-q] [-s buffer_size] [-S step_size] [-t cache] [-i aio] [-w] [-U] filename") STEXI -@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [--flush-interval=@var{flush_interval}] [-n] [--no-drain] [-o @var{offset}] [--pattern=@var{pattern}] [-q] [-s @var{buffer_size}] [-S @var{step_size}] [-t @var{cache}] [-w] [-U] @var{filename} +@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [--flush-interval=@var{flush_interval}] [-n] [--no-drain] [-o @var{offset}] [--pattern=@var{pattern}] [-q] [-s @var{buffer_size}] [-S @var{step_size}] [-t @var{cache}] [-i @var{aio}] [-w] [-U] @var{filename} ETEXI DEF("check", img_check, diff --git a/qemu-img.c b/qemu-img.c index 79983772de..27ac33f7d7 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -4192,7 +4192,8 @@ static int img_bench(int argc, char **argv) {"force-share", no_argument, 0, 'U'}, {0, 0, 0, 0} }; -c = getopt_long(argc, argv, ":hc:d:f:no:qs:S:t:wU", long_options, NULL); +c = getopt_long(argc, argv, ":hc:d:f:ni:o:qs:S:t:wU", long_options, +NULL); if (c == -1) { break; } @@ -4235,6 +4236,14 @@ static int img_bench(int argc, char **argv) case 'n': flags |= BDRV_O_NATIVE_AIO; break; +case 'i': +ret = bdrv_parse_aio(optarg, ); +if (ret < 0) { +error_report("Invalid aio option: %s", optarg); +ret = -1; +goto out; +} +break; case 'o': { offset = cvtnum(optarg); diff --git 
a/qemu-img.texi b/qemu-img.texi index c8e9bba515..0a2eccea85 100644 --- a/qemu-img.texi +++ b/qemu-img.texi @@ -206,7 +206,7 @@ Command description: Amends the image format specific @var{options} for the image file @var{filename}. Not all file formats support this operation. -@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [--flush-interval=@var{flush_interval}] [-n] [--no-drain] [-o @var{offset}] [--pattern=@var{pattern}] [-q] [-s @var{buffer_size}] [-S @var{step_size}] [-t @var{cache}] [-w] [-U] @var{filename} +@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [--flush-interval=@var{flush_interval}] [-n] [-i @var{aio}][--no-drain] [-o @var{offset}] [--pattern=@var{pattern}] [-q] [-s @var{buffer_size}] [-S @var{step_size}] [-t @var{cache}] [-w] [-U] @var{filename} Run a simple sequential I/O benchmark on the specified image. If @code{-w} is specified, a write test is performed, otherwise a read test is performed. @@ -227,6 +227,9 @@ If @code{-n} is specified, the native AIO backend is used if possible. On Linux, this option only works if @code{-t none} or @code{-t directsync} is specified as well. +If @code{-i} is specified, aio option can be used to specify different AIO +backends: @var{threads}, @var{native} or @var{io_uring}. + For write tests, by default a buffer filled with zeros is written. This can be overridden with a pattern byte specified by @var{pattern}. -- 2.21.0
[Qemu-devel] [PATCH v9 11/17] qemu-io: adds option to use aio engine
Signed-off-by: Aarushi Mehta --- qemu-io.c | 25 + 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/qemu-io.c b/qemu-io.c index f64eca6940..0abb4af134 100644 --- a/qemu-io.c +++ b/qemu-io.c @@ -130,7 +130,8 @@ static void open_help(void) " -C, -- use copy-on-read\n" " -n, -- disable host cache, short for -t none\n" " -U, -- force shared permissions\n" -" -k, -- use kernel AIO implementation (on Linux only)\n" +" -k, -- use kernel AIO implementation (Linux only, prefer use of -i)\n" +" -i, -- use AIO mode (threads, native or io_uring)\n" " -t, -- use the given cache mode for the image\n" " -d, -- use the given discard mode for the image\n" " -o, -- options to be given to the block driver" @@ -172,7 +173,7 @@ static int open_f(BlockBackend *blk, int argc, char **argv) QDict *opts; bool force_share = false; -while ((c = getopt(argc, argv, "snCro:kt:d:U")) != -1) { +while ((c = getopt(argc, argv, "snCro:ki:t:d:U")) != -1) { switch (c) { case 's': flags |= BDRV_O_SNAPSHOT; @@ -204,6 +205,13 @@ static int open_f(BlockBackend *blk, int argc, char **argv) return -EINVAL; } break; +case 'i': +if (bdrv_parse_aio(optarg, ) < 0) { +error_report("Invalid aio option: %s", optarg); +qemu_opts_reset(_opts); +return -EINVAL; +} +break; case 'o': if (imageOpts) { printf("--image-opts and 'open -o' are mutually exclusive\n"); @@ -291,7 +299,9 @@ static void usage(const char *name) " -n, --nocachedisable host cache, short for -t none\n" " -C, --copy-on-read enable copy-on-read\n" " -m, --misalign misalign allocations for O_DIRECT\n" -" -k, --native-aio use kernel AIO implementation (on Linux only)\n" +" -k, --native-aio use kernel AIO implementation\n" +" (Linux only, prefer use of -i)\n" +" -i, --aio=MODE use AIO mode (threads, native or io_uring)\n" " -t, --cache=MODE use the given cache mode for the image\n" " -d, --discard=MODE use the given discard mode for the image\n" " -T, --trace [[enable=]][,events=][,file=]\n" @@ -489,7 +499,7 @@ static QemuOptsList file_opts = 
{ int main(int argc, char **argv) { int readonly = 0; -const char *sopt = "hVc:d:f:rsnCmkt:T:U"; +const char *sopt = "hVc:d:f:rsnCmki:t:T:U"; const struct option lopt[] = { { "help", no_argument, NULL, 'h' }, { "version", no_argument, NULL, 'V' }, @@ -501,6 +511,7 @@ int main(int argc, char **argv) { "copy-on-read", no_argument, NULL, 'C' }, { "misalign", no_argument, NULL, 'm' }, { "native-aio", no_argument, NULL, 'k' }, +{ "aio", required_argument, NULL, 'i' }, { "discard", required_argument, NULL, 'd' }, { "cache", required_argument, NULL, 't' }, { "trace", required_argument, NULL, 'T' }, @@ -568,6 +579,12 @@ int main(int argc, char **argv) case 'k': flags |= BDRV_O_NATIVE_AIO; break; +case 'i': +if (bdrv_parse_aio(optarg, ) < 0) { +error_report("Invalid aio option: %s", optarg); +exit(1); +} +break; case 't': if (bdrv_parse_cache_mode(optarg, , ) < 0) { error_report("Invalid cache option: %s", optarg); -- 2.21.0
[Qemu-devel] [PATCH v9 15/17] tests/qemu-iotests: use AIOMODE with various tests
Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi --- tests/qemu-iotests/028 | 3 ++- tests/qemu-iotests/058 | 2 +- tests/qemu-iotests/089 | 4 ++-- tests/qemu-iotests/091 | 7 --- tests/qemu-iotests/109 | 3 ++- tests/qemu-iotests/147 | 5 +++-- tests/qemu-iotests/181 | 10 +- tests/qemu-iotests/183 | 7 --- tests/qemu-iotests/185 | 17 - tests/qemu-iotests/200 | 3 ++- tests/qemu-iotests/201 | 10 +- 11 files changed, 42 insertions(+), 29 deletions(-) diff --git a/tests/qemu-iotests/028 b/tests/qemu-iotests/028 index 01f495912f..59e7b670ed 100755 --- a/tests/qemu-iotests/028 +++ b/tests/qemu-iotests/028 @@ -108,7 +108,8 @@ echo block-backup echo qemu_comm_method="monitor" -_launch_qemu -drive file="${TEST_IMG}",cache=${CACHEMODE},id=disk +_launch_qemu -drive file="${TEST_IMG}",cache=${CACHEMODE},aio=${AIOMODE},\ +id=disk h=$QEMU_HANDLE QEMU_COMM_TIMEOUT=1 diff --git a/tests/qemu-iotests/058 b/tests/qemu-iotests/058 index 8c3212a72f..38d1ed90c0 100755 --- a/tests/qemu-iotests/058 +++ b/tests/qemu-iotests/058 @@ -64,7 +64,7 @@ nbd_snapshot_img="nbd:unix:$nbd_unix_socket" converted_image=$TEST_IMG.converted # Use -f raw instead of -f $IMGFMT for the NBD connection -QEMU_IO_NBD="$QEMU_IO -f raw --cache=$CACHEMODE" +QEMU_IO_NBD="$QEMU_IO -f raw --cache=$CACHEMODE --aio=$AIOMODE" echo echo "== preparing image ==" diff --git a/tests/qemu-iotests/089 b/tests/qemu-iotests/089 index ad029f1f09..059ad75e28 100755 --- a/tests/qemu-iotests/089 +++ b/tests/qemu-iotests/089 @@ -64,7 +64,7 @@ $QEMU_IO -c 'write -P 42 0 512' -c 'write -P 23 512 512' \ $QEMU_IMG convert -f raw -O $IMGFMT "$TEST_IMG.base" "$TEST_IMG" -$QEMU_IO_PROG --cache $CACHEMODE \ +$QEMU_IO_PROG --cache $CACHEMODE --aio $AIOMODE \ -c 'read -P 42 0 512' -c 'read -P 23 512 512' \ -c 'read -P 66 1024 512' "json:{ \"driver\": \"$IMGFMT\", @@ -111,7 +111,7 @@ $QEMU_IO -c 'write -P 42 0x38000 512' "$TEST_IMG" | _filter_qemu_io # The "image.filename" part tests whether "a": { "b": "c" } and "a.b": "c" do # the same 
(which they should). -$QEMU_IO_PROG --cache $CACHEMODE \ +$QEMU_IO_PROG --cache $CACHEMODE --aio $AIOMODE \ -c 'read -P 42 0x38000 512' "json:{ \"driver\": \"$IMGFMT\", \"file\": { diff --git a/tests/qemu-iotests/091 b/tests/qemu-iotests/091 index d62ef18a02..78741d3fe7 100755 --- a/tests/qemu-iotests/091 +++ b/tests/qemu-iotests/091 @@ -60,14 +60,15 @@ echo === Starting QEMU VM1 === echo qemu_comm_method="monitor" -_launch_qemu -drive file="${TEST_IMG}",cache=${CACHEMODE},id=disk +_launch_qemu -drive file="${TEST_IMG}",cache=${CACHEMODE},aio=${AIOMODE},\ + id=disk h1=$QEMU_HANDLE echo echo === Starting QEMU VM2 === echo -_launch_qemu -drive file="${TEST_IMG}",cache=${CACHEMODE},id=disk \ - -incoming "exec: cat '${MIG_FIFO}'" +_launch_qemu -drive file="${TEST_IMG}",cache=${CACHEMODE},aio=${AIOMODE},\ + id=disk -incoming "exec: cat '${MIG_FIFO}'" h2=$QEMU_HANDLE echo diff --git a/tests/qemu-iotests/109 b/tests/qemu-iotests/109 index 9897ceb6cd..451709689a 100755 --- a/tests/qemu-iotests/109 +++ b/tests/qemu-iotests/109 @@ -52,7 +52,8 @@ run_qemu() local qmp_format="$3" local qmp_event="$4" -_launch_qemu -drive file="${source_img}",format=raw,cache=${CACHEMODE},id=src +_launch_qemu -drive file="${source_img}",format=raw,cache=${CACHEMODE},\ + aio=${AIOMODE},id=src _send_qemu_cmd $QEMU_HANDLE "{ 'execute': 'qmp_capabilities' }" "return" _send_qemu_cmd $QEMU_HANDLE \ diff --git a/tests/qemu-iotests/147 b/tests/qemu-iotests/147 index 2d84fddb01..632973b23c 100755 --- a/tests/qemu-iotests/147 +++ b/tests/qemu-iotests/147 @@ -24,7 +24,7 @@ import socket import stat import time import iotests -from iotests import cachemode, imgfmt, qemu_img, qemu_nbd, qemu_nbd_early_pipe +from iotests import cachemode, aiomode, imgfmt, qemu_img, qemu_nbd, qemu_nbd_early_pipe NBD_PORT_START = 32768 NBD_PORT_END= NBD_PORT_START + 1024 @@ -134,7 +134,8 @@ class BuiltinNBD(NBDBlockdevAddBase): self.server.add_drive_raw('if=none,id=nbd-export,' + 'file=%s,' % test_img + 'format=%s,' % imgfmt + 
- 'cache=%s' % cachemode) + 'cache=%s' % cachemode + + 'aio=%s' % aiomode) self.server.launch() def tearDown(self): diff --git a/tests/qemu-iotests/181 b/tests/qemu-iotests/181 index e317e63422..547c1b47
[Qemu-devel] [PATCH v9 09/17] block: add trace events for io_uring
Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi --- block/io_uring.c | 22 +++--- block/trace-events | 12 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/block/io_uring.c b/block/io_uring.c index 902b106954..c8e6526747 100644 --- a/block/io_uring.c +++ b/block/io_uring.c @@ -17,6 +17,7 @@ #include "block/raw-aio.h" #include "qemu/coroutine.h" #include "qapi/error.h" +#include "trace.h" #define MAX_EVENTS 128 @@ -93,6 +94,8 @@ static void luring_resubmit_short_read(LuringState *s, LuringAIOCB *luringcb, QEMUIOVector *resubmit_qiov; size_t remaining; +trace_luring_resubmit_short_read(s, luringcb, nread); + /* Update read position */ luringcb->total_read = nread; remaining = luringcb->qiov->size - luringcb->total_read; @@ -164,6 +167,7 @@ static void luring_process_completions(LuringState *s) /* Change counters one-by-one because we can be nested. */ s->io_q.in_flight--; +trace_luring_process_completion(s, luringcb, ret); /* total_read is non-zero only for resubmitted read requests */ total_bytes = ret + luringcb->total_read; @@ -264,6 +268,7 @@ static int ioq_submit(LuringState *s) QSIMPLEQ_REMOVE_HEAD(>io_q.submit_queue, next); } ret = io_uring_submit(>ring); +trace_luring_io_uring_submit(s, ret); /* Prevent infinite loop if submission is refused */ if (ret <= 0) { if (ret == -EAGAIN) { @@ -288,12 +293,15 @@ static int ioq_submit(LuringState *s) void luring_io_plug(BlockDriverState *bs, LuringState *s) { +trace_luring_io_plug(s); s->io_q.plugged++; } void luring_io_unplug(BlockDriverState *bs, LuringState *s) { assert(s->io_q.plugged); +trace_luring_io_unplug(s, s->io_q.blocked, s->io_q.plugged, + s->io_q.in_queue, s->io_q.in_flight); if (--s->io_q.plugged == 0 && !s->io_q.blocked && s->io_q.in_queue > 0) { ioq_submit(s); @@ -314,6 +322,7 @@ void luring_io_unplug(BlockDriverState *bs, LuringState *s) static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s, uint64_t offset, int type) { +int ret; struct io_uring_sqe *sqes = 
>sqeq; switch (type) { @@ -337,11 +346,14 @@ static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s, QSIMPLEQ_INSERT_TAIL(>io_q.submit_queue, luringcb, next); s->io_q.in_queue++; - +trace_luring_do_submit(s, s->io_q.blocked, s->io_q.plugged, + s->io_q.in_queue, s->io_q.in_flight); if (!s->io_q.blocked && (!s->io_q.plugged || s->io_q.in_flight + s->io_q.in_queue >= MAX_EVENTS)) { -return ioq_submit(s); +ret = ioq_submit(s); +trace_luring_do_submit_done(s, ret); +return ret; } return 0; } @@ -356,8 +368,10 @@ int coroutine_fn luring_co_submit(BlockDriverState *bs, LuringState *s, int fd, .qiov = qiov, .is_read= (type == QEMU_AIO_READ), }; - +trace_luring_co_submit(bs, s, , fd, offset, qiov ? qiov->size : 0, + type); ret = luring_do_submit(fd, , s, offset, type); + if (ret < 0) { return ret; } @@ -389,6 +403,7 @@ LuringState *luring_init(Error **errp) int rc; LuringState *s; s = g_new0(LuringState, 1); +trace_luring_init_state(s, sizeof(*s)); struct io_uring *ring = >ring; rc = io_uring_queue_init(MAX_EVENTS, ring, 0); if (rc < 0) { @@ -406,4 +421,5 @@ void luring_cleanup(LuringState *s) { io_uring_queue_exit(>ring); g_free(s); +trace_luring_cleanup_state(s); } diff --git a/block/trace-events b/block/trace-events index d724df0117..66aaf8352b 100644 --- a/block/trace-events +++ b/block/trace-events @@ -60,6 +60,18 @@ qmp_block_stream(void *bs) "bs %p" file_paio_submit(void *acb, void *opaque, int64_t offset, int count, int type) "acb %p opaque %p offset %"PRId64" count %d type %d" file_copy_file_range(void *bs, int src, int64_t src_off, int dst, int64_t dst_off, int64_t bytes, int flags, int64_t ret) "bs %p src_fd %d offset %"PRIu64" dst_fd %d offset %"PRIu64" bytes %"PRIu64" flags %d ret %"PRId64 +#io_uring.c +luring_init_state(void *s, size_t size) "s %p size %zu" +luring_cleanup_state(void *s) "%p freed" +luring_io_plug(void *s) "LuringState %p plug" +luring_io_unplug(void *s, int blocked, int plugged, int queued, int inflight) "LuringState %p 
blocked %d plugged %d queued %d inflight %d" +luring_do_submit(void *s, int blocked, int plugged, int queued, int inflight) "LuringState %p blo
[Qemu-devel] [PATCH v9 06/17] util/async: add aio interfaces for io_uring
Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi --- util/async.c | 36 1 file changed, 36 insertions(+) diff --git a/util/async.c b/util/async.c index 8d2105729c..2e0a5e20ac 100644 --- a/util/async.c +++ b/util/async.c @@ -276,6 +276,14 @@ aio_ctx_finalize(GSource *source) } #endif +#ifdef CONFIG_LINUX_IO_URING +if (ctx->linux_io_uring) { +luring_detach_aio_context(ctx->linux_io_uring, ctx); +luring_cleanup(ctx->linux_io_uring); +ctx->linux_io_uring = NULL; +} +#endif + assert(QSLIST_EMPTY(&ctx->scheduled_coroutines)); qemu_bh_delete(ctx->co_schedule_bh); @@ -340,6 +348,29 @@ LinuxAioState *aio_get_linux_aio(AioContext *ctx) } #endif +#ifdef CONFIG_LINUX_IO_URING +LuringState *aio_setup_linux_io_uring(AioContext *ctx, Error **errp) +{ +if (ctx->linux_io_uring) { +return ctx->linux_io_uring; +} + +ctx->linux_io_uring = luring_init(errp); +if (!ctx->linux_io_uring) { +return NULL; +} + +luring_attach_aio_context(ctx->linux_io_uring, ctx); +return ctx->linux_io_uring; +} + +LuringState *aio_get_linux_io_uring(AioContext *ctx) +{ +assert(ctx->linux_io_uring); +return ctx->linux_io_uring; +} +#endif + void aio_notify(AioContext *ctx) { /* Write e.g. bh->scheduled before reading ctx->notify_me. Pairs @@ -431,6 +462,11 @@ AioContext *aio_context_new(Error **errp) #ifdef CONFIG_LINUX_AIO ctx->linux_aio = NULL; #endif + +#ifdef CONFIG_LINUX_IO_URING +ctx->linux_io_uring = NULL; +#endif + ctx->thread_pool = NULL; qemu_rec_mutex_init(&ctx->lock); timerlistgroup_init(&ctx->tlg, aio_timerlist_notify, ctx); -- 2.21.0
[Qemu-devel] [PATCH v9 10/17] block/io_uring: adds userspace completion polling
Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi --- block/io_uring.c | 17 - 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/block/io_uring.c b/block/io_uring.c index c8e6526747..86f32e18a1 100644 --- a/block/io_uring.c +++ b/block/io_uring.c @@ -238,6 +238,21 @@ static void qemu_luring_completion_cb(void *opaque) luring_process_completions_and_submit(s); } +static bool qemu_luring_poll_cb(void *opaque) +{ +LuringState *s = opaque; +struct io_uring_cqe *cqes; + +if (io_uring_peek_cqe(&s->ring, &cqes) == 0) { +if (cqes) { +luring_process_completions_and_submit(s); +return true; +} +} + +return false; +} + static void ioq_init(LuringQueue *io_q) { QSIMPLEQ_INIT(&io_q->submit_queue); @@ -395,7 +410,7 @@ void luring_attach_aio_context(LuringState *s, AioContext *new_context) s->aio_context = new_context; s->completion_bh = aio_bh_new(new_context, qemu_luring_completion_bh, s); aio_set_fd_handler(s->aio_context, s->ring.ring_fd, false, - qemu_luring_completion_cb, NULL, NULL, s); + qemu_luring_completion_cb, NULL, qemu_luring_poll_cb, s); } LuringState *luring_init(Error **errp) -- 2.21.0
[Qemu-devel] [PATCH v9 07/17] blockdev: adds bdrv_parse_aio to use io_uring
Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi --- block.c | 22 ++ blockdev.c| 12 include/block/block.h | 1 + 3 files changed, 27 insertions(+), 8 deletions(-) diff --git a/block.c b/block.c index cbd8da5f3b..401831e28d 100644 --- a/block.c +++ b/block.c @@ -844,6 +844,28 @@ static BlockdevDetectZeroesOptions bdrv_parse_detect_zeroes(QemuOpts *opts, return detect_zeroes; } +/** + * Set open flags for aio engine + * + * Return 0 on success, -1 if the engine specified is invalid + */ +int bdrv_parse_aio(const char *mode, int *flags) +{ +if (!strcmp(mode, "threads")) { +/* do nothing, default */ +} else if (!strcmp(mode, "native")) { +*flags |= BDRV_O_NATIVE_AIO; +#ifdef CONFIG_LINUX_IO_URING +} else if (!strcmp(mode, "io_uring")) { +*flags |= BDRV_O_IO_URING; +#endif +} else { +return -1; +} + +return 0; +} + /** * Set open flags for a given discard mode * diff --git a/blockdev.c b/blockdev.c index 4d141e9a1f..a41623ae9a 100644 --- a/blockdev.c +++ b/blockdev.c @@ -383,13 +383,9 @@ static void extract_common_blockdev_options(QemuOpts *opts, int *bdrv_flags, } if ((aio = qemu_opt_get(opts, "aio")) != NULL) { -if (!strcmp(aio, "native")) { -*bdrv_flags |= BDRV_O_NATIVE_AIO; -} else if (!strcmp(aio, "threads")) { -/* this is the default */ -} else { - error_setg(errp, "invalid aio option"); - return; +if (bdrv_parse_aio(aio, bdrv_flags) < 0) { +error_setg(errp, "invalid aio option"); +return; } } } @@ -4574,7 +4570,7 @@ QemuOptsList qemu_common_drive_opts = { },{ .name = "aio", .type = QEMU_OPT_STRING, -.help = "host AIO implementation (threads, native)", +.help = "host AIO implementation (threads, native, io_uring)", },{ .name = BDRV_OPT_CACHE_WB, .type = QEMU_OPT_BOOL, diff --git a/include/block/block.h b/include/block/block.h index e29baa172c..ec6b9ea4c8 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -297,6 +297,7 @@ void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top, void bdrv_replace_node(BlockDriverState *from, 
BlockDriverState *to, Error **errp); +int bdrv_parse_aio(const char *mode, int *flags); int bdrv_parse_cache_mode(const char *mode, int *flags, bool *writethrough); int bdrv_parse_discard_flags(const char *mode, int *flags); BdrvChild *bdrv_open_child(const char *filename, -- 2.21.0
[Qemu-devel] [PATCH v9 04/17] block/io_uring: implements interfaces for io_uring
Aborts when sqe fails to be set as sqes cannot be returned to the ring. Adds slow path for short reads for older kernels Signed-off-by: Aarushi Mehta Signed-off-by: Stefan Hajnoczi Reviewed-by: Stefan Hajnoczi --- MAINTAINERS | 7 + block/Makefile.objs | 3 + block/io_uring.c| 409 include/block/aio.h | 16 +- include/block/raw-aio.h | 12 ++ 5 files changed, 446 insertions(+), 1 deletion(-) create mode 100644 block/io_uring.c diff --git a/MAINTAINERS b/MAINTAINERS index d6de200453..be688fcd5e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2530,6 +2530,13 @@ F: block/file-posix.c F: block/file-win32.c F: block/win32-aio.c +Linux io_uring +M: Aarushi Mehta +R: Stefan Hajnoczi +L: qemu-bl...@nongnu.org +S: Maintained +F: block/io_uring.c + qcow2 M: Kevin Wolf M: Max Reitz diff --git a/block/Makefile.objs b/block/Makefile.objs index 35f3bca4d9..6a548af8ed 100644 --- a/block/Makefile.objs +++ b/block/Makefile.objs @@ -18,6 +18,7 @@ block-obj-y += block-backend.o snapshot.o qapi.o block-obj-$(CONFIG_WIN32) += file-win32.o win32-aio.o block-obj-$(CONFIG_POSIX) += file-posix.o block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o +block-obj-$(CONFIG_LINUX_IO_URING) += io_uring.o block-obj-y += null.o mirror.o commit.o io.o create.o block-obj-y += throttle-groups.o block-obj-$(CONFIG_LINUX) += nvme.o @@ -61,5 +62,7 @@ block-obj-$(if $(CONFIG_LZFSE),m,n) += dmg-lzfse.o dmg-lzfse.o-libs := $(LZFSE_LIBS) qcow.o-libs:= -lz linux-aio.o-libs := -laio +io_uring.o-cflags := $(LINUX_IO_URING_CFLAGS) +io_uring.o-libs:= $(LINUX_IO_URING_LIBS) parallels.o-cflags := $(LIBXML2_CFLAGS) parallels.o-libs := $(LIBXML2_LIBS) diff --git a/block/io_uring.c b/block/io_uring.c new file mode 100644 index 00..902b106954 --- /dev/null +++ b/block/io_uring.c @@ -0,0 +1,409 @@ +/* + * Linux io_uring support. + * + * Copyright (C) 2009 IBM, Corp. + * Copyright (C) 2009 Red Hat, Inc. + * Copyright (C) 2019 Aarushi Mehta + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. 
+ * See the COPYING file in the top-level directory. + */ +#include "qemu/osdep.h" +#include <liburing.h> +#include "qemu-common.h" +#include "block/aio.h" +#include "qemu/queue.h" +#include "block/block.h" +#include "block/raw-aio.h" +#include "qemu/coroutine.h" +#include "qapi/error.h" + +#define MAX_EVENTS 128 + +typedef struct LuringAIOCB { +Coroutine *co; +struct io_uring_sqe sqeq; +ssize_t ret; +QEMUIOVector *qiov; +bool is_read; +QSIMPLEQ_ENTRY(LuringAIOCB) next; + +/* + * Buffered reads may require resubmission, see + * luring_resubmit_short_read(). + */ +int total_read; +QEMUIOVector resubmit_qiov; +} LuringAIOCB; + +typedef struct LuringQueue { +int plugged; +unsigned int in_queue; +unsigned int in_flight; +bool blocked; +QSIMPLEQ_HEAD(, LuringAIOCB) submit_queue; +} LuringQueue; + +typedef struct LuringState { +AioContext *aio_context; + +struct io_uring ring; + +/* io queue for submit at batch. Protected by AioContext lock. */ +LuringQueue io_q; + +/* I/O completion processing. Only runs in I/O thread. */ +QEMUBH *completion_bh; +} LuringState; + +/** + * ioq_submit: + * @s: AIO state + * + * Queues pending sqes and submits them + * + */ +static int ioq_submit(LuringState *s); + +/** + * luring_resubmit: + * + * Resubmit a request by appending it to submit_queue. The caller must ensure + * that ioq_submit() is called later so that submit_queue requests are started. + */ +static void luring_resubmit(LuringState *s, LuringAIOCB *luringcb) +{ +QSIMPLEQ_INSERT_TAIL(&s->io_q.submit_queue, luringcb, next); +s->io_q.in_queue++; +} + +/** + * luring_resubmit_short_read: + * + * Before Linux commit 9d93a3f5a0c ("io_uring: punt short reads to async + * context") a buffered I/O request with the start of the file range in the + * page cache could result in a short read. Applications need to resubmit the + * remaining read request. + * + * This is a slow path but recent kernels never take it. 
+ */ +static void luring_resubmit_short_read(LuringState *s, LuringAIOCB *luringcb, + int nread) +{ +QEMUIOVector *resubmit_qiov; +size_t remaining; + +/* Update read position */ +luringcb->total_read = nread; +remaining = luringcb->qiov->size - luringcb->total_read; + +/* Shorten qiov */ +resubmit_qiov = &luringcb->resubmit_qiov; +if (resubmit_qiov->iov == NULL) { +qemu_iovec_init(resubmit_qiov, luringcb->qiov->niov); +} else { +qemu_iovec_reset(resubmit_qiov); +} +qemu_iovec_concat(resubmit_qiov, luringcb->qiov, luringcb->total_read, + remaining); + +/* Update sqe */ +luringcb->sqeq.off = nread;
[Qemu-devel] [PATCH v9 08/17] block/file-posix.c: extend to use io_uring
Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi Reviewed-by: Maxim Levitsky --- block/file-posix.c | 99 -- 1 file changed, 79 insertions(+), 20 deletions(-) diff --git a/block/file-posix.c b/block/file-posix.c index 4479cc7ab4..4aa42f826f 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -156,6 +156,7 @@ typedef struct BDRVRawState { bool has_write_zeroes:1; bool discard_zeroes:1; bool use_linux_aio:1; +bool use_linux_io_uring:1; bool page_cache_inconsistent:1; bool has_fallocate; bool needs_alignment; @@ -425,7 +426,7 @@ static QemuOptsList raw_runtime_opts = { { .name = "aio", .type = QEMU_OPT_STRING, -.help = "host AIO implementation (threads, native)", +.help = "host AIO implementation (threads, native, io_uring)", }, { .name = "locking", @@ -484,9 +485,15 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, goto fail; } -aio_default = (bdrv_flags & BDRV_O_NATIVE_AIO) - ? BLOCKDEV_AIO_OPTIONS_NATIVE - : BLOCKDEV_AIO_OPTIONS_THREADS; +if (bdrv_flags & BDRV_O_NATIVE_AIO) { +aio_default = BLOCKDEV_AIO_OPTIONS_NATIVE; +#ifdef CONFIG_LINUX_IO_URING +} else if (bdrv_flags & BDRV_O_IO_URING) { +aio_default = BLOCKDEV_AIO_OPTIONS_IO_URING; +#endif +} else { +aio_default = BLOCKDEV_AIO_OPTIONS_THREADS; +} aio = qapi_enum_parse(_lookup, qemu_opt_get(opts, "aio"), aio_default, _err); @@ -495,7 +502,11 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, ret = -EINVAL; goto fail; } + s->use_linux_aio = (aio == BLOCKDEV_AIO_OPTIONS_NATIVE); +#ifdef CONFIG_LINUX_IO_URING +s->use_linux_io_uring = (aio == BLOCKDEV_AIO_OPTIONS_IO_URING); +#endif locking = qapi_enum_parse(_lookup, qemu_opt_get(opts, "locking"), @@ -559,7 +570,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, s->shared_perm = BLK_PERM_ALL; #ifdef CONFIG_LINUX_AIO - /* Currently Linux does AIO only for files opened with O_DIRECT */ +/* Currently Linux does AIO only for files opened with O_DIRECT */ if (s->use_linux_aio) { if (!(s->open_flags & 
O_DIRECT)) { error_setg(errp, "aio=native was specified, but it requires " @@ -581,6 +592,22 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, } #endif /* !defined(CONFIG_LINUX_AIO) */ +#ifdef CONFIG_LINUX_IO_URING +if (s->use_linux_io_uring) { +if (!aio_setup_linux_io_uring(bdrv_get_aio_context(bs), errp)) { +error_prepend(errp, "Unable to use io_uring: "); +goto fail; +} +} +#else +if (s->use_linux_io_uring) { +error_setg(errp, "aio=io_uring was specified, but is not supported " + "in this build."); +ret = -EINVAL; +goto fail; +} +#endif /* !defined(CONFIG_LINUX_IO_URING) */ + s->has_discard = true; s->has_write_zeroes = true; if ((bs->open_flags & BDRV_O_NOCACHE) != 0) { @@ -1874,21 +1901,25 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset, return -EIO; /* - * Check if the underlying device requires requests to be aligned, - * and if the request we are trying to submit is aligned or not. - * If this is the case tell the low-level driver that it needs - * to copy the buffer. + * When using O_DIRECT, the request must be aligned to be able to use + * either libaio or io_uring interface. If not fail back to regular thread + * pool read/write code which emulates this for us if we + * set QEMU_AIO_MISALIGNED. 
*/ -if (s->needs_alignment) { -if (!bdrv_qiov_is_aligned(bs, qiov)) { -type |= QEMU_AIO_MISALIGNED; +if (s->needs_alignment && !bdrv_qiov_is_aligned(bs, qiov)) { +type |= QEMU_AIO_MISALIGNED; +#ifdef CONFIG_LINUX_IO_URING +} else if (s->use_linux_io_uring) { +LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs)); +assert(qiov->size == bytes); +return luring_co_submit(bs, aio, s->fd, offset, qiov, type); +#endif #ifdef CONFIG_LINUX_AIO -} else if (s->use_linux_aio) { -LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs)); -assert(qiov->size == bytes); -return laio_co_submit(bs, aio, s->fd, offset, qiov, type); +} else if (s->use_linux_aio) { +LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs)); +assert(qiov->size == bytes); +
[Qemu-devel] [PATCH v9 05/17] stubs: add stubs for io_uring interface
Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi --- MAINTAINERS | 1 + stubs/Makefile.objs | 1 + stubs/io_uring.c| 32 3 files changed, 34 insertions(+) create mode 100644 stubs/io_uring.c diff --git a/MAINTAINERS b/MAINTAINERS index be688fcd5e..bca5de7d6c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2536,6 +2536,7 @@ R: Stefan Hajnoczi L: qemu-bl...@nongnu.org S: Maintained F: block/io_uring.c +F: stubs/io_uring.c qcow2 M: Kevin Wolf diff --git a/stubs/Makefile.objs b/stubs/Makefile.objs index 9c7393b08c..5cf160a9c8 100644 --- a/stubs/Makefile.objs +++ b/stubs/Makefile.objs @@ -13,6 +13,7 @@ stub-obj-y += iothread.o stub-obj-y += iothread-lock.o stub-obj-y += is-daemonized.o stub-obj-$(CONFIG_LINUX_AIO) += linux-aio.o +stub-obj-$(CONFIG_LINUX_IO_URING) += io_uring.o stub-obj-y += machine-init-done.o stub-obj-y += migr-blocker.o stub-obj-y += change-state-handler.o diff --git a/stubs/io_uring.c b/stubs/io_uring.c new file mode 100644 index 00..622d1e4648 --- /dev/null +++ b/stubs/io_uring.c @@ -0,0 +1,32 @@ +/* + * Linux io_uring support. + * + * Copyright (C) 2009 IBM, Corp. + * Copyright (C) 2009 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ +#include "qemu/osdep.h" +#include "block/aio.h" +#include "block/raw-aio.h" + +void luring_detach_aio_context(LuringState *s, AioContext *old_context) +{ +abort(); +} + +void luring_attach_aio_context(LuringState *s, AioContext *new_context) +{ +abort(); +} + +LuringState *luring_init(Error **errp) +{ +abort(); +} + +void luring_cleanup(LuringState *s) +{ +abort(); +} -- 2.21.0
[Qemu-devel] [PATCH v9 01/17] configure: permit use of io_uring
Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi Reviewed-by: Maxim Levitsky --- configure | 27 +++ 1 file changed, 27 insertions(+) diff --git a/configure b/configure index 714e7fb6a1..493dbc2ec5 100755 --- a/configure +++ b/configure @@ -371,6 +371,7 @@ xen="" xen_ctrl_version="" xen_pci_passthrough="" linux_aio="" +linux_io_uring="" cap_ng="" attr="" libattr="" @@ -1272,6 +1273,10 @@ for opt do ;; --enable-linux-aio) linux_aio="yes" ;; + --disable-linux-io-uring) linux_io_uring="no" + ;; + --enable-linux-io-uring) linux_io_uring="yes" + ;; --disable-attr) attr="no" ;; --enable-attr) attr="yes" @@ -1790,6 +1795,7 @@ disabled with --disable-FEATURE, default is enabled if available: vde support for vde network netmap support for netmap network linux-aio Linux AIO support + linux-io-uring Linux io_uring support cap-ng libcap-ng support attrattr and xattr support vhost-net vhost-net kernel acceleration support @@ -3975,6 +3981,21 @@ EOF linux_aio=no fi fi +## +# linux-io-uring probe + +if test "$linux_io_uring" != "no" ; then + if $pkg_config liburing; then +linux_io_uring_cflags=$($pkg_config --cflags liburing) +linux_io_uring_libs=$($pkg_config --libs liburing) +linux_io_uring=yes + else +if test "$linux_io_uring" = "yes" ; then + feature_not_found "linux io_uring" "Install liburing devel" +fi +linux_io_uring=no + fi +fi ## # TPM emulation is only on POSIX @@ -6398,6 +6419,7 @@ echo "PIE $pie" echo "vde support $vde" echo "netmap support$netmap" echo "Linux AIO support $linux_aio" +echo "Linux io_uring support $linux_io_uring" echo "ATTR/XATTR support $attr" echo "Install blobs $blobs" echo "KVM support $kvm" @@ -6885,6 +6907,11 @@ fi if test "$linux_aio" = "yes" ; then echo "CONFIG_LINUX_AIO=y" >> $config_host_mak fi +if test "$linux_io_uring" = "yes" ; then + echo "CONFIG_LINUX_IO_URING=y" >> $config_host_mak + echo "LINUX_IO_URING_CFLAGS=$linux_io_uring_cflags" >> $config_host_mak + echo "LINUX_IO_URING_LIBS=$linux_io_uring_libs" >> $config_host_mak +fi 
if test "$attr" = "yes" ; then echo "CONFIG_ATTR=y" >> $config_host_mak fi -- 2.21.0
[Qemu-devel] [PATCH v9 02/17] qapi/block-core: add option for io_uring
Only enumerates option for devices that support it. Since QAPI schema supports io_uring, which is the actual name of the Linux API, it is preferred over io-uring. Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi --- qapi/block-core.json | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/qapi/block-core.json b/qapi/block-core.json index 0d43d4f37c..3dc93b483f 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -2792,11 +2792,13 @@ # # @threads: Use qemu's thread pool # @native: Use native AIO backend (only Linux and Windows) +# @io_uring:Use linux io_uring (since 4.2) # # Since: 2.9 ## { 'enum': 'BlockdevAioOptions', - 'data': [ 'threads', 'native' ] } + 'data': [ 'threads', 'native', +{ 'name': 'io_uring', 'if': 'defined(CONFIG_LINUX_IO_URING)' } ] } ## # @BlockdevCacheOptions: -- 2.21.0
[Qemu-devel] [PATCH v9 00/17] Add support for io_uring
This patch series adds support for the newly developed io_uring Linux AIO interface. Linux io_uring is faster than Linux's AIO asynchronous I/O code, offers efficient buffered asynchronous I/O support, the ability to do I/O without performing a system call via polled I/O, and other efficiency enhancements. Testing it requires a host kernel (5.1+) and the liburing library. Use the option -drive aio=io_uring to enable it. Benchmarks for the system are available at https://github.com/rooshm/benchmarks. There is currently an -EIO output when guests are booted from io_uring on ext4; the error has been reported upstream: https://lore.kernel.org/linux-block/20190723080701.GA3198@stefanha-x1.localdomain/ v9: - adds sq_polling - fixes leaks in fd_registration v8: - adds fd_registration v7: - completes io-tests options - misc fixes v6: - add slow path for short-read - hooks up fsync - enables qemu-iotests with aio options - adds bdrv_parse_aio Aarushi Mehta (17): configure: permit use of io_uring qapi/block-core: add option for io_uring block/block: add BDRV flag for io_uring block/io_uring: implements interfaces for io_uring stubs: add stubs for io_uring interface util/async: add aio interfaces for io_uring blockdev: adds bdrv_parse_aio to use io_uring block/file-posix.c: extend to use io_uring block: add trace events for io_uring block/io_uring: adds userspace completion polling qemu-io: adds option to use aio engine qemu-img: adds option to use aio engine for benchmarking qemu-nbd: adds option for aio engines tests/qemu-iotests: enable testing with aio options tests/qemu-iotests: use AIOMODE with various tests block/io_uring: adds fd registration block/io_uring: enable kernel submission polling MAINTAINERS | 8 + block.c | 22 ++ block/Makefile.objs | 3 + block/file-posix.c| 99 -- block/io_uring.c | 560 ++ block/trace-events| 13 + blockdev.c| 12 +- configure | 27 ++ include/block/aio.h | 16 +- include/block/block.h | 2 + include/block/raw-aio.h | 12 + qapi/block-core.json | 4 +- 
qemu-img-cmds.hx | 4 +- qemu-img.c| 11 +- qemu-img.texi | 5 +- qemu-io.c | 25 +- qemu-nbd.c| 12 +- qemu-nbd.texi | 4 +- stubs/Makefile.objs | 1 + stubs/io_uring.c | 32 ++ tests/qemu-iotests/028| 3 +- tests/qemu-iotests/058| 2 +- tests/qemu-iotests/089| 4 +- tests/qemu-iotests/091| 7 +- tests/qemu-iotests/109| 3 +- tests/qemu-iotests/147| 5 +- tests/qemu-iotests/181| 10 +- tests/qemu-iotests/183| 7 +- tests/qemu-iotests/185| 17 +- tests/qemu-iotests/200| 3 +- tests/qemu-iotests/201| 10 +- tests/qemu-iotests/check | 15 +- tests/qemu-iotests/common.rc | 14 + tests/qemu-iotests/iotests.py | 9 +- util/async.c | 36 +++ 35 files changed, 938 insertions(+), 79 deletions(-) create mode 100644 block/io_uring.c create mode 100644 stubs/io_uring.c -- 2.21.0
[Qemu-devel] [PATCH v9 03/17] block/block: add BDRV flag for io_uring
Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi Reviewed-by: Maxim Levitsky --- include/block/block.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/block/block.h b/include/block/block.h index 50a07c1c33..e29baa172c 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -121,6 +121,7 @@ typedef struct HDGeometry { ignoring the format layer */ #define BDRV_O_NO_IO 0x1 /* don't initialize for I/O */ #define BDRV_O_AUTO_RDONLY 0x2 /* degrade to read-only if opening read-write fails */ +#define BDRV_O_IO_URING0x4 /* use io_uring instead of the thread pool */ #define BDRV_O_CACHE_MASK (BDRV_O_NOCACHE | BDRV_O_NO_FLUSH) -- 2.21.0
[Qemu-devel] [PATCH v8 16/16] block/io_uring: adds fd registration
The fd registration API in io_uring registers a series of fds together that cannot be modified later. Thus, a hashtable maintains an index of fds registered and their index in the internally registered array. The existing fd array is unregistered and a new one submitted. Signed-off-by: Aarushi Mehta --- block/io_uring.c | 86 -- block/trace-events | 1 + 2 files changed, 85 insertions(+), 2 deletions(-) diff --git a/block/io_uring.c b/block/io_uring.c index e2bef380e7..eb8fd23822 100644 --- a/block/io_uring.c +++ b/block/io_uring.c @@ -45,10 +45,17 @@ typedef struct LuringQueue { QSIMPLEQ_HEAD(, LuringAIOCB) submit_queue; } LuringQueue; +typedef struct LuringFd { +int *fd_array; +int *fd_index; +GHashTable *fd_lookup; +} LuringFd; + typedef struct LuringState { AioContext *aio_context; struct io_uring ring; +LuringFd fd_reg; /* io queue for submit at batch. Protected by AioContext lock. */ LuringQueue io_q; @@ -305,6 +312,70 @@ static int ioq_submit(LuringState *s) } return ret; } +/** + * luring_fd_register: + * + * Register and unregisters file descriptors, see luring_fd_lookup + */ +static int luring_fd_register(struct io_uring *ring, LuringFd *fd_reg, int fd) +{ +int ret, nr; +GHashTable *lookup = fd_reg->fd_lookup; +nr = g_hash_table_size(lookup); + +/* Unregister */ +if (!fd) { +ret = io_uring_unregister_files(ring); +g_hash_table_remove_all(lookup); +return ret; +} + +/* If adding new, API requires older registrations to be removed */ +if (nr) { +io_uring_unregister_files(ring); +} + +fd_reg->fd_array = g_realloc_n(fd_reg->fd_array, nr + 1, sizeof(int)); +fd_reg->fd_array[nr] = fd; +fd_reg->fd_index = g_realloc_n(fd_reg->fd_index, nr + 1, sizeof(int)); +fd_reg->fd_index[nr] = nr; + +g_hash_table_insert(lookup, _reg->fd_array[nr], _reg->fd_index[nr]); +trace_luring_fd_register(fd, nr); +return io_uring_register_files(ring, fd_reg->fd_array, nr + 1); +} + +/** + * luring_fd_lookup: + * + * Used to lookup fd index in registered array at submission time + * If the 
lookup table has not been created or the fd is not in the table, + * the fd is registered. + * + * If registration errors, the hash is cleared and the fd used directly + * + * Unregistering is done at luring_detach_aio_context + */ +static int luring_fd_lookup(LuringState *s, int fd) +{ +int *index, ret; +if (!s->fd_reg.fd_lookup) { +s->fd_reg.fd_lookup = g_hash_table_new_full(g_int_hash, g_int_equal, +g_free, g_free); +luring_fd_register(>ring, >fd_reg, fd); +} +index = g_hash_table_lookup(s->fd_reg.fd_lookup, ); + +if (!index) { +ret = luring_fd_register(>ring, >fd_reg, fd); +if (ret < 0) { +g_hash_table_remove_all(s->fd_reg.fd_lookup); +return ret; +} +index = g_hash_table_lookup(s->fd_reg.fd_lookup, ); +} +return *index; +} void luring_io_plug(BlockDriverState *bs, LuringState *s) { @@ -357,7 +428,11 @@ static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s, __func__, type); abort(); } + io_uring_sqe_set_data(sqes, luringcb); +if (s->fd_reg.fd_array) { +io_uring_sqe_set_flags(sqes, IOSQE_FIXED_FILE); +} QSIMPLEQ_INSERT_TAIL(>io_q.submit_queue, luringcb, next); s->io_q.in_queue++; @@ -374,15 +449,21 @@ static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s, } int coroutine_fn luring_co_submit(BlockDriverState *bs, LuringState *s, int fd, uint64_t offset, QEMUIOVector *qiov, int type) { -int ret; +int ret, fd_index; LuringAIOCB luringcb = { .co = qemu_coroutine_self(), .ret= -EINPROGRESS, .qiov = qiov, .is_read= (type == QEMU_AIO_READ), }; + +fd_index = luring_fd_lookup(s, fd); +if (fd_index >= 0) { +fd = fd_index; +} + trace_luring_co_submit(bs, s, , fd, offset, qiov ? 
qiov->size : 0, type); ret = luring_do_submit(fd, &luringcb, s, offset, type); @@ -399,6 +480,7 @@ int coroutine_fn luring_co_submit(BlockDriverState *bs, LuringState *s, int fd, void luring_detach_aio_context(LuringState *s, AioContext *old_context) { +luring_fd_register(&s->ring, &s->fd_reg, 0); aio_set_fd_handler(old_context, s->ring.ring_fd, false, NULL, NULL, NULL, s); qemu_bh_delete(s->completion_bh); diff --git a/block/trace-events b/block/trace-events index 66aaf8352b..13571aa182 100644 --- a/block/trace-events +++ b/block/trace-events @@ -71,6 +71,7 @@ luring_co_submit(void *bs, void *s, void *luringcb,
[Qemu-devel] [PATCH v8 15/16] tests/qemu-iotests: use AIOMODE with various tests
Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi --- tests/qemu-iotests/028 | 3 ++- tests/qemu-iotests/058 | 2 +- tests/qemu-iotests/089 | 4 ++-- tests/qemu-iotests/091 | 7 --- tests/qemu-iotests/109 | 3 ++- tests/qemu-iotests/147 | 5 +++-- tests/qemu-iotests/181 | 10 +- tests/qemu-iotests/183 | 7 --- tests/qemu-iotests/185 | 17 - tests/qemu-iotests/200 | 3 ++- tests/qemu-iotests/201 | 10 +- 11 files changed, 42 insertions(+), 29 deletions(-) diff --git a/tests/qemu-iotests/028 b/tests/qemu-iotests/028 index 01f495912f..59e7b670ed 100755 --- a/tests/qemu-iotests/028 +++ b/tests/qemu-iotests/028 @@ -108,7 +108,8 @@ echo block-backup echo qemu_comm_method="monitor" -_launch_qemu -drive file="${TEST_IMG}",cache=${CACHEMODE},id=disk +_launch_qemu -drive file="${TEST_IMG}",cache=${CACHEMODE},aio=${AIOMODE},\ +id=disk h=$QEMU_HANDLE QEMU_COMM_TIMEOUT=1 diff --git a/tests/qemu-iotests/058 b/tests/qemu-iotests/058 index 8c3212a72f..38d1ed90c0 100755 --- a/tests/qemu-iotests/058 +++ b/tests/qemu-iotests/058 @@ -64,7 +64,7 @@ nbd_snapshot_img="nbd:unix:$nbd_unix_socket" converted_image=$TEST_IMG.converted # Use -f raw instead of -f $IMGFMT for the NBD connection -QEMU_IO_NBD="$QEMU_IO -f raw --cache=$CACHEMODE" +QEMU_IO_NBD="$QEMU_IO -f raw --cache=$CACHEMODE --aio=$AIOMODE" echo echo "== preparing image ==" diff --git a/tests/qemu-iotests/089 b/tests/qemu-iotests/089 index ad029f1f09..059ad75e28 100755 --- a/tests/qemu-iotests/089 +++ b/tests/qemu-iotests/089 @@ -64,7 +64,7 @@ $QEMU_IO -c 'write -P 42 0 512' -c 'write -P 23 512 512' \ $QEMU_IMG convert -f raw -O $IMGFMT "$TEST_IMG.base" "$TEST_IMG" -$QEMU_IO_PROG --cache $CACHEMODE \ +$QEMU_IO_PROG --cache $CACHEMODE --aio $AIOMODE \ -c 'read -P 42 0 512' -c 'read -P 23 512 512' \ -c 'read -P 66 1024 512' "json:{ \"driver\": \"$IMGFMT\", @@ -111,7 +111,7 @@ $QEMU_IO -c 'write -P 42 0x38000 512' "$TEST_IMG" | _filter_qemu_io # The "image.filename" part tests whether "a": { "b": "c" } and "a.b": "c" do # the same 
(which they should). -$QEMU_IO_PROG --cache $CACHEMODE \ +$QEMU_IO_PROG --cache $CACHEMODE --aio $AIOMODE \ -c 'read -P 42 0x38000 512' "json:{ \"driver\": \"$IMGFMT\", \"file\": { diff --git a/tests/qemu-iotests/091 b/tests/qemu-iotests/091 index d62ef18a02..78741d3fe7 100755 --- a/tests/qemu-iotests/091 +++ b/tests/qemu-iotests/091 @@ -60,14 +60,15 @@ echo === Starting QEMU VM1 === echo qemu_comm_method="monitor" -_launch_qemu -drive file="${TEST_IMG}",cache=${CACHEMODE},id=disk +_launch_qemu -drive file="${TEST_IMG}",cache=${CACHEMODE},aio=${AIOMODE},\ + id=disk h1=$QEMU_HANDLE echo echo === Starting QEMU VM2 === echo -_launch_qemu -drive file="${TEST_IMG}",cache=${CACHEMODE},id=disk \ - -incoming "exec: cat '${MIG_FIFO}'" +_launch_qemu -drive file="${TEST_IMG}",cache=${CACHEMODE},aio=${AIOMODE},\ + id=disk -incoming "exec: cat '${MIG_FIFO}'" h2=$QEMU_HANDLE echo diff --git a/tests/qemu-iotests/109 b/tests/qemu-iotests/109 index 9897ceb6cd..451709689a 100755 --- a/tests/qemu-iotests/109 +++ b/tests/qemu-iotests/109 @@ -52,7 +52,8 @@ run_qemu() local qmp_format="$3" local qmp_event="$4" -_launch_qemu -drive file="${source_img}",format=raw,cache=${CACHEMODE},id=src +_launch_qemu -drive file="${source_img}",format=raw,cache=${CACHEMODE},\ + aio=${AIOMODE},id=src _send_qemu_cmd $QEMU_HANDLE "{ 'execute': 'qmp_capabilities' }" "return" _send_qemu_cmd $QEMU_HANDLE \ diff --git a/tests/qemu-iotests/147 b/tests/qemu-iotests/147 index 2d84fddb01..632973b23c 100755 --- a/tests/qemu-iotests/147 +++ b/tests/qemu-iotests/147 @@ -24,7 +24,7 @@ import socket import stat import time import iotests -from iotests import cachemode, imgfmt, qemu_img, qemu_nbd, qemu_nbd_early_pipe +from iotests import cachemode, aiomode, imgfmt, qemu_img, qemu_nbd, qemu_nbd_early_pipe NBD_PORT_START = 32768 NBD_PORT_END= NBD_PORT_START + 1024 @@ -134,7 +134,8 @@ class BuiltinNBD(NBDBlockdevAddBase): self.server.add_drive_raw('if=none,id=nbd-export,' + 'file=%s,' % test_img + 'format=%s,' % imgfmt + 
- 'cache=%s' % cachemode) + 'cache=%s' % cachemode + + 'aio=%s' % aiomode) self.server.launch() def tearDown(self): diff --git a/tests/qemu-iotests/181 b/tests/qemu-iotests/181 index e317e63422..547c1b47
[Qemu-devel] [PATCH v8 08/16] block/file-posix.c: extend to use io_uring
Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi Reviewed-by: Maxim Levitsky --- block/file-posix.c | 99 -- 1 file changed, 79 insertions(+), 20 deletions(-) diff --git a/block/file-posix.c b/block/file-posix.c index 4479cc7ab4..4aa42f826f 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -156,6 +156,7 @@ typedef struct BDRVRawState { bool has_write_zeroes:1; bool discard_zeroes:1; bool use_linux_aio:1; +bool use_linux_io_uring:1; bool page_cache_inconsistent:1; bool has_fallocate; bool needs_alignment; @@ -425,7 +426,7 @@ static QemuOptsList raw_runtime_opts = { { .name = "aio", .type = QEMU_OPT_STRING, -.help = "host AIO implementation (threads, native)", +.help = "host AIO implementation (threads, native, io_uring)", }, { .name = "locking", @@ -484,9 +485,15 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, goto fail; } -aio_default = (bdrv_flags & BDRV_O_NATIVE_AIO) - ? BLOCKDEV_AIO_OPTIONS_NATIVE - : BLOCKDEV_AIO_OPTIONS_THREADS; +if (bdrv_flags & BDRV_O_NATIVE_AIO) { +aio_default = BLOCKDEV_AIO_OPTIONS_NATIVE; +#ifdef CONFIG_LINUX_IO_URING +} else if (bdrv_flags & BDRV_O_IO_URING) { +aio_default = BLOCKDEV_AIO_OPTIONS_IO_URING; +#endif +} else { +aio_default = BLOCKDEV_AIO_OPTIONS_THREADS; +} aio = qapi_enum_parse(_lookup, qemu_opt_get(opts, "aio"), aio_default, _err); @@ -495,7 +502,11 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, ret = -EINVAL; goto fail; } + s->use_linux_aio = (aio == BLOCKDEV_AIO_OPTIONS_NATIVE); +#ifdef CONFIG_LINUX_IO_URING +s->use_linux_io_uring = (aio == BLOCKDEV_AIO_OPTIONS_IO_URING); +#endif locking = qapi_enum_parse(_lookup, qemu_opt_get(opts, "locking"), @@ -559,7 +570,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, s->shared_perm = BLK_PERM_ALL; #ifdef CONFIG_LINUX_AIO - /* Currently Linux does AIO only for files opened with O_DIRECT */ +/* Currently Linux does AIO only for files opened with O_DIRECT */ if (s->use_linux_aio) { if (!(s->open_flags & 
O_DIRECT)) { error_setg(errp, "aio=native was specified, but it requires " @@ -581,6 +592,22 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, } #endif /* !defined(CONFIG_LINUX_AIO) */ +#ifdef CONFIG_LINUX_IO_URING +if (s->use_linux_io_uring) { +if (!aio_setup_linux_io_uring(bdrv_get_aio_context(bs), errp)) { +error_prepend(errp, "Unable to use io_uring: "); +goto fail; +} +} +#else +if (s->use_linux_io_uring) { +error_setg(errp, "aio=io_uring was specified, but is not supported " + "in this build."); +ret = -EINVAL; +goto fail; +} +#endif /* !defined(CONFIG_LINUX_IO_URING) */ + s->has_discard = true; s->has_write_zeroes = true; if ((bs->open_flags & BDRV_O_NOCACHE) != 0) { @@ -1874,21 +1901,25 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset, return -EIO; /* - * Check if the underlying device requires requests to be aligned, - * and if the request we are trying to submit is aligned or not. - * If this is the case tell the low-level driver that it needs - * to copy the buffer. + * When using O_DIRECT, the request must be aligned to be able to use + * either libaio or io_uring interface. If not fail back to regular thread + * pool read/write code which emulates this for us if we + * set QEMU_AIO_MISALIGNED. 
*/ -if (s->needs_alignment) { -if (!bdrv_qiov_is_aligned(bs, qiov)) { -type |= QEMU_AIO_MISALIGNED; +if (s->needs_alignment && !bdrv_qiov_is_aligned(bs, qiov)) { +type |= QEMU_AIO_MISALIGNED; +#ifdef CONFIG_LINUX_IO_URING +} else if (s->use_linux_io_uring) { +LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs)); +assert(qiov->size == bytes); +return luring_co_submit(bs, aio, s->fd, offset, qiov, type); +#endif #ifdef CONFIG_LINUX_AIO -} else if (s->use_linux_aio) { -LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs)); -assert(qiov->size == bytes); -return laio_co_submit(bs, aio, s->fd, offset, qiov, type); +} else if (s->use_linux_aio) { +LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs)); +assert(qiov->size == bytes); +
[Qemu-devel] [PATCH v8 14/16] tests/qemu-iotests: enable testing with aio options
Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi --- tests/qemu-iotests/check | 15 ++- tests/qemu-iotests/common.rc | 14 ++ tests/qemu-iotests/iotests.py | 9 - 3 files changed, 36 insertions(+), 2 deletions(-) diff --git a/tests/qemu-iotests/check b/tests/qemu-iotests/check index c24874ff4a..1e398923fd 100755 --- a/tests/qemu-iotests/check +++ b/tests/qemu-iotests/check @@ -132,6 +132,7 @@ sortme=false expunge=true have_test_arg=false cachemode=false +aiomode=false tmp="${TEST_DIR}"/$$ rm -f $tmp.list $tmp.tmp $tmp.sed @@ -141,6 +142,7 @@ export IMGFMT_GENERIC=true export IMGPROTO=file export IMGOPTS="" export CACHEMODE="writeback" +export AIOMODE="threads" export QEMU_IO_OPTIONS="" export QEMU_IO_OPTIONS_NO_FMT="" export CACHEMODE_IS_DEFAULT=true @@ -225,6 +227,11 @@ s/ .*//p CACHEMODE_IS_DEFAULT=false cachemode=false continue +elif $aiomode +then +AIOMODE="$r" +aiomode=false +continue fi xpand=true @@ -269,6 +276,7 @@ other options -n show me, do not run tests -o options -o options to pass to qemu-img create/convert -c mode cache mode +-i mode AIO mode -makecheck pretty print output for make check testlist options @@ -433,10 +441,13 @@ testlist options cachemode=true xpand=false ;; +-i) +aiomode=true +xpand=false +;; -T)# deprecated timestamp option xpand=false ;; - -v) verbose=true xpand=false @@ -515,6 +526,8 @@ done # Set qemu-io cache mode with $CACHEMODE we have QEMU_IO_OPTIONS="$QEMU_IO_OPTIONS --cache $CACHEMODE" +# Set qemu-io aio mode with $AIOMODE we have +QEMU_IO_OPTIONS="$QEMU_IO_OPTIONS --aio $AIOMODE" QEMU_IO_OPTIONS_NO_FMT="$QEMU_IO_OPTIONS" if [ "$IMGOPTSSYNTAX" != "true" ]; then diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc index 5502c3da2f..03f4a1cd7f 100644 --- a/tests/qemu-iotests/common.rc +++ b/tests/qemu-iotests/common.rc @@ -490,6 +490,20 @@ _default_cache_mode() return fi } +_supported_aio_modes() +{ +for mode; do +if [ "$mode" = "$AIOMODE" ]; then +return +fi +done +_notrun "not suitable for aio mode: 
$AIOMODE" +} +_default_aio_mode() +{ +AIOMODE="$1" +QEMU_IO="$QEMU_IO --aio $1" +} _unsupported_imgopts() { diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py index ce74177ab1..76f1ab0945 100644 --- a/tests/qemu-iotests/iotests.py +++ b/tests/qemu-iotests/iotests.py @@ -58,6 +58,7 @@ imgproto = os.environ.get('IMGPROTO', 'file') test_dir = os.environ.get('TEST_DIR') output_dir = os.environ.get('OUTPUT_DIR', '.') cachemode = os.environ.get('CACHEMODE') +aiomode = os.environ.get('AIOMODE') qemu_default_machine = os.environ.get('QEMU_DEFAULT_MACHINE') socket_scm_helper = os.environ.get('SOCKET_SCM_HELPER', 'socket_scm_helper') @@ -457,6 +458,7 @@ class VM(qtest.QEMUQtestMachine): options.append('file=%s' % path) options.append('format=%s' % format) options.append('cache=%s' % cachemode) +options.append('aio=%s' % aiomode) if opts: options.append(opts) @@ -799,6 +801,10 @@ def verify_cache_mode(supported_cache_modes=[]): if supported_cache_modes and (cachemode not in supported_cache_modes): notrun('not suitable for this cache mode: %s' % cachemode) +def verify_aio_mode(supported_aio_modes=[]): +if supported_aio_modes and (aiomode not in supported_aio_modes): +notrun('not suitable for this aio mode: %s' % aiomode) + def supports_quorum(): return 'quorum' in qemu_img_pipe('--help') @@ -843,7 +849,7 @@ def skip_if_unsupported(required_formats=[], read_only=False): return skip_test_decorator def main(supported_fmts=[], supported_oses=['linux'], supported_cache_modes=[], - unsupported_fmts=[]): +supported_aio_modes=[], unsupported_fmts=[]): '''Run tests''' global debug @@ -861,6 +867,7 @@ def main(supported_fmts=[], supported_oses=['linux'], supported_cache_modes=[], verify_image_format(supported_fmts, unsupported_fmts) verify_platform(supported_oses) verify_cache_mode(supported_cache_modes) +verify_aio_mode(supported_aio_modes) if debug: output = sys.stdout -- 2.21.0
[Qemu-devel] [PATCH v8 13/16] qemu-nbd: adds option for aio engines
Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi Acked-by: Eric Blake --- qemu-nbd.c| 12 qemu-nbd.texi | 4 ++-- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/qemu-nbd.c b/qemu-nbd.c index a8cb39e510..7bb479f3c0 100644 --- a/qemu-nbd.c +++ b/qemu-nbd.c @@ -135,7 +135,7 @@ static void usage(const char *name) "'[ID_OR_NAME]'\n" " -n, --nocache disable host cache\n" " --cache=MODE set cache mode (none, writeback, ...)\n" -" --aio=MODEset AIO mode (native or threads)\n" +" --aio=MODEset AIO mode (native, io_uring or threads)\n" " --discard=MODEset discard mode (ignore, unmap)\n" " --detect-zeroes=MODE set detect-zeroes mode (off, on, unmap)\n" " --image-opts treat FILE as a full set of image options\n" @@ -718,13 +718,9 @@ int main(int argc, char **argv) exit(EXIT_FAILURE); } seen_aio = true; -if (!strcmp(optarg, "native")) { -flags |= BDRV_O_NATIVE_AIO; -} else if (!strcmp(optarg, "threads")) { -/* this is the default */ -} else { - error_report("invalid aio mode `%s'", optarg); - exit(EXIT_FAILURE); +if (bdrv_parse_aio(optarg, &flags) < 0) { +error_report("Invalid aio mode '%s'", optarg); +exit(EXIT_FAILURE); } break; case QEMU_NBD_OPT_DISCARD: diff --git a/qemu-nbd.texi b/qemu-nbd.texi index 7f55657722..3ee3e4bdee 100644 --- a/qemu-nbd.texi +++ b/qemu-nbd.texi @@ -77,8 +77,8 @@ as an read-only device, @var{snapshot_param} format is The cache mode to be used with the file. See the documentation of the emulator's @code{-drive cache=...} option for allowed values. @item --aio=@var{aio} -Set the asynchronous I/O mode between @samp{threads} (the default) -and @samp{native} (Linux only). +Set the asynchronous I/O mode between @samp{threads} (the default), +@samp{native} (Linux only) and @samp{io_uring} (Linux 5.1+). @item --discard=@var{discard} Control whether @dfn{discard} (also known as @dfn{trim} or @dfn{unmap}) requests are ignored or passed to the filesystem. @var{discard} is one of -- 2.21.0
[Qemu-devel] [PATCH v8 06/16] util/async: add aio interfaces for io_uring
Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi --- util/async.c | 36 1 file changed, 36 insertions(+) diff --git a/util/async.c b/util/async.c index 8d2105729c..2e0a5e20ac 100644 --- a/util/async.c +++ b/util/async.c @@ -276,6 +276,14 @@ aio_ctx_finalize(GSource *source) } #endif +#ifdef CONFIG_LINUX_IO_URING +if (ctx->linux_io_uring) { +luring_detach_aio_context(ctx->linux_io_uring, ctx); +luring_cleanup(ctx->linux_io_uring); +ctx->linux_io_uring = NULL; +} +#endif + assert(QSLIST_EMPTY(&ctx->scheduled_coroutines)); qemu_bh_delete(ctx->co_schedule_bh); @@ -340,6 +348,29 @@ LinuxAioState *aio_get_linux_aio(AioContext *ctx) } #endif +#ifdef CONFIG_LINUX_IO_URING +LuringState *aio_setup_linux_io_uring(AioContext *ctx, Error **errp) +{ +if (ctx->linux_io_uring) { +return ctx->linux_io_uring; +} + +ctx->linux_io_uring = luring_init(errp); +if (!ctx->linux_io_uring) { +return NULL; +} + +luring_attach_aio_context(ctx->linux_io_uring, ctx); +return ctx->linux_io_uring; +} + +LuringState *aio_get_linux_io_uring(AioContext *ctx) +{ +assert(ctx->linux_io_uring); +return ctx->linux_io_uring; +} +#endif + void aio_notify(AioContext *ctx) { /* Write e.g. bh->scheduled before reading ctx->notify_me. Pairs @@ -431,6 +462,11 @@ AioContext *aio_context_new(Error **errp) #ifdef CONFIG_LINUX_AIO ctx->linux_aio = NULL; #endif + +#ifdef CONFIG_LINUX_IO_URING +ctx->linux_io_uring = NULL; +#endif + ctx->thread_pool = NULL; qemu_rec_mutex_init(&ctx->lock); timerlistgroup_init(&ctx->tlg, aio_timerlist_notify, ctx); -- 2.21.0
[Qemu-devel] [PATCH v8 05/16] stubs: add stubs for io_uring interface
Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi --- MAINTAINERS | 1 + stubs/Makefile.objs | 1 + stubs/io_uring.c| 32 3 files changed, 34 insertions(+) create mode 100644 stubs/io_uring.c diff --git a/MAINTAINERS b/MAINTAINERS index be688fcd5e..bca5de7d6c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2536,6 +2536,7 @@ R: Stefan Hajnoczi L: qemu-bl...@nongnu.org S: Maintained F: block/io_uring.c +F: stubs/io_uring.c qcow2 M: Kevin Wolf diff --git a/stubs/Makefile.objs b/stubs/Makefile.objs index 9c7393b08c..5cf160a9c8 100644 --- a/stubs/Makefile.objs +++ b/stubs/Makefile.objs @@ -13,6 +13,7 @@ stub-obj-y += iothread.o stub-obj-y += iothread-lock.o stub-obj-y += is-daemonized.o stub-obj-$(CONFIG_LINUX_AIO) += linux-aio.o +stub-obj-$(CONFIG_LINUX_IO_URING) += io_uring.o stub-obj-y += machine-init-done.o stub-obj-y += migr-blocker.o stub-obj-y += change-state-handler.o diff --git a/stubs/io_uring.c b/stubs/io_uring.c new file mode 100644 index 00..622d1e4648 --- /dev/null +++ b/stubs/io_uring.c @@ -0,0 +1,32 @@ +/* + * Linux io_uring support. + * + * Copyright (C) 2009 IBM, Corp. + * Copyright (C) 2009 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ +#include "qemu/osdep.h" +#include "block/aio.h" +#include "block/raw-aio.h" + +void luring_detach_aio_context(LuringState *s, AioContext *old_context) +{ +abort(); +} + +void luring_attach_aio_context(LuringState *s, AioContext *new_context) +{ +abort(); +} + +LuringState *luring_init(Error **errp) +{ +abort(); +} + +void luring_cleanup(LuringState *s) +{ +abort(); +} -- 2.21.0
[Qemu-devel] [PATCH v8 11/16] qemu-io: adds option to use aio engine
Signed-off-by: Aarushi Mehta --- qemu-io.c | 25 + 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/qemu-io.c b/qemu-io.c index f64eca6940..0abb4af134 100644 --- a/qemu-io.c +++ b/qemu-io.c @@ -130,7 +130,8 @@ static void open_help(void) " -C, -- use copy-on-read\n" " -n, -- disable host cache, short for -t none\n" " -U, -- force shared permissions\n" -" -k, -- use kernel AIO implementation (on Linux only)\n" +" -k, -- use kernel AIO implementation (Linux only, prefer use of -i)\n" +" -i, -- use AIO mode (threads, native or io_uring)\n" " -t, -- use the given cache mode for the image\n" " -d, -- use the given discard mode for the image\n" " -o, -- options to be given to the block driver" @@ -172,7 +173,7 @@ static int open_f(BlockBackend *blk, int argc, char **argv) QDict *opts; bool force_share = false; -while ((c = getopt(argc, argv, "snCro:kt:d:U")) != -1) { +while ((c = getopt(argc, argv, "snCro:ki:t:d:U")) != -1) { switch (c) { case 's': flags |= BDRV_O_SNAPSHOT; @@ -204,6 +205,13 @@ static int open_f(BlockBackend *blk, int argc, char **argv) return -EINVAL; } break; +case 'i': +if (bdrv_parse_aio(optarg, ) < 0) { +error_report("Invalid aio option: %s", optarg); +qemu_opts_reset(_opts); +return -EINVAL; +} +break; case 'o': if (imageOpts) { printf("--image-opts and 'open -o' are mutually exclusive\n"); @@ -291,7 +299,9 @@ static void usage(const char *name) " -n, --nocachedisable host cache, short for -t none\n" " -C, --copy-on-read enable copy-on-read\n" " -m, --misalign misalign allocations for O_DIRECT\n" -" -k, --native-aio use kernel AIO implementation (on Linux only)\n" +" -k, --native-aio use kernel AIO implementation\n" +" (Linux only, prefer use of -i)\n" +" -i, --aio=MODE use AIO mode (threads, native or io_uring)\n" " -t, --cache=MODE use the given cache mode for the image\n" " -d, --discard=MODE use the given discard mode for the image\n" " -T, --trace [[enable=]][,events=][,file=]\n" @@ -489,7 +499,7 @@ static QemuOptsList file_opts = 
{ int main(int argc, char **argv) { int readonly = 0; -const char *sopt = "hVc:d:f:rsnCmkt:T:U"; +const char *sopt = "hVc:d:f:rsnCmki:t:T:U"; const struct option lopt[] = { { "help", no_argument, NULL, 'h' }, { "version", no_argument, NULL, 'V' }, @@ -501,6 +511,7 @@ int main(int argc, char **argv) { "copy-on-read", no_argument, NULL, 'C' }, { "misalign", no_argument, NULL, 'm' }, { "native-aio", no_argument, NULL, 'k' }, +{ "aio", required_argument, NULL, 'i' }, { "discard", required_argument, NULL, 'd' }, { "cache", required_argument, NULL, 't' }, { "trace", required_argument, NULL, 'T' }, @@ -568,6 +579,12 @@ int main(int argc, char **argv) case 'k': flags |= BDRV_O_NATIVE_AIO; break; +case 'i': +if (bdrv_parse_aio(optarg, ) < 0) { +error_report("Invalid aio option: %s", optarg); +exit(1); +} +break; case 't': if (bdrv_parse_cache_mode(optarg, , ) < 0) { error_report("Invalid cache option: %s", optarg); -- 2.21.0
[Qemu-devel] [PATCH v8 12/16] qemu-img: adds option to use aio engine for benchmarking
Signed-off-by: Aarushi Mehta --- qemu-img-cmds.hx | 4 ++-- qemu-img.c | 11 ++- qemu-img.texi| 5 - 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx index 1c93e6d185..77b5a8dda8 100644 --- a/qemu-img-cmds.hx +++ b/qemu-img-cmds.hx @@ -20,9 +20,9 @@ STEXI ETEXI DEF("bench", img_bench, -"bench [-c count] [-d depth] [-f fmt] [--flush-interval=flush_interval] [-n] [--no-drain] [-o offset] [--pattern=pattern] [-q] [-s buffer_size] [-S step_size] [-t cache] [-w] [-U] filename") +"bench [-c count] [-d depth] [-f fmt] [--flush-interval=flush_interval] [-n] [--no-drain] [-o offset] [--pattern=pattern] [-q] [-s buffer_size] [-S step_size] [-t cache] [-i aio] [-w] [-U] filename") STEXI -@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [--flush-interval=@var{flush_interval}] [-n] [--no-drain] [-o @var{offset}] [--pattern=@var{pattern}] [-q] [-s @var{buffer_size}] [-S @var{step_size}] [-t @var{cache}] [-w] [-U] @var{filename} +@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [--flush-interval=@var{flush_interval}] [-n] [--no-drain] [-o @var{offset}] [--pattern=@var{pattern}] [-q] [-s @var{buffer_size}] [-S @var{step_size}] [-t @var{cache}] [-i @var{aio}] [-w] [-U] @var{filename} ETEXI DEF("check", img_check, diff --git a/qemu-img.c b/qemu-img.c index 79983772de..27ac33f7d7 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -4192,7 +4192,8 @@ static int img_bench(int argc, char **argv) {"force-share", no_argument, 0, 'U'}, {0, 0, 0, 0} }; -c = getopt_long(argc, argv, ":hc:d:f:no:qs:S:t:wU", long_options, NULL); +c = getopt_long(argc, argv, ":hc:d:f:ni:o:qs:S:t:wU", long_options, +NULL); if (c == -1) { break; } @@ -4235,6 +4236,14 @@ static int img_bench(int argc, char **argv) case 'n': flags |= BDRV_O_NATIVE_AIO; break; +case 'i': +ret = bdrv_parse_aio(optarg, ); +if (ret < 0) { +error_report("Invalid aio option: %s", optarg); +ret = -1; +goto out; +} +break; case 'o': { offset = cvtnum(optarg); diff --git 
a/qemu-img.texi b/qemu-img.texi index c8e9bba515..0a2eccea85 100644 --- a/qemu-img.texi +++ b/qemu-img.texi @@ -206,7 +206,7 @@ Command description: Amends the image format specific @var{options} for the image file @var{filename}. Not all file formats support this operation. -@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [--flush-interval=@var{flush_interval}] [-n] [--no-drain] [-o @var{offset}] [--pattern=@var{pattern}] [-q] [-s @var{buffer_size}] [-S @var{step_size}] [-t @var{cache}] [-w] [-U] @var{filename} +@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [--flush-interval=@var{flush_interval}] [-n] [-i @var{aio}][--no-drain] [-o @var{offset}] [--pattern=@var{pattern}] [-q] [-s @var{buffer_size}] [-S @var{step_size}] [-t @var{cache}] [-w] [-U] @var{filename} Run a simple sequential I/O benchmark on the specified image. If @code{-w} is specified, a write test is performed, otherwise a read test is performed. @@ -227,6 +227,9 @@ If @code{-n} is specified, the native AIO backend is used if possible. On Linux, this option only works if @code{-t none} or @code{-t directsync} is specified as well. +If @code{-i} is specified, aio option can be used to specify different AIO +backends: @var{threads}, @var{native} or @var{io_uring}. + For write tests, by default a buffer filled with zeros is written. This can be overridden with a pattern byte specified by @var{pattern}. -- 2.21.0
[Qemu-devel] [PATCH v8 03/16] block/block: add BDRV flag for io_uring
Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi Reviewed-by: Maxim Levitsky --- include/block/block.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/block/block.h b/include/block/block.h index 50a07c1c33..e29baa172c 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -121,6 +121,7 @@ typedef struct HDGeometry { ignoring the format layer */ #define BDRV_O_NO_IO 0x1 /* don't initialize for I/O */ #define BDRV_O_AUTO_RDONLY 0x2 /* degrade to read-only if opening read-write fails */ +#define BDRV_O_IO_URING0x4 /* use io_uring instead of the thread pool */ #define BDRV_O_CACHE_MASK (BDRV_O_NOCACHE | BDRV_O_NO_FLUSH) -- 2.21.0
[Qemu-devel] [PATCH v8 10/16] block/io_uring: adds userspace completion polling
Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi --- block/io_uring.c | 17 - 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/block/io_uring.c b/block/io_uring.c index ba739ebe06..e2bef380e7 100644 --- a/block/io_uring.c +++ b/block/io_uring.c @@ -238,6 +238,21 @@ static void qemu_luring_completion_cb(void *opaque) luring_process_completions_and_submit(s); } +static bool qemu_luring_poll_cb(void *opaque) +{ +LuringState *s = opaque; +struct io_uring_cqe *cqes; + +if (io_uring_peek_cqe(&s->ring, &cqes) == 0) { +if (cqes) { +luring_process_completions_and_submit(s); +return true; +} +} + +return false; +} + static void ioq_init(LuringQueue *io_q) { QSIMPLEQ_INIT(&io_q->submit_queue); @@ -395,7 +410,7 @@ void luring_attach_aio_context(LuringState *s, AioContext *new_context) s->aio_context = new_context; s->completion_bh = aio_bh_new(new_context, qemu_luring_completion_bh, s); aio_set_fd_handler(s->aio_context, s->ring.ring_fd, false, - qemu_luring_completion_cb, NULL, NULL, s); + qemu_luring_completion_cb, NULL, qemu_luring_poll_cb, s); } LuringState *luring_init(Error **errp) -- 2.21.0
[Qemu-devel] [PATCH v8 02/16] qapi/block-core: add option for io_uring
Only enumerates option for devices that support it. Since QAPI schema supports io_uring, which is the actual name of the Linux API, it is preferred over io-uring. Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi --- qapi/block-core.json | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/qapi/block-core.json b/qapi/block-core.json index 0d43d4f37c..3dc93b483f 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -2792,11 +2792,13 @@ # # @threads: Use qemu's thread pool # @native: Use native AIO backend (only Linux and Windows) +# @io_uring:Use linux io_uring (since 4.2) # # Since: 2.9 ## { 'enum': 'BlockdevAioOptions', - 'data': [ 'threads', 'native' ] } + 'data': [ 'threads', 'native', +{ 'name': 'io_uring', 'if': 'defined(CONFIG_LINUX_IO_URING)' } ] } ## # @BlockdevCacheOptions: -- 2.21.0
[Qemu-devel] [PATCH v8 09/16] block: add trace events for io_uring
Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi --- block/io_uring.c | 22 +++--- block/trace-events | 12 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/block/io_uring.c b/block/io_uring.c index b1f14d2461..ba739ebe06 100644 --- a/block/io_uring.c +++ b/block/io_uring.c @@ -17,6 +17,7 @@ #include "block/raw-aio.h" #include "qemu/coroutine.h" #include "qapi/error.h" +#include "trace.h" #define MAX_EVENTS 128 @@ -93,6 +94,8 @@ static void luring_resubmit_short_read(LuringState *s, LuringAIOCB *luringcb, QEMUIOVector *resubmit_qiov; size_t remaining; +trace_luring_resubmit_short_read(s, luringcb, nread); + /* Update read position */ luringcb->total_read = nread; remaining = luringcb->qiov->size - luringcb->total_read; @@ -164,6 +167,7 @@ static void luring_process_completions(LuringState *s) /* Change counters one-by-one because we can be nested. */ s->io_q.in_flight--; +trace_luring_process_completion(s, luringcb, ret); /* total_read is non-zero only for resubmitted read requests */ total_bytes = ret + luringcb->total_read; @@ -264,6 +268,7 @@ static int ioq_submit(LuringState *s) QSIMPLEQ_REMOVE_HEAD(>io_q.submit_queue, next); } ret = io_uring_submit(>ring); +trace_luring_io_uring_submit(s, ret); /* Prevent infinite loop if submission is refused */ if (ret <= 0) { if (ret == -EAGAIN) { @@ -288,12 +293,15 @@ static int ioq_submit(LuringState *s) void luring_io_plug(BlockDriverState *bs, LuringState *s) { +trace_luring_io_plug(s); s->io_q.plugged++; } void luring_io_unplug(BlockDriverState *bs, LuringState *s) { assert(s->io_q.plugged); +trace_luring_io_unplug(s, s->io_q.blocked, s->io_q.plugged, + s->io_q.in_queue, s->io_q.in_flight); if (--s->io_q.plugged == 0 && !s->io_q.blocked && s->io_q.in_queue > 0) { ioq_submit(s); @@ -314,6 +322,7 @@ void luring_io_unplug(BlockDriverState *bs, LuringState *s) static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s, uint64_t offset, int type) { +int ret; struct io_uring_sqe *sqes = 
>sqeq; switch (type) { @@ -337,11 +346,14 @@ static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s, QSIMPLEQ_INSERT_TAIL(>io_q.submit_queue, luringcb, next); s->io_q.in_queue++; - +trace_luring_do_submit(s, s->io_q.blocked, s->io_q.plugged, + s->io_q.in_queue, s->io_q.in_flight); if (!s->io_q.blocked && (!s->io_q.plugged || s->io_q.in_flight + s->io_q.in_queue >= MAX_EVENTS)) { -return ioq_submit(s); +ret = ioq_submit(s); +trace_luring_do_submit_done(s, ret); +return ret; } return 0; } @@ -356,8 +368,10 @@ int coroutine_fn luring_co_submit(BlockDriverState *bs, LuringState *s, int fd, .qiov = qiov, .is_read= (type == QEMU_AIO_READ), }; - +trace_luring_co_submit(bs, s, , fd, offset, qiov ? qiov->size : 0, + type); ret = luring_do_submit(fd, , s, offset, type); + if (ret < 0) { return ret; } @@ -389,6 +403,7 @@ LuringState *luring_init(Error **errp) int rc; LuringState *s; s = g_new0(LuringState, 1); +trace_luring_init_state(s, sizeof(*s)); struct io_uring *ring = >ring; rc = io_uring_queue_init(MAX_EVENTS, ring, 0); if (rc < 0) { @@ -406,4 +421,5 @@ void luring_cleanup(LuringState *s) { io_uring_queue_exit(>ring); g_free(s); +trace_luring_cleanup_state(s); } diff --git a/block/trace-events b/block/trace-events index d724df0117..66aaf8352b 100644 --- a/block/trace-events +++ b/block/trace-events @@ -60,6 +60,18 @@ qmp_block_stream(void *bs) "bs %p" file_paio_submit(void *acb, void *opaque, int64_t offset, int count, int type) "acb %p opaque %p offset %"PRId64" count %d type %d" file_copy_file_range(void *bs, int src, int64_t src_off, int dst, int64_t dst_off, int64_t bytes, int flags, int64_t ret) "bs %p src_fd %d offset %"PRIu64" dst_fd %d offset %"PRIu64" bytes %"PRIu64" flags %d ret %"PRId64 +#io_uring.c +luring_init_state(void *s, size_t size) "s %p size %zu" +luring_cleanup_state(void *s) "%p freed" +luring_io_plug(void *s) "LuringState %p plug" +luring_io_unplug(void *s, int blocked, int plugged, int queued, int inflight) "LuringState %p 
blocked %d plugged %d queued %d inflight %d" +luring_do_submit(void *s, int blocked, int plugged, int queued, int inflight) "LuringState %p blo
[Qemu-devel] [PATCH v8 01/16] configure: permit use of io_uring
Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi Reviewed-by: Maxim Levitsky --- configure | 27 +++ 1 file changed, 27 insertions(+) diff --git a/configure b/configure index 714e7fb6a1..493dbc2ec5 100755 --- a/configure +++ b/configure @@ -371,6 +371,7 @@ xen="" xen_ctrl_version="" xen_pci_passthrough="" linux_aio="" +linux_io_uring="" cap_ng="" attr="" libattr="" @@ -1272,6 +1273,10 @@ for opt do ;; --enable-linux-aio) linux_aio="yes" ;; + --disable-linux-io-uring) linux_io_uring="no" + ;; + --enable-linux-io-uring) linux_io_uring="yes" + ;; --disable-attr) attr="no" ;; --enable-attr) attr="yes" @@ -1790,6 +1795,7 @@ disabled with --disable-FEATURE, default is enabled if available: vde support for vde network netmap support for netmap network linux-aio Linux AIO support + linux-io-uring Linux io_uring support cap-ng libcap-ng support attrattr and xattr support vhost-net vhost-net kernel acceleration support @@ -3975,6 +3981,21 @@ EOF linux_aio=no fi fi +## +# linux-io-uring probe + +if test "$linux_io_uring" != "no" ; then + if $pkg_config liburing; then +linux_io_uring_cflags=$($pkg_config --cflags liburing) +linux_io_uring_libs=$($pkg_config --libs liburing) +linux_io_uring=yes + else +if test "$linux_io_uring" = "yes" ; then + feature_not_found "linux io_uring" "Install liburing devel" +fi +linux_io_uring=no + fi +fi ## # TPM emulation is only on POSIX @@ -6398,6 +6419,7 @@ echo "PIE $pie" echo "vde support $vde" echo "netmap support$netmap" echo "Linux AIO support $linux_aio" +echo "Linux io_uring support $linux_io_uring" echo "ATTR/XATTR support $attr" echo "Install blobs $blobs" echo "KVM support $kvm" @@ -6885,6 +6907,11 @@ fi if test "$linux_aio" = "yes" ; then echo "CONFIG_LINUX_AIO=y" >> $config_host_mak fi +if test "$linux_io_uring" = "yes" ; then + echo "CONFIG_LINUX_IO_URING=y" >> $config_host_mak + echo "LINUX_IO_URING_CFLAGS=$linux_io_uring_cflags" >> $config_host_mak + echo "LINUX_IO_URING_LIBS=$linux_io_uring_libs" >> $config_host_mak +fi 
if test "$attr" = "yes" ; then echo "CONFIG_ATTR=y" >> $config_host_mak fi -- 2.21.0
[Qemu-devel] [PATCH v8 00/16] Add support for io_uring
This patch series adds support for the newly developed io_uring Linux AIO interface. Linux io_uring is faster than Linux's AIO asynchronous I/O code, offers efficient buffered asynchronous I/O support, the ability to do I/O without performing a system call via polled I/O, and other efficiency enhancements. Testing it requires a host kernel (5.1+) and the liburing library. Use the option -drive aio=io_uring to enable it. Benchmarks for the system are at https://github.com/rooshm/benchmarks. io_uring has similar performance to libaio but supports cache=writeback. Further performance enhancements will be implemented. There is currently an -EIO output when guests are booted from io_uring on ext4; the error has been reported upstream: https://lore.kernel.org/linux-block/20190723080701.GA3198@stefanha-x1.localdomain/ v8: - adds fd_registration v7: - completes io-tests options - misc fixes v6: - add slow path for short-read - hooks up fsync - enables qemu-iotests with aio options - adds bdrv_parse_aio v5: - Adds completion polling - Extends qemu-io - Adds qemu-iotest v4: - Add error handling - Add trace events - Remove aio submission based code Aarushi Mehta (16): configure: permit use of io_uring qapi/block-core: add option for io_uring block/block: add BDRV flag for io_uring block/io_uring: implements interfaces for io_uring stubs: add stubs for io_uring interface util/async: add aio interfaces for io_uring blockdev: adds bdrv_parse_aio to use io_uring block/file-posix.c: extend to use io_uring block: add trace events for io_uring block/io_uring: adds userspace completion polling qemu-io: adds option to use aio engine qemu-img: adds option to use aio engine for benchmarking qemu-nbd: adds option for aio engines tests/qemu-iotests: enable testing with aio options tests/qemu-iotests: use AIOMODE with various tests block/io_uring: adds fd registration MAINTAINERS | 8 + block.c | 22 ++ block/Makefile.objs | 3 + block/file-posix.c| 99 +-- block/io_uring.c | 522 ++ block/trace-events| 13 
+ blockdev.c| 12 +- configure | 27 ++ include/block/aio.h | 16 +- include/block/block.h | 2 + include/block/raw-aio.h | 12 + qapi/block-core.json | 4 +- qemu-img-cmds.hx | 2 +- qemu-img.c| 11 +- qemu-img.texi | 5 +- qemu-io.c | 25 +- qemu-nbd.c| 12 +- qemu-nbd.texi | 4 +- stubs/Makefile.objs | 1 + stubs/io_uring.c | 32 +++ tests/qemu-iotests/028| 3 +- tests/qemu-iotests/058| 2 +- tests/qemu-iotests/089| 4 +- tests/qemu-iotests/091| 7 +- tests/qemu-iotests/109| 3 +- tests/qemu-iotests/147| 5 +- tests/qemu-iotests/181| 10 +- tests/qemu-iotests/183| 7 +- tests/qemu-iotests/185| 17 +- tests/qemu-iotests/200| 3 +- tests/qemu-iotests/201| 10 +- tests/qemu-iotests/check | 15 +- tests/qemu-iotests/common.rc | 14 + tests/qemu-iotests/iotests.py | 9 +- util/async.c | 36 +++ 35 files changed, 899 insertions(+), 78 deletions(-) create mode 100644 block/io_uring.c create mode 100644 stubs/io_uring.c -- 2.21.0
[Qemu-devel] [PATCH v8 07/16] blockdev: adds bdrv_parse_aio to use io_uring
Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi --- block.c | 22 ++ blockdev.c| 12 include/block/block.h | 1 + 3 files changed, 27 insertions(+), 8 deletions(-) diff --git a/block.c b/block.c index cbd8da5f3b..401831e28d 100644 --- a/block.c +++ b/block.c @@ -844,6 +844,28 @@ static BlockdevDetectZeroesOptions bdrv_parse_detect_zeroes(QemuOpts *opts, return detect_zeroes; } +/** + * Set open flags for aio engine + * + * Return 0 on success, -1 if the engine specified is invalid + */ +int bdrv_parse_aio(const char *mode, int *flags) +{ +if (!strcmp(mode, "threads")) { +/* do nothing, default */ +} else if (!strcmp(mode, "native")) { +*flags |= BDRV_O_NATIVE_AIO; +#ifdef CONFIG_LINUX_IO_URING +} else if (!strcmp(mode, "io_uring")) { +*flags |= BDRV_O_IO_URING; +#endif +} else { +return -1; +} + +return 0; +} + /** * Set open flags for a given discard mode * diff --git a/blockdev.c b/blockdev.c index 4d141e9a1f..a41623ae9a 100644 --- a/blockdev.c +++ b/blockdev.c @@ -383,13 +383,9 @@ static void extract_common_blockdev_options(QemuOpts *opts, int *bdrv_flags, } if ((aio = qemu_opt_get(opts, "aio")) != NULL) { -if (!strcmp(aio, "native")) { -*bdrv_flags |= BDRV_O_NATIVE_AIO; -} else if (!strcmp(aio, "threads")) { -/* this is the default */ -} else { - error_setg(errp, "invalid aio option"); - return; +if (bdrv_parse_aio(aio, bdrv_flags) < 0) { +error_setg(errp, "invalid aio option"); +return; } } } @@ -4574,7 +4570,7 @@ QemuOptsList qemu_common_drive_opts = { },{ .name = "aio", .type = QEMU_OPT_STRING, -.help = "host AIO implementation (threads, native)", +.help = "host AIO implementation (threads, native, io_uring)", },{ .name = BDRV_OPT_CACHE_WB, .type = QEMU_OPT_BOOL, diff --git a/include/block/block.h b/include/block/block.h index e29baa172c..ec6b9ea4c8 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -297,6 +297,7 @@ void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top, void bdrv_replace_node(BlockDriverState *from, 
BlockDriverState *to, Error **errp); +int bdrv_parse_aio(const char *mode, int *flags); int bdrv_parse_cache_mode(const char *mode, int *flags, bool *writethrough); int bdrv_parse_discard_flags(const char *mode, int *flags); BdrvChild *bdrv_open_child(const char *filename, -- 2.21.0
[Qemu-devel] [PATCH v8 04/16] block/io_uring: implements interfaces for io_uring
Aborts when sqe fails to be set as sqes cannot be returned to the ring. Adds slow path for short reads for older kernels Signed-off-by: Aarushi Mehta Signed-off-by: Stefan Hajnoczi Reviewed-by: Stefan Hajnoczi --- MAINTAINERS | 7 + block/Makefile.objs | 3 + block/io_uring.c| 409 include/block/aio.h | 16 +- include/block/raw-aio.h | 12 ++ 5 files changed, 446 insertions(+), 1 deletion(-) create mode 100644 block/io_uring.c diff --git a/MAINTAINERS b/MAINTAINERS index d6de200453..be688fcd5e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2530,6 +2530,13 @@ F: block/file-posix.c F: block/file-win32.c F: block/win32-aio.c +Linux io_uring +M: Aarushi Mehta +R: Stefan Hajnoczi +L: qemu-bl...@nongnu.org +S: Maintained +F: block/io_uring.c + qcow2 M: Kevin Wolf M: Max Reitz diff --git a/block/Makefile.objs b/block/Makefile.objs index 35f3bca4d9..6a548af8ed 100644 --- a/block/Makefile.objs +++ b/block/Makefile.objs @@ -18,6 +18,7 @@ block-obj-y += block-backend.o snapshot.o qapi.o block-obj-$(CONFIG_WIN32) += file-win32.o win32-aio.o block-obj-$(CONFIG_POSIX) += file-posix.o block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o +block-obj-$(CONFIG_LINUX_IO_URING) += io_uring.o block-obj-y += null.o mirror.o commit.o io.o create.o block-obj-y += throttle-groups.o block-obj-$(CONFIG_LINUX) += nvme.o @@ -61,5 +62,7 @@ block-obj-$(if $(CONFIG_LZFSE),m,n) += dmg-lzfse.o dmg-lzfse.o-libs := $(LZFSE_LIBS) qcow.o-libs:= -lz linux-aio.o-libs := -laio +io_uring.o-cflags := $(LINUX_IO_URING_CFLAGS) +io_uring.o-libs:= $(LINUX_IO_URING_LIBS) parallels.o-cflags := $(LIBXML2_CFLAGS) parallels.o-libs := $(LIBXML2_LIBS) diff --git a/block/io_uring.c b/block/io_uring.c new file mode 100644 index 00..b1f14d2461 --- /dev/null +++ b/block/io_uring.c @@ -0,0 +1,409 @@ +/* + * Linux io_uring support. + * + * Copyright (C) 2009 IBM, Corp. + * Copyright (C) 2009 Red Hat, Inc. + * Copyright (C) 2019 Aarushi Mehta + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. 
+ * See the COPYING file in the top-level directory. + */ +#include "qemu/osdep.h" +#include +#include "qemu-common.h" +#include "block/aio.h" +#include "qemu/queue.h" +#include "block/block.h" +#include "block/raw-aio.h" +#include "qemu/coroutine.h" +#include "qapi/error.h" + +#define MAX_EVENTS 128 + +typedef struct LuringAIOCB { +Coroutine *co; +struct io_uring_sqe sqeq; +ssize_t ret; +QEMUIOVector *qiov; +bool is_read; +QSIMPLEQ_ENTRY(LuringAIOCB) next; + +/* + * Buffered reads may require resubmission, see + * luring_resubmit_short_read(). + */ +int total_read; +QEMUIOVector resubmit_qiov; +} LuringAIOCB; + +typedef struct LuringQueue { +int plugged; +unsigned int in_queue; +unsigned int in_flight; +bool blocked; +QSIMPLEQ_HEAD(, LuringAIOCB) submit_queue; +} LuringQueue; + +typedef struct LuringState { +AioContext *aio_context; + +struct io_uring ring; + +/* io queue for submit at batch. Protected by AioContext lock. */ +LuringQueue io_q; + +/* I/O completion processing. Only runs in I/O thread. */ +QEMUBH *completion_bh; +} LuringState; + +/** + * ioq_submit: + * @s: AIO state + * + * Queues pending sqes and submits them + * + */ +static int ioq_submit(LuringState *s); + +/** + * luring_resubmit: + * + * Resubmit a request by appending it to submit_queue. The caller must ensure + * that ioq_submit() is called later so that submit_queue requests are started. + */ +static void luring_resubmit(LuringState *s, LuringAIOCB *luringcb) +{ +QSIMPLEQ_INSERT_TAIL(>io_q.submit_queue, luringcb, next); +s->io_q.in_queue++; +} + +/** + * luring_resubmit_short_read: + * + * Before Linux commit 9d93a3f5a0c ("io_uring: punt short reads to async + * context") a buffered I/O request with the start of the file range in the + * page cache could result in a short read. Applications need to resubmit the + * remaining read request. + * + * This is a slow path but recent kernels never take it. 
+ */ +static void luring_resubmit_short_read(LuringState *s, LuringAIOCB *luringcb, + int nread) +{ +QEMUIOVector *resubmit_qiov; +size_t remaining; + +/* Update read position */ +luringcb->total_read = nread; +remaining = luringcb->qiov->size - luringcb->total_read; + +/* Shorten qiov */ +resubmit_qiov = >resubmit_qiov; +if (resubmit_qiov->iov == NULL) { +qemu_iovec_init(resubmit_qiov, luringcb->qiov->niov); +} else { +qemu_iovec_reset(resubmit_qiov); +} +qemu_iovec_concat(resubmit_qiov, luringcb->qiov, luringcb->total_read, + remaining); + +/* Update sqe */ +luringcb->sqeq.off = nread;
[Qemu-devel] [PATCH v7 15/15] tests/qemu-iotests: use AIOMODE with various tests
Signed-off-by: Aarushi Mehta --- tests/qemu-iotests/028 | 3 ++- tests/qemu-iotests/058 | 2 +- tests/qemu-iotests/089 | 4 ++-- tests/qemu-iotests/091 | 7 --- tests/qemu-iotests/109 | 3 ++- tests/qemu-iotests/147 | 5 +++-- tests/qemu-iotests/181 | 10 +- tests/qemu-iotests/183 | 7 --- tests/qemu-iotests/185 | 17 - tests/qemu-iotests/200 | 3 ++- tests/qemu-iotests/201 | 10 +- 11 files changed, 42 insertions(+), 29 deletions(-) diff --git a/tests/qemu-iotests/028 b/tests/qemu-iotests/028 index 01f495912f..59e7b670ed 100755 --- a/tests/qemu-iotests/028 +++ b/tests/qemu-iotests/028 @@ -108,7 +108,8 @@ echo block-backup echo qemu_comm_method="monitor" -_launch_qemu -drive file="${TEST_IMG}",cache=${CACHEMODE},id=disk +_launch_qemu -drive file="${TEST_IMG}",cache=${CACHEMODE},aio=${AIOMODE},\ +id=disk h=$QEMU_HANDLE QEMU_COMM_TIMEOUT=1 diff --git a/tests/qemu-iotests/058 b/tests/qemu-iotests/058 index 8c3212a72f..38d1ed90c0 100755 --- a/tests/qemu-iotests/058 +++ b/tests/qemu-iotests/058 @@ -64,7 +64,7 @@ nbd_snapshot_img="nbd:unix:$nbd_unix_socket" converted_image=$TEST_IMG.converted # Use -f raw instead of -f $IMGFMT for the NBD connection -QEMU_IO_NBD="$QEMU_IO -f raw --cache=$CACHEMODE" +QEMU_IO_NBD="$QEMU_IO -f raw --cache=$CACHEMODE --aio=$AIOMODE" echo echo "== preparing image ==" diff --git a/tests/qemu-iotests/089 b/tests/qemu-iotests/089 index ad029f1f09..059ad75e28 100755 --- a/tests/qemu-iotests/089 +++ b/tests/qemu-iotests/089 @@ -64,7 +64,7 @@ $QEMU_IO -c 'write -P 42 0 512' -c 'write -P 23 512 512' \ $QEMU_IMG convert -f raw -O $IMGFMT "$TEST_IMG.base" "$TEST_IMG" -$QEMU_IO_PROG --cache $CACHEMODE \ +$QEMU_IO_PROG --cache $CACHEMODE --aio $AIOMODE \ -c 'read -P 42 0 512' -c 'read -P 23 512 512' \ -c 'read -P 66 1024 512' "json:{ \"driver\": \"$IMGFMT\", @@ -111,7 +111,7 @@ $QEMU_IO -c 'write -P 42 0x38000 512' "$TEST_IMG" | _filter_qemu_io # The "image.filename" part tests whether "a": { "b": "c" } and "a.b": "c" do # the same (which they should). 
-$QEMU_IO_PROG --cache $CACHEMODE \ +$QEMU_IO_PROG --cache $CACHEMODE --aio $AIOMODE \ -c 'read -P 42 0x38000 512' "json:{ \"driver\": \"$IMGFMT\", \"file\": { diff --git a/tests/qemu-iotests/091 b/tests/qemu-iotests/091 index d62ef18a02..78741d3fe7 100755 --- a/tests/qemu-iotests/091 +++ b/tests/qemu-iotests/091 @@ -60,14 +60,15 @@ echo === Starting QEMU VM1 === echo qemu_comm_method="monitor" -_launch_qemu -drive file="${TEST_IMG}",cache=${CACHEMODE},id=disk +_launch_qemu -drive file="${TEST_IMG}",cache=${CACHEMODE},aio=${AIOMODE},\ + id=disk h1=$QEMU_HANDLE echo echo === Starting QEMU VM2 === echo -_launch_qemu -drive file="${TEST_IMG}",cache=${CACHEMODE},id=disk \ - -incoming "exec: cat '${MIG_FIFO}'" +_launch_qemu -drive file="${TEST_IMG}",cache=${CACHEMODE},aio=${AIOMODE},\ + id=disk -incoming "exec: cat '${MIG_FIFO}'" h2=$QEMU_HANDLE echo diff --git a/tests/qemu-iotests/109 b/tests/qemu-iotests/109 index 9897ceb6cd..451709689a 100755 --- a/tests/qemu-iotests/109 +++ b/tests/qemu-iotests/109 @@ -52,7 +52,8 @@ run_qemu() local qmp_format="$3" local qmp_event="$4" -_launch_qemu -drive file="${source_img}",format=raw,cache=${CACHEMODE},id=src +_launch_qemu -drive file="${source_img}",format=raw,cache=${CACHEMODE},\ + aio=${AIOMODE},id=src _send_qemu_cmd $QEMU_HANDLE "{ 'execute': 'qmp_capabilities' }" "return" _send_qemu_cmd $QEMU_HANDLE \ diff --git a/tests/qemu-iotests/147 b/tests/qemu-iotests/147 index 2d84fddb01..632973b23c 100755 --- a/tests/qemu-iotests/147 +++ b/tests/qemu-iotests/147 @@ -24,7 +24,7 @@ import socket import stat import time import iotests -from iotests import cachemode, imgfmt, qemu_img, qemu_nbd, qemu_nbd_early_pipe +from iotests import cachemode, aiomode, imgfmt, qemu_img, qemu_nbd, qemu_nbd_early_pipe NBD_PORT_START = 32768 NBD_PORT_END= NBD_PORT_START + 1024 @@ -134,7 +134,8 @@ class BuiltinNBD(NBDBlockdevAddBase): self.server.add_drive_raw('if=none,id=nbd-export,' + 'file=%s,' % test_img + 'format=%s,' % imgfmt + - 'cache=%s' % 
cachemode) + 'cache=%s' % cachemode + + 'aio=%s' % aiomode) self.server.launch() def tearDown(self): diff --git a/tests/qemu-iotests/181 b/tests/qemu-iotests/181 index e317e63422..547c1b47b0 100755 --- a/tests/qemu-iotes
[Qemu-devel] [PATCH v7 12/15] qemu-img: adds option to use aio engine for benchmarking
Signed-off-by: Aarushi Mehta --- qemu-img.c| 10 +- qemu-img.texi | 5 - 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/qemu-img.c b/qemu-img.c index 79983772de..3fc8dac0b1 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -4192,7 +4192,8 @@ static int img_bench(int argc, char **argv) {"force-share", no_argument, 0, 'U'}, {0, 0, 0, 0} }; -c = getopt_long(argc, argv, ":hc:d:f:no:qs:S:t:wU", long_options, NULL); +c = getopt_long(argc, argv, ":hc:d:f:ni:o:qs:S:t:wU", long_options, +NULL); if (c == -1) { break; } @@ -4234,6 +4235,13 @@ static int img_bench(int argc, char **argv) break; case 'n': flags |= BDRV_O_NATIVE_AIO; +case 'i': +ret = bdrv_parse_aio(optarg, ); +if (ret < 0) { +error_report("Invalid aio option: %s", optarg); +ret = -1; +goto out; +} break; case 'o': { diff --git a/qemu-img.texi b/qemu-img.texi index c8e9bba515..0a2eccea85 100644 --- a/qemu-img.texi +++ b/qemu-img.texi @@ -206,7 +206,7 @@ Command description: Amends the image format specific @var{options} for the image file @var{filename}. Not all file formats support this operation. -@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [--flush-interval=@var{flush_interval}] [-n] [--no-drain] [-o @var{offset}] [--pattern=@var{pattern}] [-q] [-s @var{buffer_size}] [-S @var{step_size}] [-t @var{cache}] [-w] [-U] @var{filename} +@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [--flush-interval=@var{flush_interval}] [-n] [-i @var{aio}][--no-drain] [-o @var{offset}] [--pattern=@var{pattern}] [-q] [-s @var{buffer_size}] [-S @var{step_size}] [-t @var{cache}] [-w] [-U] @var{filename} Run a simple sequential I/O benchmark on the specified image. If @code{-w} is specified, a write test is performed, otherwise a read test is performed. @@ -227,6 +227,9 @@ If @code{-n} is specified, the native AIO backend is used if possible. On Linux, this option only works if @code{-t none} or @code{-t directsync} is specified as well. 
+If @code{-i} is specified, aio option can be used to specify different AIO +backends: @var{threads}, @var{native} or @var{io_uring}. + For write tests, by default a buffer filled with zeros is written. This can be overridden with a pattern byte specified by @var{pattern}. -- 2.21.0
[Qemu-devel] [PATCH v7 09/15] block: add trace events for io_uring
Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi --- block/io_uring.c | 22 +++--- block/trace-events | 12 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/block/io_uring.c b/block/io_uring.c index d33e554862..c7b2b0a9e2 100644 --- a/block/io_uring.c +++ b/block/io_uring.c @@ -17,6 +17,7 @@ #include "block/raw-aio.h" #include "qemu/coroutine.h" #include "qapi/error.h" +#include "trace.h" #define MAX_EVENTS 128 @@ -93,6 +94,8 @@ static void luring_resubmit_short_read(LuringState *s, LuringAIOCB *luringcb, QEMUIOVector *resubmit_qiov; size_t remaining; +trace_luring_resubmit_short_read(s, luringcb, nread); + /* Update read position */ luringcb->total_read = nread; remaining = luringcb->qiov->size - luringcb->total_read; @@ -163,6 +166,7 @@ static void luring_process_completions(LuringState *s) /* Change counters one-by-one because we can be nested. */ s->io_q.in_flight--; +trace_luring_process_completion(s, luringcb, ret); /* total_read is non-zero only for resubmitted read requests */ total_bytes = ret + luringcb->total_read; @@ -263,6 +267,7 @@ static int ioq_submit(LuringState *s) QSIMPLEQ_REMOVE_HEAD(>io_q.submit_queue, next); } ret = io_uring_submit(>ring); +trace_luring_io_uring_submit(s, ret); /* Prevent infinite loop if submission is refused */ if (ret <= 0) { if (ret == -EAGAIN) { @@ -287,12 +292,15 @@ static int ioq_submit(LuringState *s) void luring_io_plug(BlockDriverState *bs, LuringState *s) { +trace_luring_io_plug(s); s->io_q.plugged++; } void luring_io_unplug(BlockDriverState *bs, LuringState *s) { assert(s->io_q.plugged); +trace_luring_io_unplug(s, s->io_q.blocked, s->io_q.plugged, + s->io_q.in_queue, s->io_q.in_flight); if (--s->io_q.plugged == 0 && !s->io_q.blocked && s->io_q.in_queue > 0) { ioq_submit(s); @@ -313,6 +321,7 @@ void luring_io_unplug(BlockDriverState *bs, LuringState *s) static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s, uint64_t offset, int type) { +int ret; struct io_uring_sqe *sqes = 
>sqeq; switch (type) { @@ -336,11 +345,14 @@ static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s, QSIMPLEQ_INSERT_TAIL(>io_q.submit_queue, luringcb, next); s->io_q.in_queue++; - +trace_luring_do_submit(s, s->io_q.blocked, s->io_q.plugged, + s->io_q.in_queue, s->io_q.in_flight); if (!s->io_q.blocked && (!s->io_q.plugged || s->io_q.in_flight + s->io_q.in_queue >= MAX_EVENTS)) { -return ioq_submit(s); +ret = ioq_submit(s); +trace_luring_do_submit_done(s, ret); +return ret; } return 0; } @@ -355,8 +367,10 @@ int coroutine_fn luring_co_submit(BlockDriverState *bs, LuringState *s, int fd, .qiov = qiov, .is_read= (type == QEMU_AIO_READ), }; - +trace_luring_co_submit(bs, s, , fd, offset, qiov ? qiov->size : 0, + type); ret = luring_do_submit(fd, , s, offset, type); + if (ret < 0) { return ret; } @@ -388,6 +402,7 @@ LuringState *luring_init(Error **errp) int rc; LuringState *s; s = g_new0(LuringState, 1); +trace_luring_init_state(s, sizeof(*s)); struct io_uring *ring = >ring; rc = io_uring_queue_init(MAX_EVENTS, ring, 0); if (rc < 0) { @@ -405,4 +420,5 @@ void luring_cleanup(LuringState *s) { io_uring_queue_exit(>ring); g_free(s); +trace_luring_cleanup_state(s); } diff --git a/block/trace-events b/block/trace-events index d724df0117..66aaf8352b 100644 --- a/block/trace-events +++ b/block/trace-events @@ -60,6 +60,18 @@ qmp_block_stream(void *bs) "bs %p" file_paio_submit(void *acb, void *opaque, int64_t offset, int count, int type) "acb %p opaque %p offset %"PRId64" count %d type %d" file_copy_file_range(void *bs, int src, int64_t src_off, int dst, int64_t dst_off, int64_t bytes, int flags, int64_t ret) "bs %p src_fd %d offset %"PRIu64" dst_fd %d offset %"PRIu64" bytes %"PRIu64" flags %d ret %"PRId64 +#io_uring.c +luring_init_state(void *s, size_t size) "s %p size %zu" +luring_cleanup_state(void *s) "%p freed" +luring_io_plug(void *s) "LuringState %p plug" +luring_io_unplug(void *s, int blocked, int plugged, int queued, int inflight) "LuringState %p 
blocked %d plugged %d queued %d inflight %d" +luring_do_submit(void *s, int blocked, int plugged, int queued, int inflight) "LuringState %p blo
[Qemu-devel] [PATCH v7 14/15] tests/qemu-iotests: enable testing with aio options
Signed-off-by: Aarushi Mehta --- tests/qemu-iotests/check | 15 ++- tests/qemu-iotests/common.rc | 14 ++ tests/qemu-iotests/iotests.py | 9 - 3 files changed, 36 insertions(+), 2 deletions(-) diff --git a/tests/qemu-iotests/check b/tests/qemu-iotests/check index c24874ff4a..1e398923fd 100755 --- a/tests/qemu-iotests/check +++ b/tests/qemu-iotests/check @@ -132,6 +132,7 @@ sortme=false expunge=true have_test_arg=false cachemode=false +aiomode=false tmp="${TEST_DIR}"/$$ rm -f $tmp.list $tmp.tmp $tmp.sed @@ -141,6 +142,7 @@ export IMGFMT_GENERIC=true export IMGPROTO=file export IMGOPTS="" export CACHEMODE="writeback" +export AIOMODE="threads" export QEMU_IO_OPTIONS="" export QEMU_IO_OPTIONS_NO_FMT="" export CACHEMODE_IS_DEFAULT=true @@ -225,6 +227,11 @@ s/ .*//p CACHEMODE_IS_DEFAULT=false cachemode=false continue +elif $aiomode +then +AIOMODE="$r" +aiomode=false +continue fi xpand=true @@ -269,6 +276,7 @@ other options -n show me, do not run tests -o options -o options to pass to qemu-img create/convert -c mode cache mode +-i mode AIO mode -makecheck pretty print output for make check testlist options @@ -433,10 +441,13 @@ testlist options cachemode=true xpand=false ;; +-i) +aiomode=true +xpand=false +;; -T)# deprecated timestamp option xpand=false ;; - -v) verbose=true xpand=false @@ -515,6 +526,8 @@ done # Set qemu-io cache mode with $CACHEMODE we have QEMU_IO_OPTIONS="$QEMU_IO_OPTIONS --cache $CACHEMODE" +# Set qemu-io aio mode with $AIOMODE we have +QEMU_IO_OPTIONS="$QEMU_IO_OPTIONS --aio $AIOMODE" QEMU_IO_OPTIONS_NO_FMT="$QEMU_IO_OPTIONS" if [ "$IMGOPTSSYNTAX" != "true" ]; then diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc index 5502c3da2f..03f4a1cd7f 100644 --- a/tests/qemu-iotests/common.rc +++ b/tests/qemu-iotests/common.rc @@ -490,6 +490,20 @@ _default_cache_mode() return fi } +_supported_aio_modes() +{ +for mode; do +if [ "$mode" = "$AIOMODE" ]; then +return +fi +done +_notrun "not suitable for aio mode: $AIOMODE" +} 
+_default_aio_mode() +{ +AIOMODE="$1" +QEMU_IO="$QEMU_IO --aio $1" +} _unsupported_imgopts() { diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py index ce74177ab1..76f1ab0945 100644 --- a/tests/qemu-iotests/iotests.py +++ b/tests/qemu-iotests/iotests.py @@ -58,6 +58,7 @@ imgproto = os.environ.get('IMGPROTO', 'file') test_dir = os.environ.get('TEST_DIR') output_dir = os.environ.get('OUTPUT_DIR', '.') cachemode = os.environ.get('CACHEMODE') +aiomode = os.environ.get('AIOMODE') qemu_default_machine = os.environ.get('QEMU_DEFAULT_MACHINE') socket_scm_helper = os.environ.get('SOCKET_SCM_HELPER', 'socket_scm_helper') @@ -457,6 +458,7 @@ class VM(qtest.QEMUQtestMachine): options.append('file=%s' % path) options.append('format=%s' % format) options.append('cache=%s' % cachemode) +options.append('aio=%s' % aiomode) if opts: options.append(opts) @@ -799,6 +801,10 @@ def verify_cache_mode(supported_cache_modes=[]): if supported_cache_modes and (cachemode not in supported_cache_modes): notrun('not suitable for this cache mode: %s' % cachemode) +def verify_aio_mode(supported_aio_modes=[]): +if supported_aio_modes and (aiomode not in supported_aio_modes): +notrun('not suitable for this aio mode: %s' % aiomode) + def supports_quorum(): return 'quorum' in qemu_img_pipe('--help') @@ -843,7 +849,7 @@ def skip_if_unsupported(required_formats=[], read_only=False): return skip_test_decorator def main(supported_fmts=[], supported_oses=['linux'], supported_cache_modes=[], - unsupported_fmts=[]): +supported_aio_modes=[], unsupported_fmts=[]): '''Run tests''' global debug @@ -861,6 +867,7 @@ def main(supported_fmts=[], supported_oses=['linux'], supported_cache_modes=[], verify_image_format(supported_fmts, unsupported_fmts) verify_platform(supported_oses) verify_cache_mode(supported_cache_modes) +verify_aio_mode(supported_aio_modes) if debug: output = sys.stdout -- 2.21.0
[Qemu-devel] [PATCH v7 07/15] blockdev: adds bdrv_parse_aio to use io_uring
Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi --- block.c | 22 ++ blockdev.c| 12 include/block/block.h | 1 + 3 files changed, 27 insertions(+), 8 deletions(-) diff --git a/block.c b/block.c index cbd8da5f3b..401831e28d 100644 --- a/block.c +++ b/block.c @@ -844,6 +844,28 @@ static BlockdevDetectZeroesOptions bdrv_parse_detect_zeroes(QemuOpts *opts, return detect_zeroes; } +/** + * Set open flags for aio engine + * + * Return 0 on success, -1 if the engine specified is invalid + */ +int bdrv_parse_aio(const char *mode, int *flags) +{ +if (!strcmp(mode, "threads")) { +/* do nothing, default */ +} else if (!strcmp(mode, "native")) { +*flags |= BDRV_O_NATIVE_AIO; +#ifdef CONFIG_LINUX_IO_URING +} else if (!strcmp(mode, "io_uring")) { +*flags |= BDRV_O_IO_URING; +#endif +} else { +return -1; +} + +return 0; +} + /** * Set open flags for a given discard mode * diff --git a/blockdev.c b/blockdev.c index 4d141e9a1f..a41623ae9a 100644 --- a/blockdev.c +++ b/blockdev.c @@ -383,13 +383,9 @@ static void extract_common_blockdev_options(QemuOpts *opts, int *bdrv_flags, } if ((aio = qemu_opt_get(opts, "aio")) != NULL) { -if (!strcmp(aio, "native")) { -*bdrv_flags |= BDRV_O_NATIVE_AIO; -} else if (!strcmp(aio, "threads")) { -/* this is the default */ -} else { - error_setg(errp, "invalid aio option"); - return; +if (bdrv_parse_aio(aio, bdrv_flags) < 0) { +error_setg(errp, "invalid aio option"); +return; } } } @@ -4574,7 +4570,7 @@ QemuOptsList qemu_common_drive_opts = { },{ .name = "aio", .type = QEMU_OPT_STRING, -.help = "host AIO implementation (threads, native)", +.help = "host AIO implementation (threads, native, io_uring)", },{ .name = BDRV_OPT_CACHE_WB, .type = QEMU_OPT_BOOL, diff --git a/include/block/block.h b/include/block/block.h index e29baa172c..ec6b9ea4c8 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -297,6 +297,7 @@ void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top, void bdrv_replace_node(BlockDriverState *from, 
BlockDriverState *to, Error **errp); +int bdrv_parse_aio(const char *mode, int *flags); int bdrv_parse_cache_mode(const char *mode, int *flags, bool *writethrough); int bdrv_parse_discard_flags(const char *mode, int *flags); BdrvChild *bdrv_open_child(const char *filename, -- 2.21.0
[Qemu-devel] [PATCH v7 11/15] qemu-io: adds option to use aio engine
Signed-off-by: Aarushi Mehta --- qemu-io.c | 24 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/qemu-io.c b/qemu-io.c index f64eca6940..6568f0c369 100644 --- a/qemu-io.c +++ b/qemu-io.c @@ -130,7 +130,8 @@ static void open_help(void) " -C, -- use copy-on-read\n" " -n, -- disable host cache, short for -t none\n" " -U, -- force shared permissions\n" -" -k, -- use kernel AIO implementation (on Linux only)\n" +" -k, -- use kernel AIO implementation (Linux only, prefer use of -i)\n" +" -i, -- use AIO mode (threads, native or io_uring)" " -t, -- use the given cache mode for the image\n" " -d, -- use the given discard mode for the image\n" " -o, -- options to be given to the block driver" @@ -172,7 +173,7 @@ static int open_f(BlockBackend *blk, int argc, char **argv) QDict *opts; bool force_share = false; -while ((c = getopt(argc, argv, "snCro:kt:d:U")) != -1) { +while ((c = getopt(argc, argv, "snCro:kit:d:U")) != -1) { switch (c) { case 's': flags |= BDRV_O_SNAPSHOT; @@ -204,6 +205,13 @@ static int open_f(BlockBackend *blk, int argc, char **argv) return -EINVAL; } break; +case 'i': +if (bdrv_parse_aio(optarg, ) < 0) { +error_report("Invalid aio option: %s", optarg); +qemu_opts_reset(_opts); +return -EINVAL; +} +break; case 'o': if (imageOpts) { printf("--image-opts and 'open -o' are mutually exclusive\n"); @@ -291,7 +299,9 @@ static void usage(const char *name) " -n, --nocachedisable host cache, short for -t none\n" " -C, --copy-on-read enable copy-on-read\n" " -m, --misalign misalign allocations for O_DIRECT\n" -" -k, --native-aio use kernel AIO implementation (on Linux only)\n" +" -k, --native-aio use kernel AIO implementation\n" +" (Linux only, prefer use of -i)\n" +" -i, --aio=MODE use AIO mode (threads, native or io_uring)\n" " -t, --cache=MODE use the given cache mode for the image\n" " -d, --discard=MODE use the given discard mode for the image\n" " -T, --trace [[enable=]][,events=][,file=]\n" @@ -489,7 +499,7 @@ static QemuOptsList file_opts = { int 
main(int argc, char **argv) { int readonly = 0; -const char *sopt = "hVc:d:f:rsnCmkt:T:U"; +const char *sopt = "hVc:d:f:rsnCmki:t:T:U"; const struct option lopt[] = { { "help", no_argument, NULL, 'h' }, { "version", no_argument, NULL, 'V' }, @@ -501,6 +511,7 @@ int main(int argc, char **argv) { "copy-on-read", no_argument, NULL, 'C' }, { "misalign", no_argument, NULL, 'm' }, { "native-aio", no_argument, NULL, 'k' }, +{ "aio", required_argument, NULL, 'i' }, { "discard", required_argument, NULL, 'd' }, { "cache", required_argument, NULL, 't' }, { "trace", required_argument, NULL, 'T' }, @@ -567,6 +578,11 @@ int main(int argc, char **argv) break; case 'k': flags |= BDRV_O_NATIVE_AIO; +case 'i': +if (bdrv_parse_aio(optarg, ) < 0) { +error_report("Invalid aio option: %s", optarg); +exit(1); +} break; case 't': if (bdrv_parse_cache_mode(optarg, , ) < 0) { -- 2.21.0
[Qemu-devel] [PATCH v7 00/15] Add support for io_uring
This patch series adds support for the newly developed io_uring Linux AIO interface. Linux io_uring is faster than Linux's AIO asynchronous I/O code, offers efficient buffered asynchronous I/O support, the ability to do I/O without performing a system call via polled I/O, and other efficiency enhancements. Testing it requires a host kernel (5.1+) and the liburing library. Use the option -drive aio=io_uring to enable it. Benchmarks for the system (https://github.com/rooshm/benchmarks) show that io_uring has performance similar to libaio while also supporting cache=writeback. Further performance enhancements will be implemented. There is currently an -EIO error when guests are booted from io_uring on ext4; the error has been reported upstream: https://lore.kernel.org/linux-block/20190723080701.GA3198@stefanha-x1.localdomain/ v7: - completes io-tests options - misc fixes v6: - add slow path for short-read - hooks up fsync - enables qemu-iotests with aio options - adds bdrv_parse_aio v5: - Adds completion polling - Extends qemu-io - Adds qemu-iotest v4: - Add error handling - Add trace events - Remove aio submission based code Aarushi Mehta (15): configure: permit use of io_uring qapi/block-core: add option for io_uring block/block: add BDRV flag for io_uring block/io_uring: implements interfaces for io_uring stubs: add stubs for io_uring interface util/async: add aio interfaces for io_uring blockdev: adds bdrv_parse_aio to use io_uring block/file-posix.c: extend to use io_uring block: add trace events for io_uring block/io_uring: adds userspace completion polling qemu-io: adds option to use aio engine qemu-img: adds option to use aio engine for benchmarking qemu-nbd: adds option for aio engines tests/qemu-iotests: enable testing with aio options tests/qemu-iotests: use AIOMODE with various tests MAINTAINERS | 8 + block.c | 22 ++ block/Makefile.objs | 3 + block/file-posix.c| 99 ++-- block/io_uring.c | 439 ++ block/trace-events| 12 + blockdev.c| 12 +- configure | 27 +++ include/block/aio.h | 16 
+- include/block/block.h | 2 + include/block/raw-aio.h | 12 + qapi/block-core.json | 4 +- qemu-img.c| 10 +- qemu-img.texi | 5 +- qemu-io.c | 24 +- qemu-nbd.c| 12 +- qemu-nbd.texi | 4 +- stubs/Makefile.objs | 1 + stubs/io_uring.c | 32 +++ tests/qemu-iotests/028| 3 +- tests/qemu-iotests/058| 2 +- tests/qemu-iotests/089| 4 +- tests/qemu-iotests/091| 7 +- tests/qemu-iotests/109| 3 +- tests/qemu-iotests/147| 5 +- tests/qemu-iotests/181| 10 +- tests/qemu-iotests/183| 7 +- tests/qemu-iotests/185| 17 +- tests/qemu-iotests/200| 3 +- tests/qemu-iotests/201| 10 +- tests/qemu-iotests/check | 15 +- tests/qemu-iotests/common.rc | 14 ++ tests/qemu-iotests/iotests.py | 9 +- util/async.c | 36 +++ 34 files changed, 812 insertions(+), 77 deletions(-) create mode 100644 block/io_uring.c create mode 100644 stubs/io_uring.c -- 2.21.0
[Qemu-devel] [PATCH v7 10/15] block/io_uring: adds userspace completion polling
Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi --- block/io_uring.c | 17 - 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/block/io_uring.c b/block/io_uring.c index c7b2b0a9e2..691130b948 100644 --- a/block/io_uring.c +++ b/block/io_uring.c @@ -237,6 +237,21 @@ static void qemu_luring_completion_cb(void *opaque) luring_process_completions_and_submit(s); } +static bool qemu_luring_poll_cb(void *opaque) +{ +LuringState *s = opaque; +struct io_uring_cqe *cqes; + +if (io_uring_peek_cqe(>ring, ) == 0) { +if (cqes) { +luring_process_completions_and_submit(s); +return true; +} +} + +return false; +} + static void ioq_init(LuringQueue *io_q) { QSIMPLEQ_INIT(_q->submit_queue); @@ -394,7 +409,7 @@ void luring_attach_aio_context(LuringState *s, AioContext *new_context) s->aio_context = new_context; s->completion_bh = aio_bh_new(new_context, qemu_luring_completion_bh, s); aio_set_fd_handler(s->aio_context, s->ring.ring_fd, false, - qemu_luring_completion_cb, NULL, NULL, s); + qemu_luring_completion_cb, NULL, qemu_luring_poll_cb, s); } LuringState *luring_init(Error **errp) -- 2.21.0
[Qemu-devel] [PATCH v7 13/15] qemu-nbd: adds option for aio engines
Signed-off-by: Aarushi Mehta --- qemu-nbd.c| 12 qemu-nbd.texi | 4 ++-- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/qemu-nbd.c b/qemu-nbd.c index a8cb39e510..7bb479f3c0 100644 --- a/qemu-nbd.c +++ b/qemu-nbd.c @@ -135,7 +135,7 @@ static void usage(const char *name) "'[ID_OR_NAME]'\n" " -n, --nocache disable host cache\n" " --cache=MODE set cache mode (none, writeback, ...)\n" -" --aio=MODEset AIO mode (native or threads)\n" +" --aio=MODEset AIO mode (native, io_uring or threads)\n" " --discard=MODEset discard mode (ignore, unmap)\n" " --detect-zeroes=MODE set detect-zeroes mode (off, on, unmap)\n" " --image-opts treat FILE as a full set of image options\n" @@ -718,13 +718,9 @@ int main(int argc, char **argv) exit(EXIT_FAILURE); } seen_aio = true; -if (!strcmp(optarg, "native")) { -flags |= BDRV_O_NATIVE_AIO; -} else if (!strcmp(optarg, "threads")) { -/* this is the default */ -} else { - error_report("invalid aio mode `%s'", optarg); - exit(EXIT_FAILURE); +if (bdrv_parse_aio(optarg, ) < 0) { +error_report("Invalid aio mode '%s'", optarg); +exit(EXIT_FAILURE); } break; case QEMU_NBD_OPT_DISCARD: diff --git a/qemu-nbd.texi b/qemu-nbd.texi index 7f55657722..3ee3e4bdee 100644 --- a/qemu-nbd.texi +++ b/qemu-nbd.texi @@ -77,8 +77,8 @@ as an read-only device, @var{snapshot_param} format is The cache mode to be used with the file. See the documentation of the emulator's @code{-drive cache=...} option for allowed values. @item --aio=@var{aio} -Set the asynchronous I/O mode between @samp{threads} (the default) -and @samp{native} (Linux only). +Set the asynchronous I/O mode between @samp{threads} (the default), +@samp{native} (Linux only) and @samp{io_uring} (Linux 5.1+). @item --discard=@var{discard} Control whether @dfn{discard} (also known as @dfn{trim} or @dfn{unmap}) requests are ignored or passed to the filesystem. @var{discard} is one of -- 2.21.0
[Qemu-devel] [PATCH v7 06/15] util/async: add aio interfaces for io_uring
Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi --- util/async.c | 36 1 file changed, 36 insertions(+) diff --git a/util/async.c b/util/async.c index 8d2105729c..2e0a5e20ac 100644 --- a/util/async.c +++ b/util/async.c @@ -276,6 +276,14 @@ aio_ctx_finalize(GSource *source) } #endif +#ifdef CONFIG_LINUX_IO_URING +if (ctx->linux_io_uring) { +luring_detach_aio_context(ctx->linux_io_uring, ctx); +luring_cleanup(ctx->linux_io_uring); +ctx->linux_io_uring = NULL; +} +#endif + assert(QSLIST_EMPTY(>scheduled_coroutines)); qemu_bh_delete(ctx->co_schedule_bh); @@ -340,6 +348,29 @@ LinuxAioState *aio_get_linux_aio(AioContext *ctx) } #endif +#ifdef CONFIG_LINUX_IO_URING +LuringState *aio_setup_linux_io_uring(AioContext *ctx, Error **errp) +{ +if (ctx->linux_io_uring) { +return ctx->linux_io_uring; +} + +ctx->linux_io_uring = luring_init(errp); +if (!ctx->linux_io_uring) { +return NULL; +} + +luring_attach_aio_context(ctx->linux_io_uring, ctx); +return ctx->linux_io_uring; +} + +LuringState *aio_get_linux_io_uring(AioContext *ctx) +{ +assert(ctx->linux_io_uring); +return ctx->linux_io_uring; +} +#endif + void aio_notify(AioContext *ctx) { /* Write e.g. bh->scheduled before reading ctx->notify_me. Pairs @@ -431,6 +462,11 @@ AioContext *aio_context_new(Error **errp) #ifdef CONFIG_LINUX_AIO ctx->linux_aio = NULL; #endif + +#ifdef CONFIG_LINUX_IO_URING +ctx->linux_io_uring = NULL; +#endif + ctx->thread_pool = NULL; qemu_rec_mutex_init(>lock); timerlistgroup_init(>tlg, aio_timerlist_notify, ctx); -- 2.21.0
[Qemu-devel] [PATCH v7 01/15] configure: permit use of io_uring
Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi Reviewed-by: Maxim Levitsky --- configure | 27 +++ 1 file changed, 27 insertions(+) diff --git a/configure b/configure index 714e7fb6a1..493dbc2ec5 100755 --- a/configure +++ b/configure @@ -371,6 +371,7 @@ xen="" xen_ctrl_version="" xen_pci_passthrough="" linux_aio="" +linux_io_uring="" cap_ng="" attr="" libattr="" @@ -1272,6 +1273,10 @@ for opt do ;; --enable-linux-aio) linux_aio="yes" ;; + --disable-linux-io-uring) linux_io_uring="no" + ;; + --enable-linux-io-uring) linux_io_uring="yes" + ;; --disable-attr) attr="no" ;; --enable-attr) attr="yes" @@ -1790,6 +1795,7 @@ disabled with --disable-FEATURE, default is enabled if available: vde support for vde network netmap support for netmap network linux-aio Linux AIO support + linux-io-uring Linux io_uring support cap-ng libcap-ng support attrattr and xattr support vhost-net vhost-net kernel acceleration support @@ -3975,6 +3981,21 @@ EOF linux_aio=no fi fi +## +# linux-io-uring probe + +if test "$linux_io_uring" != "no" ; then + if $pkg_config liburing; then +linux_io_uring_cflags=$($pkg_config --cflags liburing) +linux_io_uring_libs=$($pkg_config --libs liburing) +linux_io_uring=yes + else +if test "$linux_io_uring" = "yes" ; then + feature_not_found "linux io_uring" "Install liburing devel" +fi +linux_io_uring=no + fi +fi ## # TPM emulation is only on POSIX @@ -6398,6 +6419,7 @@ echo "PIE $pie" echo "vde support $vde" echo "netmap support$netmap" echo "Linux AIO support $linux_aio" +echo "Linux io_uring support $linux_io_uring" echo "ATTR/XATTR support $attr" echo "Install blobs $blobs" echo "KVM support $kvm" @@ -6885,6 +6907,11 @@ fi if test "$linux_aio" = "yes" ; then echo "CONFIG_LINUX_AIO=y" >> $config_host_mak fi +if test "$linux_io_uring" = "yes" ; then + echo "CONFIG_LINUX_IO_URING=y" >> $config_host_mak + echo "LINUX_IO_URING_CFLAGS=$linux_io_uring_cflags" >> $config_host_mak + echo "LINUX_IO_URING_LIBS=$linux_io_uring_libs" >> $config_host_mak +fi 
if test "$attr" = "yes" ; then echo "CONFIG_ATTR=y" >> $config_host_mak fi -- 2.21.0
[Qemu-devel] [PATCH v7 05/15] stubs: add stubs for io_uring interface
Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi --- MAINTAINERS | 1 + stubs/Makefile.objs | 1 + stubs/io_uring.c| 32 3 files changed, 34 insertions(+) create mode 100644 stubs/io_uring.c diff --git a/MAINTAINERS b/MAINTAINERS index be688fcd5e..bca5de7d6c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2536,6 +2536,7 @@ R: Stefan Hajnoczi L: qemu-bl...@nongnu.org S: Maintained F: block/io_uring.c +F: stubs/io_uring.c qcow2 M: Kevin Wolf diff --git a/stubs/Makefile.objs b/stubs/Makefile.objs index 9c7393b08c..5cf160a9c8 100644 --- a/stubs/Makefile.objs +++ b/stubs/Makefile.objs @@ -13,6 +13,7 @@ stub-obj-y += iothread.o stub-obj-y += iothread-lock.o stub-obj-y += is-daemonized.o stub-obj-$(CONFIG_LINUX_AIO) += linux-aio.o +stub-obj-$(CONFIG_LINUX_IO_URING) += io_uring.o stub-obj-y += machine-init-done.o stub-obj-y += migr-blocker.o stub-obj-y += change-state-handler.o diff --git a/stubs/io_uring.c b/stubs/io_uring.c new file mode 100644 index 00..622d1e4648 --- /dev/null +++ b/stubs/io_uring.c @@ -0,0 +1,32 @@ +/* + * Linux io_uring support. + * + * Copyright (C) 2009 IBM, Corp. + * Copyright (C) 2009 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ +#include "qemu/osdep.h" +#include "block/aio.h" +#include "block/raw-aio.h" + +void luring_detach_aio_context(LuringState *s, AioContext *old_context) +{ +abort(); +} + +void luring_attach_aio_context(LuringState *s, AioContext *new_context) +{ +abort(); +} + +LuringState *luring_init(Error **errp) +{ +abort(); +} + +void luring_cleanup(LuringState *s) +{ +abort(); +} -- 2.21.0
[Qemu-devel] [PATCH v7 08/15] block/file-posix.c: extend to use io_uring
Signed-off-by: Aarushi Mehta Reviewed-by: Maxim Levitsky Reviewed-by: Stefan Hajnoczi --- block/file-posix.c | 99 -- 1 file changed, 79 insertions(+), 20 deletions(-) diff --git a/block/file-posix.c b/block/file-posix.c index 4479cc7ab4..4aa42f826f 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -156,6 +156,7 @@ typedef struct BDRVRawState { bool has_write_zeroes:1; bool discard_zeroes:1; bool use_linux_aio:1; +bool use_linux_io_uring:1; bool page_cache_inconsistent:1; bool has_fallocate; bool needs_alignment; @@ -425,7 +426,7 @@ static QemuOptsList raw_runtime_opts = { { .name = "aio", .type = QEMU_OPT_STRING, -.help = "host AIO implementation (threads, native)", +.help = "host AIO implementation (threads, native, io_uring)", }, { .name = "locking", @@ -484,9 +485,15 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, goto fail; } -aio_default = (bdrv_flags & BDRV_O_NATIVE_AIO) - ? BLOCKDEV_AIO_OPTIONS_NATIVE - : BLOCKDEV_AIO_OPTIONS_THREADS; +if (bdrv_flags & BDRV_O_NATIVE_AIO) { +aio_default = BLOCKDEV_AIO_OPTIONS_NATIVE; +#ifdef CONFIG_LINUX_IO_URING +} else if (bdrv_flags & BDRV_O_IO_URING) { +aio_default = BLOCKDEV_AIO_OPTIONS_IO_URING; +#endif +} else { +aio_default = BLOCKDEV_AIO_OPTIONS_THREADS; +} aio = qapi_enum_parse(_lookup, qemu_opt_get(opts, "aio"), aio_default, _err); @@ -495,7 +502,11 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, ret = -EINVAL; goto fail; } + s->use_linux_aio = (aio == BLOCKDEV_AIO_OPTIONS_NATIVE); +#ifdef CONFIG_LINUX_IO_URING +s->use_linux_io_uring = (aio == BLOCKDEV_AIO_OPTIONS_IO_URING); +#endif locking = qapi_enum_parse(_lookup, qemu_opt_get(opts, "locking"), @@ -559,7 +570,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, s->shared_perm = BLK_PERM_ALL; #ifdef CONFIG_LINUX_AIO - /* Currently Linux does AIO only for files opened with O_DIRECT */ +/* Currently Linux does AIO only for files opened with O_DIRECT */ if (s->use_linux_aio) { if (!(s->open_flags & 
O_DIRECT)) { error_setg(errp, "aio=native was specified, but it requires " @@ -581,6 +592,22 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, } #endif /* !defined(CONFIG_LINUX_AIO) */ +#ifdef CONFIG_LINUX_IO_URING +if (s->use_linux_io_uring) { +if (!aio_setup_linux_io_uring(bdrv_get_aio_context(bs), errp)) { +error_prepend(errp, "Unable to use io_uring: "); +goto fail; +} +} +#else +if (s->use_linux_io_uring) { +error_setg(errp, "aio=io_uring was specified, but is not supported " + "in this build."); +ret = -EINVAL; +goto fail; +} +#endif /* !defined(CONFIG_LINUX_IO_URING) */ + s->has_discard = true; s->has_write_zeroes = true; if ((bs->open_flags & BDRV_O_NOCACHE) != 0) { @@ -1874,21 +1901,25 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset, return -EIO; /* - * Check if the underlying device requires requests to be aligned, - * and if the request we are trying to submit is aligned or not. - * If this is the case tell the low-level driver that it needs - * to copy the buffer. + * When using O_DIRECT, the request must be aligned to be able to use + * either libaio or io_uring interface. If not fail back to regular thread + * pool read/write code which emulates this for us if we + * set QEMU_AIO_MISALIGNED. 
*/ -if (s->needs_alignment) { -if (!bdrv_qiov_is_aligned(bs, qiov)) { -type |= QEMU_AIO_MISALIGNED; +if (s->needs_alignment && !bdrv_qiov_is_aligned(bs, qiov)) { +type |= QEMU_AIO_MISALIGNED; +#ifdef CONFIG_LINUX_IO_URING +} else if (s->use_linux_io_uring) { +LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs)); +assert(qiov->size == bytes); +return luring_co_submit(bs, aio, s->fd, offset, qiov, type); +#endif #ifdef CONFIG_LINUX_AIO -} else if (s->use_linux_aio) { -LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs)); -assert(qiov->size == bytes); -return laio_co_submit(bs, aio, s->fd, offset, qiov, type); +} else if (s->use_linux_aio) { +LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs)); +assert(qiov->size == bytes); +
[Qemu-devel] [PATCH v7 02/15] qapi/block-core: add option for io_uring
Only enumerate the option in builds that support it (CONFIG_LINUX_IO_URING defined). Since the QAPI schema supports the spelling io_uring, which is the actual name of the Linux API, it is preferred over io-uring. Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi --- qapi/block-core.json | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/qapi/block-core.json b/qapi/block-core.json index 0d43d4f37c..3dc93b483f 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -2792,11 +2792,13 @@ # # @threads: Use qemu's thread pool # @native: Use native AIO backend (only Linux and Windows) +# @io_uring:Use linux io_uring (since 4.2) # # Since: 2.9 ## { 'enum': 'BlockdevAioOptions', - 'data': [ 'threads', 'native' ] } + 'data': [ 'threads', 'native', +{ 'name': 'io_uring', 'if': 'defined(CONFIG_LINUX_IO_URING)' } ] } ## # @BlockdevCacheOptions: -- 2.21.0
[Qemu-devel] [PATCH v7 04/15] block/io_uring: implements interfaces for io_uring
Aborts when sqe fails to be set as sqes cannot be returned to the ring. Adds slow path for short reads for older kernels Signed-off-by: Aarushi Mehta Signed-off-by: Stefan Hajnoczi --- MAINTAINERS | 7 + block/Makefile.objs | 3 + block/io_uring.c| 408 include/block/aio.h | 16 +- include/block/raw-aio.h | 12 ++ 5 files changed, 445 insertions(+), 1 deletion(-) create mode 100644 block/io_uring.c diff --git a/MAINTAINERS b/MAINTAINERS index d6de200453..be688fcd5e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2530,6 +2530,13 @@ F: block/file-posix.c F: block/file-win32.c F: block/win32-aio.c +Linux io_uring +M: Aarushi Mehta +R: Stefan Hajnoczi +L: qemu-bl...@nongnu.org +S: Maintained +F: block/io_uring.c + qcow2 M: Kevin Wolf M: Max Reitz diff --git a/block/Makefile.objs b/block/Makefile.objs index 35f3bca4d9..6a548af8ed 100644 --- a/block/Makefile.objs +++ b/block/Makefile.objs @@ -18,6 +18,7 @@ block-obj-y += block-backend.o snapshot.o qapi.o block-obj-$(CONFIG_WIN32) += file-win32.o win32-aio.o block-obj-$(CONFIG_POSIX) += file-posix.o block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o +block-obj-$(CONFIG_LINUX_IO_URING) += io_uring.o block-obj-y += null.o mirror.o commit.o io.o create.o block-obj-y += throttle-groups.o block-obj-$(CONFIG_LINUX) += nvme.o @@ -61,5 +62,7 @@ block-obj-$(if $(CONFIG_LZFSE),m,n) += dmg-lzfse.o dmg-lzfse.o-libs := $(LZFSE_LIBS) qcow.o-libs:= -lz linux-aio.o-libs := -laio +io_uring.o-cflags := $(LINUX_IO_URING_CFLAGS) +io_uring.o-libs:= $(LINUX_IO_URING_LIBS) parallels.o-cflags := $(LIBXML2_CFLAGS) parallels.o-libs := $(LIBXML2_LIBS) diff --git a/block/io_uring.c b/block/io_uring.c new file mode 100644 index 00..d33e554862 --- /dev/null +++ b/block/io_uring.c @@ -0,0 +1,408 @@ +/* + * Linux io_uring support. + * + * Copyright (C) 2009 IBM, Corp. + * Copyright (C) 2009 Red Hat, Inc. + * Copyright (C) 2019 Aarushi Mehta + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. 
+ * See the COPYING file in the top-level directory. + */ +#include "qemu/osdep.h" +#include +#include "qemu-common.h" +#include "block/aio.h" +#include "qemu/queue.h" +#include "block/block.h" +#include "block/raw-aio.h" +#include "qemu/coroutine.h" +#include "qapi/error.h" + +#define MAX_EVENTS 128 + +typedef struct LuringAIOCB { +Coroutine *co; +struct io_uring_sqe sqeq; +ssize_t ret; +QEMUIOVector *qiov; +bool is_read; +QSIMPLEQ_ENTRY(LuringAIOCB) next; + +/* + * Buffered reads may require resubmission, see + * luring_resubmit_short_read(). + */ +int total_read; +QEMUIOVector resubmit_qiov; +} LuringAIOCB; + +typedef struct LuringQueue { +int plugged; +unsigned int in_queue; +unsigned int in_flight; +bool blocked; +QSIMPLEQ_HEAD(, LuringAIOCB) submit_queue; +} LuringQueue; + +typedef struct LuringState { +AioContext *aio_context; + +struct io_uring ring; + +/* io queue for submit at batch. Protected by AioContext lock. */ +LuringQueue io_q; + +/* I/O completion processing. Only runs in I/O thread. */ +QEMUBH *completion_bh; +} LuringState; + +/** + * ioq_submit: + * @s: AIO state + * + * Queues pending sqes and submits them + * + */ +static int ioq_submit(LuringState *s); + +/** + * luring_resubmit: + * + * Resubmit a request by appending it to submit_queue. The caller must ensure + * that ioq_submit() is called later so that submit_queue requests are started. + */ +static void luring_resubmit(LuringState *s, LuringAIOCB *luringcb) +{ +QSIMPLEQ_INSERT_TAIL(>io_q.submit_queue, luringcb, next); +s->io_q.in_queue++; +} + +/** + * luring_resubmit_short_read: + * + * Before Linux commit 9d93a3f5a0c ("io_uring: punt short reads to async + * context") a buffered I/O request with the start of the file range in the + * page cache could result in a short read. Applications need to resubmit the + * remaining read request. + * + * This is a slow path but recent kernels never take it. 
+ */ +static void luring_resubmit_short_read(LuringState *s, LuringAIOCB *luringcb, + int nread) +{ +QEMUIOVector *resubmit_qiov; +size_t remaining; + +/* Update read position */ +luringcb->total_read = nread; +remaining = luringcb->qiov->size - luringcb->total_read; + +/* Shorten qiov */ +resubmit_qiov = >resubmit_qiov; +if (resubmit_qiov->iov == NULL) { +qemu_iovec_init(resubmit_qiov, luringcb->qiov->niov); +} else { +qemu_iovec_reset(resubmit_qiov); +} +qemu_iovec_concat(resubmit_qiov, luringcb->qiov, luringcb->total_read, + remaining); + +/* Update sqe */ +luringcb->sqeq.off = nread; +luringcb->sqeq.addr = (__u6
[Qemu-devel] [PATCH v7 03/15] block/block: add BDRV flag for io_uring
Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi Reviewed-by: Maxim Levitsky --- include/block/block.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/block/block.h b/include/block/block.h index 50a07c1c33..e29baa172c 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -121,6 +121,7 @@ typedef struct HDGeometry { ignoring the format layer */ #define BDRV_O_NO_IO 0x1 /* don't initialize for I/O */ #define BDRV_O_AUTO_RDONLY 0x2 /* degrade to read-only if opening read-write fails */ +#define BDRV_O_IO_URING0x4 /* use io_uring instead of the thread pool */ #define BDRV_O_CACHE_MASK (BDRV_O_NOCACHE | BDRV_O_NO_FLUSH) -- 2.21.0
[Qemu-devel] [PATCH v6 13/14] qemu-nbd: adds option for aio engines
Signed-off-by: Aarushi Mehta --- qemu-nbd.c | 12 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/qemu-nbd.c b/qemu-nbd.c index a8cb39e510..e5a71b3501 100644 --- a/qemu-nbd.c +++ b/qemu-nbd.c @@ -135,7 +135,7 @@ static void usage(const char *name) "'[ID_OR_NAME]'\n" " -n, --nocache disable host cache\n" " --cache=MODE set cache mode (none, writeback, ...)\n" -" --aio=MODEset AIO mode (native or threads)\n" +" --aio=MODEset AIO mode (native, io_uring or threads)\n" " --discard=MODEset discard mode (ignore, unmap)\n" " --detect-zeroes=MODE set detect-zeroes mode (off, on, unmap)\n" " --image-opts treat FILE as a full set of image options\n" @@ -718,13 +718,9 @@ int main(int argc, char **argv) exit(EXIT_FAILURE); } seen_aio = true; -if (!strcmp(optarg, "native")) { -flags |= BDRV_O_NATIVE_AIO; -} else if (!strcmp(optarg, "threads")) { -/* this is the default */ -} else { - error_report("invalid aio mode `%s'", optarg); - exit(EXIT_FAILURE); +if (bdrv_parse_aio(optarg, ) < 0) { +error_report("Invalid aio mode `%s'", optarg); +exit(EXIT_FAILURE); } break; case QEMU_NBD_OPT_DISCARD: -- 2.21.0
[Qemu-devel] [PATCH v6 10/14] block/io_uring: adds userspace completion polling
Signed-off-by: Aarushi Mehta --- block/io_uring.c | 17 - 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/block/io_uring.c b/block/io_uring.c index e9c1dc1dc7..de2a037151 100644 --- a/block/io_uring.c +++ b/block/io_uring.c @@ -237,6 +237,21 @@ static void qemu_luring_completion_cb(void *opaque) luring_process_completions_and_submit(s); } +static bool qemu_luring_poll_cb(void *opaque) +{ +LuringState *s = opaque; +struct io_uring_cqe *cqes; + +if (io_uring_peek_cqe(>ring, ) == 0) { +if (cqes) { +luring_process_completions_and_submit(s); +return true; +} +} + +return false; +} + static void ioq_init(LuringQueue *io_q) { QSIMPLEQ_INIT(_q->sq_overflow); @@ -394,7 +409,7 @@ void luring_attach_aio_context(LuringState *s, AioContext *new_context) s->aio_context = new_context; s->completion_bh = aio_bh_new(new_context, qemu_luring_completion_bh, s); aio_set_fd_handler(s->aio_context, s->ring.ring_fd, false, - qemu_luring_completion_cb, NULL, NULL, s); + qemu_luring_completion_cb, NULL, qemu_luring_poll_cb, s); } LuringState *luring_init(Error **errp) -- 2.21.0
[Qemu-devel] [PATCH v6 14/14] qemu-iotest: enable testing with qemu-io aio options
Signed-off-by: Aarushi Mehta --- tests/qemu-iotests/check | 14 +- tests/qemu-iotests/common.rc | 10 ++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/tests/qemu-iotests/check b/tests/qemu-iotests/check index f925606cc5..2bc5bc62e2 100755 --- a/tests/qemu-iotests/check +++ b/tests/qemu-iotests/check @@ -132,6 +132,7 @@ sortme=false expunge=true have_test_arg=false cachemode=false +aiomode=false tmp="${TEST_DIR}"/$$ rm -f $tmp.list $tmp.tmp $tmp.sed @@ -141,6 +142,7 @@ export IMGFMT_GENERIC=true export IMGPROTO=file export IMGOPTS="" export CACHEMODE="writeback" +export AIOMODE="threads" export QEMU_IO_OPTIONS="" export QEMU_IO_OPTIONS_NO_FMT="" export CACHEMODE_IS_DEFAULT=true @@ -225,6 +227,10 @@ s/ .*//p CACHEMODE_IS_DEFAULT=false cachemode=false continue +elif $aiomode +then +AIOMODE="$r" +aiomode=false fi xpand=true @@ -269,6 +275,7 @@ other options -n show me, do not run tests -o options -o options to pass to qemu-img create/convert -c mode cache mode +-i mode AIO mode -makecheck pretty print output for make check testlist options @@ -433,10 +440,13 @@ testlist options cachemode=true xpand=false ;; +-i) +aiomode=true +xpand=false +;; -T)# deprecated timestamp option xpand=false ;; - -v) verbose=true xpand=false @@ -515,6 +525,8 @@ done # Set qemu-io cache mode with $CACHEMODE we have QEMU_IO_OPTIONS="$QEMU_IO_OPTIONS --cache $CACHEMODE" +# Set qemu-io aio mode with $AIOMODE we have +QEMU_IO_OPTIONS="$QEMU_IO_OPTIONS --aio $AIOMODE" QEMU_IO_OPTIONS_NO_FMT="$QEMU_IO_OPTIONS" if [ "$IMGOPTSSYNTAX" != "true" ]; then diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc index 5502c3da2f..39c04cc169 100644 --- a/tests/qemu-iotests/common.rc +++ b/tests/qemu-iotests/common.rc @@ -482,6 +482,16 @@ _supported_cache_modes() _notrun "not suitable for cache mode: $CACHEMODE" } +_supported_aio_modes() +{ +for mode; do +if [ "$mode" = "$AIOMODE" ]; then +return +fi +done +_notrun "not suitable for aio mode: $AIOMODE" +} + 
_default_cache_mode() { if $CACHEMODE_IS_DEFAULT; then -- 2.21.0
[Qemu-devel] [PATCH v6 07/14] blockdev: accept io_uring as option
Signed-off-by: Aarushi Mehta --- blockdev.c | 12 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/blockdev.c b/blockdev.c index 4d141e9a1f..695b3a803b 100644 --- a/blockdev.c +++ b/blockdev.c @@ -383,13 +383,9 @@ static void extract_common_blockdev_options(QemuOpts *opts, int *bdrv_flags, } if ((aio = qemu_opt_get(opts, "aio")) != NULL) { -if (!strcmp(aio, "native")) { -*bdrv_flags |= BDRV_O_NATIVE_AIO; -} else if (!strcmp(aio, "threads")) { -/* this is the default */ -} else { - error_setg(errp, "invalid aio option"); - return; +if (bdrv_parse_aio(aio, bdrv_flags) < 0) { +error_setg(errp, "invalid aio option"); +return; } } } @@ -4574,7 +4570,7 @@ QemuOptsList qemu_common_drive_opts = { },{ .name = "aio", .type = QEMU_OPT_STRING, -.help = "host AIO implementation (threads, native)", +.help = "host AIO implementation (threads, native, io_uring)", },{ .name = BDRV_OPT_CACHE_WB, .type = QEMU_OPT_BOOL, -- 2.21.0
[Qemu-devel] [PATCH v6 11/14] qemu-io: adds option to use aio engine
Use -i AIOMODE instead of -k. Signed-off-by: Aarushi Mehta --- qemu-io.c | 25 - 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/qemu-io.c b/qemu-io.c index f64eca6940..3cee06248e 100644 --- a/qemu-io.c +++ b/qemu-io.c @@ -130,7 +130,7 @@ static void open_help(void) " -C, -- use copy-on-read\n" " -n, -- disable host cache, short for -t none\n" " -U, -- force shared permissions\n" -" -k, -- use kernel AIO implementation (on Linux only)\n" +" -i, -- use AIO mode (threads, native or io_uring)" " -t, -- use the given cache mode for the image\n" " -d, -- use the given discard mode for the image\n" " -o, -- options to be given to the block driver" @@ -187,9 +187,6 @@ static int open_f(BlockBackend *blk, int argc, char **argv) case 'r': readonly = 1; break; -case 'k': -flags |= BDRV_O_NATIVE_AIO; -break; case 't': if (bdrv_parse_cache_mode(optarg, , ) < 0) { error_report("Invalid cache option: %s", optarg); @@ -204,6 +201,13 @@ static int open_f(BlockBackend *blk, int argc, char **argv) return -EINVAL; } break; +case 'i': +if (bdrv_parse_aio(optarg, ) < 0) { +error_report("Invalid aio option: %s", optarg); +qemu_opts_reset(_opts); +return -EINVAL; +} +break; case 'o': if (imageOpts) { printf("--image-opts and 'open -o' are mutually exclusive\n"); @@ -291,7 +295,7 @@ static void usage(const char *name) " -n, --nocachedisable host cache, short for -t none\n" " -C, --copy-on-read enable copy-on-read\n" " -m, --misalign misalign allocations for O_DIRECT\n" -" -k, --native-aio use kernel AIO implementation (on Linux only)\n" +" -i, --aio=MODE use AIO mode (threads, native or io_uring)\n" " -t, --cache=MODE use the given cache mode for the image\n" " -d, --discard=MODE use the given discard mode for the image\n" " -T, --trace [[enable=]][,events=][,file=]\n" @@ -489,7 +493,7 @@ static QemuOptsList file_opts = { int main(int argc, char **argv) { int readonly = 0; -const char *sopt = "hVc:d:f:rsnCmkt:T:U"; +const char *sopt = "hVc:d:f:rsnCmit:T:U"; const struct 
option lopt[] = { { "help", no_argument, NULL, 'h' }, { "version", no_argument, NULL, 'V' }, @@ -500,7 +504,7 @@ int main(int argc, char **argv) { "nocache", no_argument, NULL, 'n' }, { "copy-on-read", no_argument, NULL, 'C' }, { "misalign", no_argument, NULL, 'm' }, -{ "native-aio", no_argument, NULL, 'k' }, +{ "aio", required_argument, NULL, 'i' }, { "discard", required_argument, NULL, 'd' }, { "cache", required_argument, NULL, 't' }, { "trace", required_argument, NULL, 'T' }, @@ -565,8 +569,11 @@ int main(int argc, char **argv) case 'm': qemuio_misalign = true; break; -case 'k': -flags |= BDRV_O_NATIVE_AIO; +case 'i': +if (bdrv_parse_aio(optarg, ) < 0) { +error_report("Invalid aio option: %s", optarg); +exit(1); +} break; case 't': if (bdrv_parse_cache_mode(optarg, , ) < 0) { -- 2.21.0
[Qemu-devel] [PATCH v6 06/14] util/async: add aio interfaces for io_uring
Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi --- util/async.c | 36 1 file changed, 36 insertions(+) diff --git a/util/async.c b/util/async.c index 8d2105729c..2e0a5e20ac 100644 --- a/util/async.c +++ b/util/async.c @@ -276,6 +276,14 @@ aio_ctx_finalize(GSource *source) } #endif +#ifdef CONFIG_LINUX_IO_URING +if (ctx->linux_io_uring) { +luring_detach_aio_context(ctx->linux_io_uring, ctx); +luring_cleanup(ctx->linux_io_uring); +ctx->linux_io_uring = NULL; +} +#endif + assert(QSLIST_EMPTY(>scheduled_coroutines)); qemu_bh_delete(ctx->co_schedule_bh); @@ -340,6 +348,29 @@ LinuxAioState *aio_get_linux_aio(AioContext *ctx) } #endif +#ifdef CONFIG_LINUX_IO_URING +LuringState *aio_setup_linux_io_uring(AioContext *ctx, Error **errp) +{ +if (ctx->linux_io_uring) { +return ctx->linux_io_uring; +} + +ctx->linux_io_uring = luring_init(errp); +if (!ctx->linux_io_uring) { +return NULL; +} + +luring_attach_aio_context(ctx->linux_io_uring, ctx); +return ctx->linux_io_uring; +} + +LuringState *aio_get_linux_io_uring(AioContext *ctx) +{ +assert(ctx->linux_io_uring); +return ctx->linux_io_uring; +} +#endif + void aio_notify(AioContext *ctx) { /* Write e.g. bh->scheduled before reading ctx->notify_me. Pairs @@ -431,6 +462,11 @@ AioContext *aio_context_new(Error **errp) #ifdef CONFIG_LINUX_AIO ctx->linux_aio = NULL; #endif + +#ifdef CONFIG_LINUX_IO_URING +ctx->linux_io_uring = NULL; +#endif + ctx->thread_pool = NULL; qemu_rec_mutex_init(>lock); timerlistgroup_init(>tlg, aio_timerlist_notify, ctx); -- 2.21.0
[Qemu-devel] [PATCH v6 12/14] qemu-img: adds option to use aio engine
Replace the -n flag of img_bench, which could only select native AIO, with -i AIOMODE, which is parsed by bdrv_parse_aio() to select the AIO engine. Signed-off-by: Aarushi Mehta --- qemu-img.c | 11 --- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/qemu-img.c b/qemu-img.c index 79983772de..b7a962afff 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -4192,7 +4192,7 @@ static int img_bench(int argc, char **argv) {"force-share", no_argument, 0, 'U'}, {0, 0, 0, 0} }; -c = getopt_long(argc, argv, ":hc:d:f:no:qs:S:t:wU", long_options, NULL); +c = getopt_long(argc, argv, ":hc:d:f:io:qs:S:t:wU", long_options, NULL); if (c == -1) { break; } @@ -4232,8 +4232,13 @@ static int img_bench(int argc, char **argv) case 'f': fmt = optarg; break; -case 'n': -flags |= BDRV_O_NATIVE_AIO; +case 'i': +ret = bdrv_parse_aio(optarg, ); +if (ret < 0) { +error_report("Invalid aio option: %s", optarg); +ret = -1; +goto out; +} break; case 'o': { -- 2.21.0
[Qemu-devel] [PATCH v6 08/14] block/file-posix.c: extend to use io_uring
Signed-off-by: Aarushi Mehta Reviewed-by: Maxim Levitsky --- block/file-posix.c | 99 -- 1 file changed, 79 insertions(+), 20 deletions(-) diff --git a/block/file-posix.c b/block/file-posix.c index 4479cc7ab4..4aa42f826f 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -156,6 +156,7 @@ typedef struct BDRVRawState { bool has_write_zeroes:1; bool discard_zeroes:1; bool use_linux_aio:1; +bool use_linux_io_uring:1; bool page_cache_inconsistent:1; bool has_fallocate; bool needs_alignment; @@ -425,7 +426,7 @@ static QemuOptsList raw_runtime_opts = { { .name = "aio", .type = QEMU_OPT_STRING, -.help = "host AIO implementation (threads, native)", +.help = "host AIO implementation (threads, native, io_uring)", }, { .name = "locking", @@ -484,9 +485,15 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, goto fail; } -aio_default = (bdrv_flags & BDRV_O_NATIVE_AIO) - ? BLOCKDEV_AIO_OPTIONS_NATIVE - : BLOCKDEV_AIO_OPTIONS_THREADS; +if (bdrv_flags & BDRV_O_NATIVE_AIO) { +aio_default = BLOCKDEV_AIO_OPTIONS_NATIVE; +#ifdef CONFIG_LINUX_IO_URING +} else if (bdrv_flags & BDRV_O_IO_URING) { +aio_default = BLOCKDEV_AIO_OPTIONS_IO_URING; +#endif +} else { +aio_default = BLOCKDEV_AIO_OPTIONS_THREADS; +} aio = qapi_enum_parse(_lookup, qemu_opt_get(opts, "aio"), aio_default, _err); @@ -495,7 +502,11 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, ret = -EINVAL; goto fail; } + s->use_linux_aio = (aio == BLOCKDEV_AIO_OPTIONS_NATIVE); +#ifdef CONFIG_LINUX_IO_URING +s->use_linux_io_uring = (aio == BLOCKDEV_AIO_OPTIONS_IO_URING); +#endif locking = qapi_enum_parse(_lookup, qemu_opt_get(opts, "locking"), @@ -559,7 +570,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, s->shared_perm = BLK_PERM_ALL; #ifdef CONFIG_LINUX_AIO - /* Currently Linux does AIO only for files opened with O_DIRECT */ +/* Currently Linux does AIO only for files opened with O_DIRECT */ if (s->use_linux_aio) { if (!(s->open_flags & O_DIRECT)) { error_setg(errp, 
"aio=native was specified, but it requires " @@ -581,6 +592,22 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, } #endif /* !defined(CONFIG_LINUX_AIO) */ +#ifdef CONFIG_LINUX_IO_URING +if (s->use_linux_io_uring) { +if (!aio_setup_linux_io_uring(bdrv_get_aio_context(bs), errp)) { +error_prepend(errp, "Unable to use io_uring: "); +goto fail; +} +} +#else +if (s->use_linux_io_uring) { +error_setg(errp, "aio=io_uring was specified, but is not supported " + "in this build."); +ret = -EINVAL; +goto fail; +} +#endif /* !defined(CONFIG_LINUX_IO_URING) */ + s->has_discard = true; s->has_write_zeroes = true; if ((bs->open_flags & BDRV_O_NOCACHE) != 0) { @@ -1874,21 +1901,25 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset, return -EIO; /* - * Check if the underlying device requires requests to be aligned, - * and if the request we are trying to submit is aligned or not. - * If this is the case tell the low-level driver that it needs - * to copy the buffer. + * When using O_DIRECT, the request must be aligned to be able to use + * either libaio or io_uring interface. If not fail back to regular thread + * pool read/write code which emulates this for us if we + * set QEMU_AIO_MISALIGNED. 
*/ -if (s->needs_alignment) { -if (!bdrv_qiov_is_aligned(bs, qiov)) { -type |= QEMU_AIO_MISALIGNED; +if (s->needs_alignment && !bdrv_qiov_is_aligned(bs, qiov)) { +type |= QEMU_AIO_MISALIGNED; +#ifdef CONFIG_LINUX_IO_URING +} else if (s->use_linux_io_uring) { +LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs)); +assert(qiov->size == bytes); +return luring_co_submit(bs, aio, s->fd, offset, qiov, type); +#endif #ifdef CONFIG_LINUX_AIO -} else if (s->use_linux_aio) { -LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs)); -assert(qiov->size == bytes); -return laio_co_submit(bs, aio, s->fd, offset, qiov, type); +} else if (s->use_linux_aio) { +LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs)); +assert(qiov->size == bytes); +return laio_co_submit(bs, aio,
[Qemu-devel] [PATCH v6 04/14] block/io_uring: implements interfaces for io_uring
Aborts when sqe fails to be set as sqes cannot be returned to the ring. Adds slow path for short reads for older kernels Signed-off-by: Aarushi Mehta Signed-off-by: Stefan Hajnoczi --- MAINTAINERS | 7 + block.c | 22 +++ block/Makefile.objs | 3 + block/io_uring.c| 408 include/block/aio.h | 16 +- include/block/block.h | 1 + include/block/raw-aio.h | 12 ++ 7 files changed, 468 insertions(+), 1 deletion(-) create mode 100644 block/io_uring.c diff --git a/MAINTAINERS b/MAINTAINERS index cc9636b43a..651f417f4b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2531,6 +2531,13 @@ F: block/file-posix.c F: block/file-win32.c F: block/win32-aio.c +Linux io_uring +M: Aarushi Mehta +R: Stefan Hajnoczi +L: qemu-bl...@nongnu.org +S: Maintained +F: block/io_uring.c + qcow2 M: Kevin Wolf M: Max Reitz diff --git a/block.c b/block.c index 29e931e217..4aa3500ad8 100644 --- a/block.c +++ b/block.c @@ -844,6 +844,28 @@ static BlockdevDetectZeroesOptions bdrv_parse_detect_zeroes(QemuOpts *opts, return detect_zeroes; } +/** + * Set flags for aio engine + * + * Return 0 on success, -1 if the engine specifies is invalid + */ +int bdrv_parse_aio(const char *mode, int *flags) +{ +if (!strcmp(mode, "threads")) { +/* do nothing, default */ +} else if (!strcmp(mode, "native")) { +*flags |= BDRV_O_NATIVE_AIO; +#ifdef CONFIG_LINUX_IO_URING +} else if (!strcmp(mode, "io_uring")) { +*flags |= BDRV_O_IO_URING; +#endif +} else { +return -1; +} + +return 0; +} + /** * Set open flags for a given discard mode * diff --git a/block/Makefile.objs b/block/Makefile.objs index 35f3bca4d9..6a548af8ed 100644 --- a/block/Makefile.objs +++ b/block/Makefile.objs @@ -18,6 +18,7 @@ block-obj-y += block-backend.o snapshot.o qapi.o block-obj-$(CONFIG_WIN32) += file-win32.o win32-aio.o block-obj-$(CONFIG_POSIX) += file-posix.o block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o +block-obj-$(CONFIG_LINUX_IO_URING) += io_uring.o block-obj-y += null.o mirror.o commit.o io.o create.o block-obj-y += throttle-groups.o 
block-obj-$(CONFIG_LINUX) += nvme.o @@ -61,5 +62,7 @@ block-obj-$(if $(CONFIG_LZFSE),m,n) += dmg-lzfse.o dmg-lzfse.o-libs := $(LZFSE_LIBS) qcow.o-libs:= -lz linux-aio.o-libs := -laio +io_uring.o-cflags := $(LINUX_IO_URING_CFLAGS) +io_uring.o-libs:= $(LINUX_IO_URING_LIBS) parallels.o-cflags := $(LIBXML2_CFLAGS) parallels.o-libs := $(LIBXML2_LIBS) diff --git a/block/io_uring.c b/block/io_uring.c new file mode 100644 index 00..e8dbc388a6 --- /dev/null +++ b/block/io_uring.c @@ -0,0 +1,408 @@ +/* + * Linux io_uring support. + * + * Copyright (C) 2009 IBM, Corp. + * Copyright (C) 2009 Red Hat, Inc. + * Copyright (C) 2019 Aarushi Mehta + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ +#include "qemu/osdep.h" +#include +#include "qemu-common.h" +#include "block/aio.h" +#include "qemu/queue.h" +#include "block/block.h" +#include "block/raw-aio.h" +#include "qemu/coroutine.h" +#include "qapi/error.h" + +#define MAX_EVENTS 128 + +typedef struct LuringAIOCB { +Coroutine *co; +struct io_uring_sqe sqeq; +ssize_t ret; +QEMUIOVector *qiov; +bool is_read; +QSIMPLEQ_ENTRY(LuringAIOCB) next; + +/* + * Buffered reads may require resubmission, see + * luring_resubmit_short_read(). + */ +int total_read; +QEMUIOVector resubmit_qiov; +} LuringAIOCB; + +typedef struct LuringQueue { +int plugged; +unsigned int in_queue; +unsigned int in_flight; +bool blocked; +QSIMPLEQ_HEAD(, LuringAIOCB) sq_overflow; +} LuringQueue; + +typedef struct LuringState { +AioContext *aio_context; + +struct io_uring ring; + +/* io queue for submit at batch. Protected by AioContext lock. */ +LuringQueue io_q; + +/* I/O completion processing. Only runs in I/O thread. 
*/ +QEMUBH *completion_bh; +} LuringState; + +/** + * ioq_submit: + * @s: AIO state + * + * Queues pending sqes and submits them + * + */ +static int ioq_submit(LuringState *s); + +/** + * luring_resubmit: + * + * Resubmit a request by appending it to sq_overflow. The caller must ensure + * that ioq_submit() is called later so that sq_overflow requests are started. + */ +static void luring_resubmit(LuringState *s, LuringAIOCB *luringcb) +{ +QSIMPLEQ_INSERT_TAIL(>io_q.sq_overflow, luringcb, next); +s->io_q.in_queue++; +} + +/** + * luring_resubmit_short_read: + * + * Before Linux commit 9d93a3f5a0c ("io_uring: punt short reads to async + * context") a buffered I/O request with the start of the file range in the + * page cache could result in a short read. Applications need to resubmit the + * remaining read r
[Qemu-devel] [PATCH v6 09/14] block: add trace events for io_uring
Signed-off-by: Aarushi Mehta --- block/io_uring.c | 22 +++--- block/trace-events | 12 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/block/io_uring.c b/block/io_uring.c index e8dbc388a6..e9c1dc1dc7 100644 --- a/block/io_uring.c +++ b/block/io_uring.c @@ -17,6 +17,7 @@ #include "block/raw-aio.h" #include "qemu/coroutine.h" #include "qapi/error.h" +#include "trace.h" #define MAX_EVENTS 128 @@ -93,6 +94,8 @@ static void luring_resubmit_short_read(LuringState *s, LuringAIOCB *luringcb, QEMUIOVector *resubmit_qiov; size_t remaining; +trace_luring_resubmit_short_read(s, luringcb, nread); + /* Update read position */ luringcb->total_read = nread; remaining = luringcb->qiov->size - luringcb->total_read; @@ -163,6 +166,7 @@ static void luring_process_completions(LuringState *s) /* Change counters one-by-one because we can be nested. */ s->io_q.in_flight--; +trace_luring_process_completion(s, luringcb, ret); /* total_read is non-zero only for resubmitted read requests */ total_bytes = ret + luringcb->total_read; @@ -263,6 +267,7 @@ static int ioq_submit(LuringState *s) QSIMPLEQ_REMOVE_HEAD(>io_q.sq_overflow, next); } ret = io_uring_submit(>ring); +trace_luring_io_uring_submit(s, ret); /* Prevent infinite loop if submission is refused */ if (ret <= 0) { if (ret == -EAGAIN) { @@ -287,12 +292,15 @@ static int ioq_submit(LuringState *s) void luring_io_plug(BlockDriverState *bs, LuringState *s) { +trace_luring_io_plug(s); s->io_q.plugged++; } void luring_io_unplug(BlockDriverState *bs, LuringState *s) { assert(s->io_q.plugged); +trace_luring_io_unplug(s, s->io_q.blocked, s->io_q.plugged, + s->io_q.in_queue, s->io_q.in_flight); if (--s->io_q.plugged == 0 && !s->io_q.blocked && s->io_q.in_queue > 0) { ioq_submit(s); @@ -313,6 +321,7 @@ void luring_io_unplug(BlockDriverState *bs, LuringState *s) static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s, uint64_t offset, int type) { +int ret; struct io_uring_sqe *sqes = >sqeq; switch (type) { @@ 
-336,11 +345,14 @@ static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s, QSIMPLEQ_INSERT_TAIL(>io_q.sq_overflow, luringcb, next); s->io_q.in_queue++; - +trace_luring_do_submit(s, s->io_q.blocked, s->io_q.plugged, + s->io_q.in_queue, s->io_q.in_flight); if (!s->io_q.blocked && (!s->io_q.plugged || s->io_q.in_flight + s->io_q.in_queue >= MAX_EVENTS)) { -return ioq_submit(s); +ret = ioq_submit(s); +trace_luring_do_submit_done(s, ret); +return ret; } return 0; } @@ -355,8 +367,10 @@ int coroutine_fn luring_co_submit(BlockDriverState *bs, LuringState *s, int fd, .qiov = qiov, .is_read= (type == QEMU_AIO_READ), }; - +trace_luring_co_submit(bs, s, , fd, offset, qiov ? qiov->size : 0, + type); ret = luring_do_submit(fd, , s, offset, type); + if (ret < 0) { return ret; } @@ -388,6 +402,7 @@ LuringState *luring_init(Error **errp) int rc; LuringState *s; s = g_new0(LuringState, 1); +trace_luring_init_state(s, sizeof(*s)); struct io_uring *ring = >ring; rc = io_uring_queue_init(MAX_EVENTS, ring, 0); if (rc < 0) { @@ -405,4 +420,5 @@ void luring_cleanup(LuringState *s) { io_uring_queue_exit(>ring); g_free(s); +trace_luring_cleanup_state(s); } diff --git a/block/trace-events b/block/trace-events index d724df0117..5060afb2f7 100644 --- a/block/trace-events +++ b/block/trace-events @@ -60,6 +60,18 @@ qmp_block_stream(void *bs) "bs %p" file_paio_submit(void *acb, void *opaque, int64_t offset, int count, int type) "acb %p opaque %p offset %"PRId64" count %d type %d" file_copy_file_range(void *bs, int src, int64_t src_off, int dst, int64_t dst_off, int64_t bytes, int flags, int64_t ret) "bs %p src_fd %d offset %"PRIu64" dst_fd %d offset %"PRIu64" bytes %"PRIu64" flags %d ret %"PRId64 +#io_uring.c +luring_init_state(void *s, size_t size) "s %p size %zu" +luring_cleanup_state(void *s) "%p freed" +luring_io_plug(void *s) "LuringState %p plug" +luring_io_unplug(void *s, int blocked, int plugged, int queued, int inflight) "LuringState %p blocked %d plugged %d queued 
%d inflight %d" +luring_do_submit(void *s, int blocked, int plugged, int queued, int inflight) "LuringState %p blocked %d
[Qemu-devel] [PATCH v6 05/14] stubs: add stubs for io_uring interface
Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi --- MAINTAINERS | 1 + stubs/Makefile.objs | 1 + stubs/io_uring.c| 32 3 files changed, 34 insertions(+) create mode 100644 stubs/io_uring.c diff --git a/MAINTAINERS b/MAINTAINERS index 651f417f4b..b996bc3abd 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2537,6 +2537,7 @@ R: Stefan Hajnoczi L: qemu-bl...@nongnu.org S: Maintained F: block/io_uring.c +F: stubs/io_uring.c qcow2 M: Kevin Wolf diff --git a/stubs/Makefile.objs b/stubs/Makefile.objs index 9c7393b08c..5cf160a9c8 100644 --- a/stubs/Makefile.objs +++ b/stubs/Makefile.objs @@ -13,6 +13,7 @@ stub-obj-y += iothread.o stub-obj-y += iothread-lock.o stub-obj-y += is-daemonized.o stub-obj-$(CONFIG_LINUX_AIO) += linux-aio.o +stub-obj-$(CONFIG_LINUX_IO_URING) += io_uring.o stub-obj-y += machine-init-done.o stub-obj-y += migr-blocker.o stub-obj-y += change-state-handler.o diff --git a/stubs/io_uring.c b/stubs/io_uring.c new file mode 100644 index 00..622d1e4648 --- /dev/null +++ b/stubs/io_uring.c @@ -0,0 +1,32 @@ +/* + * Linux io_uring support. + * + * Copyright (C) 2009 IBM, Corp. + * Copyright (C) 2009 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ +#include "qemu/osdep.h" +#include "block/aio.h" +#include "block/raw-aio.h" + +void luring_detach_aio_context(LuringState *s, AioContext *old_context) +{ +abort(); +} + +void luring_attach_aio_context(LuringState *s, AioContext *new_context) +{ +abort(); +} + +LuringState *luring_init(Error **errp) +{ +abort(); +} + +void luring_cleanup(LuringState *s) +{ +abort(); +} -- 2.21.0
[Qemu-devel] [PATCH v6 03/14] block/block: add BDRV flag for io_uring
Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi Reviewed-by: Maxim Levitsky --- include/block/block.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/block/block.h b/include/block/block.h index 734c9d2f76..40bd93cd0f 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -121,6 +121,7 @@ typedef struct HDGeometry { ignoring the format layer */ #define BDRV_O_NO_IO 0x1 /* don't initialize for I/O */ #define BDRV_O_AUTO_RDONLY 0x2 /* degrade to read-only if opening read-write fails */ +#define BDRV_O_IO_URING0x4 /* use io_uring instead of the thread pool */ #define BDRV_O_CACHE_MASK (BDRV_O_NOCACHE | BDRV_O_NO_FLUSH) -- 2.21.0
[Qemu-devel] [PATCH v6 02/14] qapi/block-core: add option for io_uring
Only enumerates option for devices that support it Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi --- qapi/block-core.json | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/qapi/block-core.json b/qapi/block-core.json index 0d43d4f37c..0a3d4ae7d2 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -2792,11 +2792,13 @@ # # @threads: Use qemu's thread pool # @native: Use native AIO backend (only Linux and Windows) +# @io_uring:Use linux io_uring (since 4.1) # # Since: 2.9 ## { 'enum': 'BlockdevAioOptions', - 'data': [ 'threads', 'native' ] } + 'data': [ 'threads', 'native', +{ 'name': 'io_uring', 'if': 'defined(CONFIG_LINUX_IO_URING)' } ] } ## # @BlockdevCacheOptions: -- 2.21.0
[Qemu-devel] [PATCH v6 01/14] configure: permit use of io_uring
Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi Reviewed-by: Maxim Levitsky --- configure | 27 +++ 1 file changed, 27 insertions(+) diff --git a/configure b/configure index eb635c3b9a..b0e2e2158e 100755 --- a/configure +++ b/configure @@ -370,6 +370,7 @@ xen="" xen_ctrl_version="" xen_pci_passthrough="" linux_aio="" +linux_io_uring="" cap_ng="" attr="" libattr="" @@ -1271,6 +1272,10 @@ for opt do ;; --enable-linux-aio) linux_aio="yes" ;; + --disable-linux-io-uring) linux_io_uring="no" + ;; + --enable-linux-io-uring) linux_io_uring="yes" + ;; --disable-attr) attr="no" ;; --enable-attr) attr="yes" @@ -1789,6 +1794,7 @@ disabled with --disable-FEATURE, default is enabled if available: vde support for vde network netmap support for netmap network linux-aio Linux AIO support + linux-io-uring Linux io_uring support cap-ng libcap-ng support attrattr and xattr support vhost-net vhost-net kernel acceleration support @@ -3969,6 +3975,21 @@ EOF linux_aio=no fi fi +## +# linux-io-uring probe + +if test "$linux_io_uring" != "no" ; then + if $pkg_config liburing; then +linux_io_uring_cflags=$($pkg_config --cflags liburing) +linux_io_uring_libs=$($pkg_config --libs liburing) +linux_io_uring=yes + else +if test "$linux_io_uring" = "yes" ; then + feature_not_found "linux io_uring" "Install liburing devel" +fi +linux_io_uring=no + fi +fi ## # TPM emulation is only on POSIX @@ -6392,6 +6413,7 @@ echo "PIE $pie" echo "vde support $vde" echo "netmap support$netmap" echo "Linux AIO support $linux_aio" +echo "Linux io_uring support $linux_io_uring" echo "ATTR/XATTR support $attr" echo "Install blobs $blobs" echo "KVM support $kvm" @@ -6878,6 +6900,11 @@ fi if test "$linux_aio" = "yes" ; then echo "CONFIG_LINUX_AIO=y" >> $config_host_mak fi +if test "$linux_io_uring" = "yes" ; then + echo "CONFIG_LINUX_IO_URING=y" >> $config_host_mak + echo "LINUX_IO_URING_CFLAGS=$linux_io_uring_cflags" >> $config_host_mak + echo "LINUX_IO_URING_LIBS=$linux_io_uring_libs" >> $config_host_mak +fi 
if test "$attr" = "yes" ; then echo "CONFIG_ATTR=y" >> $config_host_mak fi -- 2.21.0
[Qemu-devel] [PATCH v6 00/14] Add support for io_uring
This patch series adds support for the newly developed io_uring Linux AIO interface. Linux io_uring is faster than Linux's AIO asynchronous I/O code, offers efficient buffered asynchronous I/O support, the ability to do I/O without performing a system call via polled I/O, and other efficiency enhancements. Testing it requires a host kernel (5.1+) and the liburing library. Use the option -drive aio=io_uring to enable it. Benchmarks for the system are available at https://github.com/rooshm/benchmarks. io_uring has similar performance to libaio but supports cache=writeback. Further performance enhancements will be implemented. There is currently an -EIO output when guests are booted from io_uring disks for the second time with clean shutdowns; this is being investigated. v6: - add slow path for short-read - hooks up fsync - enables qemu-iotests with aio options - adds bdrv_parse_aio v5: - Adds completion polling - Extends qemu-io - Adds qemu-iotest v4: - Add error handling - Add trace events - Remove aio submission based code Aarushi Mehta (14): configure: permit use of io_uring qapi/block-core: add option for io_uring block/block: add BDRV flag for io_uring block/io_uring: implements interfaces for io_uring stubs: add stubs for io_uring interface util/async: add aio interfaces for io_uring blockdev: accept io_uring as option block/file-posix.c: extend to use io_uring block: add trace events for io_uring block/io_uring: adds userspace completion polling qemu-io: adds option to use aio engine qemu-img: adds option to use aio engine qemu-nbd: adds option for aio engines tests/qemu-iotest: enable testing with qemu-io aio options MAINTAINERS | 8 + block.c | 22 ++ block/Makefile.objs | 3 + block/file-posix.c | 99 ++-- block/io_uring.c | 439 +++ block/trace-events | 12 + blockdev.c | 12 +- configure| 27 +++ include/block/aio.h | 16 +- include/block/block.h| 2 + include/block/raw-aio.h | 12 + qapi/block-core.json | 4 +- qemu-img.c | 11 +- qemu-io.c| 25 +- qemu-nbd.c | 12 +- 
stubs/Makefile.objs | 1 + stubs/io_uring.c | 32 +++ tests/qemu-iotests/check | 14 +- tests/qemu-iotests/common.rc | 10 + util/async.c | 36 +++ 20 files changed, 746 insertions(+), 51 deletions(-) create mode 100644 block/io_uring.c create mode 100644 stubs/io_uring.c -- 2.21.0
[Qemu-devel] [PATCH v6 01/14] configure: permit use of io_uring
Signed-off-by: Aarushi Mehta --- configure | 27 +++ 1 file changed, 27 insertions(+) diff --git a/configure b/configure index eb635c3b9a..b0e2e2158e 100755 --- a/configure +++ b/configure @@ -370,6 +370,7 @@ xen="" xen_ctrl_version="" xen_pci_passthrough="" linux_aio="" +linux_io_uring="" cap_ng="" attr="" libattr="" @@ -1271,6 +1272,10 @@ for opt do ;; --enable-linux-aio) linux_aio="yes" ;; + --disable-linux-io-uring) linux_io_uring="no" + ;; + --enable-linux-io-uring) linux_io_uring="yes" + ;; --disable-attr) attr="no" ;; --enable-attr) attr="yes" @@ -1789,6 +1794,7 @@ disabled with --disable-FEATURE, default is enabled if available: vde support for vde network netmap support for netmap network linux-aio Linux AIO support + linux-io-uring Linux io_uring support cap-ng libcap-ng support attrattr and xattr support vhost-net vhost-net kernel acceleration support @@ -3969,6 +3975,21 @@ EOF linux_aio=no fi fi +## +# linux-io-uring probe + +if test "$linux_io_uring" != "no" ; then + if $pkg_config liburing; then +linux_io_uring_cflags=$($pkg_config --cflags liburing) +linux_io_uring_libs=$($pkg_config --libs liburing) +linux_io_uring=yes + else +if test "$linux_io_uring" = "yes" ; then + feature_not_found "linux io_uring" "Install liburing devel" +fi +linux_io_uring=no + fi +fi ## # TPM emulation is only on POSIX @@ -6392,6 +6413,7 @@ echo "PIE $pie" echo "vde support $vde" echo "netmap support$netmap" echo "Linux AIO support $linux_aio" +echo "Linux io_uring support $linux_io_uring" echo "ATTR/XATTR support $attr" echo "Install blobs $blobs" echo "KVM support $kvm" @@ -6878,6 +6900,11 @@ fi if test "$linux_aio" = "yes" ; then echo "CONFIG_LINUX_AIO=y" >> $config_host_mak fi +if test "$linux_io_uring" = "yes" ; then + echo "CONFIG_LINUX_IO_URING=y" >> $config_host_mak + echo "LINUX_IO_URING_CFLAGS=$linux_io_uring_cflags" >> $config_host_mak + echo "LINUX_IO_URING_LIBS=$linux_io_uring_libs" >> $config_host_mak +fi if test "$attr" = "yes" ; then echo "CONFIG_ATTR=y" >> 
$config_host_mak fi -- 2.21.0
[Qemu-devel] [PATCH v6 02/14] qapi/block-core: add option for io_uring
Only enumerates option for devices that support it Signed-off-by: Aarushi Mehta --- qapi/block-core.json | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/qapi/block-core.json b/qapi/block-core.json index 0d43d4f37c..0a3d4ae7d2 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -2792,11 +2792,13 @@ # # @threads: Use qemu's thread pool # @native: Use native AIO backend (only Linux and Windows) +# @io_uring:Use linux io_uring (since 4.1) # # Since: 2.9 ## { 'enum': 'BlockdevAioOptions', - 'data': [ 'threads', 'native' ] } + 'data': [ 'threads', 'native', +{ 'name': 'io_uring', 'if': 'defined(CONFIG_LINUX_IO_URING)' } ] } ## # @BlockdevCacheOptions: -- 2.21.0
[Qemu-devel] [PATCH v6 00/14] Add support for io_uring
This patch series adds support for the newly developed io_uring Linux AIO interface. Linux io_uring is faster than Linux's AIO asynchronous I/O code, offers efficient buffered asynchronous I/O support, the ability to do I/O without performing a system call via polled I/O, and other efficiency enhancements. Testing it requires a host kernel (5.1+) and the liburing library. Use the option -drive aio=io_uring to enable it. Benchmarks for the system are available at https://github.com/rooshm/benchmarks. io_uring has similar performance to libaio but supports cache=writeback. Further performance enhancements will be implemented. There is currently an -EIO output when guests are booted from io_uring disks for the second time with clean shutdowns; this is being investigated. v6: - add slow path for short-read - hooks up fsync - enables qemu-iotests with aio options - adds bdrv_parse_aio v5: - Adds completion polling - Extends qemu-io - Adds qemu-iotest v4: - Add error handling - Add trace events - Remove aio submission based code Aarushi Mehta (14): configure: permit use of io_uring qapi/block-core: add option for io_uring block/block: add BDRV flag for io_uring block/io_uring: implements interfaces for io_uring stubs: add stubs for io_uring interface util/async: add aio interfaces for io_uring blockdev: accept io_uring as option block/file-posix.c: extend to use io_uring block: add trace events for io_uring block/io_uring: adds userspace completion polling qemu-io: adds option to use aio engine qemu-img: adds option to use aio engine qemu-nbd: adds option for aio engines tests/qemu-iotest: enable testing with qemu-io aio options MAINTAINERS | 8 + block.c | 22 ++ block/Makefile.objs | 3 + block/file-posix.c | 99 ++-- block/io_uring.c | 439 +++ block/trace-events | 12 + blockdev.c | 12 +- configure| 27 +++ include/block/aio.h | 16 +- include/block/block.h| 2 + include/block/raw-aio.h | 12 + qapi/block-core.json | 4 +- qemu-img.c | 11 +- qemu-io.c| 25 +- qemu-nbd.c | 12 +- 
stubs/Makefile.objs | 1 + stubs/io_uring.c | 32 +++ tests/qemu-iotests/check | 14 +- tests/qemu-iotests/common.rc | 10 + util/async.c | 36 +++ 20 files changed, 746 insertions(+), 51 deletions(-) create mode 100644 block/io_uring.c create mode 100644 stubs/io_uring.c -- 2.21.0
[Qemu-devel] [PATCH v5 11/12] qemu-io: adds support for io_uring
Signed-off-by: Aarushi Mehta --- qemu-io.c | 13 + 1 file changed, 13 insertions(+) diff --git a/qemu-io.c b/qemu-io.c index 8d5d5911cb..54b82151c4 100644 --- a/qemu-io.c +++ b/qemu-io.c @@ -129,6 +129,7 @@ static void open_help(void) " -n, -- disable host cache, short for -t none\n" " -U, -- force shared permissions\n" " -k, -- use kernel AIO implementation (on Linux only)\n" +" -i -- use kernel io_uring (Linux 5.1+)\n" " -t, -- use the given cache mode for the image\n" " -d, -- use the given discard mode for the image\n" " -o, -- options to be given to the block driver" @@ -188,6 +189,11 @@ static int open_f(BlockBackend *blk, int argc, char **argv) case 'k': flags |= BDRV_O_NATIVE_AIO; break; +#ifdef CONFIG_LINUX_IO_URING +case 'i': +flags |= BDRV_O_IO_URING; +break; +#endif case 't': if (bdrv_parse_cache_mode(optarg, , ) < 0) { error_report("Invalid cache option: %s", optarg); @@ -290,6 +296,7 @@ static void usage(const char *name) " -C, --copy-on-read enable copy-on-read\n" " -m, --misalign misalign allocations for O_DIRECT\n" " -k, --native-aio use kernel AIO implementation (on Linux only)\n" +" -i --io_uring use kernel io_uring (Linux 5.1+)\n" " -t, --cache=MODE use the given cache mode for the image\n" " -d, --discard=MODE use the given discard mode for the image\n" " -T, --trace [[enable=]][,events=][,file=]\n" @@ -499,6 +506,7 @@ int main(int argc, char **argv) { "copy-on-read", no_argument, NULL, 'C' }, { "misalign", no_argument, NULL, 'm' }, { "native-aio", no_argument, NULL, 'k' }, +{ "io_uring", no_argument, NULL, 'i' }, { "discard", required_argument, NULL, 'd' }, { "cache", required_argument, NULL, 't' }, { "trace", required_argument, NULL, 'T' }, @@ -566,6 +574,11 @@ int main(int argc, char **argv) case 'k': flags |= BDRV_O_NATIVE_AIO; break; +#ifdef CONFIG_LINUX_IO_URING +case 'i': +flags |= BDRV_O_IO_URING; +break; +#endif case 't': if (bdrv_parse_cache_mode(optarg, , ) < 0) { error_report("Invalid cache option: %s", optarg); -- 2.17.1
[Qemu-devel] [PATCH v5 10/12] block/io_uring: adds userspace completion polling
Signed-off-by: Aarushi Mehta --- block/io_uring.c | 17 - 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/block/io_uring.c b/block/io_uring.c index 47e027364a..acfaa48151 100644 --- a/block/io_uring.c +++ b/block/io_uring.c @@ -142,6 +142,21 @@ static void qemu_luring_completion_cb(void *opaque) qemu_luring_process_completions_and_submit(s); } +static bool qemu_luring_poll_cb(void *opaque) +{ +LuringState *s = opaque; +struct io_uring_cqe *cqes; + +if (io_uring_peek_cqe(>ring, ) == 0) { +if (!cqes) { +qemu_luring_process_completions_and_submit(s); +return true; +} +} + +return false; +} + static void ioq_init(LuringQueue *io_q) { QSIMPLEQ_INIT(_q->sq_overflow); @@ -294,7 +309,7 @@ void luring_attach_aio_context(LuringState *s, AioContext *new_context) s->aio_context = new_context; s->completion_bh = aio_bh_new(new_context, qemu_luring_completion_bh, s); aio_set_fd_handler(s->aio_context, s->ring.ring_fd, false, - qemu_luring_completion_cb, NULL, NULL, s); + qemu_luring_completion_cb, NULL, qemu_luring_poll_cb, s); } LuringState *luring_init(Error **errp) -- 2.17.1
[Qemu-devel] [PATCH v5 12/12] qemu-iotests/087: checks for io_uring
Signed-off-by: Aarushi Mehta --- tests/qemu-iotests/087 | 26 ++ tests/qemu-iotests/087.out | 10 ++ 2 files changed, 36 insertions(+) diff --git a/tests/qemu-iotests/087 b/tests/qemu-iotests/087 index d6c8613419..0cc7283ad8 100755 --- a/tests/qemu-iotests/087 +++ b/tests/qemu-iotests/087 @@ -124,6 +124,32 @@ run_qemu_filter_aio <
[Qemu-devel] [PATCH v5 07/12] blockdev: accept io_uring as option
Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi --- blockdev.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/blockdev.c b/blockdev.c index 3f44b891eb..a2a5b32604 100644 --- a/blockdev.c +++ b/blockdev.c @@ -386,6 +386,8 @@ static void extract_common_blockdev_options(QemuOpts *opts, int *bdrv_flags, if ((aio = qemu_opt_get(opts, "aio")) != NULL) { if (!strcmp(aio, "native")) { *bdrv_flags |= BDRV_O_NATIVE_AIO; +} else if (!strcmp(aio, "io_uring")) { +*bdrv_flags |= BDRV_O_IO_URING; } else if (!strcmp(aio, "threads")) { /* this is the default */ } else { @@ -4579,7 +4581,7 @@ QemuOptsList qemu_common_drive_opts = { },{ .name = "aio", .type = QEMU_OPT_STRING, -.help = "host AIO implementation (threads, native)", +.help = "host AIO implementation (threads, native, io_uring)", },{ .name = BDRV_OPT_CACHE_WB, .type = QEMU_OPT_BOOL, -- 2.17.1
[Qemu-devel] [PATCH v5 08/12] block/file-posix.c: extend to use io_uring
Signed-off-by: Aarushi Mehta --- block/file-posix.c | 85 +- 1 file changed, 69 insertions(+), 16 deletions(-) diff --git a/block/file-posix.c b/block/file-posix.c index d018429672..211dfe5337 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -154,6 +154,7 @@ typedef struct BDRVRawState { bool has_write_zeroes:1; bool discard_zeroes:1; bool use_linux_aio:1; +bool use_linux_io_uring:1; bool page_cache_inconsistent:1; bool has_fallocate; bool needs_alignment; @@ -423,7 +424,7 @@ static QemuOptsList raw_runtime_opts = { { .name = "aio", .type = QEMU_OPT_STRING, -.help = "host AIO implementation (threads, native)", +.help = "host AIO implementation (threads, native, io_uring)", }, { .name = "locking", @@ -482,9 +483,15 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, goto fail; } -aio_default = (bdrv_flags & BDRV_O_NATIVE_AIO) - ? BLOCKDEV_AIO_OPTIONS_NATIVE - : BLOCKDEV_AIO_OPTIONS_THREADS; +if (bdrv_flags & BDRV_O_NATIVE_AIO) { +aio_default = BLOCKDEV_AIO_OPTIONS_NATIVE; +#ifdef CONFIG_LINUX_IO_URING +} else if (bdrv_flags & BDRV_O_IO_URING) { +aio_default = BLOCKDEV_AIO_OPTIONS_IO_URING; +#endif +} else { +aio_default = BLOCKDEV_AIO_OPTIONS_THREADS; +} aio = qapi_enum_parse(_lookup, qemu_opt_get(opts, "aio"), aio_default, _err); @@ -493,7 +500,11 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, ret = -EINVAL; goto fail; } + s->use_linux_aio = (aio == BLOCKDEV_AIO_OPTIONS_NATIVE); +#ifdef CONFIG_LINUX_IO_URING +s->use_linux_io_uring = (aio == BLOCKDEV_AIO_OPTIONS_IO_URING); +#endif locking = qapi_enum_parse(_lookup, qemu_opt_get(opts, "locking"), @@ -557,7 +568,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, s->shared_perm = BLK_PERM_ALL; #ifdef CONFIG_LINUX_AIO - /* Currently Linux does AIO only for files opened with O_DIRECT */ +/* Currently Linux does AIO only for files opened with O_DIRECT */ if (s->use_linux_aio) { if (!(s->open_flags & O_DIRECT)) { error_setg(errp, "aio=native was specified, 
but it requires " @@ -579,6 +590,22 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, } #endif /* !defined(CONFIG_LINUX_AIO) */ +#ifdef CONFIG_LINUX_IO_URING +if (s->use_linux_io_uring) { +if (!aio_setup_linux_io_uring(bdrv_get_aio_context(bs), errp)) { +error_prepend(errp, "Unable to use io_uring: "); +goto fail; +} +} +#else +if (s->use_linux_io_uring) { +error_setg(errp, "aio=io_uring was specified, but is not supported " + "in this build."); +ret = -EINVAL; +goto fail; +} +#endif /* !defined(CONFIG_LINUX_IO_URING) */ + s->has_discard = true; s->has_write_zeroes = true; if ((bs->open_flags & BDRV_O_NOCACHE) != 0) { @@ -1875,16 +1902,20 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset, * If this is the case tell the low-level driver that it needs * to copy the buffer. */ -if (s->needs_alignment) { -if (!bdrv_qiov_is_aligned(bs, qiov)) { -type |= QEMU_AIO_MISALIGNED; +if (s->needs_alignment && !bdrv_qiov_is_aligned(bs, qiov)) { +type |= QEMU_AIO_MISALIGNED; +#ifdef CONFIG_LINUX_IO_URING +} else if (s->use_linux_io_uring) { +LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs)); +assert(qiov->size == bytes); +return luring_co_submit(bs, aio, s->fd, offset, qiov, type); +#endif #ifdef CONFIG_LINUX_AIO -} else if (s->use_linux_aio) { -LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs)); -assert(qiov->size == bytes); -return laio_co_submit(bs, aio, s->fd, offset, qiov, type); +} else if (s->use_linux_aio && s->needs_alignment) { +LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs)); +assert(qiov->size == bytes); +return laio_co_submit(bs, aio, s->fd, offset, qiov, type); #endif -} } acb = (RawPosixAIOData) { @@ -1920,24 +1951,36 @@ static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset, static void raw_aio_plug(BlockDriverState *bs) { +BDRVRawState __attribute__((unused)) *s = bs->opaque; #ifdef CONFIG_LINUX_AIO -BDRVRawState *s = bs->opaque; if (s->use_linux_aio) 
{
[Qemu-devel] [PATCH v5 06/12] util/async: add aio interfaces for io_uring
Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi --- util/async.c | 36 1 file changed, 36 insertions(+) diff --git a/util/async.c b/util/async.c index c10642a385..2709f0edc3 100644 --- a/util/async.c +++ b/util/async.c @@ -277,6 +277,14 @@ aio_ctx_finalize(GSource *source) } #endif +#ifdef CONFIG_LINUX_IO_URING +if (ctx->linux_io_uring) { +luring_detach_aio_context(ctx->linux_io_uring, ctx); +luring_cleanup(ctx->linux_io_uring); +ctx->linux_io_uring = NULL; +} +#endif + assert(QSLIST_EMPTY(>scheduled_coroutines)); qemu_bh_delete(ctx->co_schedule_bh); @@ -341,6 +349,29 @@ LinuxAioState *aio_get_linux_aio(AioContext *ctx) } #endif +#ifdef CONFIG_LINUX_IO_URING +LuringState *aio_setup_linux_io_uring(AioContext *ctx, Error **errp) +{ +if (ctx->linux_io_uring) { +return ctx->linux_io_uring; +} + +ctx->linux_io_uring = luring_init(errp); +if (!ctx->linux_io_uring) { +return NULL; +} + +luring_attach_aio_context(ctx->linux_io_uring, ctx); +return ctx->linux_io_uring; +} + +LuringState *aio_get_linux_io_uring(AioContext *ctx) +{ +assert(ctx->linux_io_uring); +return ctx->linux_io_uring; +} +#endif + void aio_notify(AioContext *ctx) { /* Write e.g. bh->scheduled before reading ctx->notify_me. Pairs @@ -432,6 +463,11 @@ AioContext *aio_context_new(Error **errp) #ifdef CONFIG_LINUX_AIO ctx->linux_aio = NULL; #endif + +#ifdef CONFIG_LINUX_IO_URING +ctx->linux_io_uring = NULL; +#endif + ctx->thread_pool = NULL; qemu_rec_mutex_init(>lock); timerlistgroup_init(>tlg, aio_timerlist_notify, ctx); -- 2.17.1
[Qemu-devel] [PATCH v5 09/12] block: add trace events for io_uring
Signed-off-by: Aarushi Mehta --- block/io_uring.c | 14 -- block/trace-events | 8 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/block/io_uring.c b/block/io_uring.c index f327c7ef96..47e027364a 100644 --- a/block/io_uring.c +++ b/block/io_uring.c @@ -17,6 +17,7 @@ #include "block/raw-aio.h" #include "qemu/coroutine.h" #include "qapi/error.h" +#include "trace.h" #define MAX_EVENTS 128 @@ -191,12 +192,15 @@ static int ioq_submit(LuringState *s) void luring_io_plug(BlockDriverState *bs, LuringState *s) { +trace_luring_io_plug(); s->io_q.plugged++; } void luring_io_unplug(BlockDriverState *bs, LuringState *s) { assert(s->io_q.plugged); +trace_luring_io_unplug(s->io_q.blocked, s->io_q.plugged, + s->io_q.in_queue, s->io_q.in_flight); if (--s->io_q.plugged == 0 && !s->io_q.blocked && s->io_q.in_queue > 0) { ioq_submit(s); @@ -217,6 +221,7 @@ void luring_io_unplug(BlockDriverState *bs, LuringState *s) static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s, uint64_t offset, int type) { +int ret; struct io_uring_sqe *sqes = io_uring_get_sqe(>ring); if (!sqes) { sqes = >sqeq; @@ -242,11 +247,14 @@ static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s, } io_uring_sqe_set_data(sqes, luringcb); s->io_q.in_queue++; - +trace_luring_do_submit(s->io_q.blocked, s->io_q.plugged, + s->io_q.in_queue, s->io_q.in_flight); if (!s->io_q.blocked && (!s->io_q.plugged || s->io_q.in_flight + s->io_q.in_queue >= MAX_EVENTS)) { -return ioq_submit(s); +ret = ioq_submit(s); +trace_luring_do_submit_done(ret); +return ret; } return 0; } @@ -294,6 +302,7 @@ LuringState *luring_init(Error **errp) int rc; LuringState *s; s = g_malloc0(sizeof(*s)); +trace_luring_init_state((void *)s, sizeof(*s)); struct io_uring *ring = >ring; rc = io_uring_queue_init(MAX_EVENTS, ring, 0); if (rc < 0) { @@ -311,4 +320,5 @@ void luring_cleanup(LuringState *s) { io_uring_queue_exit(>ring); g_free(s); +trace_luring_cleanup_state(); } diff --git a/block/trace-events 
b/block/trace-events index eab51497fc..c4564dcd96 100644 --- a/block/trace-events +++ b/block/trace-events @@ -60,6 +60,14 @@ qmp_block_stream(void *bs, void *job) "bs %p job %p" file_paio_submit(void *acb, void *opaque, int64_t offset, int count, int type) "acb %p opaque %p offset %"PRId64" count %d type %d" file_copy_file_range(void *bs, int src, int64_t src_off, int dst, int64_t dst_off, int64_t bytes, int flags, int64_t ret) "bs %p src_fd %d offset %"PRIu64" dst_fd %d offset %"PRIu64" bytes %"PRIu64" flags %d ret %"PRId64 +#io_uring.c +luring_init_state(void *s, size_t size) "s %p size %zu" +luring_cleanup_state(void) "s freed" +disable luring_io_plug(void) "plug" +disable luring_io_unplug(int blocked, int plugged, int queued, int inflight) "blocked %d plugged %d queued %d inflight %d" +disable luring_do_submit(int blocked, int plugged, int queued, int inflight) "blocked %d plugged %d queued %d inflight %d" +disable luring_do_submit_done(int ret) "submitted to kernel %d" + # qcow2.c qcow2_writev_start_req(void *co, int64_t offset, int bytes) "co %p offset 0x%" PRIx64 " bytes %d" qcow2_writev_done_req(void *co, int ret) "co %p ret %d" -- 2.17.1
[Qemu-devel] [PATCH v5 03/12] block/block: add BDRV flag for io_uring
Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi Reviewed-by: Maxim Levitsky --- include/block/block.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/block/block.h b/include/block/block.h index f9415ed740..5e08df716f 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -121,6 +121,7 @@ typedef struct HDGeometry { ignoring the format layer */ #define BDRV_O_NO_IO 0x1 /* don't initialize for I/O */ #define BDRV_O_AUTO_RDONLY 0x2 /* degrade to read-only if opening read-write fails */ +#define BDRV_O_IO_URING0x4 /* use io_uring instead of the thread pool */ #define BDRV_O_CACHE_MASK (BDRV_O_NOCACHE | BDRV_O_NO_FLUSH) -- 2.17.1
[Qemu-devel] [PATCH v5 05/12] stubs: add stubs for io_uring interface
Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi --- MAINTAINERS | 1 + stubs/Makefile.objs | 1 + stubs/io_uring.c| 32 3 files changed, 34 insertions(+) create mode 100644 stubs/io_uring.c diff --git a/MAINTAINERS b/MAINTAINERS index 49f896796e..bc38175124 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2522,6 +2522,7 @@ R: Stefan Hajnoczi L: qemu-bl...@nongnu.org S: Maintained F: block/io_uring.c +F: stubs/io_uring.c qcow2 M: Kevin Wolf diff --git a/stubs/Makefile.objs b/stubs/Makefile.objs index 9c7393b08c..5cf160a9c8 100644 --- a/stubs/Makefile.objs +++ b/stubs/Makefile.objs @@ -13,6 +13,7 @@ stub-obj-y += iothread.o stub-obj-y += iothread-lock.o stub-obj-y += is-daemonized.o stub-obj-$(CONFIG_LINUX_AIO) += linux-aio.o +stub-obj-$(CONFIG_LINUX_IO_URING) += io_uring.o stub-obj-y += machine-init-done.o stub-obj-y += migr-blocker.o stub-obj-y += change-state-handler.o diff --git a/stubs/io_uring.c b/stubs/io_uring.c new file mode 100644 index 00..622d1e4648 --- /dev/null +++ b/stubs/io_uring.c @@ -0,0 +1,32 @@ +/* + * Linux io_uring support. + * + * Copyright (C) 2009 IBM, Corp. + * Copyright (C) 2009 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ +#include "qemu/osdep.h" +#include "block/aio.h" +#include "block/raw-aio.h" + +void luring_detach_aio_context(LuringState *s, AioContext *old_context) +{ +abort(); +} + +void luring_attach_aio_context(LuringState *s, AioContext *new_context) +{ +abort(); +} + +LuringState *luring_init(Error **errp) +{ +abort(); +} + +void luring_cleanup(LuringState *s) +{ +abort(); +} -- 2.17.1
[Qemu-devel] [PATCH v5 04/12] block/io_uring: implements interfaces for io_uring
Aborts when sqe fails to be set as sqes cannot be returned to the ring. Signed-off-by: Aarushi Mehta --- MAINTAINERS | 7 + block/Makefile.objs | 3 + block/io_uring.c| 314 include/block/aio.h | 16 +- include/block/raw-aio.h | 12 ++ 5 files changed, 351 insertions(+), 1 deletion(-) create mode 100644 block/io_uring.c diff --git a/MAINTAINERS b/MAINTAINERS index 7be1225415..49f896796e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2516,6 +2516,13 @@ F: block/file-posix.c F: block/file-win32.c F: block/win32-aio.c +Linux io_uring +M: Aarushi Mehta +R: Stefan Hajnoczi +L: qemu-bl...@nongnu.org +S: Maintained +F: block/io_uring.c + qcow2 M: Kevin Wolf M: Max Reitz diff --git a/block/Makefile.objs b/block/Makefile.objs index ae11605c9f..8fde7a23a5 100644 --- a/block/Makefile.objs +++ b/block/Makefile.objs @@ -18,6 +18,7 @@ block-obj-y += block-backend.o snapshot.o qapi.o block-obj-$(CONFIG_WIN32) += file-win32.o win32-aio.o block-obj-$(CONFIG_POSIX) += file-posix.o block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o +block-obj-$(CONFIG_LINUX_IO_URING) += io_uring.o block-obj-y += null.o mirror.o commit.o io.o create.o block-obj-y += throttle-groups.o block-obj-$(CONFIG_LINUX) += nvme.o @@ -61,5 +62,7 @@ block-obj-$(if $(CONFIG_LZFSE),m,n) += dmg-lzfse.o dmg-lzfse.o-libs := $(LZFSE_LIBS) qcow.o-libs:= -lz linux-aio.o-libs := -laio +io_uring.o-cflags := $(LINUX_IO_URING_CFLAGS) +io_uring.o-libs:= $(LINUX_IO_URING_LIBS) parallels.o-cflags := $(LIBXML2_CFLAGS) parallels.o-libs := $(LIBXML2_LIBS) diff --git a/block/io_uring.c b/block/io_uring.c new file mode 100644 index 00..f327c7ef96 --- /dev/null +++ b/block/io_uring.c @@ -0,0 +1,314 @@ +/* + * Linux io_uring support. + * + * Copyright (C) 2009 IBM, Corp. + * Copyright (C) 2009 Red Hat, Inc. + * Copyright (C) 2019 Aarushi Mehta + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ */ +#include "qemu/osdep.h" +#include +#include "qemu-common.h" +#include "block/aio.h" +#include "qemu/queue.h" +#include "block/block.h" +#include "block/raw-aio.h" +#include "qemu/coroutine.h" +#include "qapi/error.h" + +#define MAX_EVENTS 128 + +typedef struct LuringAIOCB { +Coroutine *co; +struct io_uring_sqe sqeq; +ssize_t ret; +QEMUIOVector *qiov; +bool is_read; +QSIMPLEQ_ENTRY(LuringAIOCB) next; +} LuringAIOCB; + +typedef struct LuringQueue { +int plugged; +unsigned int in_queue; +unsigned int in_flight; +bool blocked; +QSIMPLEQ_HEAD(, LuringAIOCB) sq_overflow; +} LuringQueue; + +typedef struct LuringState { +AioContext *aio_context; + +struct io_uring ring; + +/* io queue for submit at batch. Protected by AioContext lock. */ +LuringQueue io_q; + +/* I/O completion processing. Only runs in I/O thread. */ +QEMUBH *completion_bh; +} LuringState; + +/** + * ioq_submit: + * @s: AIO state + * + * Queues pending sqes and submits them + * + */ +static int ioq_submit(LuringState *s); + +/** + * qemu_luring_process_completions: + * @s: AIO state + * + * Fetches completed I/O requests, consumes cqes and invokes their callbacks. + * + */ +static void qemu_luring_process_completions(LuringState *s) +{ +struct io_uring_cqe *cqes; +int ret; + +/* + * Request completion callbacks can run the nested event loop. + * Schedule ourselves so the nested event loop will "see" remaining + * completed requests and process them. Without this, completion + * callbacks that wait for other requests using a nested event loop + * would hang forever. 
+ */ +qemu_bh_schedule(s->completion_bh); + +while (io_uring_peek_cqe(&s->ring, &cqes) == 0) { +if (!cqes) { +break; +} +LuringAIOCB *luringcb = io_uring_cqe_get_data(cqes); +ret = cqes->res; + +if (ret == luringcb->qiov->size) { +ret = 0; +} else if (ret >= 0) { +/* Short Read/Write */ +if (luringcb->is_read) { +/* Read, pad with zeroes */ +qemu_iovec_memset(luringcb->qiov, ret, 0, +luringcb->qiov->size - ret); +} else { +ret = -ENOSPC;; +} +} +luringcb->ret = ret; + +io_uring_cqe_seen(&s->ring, cqes); +cqes = NULL; +/* Change counters one-by-one because we can be nested. */ +s->io_q.in_flight--; + +/* + * If the coroutine is already entered it must be in ioq_submit() + * and will notice luringcb->ret has been filled in when it + * eventually runs later. Coroutines cannot be entered recursively + * so avoid doing that! + */ +if (!qe
[Qemu-devel] [PATCH v5 01/12] configure: permit use of io_uring
Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi Reviewed-by: Maxim Levitsky --- configure | 27 +++ 1 file changed, 27 insertions(+) diff --git a/configure b/configure index b091b82cb3..7aa18d308d 100755 --- a/configure +++ b/configure @@ -365,6 +365,7 @@ xen="" xen_ctrl_version="" xen_pci_passthrough="" linux_aio="" +linux_io_uring="" cap_ng="" attr="" libattr="" @@ -1266,6 +1267,10 @@ for opt do ;; --enable-linux-aio) linux_aio="yes" ;; + --disable-linux-io-uring) linux_io_uring="no" + ;; + --enable-linux-io-uring) linux_io_uring="yes" + ;; --disable-attr) attr="no" ;; --enable-attr) attr="yes" @@ -1784,6 +1789,7 @@ disabled with --disable-FEATURE, default is enabled if available: vde support for vde network netmap support for netmap network linux-aio Linux AIO support + linux-io-uring Linux io_uring support cap-ng libcap-ng support attrattr and xattr support vhost-net vhost-net kernel acceleration support @@ -3973,6 +3979,21 @@ EOF linux_aio=no fi fi +## +# linux-io-uring probe + +if test "$linux_io_uring" != "no" ; then + if $pkg_config liburing; then +linux_io_uring_cflags=$($pkg_config --cflags liburing) +linux_io_uring_libs=$($pkg_config --libs liburing) +linux_io_uring=yes + else +if test "$linux_io_uring" = "yes" ; then + feature_not_found "linux io_uring" "Install liburing devel" +fi +linux_io_uring=no + fi +fi ## # TPM emulation is only on POSIX @@ -6396,6 +6417,7 @@ echo "PIE $pie" echo "vde support $vde" echo "netmap support$netmap" echo "Linux AIO support $linux_aio" +echo "Linux io_uring support $linux_io_uring" echo "ATTR/XATTR support $attr" echo "Install blobs $blobs" echo "KVM support $kvm" @@ -6874,6 +6896,11 @@ fi if test "$linux_aio" = "yes" ; then echo "CONFIG_LINUX_AIO=y" >> $config_host_mak fi +if test "$linux_io_uring" = "yes" ; then + echo "CONFIG_LINUX_IO_URING=y" >> $config_host_mak + echo "LINUX_IO_URING_CFLAGS=$linux_io_uring_cflags" >> $config_host_mak + echo "LINUX_IO_URING_LIBS=$linux_io_uring_libs" >> $config_host_mak +fi 
if test "$attr" = "yes" ; then echo "CONFIG_ATTR=y" >> $config_host_mak fi -- 2.17.1
[Qemu-devel] [PATCH v5 02/12] qapi/block-core: add option for io_uring
The option is only enumerated on hosts that support it. Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi --- qapi/block-core.json | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/qapi/block-core.json b/qapi/block-core.json index 1defcde048..db7eedd058 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -2792,11 +2792,13 @@ # # @threads: Use qemu's thread pool # @native: Use native AIO backend (only Linux and Windows) +# @io_uring: Use linux io_uring (since 4.1) # # Since: 2.9 ## { 'enum': 'BlockdevAioOptions', - 'data': [ 'threads', 'native' ] } + 'data': [ 'threads', 'native', +{ 'name': 'io_uring', 'if': 'defined(CONFIG_LINUX_IO_URING)' } ] } ## # @BlockdevCacheOptions: -- 2.17.1
[Qemu-devel] [PATCH v5 00/12] Add support for io_uring
This patch series adds support for the newly developed io_uring Linux AIO interface. Linux io_uring is faster than Linux's AIO asynchronous I/O code, offers efficient buffered asynchronous I/O support, the ability to do I/O without performing a system call via polled I/O, and other efficiency enhancements. Testing it requires a host kernel (5.1+) and the liburing library. Use the option -drive aio=io_uring to enable it. v5: - Adds completion polling - Extends qemu-io - Adds qemu-iotest v4: - Add error handling - Add trace events - Remove aio submission based code v3: - Fix major errors in io_uring (sorry) - Option now enumerates for CONFIG_LINUX_IO_URING - pkg config support added Aarushi Mehta (12): configure: permit use of io_uring qapi/block-core: add option for io_uring Only enumerates option for devices that support it block/block: add BDRV flag for io_uring block/io_uring: implements interfaces for io_uring Aborts when sqe fails to be set as sqes cannot be returned to the ring. stubs: add stubs for io_uring interface util/async: add aio interfaces for io_uring blockdev: accept io_uring as option block/file-posix.c: extend to use io_uring block: add trace events for io_uring block/io_uring: adds userspace completion polling qemu-io: adds support for io_uring qemu-iotests/087: checks for io_uring MAINTAINERS| 8 + block/Makefile.objs| 3 + block/file-posix.c | 85 -- block/io_uring.c | 339 + block/trace-events | 8 + blockdev.c | 4 +- configure | 27 +++ include/block/aio.h| 16 +- include/block/block.h | 1 + include/block/raw-aio.h| 12 ++ qapi/block-core.json | 4 +- qemu-io.c | 13 ++ stubs/Makefile.objs| 1 + stubs/io_uring.c | 32 tests/qemu-iotests/087 | 26 +++ tests/qemu-iotests/087.out | 10 ++ util/async.c | 36 17 files changed, 606 insertions(+), 19 deletions(-) create mode 100644 block/io_uring.c create mode 100644 stubs/io_uring.c -- 2.17.1
[Qemu-devel] [PATCH v4 1/9] configure: permit use of io_uring
Reviewed-by: Stefan Hajnoczi Signed-off-by: Aarushi Mehta --- configure | 27 +++ 1 file changed, 27 insertions(+) diff --git a/configure b/configure index 6cdcfb2dc3..cb49ef0fcc 100755 --- a/configure +++ b/configure @@ -365,6 +365,7 @@ xen="" xen_ctrl_version="" xen_pci_passthrough="" linux_aio="" +linux_io_uring="" cap_ng="" attr="" libattr="" @@ -1266,6 +1267,10 @@ for opt do ;; --enable-linux-aio) linux_aio="yes" ;; + --disable-linux-io-uring) linux_io_uring="no" + ;; + --enable-linux-io-uring) linux_io_uring="yes" + ;; --disable-attr) attr="no" ;; --enable-attr) attr="yes" @@ -1784,6 +1789,7 @@ disabled with --disable-FEATURE, default is enabled if available: vde support for vde network netmap support for netmap network linux-aio Linux AIO support + linux-io-uring Linux io_uring support cap-ng libcap-ng support attrattr and xattr support vhost-net vhost-net kernel acceleration support @@ -3973,6 +3979,21 @@ EOF linux_aio=no fi fi +## +# linux-io-uring probe + +if test "$linux_io_uring" != "no" ; then + if $pkg_config liburing; then +linux_io_uring_cflags=$($pkg_config --cflags liburing) +linux_io_uring_libs=$($pkg_config --libs liburing) +linux_io_uring=yes + else +if test "$linux_io_uring" = "yes" ; then + feature_not_found "linux io_uring" "Install liburing devel" +fi +linux_io_uring=no + fi +fi ## # TPM emulation is only on POSIX @@ -6396,6 +6417,7 @@ echo "PIE $pie" echo "vde support $vde" echo "netmap support$netmap" echo "Linux AIO support $linux_aio" +echo "Linux io_uring support $linux_io_uring" echo "ATTR/XATTR support $attr" echo "Install blobs $blobs" echo "KVM support $kvm" @@ -6876,6 +6898,11 @@ fi if test "$linux_aio" = "yes" ; then echo "CONFIG_LINUX_AIO=y" >> $config_host_mak fi +if test "$linux_io_uring" = "yes" ; then + echo "CONFIG_LINUX_IO_URING=y" >> $config_host_mak + echo "LINUX_IO_URING_CFLAGS=$linux_io_uring_cflags" >> $config_host_mak + echo "LINUX_IO_URING_LIBS=$linux_io_uring_libs" >> $config_host_mak +fi if test "$attr" = "yes" ; 
then echo "CONFIG_ATTR=y" >> $config_host_mak fi -- 2.17.1
[Qemu-devel] [PATCH v4 2/9] qapi/block-core: add option for io_uring
The option is only enumerated on hosts that support it. Signed-off-by: Aarushi Mehta --- qapi/block-core.json | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/qapi/block-core.json b/qapi/block-core.json index 1defcde048..db7eedd058 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -2792,11 +2792,13 @@ # # @threads: Use qemu's thread pool # @native: Use native AIO backend (only Linux and Windows) +# @io_uring: Use linux io_uring (since 4.1) # # Since: 2.9 ## { 'enum': 'BlockdevAioOptions', - 'data': [ 'threads', 'native' ] } + 'data': [ 'threads', 'native', +{ 'name': 'io_uring', 'if': 'defined(CONFIG_LINUX_IO_URING)' } ] } ## # @BlockdevCacheOptions: -- 2.17.1
[Qemu-devel] [PATCH v4 8/9] block/file-posix.c: extend to use io_uring
Signed-off-by: Aarushi Mehta --- block/file-posix.c | 85 +- 1 file changed, 69 insertions(+), 16 deletions(-) diff --git a/block/file-posix.c b/block/file-posix.c index d018429672..211dfe5337 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -154,6 +154,7 @@ typedef struct BDRVRawState { bool has_write_zeroes:1; bool discard_zeroes:1; bool use_linux_aio:1; +bool use_linux_io_uring:1; bool page_cache_inconsistent:1; bool has_fallocate; bool needs_alignment; @@ -423,7 +424,7 @@ static QemuOptsList raw_runtime_opts = { { .name = "aio", .type = QEMU_OPT_STRING, -.help = "host AIO implementation (threads, native)", +.help = "host AIO implementation (threads, native, io_uring)", }, { .name = "locking", @@ -482,9 +483,15 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, goto fail; } -aio_default = (bdrv_flags & BDRV_O_NATIVE_AIO) - ? BLOCKDEV_AIO_OPTIONS_NATIVE - : BLOCKDEV_AIO_OPTIONS_THREADS; +if (bdrv_flags & BDRV_O_NATIVE_AIO) { +aio_default = BLOCKDEV_AIO_OPTIONS_NATIVE; +#ifdef CONFIG_LINUX_IO_URING +} else if (bdrv_flags & BDRV_O_IO_URING) { +aio_default = BLOCKDEV_AIO_OPTIONS_IO_URING; +#endif +} else { +aio_default = BLOCKDEV_AIO_OPTIONS_THREADS; +} aio = qapi_enum_parse(_lookup, qemu_opt_get(opts, "aio"), aio_default, _err); @@ -493,7 +500,11 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, ret = -EINVAL; goto fail; } + s->use_linux_aio = (aio == BLOCKDEV_AIO_OPTIONS_NATIVE); +#ifdef CONFIG_LINUX_IO_URING +s->use_linux_io_uring = (aio == BLOCKDEV_AIO_OPTIONS_IO_URING); +#endif locking = qapi_enum_parse(_lookup, qemu_opt_get(opts, "locking"), @@ -557,7 +568,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, s->shared_perm = BLK_PERM_ALL; #ifdef CONFIG_LINUX_AIO - /* Currently Linux does AIO only for files opened with O_DIRECT */ +/* Currently Linux does AIO only for files opened with O_DIRECT */ if (s->use_linux_aio) { if (!(s->open_flags & O_DIRECT)) { error_setg(errp, "aio=native was specified, 
but it requires " @@ -579,6 +590,22 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, } #endif /* !defined(CONFIG_LINUX_AIO) */ +#ifdef CONFIG_LINUX_IO_URING +if (s->use_linux_io_uring) { +if (!aio_setup_linux_io_uring(bdrv_get_aio_context(bs), errp)) { +error_prepend(errp, "Unable to use io_uring: "); +goto fail; +} +} +#else +if (s->use_linux_io_uring) { +error_setg(errp, "aio=io_uring was specified, but is not supported " + "in this build."); +ret = -EINVAL; +goto fail; +} +#endif /* !defined(CONFIG_LINUX_IO_URING) */ + s->has_discard = true; s->has_write_zeroes = true; if ((bs->open_flags & BDRV_O_NOCACHE) != 0) { @@ -1875,16 +1902,20 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset, * If this is the case tell the low-level driver that it needs * to copy the buffer. */ -if (s->needs_alignment) { -if (!bdrv_qiov_is_aligned(bs, qiov)) { -type |= QEMU_AIO_MISALIGNED; +if (s->needs_alignment && !bdrv_qiov_is_aligned(bs, qiov)) { +type |= QEMU_AIO_MISALIGNED; +#ifdef CONFIG_LINUX_IO_URING +} else if (s->use_linux_io_uring) { +LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs)); +assert(qiov->size == bytes); +return luring_co_submit(bs, aio, s->fd, offset, qiov, type); +#endif #ifdef CONFIG_LINUX_AIO -} else if (s->use_linux_aio) { -LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs)); -assert(qiov->size == bytes); -return laio_co_submit(bs, aio, s->fd, offset, qiov, type); +} else if (s->use_linux_aio && s->needs_alignment) { +LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs)); +assert(qiov->size == bytes); +return laio_co_submit(bs, aio, s->fd, offset, qiov, type); #endif -} } acb = (RawPosixAIOData) { @@ -1920,24 +1951,36 @@ static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset, static void raw_aio_plug(BlockDriverState *bs) { +BDRVRawState __attribute__((unused)) *s = bs->opaque; #ifdef CONFIG_LINUX_AIO -BDRVRawState *s = bs->opaque; if (s->use_linux_aio) 
{
[Qemu-devel] [PATCH v4 0/9] Add support for io_uring
This patch series adds support for the newly developed io_uring Linux AIO interface. Linux io_uring is faster than Linux's AIO asynchronous I/O code, offers efficient buffered asynchronous I/O support, the ability to do I/O without performing a system call via polled I/O, and other efficiency enhancements. Testing it requires a host kernel (5.1+) and the liburing library. Use the option -drive aio=io_uring to enable it. v4: - Add error handling - Add trace events - Remove aio submission based code v3: - Fix major errors in io_uring (sorry) - Option now enumerates for CONFIG_LINUX_IO_URING - pkg config support added Aarushi Mehta (9): configure: permit use of io_uring qapi/block-core: add option for io_uring block/block: add BDRV flag for io_uring block/io_uring: implements interfaces for io_uring stubs: add stubs for io_uring interface util/async: add aio interfaces for io_uring blockdev: accept io_uring as option block/file-posix.c: extend to use io_uring block: add trace events for io_uring MAINTAINERS | 8 + block/Makefile.objs | 3 + block/file-posix.c | 85 +-- block/io_uring.c| 325 block/trace-events | 8 + blockdev.c | 4 +- configure | 27 include/block/aio.h | 16 +- include/block/block.h | 1 + include/block/raw-aio.h | 12 ++ qapi/block-core.json| 4 +- stubs/Makefile.objs | 1 + stubs/io_uring.c| 32 util/async.c| 36 + 14 files changed, 543 insertions(+), 19 deletions(-) create mode 100644 block/io_uring.c create mode 100644 stubs/io_uring.c -- 2.17.1
[Qemu-devel] [PATCH v4 4/9] block/io_uring: implements interfaces for io_uring
Aborts when sqes cannot be set as sqes cannot be returned to ring. Signed-off-by: Aarushi Mehta --- MAINTAINERS | 7 + block/Makefile.objs | 3 + block/io_uring.c| 315 include/block/aio.h | 16 +- include/block/raw-aio.h | 12 ++ 5 files changed, 352 insertions(+), 1 deletion(-) create mode 100644 block/io_uring.c diff --git a/MAINTAINERS b/MAINTAINERS index a96829ea83..dcaddec21f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2515,6 +2515,13 @@ F: block/file-posix.c F: block/file-win32.c F: block/win32-aio.c +Linux io_uring +M: Aarushi Mehta +R: Stefan Hajnoczi +L: qemu-bl...@nongnu.org +S: Maintained +F: block/io_uring.c + qcow2 M: Kevin Wolf M: Max Reitz diff --git a/block/Makefile.objs b/block/Makefile.objs index ae11605c9f..8fde7a23a5 100644 --- a/block/Makefile.objs +++ b/block/Makefile.objs @@ -18,6 +18,7 @@ block-obj-y += block-backend.o snapshot.o qapi.o block-obj-$(CONFIG_WIN32) += file-win32.o win32-aio.o block-obj-$(CONFIG_POSIX) += file-posix.o block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o +block-obj-$(CONFIG_LINUX_IO_URING) += io_uring.o block-obj-y += null.o mirror.o commit.o io.o create.o block-obj-y += throttle-groups.o block-obj-$(CONFIG_LINUX) += nvme.o @@ -61,5 +62,7 @@ block-obj-$(if $(CONFIG_LZFSE),m,n) += dmg-lzfse.o dmg-lzfse.o-libs := $(LZFSE_LIBS) qcow.o-libs:= -lz linux-aio.o-libs := -laio +io_uring.o-cflags := $(LINUX_IO_URING_CFLAGS) +io_uring.o-libs:= $(LINUX_IO_URING_LIBS) parallels.o-cflags := $(LIBXML2_CFLAGS) parallels.o-libs := $(LIBXML2_LIBS) diff --git a/block/io_uring.c b/block/io_uring.c new file mode 100644 index 00..536a9fbe87 --- /dev/null +++ b/block/io_uring.c @@ -0,0 +1,315 @@ +/* + * Linux io_uring support. + * + * Copyright (C) 2009 IBM, Corp. + * Copyright (C) 2009 Red Hat, Inc. + * Copyright (C) 2019 Aarushi Mehta + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ */ +#include "qemu/osdep.h" +#include <liburing.h> +#include "qemu-common.h" +#include "block/aio.h" +#include "qemu/queue.h" +#include "block/block.h" +#include "block/raw-aio.h" +#include "qemu/coroutine.h" +#include "qapi/error.h" + +#define MAX_EVENTS 128 + +typedef struct LuringAIOCB { +Coroutine *co; +struct io_uring_sqe sqeq; +ssize_t ret; +QEMUIOVector *qiov; +bool is_read; +QSIMPLEQ_ENTRY(LuringAIOCB) next; +} LuringAIOCB; + +typedef struct LuringQueue { +int plugged; +unsigned int in_queue; +unsigned int in_flight; +bool blocked; +QSIMPLEQ_HEAD(, LuringAIOCB) sq_overflow; +} LuringQueue; + +typedef struct LuringState { +AioContext *aio_context; + +struct io_uring ring; + +/* io queue for submit at batch. Protected by AioContext lock. */ +LuringQueue io_q; + +/* I/O completion processing. Only runs in I/O thread. */ +QEMUBH *completion_bh; +} LuringState; + +/** + * ioq_submit: + * @s: AIO state + * + * Queues pending sqes and submits them + * + */ +static int ioq_submit(LuringState *s); + +/** + * qemu_luring_process_completions: + * @s: AIO state + * + * Fetches completed I/O requests, consumes cqes and invokes their callbacks. + * + */ +static void qemu_luring_process_completions(LuringState *s) +{ +struct io_uring_cqe *cqes; +int ret; + +/* + * Request completion callbacks can run the nested event loop. + * Schedule ourselves so the nested event loop will "see" remaining + * completed requests and process them. Without this, completion + * callbacks that wait for other requests using a nested event loop + * would hang forever. 
+ */ +qemu_bh_schedule(s->completion_bh); + +while (io_uring_peek_cqe(&s->ring, &cqes) == 0) { +if (!cqes) { +break; +} +LuringAIOCB *luringcb = io_uring_cqe_get_data(cqes); +read_barrier(); +ret = cqes->res; + +if (ret == luringcb->qiov->size) { +ret = 0; +} else if (ret >= 0) { +/* Short Read/Write */ +if (luringcb->is_read) { +/* Read, pad with zeroes */ +qemu_iovec_memset(luringcb->qiov, ret, 0, +luringcb->qiov->size - ret); +} else { +ret = -ENOSPC;; +} +} +luringcb->ret = ret; + +io_uring_cqe_seen(&s->ring, cqes); +cqes = NULL; +/* Change counters one-by-one because we can be nested. */ +s->io_q.in_flight--; + +/* + * If the coroutine is already entered it must be in ioq_submit() + * and will notice luringcb->ret has been filled in when it + * eventually runs later. Coroutines cannot be entered recursively + * so avoid doing that! + */ +
[Qemu-devel] [PATCH v4 3/9] block/block: add BDRV flag for io_uring
Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi --- include/block/block.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/block/block.h b/include/block/block.h index 9b083e2bca..60f7c6c01c 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -121,6 +121,7 @@ typedef struct HDGeometry { ignoring the format layer */ #define BDRV_O_NO_IO 0x10000 /* don't initialize for I/O */ #define BDRV_O_AUTO_RDONLY 0x20000 /* degrade to read-only if opening read-write fails */ +#define BDRV_O_IO_URING 0x40000 /* use io_uring instead of the thread pool */ #define BDRV_O_CACHE_MASK (BDRV_O_NOCACHE | BDRV_O_NO_FLUSH) -- 2.17.1
[Qemu-devel] [PATCH v4 9/9] block: add trace events for io_uring
Signed-off-by: Aarushi Mehta --- block/io_uring.c | 14 -- block/trace-events | 8 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/block/io_uring.c b/block/io_uring.c index 536a9fbe87..e92e45ccca 100644 --- a/block/io_uring.c +++ b/block/io_uring.c @@ -17,6 +17,7 @@ #include "block/raw-aio.h" #include "qemu/coroutine.h" #include "qapi/error.h" +#include "trace.h" #define MAX_EVENTS 128 @@ -192,12 +193,15 @@ static int ioq_submit(LuringState *s) void luring_io_plug(BlockDriverState *bs, LuringState *s) { +trace_luring_io_plug(); s->io_q.plugged++; } void luring_io_unplug(BlockDriverState *bs, LuringState *s) { assert(s->io_q.plugged); +trace_luring_io_unplug(s->io_q.blocked, s->io_q.plugged, + s->io_q.in_queue, s->io_q.in_flight); if (--s->io_q.plugged == 0 && !s->io_q.blocked && s->io_q.in_queue > 0) { ioq_submit(s); @@ -218,6 +222,7 @@ void luring_io_unplug(BlockDriverState *bs, LuringState *s) static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s, uint64_t offset, int type) { +int ret; struct io_uring_sqe *sqes = io_uring_get_sqe(&s->ring); if (!sqes) { sqes = &luringcb->sqeq; @@ -243,11 +248,14 @@ static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s, } io_uring_sqe_set_data(sqes, luringcb); s->io_q.in_queue++; - +trace_luring_do_submit(s->io_q.blocked, s->io_q.plugged, + s->io_q.in_queue, s->io_q.in_flight); if (!s->io_q.blocked && (!s->io_q.plugged || s->io_q.in_flight + s->io_q.in_queue >= MAX_EVENTS)) { -return ioq_submit(s); +ret = ioq_submit(s); +trace_luring_do_submit_done(ret); +return ret; } return 0; } @@ -295,6 +303,7 @@ LuringState *luring_init(Error **errp) int rc; LuringState *s; s = g_malloc0(sizeof(*s)); +trace_luring_init_state((void *)s, sizeof(*s)); struct io_uring *ring = &s->ring; rc = io_uring_queue_init(MAX_EVENTS, ring, 0); if (rc < 0) { @@ -312,4 +321,5 @@ void luring_cleanup(LuringState *s) { io_uring_queue_exit(&s->ring); g_free(s); +trace_luring_cleanup_state(); } diff --git a/block/trace-events 
b/block/trace-events index 1e0653ce6d..038ff14815 100644 --- a/block/trace-events +++ b/block/trace-events @@ -60,6 +60,14 @@ qmp_block_stream(void *bs, void *job) "bs %p job %p" file_paio_submit(void *acb, void *opaque, int64_t offset, int count, int type) "acb %p opaque %p offset %"PRId64" count %d type %d" file_copy_file_range(void *bs, int src, int64_t src_off, int dst, int64_t dst_off, int64_t bytes, int flags, int64_t ret) "bs %p src_fd %d offset %"PRIu64" dst_fd %d offset %"PRIu64" bytes %"PRIu64" flags %d ret %"PRId64 +#io_uring.c +luring_init_state(void *s, size_t size) "s %p size %zu" +luring_cleanup_state(void) "s freed" +disable luring_io_plug(void) "plug" +disable luring_io_unplug(int blocked, int plugged, int queued, int inflight) "blocked %d plugged %d queued %d inflight %d" +disable luring_do_submit(int blocked, int plugged, int queued, int inflight) "blocked %d plugged %d queued %d inflight %d" +disable luring_do_submit_done(int ret) "submitted to kernel %d" + # qcow2.c qcow2_writev_start_req(void *co, int64_t offset, int bytes) "co %p offset 0x%" PRIx64 " bytes %d" qcow2_writev_done_req(void *co, int ret) "co %p ret %d" -- 2.17.1
[Qemu-devel] [PATCH v4 6/9] util/async: add aio interfaces for io_uring
Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi --- util/async.c | 36 1 file changed, 36 insertions(+) diff --git a/util/async.c b/util/async.c index c10642a385..2709f0edc3 100644 --- a/util/async.c +++ b/util/async.c @@ -277,6 +277,14 @@ aio_ctx_finalize(GSource *source) } #endif +#ifdef CONFIG_LINUX_IO_URING +if (ctx->linux_io_uring) { +luring_detach_aio_context(ctx->linux_io_uring, ctx); +luring_cleanup(ctx->linux_io_uring); +ctx->linux_io_uring = NULL; +} +#endif + assert(QSLIST_EMPTY(&ctx->scheduled_coroutines)); qemu_bh_delete(ctx->co_schedule_bh); @@ -341,6 +349,29 @@ LinuxAioState *aio_get_linux_aio(AioContext *ctx) } #endif +#ifdef CONFIG_LINUX_IO_URING +LuringState *aio_setup_linux_io_uring(AioContext *ctx, Error **errp) +{ +if (ctx->linux_io_uring) { +return ctx->linux_io_uring; +} + +ctx->linux_io_uring = luring_init(errp); +if (!ctx->linux_io_uring) { +return NULL; +} + +luring_attach_aio_context(ctx->linux_io_uring, ctx); +return ctx->linux_io_uring; +} + +LuringState *aio_get_linux_io_uring(AioContext *ctx) +{ +assert(ctx->linux_io_uring); +return ctx->linux_io_uring; +} +#endif + void aio_notify(AioContext *ctx) { /* Write e.g. bh->scheduled before reading ctx->notify_me. Pairs @@ -432,6 +463,11 @@ AioContext *aio_context_new(Error **errp) #ifdef CONFIG_LINUX_AIO ctx->linux_aio = NULL; #endif + +#ifdef CONFIG_LINUX_IO_URING +ctx->linux_io_uring = NULL; +#endif + ctx->thread_pool = NULL; qemu_rec_mutex_init(&ctx->lock); timerlistgroup_init(&ctx->tlg, aio_timerlist_notify, ctx); -- 2.17.1
[Qemu-devel] [PATCH v4 7/9] blockdev: accept io_uring as option
Reviewed-by: Stefan Hajnoczi Signed-off-by: Aarushi Mehta --- blockdev.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/blockdev.c b/blockdev.c index 17c2d801d7..58a9e69268 100644 --- a/blockdev.c +++ b/blockdev.c @@ -386,6 +386,8 @@ static void extract_common_blockdev_options(QemuOpts *opts, int *bdrv_flags, if ((aio = qemu_opt_get(opts, "aio")) != NULL) { if (!strcmp(aio, "native")) { *bdrv_flags |= BDRV_O_NATIVE_AIO; +} else if (!strcmp(aio, "io_uring")) { +*bdrv_flags |= BDRV_O_IO_URING; } else if (!strcmp(aio, "threads")) { /* this is the default */ } else { @@ -4568,7 +4570,7 @@ QemuOptsList qemu_common_drive_opts = { },{ .name = "aio", .type = QEMU_OPT_STRING, -.help = "host AIO implementation (threads, native)", +.help = "host AIO implementation (threads, native, io_uring)", },{ .name = BDRV_OPT_CACHE_WB, .type = QEMU_OPT_BOOL, -- 2.17.1
[Qemu-devel] [PATCH v4 5/9] stubs: add stubs for io_uring interface
Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi --- MAINTAINERS | 1 + stubs/Makefile.objs | 1 + stubs/io_uring.c| 32 3 files changed, 34 insertions(+) create mode 100644 stubs/io_uring.c diff --git a/MAINTAINERS b/MAINTAINERS index dcaddec21f..e30f62024e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2521,6 +2521,7 @@ R: Stefan Hajnoczi L: qemu-bl...@nongnu.org S: Maintained F: block/io_uring.c +F: stubs/io_uring.c qcow2 M: Kevin Wolf diff --git a/stubs/Makefile.objs b/stubs/Makefile.objs index 9c7393b08c..5cf160a9c8 100644 --- a/stubs/Makefile.objs +++ b/stubs/Makefile.objs @@ -13,6 +13,7 @@ stub-obj-y += iothread.o stub-obj-y += iothread-lock.o stub-obj-y += is-daemonized.o stub-obj-$(CONFIG_LINUX_AIO) += linux-aio.o +stub-obj-$(CONFIG_LINUX_IO_URING) += io_uring.o stub-obj-y += machine-init-done.o stub-obj-y += migr-blocker.o stub-obj-y += change-state-handler.o diff --git a/stubs/io_uring.c b/stubs/io_uring.c new file mode 100644 index 00..622d1e4648 --- /dev/null +++ b/stubs/io_uring.c @@ -0,0 +1,32 @@ +/* + * Linux io_uring support. + * + * Copyright (C) 2009 IBM, Corp. + * Copyright (C) 2009 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ +#include "qemu/osdep.h" +#include "block/aio.h" +#include "block/raw-aio.h" + +void luring_detach_aio_context(LuringState *s, AioContext *old_context) +{ +abort(); +} + +void luring_attach_aio_context(LuringState *s, AioContext *new_context) +{ +abort(); +} + +LuringState *luring_init(Error **errp) +{ +abort(); +} + +void luring_cleanup(LuringState *s) +{ +abort(); +} -- 2.17.1
[Qemu-devel] [PATCH v3 8/8] block/fileposix: extend to use io_uring
Signed-off-by: Aarushi Mehta --- block/file-posix.c | 65 ++ 1 file changed, 60 insertions(+), 5 deletions(-) diff --git a/block/file-posix.c b/block/file-posix.c index d018429672..50899064df 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -154,6 +154,7 @@ typedef struct BDRVRawState { bool has_write_zeroes:1; bool discard_zeroes:1; bool use_linux_aio:1; +bool use_linux_io_uring:1; bool page_cache_inconsistent:1; bool has_fallocate; bool needs_alignment; @@ -423,7 +424,7 @@ static QemuOptsList raw_runtime_opts = { { .name = "aio", .type = QEMU_OPT_STRING, -.help = "host AIO implementation (threads, native)", +.help = "host AIO implementation (threads, native, io_uring)", }, { .name = "locking", @@ -494,6 +495,9 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, goto fail; } s->use_linux_aio = (aio == BLOCKDEV_AIO_OPTIONS_NATIVE); +#ifdef CONFIG_LINUX_IO_URING +s->use_linux_io_uring = (aio == BLOCKDEV_AIO_OPTIONS_IO_URING); +#endif locking = qapi_enum_parse(_lookup, qemu_opt_get(opts, "locking"), @@ -557,7 +561,9 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, s->shared_perm = BLK_PERM_ALL; #ifdef CONFIG_LINUX_AIO - /* Currently Linux does AIO only for files opened with O_DIRECT */ +/* + * Currently Linux does AIO only for files opened with O_DIRECT + */ if (s->use_linux_aio) { if (!(s->open_flags & O_DIRECT)) { error_setg(errp, "aio=native was specified, but it requires " @@ -578,6 +584,21 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, goto fail; } #endif /* !defined(CONFIG_LINUX_AIO) */ +#ifdef CONFIG_LINUX_IO_URING +if (s->use_linux_io_uring) { +if (!aio_setup_linux_io_uring(bdrv_get_aio_context(bs), errp)) { +error_prepend(errp, "Unable to use io_uring: "); +goto fail; +} +} +#else +if (s->use_linux_io_uring) { +error_setg(errp, "aio=io_uring was specified, but is not supported " + "in this build."); +ret = -EINVAL; +goto fail; +} +#endif /* !defined(CONFIG_LINUX_IO_URING) */ s->has_discard = 
true; s->has_write_zeroes = true; @@ -1883,6 +1904,12 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset, LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs)); assert(qiov->size == bytes); return laio_co_submit(bs, aio, s->fd, offset, qiov, type); +#endif +#ifdef CONFIG_LINUX_IO_URING +} else if (s->use_linux_io_uring) { +LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs)); +assert(qiov->size == bytes); +return luring_co_submit(bs, aio, s->fd, offset, qiov, type); #endif } } @@ -1920,24 +1947,40 @@ static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset, static void raw_aio_plug(BlockDriverState *bs) { -#ifdef CONFIG_LINUX_AIO +#if defined CONFIG_LINUX_AIO || defined CONFIG_LINUX_IO_URING BDRVRawState *s = bs->opaque; +#endif +#ifdef CONFIG_LINUX_AIO if (s->use_linux_aio) { LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs)); laio_io_plug(bs, aio); } #endif +#ifdef CONFIG_LINUX_IO_URING +if (s->use_linux_io_uring) { +LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs)); +luring_io_plug(bs, aio); +} +#endif } static void raw_aio_unplug(BlockDriverState *bs) { -#ifdef CONFIG_LINUX_AIO +#if defined CONFIG_LINUX_AIO || defined CONFIG_LINUX_IO_URING BDRVRawState *s = bs->opaque; +#endif +#ifdef CONFIG_LINUX_AIO if (s->use_linux_aio) { LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs)); laio_io_unplug(bs, aio); } #endif +#ifdef CONFIG_LINUX_IO_URING +if (s->use_linux_aio) { +LuringState *aio = aio_get_linux_io_uring(bdrv_get_aio_context(bs)); +luring_io_unplug(bs, aio); +} +#endif } static int raw_co_flush_to_disk(BlockDriverState *bs) @@ -1963,8 +2006,10 @@ static int raw_co_flush_to_disk(BlockDriverState *bs) static void raw_aio_attach_aio_context(BlockDriverState *bs, AioContext *new_context) { +#if defined CONFIG_LINUX_AIO || defined CONFIG_LINUX_IO_URING +BDRVRawState *s = bs->opaque; +#endif #ifdef CONFIG_LINUX_AIO -BDRVRawState *s = bs->opaque; if 
(s->use_linux_aio) { Error *local_err; if (!aio_setup_linux_aio(new_context, _err)) { @@ -1974,6 +2019,16 @@ sta
[Qemu-devel] [PATCH v3 6/8] util/async: add aio interfaces for io_uring
Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi --- util/async.c | 36 1 file changed, 36 insertions(+) diff --git a/util/async.c b/util/async.c index c10642a385..2709f0edc3 100644 --- a/util/async.c +++ b/util/async.c @@ -277,6 +277,14 @@ aio_ctx_finalize(GSource *source) } #endif +#ifdef CONFIG_LINUX_IO_URING +if (ctx->linux_io_uring) { +luring_detach_aio_context(ctx->linux_io_uring, ctx); +luring_cleanup(ctx->linux_io_uring); +ctx->linux_io_uring = NULL; +} +#endif + assert(QSLIST_EMPTY(&ctx->scheduled_coroutines)); qemu_bh_delete(ctx->co_schedule_bh); @@ -341,6 +349,29 @@ LinuxAioState *aio_get_linux_aio(AioContext *ctx) } #endif +#ifdef CONFIG_LINUX_IO_URING +LuringState *aio_setup_linux_io_uring(AioContext *ctx, Error **errp) +{ +if (ctx->linux_io_uring) { +return ctx->linux_io_uring; +} + +ctx->linux_io_uring = luring_init(errp); +if (!ctx->linux_io_uring) { +return NULL; +} + +luring_attach_aio_context(ctx->linux_io_uring, ctx); +return ctx->linux_io_uring; +} + +LuringState *aio_get_linux_io_uring(AioContext *ctx) +{ +assert(ctx->linux_io_uring); +return ctx->linux_io_uring; +} +#endif + void aio_notify(AioContext *ctx) { /* Write e.g. bh->scheduled before reading ctx->notify_me. Pairs @@ -432,6 +463,11 @@ AioContext *aio_context_new(Error **errp) #ifdef CONFIG_LINUX_AIO ctx->linux_aio = NULL; #endif + +#ifdef CONFIG_LINUX_IO_URING +ctx->linux_io_uring = NULL; +#endif + ctx->thread_pool = NULL; qemu_rec_mutex_init(&ctx->lock); timerlistgroup_init(&ctx->tlg, aio_timerlist_notify, ctx); -- 2.17.1
[Qemu-devel] [PATCH v3 5/8] stubs: add stubs for io_uring interface
Signed-off-by: Aarushi Mehta --- MAINTAINERS | 1 + stubs/Makefile.objs | 1 + stubs/io_uring.c| 32 3 files changed, 34 insertions(+) create mode 100644 stubs/io_uring.c diff --git a/MAINTAINERS b/MAINTAINERS index 462c00a021..6c6672bda3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2510,6 +2510,7 @@ R: Stefan Hajnoczi L: qemu-bl...@nongnu.org S: Maintained F: block/io_uring.c +F: stubs/io_uring.c qcow2 M: Kevin Wolf diff --git a/stubs/Makefile.objs b/stubs/Makefile.objs index 73452ad265..ea158cf0ee 100644 --- a/stubs/Makefile.objs +++ b/stubs/Makefile.objs @@ -13,6 +13,7 @@ stub-obj-y += iothread.o stub-obj-y += iothread-lock.o stub-obj-y += is-daemonized.o stub-obj-$(CONFIG_LINUX_AIO) += linux-aio.o +stub-obj-$(CONFIG_LINUX_IO_URING) += io_uring.o stub-obj-y += machine-init-done.o stub-obj-y += migr-blocker.o stub-obj-y += change-state-handler.o diff --git a/stubs/io_uring.c b/stubs/io_uring.c new file mode 100644 index 0000000000..622d1e4648 --- /dev/null +++ b/stubs/io_uring.c @@ -0,0 +1,32 @@ +/* + * Linux io_uring support. + * + * Copyright (C) 2009 IBM, Corp. + * Copyright (C) 2009 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ +#include "qemu/osdep.h" +#include "block/aio.h" +#include "block/raw-aio.h" + +void luring_detach_aio_context(LuringState *s, AioContext *old_context) +{ +abort(); +} + +void luring_attach_aio_context(LuringState *s, AioContext *new_context) +{ +abort(); +} + +LuringState *luring_init(Error **errp) +{ +abort(); +} + +void luring_cleanup(LuringState *s) +{ +abort(); +} -- 2.17.1
[Qemu-devel] [PATCH v3 7/8] blockdev: accept io_uring as option
Signed-off-by: Aarushi Mehta Reviewed-by: Stefan Hajnoczi --- blockdev.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/blockdev.c b/blockdev.c index 79fbac8450..b44b9d660d 100644 --- a/blockdev.c +++ b/blockdev.c @@ -386,6 +386,8 @@ static void extract_common_blockdev_options(QemuOpts *opts, int *bdrv_flags, if ((aio = qemu_opt_get(opts, "aio")) != NULL) { if (!strcmp(aio, "native")) { *bdrv_flags |= BDRV_O_NATIVE_AIO; +} else if (!strcmp(aio, "io_uring")) { +*bdrv_flags |= BDRV_O_IO_URING; } else if (!strcmp(aio, "threads")) { /* this is the default */ } else { @@ -4547,7 +4549,7 @@ QemuOptsList qemu_common_drive_opts = { },{ .name = "aio", .type = QEMU_OPT_STRING, -.help = "host AIO implementation (threads, native)", +.help = "host AIO implementation (threads, native, io_uring)", },{ .name = BDRV_OPT_CACHE_WB, .type = QEMU_OPT_BOOL, -- 2.17.1
[Qemu-devel] [PATCH v3 4/8] block/io_uring: implements interfaces for io_uring
Signed-off-by: Aarushi Mehta --- We need nested loops in ioq_submit() because overflowed requests may be allowed to submit once existing ones have been cleared. Hence, a failure to fulfill an overflow request must break out separately from normal submission. For now, to prevent infinite loops, we break whenever the kernel fails to submit for any reason (i.e. when the number of submissions is zero). This has now been tested by booting a Kali image with trace events enabled to confirm that io_uring is actually running. The initramfs boots switched to threads. MAINTAINERS | 7 + block/Makefile.objs | 3 + block/io_uring.c| 301 include/block/aio.h | 16 ++- include/block/raw-aio.h | 15 ++ 5 files changed, 341 insertions(+), 1 deletion(-) create mode 100644 block/io_uring.c diff --git a/MAINTAINERS b/MAINTAINERS index 3cacd751bf..462c00a021 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2504,6 +2504,13 @@ F: block/file-posix.c F: block/file-win32.c F: block/win32-aio.c +Linux io_uring +M: Aarushi Mehta +R: Stefan Hajnoczi +L: qemu-bl...@nongnu.org +S: Maintained +F: block/io_uring.c + qcow2 M: Kevin Wolf M: Max Reitz diff --git a/block/Makefile.objs b/block/Makefile.objs index 7a81892a52..348a003af5 100644 --- a/block/Makefile.objs +++ b/block/Makefile.objs @@ -18,6 +18,7 @@ block-obj-y += block-backend.o snapshot.o qapi.o block-obj-$(CONFIG_WIN32) += file-win32.o win32-aio.o block-obj-$(CONFIG_POSIX) += file-posix.o block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o +block-obj-$(CONFIG_LINUX_IO_URING) += io_uring.o block-obj-y += null.o mirror.o commit.o io.o create.o block-obj-y += throttle-groups.o block-obj-$(CONFIG_LINUX) += nvme.o @@ -61,5 +62,7 @@ block-obj-$(if $(CONFIG_LZFSE),m,n) += dmg-lzfse.o dmg-lzfse.o-libs := $(LZFSE_LIBS) qcow.o-libs:= -lz linux-aio.o-libs := -laio +io_uring.o-cflags := $(LINUX_IO_URING_CFLAGS) +io_uring.o-libs:= $(LINUX_IO_URING_LIBS) parallels.o-cflags := $(LIBXML2_CFLAGS) parallels.o-libs := $(LIBXML2_LIBS) diff --git a/block/io_uring.c b/block/io_uring.c new file mode 100644 index 0000000000..2a8c48a7dc 
--- /dev/null +++ b/block/io_uring.c @@ -0,0 +1,301 @@ +/* + * Linux io_uring support. + * + * Copyright (C) 2009 IBM, Corp. + * Copyright (C) 2009 Red Hat, Inc. + * Copyright (C) 2019 Aarushi Mehta + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ +#include "qemu/osdep.h" +#include <liburing.h> +#include "qemu-common.h" +#include "block/aio.h" +#include "qemu/queue.h" +#include "block/block.h" +#include "block/raw-aio.h" +#include "qemu/coroutine.h" +#include "qapi/error.h" + +#define MAX_EVENTS 128 + +typedef struct LuringAIOCB { +BlockAIOCB common; +Coroutine *co; +struct io_uring_sqe sqeq; +int ret; +QSIMPLEQ_ENTRY(LuringAIOCB) next; +} LuringAIOCB; + +typedef struct LuringQueue { +int plugged; +unsigned int in_queue; +unsigned int in_flight; +bool blocked; +QSIMPLEQ_HEAD(, LuringAIOCB) sq_overflow; +} LuringQueue; + +typedef struct LuringState { +AioContext *aio_context; + +struct io_uring ring; + +/* io queue for submit at batch. Protected by AioContext lock. */ +LuringQueue io_q; + +/* I/O completion processing. Only runs in I/O thread. */ +QEMUBH *completion_bh; +} LuringState; + +static void ioq_submit(LuringState *s); + +static inline int io_cqe_ret(struct io_uring_cqe *cqe) +{ +return cqe->res; +} + +/** + * qemu_luring_process_completions: + * @s: AIO state + * + * Fetches completed I/O requests, consumes cqes and invokes their callbacks. + * + */ +static void qemu_luring_process_completions(LuringState *s) +{ +struct io_uring_cqe *cqes; +/* + * Request completion callbacks can run the nested event loop. + * Schedule ourselves so the nested event loop will "see" remaining + * completed requests and process them. Without this, completion + * callbacks that wait for other requests using a nested event loop + * would hang forever. 
+ */ +qemu_bh_schedule(s->completion_bh); + +while (!io_uring_peek_cqe(&s->ring, &cqes)) { +io_uring_cqe_seen(&s->ring, cqes); + +LuringAIOCB *luringcb = io_uring_cqe_get_data(cqes); +luringcb->ret = io_cqe_ret(cqes); +if (luringcb->co) { +/* + * If the coroutine is already entered it must be in ioq_submit() + * and will notice luringcb->ret has been filled in when it + * eventually runs later. Coroutines cannot be entered recursively + * so avoid doing that! + */ +if (!qemu_coroutine_entered(luringcb->co)) { +aio_co_wake(luringcb->co); +} +} else { +luringcb->common.cb(luringcb->common.opaque, luring
[Qemu-devel] [PATCH v3 2/8] qapi/block-core: add option for io_uring
Signed-off-by: Aarushi Mehta --- qapi/block-core.json | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/qapi/block-core.json b/qapi/block-core.json index 7ccbfff9d0..2773803890 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -2776,11 +2776,13 @@ # # @threads: Use qemu's thread pool # @native: Use native AIO backend (only Linux and Windows) +# @io_uring:Use linux io_uring (only Linux) # -# Since: 2.9 +# Since: 2.9 @io_uring Since: 4.1 ## { 'enum': 'BlockdevAioOptions', - 'data': [ 'threads', 'native' ] } + 'data': [ 'threads', 'native', +{ 'name': 'io_uring', 'if': 'defined(CONFIG_LINUX_IO_URING)' } ] } ## # @BlockdevCacheOptions: -- 2.17.1
[Qemu-devel] [PATCH v3 3/8] block/block: add BDRV flag for io_uring
Signed-off-by: Aarushi Mehta --- include/block/block.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/block/block.h b/include/block/block.h index 9b083e2bca..60f7c6c01c 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -121,6 +121,7 @@ typedef struct HDGeometry { ignoring the format layer */ #define BDRV_O_NO_IO 0x10000 /* don't initialize for I/O */ #define BDRV_O_AUTO_RDONLY 0x20000 /* degrade to read-only if opening read-write fails */ +#define BDRV_O_IO_URING 0x40000 /* use io_uring instead of the thread pool */ #define BDRV_O_CACHE_MASK (BDRV_O_NOCACHE | BDRV_O_NO_FLUSH) -- 2.17.1