[Qemu-devel] [PATCH v4 07/13] test-char: split char_udp_test
makes it possible to test the existing chardev-udp Signed-off-by: Anton Nefedov Reviewed-by: Marc-André Lureau --- tests/test-char.c | 57 ++- 1 file changed, 40 insertions(+), 17 deletions(-) diff --git a/tests/test-char.c b/tests/test-char.c index 90b1f50..1f0de25 100644 --- a/tests/test-char.c +++ b/tests/test-char.c @@ -409,16 +409,11 @@ static void char_pipe_test(void) } #endif -static void char_udp_test(void) +static int make_udp_socket(int *port) { -struct sockaddr_in addr = { 0, }, other; -SocketIdleData d = { 0, }; -Chardev *chr; -CharBackend be; +struct sockaddr_in addr = { 0, }; socklen_t alen = sizeof(addr); int ret, sock = qemu_socket(PF_INET, SOCK_DGRAM, 0); -char buf[10]; -char *tmp; g_assert_cmpint(sock, >, 0); addr.sin_family = AF_INET ; @@ -429,19 +424,41 @@ static void char_udp_test(void) ret = getsockname(sock, (struct sockaddr *)&addr, &alen); g_assert_cmpint(ret, ==, 0); -tmp = g_strdup_printf("udp:127.0.0.1:%d", - ntohs(addr.sin_port)); -chr = qemu_chr_new("client", tmp); -g_assert_nonnull(chr); +*port = ntohs(addr.sin_port); +return sock; +} + +static void char_udp_test_internal(Chardev *reuse_chr, int sock) +{ +struct sockaddr_in other; +SocketIdleData d = { 0, }; +Chardev *chr; +CharBackend *be; +socklen_t alen = sizeof(other); +int ret; +char buf[10]; +char *tmp = NULL; + +if (reuse_chr) { +chr = reuse_chr; +be = chr->be; +} else { +int port; +sock = make_udp_socket(&port); +tmp = g_strdup_printf("udp:127.0.0.1:%d", port); +chr = qemu_chr_new("client", tmp); +g_assert_nonnull(chr); + +be = g_alloca(sizeof(CharBackend)); +qemu_chr_fe_init(be, chr, &error_abort); +} d.chr = chr; -qemu_chr_fe_init(&be, chr, &error_abort); -qemu_chr_fe_set_handlers(&be, socket_can_read_hello, socket_read_hello, +qemu_chr_fe_set_handlers(be, socket_can_read_hello, socket_read_hello, NULL, NULL, &d, NULL, true); ret = qemu_chr_write_all(chr, (uint8_t *)"hello", 5); g_assert_cmpint(ret, ==, 5); -alen = sizeof(addr); ret = recvfrom(sock, buf, sizeof(buf), 0, (struct sockaddr *)&other, &alen); g_assert_cmpint(ret, ==, 5); @@ -450,10 +467,16 @@ static void char_udp_test(void) main_loop(); -close(sock); +if (!reuse_chr) { +close(sock); +qemu_chr_fe_deinit(be, true); +} g_free(tmp); -qemu_chr_fe_deinit(&be); -object_unparent(OBJECT(chr)); +} + +static void char_udp_test(void) +{ +char_udp_test_internal(NULL, 0); } #ifdef HAVE_CHARDEV_SERIAL -- 2.7.4
[Qemu-devel] [PATCH v4 04/13] char: forbid direct chardevice access for hotswap devices
qemu_chr_fe_get_driver() is unsafe, frontends with hotswap support should not access CharDriver ptr directly as CharDriver might change. Signed-off-by: Anton Nefedov Reviewed-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Marc-André Lureau --- include/chardev/char-fe.h | 10 ++ chardev/char-fe.c | 7 +++ 2 files changed, 17 insertions(+) diff --git a/include/chardev/char-fe.h b/include/chardev/char-fe.h index 0314870..385aa99 100644 --- a/include/chardev/char-fe.h +++ b/include/chardev/char-fe.h @@ -63,10 +63,20 @@ bool qemu_chr_fe_connect(CharBackend *b, Chardev *s, Error **errp); * * Returns the driver associated with a CharBackend or NULL if no * associated Chardev. + * Note: avoid this function as the driver should never be accessed directly, + * especially by the frontends that support chardevice hotswap. + * Consider qemu_chr_fe_backend_connected() to check for driver existence */ Chardev *qemu_chr_fe_get_driver(CharBackend *be); /** + * @qemu_chr_fe_backend_connected: + * + * Returns true if there is a chardevice associated with @be. + */ +bool qemu_chr_fe_backend_connected(CharBackend *be); + +/** * @qemu_chr_fe_set_handlers: * @b: a CharBackend * @fd_can_read: callback to get the amount of data the frontend may diff --git a/chardev/char-fe.c b/chardev/char-fe.c index 86a878b..be96fb5 100644 --- a/chardev/char-fe.c +++ b/chardev/char-fe.c @@ -183,9 +183,16 @@ void qemu_chr_fe_printf(CharBackend *be, const char *fmt, ...) Chardev *qemu_chr_fe_get_driver(CharBackend *be) { +/* this is unsafe for the users that support chardev hotswap */ +assert(be->chr_be_change == NULL); return be->chr; } +bool qemu_chr_fe_backend_connected(CharBackend *be) +{ +return !!be->chr; +} + bool qemu_chr_fe_connect(CharBackend *b, Chardev *s, Error **errp) { int tag = 0; -- 2.7.4
[Qemu-devel] [PATCH v4 03/13] char: chardevice hotswap
This patch adds a possibility to change a char device without a frontend removal. 1. Ideally, it would have to happen transparently to a frontend, i.e. frontend would continue its regular operation. However, backends are not stateless and are set up by the frontends via qemu_chr_fe_<> functions, and it's not (generally) possible to replay that setup entirely in a backend code, as different chardevs respond to the setup calls differently, so do frontends work differently basing on those setup responses. Moreover, some frontend can generally get and save the backend pointer (qemu_chr_fe_get_driver()), and it will become invalid after backend change. So, a frontend which would like to support chardev hotswap has to register a "backend change" handler, and redo its backend setup there. 2. Write path can be used by multiple threads and thus protected with chr_write_lock. So hotswap also has to be protected so write functions won't access a backend being replaced. Signed-off-by: Anton Nefedov Reviewed-by: Vladimir Sementsov-Ogievskiy --- include/chardev/char-fe.h | 17 ++ include/chardev/char.h| 9 + chardev/char-fe.c | 33 +- chardev/char.c| 86 +++ qapi-schema.json | 40 ++ 5 files changed, 176 insertions(+), 9 deletions(-) diff --git a/include/chardev/char-fe.h b/include/chardev/char-fe.h index 9f38060..0314870 100644 --- a/include/chardev/char-fe.h +++ b/include/chardev/char-fe.h @@ -10,6 +10,12 @@ typedef int BackendChangeHandler(void *opaque); * Chardev */ struct CharBackend { Chardev *chr; +/* chr_lock is used to ensure thread-safety of operations with + * the associated Chardev. + * There is no guarantee otherwise, generally, that *chr won't become + * invalid. + */ +QemuMutex chr_lock; IOEventHandler *chr_event; IOCanReadHandler *chr_can_read; IOReadHandler *chr_read; @@ -42,6 +48,17 @@ bool qemu_chr_fe_init(CharBackend *b, Chardev *s, Error **errp); void qemu_chr_fe_deinit(CharBackend *b, bool del); /** + * @qemu_chr_fe_connect: + * + * Connects the already initialized front end with a given Chardev. + * Call qemu_chr_fe_deinit() to remove the association and + * release the driver. + * + * Returns: false on error. + */ +bool qemu_chr_fe_connect(CharBackend *b, Chardev *s, Error **errp); + +/** * @qemu_chr_fe_get_driver: * * Returns the driver associated with a CharBackend or NULL if no diff --git a/include/chardev/char.h b/include/chardev/char.h index 8a9ade4..22fd734 100644 --- a/include/chardev/char.h +++ b/include/chardev/char.h @@ -93,6 +93,15 @@ void qemu_chr_parse_common(QemuOpts *opts, ChardevCommon *backend); Chardev *qemu_chr_new(const char *label, const char *filename); /** + * @qemu_chr_change: + * + * Change an existing character backend + * + * @opts the new backend options + */ +void qemu_chr_change(QemuOpts *opts, Error **errp); + +/** * @qemu_chr_cleanup: * * Delete all chardevs (when leaving qemu) diff --git a/chardev/char-fe.c b/chardev/char-fe.c index 7054863..86a878b 100644 --- a/chardev/char-fe.c +++ b/chardev/char-fe.c @@ -33,24 +33,28 @@ int qemu_chr_fe_write(CharBackend *be, const uint8_t *buf, int len) { -Chardev *s = be->chr; +int ret = 0; -if (!s) { -return 0; +qemu_mutex_lock(&be->chr_lock); +if (be->chr) { +ret = qemu_chr_write(be->chr, buf, len, false); } +qemu_mutex_unlock(&be->chr_lock); -return qemu_chr_write(s, buf, len, false); +return ret; } int qemu_chr_fe_write_all(CharBackend *be, const uint8_t *buf, int len) { -Chardev *s = be->chr; +int ret = 0; -if (!s) { -return 0; +qemu_mutex_lock(&be->chr_lock); +if (be->chr) { +ret = qemu_chr_write(be->chr, buf, len, true); } +qemu_mutex_unlock(&be->chr_lock); -return qemu_chr_write(s, buf, len, true); +return ret; } int qemu_chr_fe_read_all(CharBackend *be, uint8_t *buf, int len) @@ -182,7 +186,7 @@ Chardev *qemu_chr_fe_get_driver(CharBackend *be) return be->chr; } -bool qemu_chr_fe_init(CharBackend *b, Chardev *s, Error **errp) +bool qemu_chr_fe_connect(CharBackend *b, Chardev *s, Error **errp) { int tag = 0; @@ -211,6 +215,16 @@ unavailable: return false; } +bool qemu_chr_fe_init(CharBackend *b, Chardev *s, Error **errp) +{ +if (!qemu_chr_fe_connect(b, s, errp)) { +return false; +} + +qemu_mutex_init(&b->chr_lock); +return true; +} + void qemu_chr_fe_deinit(CharBackend *b, bool del) { assert(b); @@ -228,6 +242,7 @@ void qemu_chr_fe_deinit(CharBackend *b, bool del) object_unparent(OBJECT(b->chr)); } b->chr = NULL; +qemu_mutex_destroy(&b->chr_lock); } } diff --git a/chardev/char.c b/char
[Qemu-devel] [PATCH v4 10/13] hmp: add hmp analogue for qmp-chardev-change
Signed-off-by: Anton Nefedov Reviewed-by: Vladimir Sementsov-Ogievskiy CC: Dr. David Alan Gilbert --- include/chardev/char.h | 10 ++ hmp.h | 1 + chardev/char.c | 2 +- hmp-commands.hx| 18 +- hmp.c | 34 ++ tests/test-hmp.c | 1 + 6 files changed, 64 insertions(+), 2 deletions(-) diff --git a/include/chardev/char.h b/include/chardev/char.h index 22fd734..1604ea9 100644 --- a/include/chardev/char.h +++ b/include/chardev/char.h @@ -81,6 +81,16 @@ Chardev *qemu_chr_new_from_opts(QemuOpts *opts, void qemu_chr_parse_common(QemuOpts *opts, ChardevCommon *backend); /** + * @qemu_chr_parse_opts: + * + * Parse the options to the ChardevBackend struct. + * + * Returns: a new backend or NULL on error + */ +ChardevBackend *qemu_chr_parse_opts(QemuOpts *opts, +Error **errp); + +/** * @qemu_chr_new: * * Create a new character backend from a URI. diff --git a/hmp.h b/hmp.h index d8b94ce..23e035c 100644 --- a/hmp.h +++ b/hmp.h @@ -102,6 +102,7 @@ void hmp_nbd_server_start(Monitor *mon, const QDict *qdict); void hmp_nbd_server_add(Monitor *mon, const QDict *qdict); void hmp_nbd_server_stop(Monitor *mon, const QDict *qdict); void hmp_chardev_add(Monitor *mon, const QDict *qdict); +void hmp_chardev_change(Monitor *mon, const QDict *qdict); void hmp_chardev_remove(Monitor *mon, const QDict *qdict); void hmp_qemu_io(Monitor *mon, const QDict *qdict); void hmp_cpu_add(Monitor *mon, const QDict *qdict); diff --git a/chardev/char.c b/chardev/char.c index 6b2cb7b..8e8b881 100644 --- a/chardev/char.c +++ b/chardev/char.c @@ -567,7 +567,7 @@ static const char *chardev_alias_translate(const char *name) return name; } -static ChardevBackend *qemu_chr_parse_opts(QemuOpts *opts, Error **errp) +ChardevBackend *qemu_chr_parse_opts(QemuOpts *opts, Error **errp) { Error *local_err = NULL; const ChardevClass *cc; diff --git a/hmp-commands.hx b/hmp-commands.hx index e763606..2cad758 100644 --- a/hmp-commands.hx +++ b/hmp-commands.hx @@ -1724,7 +1724,23 @@ ETEXI STEXI @item chardev-add args @findex chardev-add -chardev_add accepts the same parameters as the -chardev command line switch. +chardev-add accepts the same parameters as the -chardev command line switch. + +ETEXI + +{ +.name = "chardev-change", +.args_type = "id:s,args:s", +.params = "id args", +.help = "change chardev", +.cmd= hmp_chardev_change, +}, + +STEXI +@item chardev-change args +@findex chardev-change +chardev-change accepts existing chardev @var{id} and then the same arguments +as the -chardev command line switch (except for "id"). ETEXI diff --git a/hmp.c b/hmp.c index 8c72c58..91e4317 100644 --- a/hmp.c +++ b/hmp.c @@ -2225,6 +2225,40 @@ void hmp_chardev_add(Monitor *mon, const QDict *qdict) hmp_handle_error(mon, &err); } +void hmp_chardev_change(Monitor *mon, const QDict *qdict) +{ +const char *args = qdict_get_str(qdict, "args"); +const char *id; +Error *err = NULL; +ChardevBackend *backend = NULL; +ChardevReturn *ret = NULL; +QemuOpts *opts = qemu_opts_parse_noisily(qemu_find_opts("chardev"), args, + true); +if (!opts) { +error_setg(&err, "Parsing chardev args failed"); +goto end; +} + +id = qdict_get_str(qdict, "id"); +if (qemu_opts_id(opts)) { +error_setg(&err, "Unexpected 'id' parameter"); +goto end; +} + +backend = qemu_chr_parse_opts(opts, &err); +if (!backend) { +goto end; +} + +ret = qmp_chardev_change(id, backend, &err); + +end: +qapi_free_ChardevReturn(ret); +qapi_free_ChardevBackend(backend); +qemu_opts_del(opts); +hmp_handle_error(mon, &err); +} + void hmp_chardev_remove(Monitor *mon, const QDict *qdict) { Error *local_err = NULL; diff --git a/tests/test-hmp.c b/tests/test-hmp.c index 99e35ec..299df19 100644 --- a/tests/test-hmp.c +++ b/tests/test-hmp.c @@ -22,6 +22,7 @@ static int verbose; static const char *hmp_cmds[] = { "boot_set ndc", "chardev-add null,id=testchardev1", +"chardev-change testchardev1 ringbuf", "chardev-remove testchardev1", "commit all", "cpu-add 1", -- 2.7.4
[Qemu-devel] [PATCH v4 01/13] char: move QemuOpts->ChardevBackend translation to a separate func
parse function will be used by the following patch Signed-off-by: Anton Nefedov --- chardev/char.c | 81 -- 1 file changed, 51 insertions(+), 30 deletions(-) diff --git a/chardev/char.c b/chardev/char.c index bcfc065..f8817c5 100644 --- a/chardev/char.c +++ b/chardev/char.c @@ -556,17 +556,23 @@ help_string_append(const char *name, void *opaque) g_string_append_printf(str, "\n%s", name); } -Chardev *qemu_chr_new_from_opts(QemuOpts *opts, -Error **errp) +static const char *chardev_alias_translate(const char *name) +{ +int i; +for (i = 0; i < (int)ARRAY_SIZE(chardev_alias_table); i++) { +if (g_strcmp0(chardev_alias_table[i].alias, name) == 0) { +return chardev_alias_table[i].typename; +} +} +return name; +} + +static ChardevBackend *qemu_chr_parse_opts(QemuOpts *opts, Error **errp) { Error *local_err = NULL; const ChardevClass *cc; -Chardev *chr; -int i; ChardevBackend *backend = NULL; -const char *name = qemu_opt_get(opts, "backend"); -const char *id = qemu_opts_id(opts); -char *bid = NULL; +const char *name = chardev_alias_translate(qemu_opt_get(opts, "backend")); if (name == NULL) { error_setg(errp, "chardev: \"%s\" missing backend", @@ -574,7 +580,40 @@ Chardev *qemu_chr_new_from_opts(QemuOpts *opts, return NULL; } -if (is_help_option(name)) { +cc = char_get_class(name, errp); +if (cc == NULL) { +return NULL; +} + +backend = g_new0(ChardevBackend, 1); +backend->type = CHARDEV_BACKEND_KIND_NULL; + +if (cc->parse) { +cc->parse(opts, backend, &local_err); +if (local_err) { +error_propagate(errp, local_err); +qapi_free_ChardevBackend(backend); +return NULL; +} +} else { +ChardevCommon *ccom = g_new0(ChardevCommon, 1); +qemu_chr_parse_common(opts, ccom); +backend->u.null.data = ccom; /* Any ChardevCommon member would work */ +} + +return backend; +} + +Chardev *qemu_chr_new_from_opts(QemuOpts *opts, Error **errp) +{ +const ChardevClass *cc; +Chardev *chr = NULL; +ChardevBackend *backend = NULL; +const char *name = chardev_alias_translate(qemu_opt_get(opts, "backend")); +const char *id = qemu_opts_id(opts); +char *bid = NULL; + +if (name && is_help_option(name)) { GString *str = g_string_new(""); chardev_name_foreach(help_string_append, str); @@ -589,38 +628,20 @@ Chardev *qemu_chr_new_from_opts(QemuOpts *opts, return NULL; } -for (i = 0; i < (int)ARRAY_SIZE(chardev_alias_table); i++) { -if (g_strcmp0(chardev_alias_table[i].alias, name) == 0) { -name = chardev_alias_table[i].typename; -break; -} +backend = qemu_chr_parse_opts(opts, errp); +if (backend == NULL) { +return NULL; } cc = char_get_class(name, errp); if (cc == NULL) { -return NULL; +goto out; } -backend = g_new0(ChardevBackend, 1); -backend->type = CHARDEV_BACKEND_KIND_NULL; - if (qemu_opt_get_bool(opts, "mux", 0)) { bid = g_strdup_printf("%s-base", id); } -chr = NULL; -if (cc->parse) { -cc->parse(opts, backend, &local_err); -if (local_err) { -error_propagate(errp, local_err); -goto out; -} -} else { -ChardevCommon *ccom = g_new0(ChardevCommon, 1); -qemu_chr_parse_common(opts, ccom); -backend->u.null.data = ccom; /* Any ChardevCommon member would work */ -} - chr = qemu_chardev_new(bid ? bid : id, object_class_get_name(OBJECT_CLASS(cc)), backend, errp); -- 2.7.4
[Qemu-devel] [PATCH v4 13/13] serial: chardev hotswap support
for a backend change, a number of ioctls has to be replayed to sync the current setup of a frontend to a backend tty. This is hopefully enough so we don't have to track, store and replay the whole original control byte sequence. Signed-off-by: Anton Nefedov Reviewed-by: Vladimir Sementsov-Ogievskiy CC: Michael S. Tsirkin CC: Paolo Bonzini --- hw/char/serial.c | 32 ++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/hw/char/serial.c b/hw/char/serial.c index e47f0b6..9aec6c6 100644 --- a/hw/char/serial.c +++ b/hw/char/serial.c @@ -891,9 +891,37 @@ static void serial_reset(void *opaque) s->msr &= ~UART_MSR_ANY_DELTA; } +static int serial_be_change(void *opaque) +{ +SerialState *s = opaque; + +qemu_chr_fe_set_handlers(&s->chr, serial_can_receive1, serial_receive1, + serial_event, serial_be_change, s, NULL, true); + +serial_update_parameters(s); + +qemu_chr_fe_ioctl(&s->chr, CHR_IOCTL_SERIAL_SET_BREAK, + &s->last_break_enable); + +s->poll_msl = (s->ier & UART_IER_MSI) ? 1 : 0; +serial_update_msl(s); + +if (s->poll_msl >= 0 && !(s->mcr & UART_MCR_LOOP)) { +serial_update_tiocm(s); +} + +if (s->watch_tag > 0) { +g_source_remove(s->watch_tag); +s->watch_tag = qemu_chr_fe_add_watch(&s->chr, G_IO_OUT | G_IO_HUP, + serial_watch_cb, s); +} + +return 0; +} + void serial_realize_core(SerialState *s, Error **errp) { -if (!qemu_chr_fe_get_driver(&s->chr)) { +if (!qemu_chr_fe_backend_connected(&s->chr)) { error_setg(errp, "Can't create serial device, empty char device"); return; } @@ -904,7 +932,7 @@ void serial_realize_core(SerialState *s, Error **errp) qemu_register_reset(serial_reset, s); qemu_chr_fe_set_handlers(&s->chr, serial_can_receive1, serial_receive1, - serial_event, NULL, s, NULL, true); + serial_event, serial_be_change, s, NULL, true); fifo8_create(&s->recv_fifo, UART_FIFO_LENGTH); fifo8_create(&s->xmit_fifo, UART_FIFO_LENGTH); serial_reset(s); -- 2.7.4
[Qemu-devel] [PATCH v4 02/13] char: add backend hotswap handler
Frontends should have an interface to setup the handler of a backend change. The interface will be used in the next commits Signed-off-by: Anton Nefedov Reviewed-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Marc-André Lureau --- include/chardev/char-fe.h | 5 + backends/rng-egd.c | 2 +- chardev/char-fe.c | 4 +++- chardev/char-mux.c | 1 + gdbstub.c | 2 +- hw/arm/pxa2xx.c | 3 ++- hw/arm/strongarm.c | 2 +- hw/char/bcm2835_aux.c | 2 +- hw/char/cadence_uart.c | 2 +- hw/char/debugcon.c | 2 +- hw/char/digic-uart.c| 2 +- hw/char/escc.c | 2 +- hw/char/etraxfs_ser.c | 2 +- hw/char/exynos4210_uart.c | 2 +- hw/char/grlib_apbuart.c | 2 +- hw/char/imx_serial.c| 2 +- hw/char/ipoctal232.c| 2 +- hw/char/lm32_juart.c| 2 +- hw/char/lm32_uart.c | 2 +- hw/char/mcf_uart.c | 2 +- hw/char/milkymist-uart.c| 2 +- hw/char/pl011.c | 2 +- hw/char/sclpconsole-lm.c| 2 +- hw/char/sclpconsole.c | 2 +- hw/char/serial.c| 2 +- hw/char/sh_serial.c | 2 +- hw/char/spapr_vty.c | 2 +- hw/char/stm32f2xx_usart.c | 3 ++- hw/char/terminal3270.c | 2 +- hw/char/virtio-console.c| 4 ++-- hw/char/xen_console.c | 2 +- hw/char/xilinx_uartlite.c | 2 +- hw/ipmi/ipmi_bmc_extern.c | 2 +- hw/mips/boston.c| 2 +- hw/mips/mips_malta.c| 2 +- hw/misc/ivshmem.c | 2 +- hw/usb/ccid-card-passthru.c | 2 +- hw/usb/dev-serial.c | 2 +- hw/usb/redirect.c | 2 +- monitor.c | 4 ++-- net/colo-compare.c | 10 ++ net/filter-mirror.c | 6 +++--- net/slirp.c | 2 +- net/vhost-user.c| 7 --- qtest.c | 2 +- tests/test-char.c | 14 ++ tests/vhost-user-test.c | 2 +- 47 files changed, 76 insertions(+), 57 deletions(-) diff --git a/include/chardev/char-fe.h b/include/chardev/char-fe.h index 2cbb262..9f38060 100644 --- a/include/chardev/char-fe.h +++ b/include/chardev/char-fe.h @@ -4,6 +4,7 @@ #include "chardev/char.h" typedef void IOEventHandler(void *opaque, int event); +typedef int BackendChangeHandler(void *opaque); /* This is the backend as seen by frontend, the actual backend is * Chardev */ @@ -12,6 +13,7 @@ struct CharBackend { IOEventHandler *chr_event; IOCanReadHandler *chr_can_read; IOReadHandler *chr_read; +BackendChangeHandler *chr_be_change; void *opaque; int tag; int fe_open; @@ -54,6 +56,8 @@ Chardev *qemu_chr_fe_get_driver(CharBackend *be); * receive * @fd_read: callback to receive data from char * @fd_event: event callback + * @be_change: backend change callback; passing NULL means hot backend change + * is not supported and will not be attempted * @opaque: an opaque pointer for the callbacks * @context: a main loop context or NULL for the default * @set_open: whether to call qemu_chr_fe_set_open() implicitely when @@ -68,6 +72,7 @@ void qemu_chr_fe_set_handlers(CharBackend *b, IOCanReadHandler *fd_can_read, IOReadHandler *fd_read, IOEventHandler *fd_event, + BackendChangeHandler *be_change, void *opaque, GMainContext *context, bool set_open); diff --git a/backends/rng-egd.c b/backends/rng-egd.c index e7ce2ca..d2b9ce6 100644 --- a/backends/rng-egd.c +++ b/backends/rng-egd.c @@ -106,7 +106,7 @@ static void rng_egd_opened(RngBackend *b, Error **errp) /* FIXME we should resubmit pending requests when the CDS reconnects. */ qemu_chr_fe_set_handlers(&s->chr, rng_egd_chr_can_read, - rng_egd_chr_read, NULL, s, NULL, true); + rng_egd_chr_read, NULL, NULL, s, NULL, true); } static void rng_egd_set_chardev(Object *obj, const char *value, Error **errp) diff --git a/chardev/char-fe.c b/chardev/char-fe.c index 3f90f05..7054863 100644 --- a/chardev/char-fe.c +++ b/chardev/char-fe.c @@ -216,7 +216,7 @@ void qemu_chr_fe_deinit(CharBackend *b, bool del) assert(b); if (b->chr) { -qemu_chr_fe_set_handlers(b, NULL, NULL, NULL, NULL, NULL, true); +qemu_chr_fe_set_handlers(b, NULL, NULL, NULL, NULL, NULL, NULL, true); if (b->chr->be == b) { b->chr->be = NULL; } @@ -235,6 +235,7 @@ void qemu_chr_fe_set_handlers(CharBackend *b, IOCanReadHandler *fd_can_read, IOReadHandler *fd_read, IOEventHandler *fd_event, +
[Qemu-devel] [PATCH v4 05/13] char: avoid chardevice direct access
frontends should avoid accessing CharDriver struct where possible Signed-off-by: Anton Nefedov Reviewed-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Marc-André Lureau --- include/chardev/char-fe.h | 7 +++ chardev/char-fe.c | 5 + hw/arm/strongarm.c | 2 +- hw/char/cadence_uart.c | 2 +- hw/char/debugcon.c | 2 +- hw/char/escc.c | 6 +++--- hw/char/exynos4210_uart.c | 2 +- hw/char/grlib_apbuart.c | 2 +- hw/char/ipoctal232.c| 2 +- hw/char/parallel.c | 2 +- hw/char/sclpconsole-lm.c| 2 +- hw/char/sclpconsole.c | 2 +- hw/char/sh_serial.c | 2 +- hw/char/spapr_vty.c | 2 +- hw/char/terminal3270.c | 2 +- hw/char/xen_console.c | 2 +- hw/ipmi/ipmi_bmc_extern.c | 2 +- hw/misc/ivshmem.c | 4 ++-- hw/usb/ccid-card-passthru.c | 4 ++-- hw/usb/dev-serial.c | 5 ++--- hw/usb/redirect.c | 5 ++--- net/filter-mirror.c | 2 +- 22 files changed, 38 insertions(+), 28 deletions(-) diff --git a/include/chardev/char-fe.h b/include/chardev/char-fe.h index 385aa99..61c7f97 100644 --- a/include/chardev/char-fe.h +++ b/include/chardev/char-fe.h @@ -77,6 +77,13 @@ Chardev *qemu_chr_fe_get_driver(CharBackend *be); bool qemu_chr_fe_backend_connected(CharBackend *be); /** + * @qemu_chr_fe_backend_open: + * + * Returns true if chardevice associated with @be is open. + */ +bool qemu_chr_fe_backend_open(CharBackend *be); + +/** * @qemu_chr_fe_set_handlers: * @b: a CharBackend * @fd_can_read: callback to get the amount of data the frontend may diff --git a/chardev/char-fe.c b/chardev/char-fe.c index be96fb5..3120441 100644 --- a/chardev/char-fe.c +++ b/chardev/char-fe.c @@ -193,6 +193,11 @@ bool qemu_chr_fe_backend_connected(CharBackend *be) return !!be->chr; } +bool qemu_chr_fe_backend_open(CharBackend *be) +{ +return be->chr && be->chr->be_open; +} + bool qemu_chr_fe_connect(CharBackend *b, Chardev *s, Error **errp) { int tag = 0; diff --git a/hw/arm/strongarm.c b/hw/arm/strongarm.c index 1fa082c..6a45dcc 100644 --- a/hw/arm/strongarm.c +++ b/hw/arm/strongarm.c @@ -1106,7 +1106,7 @@ static void strongarm_uart_tx(void *opaque) if (s->utcr3 & UTCR3_LBM) /* loopback */ { strongarm_uart_receive(s, &s->tx_fifo[s->tx_start], 1); -} else if (qemu_chr_fe_get_driver(&s->chr)) { +} else if (qemu_chr_fe_backend_connected(&s->chr)) { /* XXX this blocks entire thread. Rewrite to use * qemu_chr_fe_write and background I/O callbacks */ qemu_chr_fe_write_all(&s->chr, &s->tx_fifo[s->tx_start], 1); diff --git a/hw/char/cadence_uart.c b/hw/char/cadence_uart.c index 3a9335c..6143494 100644 --- a/hw/char/cadence_uart.c +++ b/hw/char/cadence_uart.c @@ -279,7 +279,7 @@ static gboolean cadence_uart_xmit(GIOChannel *chan, GIOCondition cond, int ret; /* instant drain the fifo when there's no back-end */ -if (!qemu_chr_fe_get_driver(&s->chr)) { +if (!qemu_chr_fe_backend_connected(&s->chr)) { s->tx_count = 0; return FALSE; } diff --git a/hw/char/debugcon.c b/hw/char/debugcon.c index 9b56fb8..95ccec6 100644 --- a/hw/char/debugcon.c +++ b/hw/char/debugcon.c @@ -87,7 +87,7 @@ static const MemoryRegionOps debugcon_ops = { static void debugcon_realize_core(DebugconState *s, Error **errp) { -if (!qemu_chr_fe_get_driver(&s->chr)) { +if (!qemu_chr_fe_backend_connected(&s->chr)) { error_setg(errp, "Can't create debugcon device, empty char device"); return; } diff --git a/hw/char/escc.c b/hw/char/escc.c index 3546df3..89ae9eb 100644 --- a/hw/char/escc.c +++ b/hw/char/escc.c @@ -417,7 +417,7 @@ static void escc_update_parameters(ChannelState *s) int speed, parity, data_bits, stop_bits; QEMUSerialSetParams ssp; -if (!qemu_chr_fe_get_driver(&s->chr) || s->type != ser) +if (!qemu_chr_fe_backend_connected(&s->chr) || s->type != ser) return; if (s->wregs[W_TXCTRL1] & TXCTRL1_PAREN) { @@ -557,7 +557,7 @@ static void escc_mem_write(void *opaque, hwaddr addr, trace_escc_mem_writeb_data(CHN_C(s), val); s->tx = val; if (s->wregs[W_TXCTRL2] & TXCTRL2_TXEN) { // tx enabled -if (qemu_chr_fe_get_driver(&s->chr)) { +if (qemu_chr_fe_backend_connected(&s->chr)) { /* XXX this blocks entire thread. Rewrite to use * qemu_chr_fe_write and background I/O callbacks */ qemu_chr_fe_write_all(&s->chr, &s->tx, 1); @@ -1013,7 +1013,7 @@ static void escc_realize(DeviceState *dev, Error **errp) ESCC_SIZE << s->it_shift); for (i = 0; i < 2; i++) { -if (qemu_chr_fe_get_driver(&s
[Qemu-devel] [PATCH v4 00/13] chardevice hotswap
Changed in v4: - rebased on top of the latest chardev changes - remarks applied - patch 1 fixed so it works with alias names Changed in v3: - minor remarks to patch 1 applied - patch 3: avoid using bottom-half, handle syncronously As mentioned, it gets thing complicated and is only a problem for a monitor-connected chardev hotswap and that is not supported for now - tests added (patches 6-9) This serie is a v2 of the February submit http://lists.nongnu.org/archive/html/qemu-devel/2017-02/msg01989.html The interface is changed as requested and the changes are slightly reworked and split into separate patches. The patchset adds support of the character device change without a frontend device removal. Yet isa-serial and virtio-serial frontends are supported. The feature can be helpful for e.g. Windows debug allowing to establish connection to a live VM from VM with WinDbg. Anton Nefedov (13): char: move QemuOpts->ChardevBackend translation to a separate func char: add backend hotswap handler char: chardevice hotswap char: forbid direct chardevice access for hotswap devices char: avoid chardevice direct access test-char: unref chardev-udp after test test-char: split char_udp_test test-char: split char_file_test test-char: add hotswap test hmp: add hmp analogue for qmp-chardev-change virtio-console: chardev hotswap support serial: move TIOCM update to a separate function serial: chardev hotswap support include/chardev/char-fe.h | 39 +++ include/chardev/char.h | 19 hmp.h | 1 + backends/rng-egd.c | 2 +- chardev/char-fe.c | 49 +++-- chardev/char-mux.c | 1 + chardev/char.c | 167 +++- gdbstub.c | 2 +- hmp-commands.hx | 18 ++- hmp.c | 34 ++ hw/arm/pxa2xx.c | 3 +- hw/arm/strongarm.c | 4 +- hw/char/bcm2835_aux.c | 2 +- hw/char/cadence_uart.c | 4 +- hw/char/debugcon.c | 4 +- hw/char/digic-uart.c| 2 +- hw/char/escc.c | 8 +- hw/char/etraxfs_ser.c | 2 +- hw/char/exynos4210_uart.c | 4 +- hw/char/grlib_apbuart.c | 4 +- hw/char/imx_serial.c| 2 +- hw/char/ipoctal232.c| 4 +- hw/char/lm32_juart.c| 2 +- hw/char/lm32_uart.c | 2 +- hw/char/mcf_uart.c | 2 +- hw/char/milkymist-uart.c| 2 +- hw/char/parallel.c | 2 +- hw/char/pl011.c | 2 +- hw/char/sclpconsole-lm.c| 4 +- hw/char/sclpconsole.c | 4 +- hw/char/serial.c| 63 --- hw/char/sh_serial.c | 4 +- hw/char/spapr_vty.c | 4 +- hw/char/stm32f2xx_usart.c | 3 +- hw/char/terminal3270.c | 4 +- hw/char/virtio-console.c| 35 +- hw/char/xen_console.c | 4 +- hw/char/xilinx_uartlite.c | 2 +- hw/ipmi/ipmi_bmc_extern.c | 4 +- hw/mips/boston.c| 2 +- hw/mips/mips_malta.c| 2 +- hw/misc/ivshmem.c | 6 +- hw/usb/ccid-card-passthru.c | 6 +- hw/usb/dev-serial.c | 7 +- hw/usb/redirect.c | 7 +- monitor.c | 4 +- net/colo-compare.c | 10 +- net/filter-mirror.c | 8 +- net/slirp.c | 2 +- net/vhost-user.c| 7 +- qapi-schema.json| 40 +++ qtest.c | 2 +- tests/test-char.c | 263 +--- tests/test-hmp.c| 1 + tests/vhost-user-test.c | 2 +- 55 files changed, 685 insertions(+), 202 deletions(-) -- 2.7.4
Re: [Qemu-devel] [PATCH v3 03/13] char: chardevice hotswap
On 06/13/2017 03:32 PM, Marc-André Lureau wrote: Hi On Tue, Jun 13, 2017 at 3:53 PM Anton Nefedov mailto:anton.nefe...@virtuozzo.com>> wrote: > The existing chr_write_lock belongs to Chardev. > For the hotswap case, we need to ensure that be->chr won't change and > the old Chardev (together with its mutex) won't be destroyed while it's > used in the write functions. > > Maybe we could move the lock to CharBackend, instead of creating a new > one. But I guess this >1. won't work for mux, where multiple CharBackends share the same > Chardev >2. won't work for some chardevs, like pty which uses the lock for the > timer handler > > Sorry if I'm not explaining clearly enough or maybe I'm missing some > easier solution? > > > It looks to me like you would get the same guarantees by using the > chr_write_lock directly: > > @@ -1381,7 +1374,7 @@ ChardevReturn *qmp_chardev_change(const char *id, > ChardevBackend *backend, > closed_sent = true; > } > > -qemu_mutex_lock(&be->chr_lock); > +qemu_mutex_lock(&chr->chr_write_lock); > chr->be = NULL; > qemu_chr_fe_connect(be, chr_new, &error_abort); > > @@ -1389,14 +1382,14 @@ ChardevReturn *qmp_chardev_change(const char > *id, ChardevBackend *backend, > error_setg(errp, "Chardev '%s' change failed", chr_new->label); > chr_new->be = NULL; > qemu_chr_fe_connect(be, chr, &error_abort); > -qemu_mutex_unlock(&be->chr_lock); > +qemu_mutex_unlock(&chr->chr_write_lock); > if (closed_sent) { > qemu_chr_be_event(chr, CHR_EVENT_OPENED); > } > object_unref(OBJECT(chr_new)); > return NULL; > } > -qemu_mutex_unlock(&be->chr_lock); > +qemu_mutex_unlock(&chr->chr_write_lock); > > I wonder if we should rename 'chr_write_lock' to just 'lock' :) > hi, but isn't there a potential race? Assume thread-1 is somewhere in qemu_chr_write() and thread-2 is in qmp_chardev_change(). T2 can change the chardev, and destroy the old one while T1 still has the pointer ( = use after free). Or T1 unlocks chr_write_lock, T2 acquires that in qemu_chr_write_buffer(), then T1 destroys it together with the chardev ( = undefined behaviour). What I'm trying to say is that the critical section for the hotswap is bigger than what chr_write_lock currently covers - we also need to cover the be->chr pointer. Or am I missing something? Thanks for the detail, I think you are correct, and probably your solution is good. Another way would be to object_ref/unref the Chardev when using it in seperate thread, this way we wouldn't need an extra lock, I think. Paolo might be of good advice to get this right. We also need to protect be->chr pointer, not just the Chardev itself, so probably ref/unref won't be thread-safe, strictly speaking? e.g. fe_write(CharBackend *be) { ref(be->chr); ... unref(be->chr); // oops, be->chr might have changed } or fe_write(CharBackend *be) { Chardev *chr = be->chr; ref(chr); // not good either, chr might be destroyed already ... unref(chr); } -- Marc-André Lureau /Anton
Re: [Qemu-devel] [PATCH v3 03/13] char: chardevice hotswap
On 06/09/2017 05:53 PM, Marc-André Lureau wrote: On Thu, Jun 1, 2017 at 3:23 PM Anton Nefedov mailto:anton.nefe...@virtuozzo.com>> wrote: On 05/31/2017 10:20 PM, Marc-André Lureau wrote: > Hi > > On Tue, May 30, 2017 at 6:12 PM Anton Nefedov > mailto:anton.nefe...@virtuozzo.com> <mailto:anton.nefe...@virtuozzo.com <mailto:anton.nefe...@virtuozzo.com>>> wrote: > > This patch adds a possibility to change a char device without a frontend > removal. > > 1. Ideally, it would have to happen transparently to a frontend, i.e. > frontend would continue its regular operation. > However, backends are not stateless and are set up by the frontends > via qemu_chr_fe_<> functions, and it's not (generally) possible to > replay > that setup entirely in a backend code, as different chardevs respond > to the setup calls differently, so do frontends work differently basing > on those setup responses. > Moreover, some frontend can generally get and save the backend pointer > (qemu_chr_fe_get_driver()), and it will become invalid after backend > change. > > So, a frontend which would like to support chardev hotswap has to > register > a "backend change" handler, and redo its backend setup there. > > 2. Write path can be used by multiple threads and thus protected with > chr_write_lock. > So hotswap also has to be protected so write functions won't access > a backend being replaced. > > > Tbh, I don't understand the need for a different lock. Could you > explain? Even better would be to write a test that shows in which way > the lock helps. > hi Marc-André, The existing chr_write_lock belongs to Chardev. For the hotswap case, we need to ensure that be->chr won't change and the old Chardev (together with its mutex) won't be destroyed while it's used in the write functions. Maybe we could move the lock to CharBackend, instead of creating a new one. But I guess this 1. won't work for mux, where multiple CharBackends share the same Chardev 2. won't work for some chardevs, like pty which uses the lock for the timer handler Sorry if I'm not explaining clearly enough or maybe I'm missing some easier solution? It looks to me like you would get the same guarantees by using the chr_write_lock directly: @@ -1381,7 +1374,7 @@ ChardevReturn *qmp_chardev_change(const char *id, ChardevBackend *backend, closed_sent = true; } -qemu_mutex_lock(&be->chr_lock); +qemu_mutex_lock(&chr->chr_write_lock); chr->be = NULL; qemu_chr_fe_connect(be, chr_new, &error_abort); @@ -1389,14 +1382,14 @@ ChardevReturn *qmp_chardev_change(const char *id, ChardevBackend *backend, error_setg(errp, "Chardev '%s' change failed", chr_new->label); chr_new->be = NULL; qemu_chr_fe_connect(be, chr, &error_abort); -qemu_mutex_unlock(&be->chr_lock); +qemu_mutex_unlock(&chr->chr_write_lock); if (closed_sent) { qemu_chr_be_event(chr, CHR_EVENT_OPENED); } object_unref(OBJECT(chr_new)); return NULL; } -qemu_mutex_unlock(&be->chr_lock); +qemu_mutex_unlock(&chr->chr_write_lock); I wonder if we should rename 'chr_write_lock' to just 'lock' :) hi, but isn't there a potential race? Assume thread-1 is somewhere in qemu_chr_write() and thread-2 is in qmp_chardev_change(). T2 can change the chardev, and destroy the old one while T1 still has the pointer ( = use after free). Or T1 unlocks chr_write_lock, T2 acquires that in qemu_chr_write_buffer(), then T1 destroys it together with the chardev ( = undefined behaviour). What I'm trying to say is that the critical section for the hotswap is bigger than what chr_write_lock currently covers - we also need to cover the be->chr pointer. Or am I missing something? I can try to add a test but can't quite see yet how to freeze the old chardev somewhere in cc->chr_write() and hotswap it while it's there. > > > Signed-off-by: Anton Nefedov mailto:anton.nefe...@virtuozzo.com> > <mailto:anton.nefe...@virtuozzo.com <mailto:anton.nefe...@virtuozzo.com>>> > Reviewed-by: Vladimir Sementsov-Ogievskiy mailto:vsement...@virtuozzo.com> > <mailto:vsement...@virtuozzo.com <mailto:v
Re: [Qemu-devel] [PATCH v2 02/15] file-posix: support BDRV_REQ_ALLOCATE
On 06/01/2017 10:54 PM, Eric Blake wrote: On 06/01/2017 02:49 PM, Eric Blake wrote: On 06/01/2017 10:14 AM, Anton Nefedov wrote: Current write_zeroes implementation is good enough to satisfy this flag too Signed-off-by: Anton Nefedov --- block/file-posix.c | 9 - 1 file changed, 8 insertions(+), 1 deletion(-) Are we sure that fallocate() is always fast, or are there some file systems where it is no faster than manually writing zeroes? I'm worried that blindly claiming BDRV_REQ_ALLOCATE may fail if we encounter a libc not so much fail as in "break the guest", but fail as in "take far more time than we were expecting, pessimising our behavior to worse than if we had not tried the allocation at all" or kernel-based fallback that takes a slow patch on our behalf. I would expect such filesystems to not support fallocate. Though I must admit I can't see anywhere in the documentation that it MUST be strictly faster than writing zeroes; it would look very strange to me if there were a slowpath fallback somewhere past the libc. /Anton
Re: [Qemu-devel] [PATCH v2 03/15] blkdebug: support BDRV_REQ_ALLOCATE
On 06/01/2017 10:50 PM, Eric Blake wrote: On 06/01/2017 10:14 AM, Anton Nefedov wrote: Support the flag if the underlying BDS supports it Signed-off-by: Anton Nefedov --- block/blkdebug.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) Shouldn't other passthrough drivers (like raw-format.c) make this change as well? Right. Wonder why they even enumerate those instead of just bs->supported_zero_flags = bs->file->bs->supported_zero_flags; diff --git a/block/blkdebug.c b/block/blkdebug.c index a5196e8..8b1401b 100644 --- a/block/blkdebug.c +++ b/block/blkdebug.c @@ -415,7 +415,8 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags, bs->supported_write_flags = BDRV_REQ_FUA & bs->file->bs->supported_write_flags; -bs->supported_zero_flags = (BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) & +bs->supported_zero_flags = +(BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_ALLOCATE) & bs->file->bs->supported_zero_flags; ret = -EINVAL; /Anton
Re: [Qemu-devel] [PATCH v2 01/15] block: introduce BDRV_REQ_ALLOCATE flag
On 06/01/2017 10:07 PM, Eric Blake wrote: On 06/01/2017 10:14 AM, Anton Nefedov wrote: The flag is supposed to indicate that the region of the disk image has to be sufficiently allocated so it reads as zeroes. The call with the flag set has to return -ENOTSUP if allocation cannot be done efficiently (i.e. without falling back to writing actual buffers) Signed-off-by: Anton Nefedov --- block/io.c| 19 --- block/trace-events| 1 + include/block/block.h | 6 +- 3 files changed, 22 insertions(+), 4 deletions(-) You may want to 'git config diff.orderFile /path/to/file' (with a suitably populated file) so that .h files come first in your diffs, as that can aid reviewers. At one point, there was a thread about adding such a file to qemu.git proper for everyone to share, although it seems to have stalled. Thanks, will do diff --git a/block/io.c b/block/io.c index ed31810..d47efa9 100644 --- a/block/io.c +++ b/block/io.c @@ -1272,7 +1272,7 @@ static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, assert(!bs->supported_zero_flags); } -if (ret == -ENOTSUP) { +if (ret == -ENOTSUP && !(flags & BDRV_REQ_ALLOCATE)) { I'd feel MUCH better if you first fixed the conditional just above this point to ensure that if the caller requests BDRV_REQ_ALLOCATE that we do not call bdrv->bdrv_co_pwrite_zeroes() unless bs->supported_zero_flags also mentions this bit. Remember, the existing semantics of .bdrv_co_pwrite_zeroes() merely state that we must return -ENOTSUP unless we can guarantee that we read back as zeroes, but puts no timing constraints on it. A driver that has not been retrofitted to understand the BDRV_REQ_ALLOCATE flag will therefore risk taking too long. Using bs->supported_zero_flags as your gate is what will let you avoid calling into a driver that has not been audited for fitting the new contract. Absolutely; I have even added that check but must have lost that at some point. Meant to add that much earlier though, to bdrv_co_pwrite_zeroes() /* Fall back to bounce buffer if write zeroes is unsupported */ BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE; @@ -1355,8 +1355,8 @@ static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child, ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && -!(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes && -qemu_iovec_is_zero(qiov)) { +!(flags & BDRV_REQ_ZERO_WRITE) && !(flags & BDRV_REQ_ALLOCATE) && +drv->bdrv_co_pwrite_zeroes && qemu_iovec_is_zero(qiov)) { flags |= BDRV_REQ_ZERO_WRITE; if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) { flags |= BDRV_REQ_MAY_UNMAP; @@ -1436,6 +1436,9 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child, assert(flags & BDRV_REQ_ZERO_WRITE); if (head_padding_bytes || tail_padding_bytes) { +if (flags & BDRV_REQ_ALLOCATE) { +return -ENOTSUP; +} Can we assert that BDRV_REQ_ALLOCATE will only be supplied by a caller that is already using aligned values? Or is that too strict? as I understand the top driver should not care much about the child driver alignment preferences? that's what the common bdrv_* interface is there for buf = qemu_blockalign(bs, align); iov = (struct iovec) { .iov_base = buf, @@ -1534,6 +1537,11 @@ int coroutine_fn bdrv_co_pwritev(BdrvChild *child, return ret; } +if (qiov && flags & BDRV_REQ_ALLOCATE) { +/* allocation request with qiov provided doesn't make much sense */ +return -ENOTSUP; Should this be an assertion (bug in the program for mixing things that don't make sense) rather than just a runtime error return? incline to agree here +} + bdrv_inc_in_flight(bs); /* * Align write if necessary by performing a read-modify-write cycle. @@ -1665,6 +1673,11 @@ int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset, { trace_bdrv_co_pwrite_zeroes(child->bs, offset, count, flags); +if (flags & BDRV_REQ_MAY_UNMAP && flags & BDRV_REQ_ALLOCATE) { +/* nonsense */ +return -ENOTSUP; +} Ditto. yep + if (!(child->bs->open_flags & BDRV_O_UNMAP)) { flags &= ~BDRV_REQ_MAY_UNMAP; } diff --git a/block/trace-events b/block/trace-events index 9a71c7f..a15c2cc 100644 --- a/block/trace-events +++ b/block/trace-events @@ -15,6 +15,7 @@ bdrv_aio_writev(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs bdrv
[Qemu-devel] [PATCH v2 15/15] iotest 046: test simultaneous cluster write error case
Signed-off-by: Anton Nefedov --- tests/qemu-iotests/046 | 38 +- tests/qemu-iotests/046.out | 23 +++ 2 files changed, 60 insertions(+), 1 deletion(-) diff --git a/tests/qemu-iotests/046 b/tests/qemu-iotests/046 index f2ebecf..c210b55 100755 --- a/tests/qemu-iotests/046 +++ b/tests/qemu-iotests/046 @@ -29,7 +29,8 @@ status=1 # failure is the default! _cleanup() { - _cleanup_test_img +_cleanup_test_img +rm "$TEST_DIR/blkdebug.conf" } trap "_cleanup; exit \$status" 0 1 2 3 15 @@ -188,6 +189,37 @@ overlay_io | $QEMU_IO blkdebug::"$TEST_IMG" | _filter_qemu_io |\ sed -e 's/bytes at offset [0-9]*/bytes at offset XXX/g' echo +echo "== Concurrency error case ==" + +# 1. 1st request allocated the cluster, stop before it writes and updates L2 +# 2. 2nd request starts at the same cluster must complete write and start +#waiting for the 1st to update L2 +# 3. Resume the 1st request to make it fail (injected error) +# 4. 2nd request must wake and fail as well +#1 cluster will end up leaked +cat > "$TEST_DIR/blkdebug.conf" <
[Qemu-devel] [PATCH v2 14/15] qcow2: allow concurrent unaligned writes to the same clusters
If COW area of a write request to unallocated cluster is empty, concurrent write requests can be allowed with a little bit of extra synchronization; so they don't have to wait until L2 is filled. Let qcow2_cluster.c::handle_dependencies() do the most of the job: if there is an in-flight request to the same cluster, and the current request wants to write in its COW area, and its COW area is marked empty, - steal the allocated offset and write concurrently. Let the original request update L2 later when it likes. This gives an improvement for parallel misaligned writes to unallocated clusters with no backing data: HDD fio over xfs iodepth=4: seqwrite 4k: 18400 -> 22800 IOPS ( x1.24 ) seqwrite 68k: 1600 -> 2300 IOPS ( x1.44 ) Signed-off-by: Anton Nefedov Signed-off-by: Denis V. Lunev --- block/qcow2-cluster.c | 169 +++--- block/qcow2.c | 28 - block/qcow2.h | 12 +++- 3 files changed, 181 insertions(+), 28 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 2fa549d..03b2e6c 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -898,20 +898,32 @@ out: /* * Check if there already is an AIO write request in flight which allocates - * the same cluster. In this case we need to wait until the previous - * request has completed and updated the L2 table accordingly. + * the same cluster. + * In this case, check if that request has explicitly allowed to write + * in its COW area(s). + * If yes - fill the meta to point to the same cluster. + * If no - we need to wait until the previous request has completed and + *updated the L2 table accordingly or + *has allowed writing in its COW area(s). * Returns: * 0 if there was no dependency. *cur_bytes indicates the number of * bytes from guest_offset that can be read before the next * dependency must be processed (or the request is complete). - * *m is not modified + * *m, *host_offset are not modified + * + * 1 if there is a dependency but it is possible to write concurrently + * *m is filled accordingly, + * *cur_bytes may have decreased and describes + * the length of the area that can be written to, + * *host_offset contains the starting host image offset to write to * * -EAGAIN if we had to wait for another request. The caller - * must start over, so consider *cur_bytes undefined. + * must start over, so consider *cur_bytes and *host_offset undefined. * *m is not modified */ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset, -uint64_t *cur_bytes, QCowL2Meta **m) + uint64_t *host_offset, uint64_t *cur_bytes, + QCowL2Meta **m) { BDRVQcow2State *s = bs->opaque; QCowL2Meta *old_alloc; @@ -924,7 +936,7 @@ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset, const uint64_t old_start = l2meta_cow_start(old_alloc); const uint64_t old_end = l2meta_cow_end(old_alloc); -if (end <= old_start || start >= old_end) { +if (end <= old_start || start >= old_end || old_alloc->piggybacked) { /* No intersection */ continue; } @@ -936,21 +948,95 @@ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset, continue; } -/* Stop if an l2meta already exists. After yielding, it wouldn't - * be valid any more, so we'd have to clean up the old L2Metas - * and deal with requests depending on them before starting to - * gather new ones. Not worth the trouble. */ -if (*m) { +/* offsets of the cluster we're intersecting in */ +const uint64_t cluster_start = start_of_cluster(s, start); +const uint64_t cluster_end = cluster_start + s->cluster_size; + +const uint64_t old_data_start = old_start ++ old_alloc->cow_start.nb_bytes; +const uint64_t old_data_end = old_alloc->offset ++ old_alloc->cow_end.offset; + +const bool conflict_in_data_area = +end > old_data_start && start < old_data_end; +const bool conflict_in_old_cow_start = +/* 1). new write request area is before the old */ +start < old_data_start +&& /* 2). old request did not allow writing in its cow area */ +!old_alloc->cow_start.reduced; +const bool conflict_in_old_cow_end = +/* 1). new write request area is after the old */ +start > old_data_start +&& /* 2). old request did not allow writing in its cow area */ +!old_alloc->cow_end.reduced; + +if (conflict_in_data_a
[Qemu-devel] [PATCH v2 12/15] qcow2-cluster: slightly refactor handle_dependencies()
- assert the alignment on return if the allocation has to stop (at the start of a running allocation) - make use of const specifiers for local variables Signed-off-by: Anton Nefedov Signed-off-by: Denis V. Lunev --- block/qcow2-cluster.c | 20 +++- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 3dafd19..95e8862 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -900,15 +900,15 @@ out: * Check if there already is an AIO write request in flight which allocates * the same cluster. In this case we need to wait until the previous * request has completed and updated the L2 table accordingly. - * * Returns: * 0 if there was no dependency. *cur_bytes indicates the number of * bytes from guest_offset that can be read before the next - * dependency must be processed (or the request is complete) + * dependency must be processed (or the request is complete). + * *m is not modified * - * -EAGAIN if we had to wait for another request, previously gathered - * information on cluster allocation may be invalid now. The caller - * must start over anyway, so consider *cur_bytes undefined. + * -EAGAIN if we had to wait for another request. The caller + * must start over, so consider *cur_bytes undefined. + * *m is not modified */ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset, uint64_t *cur_bytes, QCowL2Meta **m) @@ -919,10 +919,10 @@ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset, QLIST_FOREACH(old_alloc, &s->cluster_allocs, next_in_flight) { -uint64_t start = guest_offset; -uint64_t end = start + bytes; -uint64_t old_start = l2meta_cow_start(old_alloc); -uint64_t old_end = l2meta_cow_end(old_alloc); +const uint64_t start = guest_offset; +const uint64_t end = start + bytes; +const uint64_t old_start = l2meta_cow_start(old_alloc); +const uint64_t old_end = l2meta_cow_end(old_alloc); if (end <= old_start || start >= old_end) { /* No intersection */ @@ -939,6 +939,8 @@ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset, * and deal with requests depending on them before starting to * gather new ones. Not worth the trouble. */ if (bytes == 0 && *m) { +/* start must be cluster aligned at this point */ +assert(start == start_of_cluster(s, start)); *cur_bytes = 0; return 0; } -- 2.7.4
[Qemu-devel] [PATCH v2 08/15] qcow2: truncate preallocated space
From: "Denis V. Lunev" This could be done after calculation of the end of data and metadata in the qcow2 image. Signed-off-by: Denis V. Lunev Signed-off-by: Anton Nefedov --- block/qcow2-cluster.c | 9 + block/qcow2-refcount.c | 7 +++ block/qcow2.c | 8 block/qcow2.h | 3 +++ 4 files changed, 27 insertions(+) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 88dd555..c39e825 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -1958,3 +1958,12 @@ fail: g_free(l1_table); return ret; } + +void qcow2_update_data_end(BlockDriverState *bs, uint64_t off) +{ +BDRVQcow2State *s = bs->opaque; + +if (s->data_end < off) { +s->data_end = off; +} +} diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 873a1d2..8156466 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -744,6 +744,9 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, ret = alloc_refcount_block(bs, cluster_index, &refcount_block); if (ret < 0) { goto fail; +} else { +qcow2_update_data_end(bs, s->refcount_table_offset + +s->refcount_table_size * sizeof(uint64_t)); } } old_table_index = table_index; @@ -865,6 +868,8 @@ retry: s->free_cluster_index - 1 > (INT64_MAX >> s->cluster_bits)) { return -EFBIG; +} else { +qcow2_update_data_end(bs, s->free_cluster_index << s->cluster_bits); } #ifdef DEBUG_ALLOC2 @@ -929,6 +934,8 @@ int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, if (ret < 0) { return ret; +} else { +qcow2_update_data_end(bs, offset + (nb_clusters << s->cluster_bits)); } return i; diff --git a/block/qcow2.c b/block/qcow2.c index b090833..33e5455 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -1194,6 +1194,8 @@ static int qcow2_do_open(BlockDriverState *bs, QDict *options, int flags, } } +s->data_end = bdrv_getlength(bs->file->bs); + #ifdef DEBUG_ALLOC { BdrvCheckResult result = {0}; @@ -1941,12 +1943,18 @@ static int qcow2_inactivate(BlockDriverState *bs) static void qcow2_close(BlockDriverState *bs) { BDRVQcow2State *s = bs->opaque; + qemu_vfree(s->l1_table); /* else pre-write overlap checks in cache_destroy may crash */ s->l1_table = NULL; if (!(s->flags & BDRV_O_INACTIVE)) { qcow2_inactivate(bs); + +/* truncate preallocated space */ +if (!bs->read_only && s->data_end < bdrv_getlength(bs->file->bs)) { +bdrv_truncate(bs->file, s->data_end, NULL); +} } cache_clean_timer_del(bs); diff --git a/block/qcow2.h b/block/qcow2.h index a0d222d..e28c54a 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -297,6 +297,7 @@ typedef struct BDRVQcow2State { char *image_backing_format; uint64_t prealloc_size; +uint64_t data_end; } BDRVQcow2State; typedef struct Qcow2COWRegion { @@ -607,4 +608,6 @@ int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, void **table); void qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table); +void qcow2_update_data_end(BlockDriverState *bs, uint64_t off); + #endif -- 2.7.4
[Qemu-devel] [PATCH v2 13/15] qcow2-cluster: make handle_dependencies() logic easier to follow
Avoid complicated nested conditions; return or continue asap instead. The logic is not changed. Signed-off-by: Anton Nefedov Signed-off-by: Denis V. Lunev --- block/qcow2-cluster.c | 45 ++--- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 95e8862..2fa549d 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -926,32 +926,31 @@ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset, if (end <= old_start || start >= old_end) { /* No intersection */ -} else { -if (start < old_start) { -/* Stop at the start of a running allocation */ -bytes = old_start - start; -} else { -bytes = 0; -} +continue; +} -/* Stop if already an l2meta exists. After yielding, it wouldn't - * be valid any more, so we'd have to clean up the old L2Metas - * and deal with requests depending on them before starting to - * gather new ones. Not worth the trouble. */ -if (bytes == 0 && *m) { -/* start must be cluster aligned at this point */ -assert(start == start_of_cluster(s, start)); -*cur_bytes = 0; -return 0; -} +if (start < old_start) { +/* Stop at the start of a running allocation */ +bytes = old_start - start; +/* ..if there is no other conflict, keep checking */ +continue; +} -if (bytes == 0) { -/* Wait for the dependency to complete. We need to recheck - * the free/allocated clusters when we continue. */ -qemu_co_queue_wait(&old_alloc->dependent_requests, &s->lock); -return -EAGAIN; -} +/* Stop if an l2meta already exists. After yielding, it wouldn't + * be valid any more, so we'd have to clean up the old L2Metas + * and deal with requests depending on them before starting to + * gather new ones. Not worth the trouble. */ +if (*m) { +/* start must be cluster aligned at this point */ +assert(start == start_of_cluster(s, start)); +*cur_bytes = 0; +return 0; } + +/* Wait for the dependency to complete. We need to recheck + * the free/allocated clusters when we continue. */ +qemu_co_queue_wait(&old_alloc->dependent_requests, &s->lock); +return -EAGAIN; } /* Make sure that existing clusters and new allocations are only used up to -- 2.7.4
[Qemu-devel] [PATCH v2 05/15] qcow2: do not COW the empty areas
If COW area of the newly allocated cluster is zeroes, there is no reason to write zero sectors in perform_cow() again now as whole clusters are zeroed out in single chunks by handle_alloc_space(). Introduce QCowL2Meta field "reduced", since the existing fields (offset and nb_bytes) still has to keep other write requests from simultaneous writing in the area iotest 060: write to the discarded cluster does not trigger COW anymore. so, break on write_aio event instead, will work for the test (but write won't fail anymore, so update reference output) iotest 066: cluster-alignment areas that were not really COWed are now detected as zeroes, hence the initial write has to be exactly the same size for the maps to match performance tests: === qemu-io, results in seconds to complete (less is better) random write 4k to empty image, no backing HDD 64k cluster 128M over 128M image: 160 -> 160 ( x1 ) 128M over 2G image:86 -> 84 ( x1 ) 128M over 8G image:40 -> 29 ( x1.4 ) 1M cluster 32M over 8G image:58 -> 23 ( x2.5 ) SSD 64k cluster 2G over 2G image:71 -> 38 ( x1.9 ) 512M over 8G image:85 -> 8 ( x10.6 ) 1M cluster 128M over 32G image: 314 -> 2 ( x157 ) - improvement grows bigger the bigger the cluster size, - first data portions to the fresh image benefit the most (more chance to hit an unallocated cluster) - SSD improvement is close to the IO length reduction rate (e.g. writing only 4k instead of 64k) gives theoretical x16 and practical x10 improvement) fio tests over xfs, empty image (cluster 64k), no backing, first megabytes of random writes: randwrite 4k, size=8g: HDD (io_size=128m) : 730 -> 1050 IOPS ( x1.45) SSD (io_size=512m) : 1500 -> 7000 IOPS ( x4.7 ) random writes io_size==image_size: randwrite 4k, size=2g io_size=2g: HDD : 200 IOPS (no difference) SSD : 7500 -> 9500 IOPS ( x1.3 ) sequential write: seqwrite 4k, size=4g, iodepth=4 SSD : 7000 -> 18000 IOPS ( x2.6 ) - numbers are similar to qemu-io tests, slightly less improvement (damped by fs?) Signed-off-by: Anton Nefedov Signed-off-by: Denis V. Lunev --- block/qcow2-cluster.c | 4 +++- block/qcow2.c | 32 ++-- block/qcow2.h | 4 tests/qemu-iotests/060 | 2 +- tests/qemu-iotests/060.out | 3 ++- tests/qemu-iotests/066 | 2 +- tests/qemu-iotests/066.out | 4 ++-- 7 files changed, 43 insertions(+), 8 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index d779ea1..ba8ce76 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -758,7 +758,7 @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m, Qcow2COWRegion *r) BDRVQcow2State *s = bs->opaque; int ret; -if (r->nb_bytes == 0) { +if (r->nb_bytes == 0 || r->reduced) { return 0; } @@ -1267,10 +1267,12 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset, .cow_start = { .offset = 0, .nb_bytes = offset_into_cluster(s, guest_offset), +.reduced= false, }, .cow_end = { .offset = nb_bytes, .nb_bytes = avail_bytes - nb_bytes, +.reduced= false, }, }; qemu_co_queue_init(&(*m)->dependent_requests); diff --git a/block/qcow2.c b/block/qcow2.c index cd5efba..067bb87 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -64,6 +64,9 @@ typedef struct { #define QCOW2_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA #define QCOW2_EXT_MAGIC_FEATURE_TABLE 0x6803f857 +static bool is_zero_sectors(BlockDriverState *bs, int64_t start, +uint32_t count); + static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename) { const QCowHeader *cow_header = (const void *)buf; @@ -1575,10 +1578,30 @@ fail: return ret; } +static void handle_cow_reduce(BlockDriverState *bs, QCowL2Meta *m) +{ +if (bs->encrypted) { +return; +} +if (!m->cow_start.reduced && m->cow_start.nb_bytes != 0 && +is_zero_sectors(bs, +(m->offset + m->cow_start.offset) >> BDRV_SECTOR_BITS, +m->cow_start.nb_bytes >> BDRV_SECTOR_BITS)) { +m->cow_start.reduced = true; +} +if (!m->cow_end.reduced && m->cow_end.nb_bytes != 0 && +is_zero_sectors(bs, +(m->offset + m->cow_end.offset) >> BDRV_SECTOR_BITS, +m->cow_end.nb_bytes >> BDRV_SECTOR_BITS)) { +m->cow_end.reduced = true; +} +} + static void handle_alloc_space(BlockDriverState *bs, QCowL
[Qemu-devel] [PATCH v2 11/15] qcow2: fix misleading comment about L2 linking
Signed-off-by: Anton Nefedov Signed-off-by: Denis V. Lunev --- block/qcow2-cluster.c | 10 -- 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index ed65961..3dafd19 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -827,12 +827,10 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m) assert(l2_index + m->nb_clusters <= s->l2_size); for (i = 0; i < m->nb_clusters; i++) { -/* if two concurrent writes happen to the same unallocated cluster - * each write allocates separate cluster and writes data concurrently. - * The first one to complete updates l2 table with pointer to its - * cluster the second one has to do RMW (which is done above by - * perform_cow()), update l2 table with its cluster pointer and free - * old cluster. This is what this loop does */ +/* handle_dependencies() protects from normal cluster allocation + * collision; still L2 entry might be !0 in case of zero or compressed + * cluster reusage or writing over the snapshot + */ if (l2_table[l2_index + i] != 0) { old_cluster[j++] = l2_table[l2_index + i]; } -- 2.7.4
[Qemu-devel] [PATCH v2 07/15] qcow2: set inactive flag
Qcow2State and BlockDriverState flags have to be in sync Signed-off-by: Anton Nefedov --- block/qcow2.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/qcow2.c b/block/qcow2.c index c6fb714..b090833 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -1932,6 +1932,7 @@ static int qcow2_inactivate(BlockDriverState *bs) if (result == 0) { qcow2_mark_clean(bs); +s->flags |= BDRV_O_INACTIVE; } return result; -- 2.7.4
[Qemu-devel] [PATCH v2 00/15] qcow2: space preallocation and COW improvements
Changes in v2: - introduce new BDRV flag for write_zeroes() instead of using driver callback directly. Skipped introducing new functions like bdrv_co_pallocate() for now: 1. it seems ok to keep calling this write_zeroes() as zeroes are expected; 2. most of the code can be reused now anyway, so changes to write_zeroes() path are not significant 3. write_zeroes() alignment and max-request limits can also be reused As a possible alternative we can have bdrv_co_pallocate() which can switch to pwrite_zeroes(,flags|=BDRV_REQ_ALLOCATE) early. This pull request is to address a few performance problems of qcow2 format: 1. non cluster-aligned write requests (to unallocated clusters) explicitly pad data with zeroes if there is no backing data. This can be avoided and the whole clusters are preallocated and zeroed in a single efficient write_zeroes() operation, also providing better host file continuity 2. moreover, efficient write_zeroes() operation can be used to preallocate space megabytes ahead which gives noticeable improvement on some storage types (e.g. distributed storages where space allocation operation is expensive) 3. preallocating/zeroing the clusters in advance makes possible to enable simultaneous writes to the same unallocated cluster, which is beneficial for parallel sequential write operations which are not cluster-aligned Performance test results are added to commit messages (see patch 3, 12) Anton Nefedov (11): block: introduce BDRV_REQ_ALLOCATE flag file-posix: support BDRV_REQ_ALLOCATE blkdebug: support BDRV_REQ_ALLOCATE qcow2: do not COW the empty areas qcow2: set inactive flag qcow2: handle_prealloc(): find out if area zeroed by earlier preallocation qcow2: fix misleading comment about L2 linking qcow2-cluster: slightly refactor handle_dependencies() qcow2-cluster: make handle_dependencies() logic easier to follow qcow2: allow concurrent unaligned writes to the same clusters iotest 046: test simultaneous cluster write error case Denis V. Lunev (3): qcow2: alloc space for COW in one chunk qcow2: preallocation at image expand qcow2: truncate preallocated space Pavel Butsykin (1): qcow2: check space leak at the end of the image block/blkdebug.c | 3 +- block/file-posix.c | 9 +- block/io.c | 19 ++- block/qcow2-cache.c| 3 + block/qcow2-cluster.c | 218 +++-- block/qcow2-refcount.c | 21 +++ block/qcow2.c | 273 - block/qcow2.h | 26 block/trace-events | 1 + include/block/block.h | 6 +- tests/qemu-iotests/026.out | 104 ++ tests/qemu-iotests/026.out.nocache | 104 ++ tests/qemu-iotests/029.out | 5 +- tests/qemu-iotests/046 | 38 +- tests/qemu-iotests/046.out | 23 tests/qemu-iotests/060 | 2 +- tests/qemu-iotests/060.out | 13 +- tests/qemu-iotests/061.out | 5 +- tests/qemu-iotests/066 | 2 +- tests/qemu-iotests/066.out | 9 +- tests/qemu-iotests/098.out | 7 +- tests/qemu-iotests/108.out | 5 +- tests/qemu-iotests/112.out | 5 +- 23 files changed, 789 insertions(+), 112 deletions(-) -- 2.7.4
[Qemu-devel] [PATCH v2 06/15] qcow2: preallocation at image expand
From: "Denis V. Lunev" This patch adds image preallocation at expand to provide better locality of QCOW2 image file and optimize this procedure for some distributed storages where this procedure is slow. Image expand requests have to be suspended until the allocation is performed which is done via special QCowL2Meta. This meta is invisible to handle_dependencies() code. This is the main reason for also calling preallocation before metadata write: it might intersect with preallocation triggered by another IO, and has to yield Signed-off-by: Denis V. Lunev Signed-off-by: Anton Nefedov --- block/qcow2-cache.c| 3 + block/qcow2-cluster.c | 5 ++ block/qcow2-refcount.c | 14 + block/qcow2.c | 147 + block/qcow2.h | 5 ++ 5 files changed, 174 insertions(+) diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c index 1d25147..aa9da5f 100644 --- a/block/qcow2-cache.c +++ b/block/qcow2-cache.c @@ -204,6 +204,9 @@ static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i) return ret; } +/* check and preallocate extra space if touching a fresh metadata cluster */ +qcow2_handle_prealloc(bs, c->entries[i].offset, s->cluster_size); + if (c == s->refcount_block_cache) { BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_UPDATE_PART); } else if (c == s->l2_table_cache) { diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index ba8ce76..88dd555 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -108,6 +108,9 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, goto fail; } +qcow2_handle_prealloc(bs, new_l1_table_offset, + QEMU_ALIGN_UP(new_l1_size2, s->cluster_size)); + BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE); for(i = 0; i < s->l1_size; i++) new_l1_table[i] = cpu_to_be64(new_l1_table[i]); @@ -1821,6 +1824,8 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, goto fail; } +qcow2_handle_prealloc(bs, offset, s->cluster_size); + ret = bdrv_pwrite_zeroes(bs->file, offset, s->cluster_size, 0); if (ret < 0) { if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) { diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 7c06061..873a1d2 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -547,6 +547,8 @@ static int alloc_refcount_block(BlockDriverState *bs, } /* Write refcount blocks to disk */ +qcow2_handle_prealloc(bs, meta_offset, blocks_clusters * s->cluster_size); + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS); ret = bdrv_pwrite_sync(bs->file, meta_offset, new_blocks, blocks_clusters * s->cluster_size); @@ -561,6 +563,10 @@ static int alloc_refcount_block(BlockDriverState *bs, cpu_to_be64s(&new_table[i]); } +qcow2_handle_prealloc(bs, table_offset, + QEMU_ALIGN_UP(table_size * sizeof(uint64_t), +s->cluster_size)); + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE); ret = bdrv_pwrite_sync(bs->file, table_offset, new_table, table_size * sizeof(uint64_t)); @@ -2104,6 +2110,8 @@ write_refblocks: goto fail; } +qcow2_handle_prealloc(bs, refblock_offset, s->cluster_size); + /* The size of *refcount_table is always cluster-aligned, therefore the * write operation will not overflow */ on_disk_refblock = (void *)((char *) *refcount_table + @@ -2158,6 +2166,8 @@ write_refblocks: } assert(reftable_size < INT_MAX / sizeof(uint64_t)); +qcow2_handle_prealloc(bs, reftable_offset, + reftable_size * sizeof(uint64_t)); ret = bdrv_pwrite(bs->file, reftable_offset, on_disk_reftable, reftable_size * sizeof(uint64_t)); if (ret < 0) { @@ -2845,6 +2855,10 @@ int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order, cpu_to_be64s(&new_reftable[i]); } +qcow2_handle_prealloc(bs, new_reftable_offset, + QEMU_ALIGN_UP(new_reftable_size * sizeof(uint64_t), +s->cluster_size)); + ret = bdrv_pwrite(bs->file, new_reftable_offset, new_reftable, new_reftable_size * sizeof(uint64_t)); diff --git a/block/qcow2.c b/block/qcow2.c index 067bb87..c6fb714 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -464,6 +464,11 @@ static QemuOptsList qcow2_runtime_opts = { .type = QEMU_OPT_NUMBER, .help = "Clean unused cache entries after this time (in seconds)", }, +{ +.name = QCOW2_OPT_PRE
[Qemu-devel] [PATCH v2 09/15] qcow2: check space leak at the end of the image
From: Pavel Butsykin Preallocating memory in the image may remain unused after the fall, for the qcow2_check adds the ability to identify and fix it, so as not to store extra memory on the host. Signed-off-by: Pavel Butsykin Signed-off-by: Denis V. Lunev Signed-off-by: Anton Nefedov --- block/qcow2.c | 31 +++ tests/qemu-iotests/026.out | 104 - tests/qemu-iotests/026.out.nocache | 104 - tests/qemu-iotests/029.out | 5 +- tests/qemu-iotests/060.out | 10 +++- tests/qemu-iotests/061.out | 5 +- tests/qemu-iotests/066.out | 5 +- tests/qemu-iotests/098.out | 7 ++- tests/qemu-iotests/108.out | 5 +- tests/qemu-iotests/112.out | 5 +- 10 files changed, 225 insertions(+), 56 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index 33e5455..809102a 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -322,6 +322,32 @@ int qcow2_mark_consistent(BlockDriverState *bs) return 0; } +static int qcow2_check_extra_preallocation(BlockDriverState *bs, +BdrvCheckResult *res, BdrvCheckMode fix) +{ +BDRVQcow2State *s = bs->opaque; +uint64_t img_size = bdrv_getlength(bs->file->bs); + +if (res->image_end_offset < img_size) { +uint64_t count = +DIV_ROUND_UP(img_size - res->image_end_offset, s->cluster_size); +fprintf(stderr, "%s space leaked at the end of the image %jd\n", +fix & BDRV_FIX_LEAKS ? "Repairing" : "ERROR", +img_size - res->image_end_offset); +res->leaks += count; +if (fix & BDRV_FIX_LEAKS) { +int ret = bdrv_truncate(bs->file, res->image_end_offset, NULL); +if (ret < 0) { +res->check_errors++; +return ret; +} +res->leaks_fixed += count; +} +} + +return 0; +} + static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result, BdrvCheckMode fix) { @@ -330,6 +356,11 @@ static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result, return ret; } +ret = qcow2_check_extra_preallocation(bs, result, fix); +if (ret < 0) { +return ret; +} + if (fix && result->check_errors == 0 && result->corruptions == 0) { ret = qcow2_mark_clean(bs); if (ret < 0) { diff --git a/tests/qemu-iotests/026.out b/tests/qemu-iotests/026.out index 86a50a2..e8cf348 100644 --- a/tests/qemu-iotests/026.out +++ b/tests/qemu-iotests/026.out @@ -5,7 +5,10 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 Event: l1_update; errno: 5; imm: off; once: on; write write failed: Input/output error -No errors were found on the image. +ERROR space leaked at the end of the image 1024 + +1 leaked clusters were found on the image. +This means waste of disk space, but no harm to data. Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 Event: l1_update; errno: 5; imm: off; once: on; write -b @@ -33,7 +36,10 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 Event: l1_update; errno: 28; imm: off; once: on; write write failed: No space left on device -No errors were found on the image. +ERROR space leaked at the end of the image 1024 + +1 leaked clusters were found on the image. +This means waste of disk space, but no harm to data. Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 Event: l1_update; errno: 28; imm: off; once: on; write -b @@ -181,7 +187,10 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 Event: l2_alloc_write; errno: 5; imm: off; once: on; write write failed: Input/output error -No errors were found on the image. +ERROR space leaked at the end of the image 1024 + +1 leaked clusters were found on the image. +This means waste of disk space, but no harm to data. Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 Event: l2_alloc_write; errno: 5; imm: off; once: on; write -b @@ -207,7 +216,10 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 Event: l2_alloc_write; errno: 28; imm: off; once: on; write write failed: No space left on device -No errors were found on the image. +ERROR space leaked at the end of the image 1024 + +1 leaked clusters were found on the image. +This means waste of disk space, but no harm to data. Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 Event: l2_alloc_write; errno: 28; imm: off; once: on; write -b @@ -468,20 +480,27 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 Event: refblock_alloc_hookup; errno: 28; imm: off; once: on; write write failed: No space left on device -No errors were found on the image. +ERROR space leaked a
[Qemu-devel] [PATCH v2 03/15] blkdebug: support BDRV_REQ_ALLOCATE
Support the flag if the underlying BDS supports it Signed-off-by: Anton Nefedov --- block/blkdebug.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/block/blkdebug.c b/block/blkdebug.c index a5196e8..8b1401b 100644 --- a/block/blkdebug.c +++ b/block/blkdebug.c @@ -415,7 +415,8 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags, bs->supported_write_flags = BDRV_REQ_FUA & bs->file->bs->supported_write_flags; -bs->supported_zero_flags = (BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) & +bs->supported_zero_flags = +(BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_ALLOCATE) & bs->file->bs->supported_zero_flags; ret = -EINVAL; -- 2.7.4
[Qemu-devel] [PATCH v2 02/15] file-posix: support BDRV_REQ_ALLOCATE
Current write_zeroes implementation is good enough to satisfy this flag too Signed-off-by: Anton Nefedov --- block/file-posix.c | 9 - 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/block/file-posix.c b/block/file-posix.c index de2d3a2..117bbee 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -527,7 +527,6 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, s->has_discard = true; s->has_write_zeroes = true; -bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP; if ((bs->open_flags & BDRV_O_NOCACHE) != 0) { s->needs_alignment = true; } @@ -577,6 +576,11 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, } #endif +bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP; +if (s->has_write_zeroes || s->has_fallocate) { +bs->supported_zero_flags |= BDRV_REQ_ALLOCATE; +} + ret = 0; fail: if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) { @@ -1390,6 +1394,9 @@ static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb) } #endif +if (!s->has_fallocate) { +aiocb->bs->supported_zero_flags &= ~BDRV_REQ_ALLOCATE; +} return -ENOTSUP; } -- 2.7.4
[Qemu-devel] [PATCH v2 01/15] block: introduce BDRV_REQ_ALLOCATE flag
The flag is supposed to indicate that the region of the disk image has to be sufficiently allocated so it reads as zeroes. The call with the flag set has to return -ENOTSUP if allocation cannot be done efficiently (i.e. without falling back to writing actual buffers) Signed-off-by: Anton Nefedov --- block/io.c| 19 --- block/trace-events| 1 + include/block/block.h | 6 +- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/block/io.c b/block/io.c index ed31810..d47efa9 100644 --- a/block/io.c +++ b/block/io.c @@ -1272,7 +1272,7 @@ static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, assert(!bs->supported_zero_flags); } -if (ret == -ENOTSUP) { +if (ret == -ENOTSUP && !(flags & BDRV_REQ_ALLOCATE)) { /* Fall back to bounce buffer if write zeroes is unsupported */ BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE; @@ -1355,8 +1355,8 @@ static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child, ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && -!(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes && -qemu_iovec_is_zero(qiov)) { +!(flags & BDRV_REQ_ZERO_WRITE) && !(flags & BDRV_REQ_ALLOCATE) && +drv->bdrv_co_pwrite_zeroes && qemu_iovec_is_zero(qiov)) { flags |= BDRV_REQ_ZERO_WRITE; if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) { flags |= BDRV_REQ_MAY_UNMAP; @@ -1436,6 +1436,9 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child, assert(flags & BDRV_REQ_ZERO_WRITE); if (head_padding_bytes || tail_padding_bytes) { +if (flags & BDRV_REQ_ALLOCATE) { +return -ENOTSUP; +} buf = qemu_blockalign(bs, align); iov = (struct iovec) { .iov_base = buf, @@ -1534,6 +1537,11 @@ int coroutine_fn bdrv_co_pwritev(BdrvChild *child, return ret; } +if (qiov && flags & BDRV_REQ_ALLOCATE) { +/* allocation request with qiov provided doesn't make much sense */ +return -ENOTSUP; +} + bdrv_inc_in_flight(bs); /* * Align write if necessary by performing a read-modify-write cycle. @@ -1665,6 +1673,11 @@ int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset, { trace_bdrv_co_pwrite_zeroes(child->bs, offset, count, flags); +if (flags & BDRV_REQ_MAY_UNMAP && flags & BDRV_REQ_ALLOCATE) { +/* nonsense */ +return -ENOTSUP; +} + if (!(child->bs->open_flags & BDRV_O_UNMAP)) { flags &= ~BDRV_REQ_MAY_UNMAP; } diff --git a/block/trace-events b/block/trace-events index 9a71c7f..a15c2cc 100644 --- a/block/trace-events +++ b/block/trace-events @@ -15,6 +15,7 @@ bdrv_aio_writev(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs bdrv_co_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d" bdrv_co_writev(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d" bdrv_co_pwrite_zeroes(void *bs, int64_t offset, int count, int flags) "bs %p offset %"PRId64" count %d flags %#x" +bdrv_co_allocate(void *bs, int64_t offset, int count) "bs %p offset %"PRId64" count %d" bdrv_co_do_copy_on_readv(void *bs, int64_t offset, unsigned int bytes, int64_t cluster_offset, unsigned int cluster_bytes) "bs %p offset %"PRId64" bytes %u cluster_offset %"PRId64" cluster_bytes %u" # block/stream.c diff --git a/include/block/block.h b/include/block/block.h index 9b355e9..53a357c 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -65,9 +65,13 @@ typedef enum { BDRV_REQ_NO_SERIALISING = 0x8, BDRV_REQ_FUA= 0x10, BDRV_REQ_WRITE_COMPRESSED = 0x20, +/* BDRV_REQ_ALLOCATE is used to indicate that the driver is to + * efficiently allocate the space so it reads as zeroes or return an error + */ +BDRV_REQ_ALLOCATE = 0x40, /* Mask of valid flags */ -BDRV_REQ_MASK = 0x3f, +BDRV_REQ_MASK = 0x7f, } BdrvRequestFlags; typedef struct BlockSizes { -- 2.7.4
[Qemu-devel] [PATCH v2 04/15] qcow2: alloc space for COW in one chunk
From: "Denis V. Lunev" Currently each single write operation can result in 3 write operations if guest offsets are not cluster aligned. One write is performed for the real payload and two for COW-ed areas. Thus the data possibly lays non-contiguously on the host filesystem. This will reduce further sequential read performance significantly. The patch allocates the space in the file with cluster granularity, ensuring 1. better host offset locality 2. less space allocation operations (which can be expensive on distributed storage) Signed-off-by: Denis V. Lunev Signed-off-by: Anton Nefedov --- block/qcow2.c | 24 +++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/block/qcow2.c b/block/qcow2.c index b3ba5da..cd5efba 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -1575,6 +1575,24 @@ fail: return ret; } +static void handle_alloc_space(BlockDriverState *bs, QCowL2Meta *l2meta) +{ +BDRVQcow2State *s = bs->opaque; +QCowL2Meta *m; + +for (m = l2meta; m != NULL; m = m->next) { +uint64_t bytes = m->nb_clusters << s->cluster_bits; + +if (m->cow_start.nb_bytes == 0 && m->cow_end.nb_bytes == 0) { +continue; +} + +/* try to alloc host space in one chunk for better locality */ +bdrv_co_pwrite_zeroes(bs->file, m->alloc_offset, bytes, + BDRV_REQ_ALLOCATE); +} +} + static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags) @@ -1656,8 +1674,12 @@ static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset, if (ret < 0) { goto fail; } - qemu_co_mutex_unlock(&s->lock); + +if (bs->file->bs->supported_zero_flags & BDRV_REQ_ALLOCATE) { +handle_alloc_space(bs, l2meta); +} + BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); trace_qcow2_writev_data(qemu_coroutine_self(), cluster_offset + offset_in_cluster); -- 2.7.4
[Qemu-devel] [PATCH v2 10/15] qcow2: handle_prealloc(): find out if area zeroed by earlier preallocation
Signed-off-by: Anton Nefedov Signed-off-by: Denis V. Lunev --- block/qcow2-cluster.c | 2 ++ block/qcow2.c | 8 +++- block/qcow2.h | 4 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index c39e825..ed65961 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -1143,6 +1143,7 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset, uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m) { BDRVQcow2State *s = bs->opaque; +const uint64_t old_data_end = s->data_end; int l2_index; uint64_t *l2_table; uint64_t entry; @@ -1264,6 +1265,7 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset, .alloc_offset = alloc_cluster_offset, .offset = start_of_cluster(s, guest_offset), .nb_clusters= nb_clusters, +.clusters_are_trailing = alloc_cluster_offset >= old_data_end, .keep_old_clusters = keep_old_clusters, diff --git a/block/qcow2.c b/block/qcow2.c index 809102a..92d0af6 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -1686,7 +1686,13 @@ restart: if (end <= bdrv_getlength(file)) { /* No need to care, file size will not be changed */ -return false; + +/* We're safe to assume that the area is zeroes if the area + * was allocated at the end of data (s->data_end). + * In this case, the only way for file length to be bigger is that + * the area was preallocated by another request. + */ +return m->clusters_are_trailing; } meta = g_alloca(sizeof(*meta)); diff --git a/block/qcow2.h b/block/qcow2.h index e28c54a..2fd8510 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -333,6 +333,10 @@ typedef struct QCowL2Meta /** Do not free the old clusters */ bool keep_old_clusters; +/** True if the area is allocated after the end of data area + * (i.e. >= s->data_end), which means that it is zeroed */ +bool clusters_are_trailing; + /** * Requests that overlap with this allocation and wait to be restarted * when the allocating request has completed. -- 2.7.4
Re: [Qemu-devel] [PATCH v3 10/13] hmp: add hmp analogue for qmp-chardev-change
On 06/01/2017 10:12 AM, Marc-André Lureau wrote: Hi On Tue, May 30, 2017 at 6:08 PM Anton Nefedov mailto:anton.nefe...@virtuozzo.com>> wrote: Signed-off-by: Anton Nefedov mailto:anton.nefe...@virtuozzo.com>> Reviewed-by: Vladimir Sementsov-Ogievskiy mailto:vsement...@virtuozzo.com>> CC: Dr. David Alan Gilbert mailto:dgilb...@redhat.com>> --- chardev/char.c| 2 +- hmp-commands.hx | 16 hmp.c | 34 ++ hmp.h | 1 + include/sysemu/char.h | 12 5 files changed, 64 insertions(+), 1 deletion(-) diff --git a/chardev/char.c b/chardev/char.c index 2d6e204..1e2a2dd 100644 --- a/chardev/char.c +++ b/chardev/char.c @@ -888,7 +888,7 @@ help_string_append(const char *name, void *opaque) g_string_append_printf(str, "\n%s", name); } -static ChardevBackend *qemu_chr_parse_opts(QemuOpts *opts, Error **errp) +ChardevBackend *qemu_chr_parse_opts(QemuOpts *opts, Error **errp) { Error *local_err = NULL; const ChardevClass *cc; diff --git a/hmp-commands.hx b/hmp-commands.hx index baeac47..19bfddd 100644 --- a/hmp-commands.hx +++ b/hmp-commands.hx @@ -1727,6 +1727,22 @@ chardev_add accepts the same parameters as the -chardev command line switch. ETEXI { +.name = "chardev-change", +.args_type = "id:s,args:s", +.params = "id args", +.help = "change chardev", +.cmd= hmp_chardev_change, +}, + +STEXI +@item chardev-change args +@findex chardev-change +chardev_change accepts existing chardev @var{id} and then the same arguments chardev_change vs chardev-change. Apparently, there is this difference for chardev-add too Done. +as the -chardev command line switch (except for "id"). + +ETEXI + +{ .name = "chardev-remove", .args_type = "id:s", .params = "id", diff --git a/hmp.c b/hmp.c index 20f5dab..7660495 100644 --- a/hmp.c +++ b/hmp.c @@ -2224,6 +2224,40 @@ void hmp_chardev_add(Monitor *mon, const QDict *qdict) hmp_handle_error(mon, &err); } +void hmp_chardev_change(Monitor *mon, const QDict *qdict) +{ +const char *args = qdict_get_str(qdict, "args"); +const char *id; +Error *err = NULL; +ChardevBackend *backend = NULL; +ChardevReturn *ret = NULL; +QemuOpts *opts = qemu_opts_parse_noisily(qemu_find_opts("chardev"), args, + true); +if (!opts) { +error_setg(&err, "Parsing chardev args failed"); +goto end; +} + +id = qdict_get_str(qdict, "id"); +if (qemu_opts_id(opts)) { +error_setg(&err, "Unexpected 'id' parameter"); enlighten me, how is this triggered? correct variant: chardev-change char2 pipe,path=/tmp/cent.fifo vs. (without this check) (qemu) chardev-change char2 pipe,id=char2,path=/tmp/cent.fifo chardev-change char2 pipe,id=char2,path=/tmp/cent.fifo Duplicate ID 'char2' for chardev Parsing chardev args failed or (qemu) chardev-change char2 pipe,id=newchar3,path=/tmp/cent.fifo the command would succeed but the user might expect the id to change, but it wouldn't +goto end; +} + +backend = qemu_chr_parse_opts(opts, &err); +if (!backend) { +goto end; +} + +ret = qmp_chardev_change(id, backend, &err); + +end: +qapi_free_ChardevReturn(ret); +qapi_free_ChardevBackend(backend); +qemu_opts_del(opts); +hmp_handle_error(mon, &err); +} + void hmp_chardev_remove(Monitor *mon, const QDict *qdict) { Error *local_err = NULL; diff --git a/hmp.h b/hmp.h index d8b94ce..23e035c 100644 --- a/hmp.h +++ b/hmp.h @@ -102,6 +102,7 @@ void hmp_nbd_server_start(Monitor *mon, const QDict *qdict); void hmp_nbd_server_add(Monitor *mon, const QDict *qdict); void hmp_nbd_server_stop(Monitor *mon, const QDict *qdict); void hmp_chardev_add(Monitor *mon, const QDict *qdict); +void hmp_chardev_change(Monitor *mon, const QDict *qdict); void hmp_chardev_remove(Monitor *mon, const QDict *qdict); void hmp_qemu_io(Monitor *mon, const QDict *qdict); void hmp_cpu_add(Monitor *mon, const QDict *qdict); diff --git a/include/sysemu/char.h b/include/sysemu/char.h index 342f531..18fcd26 100644 --- a/include/sysemu/char.h +
Re: [Qemu-devel] [PATCH v3 03/13] char: chardevice hotswap
On 05/31/2017 10:20 PM, Marc-André Lureau wrote: Hi On Tue, May 30, 2017 at 6:12 PM Anton Nefedov mailto:anton.nefe...@virtuozzo.com>> wrote: This patch adds a possibility to change a char device without a frontend removal. 1. Ideally, it would have to happen transparently to a frontend, i.e. frontend would continue its regular operation. However, backends are not stateless and are set up by the frontends via qemu_chr_fe_<> functions, and it's not (generally) possible to replay that setup entirely in a backend code, as different chardevs respond to the setup calls differently, so do frontends work differently basing on those setup responses. Moreover, some frontend can generally get and save the backend pointer (qemu_chr_fe_get_driver()), and it will become invalid after backend change. So, a frontend which would like to support chardev hotswap has to register a "backend change" handler, and redo its backend setup there. 2. Write path can be used by multiple threads and thus protected with chr_write_lock. So hotswap also has to be protected so write functions won't access a backend being replaced. Tbh, I don't understand the need for a different lock. Could you explain? Even better would be to write a test that shows in which way the lock helps. hi Marc-André, The existing chr_write_lock belongs to Chardev. For the hotswap case, we need to ensure that be->chr won't change and the old Chardev (together with its mutex) won't be destroyed while it's used in the write functions. Maybe we could move the lock to CharBackend, instead of creating a new one. But I guess this 1. won't work for mux, where multiple CharBackends share the same Chardev 2. won't work for some chardevs, like pty which uses the lock for the timer handler Sorry if I'm not explaining clearly enough or maybe I'm missing some easier solution? I can try to add a test but can't quite see yet how to freeze the old chardev somewhere in cc->chr_write() and hotswap it while it's there. Signed-off-by: Anton Nefedov mailto:anton.nefe...@virtuozzo.com>> Reviewed-by: Vladimir Sementsov-Ogievskiy mailto:vsement...@virtuozzo.com>> patch looks good otherwise --- chardev/char.c| 126 ++ include/sysemu/char.h | 9 qapi-schema.json | 40 3 files changed, 165 insertions(+), 10 deletions(-) -- Marc-André Lureau /Anton
Re: [Qemu-devel] [PATCH v1 05/13] qcow2: set inactive flag
On 05/26/2017 11:11 AM, Kevin Wolf wrote: Am 19.05.2017 um 11:34 hat Anton Nefedov geschrieben: Qcow2State and BlockDriverState flags have to be in sync Signed-off-by: Anton Nefedov --- block/qcow2.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/qcow2.c b/block/qcow2.c index 6e7ce96..07c1706 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -1939,6 +1939,7 @@ static int qcow2_inactivate(BlockDriverState *bs) if (result == 0) { qcow2_mark_clean(bs); +s->flags |= BDRV_O_INACTIVE; } Good catch. But can't we simply use bs->open_flags and completely get rid of s->flags? Kevin the problem is that qcow2_close() or qcow2_inactivate() have to consider the flags that the image was opened with, but bs->open_flags (at least in invalidate_cache() case) at that point contain the new flags already /Anton
[Qemu-devel] [PATCH v3 09/13] test-char: add hotswap test
Signed-off-by: Anton Nefedov --- tests/test-char.c | 68 +++ 1 file changed, 68 insertions(+) diff --git a/tests/test-char.c b/tests/test-char.c index ed6b18f..cd54f88 100644 --- a/tests/test-char.c +++ b/tests/test-char.c @@ -635,6 +635,73 @@ static void char_invalid_test(void) g_assert_null(chr); } +static int chardev_change(void *opaque) +{ +return 0; +} + +static int chardev_change_denied(void *opaque) +{ +return -1; +} + +static void char_hotswap_test(void) +{ +char *chr_args; +Chardev *chr; +CharBackend be; + +gchar *tmp_path = g_dir_make_tmp("qemu-test-char.XX", NULL); +char *filename = g_build_filename(tmp_path, "file", NULL); +ChardevFile file = { .out = filename }; +ChardevBackend backend = { .type = CHARDEV_BACKEND_KIND_FILE, + .u.file.data = &file }; + +int port; +int sock = make_udp_socket(&port); +g_assert_cmpint(sock, >, 0); + +chr_args = g_strdup_printf("udp:127.0.0.1:%d", port); + +chr = qemu_chr_new("chardev", chr_args); +qemu_chr_fe_init(&be, chr, &error_abort); + +/* check that chardev operates correctly */ +char_udp_test_internal(chr, sock); + +/* set the handler that denies the hotswap */ +qemu_chr_fe_set_handlers(&be, NULL, NULL, + NULL, chardev_change_denied, NULL, NULL, true); + +/* now, change is denied and has to keep the old backend operating */ +qmp_chardev_change("chardev", &backend, NULL); +g_assert(be.chr == chr); + +char_udp_test_internal(chr, sock); + +/* now allow the change */ +qemu_chr_fe_set_handlers(&be, NULL, NULL, + NULL, chardev_change, NULL, NULL, true); + +/* has to succeed now */ +qmp_chardev_change("chardev", &backend, &error_abort); +g_assert(be.chr != chr); + +close(sock); +chr = be.chr; + +/* run the file chardev test */ +char_file_test_internal(chr, filename); + +object_unparent(OBJECT(chr)); + +g_unlink(filename); +g_free(filename); +g_rmdir(tmp_path); +g_free(tmp_path); +g_free(chr_args); +} + int main(int argc, char **argv) { qemu_init_main_loop(&error_abort); @@ -666,6 +733,7 @@ int main(int argc, char **argv) #endif g_test_add_func("/char/socket", char_socket_test); g_test_add_func("/char/udp", char_udp_test); +g_test_add_func("/char/hotswap", char_hotswap_test); return g_test_run(); } -- 2.7.4
[Qemu-devel] [PATCH v3 08/13] test-char: split char_file_test
makes it possible to test the existing chardev-file Signed-off-by: Anton Nefedov --- tests/test-char.c | 127 +- 1 file changed, 79 insertions(+), 48 deletions(-) diff --git a/tests/test-char.c b/tests/test-char.c index ad0752a..ed6b18f 100644 --- a/tests/test-char.c +++ b/tests/test-char.c @@ -484,75 +484,103 @@ static void char_udp_test(void) char_udp_test_internal(NULL, 0); } -static void char_file_test(void) +#ifndef _WIN32 +static void char_file_fifo_test(void) { +Chardev *chr; +CharBackend be; char *tmp_path = g_dir_make_tmp("qemu-test-char.XX", NULL); +char *fifo = g_build_filename(tmp_path, "fifo", NULL); char *out = g_build_filename(tmp_path, "out", NULL); -char *contents = NULL; -ChardevFile file = { .out = out }; +ChardevFile file = { .in = fifo, + .has_in = true, + .out = out }; ChardevBackend backend = { .type = CHARDEV_BACKEND_KIND_FILE, .u.file.data = &file }; +FeHandler fe = { 0, }; +int fd, ret; + +if (mkfifo(fifo, 0600) < 0) { +abort(); +} + +fd = open(fifo, O_RDWR); +ret = write(fd, "fifo-in", 8); +g_assert_cmpint(ret, ==, 8); + +chr = qemu_chardev_new(NULL, TYPE_CHARDEV_FILE, &backend, + &error_abort); + +qemu_chr_fe_init(&be, chr, &error_abort); +qemu_chr_fe_set_handlers(&be, + fe_can_read, + fe_read, + fe_event, + NULL, + &fe, NULL, true); + +main_loop(); + +close(fd); + +g_assert_cmpint(fe.read_count, ==, 8); +g_assert_cmpstr(fe.read_buf, ==, "fifo-in"); + +qemu_chr_fe_deinit(&be); +object_unref(OBJECT(chr)); + +g_unlink(fifo); +g_free(fifo); +g_unlink(out); +g_free(out); +g_rmdir(tmp_path); +g_free(tmp_path); +} +#endif + +static void char_file_test_internal(Chardev *ext_chr, const char *filepath) +{ +char *tmp_path = g_dir_make_tmp("qemu-test-char.XX", NULL); +char *out; Chardev *chr; +char *contents = NULL; +ChardevFile file = {}; +ChardevBackend backend = { .type = CHARDEV_BACKEND_KIND_FILE, + .u.file.data = &file }; gsize length; int ret; -chr = qemu_chardev_new(NULL, TYPE_CHARDEV_FILE, &backend, - &error_abort); +if (ext_chr) { +chr = ext_chr; +out = g_strdup(filepath); +file.out = out; +} else { +out = g_build_filename(tmp_path, "out", NULL); +file.out = out; +chr = qemu_chardev_new(NULL, TYPE_CHARDEV_FILE, &backend, + &error_abort); +} ret = qemu_chr_write_all(chr, (uint8_t *)"hello!", 6); g_assert_cmpint(ret, ==, 6); -object_unref(OBJECT(chr)); ret = g_file_get_contents(out, &contents, &length, NULL); g_assert(ret == TRUE); g_assert_cmpint(length, ==, 6); g_assert(strncmp(contents, "hello!", 6) == 0); -g_free(contents); - -#ifndef _WIN32 -{ -CharBackend be; -FeHandler fe = { 0, }; -char *fifo = g_build_filename(tmp_path, "fifo", NULL); -int fd; - -if (mkfifo(fifo, 0600) < 0) { -abort(); -} - -fd = open(fifo, O_RDWR); -ret = write(fd, "fifo-in", 8); -g_assert_cmpint(ret, ==, 8); - -file.in = fifo; -file.has_in = true; -chr = qemu_chardev_new(NULL, TYPE_CHARDEV_FILE, &backend, - &error_abort); - -qemu_chr_fe_init(&be, chr, &error_abort); -qemu_chr_fe_set_handlers(&be, - fe_can_read, - fe_read, - fe_event, - NULL, - &fe, NULL, true); - -main_loop(); -close(fd); - -g_assert_cmpint(fe.read_count, ==, 8); -g_assert_cmpstr(fe.read_buf, ==, "fifo-in"); -qemu_chr_fe_deinit(&be); +if (!ext_chr) { object_unref(OBJECT(chr)); -g_unlink(fifo); -g_free(fifo); +g_unlink(out); +g_free(out); } -#endif - -g_unlink(out); +g_free(contents); g_rmdir(tmp_path); g_free(tmp_path); -g_free(out); +} + +static void char_file_test(void) +{ +char_file_test_internal(NULL, NULL); } static void char_null_test(void) @@ -633,6 +661,9 @@ int main(int argc, char **argv) g_test_add_func("/char/pipe", char_pipe_test); #endif g_test_add_func("/char/file", char_file_test); +#ifndef _WIN32 +g_test_add_func("/char/file-fifo", char_file_fifo_test); +#endif g_test_add_func("/char/socket", char_socket_test); g_test_add_func("/char/udp", char_udp_test); -- 2.7.4
[Qemu-devel] [PATCH v3 04/13] char: forbid direct chardevice access for hotswap devices
qemu_chr_fe_get_driver() is unsafe, frontends with hotswap support should not access CharDriver ptr directly as CharDriver might change. Signed-off-by: Anton Nefedov Reviewed-by: Vladimir Sementsov-Ogievskiy --- chardev/char.c| 7 +++ include/sysemu/char.h | 10 ++ 2 files changed, 17 insertions(+) diff --git a/chardev/char.c b/chardev/char.c index ba1a5f5..1eed934 100644 --- a/chardev/char.c +++ b/chardev/char.c @@ -484,9 +484,16 @@ static Notifier muxes_realize_notify = { Chardev *qemu_chr_fe_get_driver(CharBackend *be) { +/* this is unsafe for the users that support chardev hotswap */ +assert(be->chr_be_change == NULL); return be->chr; } +bool qemu_chr_fe_backend_connected(CharBackend *be) +{ +return !!be->chr; +} + static bool qemu_chr_fe_connect(CharBackend *b, Chardev *s, Error **errp) { int tag = 0; diff --git a/include/sysemu/char.h b/include/sysemu/char.h index 014ebbc..117d628 100644 --- a/include/sysemu/char.h +++ b/include/sysemu/char.h @@ -391,10 +391,20 @@ bool qemu_chr_fe_init(CharBackend *b, Chardev *s, Error **errp); * * Returns the driver associated with a CharBackend or NULL if no * associated Chardev. + * Note: avoid this function as the driver should never be accessed directly, + * especially by the frontends that support chardevice hotswap. + * Consider qemu_chr_fe_backend_connected() to check for driver existence */ Chardev *qemu_chr_fe_get_driver(CharBackend *be); /** + * @qemu_chr_fe_backend_connected: + * + * Returns true if there is a chardevice associated with @be. + */ +bool qemu_chr_fe_backend_connected(CharBackend *be); + +/** * @qemu_chr_fe_deinit: * * Dissociate the CharBackend from the Chardev. -- 2.7.4
[Qemu-devel] [PATCH v3 03/13] char: chardevice hotswap
This patch adds a possibility to change a char device without a frontend removal. 1. Ideally, it would have to happen transparently to a frontend, i.e. frontend would continue its regular operation. However, backends are not stateless and are set up by the frontends via qemu_chr_fe_<> functions, and it's not (generally) possible to replay that setup entirely in a backend code, as different chardevs respond to the setup calls differently, so do frontends work differently basing on those setup responses. Moreover, some frontend can generally get and save the backend pointer (qemu_chr_fe_get_driver()), and it will become invalid after backend change. So, a frontend which would like to support chardev hotswap has to register a "backend change" handler, and redo its backend setup there. 2. Write path can be used by multiple threads and thus protected with chr_write_lock. So hotswap also has to be protected so write functions won't access a backend being replaced. Signed-off-by: Anton Nefedov Reviewed-by: Vladimir Sementsov-Ogievskiy --- chardev/char.c| 126 ++ include/sysemu/char.h | 9 qapi-schema.json | 40 3 files changed, 165 insertions(+), 10 deletions(-) diff --git a/chardev/char.c b/chardev/char.c index 1b097b3..ba1a5f5 100644 --- a/chardev/char.c +++ b/chardev/char.c @@ -132,12 +132,16 @@ static bool qemu_chr_replay(Chardev *chr) int qemu_chr_fe_write(CharBackend *be, const uint8_t *buf, int len) { -Chardev *s = be->chr; +Chardev *s; ChardevClass *cc; int ret; +qemu_mutex_lock(&be->chr_lock); +s = be->chr; + if (!s) { -return 0; +ret = 0; +goto end; } if (qemu_chr_replay(s) && replay_mode == REPLAY_MODE_PLAY) { @@ -145,7 +149,7 @@ int qemu_chr_fe_write(CharBackend *be, const uint8_t *buf, int len) replay_char_write_event_load(&ret, &offset); assert(offset <= len); qemu_chr_fe_write_buffer(s, buf, offset, &offset); -return ret; +goto end; } cc = CHARDEV_GET_CLASS(s); @@ -161,7 +165,9 @@ int qemu_chr_fe_write(CharBackend *be, const uint8_t *buf, int len) if (qemu_chr_replay(s) && replay_mode == REPLAY_MODE_RECORD) { replay_char_write_event_save(ret, ret < 0 ? 0 : ret); } - + +end: +qemu_mutex_unlock(&be->chr_lock); return ret; } @@ -191,13 +197,16 @@ int qemu_chr_write_all(Chardev *s, const uint8_t *buf, int len) int qemu_chr_fe_write_all(CharBackend *be, const uint8_t *buf, int len) { -Chardev *s = be->chr; +Chardev *s; +int ret; -if (!s) { -return 0; -} +qemu_mutex_lock(&be->chr_lock); + +s = be->chr; +ret = s ? qemu_chr_write_all(s, buf, len) : 0; -return qemu_chr_write_all(s, buf, len); +qemu_mutex_unlock(&be->chr_lock); +return ret; } int qemu_chr_fe_read_all(CharBackend *be, uint8_t *buf, int len) @@ -478,7 +487,7 @@ Chardev *qemu_chr_fe_get_driver(CharBackend *be) return be->chr; } -bool qemu_chr_fe_init(CharBackend *b, Chardev *s, Error **errp) +static bool qemu_chr_fe_connect(CharBackend *b, Chardev *s, Error **errp) { int tag = 0; @@ -507,6 +516,16 @@ unavailable: return false; } +bool qemu_chr_fe_init(CharBackend *b, Chardev *s, Error **errp) +{ +if (!qemu_chr_fe_connect(b, s, errp)) { +return false; +} + +qemu_mutex_init(&b->chr_lock); +return true; +} + static bool qemu_chr_is_busy(Chardev *s) { if (CHARDEV_IS_MUX(s)) { @@ -531,6 +550,7 @@ void qemu_chr_fe_deinit(CharBackend *b) d->backends[b->tag] = NULL; } b->chr = NULL; +qemu_mutex_destroy(&b->chr_lock); } } @@ -1306,6 +1326,92 @@ ChardevReturn *qmp_chardev_add(const char *id, ChardevBackend *backend, return ret; } +ChardevReturn *qmp_chardev_change(const char *id, ChardevBackend *backend, + Error **errp) +{ +CharBackend *be; +const ChardevClass *cc; +Chardev *chr, *chr_new; +bool closed_sent = false; +ChardevReturn *ret; + +chr = qemu_chr_find(id); +if (!chr) { +error_setg(errp, "Chardev '%s' does not exist", id); +return NULL; +} + +if (CHARDEV_IS_MUX(chr)) { +error_setg(errp, "Mux device hotswap not supported yet"); +return NULL; +} + +if (qemu_chr_replay(chr)) { +error_setg(errp, +"Chardev '%s' cannot be changed in record/replay mode", id); +return NULL; +} + +be = chr->be; +if (!be) { +/* easy case */ +object_unparent(OBJECT(chr)); +return qmp_chardev_add(id, backend, errp); +} + +if (!be->chr_be_change) { +error_setg(errp, "Cha
[Qemu-devel] [PATCH v3 11/13] virtio-console: chardev hotswap support
Signed-off-by: Anton Nefedov Reviewed-by: Vladimir Sementsov-Ogievskiy CC: Amit Shah --- hw/char/virtio-console.c | 35 ++- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/hw/char/virtio-console.c b/hw/char/virtio-console.c index cf7331d..bd74669 100644 --- a/hw/char/virtio-console.c +++ b/hw/char/virtio-console.c @@ -49,7 +49,7 @@ static ssize_t flush_buf(VirtIOSerialPort *port, VirtConsole *vcon = VIRTIO_CONSOLE(port); ssize_t ret; -if (!qemu_chr_fe_get_driver(&vcon->chr)) { +if (!qemu_chr_fe_backend_connected(&vcon->chr)) { /* If there's no backend, we can just say we consumed all data. */ return len; } @@ -163,12 +163,35 @@ static void chr_event(void *opaque, int event) } } +static int chr_be_change(void *opaque) +{ +VirtConsole *vcon = opaque; +VirtIOSerialPort *port = VIRTIO_SERIAL_PORT(vcon); +VirtIOSerialPortClass *k = VIRTIO_SERIAL_PORT_GET_CLASS(port); + +if (k->is_console) { +qemu_chr_fe_set_handlers(&vcon->chr, chr_can_read, chr_read, + NULL, chr_be_change, vcon, NULL, true); +} else { +qemu_chr_fe_set_handlers(&vcon->chr, chr_can_read, chr_read, + chr_event, chr_be_change, vcon, NULL, false); +} + +if (vcon->watch) { +g_source_remove(vcon->watch); +vcon->watch = qemu_chr_fe_add_watch(&vcon->chr, +G_IO_OUT | G_IO_HUP, +chr_write_unblocked, vcon); +} + +return 0; +} + static void virtconsole_realize(DeviceState *dev, Error **errp) { VirtIOSerialPort *port = VIRTIO_SERIAL_PORT(dev); VirtConsole *vcon = VIRTIO_CONSOLE(dev); VirtIOSerialPortClass *k = VIRTIO_SERIAL_PORT_GET_CLASS(dev); -Chardev *chr = qemu_chr_fe_get_driver(&vcon->chr); if (port->id == 0 && !k->is_console) { error_setg(errp, "Port number 0 on virtio-serial devices reserved " @@ -176,7 +199,7 @@ static void virtconsole_realize(DeviceState *dev, Error **errp) return; } -if (chr) { +if (qemu_chr_fe_backend_connected(&vcon->chr)) { /* * For consoles we don't block guest data transfer just * because nothing is connected - we'll just let it go @@ -188,11 +211,13 @@ static void virtconsole_realize(DeviceState *dev, Error **errp) */ if (k->is_console) { qemu_chr_fe_set_handlers(&vcon->chr, chr_can_read, chr_read, - NULL, NULL, vcon, NULL, true); + NULL, chr_be_change, + vcon, NULL, true); virtio_serial_open(port); } else { qemu_chr_fe_set_handlers(&vcon->chr, chr_can_read, chr_read, - chr_event, NULL, vcon, NULL, false); + chr_event, chr_be_change, + vcon, NULL, false); } } } -- 2.7.4
[Qemu-devel] [PATCH v3 01/13] char: move QemuOpts->ChardevBackend translation to a separate func
parse function will be used by the following patch Signed-off-by: Anton Nefedov Reviewed-by: Vladimir Sementsov-Ogievskiy --- chardev/char.c | 70 -- 1 file changed, 44 insertions(+), 26 deletions(-) diff --git a/chardev/char.c b/chardev/char.c index 4e24dc3..3a0f543 100644 --- a/chardev/char.c +++ b/chardev/char.c @@ -854,17 +854,13 @@ help_string_append(const char *name, void *opaque) g_string_append_printf(str, "\n%s", name); } -Chardev *qemu_chr_new_from_opts(QemuOpts *opts, -Error **errp) +static ChardevBackend *qemu_chr_parse_opts(QemuOpts *opts, Error **errp) { Error *local_err = NULL; const ChardevClass *cc; -Chardev *chr; int i; ChardevBackend *backend = NULL; const char *name = qemu_opt_get(opts, "backend"); -const char *id = qemu_opts_id(opts); -char *bid = NULL; if (name == NULL) { error_setg(errp, "chardev: \"%s\" missing backend", @@ -872,21 +868,6 @@ Chardev *qemu_chr_new_from_opts(QemuOpts *opts, return NULL; } -if (is_help_option(name)) { -GString *str = g_string_new(""); - -chardev_name_foreach(help_string_append, str); - -error_report("Available chardev backend types: %s", str->str); -g_string_free(str, true); -exit(0); -} - -if (id == NULL) { -error_setg(errp, "chardev: no id specified"); -return NULL; -} - for (i = 0; i < ARRAY_SIZE(chardev_alias_table); i++) { if (g_strcmp0(chardev_alias_table[i].alias, name) == 0) { name = chardev_alias_table[i].typename; @@ -902,16 +883,12 @@ Chardev *qemu_chr_new_from_opts(QemuOpts *opts, backend = g_new0(ChardevBackend, 1); backend->type = CHARDEV_BACKEND_KIND_NULL; -if (qemu_opt_get_bool(opts, "mux", 0)) { -bid = g_strdup_printf("%s-base", id); -} - -chr = NULL; if (cc->parse) { cc->parse(opts, backend, &local_err); if (local_err) { error_propagate(errp, local_err); -goto out; +qapi_free_ChardevBackend(backend); +return NULL; } } else { ChardevCommon *ccom = g_new0(ChardevCommon, 1); @@ -919,6 +896,47 @@ Chardev *qemu_chr_new_from_opts(QemuOpts *opts, backend->u.null.data = ccom; /* Any ChardevCommon member would work */ } +return backend; +} + +Chardev *qemu_chr_new_from_opts(QemuOpts *opts, Error **errp) +{ +const ChardevClass *cc; +Chardev *chr = NULL; +ChardevBackend *backend = NULL; +const char *name = qemu_opt_get(opts, "backend"); +const char *id = qemu_opts_id(opts); +char *bid = NULL; + +if (name && is_help_option(name)) { +GString *str = g_string_new(""); + +chardev_name_foreach(help_string_append, str); + +error_report("Available chardev backend types: %s", str->str); +g_string_free(str, true); +exit(0); +} + +if (id == NULL) { +error_setg(errp, "chardev: no id specified"); +return NULL; +} + +backend = qemu_chr_parse_opts(opts, errp); +if (backend == NULL) { +return NULL; +} + +cc = char_get_class(name, errp); +if (cc == NULL) { +goto out; +} + +if (qemu_opt_get_bool(opts, "mux", 0)) { +bid = g_strdup_printf("%s-base", id); +} + chr = qemu_chardev_new(bid ? bid : id, object_class_get_name(OBJECT_CLASS(cc)), backend, errp); -- 2.7.4
[Qemu-devel] [PATCH v3 10/13] hmp: add hmp analogue for qmp-chardev-change
Signed-off-by: Anton Nefedov Reviewed-by: Vladimir Sementsov-Ogievskiy CC: Dr. David Alan Gilbert --- chardev/char.c| 2 +- hmp-commands.hx | 16 hmp.c | 34 ++ hmp.h | 1 + include/sysemu/char.h | 12 5 files changed, 64 insertions(+), 1 deletion(-) diff --git a/chardev/char.c b/chardev/char.c index 2d6e204..1e2a2dd 100644 --- a/chardev/char.c +++ b/chardev/char.c @@ -888,7 +888,7 @@ help_string_append(const char *name, void *opaque) g_string_append_printf(str, "\n%s", name); } -static ChardevBackend *qemu_chr_parse_opts(QemuOpts *opts, Error **errp) +ChardevBackend *qemu_chr_parse_opts(QemuOpts *opts, Error **errp) { Error *local_err = NULL; const ChardevClass *cc; diff --git a/hmp-commands.hx b/hmp-commands.hx index baeac47..19bfddd 100644 --- a/hmp-commands.hx +++ b/hmp-commands.hx @@ -1727,6 +1727,22 @@ chardev_add accepts the same parameters as the -chardev command line switch. ETEXI { +.name = "chardev-change", +.args_type = "id:s,args:s", +.params = "id args", +.help = "change chardev", +.cmd= hmp_chardev_change, +}, + +STEXI +@item chardev-change args +@findex chardev-change +chardev_change accepts existing chardev @var{id} and then the same arguments +as the -chardev command line switch (except for "id"). + +ETEXI + +{ .name = "chardev-remove", .args_type = "id:s", .params = "id", diff --git a/hmp.c b/hmp.c index 20f5dab..7660495 100644 --- a/hmp.c +++ b/hmp.c @@ -2224,6 +2224,40 @@ void hmp_chardev_add(Monitor *mon, const QDict *qdict) hmp_handle_error(mon, &err); } +void hmp_chardev_change(Monitor *mon, const QDict *qdict) +{ +const char *args = qdict_get_str(qdict, "args"); +const char *id; +Error *err = NULL; +ChardevBackend *backend = NULL; +ChardevReturn *ret = NULL; +QemuOpts *opts = qemu_opts_parse_noisily(qemu_find_opts("chardev"), args, + true); +if (!opts) { +error_setg(&err, "Parsing chardev args failed"); +goto end; +} + +id = qdict_get_str(qdict, "id"); +if (qemu_opts_id(opts)) { +error_setg(&err, "Unexpected 'id' parameter"); +goto end; +} + +backend = qemu_chr_parse_opts(opts, &err); +if (!backend) { +goto end; +} + +ret = qmp_chardev_change(id, backend, &err); + +end: +qapi_free_ChardevReturn(ret); +qapi_free_ChardevBackend(backend); +qemu_opts_del(opts); +hmp_handle_error(mon, &err); +} + void hmp_chardev_remove(Monitor *mon, const QDict *qdict) { Error *local_err = NULL; diff --git a/hmp.h b/hmp.h index d8b94ce..23e035c 100644 --- a/hmp.h +++ b/hmp.h @@ -102,6 +102,7 @@ void hmp_nbd_server_start(Monitor *mon, const QDict *qdict); void hmp_nbd_server_add(Monitor *mon, const QDict *qdict); void hmp_nbd_server_stop(Monitor *mon, const QDict *qdict); void hmp_chardev_add(Monitor *mon, const QDict *qdict); +void hmp_chardev_change(Monitor *mon, const QDict *qdict); void hmp_chardev_remove(Monitor *mon, const QDict *qdict); void hmp_qemu_io(Monitor *mon, const QDict *qdict); void hmp_cpu_add(Monitor *mon, const QDict *qdict); diff --git a/include/sysemu/char.h b/include/sysemu/char.h index 342f531..18fcd26 100644 --- a/include/sysemu/char.h +++ b/include/sysemu/char.h @@ -131,6 +131,18 @@ Chardev *qemu_chr_new_from_opts(QemuOpts *opts, void qemu_chr_parse_common(QemuOpts *opts, ChardevCommon *backend); /** + * @qemu_chr_parse_opts: + * + * Parse the options to the ChardevBackend struct. + * + * @opts + * + * Returns: a new backend + */ +ChardevBackend *qemu_chr_parse_opts(QemuOpts *opts, +Error **errp); + +/** * @qemu_chr_new: * * Create a new character backend from a URI. -- 2.7.4
[Qemu-devel] [PATCH v3 00/13] chardevice hotswap
Changed in v3: - minor remarks to patch 1 applied - patch 3: avoid using bottom-half, handle syncronously As mentioned, it gets thing complicated and is only a problem for a monitor-connected chardev hotswap and that is not supported for now - tests added (patches 6-9) This serie is a v2 of the February submit http://lists.nongnu.org/archive/html/qemu-devel/2017-02/msg01989.html The interface is changed as requested and the changes are slightly reworked and split into separate patches. The patchset adds support of the character device change without a frontend device removal. Yet isa-serial and virtio-serial frontends are supported. The feature can be helpful for e.g. Windows debug allowing to establish connection to a live VM from VM with WinDbg. Anton Nefedov (13): char: move QemuOpts->ChardevBackend translation to a separate func char: add backend hotswap handler char: chardevice hotswap char: forbid direct chardevice access for hotswap devices char: avoid chardevice direct access test-char: unref chardev-udp after test test-char: split char_udp_test test-char: split char_file_test test-char: add hotswap test hmp: add hmp analogue for qmp-chardev-change virtio-console: chardev hotswap support serial: move TIOCM update to a separate function serial: chardev hotswap support backends/rng-egd.c | 2 +- chardev/char-mux.c | 1 + chardev/char.c | 212 --- gdbstub.c | 4 +- hmp-commands.hx | 16 +++ hmp.c | 34 ++ hmp.h | 1 + hw/arm/pxa2xx.c | 3 +- hw/arm/strongarm.c | 4 +- hw/char/bcm2835_aux.c | 2 +- hw/char/cadence_uart.c | 4 +- hw/char/debugcon.c | 4 +- hw/char/digic-uart.c| 2 +- hw/char/escc.c | 8 +- hw/char/etraxfs_ser.c | 2 +- hw/char/exynos4210_uart.c | 4 +- hw/char/grlib_apbuart.c | 4 +- hw/char/imx_serial.c| 2 +- hw/char/ipoctal232.c| 4 +- hw/char/lm32_juart.c| 2 +- hw/char/lm32_uart.c | 2 +- hw/char/mcf_uart.c | 2 +- hw/char/milkymist-uart.c| 2 +- hw/char/parallel.c | 2 +- hw/char/pl011.c | 2 +- hw/char/sclpconsole-lm.c| 4 +- hw/char/sclpconsole.c | 4 +- hw/char/serial.c| 63 --- hw/char/sh_serial.c | 4 +- hw/char/spapr_vty.c | 4 +- hw/char/stm32f2xx_usart.c | 3 +- hw/char/terminal3270.c | 4 +- hw/char/virtio-console.c| 35 +- hw/char/xen_console.c | 4 +- hw/char/xilinx_uartlite.c | 2 +- hw/ipmi/ipmi_bmc_extern.c | 4 +- hw/mips/boston.c| 2 +- hw/mips/mips_malta.c| 2 +- hw/misc/ivshmem.c | 6 +- hw/usb/ccid-card-passthru.c | 6 +- hw/usb/dev-serial.c | 7 +- hw/usb/redirect.c | 7 +- include/sysemu/char.h | 43 monitor.c | 4 +- net/colo-compare.c | 14 ++- net/filter-mirror.c | 8 +- net/slirp.c | 2 +- net/vhost-user.c| 7 +- qapi-schema.json| 40 +++ qtest.c | 2 +- tests/test-char.c | 263 +--- tests/vhost-user-test.c | 2 +- 52 files changed, 669 insertions(+), 202 deletions(-) -- 2.7.4
[Qemu-devel] [PATCH v3 06/13] test-char: unref chardev-udp after test
this is only not a problem if the test is last in a suite, otherwise it makes the following main_loop() calls to fail Signed-off-by: Anton Nefedov --- tests/test-char.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test-char.c b/tests/test-char.c index f3b377f..d63d3d2 100644 --- a/tests/test-char.c +++ b/tests/test-char.c @@ -456,6 +456,8 @@ static void char_udp_test(void) close(sock); g_free(tmp); +qemu_chr_fe_deinit(&be); +object_unparent(OBJECT(chr)); } static void char_file_test(void) -- 2.7.4
[Qemu-devel] [PATCH v3 13/13] serial: chardev hotswap support
for a backend change, a number of ioctls has to be replayed to sync the current setup of a frontend to a backend tty. This is hopefully enough so we don't have to track, store and replay the whole original control byte sequence. Signed-off-by: Anton Nefedov Reviewed-by: Vladimir Sementsov-Ogievskiy CC: Michael S. Tsirkin CC: Paolo Bonzini --- hw/char/serial.c | 32 ++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/hw/char/serial.c b/hw/char/serial.c index 1e6bdeb..ed01637 100644 --- a/hw/char/serial.c +++ b/hw/char/serial.c @@ -891,9 +891,37 @@ static void serial_reset(void *opaque) s->msr &= ~UART_MSR_ANY_DELTA; } +static int serial_be_change(void *opaque) +{ +SerialState *s = opaque; + +qemu_chr_fe_set_handlers(&s->chr, serial_can_receive1, serial_receive1, + serial_event, serial_be_change, s, NULL, true); + +serial_update_parameters(s); + +qemu_chr_fe_ioctl(&s->chr, CHR_IOCTL_SERIAL_SET_BREAK, + &s->last_break_enable); + +s->poll_msl = (s->ier & UART_IER_MSI) ? 1 : 0; +serial_update_msl(s); + +if (s->poll_msl >= 0 && !(s->mcr & UART_MCR_LOOP)) { +serial_update_tiocm(s); +} + +if (s->watch_tag > 0) { +g_source_remove(s->watch_tag); +s->watch_tag = qemu_chr_fe_add_watch(&s->chr, G_IO_OUT | G_IO_HUP, + serial_watch_cb, s); +} + +return 0; +} + void serial_realize_core(SerialState *s, Error **errp) { -if (!qemu_chr_fe_get_driver(&s->chr)) { +if (!qemu_chr_fe_backend_connected(&s->chr)) { error_setg(errp, "Can't create serial device, empty char device"); return; } @@ -904,7 +932,7 @@ void serial_realize_core(SerialState *s, Error **errp) qemu_register_reset(serial_reset, s); qemu_chr_fe_set_handlers(&s->chr, serial_can_receive1, serial_receive1, - serial_event, NULL, s, NULL, true); + serial_event, serial_be_change, s, NULL, true); fifo8_create(&s->recv_fifo, UART_FIFO_LENGTH); fifo8_create(&s->xmit_fifo, UART_FIFO_LENGTH); serial_reset(s); -- 2.7.4
[Qemu-devel] [PATCH v3 07/13] test-char: split char_udp_test
makes it possible to test the existing chardev-udp Signed-off-by: Anton Nefedov --- tests/test-char.c | 58 +++ 1 file changed, 41 insertions(+), 17 deletions(-) diff --git a/tests/test-char.c b/tests/test-char.c index d63d3d2..ad0752a 100644 --- a/tests/test-char.c +++ b/tests/test-char.c @@ -413,16 +413,11 @@ static void char_pipe_test(void) } #endif -static void char_udp_test(void) +static int make_udp_socket(int *port) { -struct sockaddr_in addr = { 0, }, other; -SocketIdleData d = { 0, }; -Chardev *chr; -CharBackend be; +struct sockaddr_in addr = { 0, }; socklen_t alen = sizeof(addr); int ret, sock = qemu_socket(PF_INET, SOCK_DGRAM, 0); -char buf[10]; -char *tmp; g_assert_cmpint(sock, >, 0); addr.sin_family = AF_INET ; @@ -433,19 +428,41 @@ static void char_udp_test(void) ret = getsockname(sock, (struct sockaddr *)&addr, &alen); g_assert_cmpint(ret, ==, 0); -tmp = g_strdup_printf("udp:127.0.0.1:%d", - ntohs(addr.sin_port)); -chr = qemu_chr_new("client", tmp); -g_assert_nonnull(chr); +*port = ntohs(addr.sin_port); +return sock; +} + +static void char_udp_test_internal(Chardev *reuse_chr, int sock) +{ +struct sockaddr_in other; +SocketIdleData d = { 0, }; +Chardev *chr; +CharBackend *be; +socklen_t alen = sizeof(other); +int ret; +char buf[10]; +char *tmp = NULL; + +if (reuse_chr) { +chr = reuse_chr; +be = chr->be; +} else { +int port; +sock = make_udp_socket(&port); +tmp = g_strdup_printf("udp:127.0.0.1:%d", port); +chr = qemu_chr_new("client", tmp); +g_assert_nonnull(chr); + +be = g_alloca(sizeof(CharBackend)); +qemu_chr_fe_init(be, chr, &error_abort); +} d.chr = chr; -qemu_chr_fe_init(&be, chr, &error_abort); -qemu_chr_fe_set_handlers(&be, socket_can_read_hello, socket_read_hello, +qemu_chr_fe_set_handlers(be, socket_can_read_hello, socket_read_hello, NULL, NULL, &d, NULL, true); ret = qemu_chr_write_all(chr, (uint8_t *)"hello", 5); g_assert_cmpint(ret, ==, 5); -alen = sizeof(addr); ret = recvfrom(sock, buf, sizeof(buf), 0, (struct sockaddr *)&other, &alen); g_assert_cmpint(ret, ==, 5); @@ -454,10 +471,17 @@ static void char_udp_test(void) main_loop(); -close(sock); +if (!reuse_chr) { +close(sock); +qemu_chr_fe_deinit(be); +object_unparent(OBJECT(chr)); +} g_free(tmp); -qemu_chr_fe_deinit(&be); -object_unparent(OBJECT(chr)); +} + +static void char_udp_test(void) +{ +char_udp_test_internal(NULL, 0); } static void char_file_test(void) -- 2.7.4
[Qemu-devel] [PATCH v3 12/13] serial: move TIOCM update to a separate function
will be used by the following patch Signed-off-by: Anton Nefedov Reviewed-by: Vladimir Sementsov-Ogievskiy CC: Michael S. Tsirkin CC: Paolo Bonzini Reviewed-by: Marc-André Lureau --- hw/char/serial.c | 31 +++ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/hw/char/serial.c b/hw/char/serial.c index d8d34d0..1e6bdeb 100644 --- a/hw/char/serial.c +++ b/hw/char/serial.c @@ -312,6 +312,24 @@ static void serial_write_fcr(SerialState *s, uint8_t val) } } +static void serial_update_tiocm(SerialState *s) +{ +int flags; + +qemu_chr_fe_ioctl(&s->chr, CHR_IOCTL_SERIAL_GET_TIOCM, &flags); + +flags &= ~(CHR_TIOCM_RTS | CHR_TIOCM_DTR); + +if (s->mcr & UART_MCR_RTS) { +flags |= CHR_TIOCM_RTS; +} +if (s->mcr & UART_MCR_DTR) { +flags |= CHR_TIOCM_DTR; +} + +qemu_chr_fe_ioctl(&s->chr, CHR_IOCTL_SERIAL_SET_TIOCM, &flags); +} + static void serial_ioport_write(void *opaque, hwaddr addr, uint64_t val, unsigned size) { @@ -426,24 +444,13 @@ static void serial_ioport_write(void *opaque, hwaddr addr, uint64_t val, break; case 4: { -int flags; int old_mcr = s->mcr; s->mcr = val & 0x1f; if (val & UART_MCR_LOOP) break; if (s->poll_msl >= 0 && old_mcr != s->mcr) { - -qemu_chr_fe_ioctl(&s->chr, CHR_IOCTL_SERIAL_GET_TIOCM, &flags); - -flags &= ~(CHR_TIOCM_RTS | CHR_TIOCM_DTR); - -if (val & UART_MCR_RTS) -flags |= CHR_TIOCM_RTS; -if (val & UART_MCR_DTR) -flags |= CHR_TIOCM_DTR; - -qemu_chr_fe_ioctl(&s->chr, CHR_IOCTL_SERIAL_SET_TIOCM, &flags); +serial_update_tiocm(s); /* Update the modem status after a one-character-send wait-time, since there may be a response from the device/computer at the other end of the serial line */ timer_mod(s->modem_status_poll, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + s->char_transmit_time); -- 2.7.4
[Qemu-devel] [PATCH v3 05/13] char: avoid chardevice direct access
frontends should avoid accessing CharDriver struct where possible Signed-off-by: Anton Nefedov Reviewed-by: Vladimir Sementsov-Ogievskiy --- chardev/char.c | 5 + gdbstub.c | 2 +- hw/arm/strongarm.c | 2 +- hw/char/cadence_uart.c | 2 +- hw/char/debugcon.c | 2 +- hw/char/escc.c | 6 +++--- hw/char/exynos4210_uart.c | 2 +- hw/char/grlib_apbuart.c | 2 +- hw/char/ipoctal232.c| 2 +- hw/char/parallel.c | 2 +- hw/char/sclpconsole-lm.c| 2 +- hw/char/sclpconsole.c | 2 +- hw/char/sh_serial.c | 2 +- hw/char/spapr_vty.c | 2 +- hw/char/terminal3270.c | 2 +- hw/char/xen_console.c | 2 +- hw/ipmi/ipmi_bmc_extern.c | 2 +- hw/misc/ivshmem.c | 4 ++-- hw/usb/ccid-card-passthru.c | 4 ++-- hw/usb/dev-serial.c | 5 ++--- hw/usb/redirect.c | 5 ++--- include/sysemu/char.h | 7 +++ net/filter-mirror.c | 2 +- 23 files changed, 39 insertions(+), 29 deletions(-) diff --git a/chardev/char.c b/chardev/char.c index 1eed934..2d6e204 100644 --- a/chardev/char.c +++ b/chardev/char.c @@ -494,6 +494,11 @@ bool qemu_chr_fe_backend_connected(CharBackend *be) return !!be->chr; } +bool qemu_chr_fe_backend_open(CharBackend *be) +{ +return be->chr && be->chr->be_open; +} + static bool qemu_chr_fe_connect(CharBackend *b, Chardev *s, Error **errp) { int tag = 0; diff --git a/gdbstub.c b/gdbstub.c index 1ac0489..68cbe8a 100644 --- a/gdbstub.c +++ b/gdbstub.c @@ -2001,7 +2001,7 @@ int gdbserver_start(const char *device) NULL, &error_abort); monitor_init(mon_chr, 0); } else { -if (qemu_chr_fe_get_driver(&s->chr)) { +if (qemu_chr_fe_backend_connected(&s->chr)) { object_unparent(OBJECT(qemu_chr_fe_get_driver(&s->chr))); } mon_chr = s->mon_chr; diff --git a/hw/arm/strongarm.c b/hw/arm/strongarm.c index bec093d..9d7cf21 100644 --- a/hw/arm/strongarm.c +++ b/hw/arm/strongarm.c @@ -1105,7 +1105,7 @@ static void strongarm_uart_tx(void *opaque) if (s->utcr3 & UTCR3_LBM) /* loopback */ { strongarm_uart_receive(s, &s->tx_fifo[s->tx_start], 1); -} else if (qemu_chr_fe_get_driver(&s->chr)) { +} else if (qemu_chr_fe_backend_connected(&s->chr)) { /* XXX this blocks entire thread. Rewrite to use * qemu_chr_fe_write and background I/O callbacks */ qemu_chr_fe_write_all(&s->chr, &s->tx_fifo[s->tx_start], 1); diff --git a/hw/char/cadence_uart.c b/hw/char/cadence_uart.c index 71867b3..19636c0 100644 --- a/hw/char/cadence_uart.c +++ b/hw/char/cadence_uart.c @@ -278,7 +278,7 @@ static gboolean cadence_uart_xmit(GIOChannel *chan, GIOCondition cond, int ret; /* instant drain the fifo when there's no back-end */ -if (!qemu_chr_fe_get_driver(&s->chr)) { +if (!qemu_chr_fe_backend_connected(&s->chr)) { s->tx_count = 0; return FALSE; } diff --git a/hw/char/debugcon.c b/hw/char/debugcon.c index 6d95297..bd0d4f0 100644 --- a/hw/char/debugcon.c +++ b/hw/char/debugcon.c @@ -87,7 +87,7 @@ static const MemoryRegionOps debugcon_ops = { static void debugcon_realize_core(DebugconState *s, Error **errp) { -if (!qemu_chr_fe_get_driver(&s->chr)) { +if (!qemu_chr_fe_backend_connected(&s->chr)) { error_setg(errp, "Can't create debugcon device, empty char device"); return; } diff --git a/hw/char/escc.c b/hw/char/escc.c index aa882b6..dbbeb4a 100644 --- a/hw/char/escc.c +++ b/hw/char/escc.c @@ -416,7 +416,7 @@ static void escc_update_parameters(ChannelState *s) int speed, parity, data_bits, stop_bits; QEMUSerialSetParams ssp; -if (!qemu_chr_fe_get_driver(&s->chr) || s->type != ser) +if (!qemu_chr_fe_backend_connected(&s->chr) || s->type != ser) return; if (s->wregs[W_TXCTRL1] & TXCTRL1_PAREN) { @@ -556,7 +556,7 @@ static void escc_mem_write(void *opaque, hwaddr addr, trace_escc_mem_writeb_data(CHN_C(s), val); s->tx = val; if (s->wregs[W_TXCTRL2] & TXCTRL2_TXEN) { // tx enabled -if (qemu_chr_fe_get_driver(&s->chr)) { +if (qemu_chr_fe_backend_connected(&s->chr)) { /* XXX this blocks entire thread. Rewrite to use * qemu_chr_fe_write and background I/O callbacks */ qemu_chr_fe_write_all(&s->chr, &s->tx, 1); @@ -1012,7 +1012,7 @@ static void escc_realize(DeviceState *dev, Error **errp) ESCC_SIZE << s->it_shift); for (i = 0; i < 2; i++) { -if (qemu_chr_fe_get_driver(&s->chn[i].chr)) { +if (qemu_chr_fe_backend_connected(&s->chn[i].chr)) {
[Qemu-devel] [PATCH v3 02/13] char: add backend hotswap handler
Frontends should have an interface to setup the handler of a backend change. The interface will be used in the next commits Signed-off-by: Anton Nefedov Reviewed-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Marc-André Lureau --- backends/rng-egd.c | 2 +- chardev/char-mux.c | 1 + chardev/char.c | 4 +++- gdbstub.c | 2 +- hw/arm/pxa2xx.c | 3 ++- hw/arm/strongarm.c | 2 +- hw/char/bcm2835_aux.c | 2 +- hw/char/cadence_uart.c | 2 +- hw/char/debugcon.c | 2 +- hw/char/digic-uart.c| 2 +- hw/char/escc.c | 2 +- hw/char/etraxfs_ser.c | 2 +- hw/char/exynos4210_uart.c | 2 +- hw/char/grlib_apbuart.c | 2 +- hw/char/imx_serial.c| 2 +- hw/char/ipoctal232.c| 2 +- hw/char/lm32_juart.c| 2 +- hw/char/lm32_uart.c | 2 +- hw/char/mcf_uart.c | 2 +- hw/char/milkymist-uart.c| 2 +- hw/char/pl011.c | 2 +- hw/char/sclpconsole-lm.c| 2 +- hw/char/sclpconsole.c | 2 +- hw/char/serial.c| 2 +- hw/char/sh_serial.c | 2 +- hw/char/spapr_vty.c | 2 +- hw/char/stm32f2xx_usart.c | 3 ++- hw/char/terminal3270.c | 2 +- hw/char/virtio-console.c| 4 ++-- hw/char/xen_console.c | 2 +- hw/char/xilinx_uartlite.c | 2 +- hw/ipmi/ipmi_bmc_extern.c | 2 +- hw/mips/boston.c| 2 +- hw/mips/mips_malta.c| 2 +- hw/misc/ivshmem.c | 2 +- hw/usb/ccid-card-passthru.c | 2 +- hw/usb/dev-serial.c | 2 +- hw/usb/redirect.c | 2 +- include/sysemu/char.h | 5 + monitor.c | 4 ++-- net/colo-compare.c | 14 -- net/filter-mirror.c | 6 +++--- net/slirp.c | 2 +- net/vhost-user.c| 7 --- qtest.c | 2 +- tests/test-char.c | 14 ++ tests/vhost-user-test.c | 2 +- 47 files changed, 78 insertions(+), 59 deletions(-) diff --git a/backends/rng-egd.c b/backends/rng-egd.c index 380b19a..0b0e945 100644 --- a/backends/rng-egd.c +++ b/backends/rng-egd.c @@ -106,7 +106,7 @@ static void rng_egd_opened(RngBackend *b, Error **errp) /* FIXME we should resubmit pending requests when the CDS reconnects. */ qemu_chr_fe_set_handlers(&s->chr, rng_egd_chr_can_read, - rng_egd_chr_read, NULL, s, NULL, true); + rng_egd_chr_read, NULL, NULL, s, NULL, true); } static void rng_egd_set_chardev(Object *obj, const char *value, Error **errp) diff --git a/chardev/char-mux.c b/chardev/char-mux.c index 37d42c6..5849ea5 100644 --- a/chardev/char-mux.c +++ b/chardev/char-mux.c @@ -278,6 +278,7 @@ void mux_chr_set_handlers(Chardev *chr, GMainContext *context) mux_chr_can_read, mux_chr_read, mux_chr_event, + NULL, chr, context, true); } diff --git a/chardev/char.c b/chardev/char.c index 3a0f543..1b097b3 100644 --- a/chardev/char.c +++ b/chardev/char.c @@ -522,7 +522,7 @@ void qemu_chr_fe_deinit(CharBackend *b) assert(b); if (b->chr) { -qemu_chr_fe_set_handlers(b, NULL, NULL, NULL, NULL, NULL, true); +qemu_chr_fe_set_handlers(b, NULL, NULL, NULL, NULL, NULL, NULL, true); if (b->chr->be == b) { b->chr->be = NULL; } @@ -538,6 +538,7 @@ void qemu_chr_fe_set_handlers(CharBackend *b, IOCanReadHandler *fd_can_read, IOReadHandler *fd_read, IOEventHandler *fd_event, + BackendChangeHandler *be_change, void *opaque, GMainContext *context, bool set_open) @@ -561,6 +562,7 @@ void qemu_chr_fe_set_handlers(CharBackend *b, b->chr_can_read = fd_can_read; b->chr_read = fd_read; b->chr_event = fd_event; +b->chr_be_change = be_change; b->opaque = opaque; if (cc->chr_update_read_handler) { cc->chr_update_read_handler(s, context); diff --git a/gdbstub.c b/gdbstub.c index 86eed4f..1ac0489 100644 --- a/gdbstub.c +++ b/gdbstub.c @@ -2013,7 +2013,7 @@ int gdbserver_start(const char *device) if (chr) { qemu_chr_fe_init(&s->chr, chr, &error_abort); qemu_chr_fe_set_handlers(&s->chr, gdb_chr_can_receive, gdb_chr_receive, - gdb_chr_event, NULL, NULL, true); + gdb_chr_event, NULL, NULL, NULL, true); } s->state = chr ? RS_IDLE : RS_INACTIVE; s->mon_chr = mon_chr; diff --git a/hw/arm/pxa2xx.c b/hw/arm/pxa2xx.c index eea551d..3e51882 10
Re: [Qemu-devel] [PATCH 0/7] qcow2: Reduce the number of I/O ops when doing COW
On 05/26/2017 04:08 PM, Alberto Garcia wrote: On Fri 26 May 2017 02:47:55 PM CEST, Anton Nefedov wrote: Tried the another machine; about 10% improvement here [...] [root@localhost ~]# fio --name=randwrite --blocksize=4k --filename=/dev/sdb --rw=randwrite --direct=1 --ioengine=libaio --size=2g --io_size=32m In my tests I sometimes detected slight performance decreases in that HDD scenario but using 'write' instead of 'randwrite' (and --runtime=60 instead of --io_size). Can you try and see how that works for you? Berto For me it keeps giving pretty much the same result before and after write: io=512736KB, bw=8545.4KB/s, iops=2136, runt= 60004msec /Anton
Re: [Qemu-devel] [PATCH 0/7] qcow2: Reduce the number of I/O ops when doing COW
On 05/26/2017 01:17 PM, Kevin Wolf wrote: Am 24.05.2017 um 18:09 hat Anton Nefedov geschrieben: I agree; as mentioned we have similar patches and they don't conflict much. We noticed a performance regression on HDD though, for the presumably optimized case (random 4k write over a large backed image); so the patches were put on hold. You're talking about your own patches that should do the same thing, right? Can you re-do the same test with Berto's patches? Maybe there was just an implementation glitch in yours. This approach should very obviously result in a performance improvement, and the patches are relatively simple, so I'm very much inclined to merge this as soon as possible. Kevin Tried the another machine; about 10% improvement here [root@dpclient centos-7.3-x86_64]# qemu-img info ./harddisk2.hdd image: ./harddisk2.hdd file format: qcow2 virtual size: 2.0G (2147483648 bytes) disk size: 260K cluster_size: 65536 backing file: harddisk2.hdd.base (actual path: ./harddisk2.hdd.base) Format specific information: compat: 1.1 lazy refcounts: false refcount bits: 16 corrupt: false [root@dpclient centos-7.3-x86_64]# qemu-img info ./harddisk2.hdd.base image: ./harddisk2.hdd.base file format: qcow2 virtual size: 2.0G (2147483648 bytes) disk size: 2.0G cluster_size: 65536 Format specific information: compat: 1.1 lazy refcounts: true refcount bits: 16 corrupt: false [root@dpclient centos-7.3-x86_64]# filefrag ./harddisk2.hdd.base ./harddisk2.hdd.base: 3 extents found [root@localhost ~]# fio --name=randwrite --blocksize=4k --filename=/dev/sdb --rw=randwrite --direct=1 --ioengine=libaio --size=2g --io_size=32m /* master */ write: io=32768KB, bw=372785B/s, iops=91, runt= 90010msec /* Berto's patches */ write: io=32768KB, bw=404304B/s, iops=98, runt= 82993msec /Anton
Re: [Qemu-devel] [PATCH v1 01/13] qcow2: alloc space for COW in one chunk
On 05/26/2017 11:57 AM, Denis V. Lunev wrote: On 05/26/2017 11:11 AM, Kevin Wolf wrote: Am 19.05.2017 um 11:34 hat Anton Nefedov geschrieben: From: "Denis V. Lunev" Currently each single write operation can result in 3 write operations if guest offsets are not cluster aligned. One write is performed for the real payload and two for COW-ed areas. Thus the data possibly lays non-contiguously on the host filesystem. This will reduce further sequential read performance significantly. The patch allocates the space in the file with cluster granularity, ensuring 1. better host offset locality 2. less space allocation operations (which can be expensive on distributed storages) Signed-off-by: Denis V. Lunev Signed-off-by: Anton Nefedov I don't think this is the right approach. You end up with two write operations: One write_zeroes and then one data write. If the backend actually supports efficient zero writes, write_zeroes won't necessarily allocate space, but writing data will definitely split the already existing allocation. If anything, we need a new bdrv_allocate() or something that would call fallocate() instead of abusing write_zeroes. great idea. Very nice then. hi Kevin, thanks a lot for your comments. The driver's callback is commented as: /* * Efficiently zero a region of the disk image. Typically an image format * would use a compact metadata representation to implement this. This * function pointer may be NULL or return -ENOSUP and .bdrv_co_writev() * will be called instead. */ int coroutine_fn (*bdrv_co_pwrite_zeroes)(BlockDriverState *bs, int64_t offset, int count, BdrvRequestFlags flags); so it looks like one may expect either efficiency or ENOTSUP from that one? but not necessarily from the safe "facade" bdrv_co_pwrite_zeroes() which falls back to pwritev() /* * Efficiently zero a region of the disk image. Note that this is a regular * I/O request like read or write and should have a reasonable size. This * function is not suitable for zeroing the entire image in a single request * because it may allocate memory for the entire region. */ int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset, int count, BdrvRequestFlags flags); (maybe we should have even more explicit comment here) Then let's indeed create a new bdrv_allocate(), which will result in bdrv_co_pwrite_zeroes(flags|=ZERO_EFFICIENTLY), so we'll reuse most of the code and employ the same driver callback, but return ENOTSUP if it fails? It seems much clearer to me that simply unifying the three write requests into a single one is an improvement. And it's easy to do, I even had a patch once to do this. The reason that I didn't send it was that it seemed to conflict with the data cache approach These changes help a lot on 2 patterns: - writing 4Kb into the middle of the block. The bigger the block size, more we gain - sequential write, which is not sector aligned and comes in 2 requests: we perform allocation, which should be significantly faster than actual write and after that both parts of the write can be executed in parallel. At my opinion both cases are frequent and important. But OK, the code should be improved, you are absolutely right here. block/qcow2.c | 32 +++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/block/qcow2.c b/block/qcow2.c index a8d61f0..2e6a0ec 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -1575,6 +1575,32 @@ fail: return ret; } +static void handle_alloc_space(BlockDriverState *bs, QCowL2Meta *l2meta) +{ +BDRVQcow2State *s = bs->opaque; +BlockDriverState *file = bs->file->bs; +QCowL2Meta *m; +int ret; + +for (m = l2meta; m != NULL; m = m->next) { +uint64_t bytes = m->nb_clusters << s->cluster_bits; + +if (m->cow_start.nb_bytes == 0 && m->cow_end.nb_bytes == 0) { +continue; +} + +/* try to alloc host space in one chunk for better locality */ +ret = file->drv->bdrv_co_pwrite_zeroes(file, m->alloc_offset, bytes, 0); No. This is what you bypass: * All sanity checks that the block layer does * bdrv_inc/dec_in_flight(), which is required for drain to work correctly. Not doing this will cause crashes. * tracked_request_begin/end(), mark_request_serialising() and wait_serialising_requests(), which are required for serialising requests to work correctly * Ensuring correct request alignment for file. This means that e.g. qcow2 with cluster size 512 on a host with a 4k native disk will break. * blkdebug events * before_write_notifiers. Not calling these will cause corrupted backups if someone backups file. * Dirty bitmap updates * Updating write_gen, wr_highest_offset and total_sectors * Ensuring tha
Re: [Qemu-devel] [PATCH v2 3/9] char: chardevice hotswap
Hi Hi Marc-André, thanks for reviewing, pls see answers below On Fri, May 19, 2017 at 4:51 PM Anton Nefedov wrote: This patch adds a possibility to change a char device without a frontend removal. 1. Ideally, it would have to happen transparently to a frontend, i.e. frontend would continue its regular operation. However, backends are not stateless and are set up by the frontends via qemu_chr_fe_<> functions, and it's not (generally) possible to replay that setup entirely in a backend code, as different chardevs respond to the setup calls differently, so do frontends work differently basing on those setup responses. Moreover, some frontend can generally get and save the backend pointer (qemu_chr_fe_get_driver()), and it will become invalid after backend change. So, a frontend which would like to support chardev hotswap has to register a "backend change" handler, and redo its backend setup there. 2. Write path can be used by multiple threads and thus protected with chr_write_lock. So hotswap also has to be protected so write functions won't access a backend being replaced. Why not use the chr_write_lock then? (rename it chr_lock?) chr_write_lock is a Chardev property, it will be gone together with a swapped device. We could remove chr_write_lock and leave only the new be->chr_lock, since it covers everything; I guess the only problem is mux, where multiple CharBackends share the same Chardev. Would smth like this be better?: static void char_lock(CharBackend *be) { qemu_mutex_lock(be->chr && CHARDEV_IS_MUX(be->chr) ? &be->chr->chr_lock : &be->chr_lock); } and use this instead of lock(be->chr_lock) and remove chr_write_lock. (This would rely on a fact that mux is never hotswapped so be->chr_lock is not needed). 3. Hotswap function can be called from e.g. a read handler of a monitor socket. This can cause troubles so it's safer to defer execution to a bottom-half (however, it means we cannot return some of the errors synchronously - but most of them we can) What kind of troubles? if someone tried to hotswap monitor, it would look like: (gdb) bt #0 qmp_chardev_change (id=id@entry=0x7fc24bee7a60 "charmonitor", backend=backend@entry=0x7fc24cc28290, errp=errp@entry=0x7ffde4d8d4f0) at /mnt/code/us-qemu/chardev/char.c:1438 #1 0x7fc249fd914b in hmp_chardev_change (mon=, qdict=) at /mnt/code/us-qemu/hmp.c:2252 #2 0x7fc249edeece in handle_hmp_command (mon=mon@entry=0x7fc24bef87c0, cmdline=0x7fc24bf2345f "charmonitor socket,path=/vz/vmprivate/centosos/mon.socket,server,nowait") at /mnt/code/us-qemu/monitor.c:3100 #3 0x7fc249ee02fa in monitor_command_cb (opaque=0x7fc24bef87c0, cmdline=, readline_opaque=) at /mnt/code/us-qemu/monitor.c:3897 #4 0x7fc24a242428 in readline_handle_byte (rs=0x7fc24bf23450, ch=) at /mnt/code/us-qemu/util/readline.c:393 #5 0x7fc249edf0e2 in monitor_read (opaque=, buf=, size=) at /mnt/code/us-qemu/monitor.c:3880 #6 0x7fc24a1deb2b in tcp_chr_read (chan=, cond=, opaque=) at /mnt/code/us-qemu/chardev/char-socket.c:441 #7 0x7fc240997d7a in g_main_dispatch (context=0x7fc24beda950) at gmain.c:3152 #8 g_main_context_dispatch (context=context@entry=0x7fc24beda950) at gmain.c:3767 #9 0x7fc24a22fe7c in glib_pollfds_poll () at /mnt/code/us-qemu/util/main-loop.c:213 #10 os_host_main_loop_wait (timeout=) at /mnt/code/us-qemu/util/main-loop.c:261 #11 main_loop_wait (nonblocking=nonblocking@entry=0) at /mnt/code/us-qemu/util/main-loop.c:517 #12 0x7fc249e9a19a in main_loop () at /mnt/code/us-qemu/vl.c:1900 #13 main (argc=, argv=, envp=) at /mnt/code/us-qemu/vl.c:4732 i.e. hotswap the device that is in tcp_chr_read (frame #6), and can potentially be accessed (with old pointer on a stack) after it is destroyed by qmp_chardev_change() however, there is no support for monitor hotswap in the patchset. Maybe it's not worth to mess with bottom-halves at all? It causes problems, like those mentioned in your further comments Signed-off-by: Anton Nefedov Reviewed-by: Vladimir Sementsov-Ogievskiy --- chardev/char.c| 147 ++ include/sysemu/char.h | 10 qapi-schema.json | 40 ++ 3 files changed, 187 insertions(+), 10 deletions(-) -bool qemu_chr_fe_init(CharBackend *b, Chardev *s, Error **errp) +static bool fe_connect(CharBackend *b, Chardev *s, Error **errp) { I would rather keep a qemu_chr prefix for consistency Done. int tag = 0; @@ -507,6 +516,17 @@ unavailable: return false; } +bool qemu_chr_fe_init(CharBackend *b, Chardev *s, Error **errp) +{ +if (!fe_connect(b, s, errp)) { +return false; +} + +qemu_mutex_init(&b->chr_lock); +b->hotswap_bh = NULL; +return true; +} + static bool qemu_chr_is_busy(Chardev *s) { if (CHARDEV_IS_MUX(s)) { @@ -531,6 +551,10 @@ void qem
Re: [Qemu-devel] [PATCH v1 04/13] qcow2: preallocation at image expand
On 05/22/2017 10:29 PM, Eric Blake wrote: On 05/19/2017 04:34 AM, Anton Nefedov wrote: From: "Denis V. Lunev" This patch adds image preallocation at expand to provide better locality of QCOW2 image file and optimize this procedure for some distributed storages where this procedure is slow. Image expand requests have to be suspended until the allocation is performed which is done via special QCowL2Meta. This meta is invisible to handle_dependencies() code. This is the main reason for also calling preallocation before metadata write: it might intersect with preallocation triggered by another IO, and has to yield How does this interact with Max's work on preallocated truncate? https://lists.gnu.org/archive/html/qemu-devel/2017-04/msg00267.html Seems like Max's patchset makes it possible to manually (with qemu-img) allocate a continuous guest address space in advance? Preallocation in this series is a little bit different: - it's in-flight, and it fallocates (using drv->write_zeroes) space ahead in the underlying image as qcow2 writes beyond EOF. - it doesn't link L2. So it leaves the image 'sparse' on qcow2 level. If you ask me, these 2 patchsets are not contradictory, and probably won't even merge-conflict :) /Anton
Re: [Qemu-devel] [PATCH 7/7] qcow2: Merge the writing of the COW regions with the guest data
If the guest tries to write data that results on the allocation of a new cluster, instead of writing the guest data first and then the data from the COW regions, write everything together using one single I/O operation. This can improve the write performance by 25% or more, depending on several factors such as the media type, the cluster size and the I/O request size. Signed-off-by: Alberto Garcia --- block/qcow2-cluster.c | 34 ++ block/qcow2.c | 58 ++- block/qcow2.h | 7 +++ 3 files changed, 80 insertions(+), 19 deletions(-) -/* And now we can write everything */ -qemu_iovec_reset(&qiov); -qemu_iovec_add(&qiov, start_buffer, start->nb_bytes); -ret = do_perform_cow_write(bs, m->alloc_offset, start->offset, &qiov); -if (ret < 0) { -goto fail; +/* And now we can write everything. If we have the guest data we + * can write everything in one single operation */ +if (m->data_qiov) { +qemu_iovec_reset(&qiov); +qemu_iovec_add(&qiov, start_buffer, start->nb_bytes); +qemu_iovec_concat(&qiov, m->data_qiov, 0, data_bytes); +qemu_iovec_add(&qiov, end_buffer, end->nb_bytes); Can it be a problem if (m->data_qiov->niov == IOV_MAX)? We had to add merge-iovecs code for the case (maybe there's better solution?) Also, will this work if allocation is split into several l2metas? e.g. one has cow_start.nb_bytes and another has cow_end.nb_bytes /Anton
Re: [Qemu-devel] [PATCH 0/7] qcow2: Reduce the number of I/O ops when doing COW
On 05/24/2017 07:20 PM, Alberto Garcia wrote: On Wed 24 May 2017 06:09:42 PM CEST, Anton Nefedov wrote: I agree; as mentioned we have similar patches and they don't conflict much. We noticed a performance regression on HDD though, for the presumably optimized case (random 4k write over a large backed image); so the patches were put on hold. Interesting, I think that scenario was noticeably faster in my tests. What cluster size(s) and image size(s) were you using? 64k cluster, 2g image, write 32m in portions of 4k at random offsets /Anton
Re: [Qemu-devel] [PATCH 0/7] qcow2: Reduce the number of I/O ops when doing COW
On 05/24/2017 05:20 PM, Alberto Garcia wrote: On Tue 23 May 2017 04:36:52 PM CEST, Eric Blake wrote: here's a patch series that rewrites the copy-on-write code in the qcow2 driver to reduce the number of I/O operations. And it competes with Denis and Anton's patches: https://lists.gnu.org/archive/html/qemu-devel/2017-05/msg04547.html What plan of attack should we take on merging the best parts of these two series? I took a look at that series and unless I overlooked something important it seems that there's actually not that much overlap. Denis and Anton's series deals with cluster preallocation and mine reduces the number of I/O operations when there's a COW scenario. Most of my modifications are in the peform_cow() function, but they barely touch that one. I think we can review both separately, either of us can rebase our series on top of the other, I don't expect big changes or conflicts. Berto I agree; as mentioned we have similar patches and they don't conflict much. We noticed a performance regression on HDD though, for the presumably optimized case (random 4k write over a large backed image); so the patches were put on hold. SSD looked fine though. /Anton
Re: [Qemu-devel] [PATCH v1 02/13] qcow2: is_zero_sectors(): return true if area is outside of backing file
On 05/22/2017 10:14 PM, Eric Blake wrote: On 05/22/2017 02:12 PM, Eric Blake wrote: +++ b/block/qcow2.c @@ -2482,7 +2482,7 @@ static bool is_zero_sectors(BlockDriverState *bs, int64_t start, int64_t res; if (start + count > bs->total_sectors) { -count = bs->total_sectors - start; +count = start < bs->total_sectors ? bs->total_sectors - start : 0; } if (!count) { @@ -2490,7 +2490,9 @@ static bool is_zero_sectors(BlockDriverState *bs, int64_t start, } res = bdrv_get_block_status_above(bs, NULL, start, count, &nr, &file); -return res >= 0 && (res & BDRV_BLOCK_ZERO) && nr == count; +return res >= 0 +&& (((res & BDRV_BLOCK_ZERO) && nr == count) +|| nr == 0); The logic makes sense to me, although the formatting is unusual (we tend to split && and || with the operator at the tail of the previous line, not the head of the new line). This quick check may make me revisit whether it is worth my my RFC series about adding BDRV_BLOCK_EOF for more quickly tracking reads beyond EOF (my solution in that series was able to make the same change to test 154, but by doing it at the block layer instead of the qcow2.c code). https://lists.gnu.org/archive/html/qemu-devel/2017-05/msg01127.html Actually, re-reading my RFC - I was able to change 6 instances in test 154, while your tweak only affected 2 instances (you still left four instances that were allocating). So my approach may still be more optimal in the long run. Yes, looks like your approach is more general; let's drop this one then /Anton
Re: [Qemu-devel] [PATCH v1 03/13] qcow2: do not COW the empty areas
On 05/22/2017 10:24 PM, Eric Blake wrote: On 05/19/2017 04:34 AM, Anton Nefedov wrote: If COW area of the newly allocated cluster is zeroes, there is no reason to write zero sectors in perform_cow() again now as whole clusters are zeroed out in single chunks by handle_alloc_space(). But that's only true if you can guarantee that handle_alloc_space() succeeded at ensuring the cluster reads as zeroes. If you silently ignore errors (which is what patch 1/13 does), you risk assuming that the cluster reads as zeroes when in reality it does not, and then you have corrupted data. Sure; COW is only skipped if pwrite_zeroes() from patch 1/13 succeeds The idea of avoiding a COW of areas that read as zero at the source when the destination also already reads as zeroes makes sense, but I'm not convinced that this patch is safe as written. /Anton
Re: [Qemu-devel] [PATCH v1 01/13] qcow2: alloc space for COW in one chunk
On 05/22/2017 10:00 PM, Eric Blake wrote: On 05/19/2017 04:34 AM, Anton Nefedov wrote: From: "Denis V. Lunev" Currently each single write operation can result in 3 write operations if guest offsets are not cluster aligned. One write is performed for the real payload and two for COW-ed areas. Thus the data possibly lays non-contiguously on the host filesystem. This will reduce further sequential read performance significantly. The patch allocates the space in the file with cluster granularity, ensuring 1. better host offset locality 2. less space allocation operations (which can be expensive on distributed storages) s/storages/storage/ Done. Signed-off-by: Denis V. Lunev Signed-off-by: Anton Nefedov --- block/qcow2.c | 32 +++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/block/qcow2.c b/block/qcow2.c index a8d61f0..2e6a0ec 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -1575,6 +1575,32 @@ fail: return ret; } +static void handle_alloc_space(BlockDriverState *bs, QCowL2Meta *l2meta) +{ +BDRVQcow2State *s = bs->opaque; +BlockDriverState *file = bs->file->bs; +QCowL2Meta *m; +int ret; + +for (m = l2meta; m != NULL; m = m->next) { +uint64_t bytes = m->nb_clusters << s->cluster_bits; + +if (m->cow_start.nb_bytes == 0 && m->cow_end.nb_bytes == 0) { +continue; +} + +/* try to alloc host space in one chunk for better locality */ +ret = file->drv->bdrv_co_pwrite_zeroes(file, m->alloc_offset, bytes, 0); Are we guaranteed that this is a fast operation? (That is, it either results in a hole or an error, and doesn't waste time tediously writing actual zeroes) well, block_int.h reads: /* * Efficiently zero a region of the disk image. Typically an image format * would use a compact metadata representation to implement this. This * function pointer may be NULL or return -ENOSUP and .bdrv_co_writev() * will be called instead. */ int coroutine_fn (*bdrv_co_pwrite_zeroes)(BlockDriverState *bs, int64_t offset, int count, BdrvRequestFlags flags); (and that's why the driver function is used directly, bypassing the 'safe' bdrv interface that would try to write zeroes no matter the cost) As far as I checked the drivers mostly follow the idea + +if (ret != 0) { +continue; +} Supposing we are using a file system that doesn't support holes, then ret will not be zero, and we ended up not allocating anything after all. Is that a problem that we are just blindly continuing the loop as our reaction to the error? /reads further I guess not - you aren't reacting to any error call, but merely using the side effect that an allocation happened for speed when it worked, and ignoring failure (you get the old behavior of the write() now causing the allocation) when it didn't. yes, exactly + +file->total_sectors = MAX(file->total_sectors, + (m->alloc_offset + bytes) / BDRV_SECTOR_SIZE); +} +} + static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags) @@ -1656,8 +1682,12 @@ static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset, if (ret < 0) { goto fail; } - qemu_co_mutex_unlock(&s->lock); + +if (bs->file->bs->drv->bdrv_co_pwrite_zeroes != NULL) { +handle_alloc_space(bs, l2meta); +} Is it really a good idea to be modifying the underlying protocol image outside of the mutex? as far as I understand, qcow2 usually modifies the underlying image outside of the mutex? I guess it's qcow2 metadata that we wouldn't want to touch unlocked At any rate, it looks like your patch is doing a best-effort write zeroes as an attempt to trigger consecutive allocation of the entire cluster in the underlying protocol right after a cluster has been allocated at the qcow2 format layer. Which means there are more syscalls now than there were previously, but now when we do three write() calls at offsets B, A, C, those three calls are into file space that was allocated earlier by the write zeroes, rather than fresh calls into unallocated space that is likely to trigger up to three disjoint allocations. As a discussion point, wouldn't we achieve the same effect of less fragmentation if we instead collect our data into a bounce buffer, and only then do a single write() (or more likely, a writev() where the iov is set up to reconstruct a single buffer on the syscall, but where the source data is still at different offsets)? We'd be avoiding the extra syscalls of pre-allocating the cluster, and while our write() call is still c
[Qemu-devel] [PATCH v2 7/9] serial: move TIOCM update to a separate function
will be used by the following patch Signed-off-by: Anton Nefedov Reviewed-by: Vladimir Sementsov-Ogievskiy CC: Michael S. Tsirkin CC: Paolo Bonzini --- hw/char/serial.c | 31 +++ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/hw/char/serial.c b/hw/char/serial.c index d8d34d0..1e6bdeb 100644 --- a/hw/char/serial.c +++ b/hw/char/serial.c @@ -312,6 +312,24 @@ static void serial_write_fcr(SerialState *s, uint8_t val) } } +static void serial_update_tiocm(SerialState *s) +{ +int flags; + +qemu_chr_fe_ioctl(&s->chr, CHR_IOCTL_SERIAL_GET_TIOCM, &flags); + +flags &= ~(CHR_TIOCM_RTS | CHR_TIOCM_DTR); + +if (s->mcr & UART_MCR_RTS) { +flags |= CHR_TIOCM_RTS; +} +if (s->mcr & UART_MCR_DTR) { +flags |= CHR_TIOCM_DTR; +} + +qemu_chr_fe_ioctl(&s->chr, CHR_IOCTL_SERIAL_SET_TIOCM, &flags); +} + static void serial_ioport_write(void *opaque, hwaddr addr, uint64_t val, unsigned size) { @@ -426,24 +444,13 @@ static void serial_ioport_write(void *opaque, hwaddr addr, uint64_t val, break; case 4: { -int flags; int old_mcr = s->mcr; s->mcr = val & 0x1f; if (val & UART_MCR_LOOP) break; if (s->poll_msl >= 0 && old_mcr != s->mcr) { - -qemu_chr_fe_ioctl(&s->chr, CHR_IOCTL_SERIAL_GET_TIOCM, &flags); - -flags &= ~(CHR_TIOCM_RTS | CHR_TIOCM_DTR); - -if (val & UART_MCR_RTS) -flags |= CHR_TIOCM_RTS; -if (val & UART_MCR_DTR) -flags |= CHR_TIOCM_DTR; - -qemu_chr_fe_ioctl(&s->chr, CHR_IOCTL_SERIAL_SET_TIOCM, &flags); +serial_update_tiocm(s); /* Update the modem status after a one-character-send wait-time, since there may be a response from the device/computer at the other end of the serial line */ timer_mod(s->modem_status_poll, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + s->char_transmit_time); -- 2.7.4
[Qemu-devel] [PATCH v2 9/9] char: avoid chardevice direct access
frontends should avoid accessing CharDriver struct where possible Signed-off-by: Anton Nefedov Reviewed-by: Vladimir Sementsov-Ogievskiy --- chardev/char.c | 5 + gdbstub.c | 2 +- hw/arm/strongarm.c | 2 +- hw/char/cadence_uart.c | 2 +- hw/char/debugcon.c | 2 +- hw/char/escc.c | 6 +++--- hw/char/exynos4210_uart.c | 2 +- hw/char/grlib_apbuart.c | 2 +- hw/char/ipoctal232.c| 2 +- hw/char/parallel.c | 2 +- hw/char/sclpconsole-lm.c| 2 +- hw/char/sclpconsole.c | 2 +- hw/char/sh_serial.c | 2 +- hw/char/spapr_vty.c | 2 +- hw/char/terminal3270.c | 2 +- hw/char/xen_console.c | 2 +- hw/ipmi/ipmi_bmc_extern.c | 2 +- hw/misc/ivshmem.c | 4 ++-- hw/usb/ccid-card-passthru.c | 4 ++-- hw/usb/dev-serial.c | 5 ++--- hw/usb/redirect.c | 5 ++--- include/sysemu/char.h | 7 +++ net/filter-mirror.c | 2 +- 23 files changed, 39 insertions(+), 29 deletions(-) diff --git a/chardev/char.c b/chardev/char.c index 36d6f36..e43d840 100644 --- a/chardev/char.c +++ b/chardev/char.c @@ -494,6 +494,11 @@ bool qemu_chr_fe_backend_connected(CharBackend *be) return !!be->chr; } +bool qemu_chr_fe_backend_open(CharBackend *be) +{ +return be->chr && be->chr->be_open; +} + static bool fe_connect(CharBackend *b, Chardev *s, Error **errp) { int tag = 0; diff --git a/gdbstub.c b/gdbstub.c index 1ac0489..68cbe8a 100644 --- a/gdbstub.c +++ b/gdbstub.c @@ -2001,7 +2001,7 @@ int gdbserver_start(const char *device) NULL, &error_abort); monitor_init(mon_chr, 0); } else { -if (qemu_chr_fe_get_driver(&s->chr)) { +if (qemu_chr_fe_backend_connected(&s->chr)) { object_unparent(OBJECT(qemu_chr_fe_get_driver(&s->chr))); } mon_chr = s->mon_chr; diff --git a/hw/arm/strongarm.c b/hw/arm/strongarm.c index bec093d..9d7cf21 100644 --- a/hw/arm/strongarm.c +++ b/hw/arm/strongarm.c @@ -1105,7 +1105,7 @@ static void strongarm_uart_tx(void *opaque) if (s->utcr3 & UTCR3_LBM) /* loopback */ { strongarm_uart_receive(s, &s->tx_fifo[s->tx_start], 1); -} else if (qemu_chr_fe_get_driver(&s->chr)) { +} else if (qemu_chr_fe_backend_connected(&s->chr)) { /* XXX this blocks entire thread. Rewrite to use * qemu_chr_fe_write and background I/O callbacks */ qemu_chr_fe_write_all(&s->chr, &s->tx_fifo[s->tx_start], 1); diff --git a/hw/char/cadence_uart.c b/hw/char/cadence_uart.c index 71867b3..19636c0 100644 --- a/hw/char/cadence_uart.c +++ b/hw/char/cadence_uart.c @@ -278,7 +278,7 @@ static gboolean cadence_uart_xmit(GIOChannel *chan, GIOCondition cond, int ret; /* instant drain the fifo when there's no back-end */ -if (!qemu_chr_fe_get_driver(&s->chr)) { +if (!qemu_chr_fe_backend_connected(&s->chr)) { s->tx_count = 0; return FALSE; } diff --git a/hw/char/debugcon.c b/hw/char/debugcon.c index 6d95297..bd0d4f0 100644 --- a/hw/char/debugcon.c +++ b/hw/char/debugcon.c @@ -87,7 +87,7 @@ static const MemoryRegionOps debugcon_ops = { static void debugcon_realize_core(DebugconState *s, Error **errp) { -if (!qemu_chr_fe_get_driver(&s->chr)) { +if (!qemu_chr_fe_backend_connected(&s->chr)) { error_setg(errp, "Can't create debugcon device, empty char device"); return; } diff --git a/hw/char/escc.c b/hw/char/escc.c index aa882b6..dbbeb4a 100644 --- a/hw/char/escc.c +++ b/hw/char/escc.c @@ -416,7 +416,7 @@ static void escc_update_parameters(ChannelState *s) int speed, parity, data_bits, stop_bits; QEMUSerialSetParams ssp; -if (!qemu_chr_fe_get_driver(&s->chr) || s->type != ser) +if (!qemu_chr_fe_backend_connected(&s->chr) || s->type != ser) return; if (s->wregs[W_TXCTRL1] & TXCTRL1_PAREN) { @@ -556,7 +556,7 @@ static void escc_mem_write(void *opaque, hwaddr addr, trace_escc_mem_writeb_data(CHN_C(s), val); s->tx = val; if (s->wregs[W_TXCTRL2] & TXCTRL2_TXEN) { // tx enabled -if (qemu_chr_fe_get_driver(&s->chr)) { +if (qemu_chr_fe_backend_connected(&s->chr)) { /* XXX this blocks entire thread. Rewrite to use * qemu_chr_fe_write and background I/O callbacks */ qemu_chr_fe_write_all(&s->chr, &s->tx, 1); @@ -1012,7 +1012,7 @@ static void escc_realize(DeviceState *dev, Error **errp) ESCC_SIZE << s->it_shift); for (i = 0; i < 2; i++) { -if (qemu_chr_fe_get_driver(&s->chn[i].chr)) { +if (qemu_chr_fe_backend_connected(&s->chn[i].chr)) {
[Qemu-devel] [PATCH v2 4/9] hmp: add hmp analogue for qmp-chardev-change
Signed-off-by: Anton Nefedov Reviewed-by: Vladimir Sementsov-Ogievskiy CC: Dr. David Alan Gilbert --- chardev/char.c| 4 ++-- hmp-commands.hx | 16 hmp.c | 34 ++ hmp.h | 1 + include/sysemu/char.h | 12 5 files changed, 65 insertions(+), 2 deletions(-) diff --git a/chardev/char.c b/chardev/char.c index bac5e1c..0483f19 100644 --- a/chardev/char.c +++ b/chardev/char.c @@ -880,8 +880,8 @@ help_string_append(const char *name, void *opaque) g_string_append_printf(str, "\n%s", name); } -static ChardevBackend *qemu_chr_parse_opts(QemuOpts *opts, - Error **errp) +ChardevBackend *qemu_chr_parse_opts(QemuOpts *opts, +Error **errp) { Error *local_err = NULL; const ChardevClass *cc; diff --git a/hmp-commands.hx b/hmp-commands.hx index 0aca984..0f2a059 100644 --- a/hmp-commands.hx +++ b/hmp-commands.hx @@ -1727,6 +1727,22 @@ chardev_add accepts the same parameters as the -chardev command line switch. ETEXI { +.name = "chardev-change", +.args_type = "id:s,args:s", +.params = "id args", +.help = "change chardev", +.cmd= hmp_chardev_change, +}, + +STEXI +@item chardev-change args +@findex chardev-change +chardev_change accepts existing chardev @var{id} and then the same arguments +as the -chardev command line switch (except for "id"). + +ETEXI + +{ .name = "chardev-remove", .args_type = "id:s", .params = "id", diff --git a/hmp.c b/hmp.c index 3dceaf8..f7d0b38 100644 --- a/hmp.c +++ b/hmp.c @@ -2209,6 +2209,40 @@ void hmp_chardev_add(Monitor *mon, const QDict *qdict) hmp_handle_error(mon, &err); } +void hmp_chardev_change(Monitor *mon, const QDict *qdict) +{ +const char *args = qdict_get_str(qdict, "args"); +const char *id; +Error *err = NULL; +ChardevBackend *backend = NULL; +ChardevReturn *ret = NULL; +QemuOpts *opts = qemu_opts_parse_noisily(qemu_find_opts("chardev"), args, + true); +if (!opts) { +error_setg(&err, "Parsing chardev args failed"); +goto end; +} + +id = qdict_get_str(qdict, "id"); +if (qemu_opts_id(opts)) { +error_setg(&err, "Unexpected 'id' parameter"); +goto end; +} + +backend = qemu_chr_parse_opts(opts, &err); +if (!backend) { +goto end; +} + +ret = qmp_chardev_change(id, backend, &err); + +end: +qapi_free_ChardevReturn(ret); +qapi_free_ChardevBackend(backend); +qemu_opts_del(opts); +hmp_handle_error(mon, &err); +} + void hmp_chardev_remove(Monitor *mon, const QDict *qdict) { Error *local_err = NULL; diff --git a/hmp.h b/hmp.h index d8b94ce..23e035c 100644 --- a/hmp.h +++ b/hmp.h @@ -102,6 +102,7 @@ void hmp_nbd_server_start(Monitor *mon, const QDict *qdict); void hmp_nbd_server_add(Monitor *mon, const QDict *qdict); void hmp_nbd_server_stop(Monitor *mon, const QDict *qdict); void hmp_chardev_add(Monitor *mon, const QDict *qdict); +void hmp_chardev_change(Monitor *mon, const QDict *qdict); void hmp_chardev_remove(Monitor *mon, const QDict *qdict); void hmp_qemu_io(Monitor *mon, const QDict *qdict); void hmp_cpu_add(Monitor *mon, const QDict *qdict); diff --git a/include/sysemu/char.h b/include/sysemu/char.h index 68c7876..92ae57e 100644 --- a/include/sysemu/char.h +++ b/include/sysemu/char.h @@ -132,6 +132,18 @@ Chardev *qemu_chr_new_from_opts(QemuOpts *opts, void qemu_chr_parse_common(QemuOpts *opts, ChardevCommon *backend); /** + * @qemu_chr_parse_opts: + * + * Parse the options to the ChardevBackend struct. + * + * @opts + * + * Returns: a new backend + */ +ChardevBackend *qemu_chr_parse_opts(QemuOpts *opts, +Error **errp); + +/** * @qemu_chr_new: * * Create a new character backend from a URI. -- 2.7.4
[Qemu-devel] [PATCH v2 6/9] virtio-console: chardev hotswap support
Signed-off-by: Anton Nefedov Reviewed-by: Vladimir Sementsov-Ogievskiy CC: Amit Shah --- hw/char/virtio-console.c | 35 ++- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/hw/char/virtio-console.c b/hw/char/virtio-console.c index cf7331d..bd74669 100644 --- a/hw/char/virtio-console.c +++ b/hw/char/virtio-console.c @@ -49,7 +49,7 @@ static ssize_t flush_buf(VirtIOSerialPort *port, VirtConsole *vcon = VIRTIO_CONSOLE(port); ssize_t ret; -if (!qemu_chr_fe_get_driver(&vcon->chr)) { +if (!qemu_chr_fe_backend_connected(&vcon->chr)) { /* If there's no backend, we can just say we consumed all data. */ return len; } @@ -163,12 +163,35 @@ static void chr_event(void *opaque, int event) } } +static int chr_be_change(void *opaque) +{ +VirtConsole *vcon = opaque; +VirtIOSerialPort *port = VIRTIO_SERIAL_PORT(vcon); +VirtIOSerialPortClass *k = VIRTIO_SERIAL_PORT_GET_CLASS(port); + +if (k->is_console) { +qemu_chr_fe_set_handlers(&vcon->chr, chr_can_read, chr_read, + NULL, chr_be_change, vcon, NULL, true); +} else { +qemu_chr_fe_set_handlers(&vcon->chr, chr_can_read, chr_read, + chr_event, chr_be_change, vcon, NULL, false); +} + +if (vcon->watch) { +g_source_remove(vcon->watch); +vcon->watch = qemu_chr_fe_add_watch(&vcon->chr, +G_IO_OUT | G_IO_HUP, +chr_write_unblocked, vcon); +} + +return 0; +} + static void virtconsole_realize(DeviceState *dev, Error **errp) { VirtIOSerialPort *port = VIRTIO_SERIAL_PORT(dev); VirtConsole *vcon = VIRTIO_CONSOLE(dev); VirtIOSerialPortClass *k = VIRTIO_SERIAL_PORT_GET_CLASS(dev); -Chardev *chr = qemu_chr_fe_get_driver(&vcon->chr); if (port->id == 0 && !k->is_console) { error_setg(errp, "Port number 0 on virtio-serial devices reserved " @@ -176,7 +199,7 @@ static void virtconsole_realize(DeviceState *dev, Error **errp) return; } -if (chr) { +if (qemu_chr_fe_backend_connected(&vcon->chr)) { /* * For consoles we don't block guest data transfer just * because nothing is connected - we'll just let it go @@ -188,11 +211,13 @@ static void virtconsole_realize(DeviceState *dev, Error **errp) */ if (k->is_console) { qemu_chr_fe_set_handlers(&vcon->chr, chr_can_read, chr_read, - NULL, NULL, vcon, NULL, true); + NULL, chr_be_change, + vcon, NULL, true); virtio_serial_open(port); } else { qemu_chr_fe_set_handlers(&vcon->chr, chr_can_read, chr_read, - chr_event, NULL, vcon, NULL, false); + chr_event, chr_be_change, + vcon, NULL, false); } } } -- 2.7.4
[Qemu-devel] [PATCH v2 2/9] char: add backend hotswap handler
Frontends should have an interface to setup the handler of a backend change. The interface will be used in the next commits Signed-off-by: Anton Nefedov Reviewed-by: Vladimir Sementsov-Ogievskiy --- backends/rng-egd.c | 2 +- chardev/char-mux.c | 1 + chardev/char.c | 4 +++- gdbstub.c | 2 +- hw/arm/pxa2xx.c | 3 ++- hw/arm/strongarm.c | 2 +- hw/char/bcm2835_aux.c | 2 +- hw/char/cadence_uart.c | 2 +- hw/char/debugcon.c | 2 +- hw/char/digic-uart.c| 2 +- hw/char/escc.c | 2 +- hw/char/etraxfs_ser.c | 2 +- hw/char/exynos4210_uart.c | 2 +- hw/char/grlib_apbuart.c | 2 +- hw/char/imx_serial.c| 2 +- hw/char/ipoctal232.c| 2 +- hw/char/lm32_juart.c| 2 +- hw/char/lm32_uart.c | 2 +- hw/char/mcf_uart.c | 2 +- hw/char/milkymist-uart.c| 2 +- hw/char/pl011.c | 2 +- hw/char/sclpconsole-lm.c| 2 +- hw/char/sclpconsole.c | 2 +- hw/char/serial.c| 2 +- hw/char/sh_serial.c | 2 +- hw/char/spapr_vty.c | 2 +- hw/char/stm32f2xx_usart.c | 3 ++- hw/char/terminal3270.c | 2 +- hw/char/virtio-console.c| 4 ++-- hw/char/xen_console.c | 2 +- hw/char/xilinx_uartlite.c | 2 +- hw/ipmi/ipmi_bmc_extern.c | 2 +- hw/mips/boston.c| 2 +- hw/mips/mips_malta.c| 2 +- hw/misc/ivshmem.c | 2 +- hw/usb/ccid-card-passthru.c | 2 +- hw/usb/dev-serial.c | 2 +- hw/usb/redirect.c | 2 +- include/sysemu/char.h | 5 + monitor.c | 4 ++-- net/colo-compare.c | 14 -- net/filter-mirror.c | 6 +++--- net/slirp.c | 2 +- net/vhost-user.c| 7 --- qtest.c | 2 +- tests/test-char.c | 14 ++ tests/vhost-user-test.c | 2 +- 47 files changed, 78 insertions(+), 59 deletions(-) diff --git a/backends/rng-egd.c b/backends/rng-egd.c index 380b19a..0b0e945 100644 --- a/backends/rng-egd.c +++ b/backends/rng-egd.c @@ -106,7 +106,7 @@ static void rng_egd_opened(RngBackend *b, Error **errp) /* FIXME we should resubmit pending requests when the CDS reconnects. */ qemu_chr_fe_set_handlers(&s->chr, rng_egd_chr_can_read, - rng_egd_chr_read, NULL, s, NULL, true); + rng_egd_chr_read, NULL, NULL, s, NULL, true); } static void rng_egd_set_chardev(Object *obj, const char *value, Error **errp) diff --git a/chardev/char-mux.c b/chardev/char-mux.c index 37d42c6..5849ea5 100644 --- a/chardev/char-mux.c +++ b/chardev/char-mux.c @@ -278,6 +278,7 @@ void mux_chr_set_handlers(Chardev *chr, GMainContext *context) mux_chr_can_read, mux_chr_read, mux_chr_event, + NULL, chr, context, true); } diff --git a/chardev/char.c b/chardev/char.c index 684cccd..ae60950 100644 --- a/chardev/char.c +++ b/chardev/char.c @@ -522,7 +522,7 @@ void qemu_chr_fe_deinit(CharBackend *b) assert(b); if (b->chr) { -qemu_chr_fe_set_handlers(b, NULL, NULL, NULL, NULL, NULL, true); +qemu_chr_fe_set_handlers(b, NULL, NULL, NULL, NULL, NULL, NULL, true); if (b->chr->be == b) { b->chr->be = NULL; } @@ -538,6 +538,7 @@ void qemu_chr_fe_set_handlers(CharBackend *b, IOCanReadHandler *fd_can_read, IOReadHandler *fd_read, IOEventHandler *fd_event, + BackendChangeHandler *be_change, void *opaque, GMainContext *context, bool set_open) @@ -561,6 +562,7 @@ void qemu_chr_fe_set_handlers(CharBackend *b, b->chr_can_read = fd_can_read; b->chr_read = fd_read; b->chr_event = fd_event; +b->chr_be_change = be_change; b->opaque = opaque; if (cc->chr_update_read_handler) { cc->chr_update_read_handler(s, context); diff --git a/gdbstub.c b/gdbstub.c index 86eed4f..1ac0489 100644 --- a/gdbstub.c +++ b/gdbstub.c @@ -2013,7 +2013,7 @@ int gdbserver_start(const char *device) if (chr) { qemu_chr_fe_init(&s->chr, chr, &error_abort); qemu_chr_fe_set_handlers(&s->chr, gdb_chr_can_receive, gdb_chr_receive, - gdb_chr_event, NULL, NULL, true); + gdb_chr_event, NULL, NULL, NULL, true); } s->state = chr ? RS_IDLE : RS_INACTIVE; s->mon_chr = mon_chr; diff --git a/hw/arm/pxa2xx.c b/hw/arm/pxa2xx.c index eea551d..3e51882 100644 --- a/hw/a
[Qemu-devel] [PATCH v2 3/9] char: chardevice hotswap
This patch adds a possibility to change a char device without a frontend removal. 1. Ideally, it would have to happen transparently to a frontend, i.e. frontend would continue its regular operation. However, backends are not stateless and are set up by the frontends via qemu_chr_fe_<> functions, and it's not (generally) possible to replay that setup entirely in a backend code, as different chardevs respond to the setup calls differently, so do frontends work differently basing on those setup responses. Moreover, some frontend can generally get and save the backend pointer (qemu_chr_fe_get_driver()), and it will become invalid after backend change. So, a frontend which would like to support chardev hotswap has to register a "backend change" handler, and redo its backend setup there. 2. Write path can be used by multiple threads and thus protected with chr_write_lock. So hotswap also has to be protected so write functions won't access a backend being replaced. 3. Hotswap function can be called from e.g. a read handler of a monitor socket. This can cause troubles so it's safer to defer execution to a bottom-half (however, it means we cannot return some of the errors synchronously - but most of them we can) Signed-off-by: Anton Nefedov Reviewed-by: Vladimir Sementsov-Ogievskiy --- chardev/char.c| 147 ++ include/sysemu/char.h | 10 qapi-schema.json | 40 ++ 3 files changed, 187 insertions(+), 10 deletions(-) diff --git a/chardev/char.c b/chardev/char.c index ae60950..bac5e1c 100644 --- a/chardev/char.c +++ b/chardev/char.c @@ -132,12 +132,16 @@ static bool qemu_chr_replay(Chardev *chr) int qemu_chr_fe_write(CharBackend *be, const uint8_t *buf, int len) { -Chardev *s = be->chr; +Chardev *s; ChardevClass *cc; int ret; +qemu_mutex_lock(&be->chr_lock); +s = be->chr; + if (!s) { -return 0; +ret = 0; +goto end; } if (qemu_chr_replay(s) && replay_mode == REPLAY_MODE_PLAY) { @@ -145,7 +149,7 @@ int qemu_chr_fe_write(CharBackend *be, const uint8_t *buf, int len) replay_char_write_event_load(&ret, &offset); assert(offset <= len); qemu_chr_fe_write_buffer(s, buf, offset, &offset); -return ret; +goto end; } cc = CHARDEV_GET_CLASS(s); @@ -161,7 +165,9 @@ int qemu_chr_fe_write(CharBackend *be, const uint8_t *buf, int len) if (qemu_chr_replay(s) && replay_mode == REPLAY_MODE_RECORD) { replay_char_write_event_save(ret, ret < 0 ? 0 : ret); } - + +end: +qemu_mutex_unlock(&be->chr_lock); return ret; } @@ -191,13 +197,16 @@ int qemu_chr_write_all(Chardev *s, const uint8_t *buf, int len) int qemu_chr_fe_write_all(CharBackend *be, const uint8_t *buf, int len) { -Chardev *s = be->chr; +Chardev *s; +int ret; -if (!s) { -return 0; -} +qemu_mutex_lock(&be->chr_lock); -return qemu_chr_write_all(s, buf, len); +s = be->chr; +ret = s ? qemu_chr_write_all(s, buf, len) : 0; + +qemu_mutex_unlock(&be->chr_lock); +return ret; } int qemu_chr_fe_read_all(CharBackend *be, uint8_t *buf, int len) @@ -478,7 +487,7 @@ Chardev *qemu_chr_fe_get_driver(CharBackend *be) return be->chr; } -bool qemu_chr_fe_init(CharBackend *b, Chardev *s, Error **errp) +static bool fe_connect(CharBackend *b, Chardev *s, Error **errp) { int tag = 0; @@ -507,6 +516,17 @@ unavailable: return false; } +bool qemu_chr_fe_init(CharBackend *b, Chardev *s, Error **errp) +{ +if (!fe_connect(b, s, errp)) { +return false; +} + +qemu_mutex_init(&b->chr_lock); +b->hotswap_bh = NULL; +return true; +} + static bool qemu_chr_is_busy(Chardev *s) { if (CHARDEV_IS_MUX(s)) { @@ -531,6 +551,10 @@ void qemu_chr_fe_deinit(CharBackend *b) d->backends[b->tag] = NULL; } b->chr = NULL; +qemu_mutex_destroy(&b->chr_lock); +if (b->hotswap_bh) { +qemu_bh_delete(b->hotswap_bh); +} } } @@ -1308,6 +1332,109 @@ ChardevReturn *qmp_chardev_add(const char *id, ChardevBackend *backend, return ret; } +static void chardev_change_bh(void *opaque) +{ +Chardev *chr_new = opaque; +const char *id = chr_new->label; +Chardev *chr = qemu_chr_find(id); +CharBackend *be = chr->be; +bool closed_sent = false; + +if (!be) { +/* disconnected since we checked: ok, less work for us */ +goto end; +} + +if (chr->be_open && !chr_new->be_open) { +qemu_chr_be_event(chr, CHR_EVENT_CLOSED); +closed_sent = true; +} + +qemu_mutex_lock(&be->chr_lock); +chr->be = NULL; +fe_connect(be, chr_new, &error_abort); + +if (be->chr_be_chan
[Qemu-devel] [PATCH v2 1/9] char: move QemuOpts->ChardevBackend translation to a separate func
Signed-off-by: Anton Nefedov Reviewed-by: Vladimir Sementsov-Ogievskiy --- chardev/char.c | 72 +- 1 file changed, 46 insertions(+), 26 deletions(-) diff --git a/chardev/char.c b/chardev/char.c index 4e24dc3..684cccd 100644 --- a/chardev/char.c +++ b/chardev/char.c @@ -854,17 +854,14 @@ help_string_append(const char *name, void *opaque) g_string_append_printf(str, "\n%s", name); } -Chardev *qemu_chr_new_from_opts(QemuOpts *opts, -Error **errp) +static ChardevBackend *qemu_chr_parse_opts(QemuOpts *opts, + Error **errp) { Error *local_err = NULL; const ChardevClass *cc; -Chardev *chr; int i; ChardevBackend *backend = NULL; const char *name = qemu_opt_get(opts, "backend"); -const char *id = qemu_opts_id(opts); -char *bid = NULL; if (name == NULL) { error_setg(errp, "chardev: \"%s\" missing backend", @@ -872,21 +869,6 @@ Chardev *qemu_chr_new_from_opts(QemuOpts *opts, return NULL; } -if (is_help_option(name)) { -GString *str = g_string_new(""); - -chardev_name_foreach(help_string_append, str); - -error_report("Available chardev backend types: %s", str->str); -g_string_free(str, true); -exit(0); -} - -if (id == NULL) { -error_setg(errp, "chardev: no id specified"); -return NULL; -} - for (i = 0; i < ARRAY_SIZE(chardev_alias_table); i++) { if (g_strcmp0(chardev_alias_table[i].alias, name) == 0) { name = chardev_alias_table[i].typename; @@ -902,16 +884,12 @@ Chardev *qemu_chr_new_from_opts(QemuOpts *opts, backend = g_new0(ChardevBackend, 1); backend->type = CHARDEV_BACKEND_KIND_NULL; -if (qemu_opt_get_bool(opts, "mux", 0)) { -bid = g_strdup_printf("%s-base", id); -} - -chr = NULL; if (cc->parse) { cc->parse(opts, backend, &local_err); if (local_err) { error_propagate(errp, local_err); -goto out; +qapi_free_ChardevBackend(backend); +return NULL; } } else { ChardevCommon *ccom = g_new0(ChardevCommon, 1); @@ -919,6 +897,48 @@ Chardev *qemu_chr_new_from_opts(QemuOpts *opts, backend->u.null.data = ccom; /* Any ChardevCommon member would work */ } +return backend; +} + +Chardev *qemu_chr_new_from_opts(QemuOpts *opts, +Error **errp) +{ +const ChardevClass *cc; +Chardev *chr = NULL; +ChardevBackend *backend = NULL; +const char *name = qemu_opt_get(opts, "backend"); +const char *id = qemu_opts_id(opts); +char *bid = NULL; + +if (name && is_help_option(name)) { +GString *str = g_string_new(""); + +chardev_name_foreach(help_string_append, str); + +error_report("Available chardev backend types: %s", str->str); +g_string_free(str, true); +exit(0); +} + +if (id == NULL) { +error_setg(errp, "chardev: no id specified"); +return NULL; +} + +backend = qemu_chr_parse_opts(opts, errp); +if (backend == NULL) { +return NULL; +} + +cc = char_get_class(name, errp); +if (cc == NULL) { +goto out; +} + +if (qemu_opt_get_bool(opts, "mux", 0)) { +bid = g_strdup_printf("%s-base", id); +} + chr = qemu_chardev_new(bid ? bid : id, object_class_get_name(OBJECT_CLASS(cc)), backend, errp); -- 2.7.4
[Qemu-devel] [PATCH v2 8/9] serial: chardev hotswap support
for a backend change, a number of ioctls has to be replayed to sync the current setup of a frontend to a backend tty. This is hopefully enough so we don't have to track, store and replay the whole original control byte sequence. Signed-off-by: Anton Nefedov Reviewed-by: Vladimir Sementsov-Ogievskiy CC: Michael S. Tsirkin CC: Paolo Bonzini --- hw/char/serial.c | 32 ++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/hw/char/serial.c b/hw/char/serial.c index 1e6bdeb..ed01637 100644 --- a/hw/char/serial.c +++ b/hw/char/serial.c @@ -891,9 +891,37 @@ static void serial_reset(void *opaque) s->msr &= ~UART_MSR_ANY_DELTA; } +static int serial_be_change(void *opaque) +{ +SerialState *s = opaque; + +qemu_chr_fe_set_handlers(&s->chr, serial_can_receive1, serial_receive1, + serial_event, serial_be_change, s, NULL, true); + +serial_update_parameters(s); + +qemu_chr_fe_ioctl(&s->chr, CHR_IOCTL_SERIAL_SET_BREAK, + &s->last_break_enable); + +s->poll_msl = (s->ier & UART_IER_MSI) ? 1 : 0; +serial_update_msl(s); + +if (s->poll_msl >= 0 && !(s->mcr & UART_MCR_LOOP)) { +serial_update_tiocm(s); +} + +if (s->watch_tag > 0) { +g_source_remove(s->watch_tag); +s->watch_tag = qemu_chr_fe_add_watch(&s->chr, G_IO_OUT | G_IO_HUP, + serial_watch_cb, s); +} + +return 0; +} + void serial_realize_core(SerialState *s, Error **errp) { -if (!qemu_chr_fe_get_driver(&s->chr)) { +if (!qemu_chr_fe_backend_connected(&s->chr)) { error_setg(errp, "Can't create serial device, empty char device"); return; } @@ -904,7 +932,7 @@ void serial_realize_core(SerialState *s, Error **errp) qemu_register_reset(serial_reset, s); qemu_chr_fe_set_handlers(&s->chr, serial_can_receive1, serial_receive1, - serial_event, NULL, s, NULL, true); + serial_event, serial_be_change, s, NULL, true); fifo8_create(&s->recv_fifo, UART_FIFO_LENGTH); fifo8_create(&s->xmit_fifo, UART_FIFO_LENGTH); serial_reset(s); -- 2.7.4
[Qemu-devel] [PATCH v2 0/9] chardevice hotswap
This serie is a v2 of the February submit http://lists.nongnu.org/archive/html/qemu-devel/2017-02/msg01989.html The interface is changed as requested and the changes are slightly reworked and split into separate patches. The patchset adds support of the character device change without a frontend device removal. Yet isa-serial and virtio-serial frontends are supported. The feature can be helpful for e.g. Windows debug allowing to establish connection to a live VM from VM with WinDbg. Anton Nefedov (9): char: move QemuOpts->ChardevBackend translation to a separate func char: add backend hotswap handler char: chardevice hotswap hmp: add hmp analogue for qmp-chardev-change char: forbid direct chardevice access for hotswap devices virtio-console: chardev hotswap support serial: move TIOCM update to a separate function serial: chardev hotswap support char: avoid chardevice direct access backends/rng-egd.c | 2 +- chardev/char-mux.c | 1 + chardev/char.c | 235 +--- gdbstub.c | 4 +- hmp-commands.hx | 16 +++ hmp.c | 34 +++ hmp.h | 1 + hw/arm/pxa2xx.c | 3 +- hw/arm/strongarm.c | 4 +- hw/char/bcm2835_aux.c | 2 +- hw/char/cadence_uart.c | 4 +- hw/char/debugcon.c | 4 +- hw/char/digic-uart.c| 2 +- hw/char/escc.c | 8 +- hw/char/etraxfs_ser.c | 2 +- hw/char/exynos4210_uart.c | 4 +- hw/char/grlib_apbuart.c | 4 +- hw/char/imx_serial.c| 2 +- hw/char/ipoctal232.c| 4 +- hw/char/lm32_juart.c| 2 +- hw/char/lm32_uart.c | 2 +- hw/char/mcf_uart.c | 2 +- hw/char/milkymist-uart.c| 2 +- hw/char/parallel.c | 2 +- hw/char/pl011.c | 2 +- hw/char/sclpconsole-lm.c| 4 +- hw/char/sclpconsole.c | 4 +- hw/char/serial.c| 63 +--- hw/char/sh_serial.c | 4 +- hw/char/spapr_vty.c | 4 +- hw/char/stm32f2xx_usart.c | 3 +- hw/char/terminal3270.c | 4 +- hw/char/virtio-console.c| 35 ++- hw/char/xen_console.c | 4 +- hw/char/xilinx_uartlite.c | 2 +- hw/ipmi/ipmi_bmc_extern.c | 4 +- hw/mips/boston.c| 2 +- hw/mips/mips_malta.c| 2 +- hw/misc/ivshmem.c | 6 +- hw/usb/ccid-card-passthru.c | 6 +- hw/usb/dev-serial.c | 7 +- hw/usb/redirect.c | 7 +- include/sysemu/char.h | 44 + monitor.c | 4 +- net/colo-compare.c | 14 +-- net/filter-mirror.c | 8 +- net/slirp.c | 2 +- net/vhost-user.c| 7 +- qapi-schema.json| 40 qtest.c | 2 +- tests/test-char.c | 14 ++- tests/vhost-user-test.c | 2 +- 52 files changed, 506 insertions(+), 140 deletions(-) -- 2.7.4
[Qemu-devel] [PATCH v2 5/9] char: forbid direct chardevice access for hotswap devices
qemu_chr_fe_get_driver() is unsafe, frontends with hotswap support should not access CharDriver ptr directly as CharDriver might change. Signed-off-by: Anton Nefedov Reviewed-by: Vladimir Sementsov-Ogievskiy --- chardev/char.c| 7 +++ include/sysemu/char.h | 10 ++ 2 files changed, 17 insertions(+) diff --git a/chardev/char.c b/chardev/char.c index 0483f19..36d6f36 100644 --- a/chardev/char.c +++ b/chardev/char.c @@ -484,9 +484,16 @@ static Notifier muxes_realize_notify = { Chardev *qemu_chr_fe_get_driver(CharBackend *be) { +/* this is unsafe for the users that support chardev hotswap */ +assert(be->chr_be_change == NULL); return be->chr; } +bool qemu_chr_fe_backend_connected(CharBackend *be) +{ +return !!be->chr; +} + static bool fe_connect(CharBackend *b, Chardev *s, Error **errp) { int tag = 0; diff --git a/include/sysemu/char.h b/include/sysemu/char.h index 92ae57e..fa21535 100644 --- a/include/sysemu/char.h +++ b/include/sysemu/char.h @@ -404,10 +404,20 @@ bool qemu_chr_fe_init(CharBackend *b, Chardev *s, Error **errp); * * Returns the driver associated with a CharBackend or NULL if no * associated Chardev. + * Note: avoid this function as the driver should never be accessed directly, + * especially by the frontends that support chardevice hotswap. + * Consider qemu_chr_fe_backend_connected() to check for driver existence */ Chardev *qemu_chr_fe_get_driver(CharBackend *be); /** + * @qemu_chr_fe_backend_connected: + * + * Returns true if there is a chardevice associated with @be. + */ +bool qemu_chr_fe_backend_connected(CharBackend *be); + +/** * @qemu_chr_fe_deinit: * * Dissociate the CharBackend from the Chardev. -- 2.7.4
[Qemu-devel] [PATCH v1 10/13] qcow2-cluster: slightly refactor handle_dependencies()
- assert the alignment on return if the allocation has to stop (at the start of a running allocation) - make use of const specifiers for local variables Signed-off-by: Anton Nefedov Signed-off-by: Denis V. Lunev --- block/qcow2-cluster.c | 20 +++- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 4204db9..03d6f7e 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -900,15 +900,15 @@ out: * Check if there already is an AIO write request in flight which allocates * the same cluster. In this case we need to wait until the previous * request has completed and updated the L2 table accordingly. - * * Returns: * 0 if there was no dependency. *cur_bytes indicates the number of * bytes from guest_offset that can be read before the next - * dependency must be processed (or the request is complete) + * dependency must be processed (or the request is complete). + * *m is not modified * - * -EAGAIN if we had to wait for another request, previously gathered - * information on cluster allocation may be invalid now. The caller - * must start over anyway, so consider *cur_bytes undefined. + * -EAGAIN if we had to wait for another request. The caller + * must start over, so consider *cur_bytes undefined. + * *m is not modified */ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset, uint64_t *cur_bytes, QCowL2Meta **m) @@ -919,10 +919,10 @@ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset, QLIST_FOREACH(old_alloc, &s->cluster_allocs, next_in_flight) { -uint64_t start = guest_offset; -uint64_t end = start + bytes; -uint64_t old_start = l2meta_cow_start(old_alloc); -uint64_t old_end = l2meta_cow_end(old_alloc); +const uint64_t start = guest_offset; +const uint64_t end = start + bytes; +const uint64_t old_start = l2meta_cow_start(old_alloc); +const uint64_t old_end = l2meta_cow_end(old_alloc); if (end <= old_start || start >= old_end) { /* No intersection */ @@ -939,6 +939,8 @@ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset, * and deal with requests depending on them before starting to * gather new ones. Not worth the trouble. */ if (bytes == 0 && *m) { +/* start must be cluster aligned at this point */ +assert(start == start_of_cluster(s, start)); *cur_bytes = 0; return 0; } -- 2.7.4
[Qemu-devel] [PATCH v1 08/13] qcow2: handle_prealloc(): find out if area zeroed by earlier preallocation
Signed-off-by: Anton Nefedov Signed-off-by: Denis V. Lunev --- block/qcow2-cluster.c | 2 ++ block/qcow2.c | 8 +++- block/qcow2.h | 4 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index b2879b9..25210cd 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -1143,6 +1143,7 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset, uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m) { BDRVQcow2State *s = bs->opaque; +const uint64_t old_data_end = s->data_end; int l2_index; uint64_t *l2_table; uint64_t entry; @@ -1264,6 +1265,7 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset, .alloc_offset = alloc_cluster_offset, .offset = start_of_cluster(s, guest_offset), .nb_clusters= nb_clusters, +.clusters_are_trailing = alloc_cluster_offset >= old_data_end, .keep_old_clusters = keep_old_clusters, diff --git a/block/qcow2.c b/block/qcow2.c index 503f0dc..97a66a0 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -1684,7 +1684,13 @@ restart: if (end <= bdrv_getlength(file)) { /* No need to care, file size will not be changed */ -return false; + +/* We're safe to assume that the area is zeroes if the area + * was allocated at the end of data (s->data_end). + * In this case, the only way for file length to be bigger is that + * the area was preallocated by another request. + */ +return m->clusters_are_trailing; } meta = g_alloca(sizeof(*meta)); diff --git a/block/qcow2.h b/block/qcow2.h index e28c54a..2fd8510 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -333,6 +333,10 @@ typedef struct QCowL2Meta /** Do not free the old clusters */ bool keep_old_clusters; +/** True if the area is allocated after the end of data area + * (i.e. >= s->data_end), which means that it is zeroed */ +bool clusters_are_trailing; + /** * Requests that overlap with this allocation and wait to be restarted * when the allocating request has completed. -- 2.7.4
[Qemu-devel] [PATCH v1 06/13] qcow2: truncate preallocated space
From: "Denis V. Lunev" This could be done after calculation of the end of data and metadata in the qcow2 image. Signed-off-by: Denis V. Lunev Signed-off-by: Anton Nefedov --- block/qcow2-cluster.c | 9 + block/qcow2-refcount.c | 7 +++ block/qcow2.c | 8 block/qcow2.h | 3 +++ 4 files changed, 27 insertions(+) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index a4b6d40..b2879b9 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -1957,3 +1957,12 @@ fail: g_free(l1_table); return ret; } + +void qcow2_update_data_end(BlockDriverState *bs, uint64_t off) +{ +BDRVQcow2State *s = bs->opaque; + +if (s->data_end < off) { +s->data_end = off; +} +} diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 873a1d2..8156466 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -744,6 +744,9 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, ret = alloc_refcount_block(bs, cluster_index, &refcount_block); if (ret < 0) { goto fail; +} else { +qcow2_update_data_end(bs, s->refcount_table_offset + +s->refcount_table_size * sizeof(uint64_t)); } } old_table_index = table_index; @@ -865,6 +868,8 @@ retry: s->free_cluster_index - 1 > (INT64_MAX >> s->cluster_bits)) { return -EFBIG; +} else { +qcow2_update_data_end(bs, s->free_cluster_index << s->cluster_bits); } #ifdef DEBUG_ALLOC2 @@ -929,6 +934,8 @@ int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, if (ret < 0) { return ret; +} else { +qcow2_update_data_end(bs, offset + (nb_clusters << s->cluster_bits)); } return i; diff --git a/block/qcow2.c b/block/qcow2.c index 07c1706..7b4359b 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -1192,6 +1192,8 @@ static int qcow2_do_open(BlockDriverState *bs, QDict *options, int flags, } } +s->data_end = bdrv_getlength(bs->file->bs); + #ifdef DEBUG_ALLOC { BdrvCheckResult result = {0}; @@ -1948,12 +1950,18 @@ static int qcow2_inactivate(BlockDriverState *bs) static void qcow2_close(BlockDriverState *bs) { BDRVQcow2State *s = bs->opaque; + qemu_vfree(s->l1_table); /* else pre-write overlap checks in cache_destroy may crash */ s->l1_table = NULL; if (!(s->flags & BDRV_O_INACTIVE)) { qcow2_inactivate(bs); + +/* truncate preallocated space */ +if (!bs->read_only && s->data_end < bdrv_getlength(bs->file->bs)) { +bdrv_truncate(bs->file, s->data_end, NULL); +} } cache_clean_timer_del(bs); diff --git a/block/qcow2.h b/block/qcow2.h index a0d222d..e28c54a 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -297,6 +297,7 @@ typedef struct BDRVQcow2State { char *image_backing_format; uint64_t prealloc_size; +uint64_t data_end; } BDRVQcow2State; typedef struct Qcow2COWRegion { @@ -607,4 +608,6 @@ int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, void **table); void qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table); +void qcow2_update_data_end(BlockDriverState *bs, uint64_t off); + #endif -- 2.7.4
[Qemu-devel] [PATCH v1 12/13] qcow2: allow concurrent unaligned writes to the same clusters
If COW area of a write request to unallocated cluster is empty, concurrent write requests can be allowed with a little bit of extra synchronization; so they don't have to wait until L2 is filled. Let qcow2_cluster.c::handle_dependencies() do the most of the job: if there is an in-flight request to the same cluster, and the current request wants to write in its COW area, and its COW area is marked empty, - steal the allocated offset and write concurrently. Let the original request update L2 later when it likes. This gives an improvement for parallel misaligned writes to unallocated clusters with no backing data: HDD fio over xfs iodepth=4: seqwrite 4k: 18400 -> 22800 IOPS ( x1.24 ) seqwrite 68k: 1600 -> 2300 IOPS ( x1.44 ) Signed-off-by: Anton Nefedov Signed-off-by: Denis V. Lunev --- block/qcow2-cluster.c | 169 +++--- block/qcow2.c | 28 - block/qcow2.h | 12 +++- 3 files changed, 181 insertions(+), 28 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index c0974e8..7cffdd4 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -898,20 +898,32 @@ out: /* * Check if there already is an AIO write request in flight which allocates - * the same cluster. In this case we need to wait until the previous - * request has completed and updated the L2 table accordingly. + * the same cluster. + * In this case, check if that request has explicitly allowed to write + * in its COW area(s). + * If yes - fill the meta to point to the same cluster. + * If no - we need to wait until the previous request has completed and + *updated the L2 table accordingly or + *has allowed writing in its COW area(s). * Returns: * 0 if there was no dependency. *cur_bytes indicates the number of * bytes from guest_offset that can be read before the next * dependency must be processed (or the request is complete). - * *m is not modified + * *m, *host_offset are not modified + * + * 1 if there is a dependency but it is possible to write concurrently + * *m is filled accordingly, + * *cur_bytes may have decreased and describes + * the length of the area that can be written to, + * *host_offset contains the starting host image offset to write to * * -EAGAIN if we had to wait for another request. The caller - * must start over, so consider *cur_bytes undefined. + * must start over, so consider *cur_bytes and *host_offset undefined. * *m is not modified */ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset, -uint64_t *cur_bytes, QCowL2Meta **m) + uint64_t *host_offset, uint64_t *cur_bytes, + QCowL2Meta **m) { BDRVQcow2State *s = bs->opaque; QCowL2Meta *old_alloc; @@ -924,7 +936,7 @@ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset, const uint64_t old_start = l2meta_cow_start(old_alloc); const uint64_t old_end = l2meta_cow_end(old_alloc); -if (end <= old_start || start >= old_end) { +if (end <= old_start || start >= old_end || old_alloc->piggybacked) { /* No intersection */ continue; } @@ -936,21 +948,95 @@ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset, continue; } -/* Stop if already an l2meta exists. After yielding, it wouldn't - * be valid any more, so we'd have to clean up the old L2Metas - * and deal with requests depending on them before starting to - * gather new ones. Not worth the trouble. */ -if (*m) { +/* offsets of the cluster we're intersecting in */ +const uint64_t cluster_start = start_of_cluster(s, start); +const uint64_t cluster_end = cluster_start + s->cluster_size; + +const uint64_t old_data_start = old_start ++ old_alloc->cow_start.nb_bytes; +const uint64_t old_data_end = old_alloc->offset ++ old_alloc->cow_end.offset; + +const bool conflict_in_data_area = +end > old_data_start && start < old_data_end; +const bool conflict_in_old_cow_start = +/* 1). new write request area is before the old */ +start < old_data_start +&& /* 2). old request did not allow writing in its cow area */ +!old_alloc->cow_start.reduced; +const bool conflict_in_old_cow_end = +/* 1). new write request area is after the old */ +start > old_data_start +&& /* 2). old request did not allow writing in its cow area */ +!old_alloc->cow_end.reduced; + +if (conflict_in_data_a
[Qemu-devel] [PATCH v1 13/13] iotest 046: test simultaneous cluster write error case
Signed-off-by: Anton Nefedov --- tests/qemu-iotests/046 | 38 +- tests/qemu-iotests/046.out | 23 +++ 2 files changed, 60 insertions(+), 1 deletion(-) diff --git a/tests/qemu-iotests/046 b/tests/qemu-iotests/046 index f2ebecf..c210b55 100755 --- a/tests/qemu-iotests/046 +++ b/tests/qemu-iotests/046 @@ -29,7 +29,8 @@ status=1 # failure is the default! _cleanup() { - _cleanup_test_img +_cleanup_test_img +rm "$TEST_DIR/blkdebug.conf" } trap "_cleanup; exit \$status" 0 1 2 3 15 @@ -188,6 +189,37 @@ overlay_io | $QEMU_IO blkdebug::"$TEST_IMG" | _filter_qemu_io |\ sed -e 's/bytes at offset [0-9]*/bytes at offset XXX/g' echo +echo "== Concurrency error case ==" + +# 1. 1st request allocated the cluster, stop before it writes and updates L2 +# 2. 2nd request starts at the same cluster must complete write and start +#waiting for the 1st to update L2 +# 3. Resume the 1st request to make it fail (injected error) +# 4. 2nd request must wake and fail as well +#1 cluster will end up leaked +cat > "$TEST_DIR/blkdebug.conf" <
[Qemu-devel] [PATCH v1 02/13] qcow2: is_zero_sectors(): return true if area is outside of backing file
in such case, bdrv_get_block_status() shall return 0, *nr == 0 iotest 154 updated accordingly: write-zeroes tail alignment can be detected as zeroes now, so pwrite_zeroes succeeds Signed-off-by: Anton Nefedov --- block/qcow2.c | 6 -- tests/qemu-iotests/154.out | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index 2e6a0ec..b885dfc 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -2482,7 +2482,7 @@ static bool is_zero_sectors(BlockDriverState *bs, int64_t start, int64_t res; if (start + count > bs->total_sectors) { -count = bs->total_sectors - start; +count = start < bs->total_sectors ? bs->total_sectors - start : 0; } if (!count) { @@ -2490,7 +2490,9 @@ static bool is_zero_sectors(BlockDriverState *bs, int64_t start, } res = bdrv_get_block_status_above(bs, NULL, start, count, &nr, &file); -return res >= 0 && (res & BDRV_BLOCK_ZERO) && nr == count; +return res >= 0 +&& (((res & BDRV_BLOCK_ZERO) && nr == count) +|| nr == 0); } static coroutine_fn int qcow2_co_pwrite_zeroes(BlockDriverState *bs, diff --git a/tests/qemu-iotests/154.out b/tests/qemu-iotests/154.out index d8485ee..259340e 100644 --- a/tests/qemu-iotests/154.out +++ b/tests/qemu-iotests/154.out @@ -322,7 +322,7 @@ wrote 1024/1024 bytes at offset 134218240 1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) 2048/2048 bytes allocated at offset 128 MiB [{ "start": 0, "length": 134217728, "depth": 1, "zero": true, "data": false}, -{ "start": 134217728, "length": 2048, "depth": 0, "zero": false, "data": true, "offset": OFFSET}] +{ "start": 134217728, "length": 2048, "depth": 0, "zero": true, "data": false}] Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134219776 backing_file=TEST_DIR/t.IMGFMT.base wrote 2048/2048 bytes at offset 134217728 2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) @@ -348,7 +348,7 @@ wrote 1024/1024 bytes at offset 134218240 1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) 2048/2048 bytes allocated at offset 128 MiB [{ "start": 0, "length": 134217728, "depth": 1, "zero": true, "data": false}, -{ "start": 134217728, "length": 2048, "depth": 0, "zero": false, "data": true, "offset": OFFSET}] +{ "start": 134217728, "length": 2048, "depth": 0, "zero": true, "data": false}] Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134219776 backing_file=TEST_DIR/t.IMGFMT.base wrote 2048/2048 bytes at offset 134217728 2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -- 2.7.4
[Qemu-devel] [PATCH v1 11/13] qcow2-cluster: make handle_dependencies() logic easier to follow
Avoid complicated nested conditions; return or continue asap instead. The logic is not changed. Signed-off-by: Anton Nefedov Signed-off-by: Denis V. Lunev --- block/qcow2-cluster.c | 45 ++--- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 03d6f7e..c0974e8 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -926,32 +926,31 @@ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset, if (end <= old_start || start >= old_end) { /* No intersection */ -} else { -if (start < old_start) { -/* Stop at the start of a running allocation */ -bytes = old_start - start; -} else { -bytes = 0; -} +continue; +} -/* Stop if already an l2meta exists. After yielding, it wouldn't - * be valid any more, so we'd have to clean up the old L2Metas - * and deal with requests depending on them before starting to - * gather new ones. Not worth the trouble. */ -if (bytes == 0 && *m) { -/* start must be cluster aligned at this point */ -assert(start == start_of_cluster(s, start)); -*cur_bytes = 0; -return 0; -} +if (start < old_start) { +/* Stop at the start of a running allocation */ +bytes = old_start - start; +/* ..if there is no other conflict, keep checking */ +continue; +} -if (bytes == 0) { -/* Wait for the dependency to complete. We need to recheck - * the free/allocated clusters when we continue. */ -qemu_co_queue_wait(&old_alloc->dependent_requests, &s->lock); -return -EAGAIN; -} +/* Stop if already an l2meta exists. After yielding, it wouldn't + * be valid any more, so we'd have to clean up the old L2Metas + * and deal with requests depending on them before starting to + * gather new ones. Not worth the trouble. */ +if (*m) { +/* start must be cluster aligned at this point */ +assert(start == start_of_cluster(s, start)); +*cur_bytes = 0; +return 0; } + +/* Wait for the dependency to complete. We need to recheck + * the free/allocated clusters when we continue. */ +qemu_co_queue_wait(&old_alloc->dependent_requests, &s->lock); +return -EAGAIN; } /* Make sure that existing clusters and new allocations are only used up to -- 2.7.4
[Qemu-devel] [PATCH v1 05/13] qcow2: set inactive flag
Qcow2State and BlockDriverState flags have to be in sync Signed-off-by: Anton Nefedov --- block/qcow2.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/qcow2.c b/block/qcow2.c index 6e7ce96..07c1706 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -1939,6 +1939,7 @@ static int qcow2_inactivate(BlockDriverState *bs) if (result == 0) { qcow2_mark_clean(bs); +s->flags |= BDRV_O_INACTIVE; } return result; -- 2.7.4
[Qemu-devel] [PATCH v1 07/13] qcow2: check space leak at the end of the image
From: Pavel Butsykin Preallocating memory in the image may remain unused after the fall, for the qcow2_check adds the ability to identify and fix it, so as not to store extra memory on the host. Signed-off-by: Pavel Butsykin Signed-off-by: Denis V. Lunev Signed-off-by: Anton Nefedov --- block/qcow2.c | 31 +++ tests/qemu-iotests/026.out | 104 - tests/qemu-iotests/026.out.nocache | 104 - tests/qemu-iotests/029.out | 5 +- tests/qemu-iotests/060.out | 10 +++- tests/qemu-iotests/061.out | 5 +- tests/qemu-iotests/066.out | 5 +- tests/qemu-iotests/098.out | 7 ++- tests/qemu-iotests/108.out | 5 +- tests/qemu-iotests/112.out | 5 +- 10 files changed, 225 insertions(+), 56 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index 7b4359b..503f0dc 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -322,6 +322,32 @@ int qcow2_mark_consistent(BlockDriverState *bs) return 0; } +static int qcow2_check_extra_preallocation(BlockDriverState *bs, +BdrvCheckResult *res, BdrvCheckMode fix) +{ +BDRVQcow2State *s = bs->opaque; +uint64_t img_size = bdrv_getlength(bs->file->bs); + +if (res->image_end_offset < img_size) { +uint64_t count = +DIV_ROUND_UP(img_size - res->image_end_offset, s->cluster_size); +fprintf(stderr, "%s space leaked at the end of the image %jd\n", +fix & BDRV_FIX_LEAKS ? "Repairing" : "ERROR", +img_size - res->image_end_offset); +res->leaks += count; +if (fix & BDRV_FIX_LEAKS) { +int ret = bdrv_truncate(bs->file, res->image_end_offset, NULL); +if (ret < 0) { +res->check_errors++; +return ret; +} +res->leaks_fixed += count; +} +} + +return 0; +} + static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result, BdrvCheckMode fix) { @@ -330,6 +356,11 @@ static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result, return ret; } +ret = qcow2_check_extra_preallocation(bs, result, fix); +if (ret < 0) { +return ret; +} + if (fix && result->check_errors == 0 && result->corruptions == 0) { ret = qcow2_mark_clean(bs); if (ret < 0) { diff --git a/tests/qemu-iotests/026.out b/tests/qemu-iotests/026.out index 86a50a2..e8cf348 100644 --- a/tests/qemu-iotests/026.out +++ b/tests/qemu-iotests/026.out @@ -5,7 +5,10 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 Event: l1_update; errno: 5; imm: off; once: on; write write failed: Input/output error -No errors were found on the image. +ERROR space leaked at the end of the image 1024 + +1 leaked clusters were found on the image. +This means waste of disk space, but no harm to data. Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 Event: l1_update; errno: 5; imm: off; once: on; write -b @@ -33,7 +36,10 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 Event: l1_update; errno: 28; imm: off; once: on; write write failed: No space left on device -No errors were found on the image. +ERROR space leaked at the end of the image 1024 + +1 leaked clusters were found on the image. +This means waste of disk space, but no harm to data. Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 Event: l1_update; errno: 28; imm: off; once: on; write -b @@ -181,7 +187,10 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 Event: l2_alloc_write; errno: 5; imm: off; once: on; write write failed: Input/output error -No errors were found on the image. +ERROR space leaked at the end of the image 1024 + +1 leaked clusters were found on the image. +This means waste of disk space, but no harm to data. Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 Event: l2_alloc_write; errno: 5; imm: off; once: on; write -b @@ -207,7 +216,10 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 Event: l2_alloc_write; errno: 28; imm: off; once: on; write write failed: No space left on device -No errors were found on the image. +ERROR space leaked at the end of the image 1024 + +1 leaked clusters were found on the image. +This means waste of disk space, but no harm to data. Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 Event: l2_alloc_write; errno: 28; imm: off; once: on; write -b @@ -468,20 +480,27 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 Event: refblock_alloc_hookup; errno: 28; imm: off; once: on; write write failed: No space left on device -No errors were found on the image. +ERROR space leaked a
[Qemu-devel] [PATCH v1 01/13] qcow2: alloc space for COW in one chunk
From: "Denis V. Lunev" Currently each single write operation can result in 3 write operations if guest offsets are not cluster aligned. One write is performed for the real payload and two for COW-ed areas. Thus the data possibly lays non-contiguously on the host filesystem. This will reduce further sequential read performance significantly. The patch allocates the space in the file with cluster granularity, ensuring 1. better host offset locality 2. less space allocation operations (which can be expensive on distributed storages) Signed-off-by: Denis V. Lunev Signed-off-by: Anton Nefedov --- block/qcow2.c | 32 +++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/block/qcow2.c b/block/qcow2.c index a8d61f0..2e6a0ec 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -1575,6 +1575,32 @@ fail: return ret; } +static void handle_alloc_space(BlockDriverState *bs, QCowL2Meta *l2meta) +{ +BDRVQcow2State *s = bs->opaque; +BlockDriverState *file = bs->file->bs; +QCowL2Meta *m; +int ret; + +for (m = l2meta; m != NULL; m = m->next) { +uint64_t bytes = m->nb_clusters << s->cluster_bits; + +if (m->cow_start.nb_bytes == 0 && m->cow_end.nb_bytes == 0) { +continue; +} + +/* try to alloc host space in one chunk for better locality */ +ret = file->drv->bdrv_co_pwrite_zeroes(file, m->alloc_offset, bytes, 0); + +if (ret != 0) { +continue; +} + +file->total_sectors = MAX(file->total_sectors, + (m->alloc_offset + bytes) / BDRV_SECTOR_SIZE); +} +} + static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags) @@ -1656,8 +1682,12 @@ static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset, if (ret < 0) { goto fail; } - qemu_co_mutex_unlock(&s->lock); + +if (bs->file->bs->drv->bdrv_co_pwrite_zeroes != NULL) { +handle_alloc_space(bs, l2meta); +} + BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); trace_qcow2_writev_data(qemu_coroutine_self(), cluster_offset + offset_in_cluster); -- 2.7.4
[Qemu-devel] [PATCH v1 09/13] qcow2: fix misleading comment about L2 linking
Signed-off-by: Anton Nefedov Signed-off-by: Denis V. Lunev --- block/qcow2-cluster.c | 10 -- 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 25210cd..4204db9 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -827,12 +827,10 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m) assert(l2_index + m->nb_clusters <= s->l2_size); for (i = 0; i < m->nb_clusters; i++) { -/* if two concurrent writes happen to the same unallocated cluster - * each write allocates separate cluster and writes data concurrently. - * The first one to complete updates l2 table with pointer to its - * cluster the second one has to do RMW (which is done above by - * perform_cow()), update l2 table with its cluster pointer and free - * old cluster. This is what this loop does */ +/* handle_dependencies() protects from normal cluster allocation + * collision; still L2 entry might be !0 in case of zero or compressed + * cluster reusage or writing over the snapshot + */ if (l2_table[l2_index + i] != 0) { old_cluster[j++] = l2_table[l2_index + i]; } -- 2.7.4
[Qemu-devel] [PATCH v1 04/13] qcow2: preallocation at image expand
From: "Denis V. Lunev" This patch adds image preallocation at expand to provide better locality of QCOW2 image file and optimize this procedure for some distributed storages where this procedure is slow. Image expand requests have to be suspended until the allocation is performed which is done via special QCowL2Meta. This meta is invisible to handle_dependencies() code. This is the main reason for also calling preallocation before metadata write: it might intersect with preallocation triggered by another IO, and has to yield Signed-off-by: Denis V. Lunev Signed-off-by: Anton Nefedov --- block/qcow2-cache.c| 3 + block/qcow2-cluster.c | 5 ++ block/qcow2-refcount.c | 14 + block/qcow2.c | 151 + block/qcow2.h | 5 ++ 5 files changed, 178 insertions(+) diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c index 1d25147..aa9da5f 100644 --- a/block/qcow2-cache.c +++ b/block/qcow2-cache.c @@ -204,6 +204,9 @@ static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i) return ret; } +/* check and preallocate extra space if touching a fresh metadata cluster */ +qcow2_handle_prealloc(bs, c->entries[i].offset, s->cluster_size); + if (c == s->refcount_block_cache) { BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_UPDATE_PART); } else if (c == s->l2_table_cache) { diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index cf18dee..a4b6d40 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -108,6 +108,9 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, goto fail; } +qcow2_handle_prealloc(bs, new_l1_table_offset, + QEMU_ALIGN_UP(new_l1_size2, s->cluster_size)); + BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE); for(i = 0; i < s->l1_size; i++) new_l1_table[i] = cpu_to_be64(new_l1_table[i]); @@ -1820,6 +1823,8 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, goto fail; } +qcow2_handle_prealloc(bs, offset, s->cluster_size); + ret = bdrv_pwrite_zeroes(bs->file, offset, s->cluster_size, 0); if (ret < 0) { if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) { diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 7c06061..873a1d2 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -547,6 +547,8 @@ static int alloc_refcount_block(BlockDriverState *bs, } /* Write refcount blocks to disk */ +qcow2_handle_prealloc(bs, meta_offset, blocks_clusters * s->cluster_size); + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS); ret = bdrv_pwrite_sync(bs->file, meta_offset, new_blocks, blocks_clusters * s->cluster_size); @@ -561,6 +563,10 @@ static int alloc_refcount_block(BlockDriverState *bs, cpu_to_be64s(&new_table[i]); } +qcow2_handle_prealloc(bs, table_offset, + QEMU_ALIGN_UP(table_size * sizeof(uint64_t), +s->cluster_size)); + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE); ret = bdrv_pwrite_sync(bs->file, table_offset, new_table, table_size * sizeof(uint64_t)); @@ -2104,6 +2110,8 @@ write_refblocks: goto fail; } +qcow2_handle_prealloc(bs, refblock_offset, s->cluster_size); + /* The size of *refcount_table is always cluster-aligned, therefore the * write operation will not overflow */ on_disk_refblock = (void *)((char *) *refcount_table + @@ -2158,6 +2166,8 @@ write_refblocks: } assert(reftable_size < INT_MAX / sizeof(uint64_t)); +qcow2_handle_prealloc(bs, reftable_offset, + reftable_size * sizeof(uint64_t)); ret = bdrv_pwrite(bs->file, reftable_offset, on_disk_reftable, reftable_size * sizeof(uint64_t)); if (ret < 0) { @@ -2845,6 +2855,10 @@ int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order, cpu_to_be64s(&new_reftable[i]); } +qcow2_handle_prealloc(bs, new_reftable_offset, + QEMU_ALIGN_UP(new_reftable_size * sizeof(uint64_t), +s->cluster_size)); + ret = bdrv_pwrite(bs->file, new_reftable_offset, new_reftable, new_reftable_size * sizeof(uint64_t)); diff --git a/block/qcow2.c b/block/qcow2.c index b438f22..6e7ce96 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -464,6 +464,11 @@ static QemuOptsList qcow2_runtime_opts = { .type = QEMU_OPT_NUMBER, .help = "Clean unused cache entries after this time (in seconds)", }, +{ +.name = QCOW2_OPT_PRE
[Qemu-devel] [PATCH v1 03/13] qcow2: do not COW the empty areas
If COW area of the newly allocated cluster is zeroes, there is no reason to write zero sectors in perform_cow() again now as whole clusters are zeroed out in single chunks by handle_alloc_space(). Introduce QCowL2Meta field "reduced", since the existing fields (offset and nb_bytes) still has to keep other write requests from simultaneous writing in the area iotest 060: write to the discarded cluster does not trigger COW anymore. so, break on write_aio event instead, will work for the test (but write won't fail anymore, so update reference output) iotest 066: cluster-alignment areas that were not really COWed are now detected as zeroes, hence the initial write has to be exactly the same size for the maps to match performance tests: === qemu-io, results in seconds to complete (less is better) random write 4k to empty image, no backing HDD 64k cluster 128M over 128M image: 160 -> 160 ( x1 ) 128M over 2G image:86 -> 84 ( x1 ) 128M over 8G image:40 -> 29 ( x1.4 ) 1M cluster 32M over 8G image:58 -> 23 ( x2.5 ) SSD 64k cluster 2G over 2G image:71 -> 38 ( x1.9 ) 512M over 8G image:85 -> 8 ( x10.6 ) 1M cluster 128M over 32G image: 314 -> 2 ( x157 ) - improvement grows bigger the bigger the cluster size, - first data portions to the fresh image benefit the most (more chance to hit an unallocated cluster) - SSD improvement is close to the IO length reduction rate (e.g. writing only 4k instead of 64k) gives theoretical x16 and practical x10 improvement) fio tests over xfs, empty image (cluster 64k), no backing, first megabytes of random writes: randwrite 4k, size=8g: HDD (io_size=128m) : 730 -> 1050 IOPS ( x1.45) SSD (io_size=512m) : 1500 -> 7000 IOPS ( x4.7 ) random writes io_size==image_size: randwrite 4k, size=2g io_size=2g: HDD : 200 IOPS (no difference) SSD : 7500 -> 9500 IOPS ( x1.3 ) sequential write: seqwrite 4k, size=4g, iodepth=4 SSD : 7000 -> 18000 IOPS ( x2.6 ) - numbers are similar to qemu-io tests, slightly less improvement (damped by fs?) Signed-off-by: Anton Nefedov Signed-off-by: Denis V. Lunev --- block/qcow2-cluster.c | 4 +++- block/qcow2.c | 23 +++ block/qcow2.h | 4 tests/qemu-iotests/060 | 2 +- tests/qemu-iotests/060.out | 3 ++- tests/qemu-iotests/066 | 2 +- tests/qemu-iotests/066.out | 4 ++-- 7 files changed, 36 insertions(+), 6 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 347d94b..cf18dee 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -758,7 +758,7 @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m, Qcow2COWRegion *r) BDRVQcow2State *s = bs->opaque; int ret; -if (r->nb_bytes == 0) { +if (r->nb_bytes == 0 || r->reduced) { return 0; } @@ -1267,10 +1267,12 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset, .cow_start = { .offset = 0, .nb_bytes = offset_into_cluster(s, guest_offset), +.reduced= false, }, .cow_end = { .offset = nb_bytes, .nb_bytes = avail_bytes - nb_bytes, +.reduced= false, }, }; qemu_co_queue_init(&(*m)->dependent_requests); diff --git a/block/qcow2.c b/block/qcow2.c index b885dfc..b438f22 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -64,6 +64,9 @@ typedef struct { #define QCOW2_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA #define QCOW2_EXT_MAGIC_FEATURE_TABLE 0x6803f857 +static bool is_zero_sectors(BlockDriverState *bs, int64_t start, +uint32_t count); + static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename) { const QCowHeader *cow_header = (const void *)buf; @@ -1575,6 +1578,25 @@ fail: return ret; } +static void handle_cow_reduce(BlockDriverState *bs, QCowL2Meta *m) +{ +if (bs->encrypted) { +return; +} +if (!m->cow_start.reduced && m->cow_start.nb_bytes != 0 && +is_zero_sectors(bs, +(m->offset + m->cow_start.offset) >> BDRV_SECTOR_BITS, +m->cow_start.nb_bytes >> BDRV_SECTOR_BITS)) { +m->cow_start.reduced = true; +} +if (!m->cow_end.reduced && m->cow_end.nb_bytes != 0 && +is_zero_sectors(bs, +(m->offset + m->cow_end.offset) >> BDRV_SECTOR_BITS, +m->cow_end.nb_bytes >> BDRV_SECTOR_BITS)) { +m->cow_end.reduced = true; +} +} + static void handle_alloc_space(BlockDriverState *bs, QCowL2Met
[Qemu-devel] [PATCH v1 00/13] qcow2: space preallocation and COW improvements
This pull request is to address a few performance problems of qcow2 format: 1. non cluster-aligned write requests (to unallocated clusters) explicitly pad data with zeroes if there is no backing data. This can be avoided and the whole clusters are preallocated and zeroed in a single efficient write_zeroes() operation, also providing better host file continuity 2. moreover, efficient write_zeroes() operation can be used to preallocate space megabytes ahead which gives noticeable improvement on some storage types (e.g. distributed storages where space allocation operation is expensive) 3. preallocating/zeroing the clusters in advance makes possible to enable simultaneous writes to the same unallocated cluster, which is beneficial for parallel sequential write operations which are not cluster-aligned Performance test results are added to commit messages (see patch 3, 12) Anton Nefedov (9): qcow2: is_zero_sectors(): return true if area is outside of backing file qcow2: do not COW the empty areas qcow2: set inactive flag qcow2: handle_prealloc(): find out if area zeroed by earlier preallocation qcow2: fix misleading comment about L2 linking qcow2-cluster: slightly refactor handle_dependencies() qcow2-cluster: make handle_dependencies() logic easier to follow qcow2: allow concurrent unaligned writes to the same clusters iotest 046: test simultaneous cluster write error case Denis V. Lunev (3): qcow2: alloc space for COW in one chunk qcow2: preallocation at image expand qcow2: truncate preallocated space Pavel Butsykin (1): qcow2: check space leak at the end of the image block/qcow2-cache.c| 3 + block/qcow2-cluster.c | 216 +++- block/qcow2-refcount.c | 21 +++ block/qcow2.c | 286 - block/qcow2.h | 26 tests/qemu-iotests/026.out | 104 ++ tests/qemu-iotests/026.out.nocache | 104 ++ tests/qemu-iotests/029.out | 5 +- tests/qemu-iotests/046 | 38 - tests/qemu-iotests/046.out | 23 +++ tests/qemu-iotests/060 | 2 +- tests/qemu-iotests/060.out | 13 +- tests/qemu-iotests/061.out | 5 +- tests/qemu-iotests/066 | 2 +- tests/qemu-iotests/066.out | 9 +- tests/qemu-iotests/098.out | 7 +- tests/qemu-iotests/108.out | 5 +- tests/qemu-iotests/112.out | 5 +- tests/qemu-iotests/154.out | 4 +- 19 files changed, 769 insertions(+), 109 deletions(-) -- 2.7.4
[Qemu-devel] [PATCH v3 1/1] qemu-img: wait for convert coroutines to complete
On error path (like i/o error in one of the coroutines), it's required to - wait for coroutines completion before cleaning the common structures - reenter dependent coroutines so they ever finish Introduced in 2d9187bc65. Signed-off-by: Anton Nefedov --- qemu-img.c | 26 +++--- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/qemu-img.c b/qemu-img.c index bbe1574..8c50379 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -1761,13 +1761,13 @@ static void coroutine_fn convert_co_do_copy(void *opaque) qemu_co_mutex_lock(&s->lock); if (s->ret != -EINPROGRESS || s->sector_num >= s->total_sectors) { qemu_co_mutex_unlock(&s->lock); -goto out; +break; } n = convert_iteration_sectors(s, s->sector_num); if (n < 0) { qemu_co_mutex_unlock(&s->lock); s->ret = n; -goto out; +break; } /* save current sector and allocation status to local variables */ sector_num = s->sector_num; @@ -1792,7 +1792,6 @@ static void coroutine_fn convert_co_do_copy(void *opaque) error_report("error while reading sector %" PRId64 ": %s", sector_num, strerror(-ret)); s->ret = ret; -goto out; } } else if (!s->min_sparse && status == BLK_ZERO) { status = BLK_DATA; @@ -1801,22 +1800,20 @@ static void coroutine_fn convert_co_do_copy(void *opaque) if (s->wr_in_order) { /* keep writes in order */ -while (s->wr_offs != sector_num) { -if (s->ret != -EINPROGRESS) { -goto out; -} +while (s->wr_offs != sector_num && s->ret == -EINPROGRESS) { s->wait_sector_num[index] = sector_num; qemu_coroutine_yield(); } s->wait_sector_num[index] = -1; } -ret = convert_co_write(s, sector_num, n, buf, status); -if (ret < 0) { -error_report("error while writing sector %" PRId64 - ": %s", sector_num, strerror(-ret)); -s->ret = ret; -goto out; +if (s->ret == -EINPROGRESS) { +ret = convert_co_write(s, sector_num, n, buf, status); +if (ret < 0) { +error_report("error while writing sector %" PRId64 + ": %s", sector_num, strerror(-ret)); +s->ret = ret; +} } if (s->wr_in_order) { @@ -1837,7 +1834,6 @@ static void coroutine_fn convert_co_do_copy(void *opaque) } } -out: qemu_vfree(buf); s->co[index] = NULL; s->running_coroutines--; @@ -1899,7 +1895,7 @@ static int convert_do_copy(ImgConvertState *s) qemu_coroutine_enter(s->co[i]); } -while (s->ret == -EINPROGRESS) { +while (s->running_coroutines) { main_loop_wait(false); } -- 2.7.4
Re: [Qemu-devel] [Qemu-stable] [PATCH v2 1/1] qemu-img: wait for convert coroutines to complete
On 04/25/2017 12:55 PM, Peter Lieven wrote: Am 24.04.2017 um 22:13 schrieb Anton Nefedov: On 24/04/2017 21:16, Peter Lieven wrote: Am 24.04.2017 um 18:27 schrieb Anton Nefedov : On 04/21/2017 03:37 PM, Peter Lieven wrote: Am 21.04.2017 um 14:19 schrieb Anton Nefedov: On 04/21/2017 01:44 PM, Peter Lieven wrote: Am 21.04.2017 um 12:04 schrieb Anton Nefedov: On error path (like i/o error in one of the coroutines), it's required to - wait for coroutines completion before cleaning the common structures - reenter dependent coroutines so they ever finish Introduced in 2d9187bc65. Signed-off-by: Anton Nefedov --- [..] And what if we error out in the read path? Wouldn't be something like this easier? diff --git a/qemu-img.c b/qemu-img.c index 22f559a..4ff1085 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -1903,6 +1903,16 @@ static int convert_do_copy(ImgConvertState *s) main_loop_wait(false); } +/* on error path we need to enter all coroutines that are still + * running before cleaning up common structures */ +if (s->ret) { +for (i = 0; i < s->num_coroutines; i++) { + if (s->co[i]) { + qemu_coroutine_enter(s->co[i]); + } +} +} + if (s->compressed && !s->ret) { /* signal EOF to align */ ret = blk_pwrite_compressed(s->target, 0, NULL, 0); Peter seemed a bit too daring to me to re-enter every coroutine potentially including the ones that yielded waiting for I/O completion. If that's ok - that is for sure easier :) I think we should enter every coroutine that is still running and have it terminate correctly. It was my mistake that I have not done this in the original patch. Can you check if the above fixes your test cases that triggered the bug? hi, sorry I'm late with the answer this segfaults in bdrv_close(). Looks like it tries to finish some i/o which coroutine we have already entered and terminated? (gdb) run Starting program: /vz/anefedov/qemu-build/us/./qemu-img convert -O [..] Peter /Anton it seems that this is a bit tricky, can you share how your test case works? thanks, peter how I tested today was basically diff --git a/qemu-img.c b/qemu-img.c index 4425aaa..3d2d506 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -1788,6 +1788,10 @@ static void coroutine_fn convert_co_do_copy(void *opaque) if (status == BLK_DATA) { ret = convert_co_read(s, sector_num, n, buf); +const uint64_t fsector = 10*1024*1024/512; +if (sector_num <= fsector && fsector < sector_num+n) { +ret = -EIO; +} if (ret < 0) { error_report("error while reading sector %" PRId64 ": %s", sector_num, strerror(-ret)); I ended up with sth similar to your approach. Can you check this? Thanks, Peter diff --git a/qemu-img.c b/qemu-img.c index b94fc11..ed9ce9e 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -1761,13 +1761,13 @@ static void coroutine_fn convert_co_do_copy(void *opaque) qemu_co_mutex_lock(&s->lock); if (s->ret != -EINPROGRESS || s->sector_num >= s->total_sectors) { qemu_co_mutex_unlock(&s->lock); -goto out; +break; } n = convert_iteration_sectors(s, s->sector_num); if (n < 0) { qemu_co_mutex_unlock(&s->lock); s->ret = n; -goto out; +break; } /* save current sector and allocation status to local variables */ sector_num = s->sector_num; @@ -1792,7 +1792,6 @@ static void coroutine_fn convert_co_do_copy(void *opaque) error_report("error while reading sector %" PRId64 ": %s", sector_num, strerror(-ret)); s->ret = ret; -goto out; } } else if (!s->min_sparse && status == BLK_ZERO) { status = BLK_DATA; @@ -1801,22 +1800,20 @@ static void coroutine_fn convert_co_do_copy(void *opaque) if (s->wr_in_order) { /* keep writes in order */ -while (s->wr_offs != sector_num) { -if (s->ret != -EINPROGRESS) { -goto out; -} +while (s->wr_offs != sector_num && s->ret == -EINPROGRESS) { s->wait_sector_num[index] = sector_num; qemu_coroutine_yield(); } s->wait_sector_num[index] = -1; } -ret = convert_co_write(s, sector_num, n, buf, status); -if (ret < 0) { -error_report("error while writing sector %" PRId64 - ": %s", sector_num, strerror(-ret)); -s->ret = ret; -
Re: [Qemu-devel] [Qemu-stable] [PATCH v2 1/1] qemu-img: wait for convert coroutines to complete
On 24/04/2017 21:16, Peter Lieven wrote: Am 24.04.2017 um 18:27 schrieb Anton Nefedov : On 04/21/2017 03:37 PM, Peter Lieven wrote: Am 21.04.2017 um 14:19 schrieb Anton Nefedov: On 04/21/2017 01:44 PM, Peter Lieven wrote: Am 21.04.2017 um 12:04 schrieb Anton Nefedov: On error path (like i/o error in one of the coroutines), it's required to - wait for coroutines completion before cleaning the common structures - reenter dependent coroutines so they ever finish Introduced in 2d9187bc65. Signed-off-by: Anton Nefedov --- [..] And what if we error out in the read path? Wouldn't be something like this easier? diff --git a/qemu-img.c b/qemu-img.c index 22f559a..4ff1085 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -1903,6 +1903,16 @@ static int convert_do_copy(ImgConvertState *s) main_loop_wait(false); } +/* on error path we need to enter all coroutines that are still + * running before cleaning up common structures */ +if (s->ret) { +for (i = 0; i < s->num_coroutines; i++) { + if (s->co[i]) { + qemu_coroutine_enter(s->co[i]); + } +} +} + if (s->compressed && !s->ret) { /* signal EOF to align */ ret = blk_pwrite_compressed(s->target, 0, NULL, 0); Peter seemed a bit too daring to me to re-enter every coroutine potentially including the ones that yielded waiting for I/O completion. If that's ok - that is for sure easier :) I think we should enter every coroutine that is still running and have it terminate correctly. It was my mistake that I have not done this in the original patch. Can you check if the above fixes your test cases that triggered the bug? hi, sorry I'm late with the answer this segfaults in bdrv_close(). Looks like it tries to finish some i/o which coroutine we have already entered and terminated? (gdb) run Starting program: /vz/anefedov/qemu-build/us/./qemu-img convert -O qcow2 ./harddisk.hdd.c ./harddisk.hdd [Thread debugging using libthread_db enabled] Using host libthread_db library "/lib64/libthread_db.so.1". [New Thread 0x7fffeac2d700 (LWP 436020)] [New Thread 0x7fffe4ed6700 (LWP 436021)] qemu-img: error while reading sector 20480: Input/output error qemu-img: error while writing sector 19712: Operation now in progress Program received signal SIGSEGV, Segmentation fault. aio_co_wake (co=0x0) at /mnt/code/us-qemu/util/async.c:454 454 ctx = atomic_read(&co->ctx); (gdb) bt #0 aio_co_wake (co=0x0) at /mnt/code/us-qemu/util/async.c:454 /* [Anton]: thread_pool_co_cb () here */ #1 0x55634629 in thread_pool_completion_bh (opaque=0x55cfe020) at /mnt/code/us-qemu/util/thread-pool.c:189 #2 0x55633b31 in aio_bh_call (bh=0x55cfe0f0) at /mnt/code/us-qemu/util/async.c:90 #3 aio_bh_poll (ctx=ctx@entry=0x55cee6d0) at /mnt/code/us-qemu/util/async.c:118 #4 0x55636f14 in aio_poll (ctx=ctx@entry=0x55cee6d0, blocking=) at /mnt/code/us-qemu/util/aio-posix.c:682 #5 0x555c52d4 in bdrv_drain_recurse (bs=bs@entry=0x55d22560) at /mnt/code/us-qemu/block/io.c:164 #6 0x555c5aed in bdrv_drained_begin (bs=bs@entry=0x55d22560) at /mnt/code/us-qemu/block/io.c:248 #7 0x55581443 in bdrv_close (bs=0x55d22560) at /mnt/code/us-qemu/block.c:2909 #8 bdrv_delete (bs=0x55d22560) at /mnt/code/us-qemu/block.c:3100 #9 bdrv_unref (bs=0x55d22560) at /mnt/code/us-qemu/block.c:4087 #10 0x555baf44 in blk_remove_bs (blk=blk@entry=0x55d22380) at /mnt/code/us-qemu/block/block-backend.c:552 #11 0x555bb173 in blk_delete (blk=0x55d22380) at /mnt/code/us-qemu/block/block-backend.c:238 #12 blk_unref (blk=blk@entry=0x55d22380) at /mnt/code/us-qemu/block/block-backend.c:282 #13 0x5557a22c in img_convert (argc=, argv=) at /mnt/code/us-qemu/qemu-img.c:2359 #14 0x55574189 in main (argc=5, argv=0x7fffe4a0) at /mnt/code/us-qemu/qemu-img.c:4464 Peter /Anton it seems that this is a bit tricky, can you share how your test case works? thanks, peter how I tested today was basically diff --git a/qemu-img.c b/qemu-img.c index 4425aaa..3d2d506 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -1788,6 +1788,10 @@ static void coroutine_fn convert_co_do_copy(void *opaque) if (status == BLK_DATA) { ret = convert_co_read(s, sector_num, n, buf); +const uint64_t fsector = 10*1024*1024/512; +if (sector_num <= fsector && fsector < sector_num+n) { +ret = -EIO; +} if (ret < 0) { error_report("error while reading sector %" PRId64 ": %s", sector_num, strerror(-ret));
Re: [Qemu-devel] [PATCH v2 1/1] qemu-img: wait for convert coroutines to complete
On 04/21/2017 03:37 PM, Peter Lieven wrote: Am 21.04.2017 um 14:19 schrieb Anton Nefedov: On 04/21/2017 01:44 PM, Peter Lieven wrote: Am 21.04.2017 um 12:04 schrieb Anton Nefedov: On error path (like i/o error in one of the coroutines), it's required to - wait for coroutines completion before cleaning the common structures - reenter dependent coroutines so they ever finish Introduced in 2d9187bc65. Signed-off-by: Anton Nefedov --- [..] And what if we error out in the read path? Wouldn't be something like this easier? diff --git a/qemu-img.c b/qemu-img.c index 22f559a..4ff1085 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -1903,6 +1903,16 @@ static int convert_do_copy(ImgConvertState *s) main_loop_wait(false); } +/* on error path we need to enter all coroutines that are still + * running before cleaning up common structures */ +if (s->ret) { +for (i = 0; i < s->num_coroutines; i++) { + if (s->co[i]) { + qemu_coroutine_enter(s->co[i]); + } +} +} + if (s->compressed && !s->ret) { /* signal EOF to align */ ret = blk_pwrite_compressed(s->target, 0, NULL, 0); Peter seemed a bit too daring to me to re-enter every coroutine potentially including the ones that yielded waiting for I/O completion. If that's ok - that is for sure easier :) I think we should enter every coroutine that is still running and have it terminate correctly. It was my mistake that I have not done this in the original patch. Can you check if the above fixes your test cases that triggered the bug? hi, sorry I'm late with the answer this segfaults in bdrv_close(). Looks like it tries to finish some i/o which coroutine we have already entered and terminated? (gdb) run Starting program: /vz/anefedov/qemu-build/us/./qemu-img convert -O qcow2 ./harddisk.hdd.c ./harddisk.hdd [Thread debugging using libthread_db enabled] Using host libthread_db library "/lib64/libthread_db.so.1". [New Thread 0x7fffeac2d700 (LWP 436020)] [New Thread 0x7fffe4ed6700 (LWP 436021)] qemu-img: error while reading sector 20480: Input/output error qemu-img: error while writing sector 19712: Operation now in progress Program received signal SIGSEGV, Segmentation fault. aio_co_wake (co=0x0) at /mnt/code/us-qemu/util/async.c:454 454 ctx = atomic_read(&co->ctx); (gdb) bt #0 aio_co_wake (co=0x0) at /mnt/code/us-qemu/util/async.c:454 /* [Anton]: thread_pool_co_cb () here */ #1 0x55634629 in thread_pool_completion_bh (opaque=0x55cfe020) at /mnt/code/us-qemu/util/thread-pool.c:189 #2 0x55633b31 in aio_bh_call (bh=0x55cfe0f0) at /mnt/code/us-qemu/util/async.c:90 #3 aio_bh_poll (ctx=ctx@entry=0x55cee6d0) at /mnt/code/us-qemu/util/async.c:118 #4 0x55636f14 in aio_poll (ctx=ctx@entry=0x55cee6d0, blocking=) at /mnt/code/us-qemu/util/aio-posix.c:682 #5 0x555c52d4 in bdrv_drain_recurse (bs=bs@entry=0x55d22560) at /mnt/code/us-qemu/block/io.c:164 #6 0x555c5aed in bdrv_drained_begin (bs=bs@entry=0x55d22560) at /mnt/code/us-qemu/block/io.c:248 #7 0x55581443 in bdrv_close (bs=0x55d22560) at /mnt/code/us-qemu/block.c:2909 #8 bdrv_delete (bs=0x55d22560) at /mnt/code/us-qemu/block.c:3100 #9 bdrv_unref (bs=0x55d22560) at /mnt/code/us-qemu/block.c:4087 #10 0x555baf44 in blk_remove_bs (blk=blk@entry=0x55d22380) at /mnt/code/us-qemu/block/block-backend.c:552 #11 0x555bb173 in blk_delete (blk=0x55d22380) at /mnt/code/us-qemu/block/block-backend.c:238 #12 blk_unref (blk=blk@entry=0x55d22380) at /mnt/code/us-qemu/block/block-backend.c:282 #13 0x5557a22c in img_convert (argc=, argv=) at /mnt/code/us-qemu/qemu-img.c:2359 #14 0x55574189 in main (argc=5, argv=0x7fffe4a0) at /mnt/code/us-qemu/qemu-img.c:4464 Peter /Anton
[Qemu-devel] [PATCH v2 1/1] qemu-img: wait for convert coroutines to complete
On error path (like i/o error in one of the coroutines), it's required to - wait for coroutines completion before cleaning the common structures - reenter dependent coroutines so they ever finish Introduced in 2d9187bc65. Signed-off-by: Anton Nefedov --- qemu-img.c | 44 +++- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/qemu-img.c b/qemu-img.c index b220cf7..24950c1 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -1735,6 +1735,27 @@ static int coroutine_fn convert_co_write(ImgConvertState *s, int64_t sector_num, return 0; } +static void coroutine_fn convert_reenter_waiting(ImgConvertState *s, + uint64_t wr_offs) +{ +int i; +if (s->wr_in_order) { +/* reenter the coroutine that might have waited + * for the write to complete */ +for (i = 0; i < s->num_coroutines; i++) { +if (s->co[i] && s->wait_sector_num[i] == wr_offs) { +/* + * A -> B -> A cannot occur because A has + * s->wait_sector_num[i] == -1 during A -> B. Therefore + * B will never enter A during this time window. + */ +qemu_coroutine_enter(s->co[i]); +break; +} +} +} +} + static void coroutine_fn convert_co_do_copy(void *opaque) { ImgConvertState *s = opaque; @@ -1792,6 +1813,7 @@ static void coroutine_fn convert_co_do_copy(void *opaque) error_report("error while reading sector %" PRId64 ": %s", sector_num, strerror(-ret)); s->ret = ret; +convert_reenter_waiting(s, sector_num + n); goto out; } } else if (!s->min_sparse && status == BLK_ZERO) { @@ -1803,6 +1825,7 @@ static void coroutine_fn convert_co_do_copy(void *opaque) /* keep writes in order */ while (s->wr_offs != sector_num) { if (s->ret != -EINPROGRESS) { +convert_reenter_waiting(s, sector_num + n); goto out; } s->wait_sector_num[index] = sector_num; @@ -1816,25 +1839,12 @@ static void coroutine_fn convert_co_do_copy(void *opaque) error_report("error while writing sector %" PRId64 ": %s", sector_num, strerror(-ret)); s->ret = ret; +convert_reenter_waiting(s, sector_num + n); goto out; } -if (s->wr_in_order) { -/* reenter the coroutine that might have waited - * for this write to complete */ -s->wr_offs = sector_num + n; -for (i = 0; i < s->num_coroutines; i++) { -if (s->co[i] && s->wait_sector_num[i] == s->wr_offs) { -/* - * A -> B -> A cannot occur because A has - * s->wait_sector_num[i] == -1 during A -> B. Therefore - * B will never enter A during this time window. - */ -qemu_coroutine_enter(s->co[i]); -break; -} -} -} +s->wr_offs = sector_num + n; +convert_reenter_waiting(s, s->wr_offs); } out: @@ -1899,7 +1909,7 @@ static int convert_do_copy(ImgConvertState *s) qemu_coroutine_enter(s->co[i]); } -while (s->ret == -EINPROGRESS) { +while (s->running_coroutines) { main_loop_wait(false); } -- 2.7.4
Re: [Qemu-devel] [PATCH v2 1/1] qemu-img: wait for convert coroutines to complete
On 04/21/2017 01:44 PM, Peter Lieven wrote: Am 21.04.2017 um 12:04 schrieb Anton Nefedov: On error path (like i/o error in one of the coroutines), it's required to - wait for coroutines completion before cleaning the common structures - reenter dependent coroutines so they ever finish Introduced in 2d9187bc65. Signed-off-by: Anton Nefedov --- [..] And what if we error out in the read path? Wouldn't be something like this easier? diff --git a/qemu-img.c b/qemu-img.c index 22f559a..4ff1085 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -1903,6 +1903,16 @@ static int convert_do_copy(ImgConvertState *s) main_loop_wait(false); } +/* on error path we need to enter all coroutines that are still + * running before cleaning up common structures */ +if (s->ret) { +for (i = 0; i < s->num_coroutines; i++) { + if (s->co[i]) { + qemu_coroutine_enter(s->co[i]); + } +} +} + if (s->compressed && !s->ret) { /* signal EOF to align */ ret = blk_pwrite_compressed(s->target, 0, NULL, 0); Peter seemed a bit too daring to me to re-enter every coroutine potentially including the ones that yielded waiting for I/O completion. If that's ok - that is for sure easier :) Or maybe some intermediate variant, as you suggest but only enter the "wait_sector" coroutines (and main_loop_wait for the rest)? Or do not even re-enter them, like just -while (s->ret == -EINPROGRESS) { +while (s->running_coroutines != s->waiting_coroutines) { main_loop_wait(false); } /Anton
Re: [Qemu-devel] [PATCH 1/1] qemu-img: wait for convert coroutines to complete
On 04/21/2017 12:18 PM, Peter Lieven wrote: Am 18.04.2017 um 12:27 schrieb Denis V. Lunev: From: Anton Nefedov We should wait for other coroutines on error path, i.e. one of coroutines terminates with i/o error, before cleaning the common structures. In the other case we would crash in a lot of different places. This behaviour was introduced by commit 2d9187bc65. Signed-off-by: Anton Nefedov Signed-off-by: Denis V. Lunev CC: Peter Lieven CC: Kevin Wolf CC: Max Reitz Cc: qemu-sta...@nongnu.org This should go into 2.9.1 Peter Actually I'm afraid the patch is incorrect.. The erroneous coroutine bails out without reentering its possible dependent coroutine so it (and the following dependents) will never finish. I'll send out the 2nd version soon /Anton
Re: [Qemu-devel] [PATCH] io: fully parse & validate HTTP headers for websocket protocol handshake
On 02/28/2017 03:07 PM, Daniel P. Berrange wrote: The current websockets protocol handshake code is very relaxed, just doing crude string searching across the HTTP header data. This causes it to both reject valid connections and fail to reject invalid connections. For example, according to the RFC 6455 it: - MUST reject any method other than "GET" - MUST reject any HTTP version less than "HTTP/1.1" - MUST reject Connection header without "Upgrade" listed - MUST reject Upgrade header which is not 'websocket' - MUST reject missing Host header - MUST treat HTTP header names as case insensitive To do all this validation correctly requires that we fully parse the HTTP headers, populating a data structure containing the header fields. After this change, we also reject any path other than '/' Signed-off-by: Daniel P. Berrange --- thanks, this works with the client that used to fail (tornado python lib http://www.tornadoweb.org/en/stable/) /Anton
Re: [Qemu-devel] [PATCH 3/3] vl: pass CPUState to qemu_system_guest_panicked
hi, On 02/21/2017 01:42 PM, Paolo Bonzini wrote: qemu_system_guest_panicked was already using current_cpu implicitly, so it makes sense for it to receive a CPUState. This lets the function call cpu_get_crash_info and free the result. my motivation against this was that as qemu_system_guest_panicked() is also used by pvpanic device, if it's ever going to pass any information, GuestPanicInformation struct could be reused. Though I guess it doesnt matter much now /Anton
[Qemu-devel] [PATCH 0/1] qmp-events: fix GUEST_PANICKED description formatting
This patch is a follow-up for: http://lists.nongnu.org/archive/html/qemu-devel/2017-02/msg03647.html
[Qemu-devel] [PATCH] qmp-events: fix GUEST_PANICKED description formatting
also remove a useless NULL check in the event reporting code Signed-off-by: Anton Nefedov --- qapi/event.json | 4 ++-- vl.c| 22 ++ 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/qapi/event.json b/qapi/event.json index 970ff02..e02852c 100644 --- a/qapi/event.json +++ b/qapi/event.json @@ -488,9 +488,9 @@ # # @action: action that has been taken, currently always "pause" # -# @info: optional information about a panic +# @info: #optional information about a panic (since 2.9) # -# Since: 1.5 (@info since 2.9) +# Since: 1.5 # # Example: # diff --git a/vl.c b/vl.c index 903c46d..f410e03 100644 --- a/vl.c +++ b/vl.c @@ -1710,6 +1710,15 @@ void qemu_system_reset(bool report) void qemu_system_guest_panicked(GuestPanicInformation *info) { qemu_log_mask(LOG_GUEST_ERROR, "Guest crashed\n"); +if (info && info->type == GUEST_PANIC_INFORMATION_KIND_HYPER_V) { +qemu_log_mask(LOG_GUEST_ERROR, "HV crash parameters: (%#"PRIx64 + " %#"PRIx64" %#"PRIx64" %#"PRIx64" %#"PRIx64")\n", + info->u.hyper_v.data->arg1, + info->u.hyper_v.data->arg2, + info->u.hyper_v.data->arg3, + info->u.hyper_v.data->arg4, + info->u.hyper_v.data->arg5); +} if (current_cpu) { current_cpu->crash_occurred = true; @@ -1723,18 +1732,7 @@ void qemu_system_guest_panicked(GuestPanicInformation *info) qemu_system_shutdown_request(); } -if (info) { -if (info->type == GUEST_PANIC_INFORMATION_KIND_HYPER_V) { -qemu_log_mask(LOG_GUEST_ERROR, "HV crash parameters: (%#"PRIx64 - " %#"PRIx64" %#"PRIx64" %#"PRIx64" %#"PRIx64")\n", - info->u.hyper_v.data->arg1, - info->u.hyper_v.data->arg2, - info->u.hyper_v.data->arg3, - info->u.hyper_v.data->arg4, - info->u.hyper_v.data->arg5); -} -qapi_free_GuestPanicInformation(info); -} +qapi_free_GuestPanicInformation(info); } void qemu_system_reset_request(void) -- 2.7.4
[Qemu-devel] [PATCH] qmp-events: fix GUEST_PANICKED description formatting
Signed-off-by: Anton Nefedov --- qapi/event.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qapi/event.json b/qapi/event.json index 970ff02..e02852c 100644 --- a/qapi/event.json +++ b/qapi/event.json @@ -488,9 +488,9 @@ # # @action: action that has been taken, currently always "pause" # -# @info: optional information about a panic +# @info: #optional information about a panic (since 2.9) # -# Since: 1.5 (@info since 2.9) +# Since: 1.5 # # Example: # -- 2.7.4
Re: [Qemu-devel] [PATCH 1/1] mirror: restore offset after zeroing out the image
On 01/30/2017 08:33 PM, Denis V. Lunev wrote: On 01/30/2017 08:16 PM, Eric Blake wrote: On 01/30/2017 04:22 AM, Denis V. Lunev wrote: If explicit zeroing out before mirroring is required for the target image, it moves the block job offset counter to EOF, then offset and len counters count the image size twice. There is no harm but confusing stats (e.g. for 1G image the completion counter starts from 1G and increases to 2G) The patch fixed that problem by resetting the offset counter. Counters are explicitly documented NOT tied to disk length; they are merely estimates of proportional completion. I'm not sure if this makes the numbers jump backwards from the observer's viewpoint, but if you can ever spot 1g/1g right before rewinding to 0g/1g (where pre-patch could see 1g/2g), then that is a reason to not take this patch. FWIW you'll actually see 1g/0g first (if you happen to catch that tiny phase :) On the other hand, your argument that the pre-patch behavior progressing towards 2g has a very fast progression from 0-1g/2g, and then a much slower 1g-2g/2g, which makes the estimate of percent completion skewed, while a newer progression of 0-1g/1g is more realistic, may have some merit. I'm not sold on this patch yet, but stronger arguments in the commit message may sway me. +++ b/tests/qemu-iotests/094.out @@ -3,9 +3,9 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 Formatting 'TEST_DIR/source.IMGFMT', fmt=IMGFMT size=67108864 {"return": {}} {"return": {}} -{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "src", "len": 67108864, "offset": 67108864, "speed": 0, "type": "mirror"}} +{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "src", "len": 0, "offset": 0, "speed": 0, "type": "mirror"}} {"return": {}} This part of the change is scary - a ready event showing 0/0 HAS been known to confuse libvirt in the past. Qemu should NEVER advertise a ready event with 0/0, it should at least be 1/1 (because of the number of clients that have workarounds to deal with older qemu behavior on 0/0 and which might misbehave if we ever issue that again). I think this 094.out modification is not needed and actually breaks the test; apologies -{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "src", "len": 67108864, "offset": 67108864, "speed": 0, "type": "mirror"}} +{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "src", "len": 0, "offset": 0, "speed": 0, "type": "mirror"}} So NACK to the patch as currently written, but not necessarily to the idea if you can give better progress numbers and never reach the state of BLOCK_JOB_READY and BLOCK_JOB_COMPLETED with a 0/0 length/offset. ok. fair enough. Thank you for the review. Will it be better to (somehow) skip progressing below using some condition during mirror_dirty_init() stage? static void mirror_iteration_done(MirrorOp *op, int ret) { . if (ret >= 0) { if (s->cow_bitmap) { bitmap_set(s->cow_bitmap, chunk_num, nb_chunks); } s->common.offset += (uint64_t)op->nb_sectors * BDRV_SECTOR_SIZE; < specifically this progressing } Isn't this going to look quite too invasive for such cause -- best regards, Anton