[Qemu-devel] [PATCH v4 07/13] test-char: split char_udp_test

2017-06-26 Thread Anton Nefedov
makes it possible to test the existing chardev-udp

Signed-off-by: Anton Nefedov 
Reviewed-by: Marc-André Lureau 
---
 tests/test-char.c | 57 ++-
 1 file changed, 40 insertions(+), 17 deletions(-)

diff --git a/tests/test-char.c b/tests/test-char.c
index 90b1f50..1f0de25 100644
--- a/tests/test-char.c
+++ b/tests/test-char.c
@@ -409,16 +409,11 @@ static void char_pipe_test(void)
 }
 #endif
 
-static void char_udp_test(void)
+static int make_udp_socket(int *port)
 {
-struct sockaddr_in addr = { 0, }, other;
-SocketIdleData d = { 0, };
-Chardev *chr;
-CharBackend be;
+struct sockaddr_in addr = { 0, };
 socklen_t alen = sizeof(addr);
 int ret, sock = qemu_socket(PF_INET, SOCK_DGRAM, 0);
-char buf[10];
-char *tmp;
 
 g_assert_cmpint(sock, >, 0);
 addr.sin_family = AF_INET ;
@@ -429,19 +424,41 @@ static void char_udp_test(void)
 ret = getsockname(sock, (struct sockaddr *)&addr, &alen);
 g_assert_cmpint(ret, ==, 0);
 
-tmp = g_strdup_printf("udp:127.0.0.1:%d",
-  ntohs(addr.sin_port));
-chr = qemu_chr_new("client", tmp);
-g_assert_nonnull(chr);
+*port = ntohs(addr.sin_port);
+return sock;
+}
+
+static void char_udp_test_internal(Chardev *reuse_chr, int sock)
+{
+struct sockaddr_in other;
+SocketIdleData d = { 0, };
+Chardev *chr;
+CharBackend *be;
+socklen_t alen = sizeof(other);
+int ret;
+char buf[10];
+char *tmp = NULL;
+
+if (reuse_chr) {
+chr = reuse_chr;
+be = chr->be;
+} else {
+int port;
+sock = make_udp_socket(&port);
+tmp = g_strdup_printf("udp:127.0.0.1:%d", port);
+chr = qemu_chr_new("client", tmp);
+g_assert_nonnull(chr);
+
+be = g_alloca(sizeof(CharBackend));
+qemu_chr_fe_init(be, chr, &error_abort);
+}
 
 d.chr = chr;
-qemu_chr_fe_init(&be, chr, &error_abort);
-qemu_chr_fe_set_handlers(&be, socket_can_read_hello, socket_read_hello,
+qemu_chr_fe_set_handlers(be, socket_can_read_hello, socket_read_hello,
  NULL, NULL, &d, NULL, true);
 ret = qemu_chr_write_all(chr, (uint8_t *)"hello", 5);
 g_assert_cmpint(ret, ==, 5);
 
-alen = sizeof(addr);
 ret = recvfrom(sock, buf, sizeof(buf), 0,
(struct sockaddr *)&other, &alen);
 g_assert_cmpint(ret, ==, 5);
@@ -450,10 +467,16 @@ static void char_udp_test(void)
 
 main_loop();
 
-close(sock);
+if (!reuse_chr) {
+close(sock);
+qemu_chr_fe_deinit(be, true);
+}
 g_free(tmp);
-qemu_chr_fe_deinit(&be);
-object_unparent(OBJECT(chr));
+}
+
+static void char_udp_test(void)
+{
+char_udp_test_internal(NULL, 0);
 }
 
 #ifdef HAVE_CHARDEV_SERIAL
-- 
2.7.4




[Qemu-devel] [PATCH v4 04/13] char: forbid direct chardevice access for hotswap devices

2017-06-26 Thread Anton Nefedov
qemu_chr_fe_get_driver() is unsafe, frontends with hotswap support
should not access CharDriver ptr directly as CharDriver might change.

Signed-off-by: Anton Nefedov 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Marc-André Lureau 
---
 include/chardev/char-fe.h | 10 ++
 chardev/char-fe.c |  7 +++
 2 files changed, 17 insertions(+)

diff --git a/include/chardev/char-fe.h b/include/chardev/char-fe.h
index 0314870..385aa99 100644
--- a/include/chardev/char-fe.h
+++ b/include/chardev/char-fe.h
@@ -63,10 +63,20 @@ bool qemu_chr_fe_connect(CharBackend *b, Chardev *s, Error 
**errp);
  *
  * Returns the driver associated with a CharBackend or NULL if no
  * associated Chardev.
+ * Note: avoid this function as the driver should never be accessed directly,
+ *   especially by the frontends that support chardevice hotswap.
+ *   Consider qemu_chr_fe_backend_connected() to check for driver existence
  */
 Chardev *qemu_chr_fe_get_driver(CharBackend *be);
 
 /**
+ * @qemu_chr_fe_backend_connected:
+ *
+ * Returns true if there is a chardevice associated with @be.
+ */
+bool qemu_chr_fe_backend_connected(CharBackend *be);
+
+/**
  * @qemu_chr_fe_set_handlers:
  * @b: a CharBackend
  * @fd_can_read: callback to get the amount of data the frontend may
diff --git a/chardev/char-fe.c b/chardev/char-fe.c
index 86a878b..be96fb5 100644
--- a/chardev/char-fe.c
+++ b/chardev/char-fe.c
@@ -183,9 +183,16 @@ void qemu_chr_fe_printf(CharBackend *be, const char *fmt, 
...)
 
 Chardev *qemu_chr_fe_get_driver(CharBackend *be)
 {
+/* this is unsafe for the users that support chardev hotswap */
+assert(be->chr_be_change == NULL);
 return be->chr;
 }
 
+bool qemu_chr_fe_backend_connected(CharBackend *be)
+{
+return !!be->chr;
+}
+
 bool qemu_chr_fe_connect(CharBackend *b, Chardev *s, Error **errp)
 {
 int tag = 0;
-- 
2.7.4




[Qemu-devel] [PATCH v4 03/13] char: chardevice hotswap

2017-06-26 Thread Anton Nefedov
This patch adds a possibility to change a char device without a frontend
removal.

1. Ideally, it would have to happen transparently to a frontend, i.e.
frontend would continue its regular operation.
However, backends are not stateless and are set up by the frontends
via qemu_chr_fe_<> functions, and it's not (generally) possible to replay
that setup entirely in a backend code, as different chardevs respond
to the setup calls differently, so do frontends work differently basing
on those setup responses.
Moreover, some frontend can generally get and save the backend pointer
(qemu_chr_fe_get_driver()), and it will become invalid after backend change.

So, a frontend which would like to support chardev hotswap has to register
a "backend change" handler, and redo its backend setup there.

2. Write path can be used by multiple threads and thus protected with
chr_write_lock.
So hotswap also has to be protected so write functions won't access
a backend being replaced.

Signed-off-by: Anton Nefedov 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 include/chardev/char-fe.h | 17 ++
 include/chardev/char.h|  9 +
 chardev/char-fe.c | 33 +-
 chardev/char.c| 86 +++
 qapi-schema.json  | 40 ++
 5 files changed, 176 insertions(+), 9 deletions(-)

diff --git a/include/chardev/char-fe.h b/include/chardev/char-fe.h
index 9f38060..0314870 100644
--- a/include/chardev/char-fe.h
+++ b/include/chardev/char-fe.h
@@ -10,6 +10,12 @@ typedef int BackendChangeHandler(void *opaque);
  * Chardev */
 struct CharBackend {
 Chardev *chr;
+/* chr_lock is used to ensure thread-safety of operations with
+ * the associated Chardev.
+ * There is no guarantee otherwise, generally, that *chr won't become
+ * invalid.
+ */
+QemuMutex chr_lock;
 IOEventHandler *chr_event;
 IOCanReadHandler *chr_can_read;
 IOReadHandler *chr_read;
@@ -42,6 +48,17 @@ bool qemu_chr_fe_init(CharBackend *b, Chardev *s, Error 
**errp);
 void qemu_chr_fe_deinit(CharBackend *b, bool del);
 
 /**
+ * @qemu_chr_fe_connect:
+ *
+ * Connects the already initialized front end with a given Chardev.
+ * Call qemu_chr_fe_deinit() to remove the association and
+ * release the driver.
+ *
+ * Returns: false on error.
+ */
+bool qemu_chr_fe_connect(CharBackend *b, Chardev *s, Error **errp);
+
+/**
  * @qemu_chr_fe_get_driver:
  *
  * Returns the driver associated with a CharBackend or NULL if no
diff --git a/include/chardev/char.h b/include/chardev/char.h
index 8a9ade4..22fd734 100644
--- a/include/chardev/char.h
+++ b/include/chardev/char.h
@@ -93,6 +93,15 @@ void qemu_chr_parse_common(QemuOpts *opts, ChardevCommon 
*backend);
 Chardev *qemu_chr_new(const char *label, const char *filename);
 
 /**
+ * @qemu_chr_change:
+ *
+ * Change an existing character backend
+ *
+ * @opts the new backend options
+ */
+void qemu_chr_change(QemuOpts *opts, Error **errp);
+
+/**
  * @qemu_chr_cleanup:
  *
  * Delete all chardevs (when leaving qemu)
diff --git a/chardev/char-fe.c b/chardev/char-fe.c
index 7054863..86a878b 100644
--- a/chardev/char-fe.c
+++ b/chardev/char-fe.c
@@ -33,24 +33,28 @@
 
 int qemu_chr_fe_write(CharBackend *be, const uint8_t *buf, int len)
 {
-Chardev *s = be->chr;
+int ret = 0;
 
-if (!s) {
-return 0;
+qemu_mutex_lock(&be->chr_lock);
+if (be->chr) {
+ret = qemu_chr_write(be->chr, buf, len, false);
 }
+qemu_mutex_unlock(&be->chr_lock);
 
-return qemu_chr_write(s, buf, len, false);
+return ret;
 }
 
 int qemu_chr_fe_write_all(CharBackend *be, const uint8_t *buf, int len)
 {
-Chardev *s = be->chr;
+int ret = 0;
 
-if (!s) {
-return 0;
+qemu_mutex_lock(&be->chr_lock);
+if (be->chr) {
+ret = qemu_chr_write(be->chr, buf, len, true);
 }
+qemu_mutex_unlock(&be->chr_lock);
 
-return qemu_chr_write(s, buf, len, true);
+return ret;
 }
 
 int qemu_chr_fe_read_all(CharBackend *be, uint8_t *buf, int len)
@@ -182,7 +186,7 @@ Chardev *qemu_chr_fe_get_driver(CharBackend *be)
 return be->chr;
 }
 
-bool qemu_chr_fe_init(CharBackend *b, Chardev *s, Error **errp)
+bool qemu_chr_fe_connect(CharBackend *b, Chardev *s, Error **errp)
 {
 int tag = 0;
 
@@ -211,6 +215,16 @@ unavailable:
 return false;
 }
 
+bool qemu_chr_fe_init(CharBackend *b, Chardev *s, Error **errp)
+{
+if (!qemu_chr_fe_connect(b, s, errp)) {
+return false;
+}
+
+qemu_mutex_init(&b->chr_lock);
+return true;
+}
+
 void qemu_chr_fe_deinit(CharBackend *b, bool del)
 {
 assert(b);
@@ -228,6 +242,7 @@ void qemu_chr_fe_deinit(CharBackend *b, bool del)
 object_unparent(OBJECT(b->chr));
 }
 b->chr = NULL;
+qemu_mutex_destroy(&b->chr_lock);
 }
 }
 
diff --git a/chardev/char.c b/char

[Qemu-devel] [PATCH v4 10/13] hmp: add hmp analogue for qmp-chardev-change

2017-06-26 Thread Anton Nefedov
Signed-off-by: Anton Nefedov 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
CC: Dr. David Alan Gilbert 
---
 include/chardev/char.h | 10 ++
 hmp.h  |  1 +
 chardev/char.c |  2 +-
 hmp-commands.hx| 18 +-
 hmp.c  | 34 ++
 tests/test-hmp.c   |  1 +
 6 files changed, 64 insertions(+), 2 deletions(-)

diff --git a/include/chardev/char.h b/include/chardev/char.h
index 22fd734..1604ea9 100644
--- a/include/chardev/char.h
+++ b/include/chardev/char.h
@@ -81,6 +81,16 @@ Chardev *qemu_chr_new_from_opts(QemuOpts *opts,
 void qemu_chr_parse_common(QemuOpts *opts, ChardevCommon *backend);
 
 /**
+ * @qemu_chr_parse_opts:
+ *
+ * Parse the options to the ChardevBackend struct.
+ *
+ * Returns: a new backend or NULL on error
+ */
+ChardevBackend *qemu_chr_parse_opts(QemuOpts *opts,
+Error **errp);
+
+/**
  * @qemu_chr_new:
  *
  * Create a new character backend from a URI.
diff --git a/hmp.h b/hmp.h
index d8b94ce..23e035c 100644
--- a/hmp.h
+++ b/hmp.h
@@ -102,6 +102,7 @@ void hmp_nbd_server_start(Monitor *mon, const QDict *qdict);
 void hmp_nbd_server_add(Monitor *mon, const QDict *qdict);
 void hmp_nbd_server_stop(Monitor *mon, const QDict *qdict);
 void hmp_chardev_add(Monitor *mon, const QDict *qdict);
+void hmp_chardev_change(Monitor *mon, const QDict *qdict);
 void hmp_chardev_remove(Monitor *mon, const QDict *qdict);
 void hmp_qemu_io(Monitor *mon, const QDict *qdict);
 void hmp_cpu_add(Monitor *mon, const QDict *qdict);
diff --git a/chardev/char.c b/chardev/char.c
index 6b2cb7b..8e8b881 100644
--- a/chardev/char.c
+++ b/chardev/char.c
@@ -567,7 +567,7 @@ static const char *chardev_alias_translate(const char *name)
 return name;
 }
 
-static ChardevBackend *qemu_chr_parse_opts(QemuOpts *opts, Error **errp)
+ChardevBackend *qemu_chr_parse_opts(QemuOpts *opts, Error **errp)
 {
 Error *local_err = NULL;
 const ChardevClass *cc;
diff --git a/hmp-commands.hx b/hmp-commands.hx
index e763606..2cad758 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -1724,7 +1724,23 @@ ETEXI
 STEXI
 @item chardev-add args
 @findex chardev-add
-chardev_add accepts the same parameters as the -chardev command line switch.
+chardev-add accepts the same parameters as the -chardev command line switch.
+
+ETEXI
+
+{
+.name   = "chardev-change",
+.args_type  = "id:s,args:s",
+.params = "id args",
+.help   = "change chardev",
+.cmd= hmp_chardev_change,
+},
+
+STEXI
+@item chardev-change args
+@findex chardev-change
+chardev-change accepts existing chardev @var{id} and then the same arguments
+as the -chardev command line switch (except for "id").
 
 ETEXI
 
diff --git a/hmp.c b/hmp.c
index 8c72c58..91e4317 100644
--- a/hmp.c
+++ b/hmp.c
@@ -2225,6 +2225,40 @@ void hmp_chardev_add(Monitor *mon, const QDict *qdict)
 hmp_handle_error(mon, &err);
 }
 
+void hmp_chardev_change(Monitor *mon, const QDict *qdict)
+{
+const char *args = qdict_get_str(qdict, "args");
+const char *id;
+Error *err = NULL;
+ChardevBackend *backend = NULL;
+ChardevReturn *ret = NULL;
+QemuOpts *opts = qemu_opts_parse_noisily(qemu_find_opts("chardev"), args,
+ true);
+if (!opts) {
+error_setg(&err, "Parsing chardev args failed");
+goto end;
+}
+
+id = qdict_get_str(qdict, "id");
+if (qemu_opts_id(opts)) {
+error_setg(&err, "Unexpected 'id' parameter");
+goto end;
+}
+
+backend = qemu_chr_parse_opts(opts, &err);
+if (!backend) {
+goto end;
+}
+
+ret = qmp_chardev_change(id, backend, &err);
+
+end:
+qapi_free_ChardevReturn(ret);
+qapi_free_ChardevBackend(backend);
+qemu_opts_del(opts);
+hmp_handle_error(mon, &err);
+}
+
 void hmp_chardev_remove(Monitor *mon, const QDict *qdict)
 {
 Error *local_err = NULL;
diff --git a/tests/test-hmp.c b/tests/test-hmp.c
index 99e35ec..299df19 100644
--- a/tests/test-hmp.c
+++ b/tests/test-hmp.c
@@ -22,6 +22,7 @@ static int verbose;
 static const char *hmp_cmds[] = {
 "boot_set ndc",
 "chardev-add null,id=testchardev1",
+"chardev-change testchardev1 ringbuf",
 "chardev-remove testchardev1",
 "commit all",
 "cpu-add 1",
-- 
2.7.4




[Qemu-devel] [PATCH v4 01/13] char: move QemuOpts->ChardevBackend translation to a separate func

2017-06-26 Thread Anton Nefedov
parse function will be used by the following patch

Signed-off-by: Anton Nefedov 
---
 chardev/char.c | 81 --
 1 file changed, 51 insertions(+), 30 deletions(-)

diff --git a/chardev/char.c b/chardev/char.c
index bcfc065..f8817c5 100644
--- a/chardev/char.c
+++ b/chardev/char.c
@@ -556,17 +556,23 @@ help_string_append(const char *name, void *opaque)
 g_string_append_printf(str, "\n%s", name);
 }
 
-Chardev *qemu_chr_new_from_opts(QemuOpts *opts,
-Error **errp)
+static const char *chardev_alias_translate(const char *name)
+{
+int i;
+for (i = 0; i < (int)ARRAY_SIZE(chardev_alias_table); i++) {
+if (g_strcmp0(chardev_alias_table[i].alias, name) == 0) {
+return chardev_alias_table[i].typename;
+}
+}
+return name;
+}
+
+static ChardevBackend *qemu_chr_parse_opts(QemuOpts *opts, Error **errp)
 {
 Error *local_err = NULL;
 const ChardevClass *cc;
-Chardev *chr;
-int i;
 ChardevBackend *backend = NULL;
-const char *name = qemu_opt_get(opts, "backend");
-const char *id = qemu_opts_id(opts);
-char *bid = NULL;
+const char *name = chardev_alias_translate(qemu_opt_get(opts, "backend"));
 
 if (name == NULL) {
 error_setg(errp, "chardev: \"%s\" missing backend",
@@ -574,7 +580,40 @@ Chardev *qemu_chr_new_from_opts(QemuOpts *opts,
 return NULL;
 }
 
-if (is_help_option(name)) {
+cc = char_get_class(name, errp);
+if (cc == NULL) {
+return NULL;
+}
+
+backend = g_new0(ChardevBackend, 1);
+backend->type = CHARDEV_BACKEND_KIND_NULL;
+
+if (cc->parse) {
+cc->parse(opts, backend, &local_err);
+if (local_err) {
+error_propagate(errp, local_err);
+qapi_free_ChardevBackend(backend);
+return NULL;
+}
+} else {
+ChardevCommon *ccom = g_new0(ChardevCommon, 1);
+qemu_chr_parse_common(opts, ccom);
+backend->u.null.data = ccom; /* Any ChardevCommon member would work */
+}
+
+return backend;
+}
+
+Chardev *qemu_chr_new_from_opts(QemuOpts *opts, Error **errp)
+{
+const ChardevClass *cc;
+Chardev *chr = NULL;
+ChardevBackend *backend = NULL;
+const char *name = chardev_alias_translate(qemu_opt_get(opts, "backend"));
+const char *id = qemu_opts_id(opts);
+char *bid = NULL;
+
+if (name && is_help_option(name)) {
 GString *str = g_string_new("");
 
 chardev_name_foreach(help_string_append, str);
@@ -589,38 +628,20 @@ Chardev *qemu_chr_new_from_opts(QemuOpts *opts,
 return NULL;
 }
 
-for (i = 0; i < (int)ARRAY_SIZE(chardev_alias_table); i++) {
-if (g_strcmp0(chardev_alias_table[i].alias, name) == 0) {
-name = chardev_alias_table[i].typename;
-break;
-}
+backend = qemu_chr_parse_opts(opts, errp);
+if (backend == NULL) {
+return NULL;
 }
 
 cc = char_get_class(name, errp);
 if (cc == NULL) {
-return NULL;
+goto out;
 }
 
-backend = g_new0(ChardevBackend, 1);
-backend->type = CHARDEV_BACKEND_KIND_NULL;
-
 if (qemu_opt_get_bool(opts, "mux", 0)) {
 bid = g_strdup_printf("%s-base", id);
 }
 
-chr = NULL;
-if (cc->parse) {
-cc->parse(opts, backend, &local_err);
-if (local_err) {
-error_propagate(errp, local_err);
-goto out;
-}
-} else {
-ChardevCommon *ccom = g_new0(ChardevCommon, 1);
-qemu_chr_parse_common(opts, ccom);
-backend->u.null.data = ccom; /* Any ChardevCommon member would work */
-}
-
 chr = qemu_chardev_new(bid ? bid : id,
object_class_get_name(OBJECT_CLASS(cc)),
backend, errp);
-- 
2.7.4




[Qemu-devel] [PATCH v4 13/13] serial: chardev hotswap support

2017-06-26 Thread Anton Nefedov
for a backend change, a number of ioctls has to be replayed to sync
the current setup of a frontend to a backend tty. This is hopefully
enough so we don't have to track, store and replay the whole original
control byte sequence.

Signed-off-by: Anton Nefedov 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
CC: Michael S. Tsirkin 
CC: Paolo Bonzini 
---
 hw/char/serial.c | 32 ++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/hw/char/serial.c b/hw/char/serial.c
index e47f0b6..9aec6c6 100644
--- a/hw/char/serial.c
+++ b/hw/char/serial.c
@@ -891,9 +891,37 @@ static void serial_reset(void *opaque)
 s->msr &= ~UART_MSR_ANY_DELTA;
 }
 
+static int serial_be_change(void *opaque)
+{
+SerialState *s = opaque;
+
+qemu_chr_fe_set_handlers(&s->chr, serial_can_receive1, serial_receive1,
+ serial_event, serial_be_change, s, NULL, true);
+
+serial_update_parameters(s);
+
+qemu_chr_fe_ioctl(&s->chr, CHR_IOCTL_SERIAL_SET_BREAK,
+  &s->last_break_enable);
+
+s->poll_msl = (s->ier & UART_IER_MSI) ? 1 : 0;
+serial_update_msl(s);
+
+if (s->poll_msl >= 0 && !(s->mcr & UART_MCR_LOOP)) {
+serial_update_tiocm(s);
+}
+
+if (s->watch_tag > 0) {
+g_source_remove(s->watch_tag);
+s->watch_tag = qemu_chr_fe_add_watch(&s->chr, G_IO_OUT | G_IO_HUP,
+ serial_watch_cb, s);
+}
+
+return 0;
+}
+
 void serial_realize_core(SerialState *s, Error **errp)
 {
-if (!qemu_chr_fe_get_driver(&s->chr)) {
+if (!qemu_chr_fe_backend_connected(&s->chr)) {
 error_setg(errp, "Can't create serial device, empty char device");
 return;
 }
@@ -904,7 +932,7 @@ void serial_realize_core(SerialState *s, Error **errp)
 qemu_register_reset(serial_reset, s);
 
 qemu_chr_fe_set_handlers(&s->chr, serial_can_receive1, serial_receive1,
- serial_event, NULL, s, NULL, true);
+ serial_event, serial_be_change, s, NULL, true);
 fifo8_create(&s->recv_fifo, UART_FIFO_LENGTH);
 fifo8_create(&s->xmit_fifo, UART_FIFO_LENGTH);
 serial_reset(s);
-- 
2.7.4




[Qemu-devel] [PATCH v4 02/13] char: add backend hotswap handler

2017-06-26 Thread Anton Nefedov
Frontends should have an interface to setup the handler of a backend change.
The interface will be used in the next commits

Signed-off-by: Anton Nefedov 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Marc-André Lureau 
---
 include/chardev/char-fe.h   |  5 +
 backends/rng-egd.c  |  2 +-
 chardev/char-fe.c   |  4 +++-
 chardev/char-mux.c  |  1 +
 gdbstub.c   |  2 +-
 hw/arm/pxa2xx.c |  3 ++-
 hw/arm/strongarm.c  |  2 +-
 hw/char/bcm2835_aux.c   |  2 +-
 hw/char/cadence_uart.c  |  2 +-
 hw/char/debugcon.c  |  2 +-
 hw/char/digic-uart.c|  2 +-
 hw/char/escc.c  |  2 +-
 hw/char/etraxfs_ser.c   |  2 +-
 hw/char/exynos4210_uart.c   |  2 +-
 hw/char/grlib_apbuart.c |  2 +-
 hw/char/imx_serial.c|  2 +-
 hw/char/ipoctal232.c|  2 +-
 hw/char/lm32_juart.c|  2 +-
 hw/char/lm32_uart.c |  2 +-
 hw/char/mcf_uart.c  |  2 +-
 hw/char/milkymist-uart.c|  2 +-
 hw/char/pl011.c |  2 +-
 hw/char/sclpconsole-lm.c|  2 +-
 hw/char/sclpconsole.c   |  2 +-
 hw/char/serial.c|  2 +-
 hw/char/sh_serial.c |  2 +-
 hw/char/spapr_vty.c |  2 +-
 hw/char/stm32f2xx_usart.c   |  3 ++-
 hw/char/terminal3270.c  |  2 +-
 hw/char/virtio-console.c|  4 ++--
 hw/char/xen_console.c   |  2 +-
 hw/char/xilinx_uartlite.c   |  2 +-
 hw/ipmi/ipmi_bmc_extern.c   |  2 +-
 hw/mips/boston.c|  2 +-
 hw/mips/mips_malta.c|  2 +-
 hw/misc/ivshmem.c   |  2 +-
 hw/usb/ccid-card-passthru.c |  2 +-
 hw/usb/dev-serial.c |  2 +-
 hw/usb/redirect.c   |  2 +-
 monitor.c   |  4 ++--
 net/colo-compare.c  | 10 ++
 net/filter-mirror.c |  6 +++---
 net/slirp.c |  2 +-
 net/vhost-user.c|  7 ---
 qtest.c |  2 +-
 tests/test-char.c   | 14 ++
 tests/vhost-user-test.c |  2 +-
 47 files changed, 76 insertions(+), 57 deletions(-)

diff --git a/include/chardev/char-fe.h b/include/chardev/char-fe.h
index 2cbb262..9f38060 100644
--- a/include/chardev/char-fe.h
+++ b/include/chardev/char-fe.h
@@ -4,6 +4,7 @@
 #include "chardev/char.h"
 
 typedef void IOEventHandler(void *opaque, int event);
+typedef int BackendChangeHandler(void *opaque);
 
 /* This is the backend as seen by frontend, the actual backend is
  * Chardev */
@@ -12,6 +13,7 @@ struct CharBackend {
 IOEventHandler *chr_event;
 IOCanReadHandler *chr_can_read;
 IOReadHandler *chr_read;
+BackendChangeHandler *chr_be_change;
 void *opaque;
 int tag;
 int fe_open;
@@ -54,6 +56,8 @@ Chardev *qemu_chr_fe_get_driver(CharBackend *be);
  *   receive
  * @fd_read: callback to receive data from char
  * @fd_event: event callback
+ * @be_change: backend change callback; passing NULL means hot backend change
+ * is not supported and will not be attempted
  * @opaque: an opaque pointer for the callbacks
  * @context: a main loop context or NULL for the default
  * @set_open: whether to call qemu_chr_fe_set_open() implicitely when
@@ -68,6 +72,7 @@ void qemu_chr_fe_set_handlers(CharBackend *b,
   IOCanReadHandler *fd_can_read,
   IOReadHandler *fd_read,
   IOEventHandler *fd_event,
+  BackendChangeHandler *be_change,
   void *opaque,
   GMainContext *context,
   bool set_open);
diff --git a/backends/rng-egd.c b/backends/rng-egd.c
index e7ce2ca..d2b9ce6 100644
--- a/backends/rng-egd.c
+++ b/backends/rng-egd.c
@@ -106,7 +106,7 @@ static void rng_egd_opened(RngBackend *b, Error **errp)
 
 /* FIXME we should resubmit pending requests when the CDS reconnects. */
 qemu_chr_fe_set_handlers(&s->chr, rng_egd_chr_can_read,
- rng_egd_chr_read, NULL, s, NULL, true);
+ rng_egd_chr_read, NULL, NULL, s, NULL, true);
 }
 
 static void rng_egd_set_chardev(Object *obj, const char *value, Error **errp)
diff --git a/chardev/char-fe.c b/chardev/char-fe.c
index 3f90f05..7054863 100644
--- a/chardev/char-fe.c
+++ b/chardev/char-fe.c
@@ -216,7 +216,7 @@ void qemu_chr_fe_deinit(CharBackend *b, bool del)
 assert(b);
 
 if (b->chr) {
-qemu_chr_fe_set_handlers(b, NULL, NULL, NULL, NULL, NULL, true);
+qemu_chr_fe_set_handlers(b, NULL, NULL, NULL, NULL, NULL, NULL, true);
 if (b->chr->be == b) {
 b->chr->be = NULL;
 }
@@ -235,6 +235,7 @@ void qemu_chr_fe_set_handlers(CharBackend *b,
   IOCanReadHandler *fd_can_read,
   IOReadHandler *fd_read,
   IOEventHandler *fd_event,
+ 

[Qemu-devel] [PATCH v4 05/13] char: avoid chardevice direct access

2017-06-26 Thread Anton Nefedov
frontends should avoid accessing CharDriver struct where possible

Signed-off-by: Anton Nefedov 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Marc-André Lureau 
---
 include/chardev/char-fe.h   | 7 +++
 chardev/char-fe.c   | 5 +
 hw/arm/strongarm.c  | 2 +-
 hw/char/cadence_uart.c  | 2 +-
 hw/char/debugcon.c  | 2 +-
 hw/char/escc.c  | 6 +++---
 hw/char/exynos4210_uart.c   | 2 +-
 hw/char/grlib_apbuart.c | 2 +-
 hw/char/ipoctal232.c| 2 +-
 hw/char/parallel.c  | 2 +-
 hw/char/sclpconsole-lm.c| 2 +-
 hw/char/sclpconsole.c   | 2 +-
 hw/char/sh_serial.c | 2 +-
 hw/char/spapr_vty.c | 2 +-
 hw/char/terminal3270.c  | 2 +-
 hw/char/xen_console.c   | 2 +-
 hw/ipmi/ipmi_bmc_extern.c   | 2 +-
 hw/misc/ivshmem.c   | 4 ++--
 hw/usb/ccid-card-passthru.c | 4 ++--
 hw/usb/dev-serial.c | 5 ++---
 hw/usb/redirect.c   | 5 ++---
 net/filter-mirror.c | 2 +-
 22 files changed, 38 insertions(+), 28 deletions(-)

diff --git a/include/chardev/char-fe.h b/include/chardev/char-fe.h
index 385aa99..61c7f97 100644
--- a/include/chardev/char-fe.h
+++ b/include/chardev/char-fe.h
@@ -77,6 +77,13 @@ Chardev *qemu_chr_fe_get_driver(CharBackend *be);
 bool qemu_chr_fe_backend_connected(CharBackend *be);
 
 /**
+ * @qemu_chr_fe_backend_open:
+ *
+ * Returns true if chardevice associated with @be is open.
+ */
+bool qemu_chr_fe_backend_open(CharBackend *be);
+
+/**
  * @qemu_chr_fe_set_handlers:
  * @b: a CharBackend
  * @fd_can_read: callback to get the amount of data the frontend may
diff --git a/chardev/char-fe.c b/chardev/char-fe.c
index be96fb5..3120441 100644
--- a/chardev/char-fe.c
+++ b/chardev/char-fe.c
@@ -193,6 +193,11 @@ bool qemu_chr_fe_backend_connected(CharBackend *be)
 return !!be->chr;
 }
 
+bool qemu_chr_fe_backend_open(CharBackend *be)
+{
+return be->chr && be->chr->be_open;
+}
+
 bool qemu_chr_fe_connect(CharBackend *b, Chardev *s, Error **errp)
 {
 int tag = 0;
diff --git a/hw/arm/strongarm.c b/hw/arm/strongarm.c
index 1fa082c..6a45dcc 100644
--- a/hw/arm/strongarm.c
+++ b/hw/arm/strongarm.c
@@ -1106,7 +1106,7 @@ static void strongarm_uart_tx(void *opaque)
 
 if (s->utcr3 & UTCR3_LBM) /* loopback */ {
 strongarm_uart_receive(s, &s->tx_fifo[s->tx_start], 1);
-} else if (qemu_chr_fe_get_driver(&s->chr)) {
+} else if (qemu_chr_fe_backend_connected(&s->chr)) {
 /* XXX this blocks entire thread. Rewrite to use
  * qemu_chr_fe_write and background I/O callbacks */
 qemu_chr_fe_write_all(&s->chr, &s->tx_fifo[s->tx_start], 1);
diff --git a/hw/char/cadence_uart.c b/hw/char/cadence_uart.c
index 3a9335c..6143494 100644
--- a/hw/char/cadence_uart.c
+++ b/hw/char/cadence_uart.c
@@ -279,7 +279,7 @@ static gboolean cadence_uart_xmit(GIOChannel *chan, 
GIOCondition cond,
 int ret;
 
 /* instant drain the fifo when there's no back-end */
-if (!qemu_chr_fe_get_driver(&s->chr)) {
+if (!qemu_chr_fe_backend_connected(&s->chr)) {
 s->tx_count = 0;
 return FALSE;
 }
diff --git a/hw/char/debugcon.c b/hw/char/debugcon.c
index 9b56fb8..95ccec6 100644
--- a/hw/char/debugcon.c
+++ b/hw/char/debugcon.c
@@ -87,7 +87,7 @@ static const MemoryRegionOps debugcon_ops = {
 
 static void debugcon_realize_core(DebugconState *s, Error **errp)
 {
-if (!qemu_chr_fe_get_driver(&s->chr)) {
+if (!qemu_chr_fe_backend_connected(&s->chr)) {
 error_setg(errp, "Can't create debugcon device, empty char device");
 return;
 }
diff --git a/hw/char/escc.c b/hw/char/escc.c
index 3546df3..89ae9eb 100644
--- a/hw/char/escc.c
+++ b/hw/char/escc.c
@@ -417,7 +417,7 @@ static void escc_update_parameters(ChannelState *s)
 int speed, parity, data_bits, stop_bits;
 QEMUSerialSetParams ssp;
 
-if (!qemu_chr_fe_get_driver(&s->chr) || s->type != ser)
+if (!qemu_chr_fe_backend_connected(&s->chr) || s->type != ser)
 return;
 
 if (s->wregs[W_TXCTRL1] & TXCTRL1_PAREN) {
@@ -557,7 +557,7 @@ static void escc_mem_write(void *opaque, hwaddr addr,
 trace_escc_mem_writeb_data(CHN_C(s), val);
 s->tx = val;
 if (s->wregs[W_TXCTRL2] & TXCTRL2_TXEN) { // tx enabled
-if (qemu_chr_fe_get_driver(&s->chr)) {
+if (qemu_chr_fe_backend_connected(&s->chr)) {
 /* XXX this blocks entire thread. Rewrite to use
  * qemu_chr_fe_write and background I/O callbacks */
 qemu_chr_fe_write_all(&s->chr, &s->tx, 1);
@@ -1013,7 +1013,7 @@ static void escc_realize(DeviceState *dev, Error **errp)
   ESCC_SIZE << s->it_shift);
 
 for (i = 0; i < 2; i++) {
-if (qemu_chr_fe_get_driver(&s

[Qemu-devel] [PATCH v4 00/13] chardevice hotswap

2017-06-26 Thread Anton Nefedov
Changed in v4:
  - rebased on top of the latest chardev changes
  - remarks applied
  - patch 1 fixed so it works with alias names



Changed in v3:
  - minor remarks to patch 1 applied
  - patch 3: avoid using bottom-half, handle syncronously
As mentioned, it gets thing complicated and is only a problem for
a monitor-connected chardev hotswap and that is not supported for now
  - tests added (patches 6-9)



This serie is a v2 of the February submit
http://lists.nongnu.org/archive/html/qemu-devel/2017-02/msg01989.html

The interface is changed as requested and the changes are slightly reworked
and split into separate patches.



The patchset adds support of the character device change without
a frontend device removal.
Yet isa-serial and virtio-serial frontends are supported.

The feature can be helpful for e.g. Windows debug allowing to
establish connection to a live VM from VM with WinDbg.

Anton Nefedov (13):
  char: move QemuOpts->ChardevBackend translation to a separate func
  char: add backend hotswap handler
  char: chardevice hotswap
  char: forbid direct chardevice access for hotswap devices
  char: avoid chardevice direct access
  test-char: unref chardev-udp after test
  test-char: split char_udp_test
  test-char: split char_file_test
  test-char: add hotswap test
  hmp: add hmp analogue for qmp-chardev-change
  virtio-console: chardev hotswap support
  serial: move TIOCM update to a separate function
  serial: chardev hotswap support

 include/chardev/char-fe.h   |  39 +++
 include/chardev/char.h  |  19 
 hmp.h   |   1 +
 backends/rng-egd.c  |   2 +-
 chardev/char-fe.c   |  49 +++--
 chardev/char-mux.c  |   1 +
 chardev/char.c  | 167 +++-
 gdbstub.c   |   2 +-
 hmp-commands.hx |  18 ++-
 hmp.c   |  34 ++
 hw/arm/pxa2xx.c |   3 +-
 hw/arm/strongarm.c  |   4 +-
 hw/char/bcm2835_aux.c   |   2 +-
 hw/char/cadence_uart.c  |   4 +-
 hw/char/debugcon.c  |   4 +-
 hw/char/digic-uart.c|   2 +-
 hw/char/escc.c  |   8 +-
 hw/char/etraxfs_ser.c   |   2 +-
 hw/char/exynos4210_uart.c   |   4 +-
 hw/char/grlib_apbuart.c |   4 +-
 hw/char/imx_serial.c|   2 +-
 hw/char/ipoctal232.c|   4 +-
 hw/char/lm32_juart.c|   2 +-
 hw/char/lm32_uart.c |   2 +-
 hw/char/mcf_uart.c  |   2 +-
 hw/char/milkymist-uart.c|   2 +-
 hw/char/parallel.c  |   2 +-
 hw/char/pl011.c |   2 +-
 hw/char/sclpconsole-lm.c|   4 +-
 hw/char/sclpconsole.c   |   4 +-
 hw/char/serial.c|  63 ---
 hw/char/sh_serial.c |   4 +-
 hw/char/spapr_vty.c |   4 +-
 hw/char/stm32f2xx_usart.c   |   3 +-
 hw/char/terminal3270.c  |   4 +-
 hw/char/virtio-console.c|  35 +-
 hw/char/xen_console.c   |   4 +-
 hw/char/xilinx_uartlite.c   |   2 +-
 hw/ipmi/ipmi_bmc_extern.c   |   4 +-
 hw/mips/boston.c|   2 +-
 hw/mips/mips_malta.c|   2 +-
 hw/misc/ivshmem.c   |   6 +-
 hw/usb/ccid-card-passthru.c |   6 +-
 hw/usb/dev-serial.c |   7 +-
 hw/usb/redirect.c   |   7 +-
 monitor.c   |   4 +-
 net/colo-compare.c  |  10 +-
 net/filter-mirror.c |   8 +-
 net/slirp.c |   2 +-
 net/vhost-user.c|   7 +-
 qapi-schema.json|  40 +++
 qtest.c |   2 +-
 tests/test-char.c   | 263 +---
 tests/test-hmp.c|   1 +
 tests/vhost-user-test.c |   2 +-
 55 files changed, 685 insertions(+), 202 deletions(-)

-- 
2.7.4




Re: [Qemu-devel] [PATCH v3 03/13] char: chardevice hotswap

2017-06-15 Thread Anton Nefedov



On 06/13/2017 03:32 PM, Marc-André Lureau wrote:

Hi

On Tue, Jun 13, 2017 at 3:53 PM Anton Nefedov 
mailto:anton.nefe...@virtuozzo.com>> wrote:


 > The existing chr_write_lock belongs to Chardev.
 > For the hotswap case, we need to ensure that be->chr won't
change and
 > the old Chardev (together with its mutex) won't be destroyed
while it's
 > used in the write functions.
 >
 > Maybe we could move the lock to CharBackend, instead of
creating a new
 > one. But I guess this
 >1. won't work for mux, where multiple CharBackends share
the same
 > Chardev
 >2. won't work for some chardevs, like pty which uses the
lock for the
 > timer handler
 >
 > Sorry if I'm not explaining clearly enough or maybe I'm
missing some
 > easier solution?
 >
 >
 > It looks to me like you would get the same guarantees by using the
 > chr_write_lock directly:
 >
 > @@ -1381,7 +1374,7 @@ ChardevReturn *qmp_chardev_change(const
char *id,
 > ChardevBackend *backend,
 >   closed_sent = true;
 >   }
 >
 > -qemu_mutex_lock(&be->chr_lock);
 > +qemu_mutex_lock(&chr->chr_write_lock);
 >   chr->be = NULL;
 >   qemu_chr_fe_connect(be, chr_new, &error_abort);
 >
 > @@ -1389,14 +1382,14 @@ ChardevReturn *qmp_chardev_change(const char
 > *id, ChardevBackend *backend,
 >   error_setg(errp, "Chardev '%s' change failed",
chr_new->label);
 >   chr_new->be = NULL;
 >   qemu_chr_fe_connect(be, chr, &error_abort);
 > -qemu_mutex_unlock(&be->chr_lock);
 > +qemu_mutex_unlock(&chr->chr_write_lock);
 >   if (closed_sent) {
 >   qemu_chr_be_event(chr, CHR_EVENT_OPENED);
 >   }
 >   object_unref(OBJECT(chr_new));
 >   return NULL;
 >   }
 > -qemu_mutex_unlock(&be->chr_lock);
 > +qemu_mutex_unlock(&chr->chr_write_lock);
 >
 > I wonder if we should rename 'chr_write_lock' to just 'lock' :)
 >

hi,

but isn't there a potential race?

Assume thread-1 is somewhere in qemu_chr_write() and thread-2 is in
qmp_chardev_change().

T2 can change the chardev, and destroy the old one while T1 still has
the pointer ( = use after free).
Or T1 unlocks chr_write_lock, T2 acquires that in
qemu_chr_write_buffer(), then T1 destroys it together
with the chardev ( = undefined behaviour).


What I'm trying to say is that the critical section for the hotswap is
bigger than what chr_write_lock currently covers - we also need to cover
the be->chr pointer.
Or am I missing something?


Thanks for the detail, I think you are correct, and probably your 
solution is good. Another way would be to object_ref/unref the Chardev 
when using it in seperate thread, this way we wouldn't need an extra 
lock, I think. Paolo might be of good advice to get this right.




We also need to protect be->chr pointer, not just the Chardev itself, so
probably ref/unref won't be thread-safe, strictly speaking?

e.g.

  fe_write(CharBackend *be) {
ref(be->chr);
...
unref(be->chr); // oops, be->chr might have changed
  }

or

  fe_write(CharBackend *be) {
Chardev *chr = be->chr;
ref(chr); // not good either, chr might be destroyed already
...
unref(chr);
  }




--
Marc-André Lureau

/Anton



Re: [Qemu-devel] [PATCH v3 03/13] char: chardevice hotswap

2017-06-13 Thread Anton Nefedov


On 06/09/2017 05:53 PM, Marc-André Lureau wrote:



On Thu, Jun 1, 2017 at 3:23 PM Anton Nefedov 
mailto:anton.nefe...@virtuozzo.com>> wrote:




On 05/31/2017 10:20 PM, Marc-André Lureau wrote:
 > Hi
 >
 > On Tue, May 30, 2017 at 6:12 PM Anton Nefedov
 > mailto:anton.nefe...@virtuozzo.com>
<mailto:anton.nefe...@virtuozzo.com
<mailto:anton.nefe...@virtuozzo.com>>> wrote:
 >
 > This patch adds a possibility to change a char device without
a frontend
 > removal.
 >
 > 1. Ideally, it would have to happen transparently to a
frontend, i.e.
 > frontend would continue its regular operation.
 > However, backends are not stateless and are set up by the
frontends
 > via qemu_chr_fe_<> functions, and it's not (generally)
possible to
 > replay
 > that setup entirely in a backend code, as different chardevs
respond
 > to the setup calls differently, so do frontends work
differently basing
 > on those setup responses.
 > Moreover, some frontend can generally get and save the
backend pointer
 > (qemu_chr_fe_get_driver()), and it will become invalid after
backend
 > change.
 >
 > So, a frontend which would like to support chardev hotswap has to
 > register
 > a "backend change" handler, and redo its backend setup there.
 >
 > 2. Write path can be used by multiple threads and thus
protected with
 > chr_write_lock.
 > So hotswap also has to be protected so write functions won't
access
 > a backend being replaced.
 >
 >
 > Tbh, I don't understand the need for a different lock. Could you
 > explain? Even better would be to write a test that shows in which way
 > the lock helps.
 >

hi Marc-André,

The existing chr_write_lock belongs to Chardev.
For the hotswap case, we need to ensure that be->chr won't change and
the old Chardev (together with its mutex) won't be destroyed while it's
used in the write functions.

Maybe we could move the lock to CharBackend, instead of creating a new
one. But I guess this
   1. won't work for mux, where multiple CharBackends share the same
Chardev
   2. won't work for some chardevs, like pty which uses the lock for the
timer handler

Sorry if I'm not explaining clearly enough or maybe I'm missing some
easier solution?


It looks to me like you would get the same guarantees by using the 
chr_write_lock directly:


@@ -1381,7 +1374,7 @@ ChardevReturn *qmp_chardev_change(const char *id, 
ChardevBackend *backend,

  closed_sent = true;
  }

-qemu_mutex_lock(&be->chr_lock);
+qemu_mutex_lock(&chr->chr_write_lock);
  chr->be = NULL;
  qemu_chr_fe_connect(be, chr_new, &error_abort);

@@ -1389,14 +1382,14 @@ ChardevReturn *qmp_chardev_change(const char 
*id, ChardevBackend *backend,

  error_setg(errp, "Chardev '%s' change failed", chr_new->label);
  chr_new->be = NULL;
  qemu_chr_fe_connect(be, chr, &error_abort);
-qemu_mutex_unlock(&be->chr_lock);
+qemu_mutex_unlock(&chr->chr_write_lock);
  if (closed_sent) {
  qemu_chr_be_event(chr, CHR_EVENT_OPENED);
  }
  object_unref(OBJECT(chr_new));
  return NULL;
  }
-qemu_mutex_unlock(&be->chr_lock);
+qemu_mutex_unlock(&chr->chr_write_lock);

I wonder if we should rename 'chr_write_lock' to just 'lock' :)



hi,

but isn't there a potential race?

Assume thread-1 is somewhere in qemu_chr_write() and thread-2 is in
qmp_chardev_change().

T2 can change the chardev, and destroy the old one while T1 still has
the pointer ( = use after free).
Or T1 unlocks chr_write_lock, T2 acquires that in
qemu_chr_write_buffer(), then T1 destroys it together
with the chardev ( = undefined behaviour).

What I'm trying to say is that the critical section for the hotswap is
bigger than what chr_write_lock currently covers - we also need to cover
the be->chr pointer.
Or am I missing something?



I can try to add a test but can't quite see yet how to freeze the old
chardev somewhere in cc->chr_write() and hotswap it while it's there.


 >
 >
 > Signed-off-by: Anton Nefedov mailto:anton.nefe...@virtuozzo.com>
 > <mailto:anton.nefe...@virtuozzo.com
<mailto:anton.nefe...@virtuozzo.com>>>
 > Reviewed-by: Vladimir Sementsov-Ogievskiy
mailto:vsement...@virtuozzo.com>
 > <mailto:vsement...@virtuozzo.com
<mailto:v

Re: [Qemu-devel] [PATCH v2 02/15] file-posix: support BDRV_REQ_ALLOCATE

2017-06-02 Thread Anton Nefedov


On 06/01/2017 10:54 PM, Eric Blake wrote:

On 06/01/2017 02:49 PM, Eric Blake wrote:

On 06/01/2017 10:14 AM, Anton Nefedov wrote:

Current write_zeroes implementation is good enough to satisfy this flag too

Signed-off-by: Anton Nefedov 
---
  block/file-posix.c | 9 -
  1 file changed, 8 insertions(+), 1 deletion(-)


Are we sure that fallocate() is always fast, or are there some file
systems where it is no faster than manually writing zeroes?  I'm worried
that blindly claiming BDRV_REQ_ALLOCATE may fail if we encounter a libc


not so much fail as in "break the guest", but fail as in "take far more
time than we were expecting, pessimising our behavior to worse than if
we had not tried the allocation at all"


or kernel-based fallback that takes a slow patch on our behalf.





I would expect such filesystems to not support fallocate.

Though I must admit I can't see anywhere in the documentation that it 
MUST be strictly faster than writing zeroes; it would look very strange

to me if there were a slowpath fallback somewhere past the libc.

/Anton



Re: [Qemu-devel] [PATCH v2 03/15] blkdebug: support BDRV_REQ_ALLOCATE

2017-06-02 Thread Anton Nefedov



On 06/01/2017 10:50 PM, Eric Blake wrote:

On 06/01/2017 10:14 AM, Anton Nefedov wrote:

Support the flag if the underlying BDS supports it

Signed-off-by: Anton Nefedov 
---
  block/blkdebug.c | 3 ++-
  1 file changed, 2 insertions(+), 1 deletion(-)


Shouldn't other passthrough drivers (like raw-format.c) make this change
as well?



Right.

Wonder why they even enumerate those instead of just

bs->supported_zero_flags = bs->file->bs->supported_zero_flags;



diff --git a/block/blkdebug.c b/block/blkdebug.c
index a5196e8..8b1401b 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -415,7 +415,8 @@ static int blkdebug_open(BlockDriverState *bs, QDict 
*options, int flags,
  
  bs->supported_write_flags = BDRV_REQ_FUA &

  bs->file->bs->supported_write_flags;
-bs->supported_zero_flags = (BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) &
+bs->supported_zero_flags =
+(BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_ALLOCATE) &
  bs->file->bs->supported_zero_flags;
  ret = -EINVAL;
  





/Anton



Re: [Qemu-devel] [PATCH v2 01/15] block: introduce BDRV_REQ_ALLOCATE flag

2017-06-02 Thread Anton Nefedov



On 06/01/2017 10:07 PM, Eric Blake wrote:

On 06/01/2017 10:14 AM, Anton Nefedov wrote:

The flag is supposed to indicate that the region of the disk image has
to be sufficiently allocated so it reads as zeroes. The call with the flag
set has to return -ENOTSUP if allocation cannot be done efficiently
(i.e. without falling back to writing actual buffers)

Signed-off-by: Anton Nefedov 
---
  block/io.c| 19 ---
  block/trace-events|  1 +
  include/block/block.h |  6 +-
  3 files changed, 22 insertions(+), 4 deletions(-)


You may want to 'git config diff.orderFile /path/to/file' (with a
suitably populated file) so that .h files come first in your diffs, as
that can aid reviewers.  At one point, there was a thread about adding
such a file to qemu.git proper for everyone to share, although it seems
to have stalled.



Thanks, will do



diff --git a/block/io.c b/block/io.c
index ed31810..d47efa9 100644
--- a/block/io.c
+++ b/block/io.c
@@ -1272,7 +1272,7 @@ static int coroutine_fn 
bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
  assert(!bs->supported_zero_flags);
  }
  
-if (ret == -ENOTSUP) {

+if (ret == -ENOTSUP && !(flags & BDRV_REQ_ALLOCATE)) {


I'd feel MUCH better if you first fixed the conditional just above this
point to ensure that if the caller requests BDRV_REQ_ALLOCATE that we do
not call bdrv->bdrv_co_pwrite_zeroes() unless bs->supported_zero_flags
also mentions this bit.

Remember, the existing semantics of .bdrv_co_pwrite_zeroes() merely
state that we must return -ENOTSUP unless we can guarantee that we read
back as zeroes, but puts no timing constraints on it.  A driver that has
not been retrofitted to understand the BDRV_REQ_ALLOCATE flag will
therefore risk taking too long.  Using bs->supported_zero_flags as your
gate is what will let you avoid calling into a driver that has not been
audited for fitting the new contract.



Absolutely; I have even added that check but must have lost that at some
point.
Meant to add that much earlier though, to bdrv_co_pwrite_zeroes()


  /* Fall back to bounce buffer if write zeroes is unsupported */
  BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;
  
@@ -1355,8 +1355,8 @@ static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child,

  ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
  
  if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&

-!(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes &&
-qemu_iovec_is_zero(qiov)) {
+!(flags & BDRV_REQ_ZERO_WRITE) && !(flags & BDRV_REQ_ALLOCATE) &&
+drv->bdrv_co_pwrite_zeroes && qemu_iovec_is_zero(qiov)) {
  flags |= BDRV_REQ_ZERO_WRITE;
  if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
  flags |= BDRV_REQ_MAY_UNMAP;
@@ -1436,6 +1436,9 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild 
*child,
  
  assert(flags & BDRV_REQ_ZERO_WRITE);

  if (head_padding_bytes || tail_padding_bytes) {
+if (flags & BDRV_REQ_ALLOCATE) {
+return -ENOTSUP;
+}


Can we assert that BDRV_REQ_ALLOCATE will only be supplied by a caller
that is already using aligned values?  Or is that too strict?



as I understand the top driver should not care much about the child
driver alignment preferences? that's what the common bdrv_* interface is
there for


  buf = qemu_blockalign(bs, align);
  iov = (struct iovec) {
  .iov_base   = buf,
@@ -1534,6 +1537,11 @@ int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
  return ret;
  }
  
+if (qiov && flags & BDRV_REQ_ALLOCATE) {

+/* allocation request with qiov provided doesn't make much sense */
+return -ENOTSUP;


Should this be an assertion (bug in the program for mixing things that
don't make sense) rather than just a runtime error return?



incline to agree here


+}
+
  bdrv_inc_in_flight(bs);
  /*
   * Align write if necessary by performing a read-modify-write cycle.
@@ -1665,6 +1673,11 @@ int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, 
int64_t offset,
  {
  trace_bdrv_co_pwrite_zeroes(child->bs, offset, count, flags);
  
+if (flags & BDRV_REQ_MAY_UNMAP && flags & BDRV_REQ_ALLOCATE) {

+/* nonsense */
+return -ENOTSUP;
+}


Ditto.



yep


+
  if (!(child->bs->open_flags & BDRV_O_UNMAP)) {
  flags &= ~BDRV_REQ_MAY_UNMAP;
  }
diff --git a/block/trace-events b/block/trace-events
index 9a71c7f..a15c2cc 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -15,6 +15,7 @@ bdrv_aio_writev(void *bs, int64_t sector_num, int nb_sectors, void 
*opaque) "bs
  bdrv

[Qemu-devel] [PATCH v2 15/15] iotest 046: test simultaneous cluster write error case

2017-06-01 Thread Anton Nefedov
Signed-off-by: Anton Nefedov 
---
 tests/qemu-iotests/046 | 38 +-
 tests/qemu-iotests/046.out | 23 +++
 2 files changed, 60 insertions(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/046 b/tests/qemu-iotests/046
index f2ebecf..c210b55 100755
--- a/tests/qemu-iotests/046
+++ b/tests/qemu-iotests/046
@@ -29,7 +29,8 @@ status=1  # failure is the default!
 
 _cleanup()
 {
-   _cleanup_test_img
+_cleanup_test_img
+rm "$TEST_DIR/blkdebug.conf"
 }
 trap "_cleanup; exit \$status" 0 1 2 3 15
 
@@ -188,6 +189,37 @@ overlay_io | $QEMU_IO blkdebug::"$TEST_IMG" | 
_filter_qemu_io |\
sed -e 's/bytes at offset [0-9]*/bytes at offset XXX/g'
 
 echo
+echo "== Concurrency error case =="
+
+# 1. 1st request allocated the cluster, stop before it writes and updates L2
+# 2. 2nd request starts at the same cluster must complete write and start
+#waiting for the 1st to update L2
+# 3. Resume the 1st request to make it fail (injected error)
+# 4. 2nd request must wake and fail as well
+#1 cluster will end up leaked
+cat > "$TEST_DIR/blkdebug.conf" <

[Qemu-devel] [PATCH v2 14/15] qcow2: allow concurrent unaligned writes to the same clusters

2017-06-01 Thread Anton Nefedov
If COW area of a write request to unallocated cluster is empty,
concurrent write requests can be allowed with a little bit of
extra synchronization; so they don't have to wait until L2 is filled.

Let qcow2_cluster.c::handle_dependencies() do the most of the job:
  if there is an in-flight request to the same cluster,
  and the current request wants to write in its COW area,
  and its COW area is marked empty,
  - steal the allocated offset and write concurrently. Let the original
request update L2 later when it likes.

This gives an improvement for parallel misaligned writes to
unallocated clusters with no backing data:

HDD fio over xfs iodepth=4:
  seqwrite 4k:   18400 -> 22800 IOPS ( x1.24 )
  seqwrite 68k:   1600 ->  2300 IOPS ( x1.44 )

Signed-off-by: Anton Nefedov 
Signed-off-by: Denis V. Lunev 
---
 block/qcow2-cluster.c | 169 +++---
 block/qcow2.c |  28 -
 block/qcow2.h |  12 +++-
 3 files changed, 181 insertions(+), 28 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 2fa549d..03b2e6c 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -898,20 +898,32 @@ out:
 
 /*
  * Check if there already is an AIO write request in flight which allocates
- * the same cluster. In this case we need to wait until the previous
- * request has completed and updated the L2 table accordingly.
+ * the same cluster.
+ * In this case, check if that request has explicitly allowed to write
+ * in its COW area(s).
+ *   If yes - fill the meta to point to the same cluster.
+ *   If no  - we need to wait until the previous request has completed and
+ *updated the L2 table accordingly or
+ *has allowed writing in its COW area(s).
  * Returns:
  *   0   if there was no dependency. *cur_bytes indicates the number of
  *   bytes from guest_offset that can be read before the next
  *   dependency must be processed (or the request is complete).
- *   *m is not modified
+ *   *m, *host_offset are not modified
+ *
+ *   1   if there is a dependency but it is possible to write concurrently
+ *   *m is filled accordingly,
+ *   *cur_bytes may have decreased and describes
+ * the length of the area that can be written to,
+ *   *host_offset contains the starting host image offset to write to
  *
  *   -EAGAIN if we had to wait for another request. The caller
- *   must start over, so consider *cur_bytes undefined.
+ *   must start over, so consider *cur_bytes and *host_offset 
undefined.
  *   *m is not modified
  */
 static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
-uint64_t *cur_bytes, QCowL2Meta **m)
+   uint64_t *host_offset, uint64_t *cur_bytes,
+   QCowL2Meta **m)
 {
 BDRVQcow2State *s = bs->opaque;
 QCowL2Meta *old_alloc;
@@ -924,7 +936,7 @@ static int handle_dependencies(BlockDriverState *bs, 
uint64_t guest_offset,
 const uint64_t old_start = l2meta_cow_start(old_alloc);
 const uint64_t old_end = l2meta_cow_end(old_alloc);
 
-if (end <= old_start || start >= old_end) {
+if (end <= old_start || start >= old_end || old_alloc->piggybacked) {
 /* No intersection */
 continue;
 }
@@ -936,21 +948,95 @@ static int handle_dependencies(BlockDriverState *bs, 
uint64_t guest_offset,
 continue;
 }
 
-/* Stop if an l2meta already exists. After yielding, it wouldn't
- * be valid any more, so we'd have to clean up the old L2Metas
- * and deal with requests depending on them before starting to
- * gather new ones. Not worth the trouble. */
-if (*m) {
+/* offsets of the cluster we're intersecting in */
+const uint64_t cluster_start = start_of_cluster(s, start);
+const uint64_t cluster_end = cluster_start + s->cluster_size;
+
+const uint64_t old_data_start = old_start
++ old_alloc->cow_start.nb_bytes;
+const uint64_t old_data_end = old_alloc->offset
++ old_alloc->cow_end.offset;
+
+const bool conflict_in_data_area =
+end > old_data_start && start < old_data_end;
+const bool conflict_in_old_cow_start =
+/* 1). new write request area is before the old */
+start < old_data_start
+&& /* 2). old request did not allow writing in its cow area */
+!old_alloc->cow_start.reduced;
+const bool conflict_in_old_cow_end =
+/* 1). new write request area is after the old */
+start > old_data_start
+&& /* 2). old request did not allow writing in its cow area */
+!old_alloc->cow_end.reduced;
+
+if (conflict_in_data_a

[Qemu-devel] [PATCH v2 12/15] qcow2-cluster: slightly refactor handle_dependencies()

2017-06-01 Thread Anton Nefedov
  - assert the alignment on return if the allocation has to stop
(at the start of a running allocation)
  - make use of const specifiers for local variables

Signed-off-by: Anton Nefedov 
Signed-off-by: Denis V. Lunev 
---
 block/qcow2-cluster.c | 20 +++-
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 3dafd19..95e8862 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -900,15 +900,15 @@ out:
  * Check if there already is an AIO write request in flight which allocates
  * the same cluster. In this case we need to wait until the previous
  * request has completed and updated the L2 table accordingly.
- *
  * Returns:
  *   0   if there was no dependency. *cur_bytes indicates the number of
  *   bytes from guest_offset that can be read before the next
- *   dependency must be processed (or the request is complete)
+ *   dependency must be processed (or the request is complete).
+ *   *m is not modified
  *
- *   -EAGAIN if we had to wait for another request, previously gathered
- *   information on cluster allocation may be invalid now. The caller
- *   must start over anyway, so consider *cur_bytes undefined.
+ *   -EAGAIN if we had to wait for another request. The caller
+ *   must start over, so consider *cur_bytes undefined.
+ *   *m is not modified
  */
 static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
 uint64_t *cur_bytes, QCowL2Meta **m)
@@ -919,10 +919,10 @@ static int handle_dependencies(BlockDriverState *bs, 
uint64_t guest_offset,
 
 QLIST_FOREACH(old_alloc, &s->cluster_allocs, next_in_flight) {
 
-uint64_t start = guest_offset;
-uint64_t end = start + bytes;
-uint64_t old_start = l2meta_cow_start(old_alloc);
-uint64_t old_end = l2meta_cow_end(old_alloc);
+const uint64_t start = guest_offset;
+const uint64_t end = start + bytes;
+const uint64_t old_start = l2meta_cow_start(old_alloc);
+const uint64_t old_end = l2meta_cow_end(old_alloc);
 
 if (end <= old_start || start >= old_end) {
 /* No intersection */
@@ -939,6 +939,8 @@ static int handle_dependencies(BlockDriverState *bs, 
uint64_t guest_offset,
  * and deal with requests depending on them before starting to
  * gather new ones. Not worth the trouble. */
 if (bytes == 0 && *m) {
+/* start must be cluster aligned at this point */
+assert(start == start_of_cluster(s, start));
 *cur_bytes = 0;
 return 0;
 }
-- 
2.7.4




[Qemu-devel] [PATCH v2 08/15] qcow2: truncate preallocated space

2017-06-01 Thread Anton Nefedov
From: "Denis V. Lunev" 

This could be done after calculation of the end of data and metadata in
the qcow2 image.

Signed-off-by: Denis V. Lunev 
Signed-off-by: Anton Nefedov 
---
 block/qcow2-cluster.c  | 9 +
 block/qcow2-refcount.c | 7 +++
 block/qcow2.c  | 8 
 block/qcow2.h  | 3 +++
 4 files changed, 27 insertions(+)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 88dd555..c39e825 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -1958,3 +1958,12 @@ fail:
 g_free(l1_table);
 return ret;
 }
+
+void qcow2_update_data_end(BlockDriverState *bs, uint64_t off)
+{
+BDRVQcow2State *s = bs->opaque;
+
+if (s->data_end < off) {
+s->data_end = off;
+}
+}
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 873a1d2..8156466 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -744,6 +744,9 @@ static int QEMU_WARN_UNUSED_RESULT 
update_refcount(BlockDriverState *bs,
 ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
 if (ret < 0) {
 goto fail;
+} else {
+qcow2_update_data_end(bs, s->refcount_table_offset +
+s->refcount_table_size * sizeof(uint64_t));
 }
 }
 old_table_index = table_index;
@@ -865,6 +868,8 @@ retry:
 s->free_cluster_index - 1 > (INT64_MAX >> s->cluster_bits))
 {
 return -EFBIG;
+} else {
+qcow2_update_data_end(bs, s->free_cluster_index << s->cluster_bits);
 }
 
 #ifdef DEBUG_ALLOC2
@@ -929,6 +934,8 @@ int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, 
uint64_t offset,
 
 if (ret < 0) {
 return ret;
+} else {
+qcow2_update_data_end(bs, offset + (nb_clusters << s->cluster_bits));
 }
 
 return i;
diff --git a/block/qcow2.c b/block/qcow2.c
index b090833..33e5455 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1194,6 +1194,8 @@ static int qcow2_do_open(BlockDriverState *bs, QDict 
*options, int flags,
 }
 }
 
+s->data_end = bdrv_getlength(bs->file->bs);
+
 #ifdef DEBUG_ALLOC
 {
 BdrvCheckResult result = {0};
@@ -1941,12 +1943,18 @@ static int qcow2_inactivate(BlockDriverState *bs)
 static void qcow2_close(BlockDriverState *bs)
 {
 BDRVQcow2State *s = bs->opaque;
+
 qemu_vfree(s->l1_table);
 /* else pre-write overlap checks in cache_destroy may crash */
 s->l1_table = NULL;
 
 if (!(s->flags & BDRV_O_INACTIVE)) {
 qcow2_inactivate(bs);
+
+/* truncate preallocated space */
+if (!bs->read_only && s->data_end < bdrv_getlength(bs->file->bs)) {
+bdrv_truncate(bs->file, s->data_end, NULL);
+}
 }
 
 cache_clean_timer_del(bs);
diff --git a/block/qcow2.h b/block/qcow2.h
index a0d222d..e28c54a 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -297,6 +297,7 @@ typedef struct BDRVQcow2State {
 char *image_backing_format;
 
 uint64_t prealloc_size;
+uint64_t data_end;
 } BDRVQcow2State;
 
 typedef struct Qcow2COWRegion {
@@ -607,4 +608,6 @@ int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache 
*c, uint64_t offset,
 void **table);
 void qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table);
 
+void qcow2_update_data_end(BlockDriverState *bs, uint64_t off);
+
 #endif
-- 
2.7.4




[Qemu-devel] [PATCH v2 13/15] qcow2-cluster: make handle_dependencies() logic easier to follow

2017-06-01 Thread Anton Nefedov
Avoid complicated nested conditions; return or continue asap instead.
The logic is not changed.

Signed-off-by: Anton Nefedov 
Signed-off-by: Denis V. Lunev 
---
 block/qcow2-cluster.c | 45 ++---
 1 file changed, 22 insertions(+), 23 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 95e8862..2fa549d 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -926,32 +926,31 @@ static int handle_dependencies(BlockDriverState *bs, 
uint64_t guest_offset,
 
 if (end <= old_start || start >= old_end) {
 /* No intersection */
-} else {
-if (start < old_start) {
-/* Stop at the start of a running allocation */
-bytes = old_start - start;
-} else {
-bytes = 0;
-}
+continue;
+}
 
-/* Stop if already an l2meta exists. After yielding, it wouldn't
- * be valid any more, so we'd have to clean up the old L2Metas
- * and deal with requests depending on them before starting to
- * gather new ones. Not worth the trouble. */
-if (bytes == 0 && *m) {
-/* start must be cluster aligned at this point */
-assert(start == start_of_cluster(s, start));
-*cur_bytes = 0;
-return 0;
-}
+if (start < old_start) {
+/* Stop at the start of a running allocation */
+bytes = old_start - start;
+/* ..if there is no other conflict, keep checking */
+continue;
+}
 
-if (bytes == 0) {
-/* Wait for the dependency to complete. We need to recheck
- * the free/allocated clusters when we continue. */
-qemu_co_queue_wait(&old_alloc->dependent_requests, &s->lock);
-return -EAGAIN;
-}
+/* Stop if an l2meta already exists. After yielding, it wouldn't
+ * be valid any more, so we'd have to clean up the old L2Metas
+ * and deal with requests depending on them before starting to
+ * gather new ones. Not worth the trouble. */
+if (*m) {
+/* start must be cluster aligned at this point */
+assert(start == start_of_cluster(s, start));
+*cur_bytes = 0;
+return 0;
 }
+
+/* Wait for the dependency to complete. We need to recheck
+ * the free/allocated clusters when we continue. */
+qemu_co_queue_wait(&old_alloc->dependent_requests, &s->lock);
+return -EAGAIN;
 }
 
 /* Make sure that existing clusters and new allocations are only used up to
-- 
2.7.4




[Qemu-devel] [PATCH v2 05/15] qcow2: do not COW the empty areas

2017-06-01 Thread Anton Nefedov
If COW area of the newly allocated cluster is zeroes, there is no reason
to write zero sectors in perform_cow() again now as whole clusters are
zeroed out in single chunks by handle_alloc_space().

Introduce QCowL2Meta field "reduced", since the existing fields
(offset and nb_bytes) still has to keep other write requests from
simultaneous writing in the area

iotest 060:
write to the discarded cluster does not trigger COW anymore.
so, break on write_aio event instead, will work for the test
(but write won't fail anymore, so update reference output)

iotest 066:
cluster-alignment areas that were not really COWed are now detected
as zeroes, hence the initial write has to be exactly the same size for
the maps to match

performance tests: ===

qemu-io,
  results in seconds to complete (less is better)
  random write 4k to empty image, no backing
HDD
  64k cluster
128M over 128M image:   160 -> 160 ( x1  )
128M over   2G image:86 ->  84 ( x1  )
128M over   8G image:40 ->  29 ( x1.4 )
  1M cluster
 32M over   8G image:58 ->  23 ( x2.5 )

SSD
  64k cluster
  2G over   2G image:71 ->  38 (  x1.9 )
512M over   8G image:85 ->   8 ( x10.6 )
  1M cluster
128M over  32G image:   314 ->   2 ( x157  )

  - improvement grows bigger the bigger the cluster size,
  - first data portions to the fresh image benefit the most
  (more chance to hit an unallocated cluster)
  - SSD improvement is close to the IO length reduction rate
  (e.g. writing only 4k instead of 64k) gives theoretical x16
  and practical x10 improvement)

fio tests over xfs, empty image (cluster 64k), no backing,

  first megabytes of random writes:
randwrite 4k, size=8g:

  HDD (io_size=128m) :  730 ->  1050 IOPS ( x1.45)
  SSD (io_size=512m) : 1500 ->  7000 IOPS ( x4.7 )

  random writes io_size==image_size:
randwrite 4k, size=2g io_size=2g:
   HDD   : 200 IOPS (no difference)
   SSD   : 7500 ->  9500 IOPS ( x1.3 )

  sequential write:
seqwrite 4k, size=4g, iodepth=4
   SSD   : 7000 -> 18000 IOPS ( x2.6 )

  - numbers are similar to qemu-io tests, slightly less improvement
  (damped by fs?)

Signed-off-by: Anton Nefedov 
Signed-off-by: Denis V. Lunev 
---
 block/qcow2-cluster.c  |  4 +++-
 block/qcow2.c  | 32 ++--
 block/qcow2.h  |  4 
 tests/qemu-iotests/060 |  2 +-
 tests/qemu-iotests/060.out |  3 ++-
 tests/qemu-iotests/066 |  2 +-
 tests/qemu-iotests/066.out |  4 ++--
 7 files changed, 43 insertions(+), 8 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index d779ea1..ba8ce76 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -758,7 +758,7 @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m, 
Qcow2COWRegion *r)
 BDRVQcow2State *s = bs->opaque;
 int ret;
 
-if (r->nb_bytes == 0) {
+if (r->nb_bytes == 0 || r->reduced) {
 return 0;
 }
 
@@ -1267,10 +1267,12 @@ static int handle_alloc(BlockDriverState *bs, uint64_t 
guest_offset,
 .cow_start = {
 .offset = 0,
 .nb_bytes   = offset_into_cluster(s, guest_offset),
+.reduced= false,
 },
 .cow_end = {
 .offset = nb_bytes,
 .nb_bytes   = avail_bytes - nb_bytes,
+.reduced= false,
 },
 };
 qemu_co_queue_init(&(*m)->dependent_requests);
diff --git a/block/qcow2.c b/block/qcow2.c
index cd5efba..067bb87 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -64,6 +64,9 @@ typedef struct {
 #define  QCOW2_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA
 #define  QCOW2_EXT_MAGIC_FEATURE_TABLE 0x6803f857
 
+static bool is_zero_sectors(BlockDriverState *bs, int64_t start,
+uint32_t count);
+
 static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename)
 {
 const QCowHeader *cow_header = (const void *)buf;
@@ -1575,10 +1578,30 @@ fail:
 return ret;
 }
 
+static void handle_cow_reduce(BlockDriverState *bs, QCowL2Meta *m)
+{
+if (bs->encrypted) {
+return;
+}
+if (!m->cow_start.reduced && m->cow_start.nb_bytes != 0 &&
+is_zero_sectors(bs,
+(m->offset + m->cow_start.offset) >> BDRV_SECTOR_BITS,
+m->cow_start.nb_bytes >> BDRV_SECTOR_BITS)) {
+m->cow_start.reduced = true;
+}
+if (!m->cow_end.reduced && m->cow_end.nb_bytes != 0 &&
+is_zero_sectors(bs,
+(m->offset + m->cow_end.offset) >> BDRV_SECTOR_BITS,
+m->cow_end.nb_bytes >> BDRV_SECTOR_BITS)) {
+m->cow_end.reduced = true;
+}
+}
+
 static void handle_alloc_space(BlockDriverState *bs, QCowL

[Qemu-devel] [PATCH v2 11/15] qcow2: fix misleading comment about L2 linking

2017-06-01 Thread Anton Nefedov
Signed-off-by: Anton Nefedov 
Signed-off-by: Denis V. Lunev 
---
 block/qcow2-cluster.c | 10 --
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index ed65961..3dafd19 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -827,12 +827,10 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, 
QCowL2Meta *m)
 
 assert(l2_index + m->nb_clusters <= s->l2_size);
 for (i = 0; i < m->nb_clusters; i++) {
-/* if two concurrent writes happen to the same unallocated cluster
- * each write allocates separate cluster and writes data concurrently.
- * The first one to complete updates l2 table with pointer to its
- * cluster the second one has to do RMW (which is done above by
- * perform_cow()), update l2 table with its cluster pointer and free
- * old cluster. This is what this loop does */
+/* handle_dependencies() protects from normal cluster allocation
+ * collision; still L2 entry might be !0 in case of zero or compressed
+ * cluster reusage or writing over the snapshot
+ */
 if (l2_table[l2_index + i] != 0) {
 old_cluster[j++] = l2_table[l2_index + i];
 }
-- 
2.7.4




[Qemu-devel] [PATCH v2 07/15] qcow2: set inactive flag

2017-06-01 Thread Anton Nefedov
Qcow2State and BlockDriverState flags have to be in sync

Signed-off-by: Anton Nefedov 
---
 block/qcow2.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/block/qcow2.c b/block/qcow2.c
index c6fb714..b090833 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1932,6 +1932,7 @@ static int qcow2_inactivate(BlockDriverState *bs)
 
 if (result == 0) {
 qcow2_mark_clean(bs);
+s->flags |= BDRV_O_INACTIVE;
 }
 
 return result;
-- 
2.7.4




[Qemu-devel] [PATCH v2 00/15] qcow2: space preallocation and COW improvements

2017-06-01 Thread Anton Nefedov
Changes in v2:
  - introduce new BDRV flag for write_zeroes()
  instead of using driver callback directly.
Skipped introducing new functions like bdrv_co_pallocate() for now:
  1. it seems ok to keep calling this write_zeroes() as zeroes
  are expected;
  2. most of the code can be reused now anyway, so changes to
  write_zeroes() path are not significant
  3. write_zeroes() alignment and max-request limits can also be reused

As a possible alternative we can have bdrv_co_pallocate() which can
switch to pwrite_zeroes(,flags|=BDRV_REQ_ALLOCATE) early.



This pull request is to address a few performance problems of qcow2 format:

  1. non cluster-aligned write requests (to unallocated clusters) explicitly
pad data with zeroes if there is no backing data. This can be avoided
and the whole clusters are preallocated and zeroed in a single
efficient write_zeroes() operation, also providing better host file
continuity

  2. moreover, efficient write_zeroes() operation can be used to preallocate
space megabytes ahead which gives noticeable improvement on some storage
types (e.g. distributed storages where space allocation operation is
expensive)

  3. preallocating/zeroing the clusters in advance makes possible to enable
simultaneous writes to the same unallocated cluster, which is beneficial
for parallel sequential write operations which are not cluster-aligned

Performance test results are added to commit messages (see patch 3, 12)

Anton Nefedov (11):
  block: introduce BDRV_REQ_ALLOCATE flag
  file-posix: support BDRV_REQ_ALLOCATE
  blkdebug: support BDRV_REQ_ALLOCATE
  qcow2: do not COW the empty areas
  qcow2: set inactive flag
  qcow2: handle_prealloc(): find out if area zeroed by earlier
preallocation
  qcow2: fix misleading comment about L2 linking
  qcow2-cluster: slightly refactor handle_dependencies()
  qcow2-cluster: make handle_dependencies() logic easier to follow
  qcow2: allow concurrent unaligned writes to the same clusters
  iotest 046: test simultaneous cluster write error case

Denis V. Lunev (3):
  qcow2: alloc space for COW in one chunk
  qcow2: preallocation at image expand
  qcow2: truncate preallocated space

Pavel Butsykin (1):
  qcow2: check space leak at the end of the image

 block/blkdebug.c   |   3 +-
 block/file-posix.c |   9 +-
 block/io.c |  19 ++-
 block/qcow2-cache.c|   3 +
 block/qcow2-cluster.c  | 218 +++--
 block/qcow2-refcount.c |  21 +++
 block/qcow2.c  | 273 -
 block/qcow2.h  |  26 
 block/trace-events |   1 +
 include/block/block.h  |   6 +-
 tests/qemu-iotests/026.out | 104 ++
 tests/qemu-iotests/026.out.nocache | 104 ++
 tests/qemu-iotests/029.out |   5 +-
 tests/qemu-iotests/046 |  38 +-
 tests/qemu-iotests/046.out |  23 
 tests/qemu-iotests/060 |   2 +-
 tests/qemu-iotests/060.out |  13 +-
 tests/qemu-iotests/061.out |   5 +-
 tests/qemu-iotests/066 |   2 +-
 tests/qemu-iotests/066.out |   9 +-
 tests/qemu-iotests/098.out |   7 +-
 tests/qemu-iotests/108.out |   5 +-
 tests/qemu-iotests/112.out |   5 +-
 23 files changed, 789 insertions(+), 112 deletions(-)

-- 
2.7.4




[Qemu-devel] [PATCH v2 06/15] qcow2: preallocation at image expand

2017-06-01 Thread Anton Nefedov
From: "Denis V. Lunev" 

This patch adds image preallocation at expand to provide better locality
of QCOW2 image file and optimize this procedure for some distributed
storages where this procedure is slow.

Image expand requests have to be suspended until the allocation is
performed which is done via special QCowL2Meta.
This meta is invisible to handle_dependencies() code.
This is the main reason for also calling preallocation before metadata
write: it might intersect with preallocation triggered by another IO,
and has to yield

Signed-off-by: Denis V. Lunev 
Signed-off-by: Anton Nefedov 
---
 block/qcow2-cache.c|   3 +
 block/qcow2-cluster.c  |   5 ++
 block/qcow2-refcount.c |  14 +
 block/qcow2.c  | 147 +
 block/qcow2.h  |   5 ++
 5 files changed, 174 insertions(+)

diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c
index 1d25147..aa9da5f 100644
--- a/block/qcow2-cache.c
+++ b/block/qcow2-cache.c
@@ -204,6 +204,9 @@ static int qcow2_cache_entry_flush(BlockDriverState *bs, 
Qcow2Cache *c, int i)
 return ret;
 }
 
+/* check and preallocate extra space if touching a fresh metadata cluster 
*/
+qcow2_handle_prealloc(bs, c->entries[i].offset, s->cluster_size);
+
 if (c == s->refcount_block_cache) {
 BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_UPDATE_PART);
 } else if (c == s->l2_table_cache) {
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index ba8ce76..88dd555 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -108,6 +108,9 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t 
min_size,
 goto fail;
 }
 
+qcow2_handle_prealloc(bs, new_l1_table_offset,
+  QEMU_ALIGN_UP(new_l1_size2, s->cluster_size));
+
 BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE);
 for(i = 0; i < s->l1_size; i++)
 new_l1_table[i] = cpu_to_be64(new_l1_table[i]);
@@ -1821,6 +1824,8 @@ static int expand_zero_clusters_in_l1(BlockDriverState 
*bs, uint64_t *l1_table,
 goto fail;
 }
 
+qcow2_handle_prealloc(bs, offset, s->cluster_size);
+
 ret = bdrv_pwrite_zeroes(bs->file, offset, s->cluster_size, 0);
 if (ret < 0) {
 if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 7c06061..873a1d2 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -547,6 +547,8 @@ static int alloc_refcount_block(BlockDriverState *bs,
 }
 
 /* Write refcount blocks to disk */
+qcow2_handle_prealloc(bs, meta_offset, blocks_clusters * s->cluster_size);
+
 BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
 ret = bdrv_pwrite_sync(bs->file, meta_offset, new_blocks,
 blocks_clusters * s->cluster_size);
@@ -561,6 +563,10 @@ static int alloc_refcount_block(BlockDriverState *bs,
 cpu_to_be64s(&new_table[i]);
 }
 
+qcow2_handle_prealloc(bs, table_offset,
+  QEMU_ALIGN_UP(table_size * sizeof(uint64_t),
+s->cluster_size));
+
 BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
 ret = bdrv_pwrite_sync(bs->file, table_offset, new_table,
 table_size * sizeof(uint64_t));
@@ -2104,6 +2110,8 @@ write_refblocks:
 goto fail;
 }
 
+qcow2_handle_prealloc(bs, refblock_offset, s->cluster_size);
+
 /* The size of *refcount_table is always cluster-aligned, therefore the
  * write operation will not overflow */
 on_disk_refblock = (void *)((char *) *refcount_table +
@@ -2158,6 +2166,8 @@ write_refblocks:
 }
 
 assert(reftable_size < INT_MAX / sizeof(uint64_t));
+qcow2_handle_prealloc(bs, reftable_offset,
+  reftable_size * sizeof(uint64_t));
 ret = bdrv_pwrite(bs->file, reftable_offset, on_disk_reftable,
   reftable_size * sizeof(uint64_t));
 if (ret < 0) {
@@ -2845,6 +2855,10 @@ int qcow2_change_refcount_order(BlockDriverState *bs, 
int refcount_order,
 cpu_to_be64s(&new_reftable[i]);
 }
 
+qcow2_handle_prealloc(bs, new_reftable_offset,
+  QEMU_ALIGN_UP(new_reftable_size * sizeof(uint64_t),
+s->cluster_size));
+
 ret = bdrv_pwrite(bs->file, new_reftable_offset, new_reftable,
   new_reftable_size * sizeof(uint64_t));
 
diff --git a/block/qcow2.c b/block/qcow2.c
index 067bb87..c6fb714 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -464,6 +464,11 @@ static QemuOptsList qcow2_runtime_opts = {
 .type = QEMU_OPT_NUMBER,
 .help = "Clean unused cache entries after this time (in seconds)",
 },
+{
+.name = QCOW2_OPT_PRE

[Qemu-devel] [PATCH v2 09/15] qcow2: check space leak at the end of the image

2017-06-01 Thread Anton Nefedov
From: Pavel Butsykin 

Preallocating memory in the image may remain unused after the fall, for
the qcow2_check adds the ability to identify and fix it, so as not to
store extra memory on the host.

Signed-off-by: Pavel Butsykin 
Signed-off-by: Denis V. Lunev 
Signed-off-by: Anton Nefedov 
---
 block/qcow2.c  |  31 +++
 tests/qemu-iotests/026.out | 104 -
 tests/qemu-iotests/026.out.nocache | 104 -
 tests/qemu-iotests/029.out |   5 +-
 tests/qemu-iotests/060.out |  10 +++-
 tests/qemu-iotests/061.out |   5 +-
 tests/qemu-iotests/066.out |   5 +-
 tests/qemu-iotests/098.out |   7 ++-
 tests/qemu-iotests/108.out |   5 +-
 tests/qemu-iotests/112.out |   5 +-
 10 files changed, 225 insertions(+), 56 deletions(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index 33e5455..809102a 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -322,6 +322,32 @@ int qcow2_mark_consistent(BlockDriverState *bs)
 return 0;
 }
 
+static int qcow2_check_extra_preallocation(BlockDriverState *bs,
+BdrvCheckResult *res, BdrvCheckMode fix)
+{
+BDRVQcow2State *s = bs->opaque;
+uint64_t img_size = bdrv_getlength(bs->file->bs);
+
+if (res->image_end_offset < img_size) {
+uint64_t count =
+DIV_ROUND_UP(img_size - res->image_end_offset, s->cluster_size);
+fprintf(stderr, "%s space leaked at the end of the image %jd\n",
+fix & BDRV_FIX_LEAKS ? "Repairing" : "ERROR",
+img_size - res->image_end_offset);
+res->leaks += count;
+if (fix & BDRV_FIX_LEAKS) {
+int ret = bdrv_truncate(bs->file, res->image_end_offset, NULL);
+if (ret < 0) {
+res->check_errors++;
+return ret;
+}
+res->leaks_fixed += count;
+}
+}
+
+return 0;
+}
+
 static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result,
BdrvCheckMode fix)
 {
@@ -330,6 +356,11 @@ static int qcow2_check(BlockDriverState *bs, 
BdrvCheckResult *result,
 return ret;
 }
 
+ret = qcow2_check_extra_preallocation(bs, result, fix);
+if (ret < 0) {
+return ret;
+}
+
 if (fix && result->check_errors == 0 && result->corruptions == 0) {
 ret = qcow2_mark_clean(bs);
 if (ret < 0) {
diff --git a/tests/qemu-iotests/026.out b/tests/qemu-iotests/026.out
index 86a50a2..e8cf348 100644
--- a/tests/qemu-iotests/026.out
+++ b/tests/qemu-iotests/026.out
@@ -5,7 +5,10 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
 
 Event: l1_update; errno: 5; imm: off; once: on; write
 write failed: Input/output error
-No errors were found on the image.
+ERROR space leaked at the end of the image 1024
+
+1 leaked clusters were found on the image.
+This means waste of disk space, but no harm to data.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
 
 Event: l1_update; errno: 5; imm: off; once: on; write -b
@@ -33,7 +36,10 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
 
 Event: l1_update; errno: 28; imm: off; once: on; write
 write failed: No space left on device
-No errors were found on the image.
+ERROR space leaked at the end of the image 1024
+
+1 leaked clusters were found on the image.
+This means waste of disk space, but no harm to data.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
 
 Event: l1_update; errno: 28; imm: off; once: on; write -b
@@ -181,7 +187,10 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
 
 Event: l2_alloc_write; errno: 5; imm: off; once: on; write
 write failed: Input/output error
-No errors were found on the image.
+ERROR space leaked at the end of the image 1024
+
+1 leaked clusters were found on the image.
+This means waste of disk space, but no harm to data.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
 
 Event: l2_alloc_write; errno: 5; imm: off; once: on; write -b
@@ -207,7 +216,10 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
 
 Event: l2_alloc_write; errno: 28; imm: off; once: on; write
 write failed: No space left on device
-No errors were found on the image.
+ERROR space leaked at the end of the image 1024
+
+1 leaked clusters were found on the image.
+This means waste of disk space, but no harm to data.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
 
 Event: l2_alloc_write; errno: 28; imm: off; once: on; write -b
@@ -468,20 +480,27 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
 
 Event: refblock_alloc_hookup; errno: 28; imm: off; once: on; write
 write failed: No space left on device
-No errors were found on the image.
+ERROR space leaked a

[Qemu-devel] [PATCH v2 03/15] blkdebug: support BDRV_REQ_ALLOCATE

2017-06-01 Thread Anton Nefedov
Support the flag if the underlying BDS supports it

Signed-off-by: Anton Nefedov 
---
 block/blkdebug.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/block/blkdebug.c b/block/blkdebug.c
index a5196e8..8b1401b 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -415,7 +415,8 @@ static int blkdebug_open(BlockDriverState *bs, QDict 
*options, int flags,
 
 bs->supported_write_flags = BDRV_REQ_FUA &
 bs->file->bs->supported_write_flags;
-bs->supported_zero_flags = (BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) &
+bs->supported_zero_flags =
+(BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_ALLOCATE) &
 bs->file->bs->supported_zero_flags;
 ret = -EINVAL;
 
-- 
2.7.4




[Qemu-devel] [PATCH v2 02/15] file-posix: support BDRV_REQ_ALLOCATE

2017-06-01 Thread Anton Nefedov
Current write_zeroes implementation is good enough to satisfy this flag too

Signed-off-by: Anton Nefedov 
---
 block/file-posix.c | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/block/file-posix.c b/block/file-posix.c
index de2d3a2..117bbee 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -527,7 +527,6 @@ static int raw_open_common(BlockDriverState *bs, QDict 
*options,
 
 s->has_discard = true;
 s->has_write_zeroes = true;
-bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP;
 if ((bs->open_flags & BDRV_O_NOCACHE) != 0) {
 s->needs_alignment = true;
 }
@@ -577,6 +576,11 @@ static int raw_open_common(BlockDriverState *bs, QDict 
*options,
 }
 #endif
 
+bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP;
+if (s->has_write_zeroes || s->has_fallocate) {
+bs->supported_zero_flags |= BDRV_REQ_ALLOCATE;
+}
+
 ret = 0;
 fail:
 if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) {
@@ -1390,6 +1394,9 @@ static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData 
*aiocb)
 }
 #endif
 
+if (!s->has_fallocate) {
+aiocb->bs->supported_zero_flags &= ~BDRV_REQ_ALLOCATE;
+}
 return -ENOTSUP;
 }
 
-- 
2.7.4




[Qemu-devel] [PATCH v2 01/15] block: introduce BDRV_REQ_ALLOCATE flag

2017-06-01 Thread Anton Nefedov
The flag is supposed to indicate that the region of the disk image has
to be sufficiently allocated so it reads as zeroes. The call with the flag
set has to return -ENOTSUP if allocation cannot be done efficiently
(i.e. without falling back to writing actual buffers)

Signed-off-by: Anton Nefedov 
---
 block/io.c| 19 ---
 block/trace-events|  1 +
 include/block/block.h |  6 +-
 3 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/block/io.c b/block/io.c
index ed31810..d47efa9 100644
--- a/block/io.c
+++ b/block/io.c
@@ -1272,7 +1272,7 @@ static int coroutine_fn 
bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
 assert(!bs->supported_zero_flags);
 }
 
-if (ret == -ENOTSUP) {
+if (ret == -ENOTSUP && !(flags & BDRV_REQ_ALLOCATE)) {
 /* Fall back to bounce buffer if write zeroes is unsupported */
 BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;
 
@@ -1355,8 +1355,8 @@ static int coroutine_fn bdrv_aligned_pwritev(BdrvChild 
*child,
 ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
 
 if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
-!(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes &&
-qemu_iovec_is_zero(qiov)) {
+!(flags & BDRV_REQ_ZERO_WRITE) && !(flags & BDRV_REQ_ALLOCATE) &&
+drv->bdrv_co_pwrite_zeroes && qemu_iovec_is_zero(qiov)) {
 flags |= BDRV_REQ_ZERO_WRITE;
 if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
 flags |= BDRV_REQ_MAY_UNMAP;
@@ -1436,6 +1436,9 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild 
*child,
 
 assert(flags & BDRV_REQ_ZERO_WRITE);
 if (head_padding_bytes || tail_padding_bytes) {
+if (flags & BDRV_REQ_ALLOCATE) {
+return -ENOTSUP;
+}
 buf = qemu_blockalign(bs, align);
 iov = (struct iovec) {
 .iov_base   = buf,
@@ -1534,6 +1537,11 @@ int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
 return ret;
 }
 
+if (qiov && flags & BDRV_REQ_ALLOCATE) {
+/* allocation request with qiov provided doesn't make much sense */
+return -ENOTSUP;
+}
+
 bdrv_inc_in_flight(bs);
 /*
  * Align write if necessary by performing a read-modify-write cycle.
@@ -1665,6 +1673,11 @@ int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, 
int64_t offset,
 {
 trace_bdrv_co_pwrite_zeroes(child->bs, offset, count, flags);
 
+if (flags & BDRV_REQ_MAY_UNMAP && flags & BDRV_REQ_ALLOCATE) {
+/* nonsense */
+return -ENOTSUP;
+}
+
 if (!(child->bs->open_flags & BDRV_O_UNMAP)) {
 flags &= ~BDRV_REQ_MAY_UNMAP;
 }
diff --git a/block/trace-events b/block/trace-events
index 9a71c7f..a15c2cc 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -15,6 +15,7 @@ bdrv_aio_writev(void *bs, int64_t sector_num, int nb_sectors, 
void *opaque) "bs
 bdrv_co_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num 
%"PRId64" nb_sectors %d"
 bdrv_co_writev(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num 
%"PRId64" nb_sectors %d"
 bdrv_co_pwrite_zeroes(void *bs, int64_t offset, int count, int flags) "bs %p 
offset %"PRId64" count %d flags %#x"
+bdrv_co_allocate(void *bs, int64_t offset, int count) "bs %p offset %"PRId64" 
count %d"
 bdrv_co_do_copy_on_readv(void *bs, int64_t offset, unsigned int bytes, int64_t 
cluster_offset, unsigned int cluster_bytes) "bs %p offset %"PRId64" bytes %u 
cluster_offset %"PRId64" cluster_bytes %u"
 
 # block/stream.c
diff --git a/include/block/block.h b/include/block/block.h
index 9b355e9..53a357c 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -65,9 +65,13 @@ typedef enum {
 BDRV_REQ_NO_SERIALISING = 0x8,
 BDRV_REQ_FUA= 0x10,
 BDRV_REQ_WRITE_COMPRESSED   = 0x20,
+/* BDRV_REQ_ALLOCATE is used to indicate that the driver is to
+ * efficiently allocate the space so it reads as zeroes or return an error
+ */
+BDRV_REQ_ALLOCATE   = 0x40,
 
 /* Mask of valid flags */
-BDRV_REQ_MASK   = 0x3f,
+BDRV_REQ_MASK   = 0x7f,
 } BdrvRequestFlags;
 
 typedef struct BlockSizes {
-- 
2.7.4




[Qemu-devel] [PATCH v2 04/15] qcow2: alloc space for COW in one chunk

2017-06-01 Thread Anton Nefedov
From: "Denis V. Lunev" 

Currently each single write operation can result in 3 write operations
if guest offsets are not cluster aligned. One write is performed for the
real payload and two for COW-ed areas. Thus the data possibly lays
non-contiguously on the host filesystem. This will reduce further
sequential read performance significantly.

The patch allocates the space in the file with cluster granularity,
ensuring
  1. better host offset locality
  2. less space allocation operations
 (which can be expensive on distributed storage)

Signed-off-by: Denis V. Lunev 
Signed-off-by: Anton Nefedov 
---
 block/qcow2.c | 24 +++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index b3ba5da..cd5efba 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1575,6 +1575,24 @@ fail:
 return ret;
 }
 
+static void handle_alloc_space(BlockDriverState *bs, QCowL2Meta *l2meta)
+{
+BDRVQcow2State *s = bs->opaque;
+QCowL2Meta *m;
+
+for (m = l2meta; m != NULL; m = m->next) {
+uint64_t bytes = m->nb_clusters << s->cluster_bits;
+
+if (m->cow_start.nb_bytes == 0 && m->cow_end.nb_bytes == 0) {
+continue;
+}
+
+/* try to alloc host space in one chunk for better locality */
+bdrv_co_pwrite_zeroes(bs->file, m->alloc_offset, bytes,
+  BDRV_REQ_ALLOCATE);
+}
+}
+
 static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset,
  uint64_t bytes, QEMUIOVector *qiov,
  int flags)
@@ -1656,8 +1674,12 @@ static coroutine_fn int 
qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset,
 if (ret < 0) {
 goto fail;
 }
-
 qemu_co_mutex_unlock(&s->lock);
+
+if (bs->file->bs->supported_zero_flags & BDRV_REQ_ALLOCATE) {
+handle_alloc_space(bs, l2meta);
+}
+
 BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
 trace_qcow2_writev_data(qemu_coroutine_self(),
 cluster_offset + offset_in_cluster);
-- 
2.7.4




[Qemu-devel] [PATCH v2 10/15] qcow2: handle_prealloc(): find out if area zeroed by earlier preallocation

2017-06-01 Thread Anton Nefedov
Signed-off-by: Anton Nefedov 
Signed-off-by: Denis V. Lunev 
---
 block/qcow2-cluster.c | 2 ++
 block/qcow2.c | 8 +++-
 block/qcow2.h | 4 
 3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index c39e825..ed65961 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -1143,6 +1143,7 @@ static int handle_alloc(BlockDriverState *bs, uint64_t 
guest_offset,
 uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
 {
 BDRVQcow2State *s = bs->opaque;
+const uint64_t old_data_end = s->data_end;
 int l2_index;
 uint64_t *l2_table;
 uint64_t entry;
@@ -1264,6 +1265,7 @@ static int handle_alloc(BlockDriverState *bs, uint64_t 
guest_offset,
 .alloc_offset   = alloc_cluster_offset,
 .offset = start_of_cluster(s, guest_offset),
 .nb_clusters= nb_clusters,
+.clusters_are_trailing = alloc_cluster_offset >= old_data_end,
 
 .keep_old_clusters  = keep_old_clusters,
 
diff --git a/block/qcow2.c b/block/qcow2.c
index 809102a..92d0af6 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1686,7 +1686,13 @@ restart:
 
 if (end <= bdrv_getlength(file)) {
 /* No need to care, file size will not be changed */
-return false;
+
+/* We're safe to assume that the area is zeroes if the area
+ * was allocated at the end of data (s->data_end).
+ * In this case, the only way for file length to be bigger is that
+ * the area was preallocated by another request.
+ */
+return m->clusters_are_trailing;
 }
 
 meta = g_alloca(sizeof(*meta));
diff --git a/block/qcow2.h b/block/qcow2.h
index e28c54a..2fd8510 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -333,6 +333,10 @@ typedef struct QCowL2Meta
 /** Do not free the old clusters */
 bool keep_old_clusters;
 
+/** True if the area is allocated after the end of data area
+ *  (i.e. >= s->data_end), which means that it is zeroed */
+bool clusters_are_trailing;
+
 /**
  * Requests that overlap with this allocation and wait to be restarted
  * when the allocating request has completed.
-- 
2.7.4




Re: [Qemu-devel] [PATCH v3 10/13] hmp: add hmp analogue for qmp-chardev-change

2017-06-01 Thread Anton Nefedov



On 06/01/2017 10:12 AM, Marc-André Lureau wrote:

Hi

On Tue, May 30, 2017 at 6:08 PM Anton Nefedov 
mailto:anton.nefe...@virtuozzo.com>> wrote:


Signed-off-by: Anton Nefedov mailto:anton.nefe...@virtuozzo.com>>
Reviewed-by: Vladimir Sementsov-Ogievskiy mailto:vsement...@virtuozzo.com>>
CC: Dr. David Alan Gilbert mailto:dgilb...@redhat.com>>
---
  chardev/char.c|  2 +-
  hmp-commands.hx   | 16 
  hmp.c | 34 ++
  hmp.h |  1 +
  include/sysemu/char.h | 12 
  5 files changed, 64 insertions(+), 1 deletion(-)

diff --git a/chardev/char.c b/chardev/char.c
index 2d6e204..1e2a2dd 100644
--- a/chardev/char.c
+++ b/chardev/char.c
@@ -888,7 +888,7 @@ help_string_append(const char *name, void *opaque)
  g_string_append_printf(str, "\n%s", name);
  }

-static ChardevBackend *qemu_chr_parse_opts(QemuOpts *opts, Error
**errp)
+ChardevBackend *qemu_chr_parse_opts(QemuOpts *opts, Error **errp)
  {
  Error *local_err = NULL;
  const ChardevClass *cc;
diff --git a/hmp-commands.hx b/hmp-commands.hx
index baeac47..19bfddd 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -1727,6 +1727,22 @@ chardev_add accepts the same parameters as
the -chardev command line switch.
  ETEXI

  {
+.name   = "chardev-change",
+.args_type  = "id:s,args:s",
+.params = "id args",
+.help   = "change chardev",
+.cmd= hmp_chardev_change,
+},
+
+STEXI
+@item chardev-change args
+@findex chardev-change
+chardev_change accepts existing chardev @var{id} and then the same
arguments


chardev_change vs chardev-change. Apparently, there is this difference 
for chardev-add too




Done.


+as the -chardev command line switch (except for "id").
+
+ETEXI
+
+{
  .name   = "chardev-remove",
  .args_type  = "id:s",
  .params = "id",
diff --git a/hmp.c b/hmp.c
index 20f5dab..7660495 100644
--- a/hmp.c
+++ b/hmp.c
@@ -2224,6 +2224,40 @@ void hmp_chardev_add(Monitor *mon, const
QDict *qdict)
  hmp_handle_error(mon, &err);
  }

+void hmp_chardev_change(Monitor *mon, const QDict *qdict)
+{
+const char *args = qdict_get_str(qdict, "args");
+const char *id;
+Error *err = NULL;
+ChardevBackend *backend = NULL;
+ChardevReturn *ret = NULL;
+QemuOpts *opts =
qemu_opts_parse_noisily(qemu_find_opts("chardev"), args,
+ true);
+if (!opts) {
+error_setg(&err, "Parsing chardev args failed");
+goto end;
+}
+
+id = qdict_get_str(qdict, "id");
+if (qemu_opts_id(opts)) {
+error_setg(&err, "Unexpected 'id' parameter");


enlighten me, how is this triggered?



correct variant:

  chardev-change char2 pipe,path=/tmp/cent.fifo

vs. (without this check)

  (qemu) chardev-change char2 pipe,id=char2,path=/tmp/cent.fifo
  chardev-change char2 pipe,id=char2,path=/tmp/cent.fifo
  Duplicate ID 'char2' for chardev
  Parsing chardev args failed

or

  (qemu) chardev-change char2 pipe,id=newchar3,path=/tmp/cent.fifo

the command would succeed but the user might expect the id to change,
but it wouldn't


+goto end;
+}
+
+backend = qemu_chr_parse_opts(opts, &err);
+if (!backend) {
+goto end;
+}
+
+ret = qmp_chardev_change(id, backend, &err);
+
+end:
+qapi_free_ChardevReturn(ret);
+qapi_free_ChardevBackend(backend);
+qemu_opts_del(opts);
+hmp_handle_error(mon, &err);
+}
+
  void hmp_chardev_remove(Monitor *mon, const QDict *qdict)
  {
  Error *local_err = NULL;
diff --git a/hmp.h b/hmp.h
index d8b94ce..23e035c 100644
--- a/hmp.h
+++ b/hmp.h
@@ -102,6 +102,7 @@ void hmp_nbd_server_start(Monitor *mon, const
QDict *qdict);
  void hmp_nbd_server_add(Monitor *mon, const QDict *qdict);
  void hmp_nbd_server_stop(Monitor *mon, const QDict *qdict);
  void hmp_chardev_add(Monitor *mon, const QDict *qdict);
+void hmp_chardev_change(Monitor *mon, const QDict *qdict);
  void hmp_chardev_remove(Monitor *mon, const QDict *qdict);
  void hmp_qemu_io(Monitor *mon, const QDict *qdict);
  void hmp_cpu_add(Monitor *mon, const QDict *qdict);
diff --git a/include/sysemu/char.h b/include/sysemu/char.h
index 342f531..18fcd26 100644
--- a/include/sysemu/char.h
+

Re: [Qemu-devel] [PATCH v3 03/13] char: chardevice hotswap

2017-06-01 Thread Anton Nefedov



On 05/31/2017 10:20 PM, Marc-André Lureau wrote:

Hi

On Tue, May 30, 2017 at 6:12 PM Anton Nefedov 
mailto:anton.nefe...@virtuozzo.com>> wrote:


This patch adds a possibility to change a char device without a frontend
removal.

1. Ideally, it would have to happen transparently to a frontend, i.e.
frontend would continue its regular operation.
However, backends are not stateless and are set up by the frontends
via qemu_chr_fe_<> functions, and it's not (generally) possible to
replay
that setup entirely in a backend code, as different chardevs respond
to the setup calls differently, so do frontends work differently basing
on those setup responses.
Moreover, some frontend can generally get and save the backend pointer
(qemu_chr_fe_get_driver()), and it will become invalid after backend
change.

So, a frontend which would like to support chardev hotswap has to
register
a "backend change" handler, and redo its backend setup there.

2. Write path can be used by multiple threads and thus protected with
chr_write_lock.
So hotswap also has to be protected so write functions won't access
a backend being replaced.


Tbh, I don't understand the need for a different lock. Could you 
explain? Even better would be to write a test that shows in which way 
the lock helps.




hi Marc-André,

The existing chr_write_lock belongs to Chardev.
For the hotswap case, we need to ensure that be->chr won't change and
the old Chardev (together with its mutex) won't be destroyed while it's
used in the write functions.

Maybe we could move the lock to CharBackend, instead of creating a new
one. But I guess this
 1. won't work for mux, where multiple CharBackends share the same
Chardev
 2. won't work for some chardevs, like pty which uses the lock for the
timer handler

Sorry if I'm not explaining clearly enough or maybe I'm missing some
easier solution?

I can try to add a test but can't quite see yet how to freeze the old
chardev somewhere in cc->chr_write() and hotswap it while it's there.





Signed-off-by: Anton Nefedov mailto:anton.nefe...@virtuozzo.com>>
Reviewed-by: Vladimir Sementsov-Ogievskiy mailto:vsement...@virtuozzo.com>>


patch looks good otherwise

---
  chardev/char.c| 126
++
  include/sysemu/char.h |   9 
  qapi-schema.json  |  40 
  3 files changed, 165 insertions(+), 10 deletions(-)

--
Marc-André Lureau


/Anton



Re: [Qemu-devel] [PATCH v1 05/13] qcow2: set inactive flag

2017-05-31 Thread Anton Nefedov

On 05/26/2017 11:11 AM, Kevin Wolf wrote:

Am 19.05.2017 um 11:34 hat Anton Nefedov geschrieben:

Qcow2State and BlockDriverState flags have to be in sync

Signed-off-by: Anton Nefedov 
---
  block/qcow2.c | 1 +
  1 file changed, 1 insertion(+)

diff --git a/block/qcow2.c b/block/qcow2.c
index 6e7ce96..07c1706 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1939,6 +1939,7 @@ static int qcow2_inactivate(BlockDriverState *bs)
  
  if (result == 0) {

  qcow2_mark_clean(bs);
+s->flags |= BDRV_O_INACTIVE;
  }


Good catch.

But can't we simply use bs->open_flags and completely get rid of
s->flags?

Kevin



the problem is that qcow2_close() or qcow2_inactivate() have to
consider the flags that the image was opened with,
but bs->open_flags (at least in invalidate_cache() case) at that point
contain the new flags already

/Anton



[Qemu-devel] [PATCH v3 09/13] test-char: add hotswap test

2017-05-30 Thread Anton Nefedov
Signed-off-by: Anton Nefedov 
---
 tests/test-char.c | 68 +++
 1 file changed, 68 insertions(+)

diff --git a/tests/test-char.c b/tests/test-char.c
index ed6b18f..cd54f88 100644
--- a/tests/test-char.c
+++ b/tests/test-char.c
@@ -635,6 +635,73 @@ static void char_invalid_test(void)
 g_assert_null(chr);
 }
 
+static int chardev_change(void *opaque)
+{
+return 0;
+}
+
+static int chardev_change_denied(void *opaque)
+{
+return -1;
+}
+
+static void char_hotswap_test(void)
+{
+char *chr_args;
+Chardev *chr;
+CharBackend be;
+
+gchar *tmp_path = g_dir_make_tmp("qemu-test-char.XX", NULL);
+char *filename = g_build_filename(tmp_path, "file", NULL);
+ChardevFile file = { .out = filename };
+ChardevBackend backend = { .type = CHARDEV_BACKEND_KIND_FILE,
+   .u.file.data = &file };
+
+int port;
+int sock = make_udp_socket(&port);
+g_assert_cmpint(sock, >, 0);
+
+chr_args = g_strdup_printf("udp:127.0.0.1:%d", port);
+
+chr = qemu_chr_new("chardev", chr_args);
+qemu_chr_fe_init(&be, chr, &error_abort);
+
+/* check that chardev operates correctly */
+char_udp_test_internal(chr, sock);
+
+/* set the handler that denies the hotswap */
+qemu_chr_fe_set_handlers(&be, NULL, NULL,
+ NULL, chardev_change_denied, NULL, NULL, true);
+
+/* now, change is denied and has to keep the old backend operating */
+qmp_chardev_change("chardev", &backend, NULL);
+g_assert(be.chr == chr);
+
+char_udp_test_internal(chr, sock);
+
+/* now allow the change */
+qemu_chr_fe_set_handlers(&be, NULL, NULL,
+ NULL, chardev_change, NULL, NULL, true);
+
+/* has to succeed now */
+qmp_chardev_change("chardev", &backend, &error_abort);
+g_assert(be.chr != chr);
+
+close(sock);
+chr = be.chr;
+
+/* run the file chardev test */
+char_file_test_internal(chr, filename);
+
+object_unparent(OBJECT(chr));
+
+g_unlink(filename);
+g_free(filename);
+g_rmdir(tmp_path);
+g_free(tmp_path);
+g_free(chr_args);
+}
+
 int main(int argc, char **argv)
 {
 qemu_init_main_loop(&error_abort);
@@ -666,6 +733,7 @@ int main(int argc, char **argv)
 #endif
 g_test_add_func("/char/socket", char_socket_test);
 g_test_add_func("/char/udp", char_udp_test);
+g_test_add_func("/char/hotswap", char_hotswap_test);
 
 return g_test_run();
 }
-- 
2.7.4




[Qemu-devel] [PATCH v3 08/13] test-char: split char_file_test

2017-05-30 Thread Anton Nefedov
makes it possible to test the existing chardev-file

Signed-off-by: Anton Nefedov 
---
 tests/test-char.c | 127 +-
 1 file changed, 79 insertions(+), 48 deletions(-)

diff --git a/tests/test-char.c b/tests/test-char.c
index ad0752a..ed6b18f 100644
--- a/tests/test-char.c
+++ b/tests/test-char.c
@@ -484,75 +484,103 @@ static void char_udp_test(void)
 char_udp_test_internal(NULL, 0);
 }
 
-static void char_file_test(void)
+#ifndef _WIN32
+static void char_file_fifo_test(void)
 {
+Chardev *chr;
+CharBackend be;
 char *tmp_path = g_dir_make_tmp("qemu-test-char.XX", NULL);
+char *fifo = g_build_filename(tmp_path, "fifo", NULL);
 char *out = g_build_filename(tmp_path, "out", NULL);
-char *contents = NULL;
-ChardevFile file = { .out = out };
+ChardevFile file = { .in = fifo,
+ .has_in = true,
+ .out = out };
 ChardevBackend backend = { .type = CHARDEV_BACKEND_KIND_FILE,
.u.file.data = &file };
+FeHandler fe = { 0, };
+int fd, ret;
+
+if (mkfifo(fifo, 0600) < 0) {
+abort();
+}
+
+fd = open(fifo, O_RDWR);
+ret = write(fd, "fifo-in", 8);
+g_assert_cmpint(ret, ==, 8);
+
+chr = qemu_chardev_new(NULL, TYPE_CHARDEV_FILE, &backend,
+   &error_abort);
+
+qemu_chr_fe_init(&be, chr, &error_abort);
+qemu_chr_fe_set_handlers(&be,
+ fe_can_read,
+ fe_read,
+ fe_event,
+ NULL,
+ &fe, NULL, true);
+
+main_loop();
+
+close(fd);
+
+g_assert_cmpint(fe.read_count, ==, 8);
+g_assert_cmpstr(fe.read_buf, ==, "fifo-in");
+
+qemu_chr_fe_deinit(&be);
+object_unref(OBJECT(chr));
+
+g_unlink(fifo);
+g_free(fifo);
+g_unlink(out);
+g_free(out);
+g_rmdir(tmp_path);
+g_free(tmp_path);
+}
+#endif
+
+static void char_file_test_internal(Chardev *ext_chr, const char *filepath)
+{
+char *tmp_path = g_dir_make_tmp("qemu-test-char.XX", NULL);
+char *out;
 Chardev *chr;
+char *contents = NULL;
+ChardevFile file = {};
+ChardevBackend backend = { .type = CHARDEV_BACKEND_KIND_FILE,
+   .u.file.data = &file };
 gsize length;
 int ret;
 
-chr = qemu_chardev_new(NULL, TYPE_CHARDEV_FILE, &backend,
-   &error_abort);
+if (ext_chr) {
+chr = ext_chr;
+out = g_strdup(filepath);
+file.out = out;
+} else {
+out = g_build_filename(tmp_path, "out", NULL);
+file.out = out;
+chr = qemu_chardev_new(NULL, TYPE_CHARDEV_FILE, &backend,
+   &error_abort);
+}
 ret = qemu_chr_write_all(chr, (uint8_t *)"hello!", 6);
 g_assert_cmpint(ret, ==, 6);
-object_unref(OBJECT(chr));
 
 ret = g_file_get_contents(out, &contents, &length, NULL);
 g_assert(ret == TRUE);
 g_assert_cmpint(length, ==, 6);
 g_assert(strncmp(contents, "hello!", 6) == 0);
-g_free(contents);
-
-#ifndef _WIN32
-{
-CharBackend be;
-FeHandler fe = { 0, };
-char *fifo = g_build_filename(tmp_path, "fifo", NULL);
-int fd;
-
-if (mkfifo(fifo, 0600) < 0) {
-abort();
-}
-
-fd = open(fifo, O_RDWR);
-ret = write(fd, "fifo-in", 8);
-g_assert_cmpint(ret, ==, 8);
-
-file.in = fifo;
-file.has_in = true;
-chr = qemu_chardev_new(NULL, TYPE_CHARDEV_FILE, &backend,
-   &error_abort);
-
-qemu_chr_fe_init(&be, chr, &error_abort);
-qemu_chr_fe_set_handlers(&be,
- fe_can_read,
- fe_read,
- fe_event,
- NULL,
- &fe, NULL, true);
-
-main_loop();
 
-close(fd);
-
-g_assert_cmpint(fe.read_count, ==, 8);
-g_assert_cmpstr(fe.read_buf, ==, "fifo-in");
-qemu_chr_fe_deinit(&be);
+if (!ext_chr) {
 object_unref(OBJECT(chr));
-g_unlink(fifo);
-g_free(fifo);
+g_unlink(out);
+g_free(out);
 }
-#endif
-
-g_unlink(out);
+g_free(contents);
 g_rmdir(tmp_path);
 g_free(tmp_path);
-g_free(out);
+}
+
+static void char_file_test(void)
+{
+char_file_test_internal(NULL, NULL);
 }
 
 static void char_null_test(void)
@@ -633,6 +661,9 @@ int main(int argc, char **argv)
 g_test_add_func("/char/pipe", char_pipe_test);
 #endif
 g_test_add_func("/char/file", char_file_test);
+#ifndef _WIN32
+g_test_add_func("/char/file-fifo", char_file_fifo_test);
+#endif
 g_test_add_func("/char/socket", char_socket_test);
 g_test_add_func("/char/udp", char_udp_test);
 
-- 
2.7.4




[Qemu-devel] [PATCH v3 04/13] char: forbid direct chardevice access for hotswap devices

2017-05-30 Thread Anton Nefedov
qemu_chr_fe_get_driver() is unsafe, frontends with hotswap support
should not access CharDriver ptr directly as CharDriver might change.

Signed-off-by: Anton Nefedov 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 chardev/char.c|  7 +++
 include/sysemu/char.h | 10 ++
 2 files changed, 17 insertions(+)

diff --git a/chardev/char.c b/chardev/char.c
index ba1a5f5..1eed934 100644
--- a/chardev/char.c
+++ b/chardev/char.c
@@ -484,9 +484,16 @@ static Notifier muxes_realize_notify = {
 
 Chardev *qemu_chr_fe_get_driver(CharBackend *be)
 {
+/* this is unsafe for the users that support chardev hotswap */
+assert(be->chr_be_change == NULL);
 return be->chr;
 }
 
+bool qemu_chr_fe_backend_connected(CharBackend *be)
+{
+return !!be->chr;
+}
+
 static bool qemu_chr_fe_connect(CharBackend *b, Chardev *s, Error **errp)
 {
 int tag = 0;
diff --git a/include/sysemu/char.h b/include/sysemu/char.h
index 014ebbc..117d628 100644
--- a/include/sysemu/char.h
+++ b/include/sysemu/char.h
@@ -391,10 +391,20 @@ bool qemu_chr_fe_init(CharBackend *b, Chardev *s, Error 
**errp);
  *
  * Returns the driver associated with a CharBackend or NULL if no
  * associated Chardev.
+ * Note: avoid this function as the driver should never be accessed directly,
+ *   especially by the frontends that support chardevice hotswap.
+ *   Consider qemu_chr_fe_backend_connected() to check for driver existence
  */
 Chardev *qemu_chr_fe_get_driver(CharBackend *be);
 
 /**
+ * @qemu_chr_fe_backend_connected:
+ *
+ * Returns true if there is a chardevice associated with @be.
+ */
+bool qemu_chr_fe_backend_connected(CharBackend *be);
+
+/**
  * @qemu_chr_fe_deinit:
  *
  * Dissociate the CharBackend from the Chardev.
-- 
2.7.4




[Qemu-devel] [PATCH v3 03/13] char: chardevice hotswap

2017-05-30 Thread Anton Nefedov
This patch adds a possibility to change a char device without a frontend
removal.

1. Ideally, it would have to happen transparently to a frontend, i.e.
frontend would continue its regular operation.
However, backends are not stateless and are set up by the frontends
via qemu_chr_fe_<> functions, and it's not (generally) possible to replay
that setup entirely in a backend code, as different chardevs respond
to the setup calls differently, so do frontends work differently basing
on those setup responses.
Moreover, some frontend can generally get and save the backend pointer
(qemu_chr_fe_get_driver()), and it will become invalid after backend change.

So, a frontend which would like to support chardev hotswap has to register
a "backend change" handler, and redo its backend setup there.

2. Write path can be used by multiple threads and thus protected with
chr_write_lock.
So hotswap also has to be protected so write functions won't access
a backend being replaced.

Signed-off-by: Anton Nefedov 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 chardev/char.c| 126 ++
 include/sysemu/char.h |   9 
 qapi-schema.json  |  40 
 3 files changed, 165 insertions(+), 10 deletions(-)

diff --git a/chardev/char.c b/chardev/char.c
index 1b097b3..ba1a5f5 100644
--- a/chardev/char.c
+++ b/chardev/char.c
@@ -132,12 +132,16 @@ static bool qemu_chr_replay(Chardev *chr)
 
 int qemu_chr_fe_write(CharBackend *be, const uint8_t *buf, int len)
 {
-Chardev *s = be->chr;
+Chardev *s;
 ChardevClass *cc;
 int ret;
 
+qemu_mutex_lock(&be->chr_lock);
+s = be->chr;
+
 if (!s) {
-return 0;
+ret = 0;
+goto end;
 }
 
 if (qemu_chr_replay(s) && replay_mode == REPLAY_MODE_PLAY) {
@@ -145,7 +149,7 @@ int qemu_chr_fe_write(CharBackend *be, const uint8_t *buf, 
int len)
 replay_char_write_event_load(&ret, &offset);
 assert(offset <= len);
 qemu_chr_fe_write_buffer(s, buf, offset, &offset);
-return ret;
+goto end;
 }
 
 cc = CHARDEV_GET_CLASS(s);
@@ -161,7 +165,9 @@ int qemu_chr_fe_write(CharBackend *be, const uint8_t *buf, 
int len)
 if (qemu_chr_replay(s) && replay_mode == REPLAY_MODE_RECORD) {
 replay_char_write_event_save(ret, ret < 0 ? 0 : ret);
 }
-
+
+end:
+qemu_mutex_unlock(&be->chr_lock);
 return ret;
 }
 
@@ -191,13 +197,16 @@ int qemu_chr_write_all(Chardev *s, const uint8_t *buf, 
int len)
 
 int qemu_chr_fe_write_all(CharBackend *be, const uint8_t *buf, int len)
 {
-Chardev *s = be->chr;
+Chardev *s;
+int ret;
 
-if (!s) {
-return 0;
-}
+qemu_mutex_lock(&be->chr_lock);
+
+s = be->chr;
+ret = s ? qemu_chr_write_all(s, buf, len) : 0;
 
-return qemu_chr_write_all(s, buf, len);
+qemu_mutex_unlock(&be->chr_lock);
+return ret;
 }
 
 int qemu_chr_fe_read_all(CharBackend *be, uint8_t *buf, int len)
@@ -478,7 +487,7 @@ Chardev *qemu_chr_fe_get_driver(CharBackend *be)
 return be->chr;
 }
 
-bool qemu_chr_fe_init(CharBackend *b, Chardev *s, Error **errp)
+static bool qemu_chr_fe_connect(CharBackend *b, Chardev *s, Error **errp)
 {
 int tag = 0;
 
@@ -507,6 +516,16 @@ unavailable:
 return false;
 }
 
+bool qemu_chr_fe_init(CharBackend *b, Chardev *s, Error **errp)
+{
+if (!qemu_chr_fe_connect(b, s, errp)) {
+return false;
+}
+
+qemu_mutex_init(&b->chr_lock);
+return true;
+}
+
 static bool qemu_chr_is_busy(Chardev *s)
 {
 if (CHARDEV_IS_MUX(s)) {
@@ -531,6 +550,7 @@ void qemu_chr_fe_deinit(CharBackend *b)
 d->backends[b->tag] = NULL;
 }
 b->chr = NULL;
+qemu_mutex_destroy(&b->chr_lock);
 }
 }
 
@@ -1306,6 +1326,92 @@ ChardevReturn *qmp_chardev_add(const char *id, 
ChardevBackend *backend,
 return ret;
 }
 
+ChardevReturn *qmp_chardev_change(const char *id, ChardevBackend *backend,
+  Error **errp)
+{
+CharBackend *be;
+const ChardevClass *cc;
+Chardev *chr, *chr_new;
+bool closed_sent = false;
+ChardevReturn *ret;
+
+chr = qemu_chr_find(id);
+if (!chr) {
+error_setg(errp, "Chardev '%s' does not exist", id);
+return NULL;
+}
+
+if (CHARDEV_IS_MUX(chr)) {
+error_setg(errp, "Mux device hotswap not supported yet");
+return NULL;
+}
+
+if (qemu_chr_replay(chr)) {
+error_setg(errp,
+"Chardev '%s' cannot be changed in record/replay mode", id);
+return NULL;
+}
+
+be = chr->be;
+if (!be) {
+/* easy case */
+object_unparent(OBJECT(chr));
+return qmp_chardev_add(id, backend, errp);
+}
+
+if (!be->chr_be_change) {
+error_setg(errp, "Cha

[Qemu-devel] [PATCH v3 11/13] virtio-console: chardev hotswap support

2017-05-30 Thread Anton Nefedov
Signed-off-by: Anton Nefedov 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
CC: Amit Shah 
---
 hw/char/virtio-console.c | 35 ++-
 1 file changed, 30 insertions(+), 5 deletions(-)

diff --git a/hw/char/virtio-console.c b/hw/char/virtio-console.c
index cf7331d..bd74669 100644
--- a/hw/char/virtio-console.c
+++ b/hw/char/virtio-console.c
@@ -49,7 +49,7 @@ static ssize_t flush_buf(VirtIOSerialPort *port,
 VirtConsole *vcon = VIRTIO_CONSOLE(port);
 ssize_t ret;
 
-if (!qemu_chr_fe_get_driver(&vcon->chr)) {
+if (!qemu_chr_fe_backend_connected(&vcon->chr)) {
 /* If there's no backend, we can just say we consumed all data. */
 return len;
 }
@@ -163,12 +163,35 @@ static void chr_event(void *opaque, int event)
 }
 }
 
+static int chr_be_change(void *opaque)
+{
+VirtConsole *vcon = opaque;
+VirtIOSerialPort *port = VIRTIO_SERIAL_PORT(vcon);
+VirtIOSerialPortClass *k = VIRTIO_SERIAL_PORT_GET_CLASS(port);
+
+if (k->is_console) {
+qemu_chr_fe_set_handlers(&vcon->chr, chr_can_read, chr_read,
+ NULL, chr_be_change, vcon, NULL, true);
+} else {
+qemu_chr_fe_set_handlers(&vcon->chr, chr_can_read, chr_read,
+ chr_event, chr_be_change, vcon, NULL, false);
+}
+
+if (vcon->watch) {
+g_source_remove(vcon->watch);
+vcon->watch = qemu_chr_fe_add_watch(&vcon->chr,
+G_IO_OUT | G_IO_HUP,
+chr_write_unblocked, vcon);
+}
+
+return 0;
+}
+
 static void virtconsole_realize(DeviceState *dev, Error **errp)
 {
 VirtIOSerialPort *port = VIRTIO_SERIAL_PORT(dev);
 VirtConsole *vcon = VIRTIO_CONSOLE(dev);
 VirtIOSerialPortClass *k = VIRTIO_SERIAL_PORT_GET_CLASS(dev);
-Chardev *chr = qemu_chr_fe_get_driver(&vcon->chr);
 
 if (port->id == 0 && !k->is_console) {
 error_setg(errp, "Port number 0 on virtio-serial devices reserved "
@@ -176,7 +199,7 @@ static void virtconsole_realize(DeviceState *dev, Error 
**errp)
 return;
 }
 
-if (chr) {
+if (qemu_chr_fe_backend_connected(&vcon->chr)) {
 /*
  * For consoles we don't block guest data transfer just
  * because nothing is connected - we'll just let it go
@@ -188,11 +211,13 @@ static void virtconsole_realize(DeviceState *dev, Error 
**errp)
  */
 if (k->is_console) {
 qemu_chr_fe_set_handlers(&vcon->chr, chr_can_read, chr_read,
- NULL, NULL, vcon, NULL, true);
+ NULL, chr_be_change,
+ vcon, NULL, true);
 virtio_serial_open(port);
 } else {
 qemu_chr_fe_set_handlers(&vcon->chr, chr_can_read, chr_read,
- chr_event, NULL, vcon, NULL, false);
+ chr_event, chr_be_change,
+ vcon, NULL, false);
 }
 }
 }
-- 
2.7.4




[Qemu-devel] [PATCH v3 01/13] char: move QemuOpts->ChardevBackend translation to a separate func

2017-05-30 Thread Anton Nefedov
parse function will be used by the following patch

Signed-off-by: Anton Nefedov 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 chardev/char.c | 70 --
 1 file changed, 44 insertions(+), 26 deletions(-)

diff --git a/chardev/char.c b/chardev/char.c
index 4e24dc3..3a0f543 100644
--- a/chardev/char.c
+++ b/chardev/char.c
@@ -854,17 +854,13 @@ help_string_append(const char *name, void *opaque)
 g_string_append_printf(str, "\n%s", name);
 }
 
-Chardev *qemu_chr_new_from_opts(QemuOpts *opts,
-Error **errp)
+static ChardevBackend *qemu_chr_parse_opts(QemuOpts *opts, Error **errp)
 {
 Error *local_err = NULL;
 const ChardevClass *cc;
-Chardev *chr;
 int i;
 ChardevBackend *backend = NULL;
 const char *name = qemu_opt_get(opts, "backend");
-const char *id = qemu_opts_id(opts);
-char *bid = NULL;
 
 if (name == NULL) {
 error_setg(errp, "chardev: \"%s\" missing backend",
@@ -872,21 +868,6 @@ Chardev *qemu_chr_new_from_opts(QemuOpts *opts,
 return NULL;
 }
 
-if (is_help_option(name)) {
-GString *str = g_string_new("");
-
-chardev_name_foreach(help_string_append, str);
-
-error_report("Available chardev backend types: %s", str->str);
-g_string_free(str, true);
-exit(0);
-}
-
-if (id == NULL) {
-error_setg(errp, "chardev: no id specified");
-return NULL;
-}
-
 for (i = 0; i < ARRAY_SIZE(chardev_alias_table); i++) {
 if (g_strcmp0(chardev_alias_table[i].alias, name) == 0) {
 name = chardev_alias_table[i].typename;
@@ -902,16 +883,12 @@ Chardev *qemu_chr_new_from_opts(QemuOpts *opts,
 backend = g_new0(ChardevBackend, 1);
 backend->type = CHARDEV_BACKEND_KIND_NULL;
 
-if (qemu_opt_get_bool(opts, "mux", 0)) {
-bid = g_strdup_printf("%s-base", id);
-}
-
-chr = NULL;
 if (cc->parse) {
 cc->parse(opts, backend, &local_err);
 if (local_err) {
 error_propagate(errp, local_err);
-goto out;
+qapi_free_ChardevBackend(backend);
+return NULL;
 }
 } else {
 ChardevCommon *ccom = g_new0(ChardevCommon, 1);
@@ -919,6 +896,47 @@ Chardev *qemu_chr_new_from_opts(QemuOpts *opts,
 backend->u.null.data = ccom; /* Any ChardevCommon member would work */
 }
 
+return backend;
+}
+
+Chardev *qemu_chr_new_from_opts(QemuOpts *opts, Error **errp)
+{
+const ChardevClass *cc;
+Chardev *chr = NULL;
+ChardevBackend *backend = NULL;
+const char *name = qemu_opt_get(opts, "backend");
+const char *id = qemu_opts_id(opts);
+char *bid = NULL;
+
+if (name && is_help_option(name)) {
+GString *str = g_string_new("");
+
+chardev_name_foreach(help_string_append, str);
+
+error_report("Available chardev backend types: %s", str->str);
+g_string_free(str, true);
+exit(0);
+}
+
+if (id == NULL) {
+error_setg(errp, "chardev: no id specified");
+return NULL;
+}
+
+backend = qemu_chr_parse_opts(opts, errp);
+if (backend == NULL) {
+return NULL;
+}
+
+cc = char_get_class(name, errp);
+if (cc == NULL) {
+goto out;
+}
+
+if (qemu_opt_get_bool(opts, "mux", 0)) {
+bid = g_strdup_printf("%s-base", id);
+}
+
 chr = qemu_chardev_new(bid ? bid : id,
object_class_get_name(OBJECT_CLASS(cc)),
backend, errp);
-- 
2.7.4




[Qemu-devel] [PATCH v3 10/13] hmp: add hmp analogue for qmp-chardev-change

2017-05-30 Thread Anton Nefedov
Signed-off-by: Anton Nefedov 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
CC: Dr. David Alan Gilbert 
---
 chardev/char.c|  2 +-
 hmp-commands.hx   | 16 
 hmp.c | 34 ++
 hmp.h |  1 +
 include/sysemu/char.h | 12 
 5 files changed, 64 insertions(+), 1 deletion(-)

diff --git a/chardev/char.c b/chardev/char.c
index 2d6e204..1e2a2dd 100644
--- a/chardev/char.c
+++ b/chardev/char.c
@@ -888,7 +888,7 @@ help_string_append(const char *name, void *opaque)
 g_string_append_printf(str, "\n%s", name);
 }
 
-static ChardevBackend *qemu_chr_parse_opts(QemuOpts *opts, Error **errp)
+ChardevBackend *qemu_chr_parse_opts(QemuOpts *opts, Error **errp)
 {
 Error *local_err = NULL;
 const ChardevClass *cc;
diff --git a/hmp-commands.hx b/hmp-commands.hx
index baeac47..19bfddd 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -1727,6 +1727,22 @@ chardev_add accepts the same parameters as the -chardev 
command line switch.
 ETEXI
 
 {
+.name   = "chardev-change",
+.args_type  = "id:s,args:s",
+.params = "id args",
+.help   = "change chardev",
+.cmd= hmp_chardev_change,
+},
+
+STEXI
+@item chardev-change args
+@findex chardev-change
+chardev_change accepts existing chardev @var{id} and then the same arguments
+as the -chardev command line switch (except for "id").
+
+ETEXI
+
+{
 .name   = "chardev-remove",
 .args_type  = "id:s",
 .params = "id",
diff --git a/hmp.c b/hmp.c
index 20f5dab..7660495 100644
--- a/hmp.c
+++ b/hmp.c
@@ -2224,6 +2224,40 @@ void hmp_chardev_add(Monitor *mon, const QDict *qdict)
 hmp_handle_error(mon, &err);
 }
 
+void hmp_chardev_change(Monitor *mon, const QDict *qdict)
+{
+const char *args = qdict_get_str(qdict, "args");
+const char *id;
+Error *err = NULL;
+ChardevBackend *backend = NULL;
+ChardevReturn *ret = NULL;
+QemuOpts *opts = qemu_opts_parse_noisily(qemu_find_opts("chardev"), args,
+ true);
+if (!opts) {
+error_setg(&err, "Parsing chardev args failed");
+goto end;
+}
+
+id = qdict_get_str(qdict, "id");
+if (qemu_opts_id(opts)) {
+error_setg(&err, "Unexpected 'id' parameter");
+goto end;
+}
+
+backend = qemu_chr_parse_opts(opts, &err);
+if (!backend) {
+goto end;
+}
+
+ret = qmp_chardev_change(id, backend, &err);
+
+end:
+qapi_free_ChardevReturn(ret);
+qapi_free_ChardevBackend(backend);
+qemu_opts_del(opts);
+hmp_handle_error(mon, &err);
+}
+
 void hmp_chardev_remove(Monitor *mon, const QDict *qdict)
 {
 Error *local_err = NULL;
diff --git a/hmp.h b/hmp.h
index d8b94ce..23e035c 100644
--- a/hmp.h
+++ b/hmp.h
@@ -102,6 +102,7 @@ void hmp_nbd_server_start(Monitor *mon, const QDict *qdict);
 void hmp_nbd_server_add(Monitor *mon, const QDict *qdict);
 void hmp_nbd_server_stop(Monitor *mon, const QDict *qdict);
 void hmp_chardev_add(Monitor *mon, const QDict *qdict);
+void hmp_chardev_change(Monitor *mon, const QDict *qdict);
 void hmp_chardev_remove(Monitor *mon, const QDict *qdict);
 void hmp_qemu_io(Monitor *mon, const QDict *qdict);
 void hmp_cpu_add(Monitor *mon, const QDict *qdict);
diff --git a/include/sysemu/char.h b/include/sysemu/char.h
index 342f531..18fcd26 100644
--- a/include/sysemu/char.h
+++ b/include/sysemu/char.h
@@ -131,6 +131,18 @@ Chardev *qemu_chr_new_from_opts(QemuOpts *opts,
 void qemu_chr_parse_common(QemuOpts *opts, ChardevCommon *backend);
 
 /**
+ * @qemu_chr_parse_opts:
+ *
+ * Parse the options to the ChardevBackend struct.
+ *
+ * @opts
+ *
+ * Returns: a new backend
+ */
+ChardevBackend *qemu_chr_parse_opts(QemuOpts *opts,
+Error **errp);
+
+/**
  * @qemu_chr_new:
  *
  * Create a new character backend from a URI.
-- 
2.7.4




[Qemu-devel] [PATCH v3 00/13] chardevice hotswap

2017-05-30 Thread Anton Nefedov
Changed in v3:
  - minor remarks to patch 1 applied
  - patch 3: avoid using bottom-half, handle syncronously
As mentioned, it gets thing complicated and is only a problem for
a monitor-connected chardev hotswap and that is not supported for now
  - tests added (patches 6-9)



This serie is a v2 of the February submit
http://lists.nongnu.org/archive/html/qemu-devel/2017-02/msg01989.html

The interface is changed as requested and the changes are slightly reworked
and split into separate patches.



The patchset adds support of the character device change without
a frontend device removal.
Yet isa-serial and virtio-serial frontends are supported.

The feature can be helpful for e.g. Windows debug allowing to
establish connection to a live VM from VM with WinDbg.

Anton Nefedov (13):
  char: move QemuOpts->ChardevBackend translation to a separate func
  char: add backend hotswap handler
  char: chardevice hotswap
  char: forbid direct chardevice access for hotswap devices
  char: avoid chardevice direct access
  test-char: unref chardev-udp after test
  test-char: split char_udp_test
  test-char: split char_file_test
  test-char: add hotswap test
  hmp: add hmp analogue for qmp-chardev-change
  virtio-console: chardev hotswap support
  serial: move TIOCM update to a separate function
  serial: chardev hotswap support

 backends/rng-egd.c  |   2 +-
 chardev/char-mux.c  |   1 +
 chardev/char.c  | 212 ---
 gdbstub.c   |   4 +-
 hmp-commands.hx |  16 +++
 hmp.c   |  34 ++
 hmp.h   |   1 +
 hw/arm/pxa2xx.c |   3 +-
 hw/arm/strongarm.c  |   4 +-
 hw/char/bcm2835_aux.c   |   2 +-
 hw/char/cadence_uart.c  |   4 +-
 hw/char/debugcon.c  |   4 +-
 hw/char/digic-uart.c|   2 +-
 hw/char/escc.c  |   8 +-
 hw/char/etraxfs_ser.c   |   2 +-
 hw/char/exynos4210_uart.c   |   4 +-
 hw/char/grlib_apbuart.c |   4 +-
 hw/char/imx_serial.c|   2 +-
 hw/char/ipoctal232.c|   4 +-
 hw/char/lm32_juart.c|   2 +-
 hw/char/lm32_uart.c |   2 +-
 hw/char/mcf_uart.c  |   2 +-
 hw/char/milkymist-uart.c|   2 +-
 hw/char/parallel.c  |   2 +-
 hw/char/pl011.c |   2 +-
 hw/char/sclpconsole-lm.c|   4 +-
 hw/char/sclpconsole.c   |   4 +-
 hw/char/serial.c|  63 ---
 hw/char/sh_serial.c |   4 +-
 hw/char/spapr_vty.c |   4 +-
 hw/char/stm32f2xx_usart.c   |   3 +-
 hw/char/terminal3270.c  |   4 +-
 hw/char/virtio-console.c|  35 +-
 hw/char/xen_console.c   |   4 +-
 hw/char/xilinx_uartlite.c   |   2 +-
 hw/ipmi/ipmi_bmc_extern.c   |   4 +-
 hw/mips/boston.c|   2 +-
 hw/mips/mips_malta.c|   2 +-
 hw/misc/ivshmem.c   |   6 +-
 hw/usb/ccid-card-passthru.c |   6 +-
 hw/usb/dev-serial.c |   7 +-
 hw/usb/redirect.c   |   7 +-
 include/sysemu/char.h   |  43 
 monitor.c   |   4 +-
 net/colo-compare.c  |  14 ++-
 net/filter-mirror.c |   8 +-
 net/slirp.c |   2 +-
 net/vhost-user.c|   7 +-
 qapi-schema.json|  40 +++
 qtest.c |   2 +-
 tests/test-char.c   | 263 +---
 tests/vhost-user-test.c |   2 +-
 52 files changed, 669 insertions(+), 202 deletions(-)

-- 
2.7.4




[Qemu-devel] [PATCH v3 06/13] test-char: unref chardev-udp after test

2017-05-30 Thread Anton Nefedov
this is only not a problem if the test is last in a suite,
otherwise it makes the following main_loop() calls to fail

Signed-off-by: Anton Nefedov 
---
 tests/test-char.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/test-char.c b/tests/test-char.c
index f3b377f..d63d3d2 100644
--- a/tests/test-char.c
+++ b/tests/test-char.c
@@ -456,6 +456,8 @@ static void char_udp_test(void)
 
 close(sock);
 g_free(tmp);
+qemu_chr_fe_deinit(&be);
+object_unparent(OBJECT(chr));
 }
 
 static void char_file_test(void)
-- 
2.7.4




[Qemu-devel] [PATCH v3 13/13] serial: chardev hotswap support

2017-05-30 Thread Anton Nefedov
for a backend change, a number of ioctls has to be replayed to sync
the current setup of a frontend to a backend tty. This is hopefully
enough so we don't have to track, store and replay the whole original
control byte sequence.

Signed-off-by: Anton Nefedov 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
CC: Michael S. Tsirkin 
CC: Paolo Bonzini 
---
 hw/char/serial.c | 32 ++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/hw/char/serial.c b/hw/char/serial.c
index 1e6bdeb..ed01637 100644
--- a/hw/char/serial.c
+++ b/hw/char/serial.c
@@ -891,9 +891,37 @@ static void serial_reset(void *opaque)
 s->msr &= ~UART_MSR_ANY_DELTA;
 }
 
+static int serial_be_change(void *opaque)
+{
+SerialState *s = opaque;
+
+qemu_chr_fe_set_handlers(&s->chr, serial_can_receive1, serial_receive1,
+ serial_event, serial_be_change, s, NULL, true);
+
+serial_update_parameters(s);
+
+qemu_chr_fe_ioctl(&s->chr, CHR_IOCTL_SERIAL_SET_BREAK,
+  &s->last_break_enable);
+
+s->poll_msl = (s->ier & UART_IER_MSI) ? 1 : 0;
+serial_update_msl(s);
+
+if (s->poll_msl >= 0 && !(s->mcr & UART_MCR_LOOP)) {
+serial_update_tiocm(s);
+}
+
+if (s->watch_tag > 0) {
+g_source_remove(s->watch_tag);
+s->watch_tag = qemu_chr_fe_add_watch(&s->chr, G_IO_OUT | G_IO_HUP,
+ serial_watch_cb, s);
+}
+
+return 0;
+}
+
 void serial_realize_core(SerialState *s, Error **errp)
 {
-if (!qemu_chr_fe_get_driver(&s->chr)) {
+if (!qemu_chr_fe_backend_connected(&s->chr)) {
 error_setg(errp, "Can't create serial device, empty char device");
 return;
 }
@@ -904,7 +932,7 @@ void serial_realize_core(SerialState *s, Error **errp)
 qemu_register_reset(serial_reset, s);
 
 qemu_chr_fe_set_handlers(&s->chr, serial_can_receive1, serial_receive1,
- serial_event, NULL, s, NULL, true);
+ serial_event, serial_be_change, s, NULL, true);
 fifo8_create(&s->recv_fifo, UART_FIFO_LENGTH);
 fifo8_create(&s->xmit_fifo, UART_FIFO_LENGTH);
 serial_reset(s);
-- 
2.7.4




[Qemu-devel] [PATCH v3 07/13] test-char: split char_udp_test

2017-05-30 Thread Anton Nefedov
makes it possible to test the existing chardev-udp

Signed-off-by: Anton Nefedov 
---
 tests/test-char.c | 58 +++
 1 file changed, 41 insertions(+), 17 deletions(-)

diff --git a/tests/test-char.c b/tests/test-char.c
index d63d3d2..ad0752a 100644
--- a/tests/test-char.c
+++ b/tests/test-char.c
@@ -413,16 +413,11 @@ static void char_pipe_test(void)
 }
 #endif
 
-static void char_udp_test(void)
+static int make_udp_socket(int *port)
 {
-struct sockaddr_in addr = { 0, }, other;
-SocketIdleData d = { 0, };
-Chardev *chr;
-CharBackend be;
+struct sockaddr_in addr = { 0, };
 socklen_t alen = sizeof(addr);
 int ret, sock = qemu_socket(PF_INET, SOCK_DGRAM, 0);
-char buf[10];
-char *tmp;
 
 g_assert_cmpint(sock, >, 0);
 addr.sin_family = AF_INET ;
@@ -433,19 +428,41 @@ static void char_udp_test(void)
 ret = getsockname(sock, (struct sockaddr *)&addr, &alen);
 g_assert_cmpint(ret, ==, 0);
 
-tmp = g_strdup_printf("udp:127.0.0.1:%d",
-  ntohs(addr.sin_port));
-chr = qemu_chr_new("client", tmp);
-g_assert_nonnull(chr);
+*port = ntohs(addr.sin_port);
+return sock;
+}
+
+static void char_udp_test_internal(Chardev *reuse_chr, int sock)
+{
+struct sockaddr_in other;
+SocketIdleData d = { 0, };
+Chardev *chr;
+CharBackend *be;
+socklen_t alen = sizeof(other);
+int ret;
+char buf[10];
+char *tmp = NULL;
+
+if (reuse_chr) {
+chr = reuse_chr;
+be = chr->be;
+} else {
+int port;
+sock = make_udp_socket(&port);
+tmp = g_strdup_printf("udp:127.0.0.1:%d", port);
+chr = qemu_chr_new("client", tmp);
+g_assert_nonnull(chr);
+
+be = g_alloca(sizeof(CharBackend));
+qemu_chr_fe_init(be, chr, &error_abort);
+}
 
 d.chr = chr;
-qemu_chr_fe_init(&be, chr, &error_abort);
-qemu_chr_fe_set_handlers(&be, socket_can_read_hello, socket_read_hello,
+qemu_chr_fe_set_handlers(be, socket_can_read_hello, socket_read_hello,
  NULL, NULL, &d, NULL, true);
 ret = qemu_chr_write_all(chr, (uint8_t *)"hello", 5);
 g_assert_cmpint(ret, ==, 5);
 
-alen = sizeof(addr);
 ret = recvfrom(sock, buf, sizeof(buf), 0,
(struct sockaddr *)&other, &alen);
 g_assert_cmpint(ret, ==, 5);
@@ -454,10 +471,17 @@ static void char_udp_test(void)
 
 main_loop();
 
-close(sock);
+if (!reuse_chr) {
+close(sock);
+qemu_chr_fe_deinit(be);
+object_unparent(OBJECT(chr));
+}
 g_free(tmp);
-qemu_chr_fe_deinit(&be);
-object_unparent(OBJECT(chr));
+}
+
+static void char_udp_test(void)
+{
+char_udp_test_internal(NULL, 0);
 }
 
 static void char_file_test(void)
-- 
2.7.4




[Qemu-devel] [PATCH v3 12/13] serial: move TIOCM update to a separate function

2017-05-30 Thread Anton Nefedov
will be used by the following patch

Signed-off-by: Anton Nefedov 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
CC: Michael S. Tsirkin 
CC: Paolo Bonzini 
Reviewed-by: Marc-André Lureau 
---
 hw/char/serial.c | 31 +++
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/hw/char/serial.c b/hw/char/serial.c
index d8d34d0..1e6bdeb 100644
--- a/hw/char/serial.c
+++ b/hw/char/serial.c
@@ -312,6 +312,24 @@ static void serial_write_fcr(SerialState *s, uint8_t val)
 }
 }
 
+static void serial_update_tiocm(SerialState *s)
+{
+int flags;
+
+qemu_chr_fe_ioctl(&s->chr, CHR_IOCTL_SERIAL_GET_TIOCM, &flags);
+
+flags &= ~(CHR_TIOCM_RTS | CHR_TIOCM_DTR);
+
+if (s->mcr & UART_MCR_RTS) {
+flags |= CHR_TIOCM_RTS;
+}
+if (s->mcr & UART_MCR_DTR) {
+flags |= CHR_TIOCM_DTR;
+}
+
+qemu_chr_fe_ioctl(&s->chr, CHR_IOCTL_SERIAL_SET_TIOCM, &flags);
+}
+
 static void serial_ioport_write(void *opaque, hwaddr addr, uint64_t val,
 unsigned size)
 {
@@ -426,24 +444,13 @@ static void serial_ioport_write(void *opaque, hwaddr 
addr, uint64_t val,
 break;
 case 4:
 {
-int flags;
 int old_mcr = s->mcr;
 s->mcr = val & 0x1f;
 if (val & UART_MCR_LOOP)
 break;
 
 if (s->poll_msl >= 0 && old_mcr != s->mcr) {
-
-qemu_chr_fe_ioctl(&s->chr, CHR_IOCTL_SERIAL_GET_TIOCM, &flags);
-
-flags &= ~(CHR_TIOCM_RTS | CHR_TIOCM_DTR);
-
-if (val & UART_MCR_RTS)
-flags |= CHR_TIOCM_RTS;
-if (val & UART_MCR_DTR)
-flags |= CHR_TIOCM_DTR;
-
-qemu_chr_fe_ioctl(&s->chr, CHR_IOCTL_SERIAL_SET_TIOCM, &flags);
+serial_update_tiocm(s);
 /* Update the modem status after a one-character-send 
wait-time, since there may be a response
from the device/computer at the other end of the serial 
line */
 timer_mod(s->modem_status_poll, 
qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + s->char_transmit_time);
-- 
2.7.4




[Qemu-devel] [PATCH v3 05/13] char: avoid chardevice direct access

2017-05-30 Thread Anton Nefedov
frontends should avoid accessing CharDriver struct where possible

Signed-off-by: Anton Nefedov 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 chardev/char.c  | 5 +
 gdbstub.c   | 2 +-
 hw/arm/strongarm.c  | 2 +-
 hw/char/cadence_uart.c  | 2 +-
 hw/char/debugcon.c  | 2 +-
 hw/char/escc.c  | 6 +++---
 hw/char/exynos4210_uart.c   | 2 +-
 hw/char/grlib_apbuart.c | 2 +-
 hw/char/ipoctal232.c| 2 +-
 hw/char/parallel.c  | 2 +-
 hw/char/sclpconsole-lm.c| 2 +-
 hw/char/sclpconsole.c   | 2 +-
 hw/char/sh_serial.c | 2 +-
 hw/char/spapr_vty.c | 2 +-
 hw/char/terminal3270.c  | 2 +-
 hw/char/xen_console.c   | 2 +-
 hw/ipmi/ipmi_bmc_extern.c   | 2 +-
 hw/misc/ivshmem.c   | 4 ++--
 hw/usb/ccid-card-passthru.c | 4 ++--
 hw/usb/dev-serial.c | 5 ++---
 hw/usb/redirect.c   | 5 ++---
 include/sysemu/char.h   | 7 +++
 net/filter-mirror.c | 2 +-
 23 files changed, 39 insertions(+), 29 deletions(-)

diff --git a/chardev/char.c b/chardev/char.c
index 1eed934..2d6e204 100644
--- a/chardev/char.c
+++ b/chardev/char.c
@@ -494,6 +494,11 @@ bool qemu_chr_fe_backend_connected(CharBackend *be)
 return !!be->chr;
 }
 
+bool qemu_chr_fe_backend_open(CharBackend *be)
+{
+return be->chr && be->chr->be_open;
+}
+
 static bool qemu_chr_fe_connect(CharBackend *b, Chardev *s, Error **errp)
 {
 int tag = 0;
diff --git a/gdbstub.c b/gdbstub.c
index 1ac0489..68cbe8a 100644
--- a/gdbstub.c
+++ b/gdbstub.c
@@ -2001,7 +2001,7 @@ int gdbserver_start(const char *device)
NULL, &error_abort);
 monitor_init(mon_chr, 0);
 } else {
-if (qemu_chr_fe_get_driver(&s->chr)) {
+if (qemu_chr_fe_backend_connected(&s->chr)) {
 object_unparent(OBJECT(qemu_chr_fe_get_driver(&s->chr)));
 }
 mon_chr = s->mon_chr;
diff --git a/hw/arm/strongarm.c b/hw/arm/strongarm.c
index bec093d..9d7cf21 100644
--- a/hw/arm/strongarm.c
+++ b/hw/arm/strongarm.c
@@ -1105,7 +1105,7 @@ static void strongarm_uart_tx(void *opaque)
 
 if (s->utcr3 & UTCR3_LBM) /* loopback */ {
 strongarm_uart_receive(s, &s->tx_fifo[s->tx_start], 1);
-} else if (qemu_chr_fe_get_driver(&s->chr)) {
+} else if (qemu_chr_fe_backend_connected(&s->chr)) {
 /* XXX this blocks entire thread. Rewrite to use
  * qemu_chr_fe_write and background I/O callbacks */
 qemu_chr_fe_write_all(&s->chr, &s->tx_fifo[s->tx_start], 1);
diff --git a/hw/char/cadence_uart.c b/hw/char/cadence_uart.c
index 71867b3..19636c0 100644
--- a/hw/char/cadence_uart.c
+++ b/hw/char/cadence_uart.c
@@ -278,7 +278,7 @@ static gboolean cadence_uart_xmit(GIOChannel *chan, 
GIOCondition cond,
 int ret;
 
 /* instant drain the fifo when there's no back-end */
-if (!qemu_chr_fe_get_driver(&s->chr)) {
+if (!qemu_chr_fe_backend_connected(&s->chr)) {
 s->tx_count = 0;
 return FALSE;
 }
diff --git a/hw/char/debugcon.c b/hw/char/debugcon.c
index 6d95297..bd0d4f0 100644
--- a/hw/char/debugcon.c
+++ b/hw/char/debugcon.c
@@ -87,7 +87,7 @@ static const MemoryRegionOps debugcon_ops = {
 
 static void debugcon_realize_core(DebugconState *s, Error **errp)
 {
-if (!qemu_chr_fe_get_driver(&s->chr)) {
+if (!qemu_chr_fe_backend_connected(&s->chr)) {
 error_setg(errp, "Can't create debugcon device, empty char device");
 return;
 }
diff --git a/hw/char/escc.c b/hw/char/escc.c
index aa882b6..dbbeb4a 100644
--- a/hw/char/escc.c
+++ b/hw/char/escc.c
@@ -416,7 +416,7 @@ static void escc_update_parameters(ChannelState *s)
 int speed, parity, data_bits, stop_bits;
 QEMUSerialSetParams ssp;
 
-if (!qemu_chr_fe_get_driver(&s->chr) || s->type != ser)
+if (!qemu_chr_fe_backend_connected(&s->chr) || s->type != ser)
 return;
 
 if (s->wregs[W_TXCTRL1] & TXCTRL1_PAREN) {
@@ -556,7 +556,7 @@ static void escc_mem_write(void *opaque, hwaddr addr,
 trace_escc_mem_writeb_data(CHN_C(s), val);
 s->tx = val;
 if (s->wregs[W_TXCTRL2] & TXCTRL2_TXEN) { // tx enabled
-if (qemu_chr_fe_get_driver(&s->chr)) {
+if (qemu_chr_fe_backend_connected(&s->chr)) {
 /* XXX this blocks entire thread. Rewrite to use
  * qemu_chr_fe_write and background I/O callbacks */
 qemu_chr_fe_write_all(&s->chr, &s->tx, 1);
@@ -1012,7 +1012,7 @@ static void escc_realize(DeviceState *dev, Error **errp)
   ESCC_SIZE << s->it_shift);
 
 for (i = 0; i < 2; i++) {
-if (qemu_chr_fe_get_driver(&s->chn[i].chr)) {
+if (qemu_chr_fe_backend_connected(&s->chn[i].chr)) {
   

[Qemu-devel] [PATCH v3 02/13] char: add backend hotswap handler

2017-05-30 Thread Anton Nefedov
Frontends should have an interface to setup the handler of a backend change.
The interface will be used in the next commits

Signed-off-by: Anton Nefedov 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Marc-André Lureau 
---
 backends/rng-egd.c  |  2 +-
 chardev/char-mux.c  |  1 +
 chardev/char.c  |  4 +++-
 gdbstub.c   |  2 +-
 hw/arm/pxa2xx.c |  3 ++-
 hw/arm/strongarm.c  |  2 +-
 hw/char/bcm2835_aux.c   |  2 +-
 hw/char/cadence_uart.c  |  2 +-
 hw/char/debugcon.c  |  2 +-
 hw/char/digic-uart.c|  2 +-
 hw/char/escc.c  |  2 +-
 hw/char/etraxfs_ser.c   |  2 +-
 hw/char/exynos4210_uart.c   |  2 +-
 hw/char/grlib_apbuart.c |  2 +-
 hw/char/imx_serial.c|  2 +-
 hw/char/ipoctal232.c|  2 +-
 hw/char/lm32_juart.c|  2 +-
 hw/char/lm32_uart.c |  2 +-
 hw/char/mcf_uart.c  |  2 +-
 hw/char/milkymist-uart.c|  2 +-
 hw/char/pl011.c |  2 +-
 hw/char/sclpconsole-lm.c|  2 +-
 hw/char/sclpconsole.c   |  2 +-
 hw/char/serial.c|  2 +-
 hw/char/sh_serial.c |  2 +-
 hw/char/spapr_vty.c |  2 +-
 hw/char/stm32f2xx_usart.c   |  3 ++-
 hw/char/terminal3270.c  |  2 +-
 hw/char/virtio-console.c|  4 ++--
 hw/char/xen_console.c   |  2 +-
 hw/char/xilinx_uartlite.c   |  2 +-
 hw/ipmi/ipmi_bmc_extern.c   |  2 +-
 hw/mips/boston.c|  2 +-
 hw/mips/mips_malta.c|  2 +-
 hw/misc/ivshmem.c   |  2 +-
 hw/usb/ccid-card-passthru.c |  2 +-
 hw/usb/dev-serial.c |  2 +-
 hw/usb/redirect.c   |  2 +-
 include/sysemu/char.h   |  5 +
 monitor.c   |  4 ++--
 net/colo-compare.c  | 14 --
 net/filter-mirror.c |  6 +++---
 net/slirp.c |  2 +-
 net/vhost-user.c|  7 ---
 qtest.c |  2 +-
 tests/test-char.c   | 14 ++
 tests/vhost-user-test.c |  2 +-
 47 files changed, 78 insertions(+), 59 deletions(-)

diff --git a/backends/rng-egd.c b/backends/rng-egd.c
index 380b19a..0b0e945 100644
--- a/backends/rng-egd.c
+++ b/backends/rng-egd.c
@@ -106,7 +106,7 @@ static void rng_egd_opened(RngBackend *b, Error **errp)
 
 /* FIXME we should resubmit pending requests when the CDS reconnects. */
 qemu_chr_fe_set_handlers(&s->chr, rng_egd_chr_can_read,
- rng_egd_chr_read, NULL, s, NULL, true);
+ rng_egd_chr_read, NULL, NULL, s, NULL, true);
 }
 
 static void rng_egd_set_chardev(Object *obj, const char *value, Error **errp)
diff --git a/chardev/char-mux.c b/chardev/char-mux.c
index 37d42c6..5849ea5 100644
--- a/chardev/char-mux.c
+++ b/chardev/char-mux.c
@@ -278,6 +278,7 @@ void mux_chr_set_handlers(Chardev *chr, GMainContext 
*context)
  mux_chr_can_read,
  mux_chr_read,
  mux_chr_event,
+ NULL,
  chr,
  context, true);
 }
diff --git a/chardev/char.c b/chardev/char.c
index 3a0f543..1b097b3 100644
--- a/chardev/char.c
+++ b/chardev/char.c
@@ -522,7 +522,7 @@ void qemu_chr_fe_deinit(CharBackend *b)
 assert(b);
 
 if (b->chr) {
-qemu_chr_fe_set_handlers(b, NULL, NULL, NULL, NULL, NULL, true);
+qemu_chr_fe_set_handlers(b, NULL, NULL, NULL, NULL, NULL, NULL, true);
 if (b->chr->be == b) {
 b->chr->be = NULL;
 }
@@ -538,6 +538,7 @@ void qemu_chr_fe_set_handlers(CharBackend *b,
   IOCanReadHandler *fd_can_read,
   IOReadHandler *fd_read,
   IOEventHandler *fd_event,
+  BackendChangeHandler *be_change,
   void *opaque,
   GMainContext *context,
   bool set_open)
@@ -561,6 +562,7 @@ void qemu_chr_fe_set_handlers(CharBackend *b,
 b->chr_can_read = fd_can_read;
 b->chr_read = fd_read;
 b->chr_event = fd_event;
+b->chr_be_change = be_change;
 b->opaque = opaque;
 if (cc->chr_update_read_handler) {
 cc->chr_update_read_handler(s, context);
diff --git a/gdbstub.c b/gdbstub.c
index 86eed4f..1ac0489 100644
--- a/gdbstub.c
+++ b/gdbstub.c
@@ -2013,7 +2013,7 @@ int gdbserver_start(const char *device)
 if (chr) {
 qemu_chr_fe_init(&s->chr, chr, &error_abort);
 qemu_chr_fe_set_handlers(&s->chr, gdb_chr_can_receive, gdb_chr_receive,
- gdb_chr_event, NULL, NULL, true);
+ gdb_chr_event, NULL, NULL, NULL, true);
 }
 s->state = chr ? RS_IDLE : RS_INACTIVE;
 s->mon_chr = mon_chr;
diff --git a/hw/arm/pxa2xx.c b/hw/arm/pxa2xx.c
index eea551d..3e51882 10

Re: [Qemu-devel] [PATCH 0/7] qcow2: Reduce the number of I/O ops when doing COW

2017-05-26 Thread Anton Nefedov

On 05/26/2017 04:08 PM, Alberto Garcia wrote:

On Fri 26 May 2017 02:47:55 PM CEST, Anton Nefedov wrote:

Tried the another machine; about 10% improvement here

   [...]

[root@localhost ~]# fio --name=randwrite --blocksize=4k
--filename=/dev/sdb --rw=randwrite --direct=1 --ioengine=libaio
--size=2g --io_size=32m


In my tests I sometimes detected slight performance decreases in that
HDD scenario but using 'write' instead of 'randwrite' (and --runtime=60
instead of --io_size).

Can you try and see how that works for you?

Berto



For me it keeps giving pretty much the same result before and after

 write: io=512736KB, bw=8545.4KB/s, iops=2136, runt= 60004msec

/Anton



Re: [Qemu-devel] [PATCH 0/7] qcow2: Reduce the number of I/O ops when doing COW

2017-05-26 Thread Anton Nefedov


On 05/26/2017 01:17 PM, Kevin Wolf wrote:

Am 24.05.2017 um 18:09 hat Anton Nefedov geschrieben:

I agree; as mentioned we have similar patches and they don't conflict much.
We noticed a performance regression on HDD though, for the
presumably optimized case (random 4k write over a large backed
image); so the patches were put on hold.


You're talking about your own patches that should do the same thing,
right? Can you re-do the same test with Berto's patches? Maybe there was
just an implementation glitch in yours.

This approach should very obviously result in a performance improvement,
and the patches are relatively simple, so I'm very much inclined to
merge this as soon as possible.

Kevin




Tried the another machine; about 10% improvement here

[root@dpclient centos-7.3-x86_64]# qemu-img info ./harddisk2.hdd
image: ./harddisk2.hdd
file format: qcow2
virtual size: 2.0G (2147483648 bytes)
disk size: 260K
cluster_size: 65536
backing file: harddisk2.hdd.base (actual path: ./harddisk2.hdd.base)
Format specific information:
compat: 1.1
lazy refcounts: false
refcount bits: 16
corrupt: false
[root@dpclient centos-7.3-x86_64]# qemu-img info ./harddisk2.hdd.base
image: ./harddisk2.hdd.base
file format: qcow2
virtual size: 2.0G (2147483648 bytes)
disk size: 2.0G
cluster_size: 65536
Format specific information:
compat: 1.1
lazy refcounts: true
refcount bits: 16
corrupt: false
[root@dpclient centos-7.3-x86_64]# filefrag ./harddisk2.hdd.base
./harddisk2.hdd.base: 3 extents found


[root@localhost ~]# fio --name=randwrite --blocksize=4k
--filename=/dev/sdb --rw=randwrite --direct=1 --ioengine=libaio
--size=2g --io_size=32m

/* master */
write: io=32768KB, bw=372785B/s, iops=91, runt= 90010msec
/* Berto's patches */
write: io=32768KB, bw=404304B/s, iops=98, runt= 82993msec


/Anton



Re: [Qemu-devel] [PATCH v1 01/13] qcow2: alloc space for COW in one chunk

2017-05-26 Thread Anton Nefedov



On 05/26/2017 11:57 AM, Denis V. Lunev wrote:

On 05/26/2017 11:11 AM, Kevin Wolf wrote:

Am 19.05.2017 um 11:34 hat Anton Nefedov geschrieben:

From: "Denis V. Lunev" 

Currently each single write operation can result in 3 write operations
if guest offsets are not cluster aligned. One write is performed for the
real payload and two for COW-ed areas. Thus the data possibly lays
non-contiguously on the host filesystem. This will reduce further
sequential read performance significantly.

The patch allocates the space in the file with cluster granularity,
ensuring
   1. better host offset locality
   2. less space allocation operations
  (which can be expensive on distributed storages)

Signed-off-by: Denis V. Lunev 
Signed-off-by: Anton Nefedov 

I don't think this is the right approach. You end up with two
write operations: One write_zeroes and then one data write. If the
backend actually supports efficient zero writes, write_zeroes won't
necessarily allocate space, but writing data will definitely split
the already existing allocation. If anything, we need a new
bdrv_allocate() or something that would call fallocate() instead of
abusing write_zeroes.

great idea. Very nice then.



hi Kevin, thanks a lot for your comments.

The driver's callback is commented as:

/*
 * Efficiently zero a region of the disk image.  Typically an image 
format

 * would use a compact metadata representation to implement this.  This
 * function pointer may be NULL or return -ENOSUP and .bdrv_co_writev()
 * will be called instead.
 */
int coroutine_fn (*bdrv_co_pwrite_zeroes)(BlockDriverState *bs,
int64_t offset, int count, BdrvRequestFlags flags);


so it looks like one may expect either efficiency or ENOTSUP from that
one? but not necessarily from the safe "facade" bdrv_co_pwrite_zeroes()
which falls back to pwritev()

/*
 * Efficiently zero a region of the disk image.  Note that this is a 
regular

 * I/O request like read or write and should have a reasonable size.  This
 * function is not suitable for zeroing the entire image in a single 
request

 * because it may allocate memory for the entire region.
 */
int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset,
   int count, BdrvRequestFlags flags);


(maybe we should have even more explicit comment here)


Then let's indeed create a new bdrv_allocate(), which will result in
bdrv_co_pwrite_zeroes(flags|=ZERO_EFFICIENTLY), so we'll reuse most of
the code and employ the same driver callback, but return ENOTSUP if it
fails?



It seems much clearer to me that simply unifying the three write
requests into a single one is an improvement. And it's easy to do, I
even had a patch once to do this. The reason that I didn't send it was
that it seemed to conflict with the data cache approach

These changes help a lot on 2 patterns:
- writing 4Kb into the middle of the block. The bigger the block size,
   more we gain
- sequential write, which is not sector aligned and comes in 2 requests:
   we perform allocation, which should be significantly faster than actual
   write and after that both parts of the write can be executed in parallel.
At my opinion both cases are frequent and important.

But OK, the code should be improved, you are absolutely right here.


  block/qcow2.c | 32 +++-
  1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index a8d61f0..2e6a0ec 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1575,6 +1575,32 @@ fail:
  return ret;
  }
  
+static void handle_alloc_space(BlockDriverState *bs, QCowL2Meta *l2meta)

+{
+BDRVQcow2State *s = bs->opaque;
+BlockDriverState *file = bs->file->bs;
+QCowL2Meta *m;
+int ret;
+
+for (m = l2meta; m != NULL; m = m->next) {
+uint64_t bytes = m->nb_clusters << s->cluster_bits;
+
+if (m->cow_start.nb_bytes == 0 && m->cow_end.nb_bytes == 0) {
+continue;
+}
+
+/* try to alloc host space in one chunk for better locality */
+ret = file->drv->bdrv_co_pwrite_zeroes(file, m->alloc_offset, bytes, 
0);

No. This is what you bypass:

* All sanity checks that the block layer does

* bdrv_inc/dec_in_flight(), which is required for drain to work
   correctly. Not doing this will cause crashes.

* tracked_request_begin/end(), mark_request_serialising() and
   wait_serialising_requests(), which are required for serialising
   requests to work correctly

* Ensuring correct request alignment for file. This means that e.g.
   qcow2 with cluster size 512 on a host with a 4k native disk will
   break.

* blkdebug events

* before_write_notifiers. Not calling these will cause corrupted backups
   if someone backups file.

* Dirty bitmap updates

* Updating write_gen, wr_highest_offset and total_sectors

* Ensuring tha

Re: [Qemu-devel] [PATCH v2 3/9] char: chardevice hotswap

2017-05-25 Thread Anton Nefedov

Hi



Hi Marc-André, thanks for reviewing, pls see answers below


On Fri, May 19, 2017 at 4:51 PM Anton Nefedov 
wrote:


This patch adds a possibility to change a char device without a frontend
removal.

1. Ideally, it would have to happen transparently to a frontend, i.e.
frontend would continue its regular operation.
However, backends are not stateless and are set up by the frontends
via qemu_chr_fe_<> functions, and it's not (generally) possible to replay
that setup entirely in a backend code, as different chardevs respond
to the setup calls differently, so do frontends work differently basing
on those setup responses.
Moreover, some frontend can generally get and save the backend pointer
(qemu_chr_fe_get_driver()), and it will become invalid after backend
change.

So, a frontend which would like to support chardev hotswap has to register
a "backend change" handler, and redo its backend setup there.

2. Write path can be used by multiple threads and thus protected with
chr_write_lock.
So hotswap also has to be protected so write functions won't access
a backend being replaced.



Why not use the chr_write_lock then? (rename it chr_lock?)



chr_write_lock is a Chardev property, it will be gone together with a
swapped device.

We could remove chr_write_lock and leave only the new be->chr_lock, 
since it covers everything; I guess the only problem is mux, where

multiple CharBackends share the same Chardev.

Would smth like this be better?:

static void char_lock(CharBackend *be)
{
qemu_mutex_lock(be->chr && CHARDEV_IS_MUX(be->chr) ?
&be->chr->chr_lock : &be->chr_lock);
}

and use this instead of lock(be->chr_lock) and remove chr_write_lock.
(This would rely on a fact that mux is never hotswapped so be->chr_lock
is not needed).





3. Hotswap function can be called from e.g. a read handler of a monitor
socket. This can cause troubles so it's safer to defer execution to
a bottom-half (however, it means we cannot return some of the errors
synchronously - but most of them we can)



What kind of troubles?



if someone tried to hotswap monitor, it would look like:

(gdb) bt
#0 qmp_chardev_change (id=id@entry=0x7fc24bee7a60 "charmonitor",
backend=backend@entry=0x7fc24cc28290, errp=errp@entry=0x7ffde4d8d4f0)
at /mnt/code/us-qemu/chardev/char.c:1438
#1 0x7fc249fd914b in hmp_chardev_change (mon=,
qdict=) at /mnt/code/us-qemu/hmp.c:2252
#2 0x7fc249edeece in handle_hmp_command (mon=mon@entry=0x7fc24bef87c0,
cmdline=0x7fc24bf2345f "charmonitor
socket,path=/vz/vmprivate/centosos/mon.socket,server,nowait") at
/mnt/code/us-qemu/monitor.c:3100
#3 0x7fc249ee02fa in monitor_command_cb (opaque=0x7fc24bef87c0,
cmdline=, readline_opaque=)
at /mnt/code/us-qemu/monitor.c:3897
#4 0x7fc24a242428 in readline_handle_byte (rs=0x7fc24bf23450,
ch=) at /mnt/code/us-qemu/util/readline.c:393
#5 0x7fc249edf0e2 in monitor_read (opaque=,
buf=, size=)
at /mnt/code/us-qemu/monitor.c:3880
#6 0x7fc24a1deb2b in tcp_chr_read (chan=,
cond=, opaque=)
at /mnt/code/us-qemu/chardev/char-socket.c:441
#7 0x7fc240997d7a in g_main_dispatch (context=0x7fc24beda950) at
gmain.c:3152
#8 g_main_context_dispatch (context=context@entry=0x7fc24beda950) at
gmain.c:3767
#9 0x7fc24a22fe7c in glib_pollfds_poll () at
/mnt/code/us-qemu/util/main-loop.c:213
#10 os_host_main_loop_wait (timeout=) at
/mnt/code/us-qemu/util/main-loop.c:261
#11 main_loop_wait (nonblocking=nonblocking@entry=0) at
/mnt/code/us-qemu/util/main-loop.c:517
#12 0x7fc249e9a19a in main_loop () at /mnt/code/us-qemu/vl.c:1900
#13 main (argc=, argv=, envp=) at /mnt/code/us-qemu/vl.c:4732

i.e. hotswap the device that is in tcp_chr_read (frame #6), and can
potentially be accessed (with old pointer on a stack) after it is
destroyed by qmp_chardev_change()

however, there is no support for monitor hotswap in the patchset. Maybe
it's not worth to mess with bottom-halves at all? It causes problems,
like those mentioned in your further comments




Signed-off-by: Anton Nefedov 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 chardev/char.c| 147
++
 include/sysemu/char.h |  10 
 qapi-schema.json  |  40 ++
 3 files changed, 187 insertions(+), 10 deletions(-)

-bool qemu_chr_fe_init(CharBackend *b, Chardev *s, Error **errp)
+static bool fe_connect(CharBackend *b, Chardev *s, Error **errp)
 {



I would rather keep a qemu_chr prefix for consistency



Done.




 int tag = 0;

@@ -507,6 +516,17 @@ unavailable:
 return false;
 }

+bool qemu_chr_fe_init(CharBackend *b, Chardev *s, Error **errp)
+{
+if (!fe_connect(b, s, errp)) {
+return false;
+}
+
+qemu_mutex_init(&b->chr_lock);
+b->hotswap_bh = NULL;
+return true;
+}
+
 static bool qemu_chr_is_busy(Chardev *s)
 {
 if (CHARDEV_IS_MUX(s)) {
@@ -531,6 +551,10 @@ void qem

Re: [Qemu-devel] [PATCH v1 04/13] qcow2: preallocation at image expand

2017-05-24 Thread Anton Nefedov



On 05/22/2017 10:29 PM, Eric Blake wrote:

On 05/19/2017 04:34 AM, Anton Nefedov wrote:

From: "Denis V. Lunev" 

This patch adds image preallocation at expand to provide better locality
of QCOW2 image file and optimize this procedure for some distributed
storages where this procedure is slow.

Image expand requests have to be suspended until the allocation is
performed which is done via special QCowL2Meta.
This meta is invisible to handle_dependencies() code.
This is the main reason for also calling preallocation before metadata
write: it might intersect with preallocation triggered by another IO,
and has to yield


How does this interact with Max's work on preallocated truncate?
https://lists.gnu.org/archive/html/qemu-devel/2017-04/msg00267.html



Seems like Max's patchset makes it possible to manually (with qemu-img)
allocate a continuous guest address space in advance?

Preallocation in this series is a little bit different:
- it's in-flight, and it fallocates (using drv->write_zeroes) space
ahead in the underlying image as qcow2 writes beyond EOF.
- it doesn't link L2. So it leaves the image 'sparse' on qcow2 level.

If you ask me, these 2 patchsets are not contradictory, and probably
won't even merge-conflict :)

/Anton




Re: [Qemu-devel] [PATCH 7/7] qcow2: Merge the writing of the COW regions with the guest data

2017-05-24 Thread Anton Nefedov

If the guest tries to write data that results on the allocation of a
new cluster, instead of writing the guest data first and then the data
from the COW regions, write everything together using one single I/O
operation.

This can improve the write performance by 25% or more, depending on
several factors such as the media type, the cluster size and the I/O
request size.

Signed-off-by: Alberto Garcia 
---
 block/qcow2-cluster.c | 34 ++
 block/qcow2.c | 58
++-
 block/qcow2.h |  7 +++
 3 files changed, 80 insertions(+), 19 deletions(-)

-/* And now we can write everything */
-qemu_iovec_reset(&qiov);
-qemu_iovec_add(&qiov, start_buffer, start->nb_bytes);
-ret = do_perform_cow_write(bs, m->alloc_offset, start->offset, &qiov);
-if (ret < 0) {
-goto fail;
+/* And now we can write everything. If we have the guest data we
+ * can write everything in one single operation */
+if (m->data_qiov) {
+qemu_iovec_reset(&qiov);
+qemu_iovec_add(&qiov, start_buffer, start->nb_bytes);
+qemu_iovec_concat(&qiov, m->data_qiov, 0, data_bytes);
+qemu_iovec_add(&qiov, end_buffer, end->nb_bytes);


Can it be a problem if (m->data_qiov->niov == IOV_MAX)?
We had to add merge-iovecs code for the case (maybe there's better
solution?)

Also, will this work if allocation is split into several l2metas?
e.g. one has cow_start.nb_bytes and another has cow_end.nb_bytes

/Anton



Re: [Qemu-devel] [PATCH 0/7] qcow2: Reduce the number of I/O ops when doing COW

2017-05-24 Thread Anton Nefedov



On 05/24/2017 07:20 PM, Alberto Garcia wrote:

On Wed 24 May 2017 06:09:42 PM CEST, Anton Nefedov wrote:


I agree; as mentioned we have similar patches and they don't conflict
much.  We noticed a performance regression on HDD though, for the
presumably optimized case (random 4k write over a large backed image);
so the patches were put on hold.


Interesting, I think that scenario was noticeably faster in my
tests. What cluster size(s) and image size(s) were you using?


64k cluster, 2g image, write 32m in portions of 4k at random offsets

/Anton



Re: [Qemu-devel] [PATCH 0/7] qcow2: Reduce the number of I/O ops when doing COW

2017-05-24 Thread Anton Nefedov


On 05/24/2017 05:20 PM, Alberto Garcia wrote:

On Tue 23 May 2017 04:36:52 PM CEST, Eric Blake wrote:


here's a patch series that rewrites the copy-on-write code in the
qcow2 driver to reduce the number of I/O operations.


And it competes with Denis and Anton's patches:
https://lists.gnu.org/archive/html/qemu-devel/2017-05/msg04547.html

What plan of attack should we take on merging the best parts of these
two series?


I took a look at that series and unless I overlooked something important
it seems that there's actually not that much overlap. Denis and Anton's
series deals with cluster preallocation and mine reduces the number of
I/O operations when there's a COW scenario. Most of my modifications are
in the peform_cow() function, but they barely touch that one.

I think we can review both separately, either of us can rebase our
series on top of the other, I don't expect big changes or conflicts.

Berto



I agree; as mentioned we have similar patches and they don't conflict much.
We noticed a performance regression on HDD though, for the presumably 
optimized case (random 4k write over a large backed image); so the 
patches were put on hold.

SSD looked fine though.

/Anton



Re: [Qemu-devel] [PATCH v1 02/13] qcow2: is_zero_sectors(): return true if area is outside of backing file

2017-05-23 Thread Anton Nefedov



On 05/22/2017 10:14 PM, Eric Blake wrote:

On 05/22/2017 02:12 PM, Eric Blake wrote:


+++ b/block/qcow2.c
@@ -2482,7 +2482,7 @@ static bool is_zero_sectors(BlockDriverState *bs, int64_t 
start,
 int64_t res;

 if (start + count > bs->total_sectors) {
-count = bs->total_sectors - start;
+count = start < bs->total_sectors ? bs->total_sectors - start : 0;
 }

 if (!count) {
@@ -2490,7 +2490,9 @@ static bool is_zero_sectors(BlockDriverState *bs, int64_t 
start,
 }
 res = bdrv_get_block_status_above(bs, NULL, start, count,
   &nr, &file);
-return res >= 0 && (res & BDRV_BLOCK_ZERO) && nr == count;
+return res >= 0
+&& (((res & BDRV_BLOCK_ZERO) && nr == count)
+|| nr == 0);


The logic makes sense to me, although the formatting is unusual (we tend
to split && and || with the operator at the tail of the previous line,
not the head of the new line).

This quick check may make me revisit whether it is worth my my RFC
series about adding BDRV_BLOCK_EOF for more quickly tracking reads
beyond EOF (my solution in that series was able to make the same change
to test 154, but by doing it at the block layer instead of the qcow2.c
code).  https://lists.gnu.org/archive/html/qemu-devel/2017-05/msg01127.html


Actually, re-reading my RFC - I was able to change 6 instances in test
154, while your tweak only affected 2 instances (you still left four
instances that were allocating).  So my approach may still be more
optimal in the long run.



Yes, looks like your approach is more general; let's drop this one then

/Anton



Re: [Qemu-devel] [PATCH v1 03/13] qcow2: do not COW the empty areas

2017-05-23 Thread Anton Nefedov



On 05/22/2017 10:24 PM, Eric Blake wrote:

On 05/19/2017 04:34 AM, Anton Nefedov wrote:

If COW area of the newly allocated cluster is zeroes, there is no reason
to write zero sectors in perform_cow() again now as whole clusters are
zeroed out in single chunks by handle_alloc_space().


But that's only true if you can guarantee that handle_alloc_space()
succeeded at ensuring the cluster reads as zeroes.  If you silently
ignore errors (which is what patch 1/13 does), you risk assuming that
the cluster reads as zeroes when in reality it does not, and then you
have corrupted data.



Sure; COW is only skipped if pwrite_zeroes() from patch 1/13
succeeds


The idea of avoiding a COW of areas that read as zero at the source when
the destination also already reads as zeroes makes sense, but I'm not
convinced that this patch is safe as written.



/Anton



Re: [Qemu-devel] [PATCH v1 01/13] qcow2: alloc space for COW in one chunk

2017-05-23 Thread Anton Nefedov



On 05/22/2017 10:00 PM, Eric Blake wrote:

On 05/19/2017 04:34 AM, Anton Nefedov wrote:

From: "Denis V. Lunev" 

Currently each single write operation can result in 3 write operations
if guest offsets are not cluster aligned. One write is performed for the
real payload and two for COW-ed areas. Thus the data possibly lays
non-contiguously on the host filesystem. This will reduce further
sequential read performance significantly.

The patch allocates the space in the file with cluster granularity,
ensuring
  1. better host offset locality
  2. less space allocation operations
 (which can be expensive on distributed storages)


s/storages/storage/



Done.



Signed-off-by: Denis V. Lunev 
Signed-off-by: Anton Nefedov 
---
 block/qcow2.c | 32 +++-
 1 file changed, 31 insertions(+), 1 deletion(-)




diff --git a/block/qcow2.c b/block/qcow2.c
index a8d61f0..2e6a0ec 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1575,6 +1575,32 @@ fail:
 return ret;
 }

+static void handle_alloc_space(BlockDriverState *bs, QCowL2Meta *l2meta)
+{
+BDRVQcow2State *s = bs->opaque;
+BlockDriverState *file = bs->file->bs;
+QCowL2Meta *m;
+int ret;
+
+for (m = l2meta; m != NULL; m = m->next) {
+uint64_t bytes = m->nb_clusters << s->cluster_bits;
+
+if (m->cow_start.nb_bytes == 0 && m->cow_end.nb_bytes == 0) {
+continue;
+}
+
+/* try to alloc host space in one chunk for better locality */
+ret = file->drv->bdrv_co_pwrite_zeroes(file, m->alloc_offset, bytes, 
0);


Are we guaranteed that this is a fast operation?  (That is, it either
results in a hole or an error, and doesn't waste time tediously writing
actual zeroes)



well, block_int.h reads:

/*
* Efficiently zero a region of the disk image. Typically an image format
* would use a compact metadata representation to implement this. This
* function pointer may be NULL or return -ENOSUP and .bdrv_co_writev()
* will be called instead.
*/
int coroutine_fn (*bdrv_co_pwrite_zeroes)(BlockDriverState *bs,
int64_t offset, int count, BdrvRequestFlags flags);


(and that's why the driver function is used directly, bypassing the 
'safe' bdrv interface that would try to write zeroes no matter the cost)


As far as I checked the drivers mostly follow the idea


+
+if (ret != 0) {
+continue;
+}


Supposing we are using a file system that doesn't support holes, then
ret will not be zero, and we ended up not allocating anything after all.
 Is that a problem that we are just blindly continuing the loop as our
reaction to the error?

/reads further

I guess not - you aren't reacting to any error call, but merely using
the side effect that an allocation happened for speed when it worked,
and ignoring failure (you get the old behavior of the write() now
causing the allocation) when it didn't.



yes, exactly


+
+file->total_sectors = MAX(file->total_sectors,
+  (m->alloc_offset + bytes) / 
BDRV_SECTOR_SIZE);
+}
+}
+
 static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset,
  uint64_t bytes, QEMUIOVector *qiov,
  int flags)
@@ -1656,8 +1682,12 @@ static coroutine_fn int 
qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset,
 if (ret < 0) {
 goto fail;
 }
-
 qemu_co_mutex_unlock(&s->lock);
+
+if (bs->file->bs->drv->bdrv_co_pwrite_zeroes != NULL) {
+handle_alloc_space(bs, l2meta);
+}


Is it really a good idea to be modifying the underlying protocol image
outside of the mutex?



as far as I understand, qcow2 usually modifies the underlying image
outside of the mutex? I guess it's qcow2 metadata that we wouldn't want
to touch unlocked


At any rate, it looks like your patch is doing a best-effort write
zeroes as an attempt to trigger consecutive allocation of the entire
cluster in the underlying protocol right after a cluster has been
allocated at the qcow2 format layer.  Which means there are more
syscalls now than there were previously, but now when we do three
write() calls at offsets B, A, C, those three calls are into file space
that was allocated earlier by the write zeroes, rather than fresh calls
into unallocated space that is likely to trigger up to three disjoint
allocations.

As a discussion point, wouldn't we achieve the same effect of less
fragmentation if we instead collect our data into a bounce buffer, and
only then do a single write() (or more likely, a writev() where the iov
is set up to reconstruct a single buffer on the syscall, but where the
source data is still at different offsets)?  We'd be avoiding the extra
syscalls of pre-allocating the cluster, and while our write() call is
still c

[Qemu-devel] [PATCH v2 7/9] serial: move TIOCM update to a separate function

2017-05-19 Thread Anton Nefedov
will be used by the following patch

Signed-off-by: Anton Nefedov 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
CC: Michael S. Tsirkin 
CC: Paolo Bonzini 
---
 hw/char/serial.c | 31 +++
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/hw/char/serial.c b/hw/char/serial.c
index d8d34d0..1e6bdeb 100644
--- a/hw/char/serial.c
+++ b/hw/char/serial.c
@@ -312,6 +312,24 @@ static void serial_write_fcr(SerialState *s, uint8_t val)
 }
 }
 
+static void serial_update_tiocm(SerialState *s)
+{
+int flags;
+
+qemu_chr_fe_ioctl(&s->chr, CHR_IOCTL_SERIAL_GET_TIOCM, &flags);
+
+flags &= ~(CHR_TIOCM_RTS | CHR_TIOCM_DTR);
+
+if (s->mcr & UART_MCR_RTS) {
+flags |= CHR_TIOCM_RTS;
+}
+if (s->mcr & UART_MCR_DTR) {
+flags |= CHR_TIOCM_DTR;
+}
+
+qemu_chr_fe_ioctl(&s->chr, CHR_IOCTL_SERIAL_SET_TIOCM, &flags);
+}
+
 static void serial_ioport_write(void *opaque, hwaddr addr, uint64_t val,
 unsigned size)
 {
@@ -426,24 +444,13 @@ static void serial_ioport_write(void *opaque, hwaddr 
addr, uint64_t val,
 break;
 case 4:
 {
-int flags;
 int old_mcr = s->mcr;
 s->mcr = val & 0x1f;
 if (val & UART_MCR_LOOP)
 break;
 
 if (s->poll_msl >= 0 && old_mcr != s->mcr) {
-
-qemu_chr_fe_ioctl(&s->chr, CHR_IOCTL_SERIAL_GET_TIOCM, &flags);
-
-flags &= ~(CHR_TIOCM_RTS | CHR_TIOCM_DTR);
-
-if (val & UART_MCR_RTS)
-flags |= CHR_TIOCM_RTS;
-if (val & UART_MCR_DTR)
-flags |= CHR_TIOCM_DTR;
-
-qemu_chr_fe_ioctl(&s->chr, CHR_IOCTL_SERIAL_SET_TIOCM, &flags);
+serial_update_tiocm(s);
 /* Update the modem status after a one-character-send 
wait-time, since there may be a response
from the device/computer at the other end of the serial 
line */
 timer_mod(s->modem_status_poll, 
qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + s->char_transmit_time);
-- 
2.7.4




[Qemu-devel] [PATCH v2 9/9] char: avoid chardevice direct access

2017-05-19 Thread Anton Nefedov
frontends should avoid accessing CharDriver struct where possible

Signed-off-by: Anton Nefedov 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 chardev/char.c  | 5 +
 gdbstub.c   | 2 +-
 hw/arm/strongarm.c  | 2 +-
 hw/char/cadence_uart.c  | 2 +-
 hw/char/debugcon.c  | 2 +-
 hw/char/escc.c  | 6 +++---
 hw/char/exynos4210_uart.c   | 2 +-
 hw/char/grlib_apbuart.c | 2 +-
 hw/char/ipoctal232.c| 2 +-
 hw/char/parallel.c  | 2 +-
 hw/char/sclpconsole-lm.c| 2 +-
 hw/char/sclpconsole.c   | 2 +-
 hw/char/sh_serial.c | 2 +-
 hw/char/spapr_vty.c | 2 +-
 hw/char/terminal3270.c  | 2 +-
 hw/char/xen_console.c   | 2 +-
 hw/ipmi/ipmi_bmc_extern.c   | 2 +-
 hw/misc/ivshmem.c   | 4 ++--
 hw/usb/ccid-card-passthru.c | 4 ++--
 hw/usb/dev-serial.c | 5 ++---
 hw/usb/redirect.c   | 5 ++---
 include/sysemu/char.h   | 7 +++
 net/filter-mirror.c | 2 +-
 23 files changed, 39 insertions(+), 29 deletions(-)

diff --git a/chardev/char.c b/chardev/char.c
index 36d6f36..e43d840 100644
--- a/chardev/char.c
+++ b/chardev/char.c
@@ -494,6 +494,11 @@ bool qemu_chr_fe_backend_connected(CharBackend *be)
 return !!be->chr;
 }
 
+bool qemu_chr_fe_backend_open(CharBackend *be)
+{
+return be->chr && be->chr->be_open;
+}
+
 static bool fe_connect(CharBackend *b, Chardev *s, Error **errp)
 {
 int tag = 0;
diff --git a/gdbstub.c b/gdbstub.c
index 1ac0489..68cbe8a 100644
--- a/gdbstub.c
+++ b/gdbstub.c
@@ -2001,7 +2001,7 @@ int gdbserver_start(const char *device)
NULL, &error_abort);
 monitor_init(mon_chr, 0);
 } else {
-if (qemu_chr_fe_get_driver(&s->chr)) {
+if (qemu_chr_fe_backend_connected(&s->chr)) {
 object_unparent(OBJECT(qemu_chr_fe_get_driver(&s->chr)));
 }
 mon_chr = s->mon_chr;
diff --git a/hw/arm/strongarm.c b/hw/arm/strongarm.c
index bec093d..9d7cf21 100644
--- a/hw/arm/strongarm.c
+++ b/hw/arm/strongarm.c
@@ -1105,7 +1105,7 @@ static void strongarm_uart_tx(void *opaque)
 
 if (s->utcr3 & UTCR3_LBM) /* loopback */ {
 strongarm_uart_receive(s, &s->tx_fifo[s->tx_start], 1);
-} else if (qemu_chr_fe_get_driver(&s->chr)) {
+} else if (qemu_chr_fe_backend_connected(&s->chr)) {
 /* XXX this blocks entire thread. Rewrite to use
  * qemu_chr_fe_write and background I/O callbacks */
 qemu_chr_fe_write_all(&s->chr, &s->tx_fifo[s->tx_start], 1);
diff --git a/hw/char/cadence_uart.c b/hw/char/cadence_uart.c
index 71867b3..19636c0 100644
--- a/hw/char/cadence_uart.c
+++ b/hw/char/cadence_uart.c
@@ -278,7 +278,7 @@ static gboolean cadence_uart_xmit(GIOChannel *chan, 
GIOCondition cond,
 int ret;
 
 /* instant drain the fifo when there's no back-end */
-if (!qemu_chr_fe_get_driver(&s->chr)) {
+if (!qemu_chr_fe_backend_connected(&s->chr)) {
 s->tx_count = 0;
 return FALSE;
 }
diff --git a/hw/char/debugcon.c b/hw/char/debugcon.c
index 6d95297..bd0d4f0 100644
--- a/hw/char/debugcon.c
+++ b/hw/char/debugcon.c
@@ -87,7 +87,7 @@ static const MemoryRegionOps debugcon_ops = {
 
 static void debugcon_realize_core(DebugconState *s, Error **errp)
 {
-if (!qemu_chr_fe_get_driver(&s->chr)) {
+if (!qemu_chr_fe_backend_connected(&s->chr)) {
 error_setg(errp, "Can't create debugcon device, empty char device");
 return;
 }
diff --git a/hw/char/escc.c b/hw/char/escc.c
index aa882b6..dbbeb4a 100644
--- a/hw/char/escc.c
+++ b/hw/char/escc.c
@@ -416,7 +416,7 @@ static void escc_update_parameters(ChannelState *s)
 int speed, parity, data_bits, stop_bits;
 QEMUSerialSetParams ssp;
 
-if (!qemu_chr_fe_get_driver(&s->chr) || s->type != ser)
+if (!qemu_chr_fe_backend_connected(&s->chr) || s->type != ser)
 return;
 
 if (s->wregs[W_TXCTRL1] & TXCTRL1_PAREN) {
@@ -556,7 +556,7 @@ static void escc_mem_write(void *opaque, hwaddr addr,
 trace_escc_mem_writeb_data(CHN_C(s), val);
 s->tx = val;
 if (s->wregs[W_TXCTRL2] & TXCTRL2_TXEN) { // tx enabled
-if (qemu_chr_fe_get_driver(&s->chr)) {
+if (qemu_chr_fe_backend_connected(&s->chr)) {
 /* XXX this blocks entire thread. Rewrite to use
  * qemu_chr_fe_write and background I/O callbacks */
 qemu_chr_fe_write_all(&s->chr, &s->tx, 1);
@@ -1012,7 +1012,7 @@ static void escc_realize(DeviceState *dev, Error **errp)
   ESCC_SIZE << s->it_shift);
 
 for (i = 0; i < 2; i++) {
-if (qemu_chr_fe_get_driver(&s->chn[i].chr)) {
+if (qemu_chr_fe_backend_connected(&s->chn[i].chr)) {

[Qemu-devel] [PATCH v2 4/9] hmp: add hmp analogue for qmp-chardev-change

2017-05-19 Thread Anton Nefedov
Signed-off-by: Anton Nefedov 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
CC: Dr. David Alan Gilbert 
---
 chardev/char.c|  4 ++--
 hmp-commands.hx   | 16 
 hmp.c | 34 ++
 hmp.h |  1 +
 include/sysemu/char.h | 12 
 5 files changed, 65 insertions(+), 2 deletions(-)

diff --git a/chardev/char.c b/chardev/char.c
index bac5e1c..0483f19 100644
--- a/chardev/char.c
+++ b/chardev/char.c
@@ -880,8 +880,8 @@ help_string_append(const char *name, void *opaque)
 g_string_append_printf(str, "\n%s", name);
 }
 
-static ChardevBackend *qemu_chr_parse_opts(QemuOpts *opts,
-   Error **errp)
+ChardevBackend *qemu_chr_parse_opts(QemuOpts *opts,
+Error **errp)
 {
 Error *local_err = NULL;
 const ChardevClass *cc;
diff --git a/hmp-commands.hx b/hmp-commands.hx
index 0aca984..0f2a059 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -1727,6 +1727,22 @@ chardev_add accepts the same parameters as the -chardev 
command line switch.
 ETEXI
 
 {
+.name   = "chardev-change",
+.args_type  = "id:s,args:s",
+.params = "id args",
+.help   = "change chardev",
+.cmd= hmp_chardev_change,
+},
+
+STEXI
+@item chardev-change args
+@findex chardev-change
+chardev_change accepts existing chardev @var{id} and then the same arguments
+as the -chardev command line switch (except for "id").
+
+ETEXI
+
+{
 .name   = "chardev-remove",
 .args_type  = "id:s",
 .params = "id",
diff --git a/hmp.c b/hmp.c
index 3dceaf8..f7d0b38 100644
--- a/hmp.c
+++ b/hmp.c
@@ -2209,6 +2209,40 @@ void hmp_chardev_add(Monitor *mon, const QDict *qdict)
 hmp_handle_error(mon, &err);
 }
 
+void hmp_chardev_change(Monitor *mon, const QDict *qdict)
+{
+const char *args = qdict_get_str(qdict, "args");
+const char *id;
+Error *err = NULL;
+ChardevBackend *backend = NULL;
+ChardevReturn *ret = NULL;
+QemuOpts *opts = qemu_opts_parse_noisily(qemu_find_opts("chardev"), args,
+ true);
+if (!opts) {
+error_setg(&err, "Parsing chardev args failed");
+goto end;
+}
+
+id = qdict_get_str(qdict, "id");
+if (qemu_opts_id(opts)) {
+error_setg(&err, "Unexpected 'id' parameter");
+goto end;
+}
+
+backend = qemu_chr_parse_opts(opts, &err);
+if (!backend) {
+goto end;
+}
+
+ret = qmp_chardev_change(id, backend, &err);
+
+end:
+qapi_free_ChardevReturn(ret);
+qapi_free_ChardevBackend(backend);
+qemu_opts_del(opts);
+hmp_handle_error(mon, &err);
+}
+
 void hmp_chardev_remove(Monitor *mon, const QDict *qdict)
 {
 Error *local_err = NULL;
diff --git a/hmp.h b/hmp.h
index d8b94ce..23e035c 100644
--- a/hmp.h
+++ b/hmp.h
@@ -102,6 +102,7 @@ void hmp_nbd_server_start(Monitor *mon, const QDict *qdict);
 void hmp_nbd_server_add(Monitor *mon, const QDict *qdict);
 void hmp_nbd_server_stop(Monitor *mon, const QDict *qdict);
 void hmp_chardev_add(Monitor *mon, const QDict *qdict);
+void hmp_chardev_change(Monitor *mon, const QDict *qdict);
 void hmp_chardev_remove(Monitor *mon, const QDict *qdict);
 void hmp_qemu_io(Monitor *mon, const QDict *qdict);
 void hmp_cpu_add(Monitor *mon, const QDict *qdict);
diff --git a/include/sysemu/char.h b/include/sysemu/char.h
index 68c7876..92ae57e 100644
--- a/include/sysemu/char.h
+++ b/include/sysemu/char.h
@@ -132,6 +132,18 @@ Chardev *qemu_chr_new_from_opts(QemuOpts *opts,
 void qemu_chr_parse_common(QemuOpts *opts, ChardevCommon *backend);
 
 /**
+ * @qemu_chr_parse_opts:
+ *
+ * Parse the options to the ChardevBackend struct.
+ *
+ * @opts
+ *
+ * Returns: a new backend
+ */
+ChardevBackend *qemu_chr_parse_opts(QemuOpts *opts,
+Error **errp);
+
+/**
  * @qemu_chr_new:
  *
  * Create a new character backend from a URI.
-- 
2.7.4




[Qemu-devel] [PATCH v2 6/9] virtio-console: chardev hotswap support

2017-05-19 Thread Anton Nefedov
Signed-off-by: Anton Nefedov 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
CC: Amit Shah 
---
 hw/char/virtio-console.c | 35 ++-
 1 file changed, 30 insertions(+), 5 deletions(-)

diff --git a/hw/char/virtio-console.c b/hw/char/virtio-console.c
index cf7331d..bd74669 100644
--- a/hw/char/virtio-console.c
+++ b/hw/char/virtio-console.c
@@ -49,7 +49,7 @@ static ssize_t flush_buf(VirtIOSerialPort *port,
 VirtConsole *vcon = VIRTIO_CONSOLE(port);
 ssize_t ret;
 
-if (!qemu_chr_fe_get_driver(&vcon->chr)) {
+if (!qemu_chr_fe_backend_connected(&vcon->chr)) {
 /* If there's no backend, we can just say we consumed all data. */
 return len;
 }
@@ -163,12 +163,35 @@ static void chr_event(void *opaque, int event)
 }
 }
 
+static int chr_be_change(void *opaque)
+{
+VirtConsole *vcon = opaque;
+VirtIOSerialPort *port = VIRTIO_SERIAL_PORT(vcon);
+VirtIOSerialPortClass *k = VIRTIO_SERIAL_PORT_GET_CLASS(port);
+
+if (k->is_console) {
+qemu_chr_fe_set_handlers(&vcon->chr, chr_can_read, chr_read,
+ NULL, chr_be_change, vcon, NULL, true);
+} else {
+qemu_chr_fe_set_handlers(&vcon->chr, chr_can_read, chr_read,
+ chr_event, chr_be_change, vcon, NULL, false);
+}
+
+if (vcon->watch) {
+g_source_remove(vcon->watch);
+vcon->watch = qemu_chr_fe_add_watch(&vcon->chr,
+G_IO_OUT | G_IO_HUP,
+chr_write_unblocked, vcon);
+}
+
+return 0;
+}
+
 static void virtconsole_realize(DeviceState *dev, Error **errp)
 {
 VirtIOSerialPort *port = VIRTIO_SERIAL_PORT(dev);
 VirtConsole *vcon = VIRTIO_CONSOLE(dev);
 VirtIOSerialPortClass *k = VIRTIO_SERIAL_PORT_GET_CLASS(dev);
-Chardev *chr = qemu_chr_fe_get_driver(&vcon->chr);
 
 if (port->id == 0 && !k->is_console) {
 error_setg(errp, "Port number 0 on virtio-serial devices reserved "
@@ -176,7 +199,7 @@ static void virtconsole_realize(DeviceState *dev, Error 
**errp)
 return;
 }
 
-if (chr) {
+if (qemu_chr_fe_backend_connected(&vcon->chr)) {
 /*
  * For consoles we don't block guest data transfer just
  * because nothing is connected - we'll just let it go
@@ -188,11 +211,13 @@ static void virtconsole_realize(DeviceState *dev, Error 
**errp)
  */
 if (k->is_console) {
 qemu_chr_fe_set_handlers(&vcon->chr, chr_can_read, chr_read,
- NULL, NULL, vcon, NULL, true);
+ NULL, chr_be_change,
+ vcon, NULL, true);
 virtio_serial_open(port);
 } else {
 qemu_chr_fe_set_handlers(&vcon->chr, chr_can_read, chr_read,
- chr_event, NULL, vcon, NULL, false);
+ chr_event, chr_be_change,
+ vcon, NULL, false);
 }
 }
 }
-- 
2.7.4




[Qemu-devel] [PATCH v2 2/9] char: add backend hotswap handler

2017-05-19 Thread Anton Nefedov
Frontends should have an interface to setup the handler of a backend change.
The interface will be used in the next commits

Signed-off-by: Anton Nefedov 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 backends/rng-egd.c  |  2 +-
 chardev/char-mux.c  |  1 +
 chardev/char.c  |  4 +++-
 gdbstub.c   |  2 +-
 hw/arm/pxa2xx.c |  3 ++-
 hw/arm/strongarm.c  |  2 +-
 hw/char/bcm2835_aux.c   |  2 +-
 hw/char/cadence_uart.c  |  2 +-
 hw/char/debugcon.c  |  2 +-
 hw/char/digic-uart.c|  2 +-
 hw/char/escc.c  |  2 +-
 hw/char/etraxfs_ser.c   |  2 +-
 hw/char/exynos4210_uart.c   |  2 +-
 hw/char/grlib_apbuart.c |  2 +-
 hw/char/imx_serial.c|  2 +-
 hw/char/ipoctal232.c|  2 +-
 hw/char/lm32_juart.c|  2 +-
 hw/char/lm32_uart.c |  2 +-
 hw/char/mcf_uart.c  |  2 +-
 hw/char/milkymist-uart.c|  2 +-
 hw/char/pl011.c |  2 +-
 hw/char/sclpconsole-lm.c|  2 +-
 hw/char/sclpconsole.c   |  2 +-
 hw/char/serial.c|  2 +-
 hw/char/sh_serial.c |  2 +-
 hw/char/spapr_vty.c |  2 +-
 hw/char/stm32f2xx_usart.c   |  3 ++-
 hw/char/terminal3270.c  |  2 +-
 hw/char/virtio-console.c|  4 ++--
 hw/char/xen_console.c   |  2 +-
 hw/char/xilinx_uartlite.c   |  2 +-
 hw/ipmi/ipmi_bmc_extern.c   |  2 +-
 hw/mips/boston.c|  2 +-
 hw/mips/mips_malta.c|  2 +-
 hw/misc/ivshmem.c   |  2 +-
 hw/usb/ccid-card-passthru.c |  2 +-
 hw/usb/dev-serial.c |  2 +-
 hw/usb/redirect.c   |  2 +-
 include/sysemu/char.h   |  5 +
 monitor.c   |  4 ++--
 net/colo-compare.c  | 14 --
 net/filter-mirror.c |  6 +++---
 net/slirp.c |  2 +-
 net/vhost-user.c|  7 ---
 qtest.c |  2 +-
 tests/test-char.c   | 14 ++
 tests/vhost-user-test.c |  2 +-
 47 files changed, 78 insertions(+), 59 deletions(-)

diff --git a/backends/rng-egd.c b/backends/rng-egd.c
index 380b19a..0b0e945 100644
--- a/backends/rng-egd.c
+++ b/backends/rng-egd.c
@@ -106,7 +106,7 @@ static void rng_egd_opened(RngBackend *b, Error **errp)
 
 /* FIXME we should resubmit pending requests when the CDS reconnects. */
 qemu_chr_fe_set_handlers(&s->chr, rng_egd_chr_can_read,
- rng_egd_chr_read, NULL, s, NULL, true);
+ rng_egd_chr_read, NULL, NULL, s, NULL, true);
 }
 
 static void rng_egd_set_chardev(Object *obj, const char *value, Error **errp)
diff --git a/chardev/char-mux.c b/chardev/char-mux.c
index 37d42c6..5849ea5 100644
--- a/chardev/char-mux.c
+++ b/chardev/char-mux.c
@@ -278,6 +278,7 @@ void mux_chr_set_handlers(Chardev *chr, GMainContext 
*context)
  mux_chr_can_read,
  mux_chr_read,
  mux_chr_event,
+ NULL,
  chr,
  context, true);
 }
diff --git a/chardev/char.c b/chardev/char.c
index 684cccd..ae60950 100644
--- a/chardev/char.c
+++ b/chardev/char.c
@@ -522,7 +522,7 @@ void qemu_chr_fe_deinit(CharBackend *b)
 assert(b);
 
 if (b->chr) {
-qemu_chr_fe_set_handlers(b, NULL, NULL, NULL, NULL, NULL, true);
+qemu_chr_fe_set_handlers(b, NULL, NULL, NULL, NULL, NULL, NULL, true);
 if (b->chr->be == b) {
 b->chr->be = NULL;
 }
@@ -538,6 +538,7 @@ void qemu_chr_fe_set_handlers(CharBackend *b,
   IOCanReadHandler *fd_can_read,
   IOReadHandler *fd_read,
   IOEventHandler *fd_event,
+  BackendChangeHandler *be_change,
   void *opaque,
   GMainContext *context,
   bool set_open)
@@ -561,6 +562,7 @@ void qemu_chr_fe_set_handlers(CharBackend *b,
 b->chr_can_read = fd_can_read;
 b->chr_read = fd_read;
 b->chr_event = fd_event;
+b->chr_be_change = be_change;
 b->opaque = opaque;
 if (cc->chr_update_read_handler) {
 cc->chr_update_read_handler(s, context);
diff --git a/gdbstub.c b/gdbstub.c
index 86eed4f..1ac0489 100644
--- a/gdbstub.c
+++ b/gdbstub.c
@@ -2013,7 +2013,7 @@ int gdbserver_start(const char *device)
 if (chr) {
 qemu_chr_fe_init(&s->chr, chr, &error_abort);
 qemu_chr_fe_set_handlers(&s->chr, gdb_chr_can_receive, gdb_chr_receive,
- gdb_chr_event, NULL, NULL, true);
+ gdb_chr_event, NULL, NULL, NULL, true);
 }
 s->state = chr ? RS_IDLE : RS_INACTIVE;
 s->mon_chr = mon_chr;
diff --git a/hw/arm/pxa2xx.c b/hw/arm/pxa2xx.c
index eea551d..3e51882 100644
--- a/hw/a

[Qemu-devel] [PATCH v2 3/9] char: chardevice hotswap

2017-05-19 Thread Anton Nefedov
This patch adds a possibility to change a char device without a frontend
removal.

1. Ideally, it would have to happen transparently to a frontend, i.e.
frontend would continue its regular operation.
However, backends are not stateless and are set up by the frontends
via qemu_chr_fe_<> functions, and it's not (generally) possible to replay
that setup entirely in a backend code, as different chardevs respond
to the setup calls differently, so do frontends work differently basing
on those setup responses.
Moreover, some frontend can generally get and save the backend pointer
(qemu_chr_fe_get_driver()), and it will become invalid after backend change.

So, a frontend which would like to support chardev hotswap has to register
a "backend change" handler, and redo its backend setup there.

2. Write path can be used by multiple threads and thus protected with
chr_write_lock.
So hotswap also has to be protected so write functions won't access
a backend being replaced.

3. Hotswap function can be called from e.g. a read handler of a monitor
socket. This can cause troubles so it's safer to defer execution to
a bottom-half (however, it means we cannot return some of the errors
synchronously - but most of them we can)

Signed-off-by: Anton Nefedov 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 chardev/char.c| 147 ++
 include/sysemu/char.h |  10 
 qapi-schema.json  |  40 ++
 3 files changed, 187 insertions(+), 10 deletions(-)

diff --git a/chardev/char.c b/chardev/char.c
index ae60950..bac5e1c 100644
--- a/chardev/char.c
+++ b/chardev/char.c
@@ -132,12 +132,16 @@ static bool qemu_chr_replay(Chardev *chr)
 
 int qemu_chr_fe_write(CharBackend *be, const uint8_t *buf, int len)
 {
-Chardev *s = be->chr;
+Chardev *s;
 ChardevClass *cc;
 int ret;
 
+qemu_mutex_lock(&be->chr_lock);
+s = be->chr;
+
 if (!s) {
-return 0;
+ret = 0;
+goto end;
 }
 
 if (qemu_chr_replay(s) && replay_mode == REPLAY_MODE_PLAY) {
@@ -145,7 +149,7 @@ int qemu_chr_fe_write(CharBackend *be, const uint8_t *buf, 
int len)
 replay_char_write_event_load(&ret, &offset);
 assert(offset <= len);
 qemu_chr_fe_write_buffer(s, buf, offset, &offset);
-return ret;
+goto end;
 }
 
 cc = CHARDEV_GET_CLASS(s);
@@ -161,7 +165,9 @@ int qemu_chr_fe_write(CharBackend *be, const uint8_t *buf, 
int len)
 if (qemu_chr_replay(s) && replay_mode == REPLAY_MODE_RECORD) {
 replay_char_write_event_save(ret, ret < 0 ? 0 : ret);
 }
-
+
+end:
+qemu_mutex_unlock(&be->chr_lock);
 return ret;
 }
 
@@ -191,13 +197,16 @@ int qemu_chr_write_all(Chardev *s, const uint8_t *buf, 
int len)
 
 int qemu_chr_fe_write_all(CharBackend *be, const uint8_t *buf, int len)
 {
-Chardev *s = be->chr;
+Chardev *s;
+int ret;
 
-if (!s) {
-return 0;
-}
+qemu_mutex_lock(&be->chr_lock);
 
-return qemu_chr_write_all(s, buf, len);
+s = be->chr;
+ret = s ? qemu_chr_write_all(s, buf, len) : 0;
+
+qemu_mutex_unlock(&be->chr_lock);
+return ret;
 }
 
 int qemu_chr_fe_read_all(CharBackend *be, uint8_t *buf, int len)
@@ -478,7 +487,7 @@ Chardev *qemu_chr_fe_get_driver(CharBackend *be)
 return be->chr;
 }
 
-bool qemu_chr_fe_init(CharBackend *b, Chardev *s, Error **errp)
+static bool fe_connect(CharBackend *b, Chardev *s, Error **errp)
 {
 int tag = 0;
 
@@ -507,6 +516,17 @@ unavailable:
 return false;
 }
 
+bool qemu_chr_fe_init(CharBackend *b, Chardev *s, Error **errp)
+{
+if (!fe_connect(b, s, errp)) {
+return false;
+}
+
+qemu_mutex_init(&b->chr_lock);
+b->hotswap_bh = NULL;
+return true;
+}
+
 static bool qemu_chr_is_busy(Chardev *s)
 {
 if (CHARDEV_IS_MUX(s)) {
@@ -531,6 +551,10 @@ void qemu_chr_fe_deinit(CharBackend *b)
 d->backends[b->tag] = NULL;
 }
 b->chr = NULL;
+qemu_mutex_destroy(&b->chr_lock);
+if (b->hotswap_bh) {
+qemu_bh_delete(b->hotswap_bh);
+}
 }
 }
 
@@ -1308,6 +1332,109 @@ ChardevReturn *qmp_chardev_add(const char *id, 
ChardevBackend *backend,
 return ret;
 }
 
+static void chardev_change_bh(void *opaque)
+{
+Chardev *chr_new = opaque;
+const char *id = chr_new->label;
+Chardev *chr = qemu_chr_find(id);
+CharBackend *be = chr->be;
+bool closed_sent = false;
+
+if (!be) {
+/* disconnected since we checked: ok, less work for us */
+goto end;
+}
+
+if (chr->be_open && !chr_new->be_open) {
+qemu_chr_be_event(chr, CHR_EVENT_CLOSED);
+closed_sent = true;
+}
+
+qemu_mutex_lock(&be->chr_lock);
+chr->be = NULL;
+fe_connect(be, chr_new, &error_abort);
+
+if (be->chr_be_chan

[Qemu-devel] [PATCH v2 1/9] char: move QemuOpts->ChardevBackend translation to a separate func

2017-05-19 Thread Anton Nefedov
Signed-off-by: Anton Nefedov 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 chardev/char.c | 72 +-
 1 file changed, 46 insertions(+), 26 deletions(-)

diff --git a/chardev/char.c b/chardev/char.c
index 4e24dc3..684cccd 100644
--- a/chardev/char.c
+++ b/chardev/char.c
@@ -854,17 +854,14 @@ help_string_append(const char *name, void *opaque)
 g_string_append_printf(str, "\n%s", name);
 }
 
-Chardev *qemu_chr_new_from_opts(QemuOpts *opts,
-Error **errp)
+static ChardevBackend *qemu_chr_parse_opts(QemuOpts *opts,
+   Error **errp)
 {
 Error *local_err = NULL;
 const ChardevClass *cc;
-Chardev *chr;
 int i;
 ChardevBackend *backend = NULL;
 const char *name = qemu_opt_get(opts, "backend");
-const char *id = qemu_opts_id(opts);
-char *bid = NULL;
 
 if (name == NULL) {
 error_setg(errp, "chardev: \"%s\" missing backend",
@@ -872,21 +869,6 @@ Chardev *qemu_chr_new_from_opts(QemuOpts *opts,
 return NULL;
 }
 
-if (is_help_option(name)) {
-GString *str = g_string_new("");
-
-chardev_name_foreach(help_string_append, str);
-
-error_report("Available chardev backend types: %s", str->str);
-g_string_free(str, true);
-exit(0);
-}
-
-if (id == NULL) {
-error_setg(errp, "chardev: no id specified");
-return NULL;
-}
-
 for (i = 0; i < ARRAY_SIZE(chardev_alias_table); i++) {
 if (g_strcmp0(chardev_alias_table[i].alias, name) == 0) {
 name = chardev_alias_table[i].typename;
@@ -902,16 +884,12 @@ Chardev *qemu_chr_new_from_opts(QemuOpts *opts,
 backend = g_new0(ChardevBackend, 1);
 backend->type = CHARDEV_BACKEND_KIND_NULL;
 
-if (qemu_opt_get_bool(opts, "mux", 0)) {
-bid = g_strdup_printf("%s-base", id);
-}
-
-chr = NULL;
 if (cc->parse) {
 cc->parse(opts, backend, &local_err);
 if (local_err) {
 error_propagate(errp, local_err);
-goto out;
+qapi_free_ChardevBackend(backend);
+return NULL;
 }
 } else {
 ChardevCommon *ccom = g_new0(ChardevCommon, 1);
@@ -919,6 +897,48 @@ Chardev *qemu_chr_new_from_opts(QemuOpts *opts,
 backend->u.null.data = ccom; /* Any ChardevCommon member would work */
 }
 
+return backend;
+}
+
+Chardev *qemu_chr_new_from_opts(QemuOpts *opts,
+Error **errp)
+{
+const ChardevClass *cc;
+Chardev *chr = NULL;
+ChardevBackend *backend = NULL;
+const char *name = qemu_opt_get(opts, "backend");
+const char *id = qemu_opts_id(opts);
+char *bid = NULL;
+
+if (name && is_help_option(name)) {
+GString *str = g_string_new("");
+
+chardev_name_foreach(help_string_append, str);
+
+error_report("Available chardev backend types: %s", str->str);
+g_string_free(str, true);
+exit(0);
+}
+
+if (id == NULL) {
+error_setg(errp, "chardev: no id specified");
+return NULL;
+}
+
+backend = qemu_chr_parse_opts(opts, errp);
+if (backend == NULL) {
+return NULL;
+}
+
+cc = char_get_class(name, errp);
+if (cc == NULL) {
+goto out;
+}
+
+if (qemu_opt_get_bool(opts, "mux", 0)) {
+bid = g_strdup_printf("%s-base", id);
+}
+
 chr = qemu_chardev_new(bid ? bid : id,
object_class_get_name(OBJECT_CLASS(cc)),
backend, errp);
-- 
2.7.4




[Qemu-devel] [PATCH v2 8/9] serial: chardev hotswap support

2017-05-19 Thread Anton Nefedov
for a backend change, a number of ioctls has to be replayed to sync
the current setup of a frontend to a backend tty. This is hopefully
enough so we don't have to track, store and replay the whole original
control byte sequence.

Signed-off-by: Anton Nefedov 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
CC: Michael S. Tsirkin 
CC: Paolo Bonzini 
---
 hw/char/serial.c | 32 ++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/hw/char/serial.c b/hw/char/serial.c
index 1e6bdeb..ed01637 100644
--- a/hw/char/serial.c
+++ b/hw/char/serial.c
@@ -891,9 +891,37 @@ static void serial_reset(void *opaque)
 s->msr &= ~UART_MSR_ANY_DELTA;
 }
 
+static int serial_be_change(void *opaque)
+{
+SerialState *s = opaque;
+
+qemu_chr_fe_set_handlers(&s->chr, serial_can_receive1, serial_receive1,
+ serial_event, serial_be_change, s, NULL, true);
+
+serial_update_parameters(s);
+
+qemu_chr_fe_ioctl(&s->chr, CHR_IOCTL_SERIAL_SET_BREAK,
+  &s->last_break_enable);
+
+s->poll_msl = (s->ier & UART_IER_MSI) ? 1 : 0;
+serial_update_msl(s);
+
+if (s->poll_msl >= 0 && !(s->mcr & UART_MCR_LOOP)) {
+serial_update_tiocm(s);
+}
+
+if (s->watch_tag > 0) {
+g_source_remove(s->watch_tag);
+s->watch_tag = qemu_chr_fe_add_watch(&s->chr, G_IO_OUT | G_IO_HUP,
+ serial_watch_cb, s);
+}
+
+return 0;
+}
+
 void serial_realize_core(SerialState *s, Error **errp)
 {
-if (!qemu_chr_fe_get_driver(&s->chr)) {
+if (!qemu_chr_fe_backend_connected(&s->chr)) {
 error_setg(errp, "Can't create serial device, empty char device");
 return;
 }
@@ -904,7 +932,7 @@ void serial_realize_core(SerialState *s, Error **errp)
 qemu_register_reset(serial_reset, s);
 
 qemu_chr_fe_set_handlers(&s->chr, serial_can_receive1, serial_receive1,
- serial_event, NULL, s, NULL, true);
+ serial_event, serial_be_change, s, NULL, true);
 fifo8_create(&s->recv_fifo, UART_FIFO_LENGTH);
 fifo8_create(&s->xmit_fifo, UART_FIFO_LENGTH);
 serial_reset(s);
-- 
2.7.4




[Qemu-devel] [PATCH v2 0/9] chardevice hotswap

2017-05-19 Thread Anton Nefedov
This serie is a v2 of the February submit
http://lists.nongnu.org/archive/html/qemu-devel/2017-02/msg01989.html

The interface is changed as requested and the changes are slightly reworked
and split into separate patches.



The patchset adds support of the character device change without
a frontend device removal.
Yet isa-serial and virtio-serial frontends are supported.

The feature can be helpful for e.g. Windows debug allowing to
establish connection to a live VM from VM with WinDbg.

Anton Nefedov (9):
  char: move QemuOpts->ChardevBackend translation to a separate func
  char: add backend hotswap handler
  char: chardevice hotswap
  hmp: add hmp analogue for qmp-chardev-change
  char: forbid direct chardevice access for hotswap devices
  virtio-console: chardev hotswap support
  serial: move TIOCM update to a separate function
  serial: chardev hotswap support
  char: avoid chardevice direct access

 backends/rng-egd.c  |   2 +-
 chardev/char-mux.c  |   1 +
 chardev/char.c  | 235 +---
 gdbstub.c   |   4 +-
 hmp-commands.hx |  16 +++
 hmp.c   |  34 +++
 hmp.h   |   1 +
 hw/arm/pxa2xx.c |   3 +-
 hw/arm/strongarm.c  |   4 +-
 hw/char/bcm2835_aux.c   |   2 +-
 hw/char/cadence_uart.c  |   4 +-
 hw/char/debugcon.c  |   4 +-
 hw/char/digic-uart.c|   2 +-
 hw/char/escc.c  |   8 +-
 hw/char/etraxfs_ser.c   |   2 +-
 hw/char/exynos4210_uart.c   |   4 +-
 hw/char/grlib_apbuart.c |   4 +-
 hw/char/imx_serial.c|   2 +-
 hw/char/ipoctal232.c|   4 +-
 hw/char/lm32_juart.c|   2 +-
 hw/char/lm32_uart.c |   2 +-
 hw/char/mcf_uart.c  |   2 +-
 hw/char/milkymist-uart.c|   2 +-
 hw/char/parallel.c  |   2 +-
 hw/char/pl011.c |   2 +-
 hw/char/sclpconsole-lm.c|   4 +-
 hw/char/sclpconsole.c   |   4 +-
 hw/char/serial.c|  63 +---
 hw/char/sh_serial.c |   4 +-
 hw/char/spapr_vty.c |   4 +-
 hw/char/stm32f2xx_usart.c   |   3 +-
 hw/char/terminal3270.c  |   4 +-
 hw/char/virtio-console.c|  35 ++-
 hw/char/xen_console.c   |   4 +-
 hw/char/xilinx_uartlite.c   |   2 +-
 hw/ipmi/ipmi_bmc_extern.c   |   4 +-
 hw/mips/boston.c|   2 +-
 hw/mips/mips_malta.c|   2 +-
 hw/misc/ivshmem.c   |   6 +-
 hw/usb/ccid-card-passthru.c |   6 +-
 hw/usb/dev-serial.c |   7 +-
 hw/usb/redirect.c   |   7 +-
 include/sysemu/char.h   |  44 +
 monitor.c   |   4 +-
 net/colo-compare.c  |  14 +--
 net/filter-mirror.c |   8 +-
 net/slirp.c |   2 +-
 net/vhost-user.c|   7 +-
 qapi-schema.json|  40 
 qtest.c |   2 +-
 tests/test-char.c   |  14 ++-
 tests/vhost-user-test.c |   2 +-
 52 files changed, 506 insertions(+), 140 deletions(-)

-- 
2.7.4




[Qemu-devel] [PATCH v2 5/9] char: forbid direct chardevice access for hotswap devices

2017-05-19 Thread Anton Nefedov
qemu_chr_fe_get_driver() is unsafe, frontends with hotswap support
should not access CharDriver ptr directly as CharDriver might change.

Signed-off-by: Anton Nefedov 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 chardev/char.c|  7 +++
 include/sysemu/char.h | 10 ++
 2 files changed, 17 insertions(+)

diff --git a/chardev/char.c b/chardev/char.c
index 0483f19..36d6f36 100644
--- a/chardev/char.c
+++ b/chardev/char.c
@@ -484,9 +484,16 @@ static Notifier muxes_realize_notify = {
 
 Chardev *qemu_chr_fe_get_driver(CharBackend *be)
 {
+/* this is unsafe for the users that support chardev hotswap */
+assert(be->chr_be_change == NULL);
 return be->chr;
 }
 
+bool qemu_chr_fe_backend_connected(CharBackend *be)
+{
+return !!be->chr;
+}
+
 static bool fe_connect(CharBackend *b, Chardev *s, Error **errp)
 {
 int tag = 0;
diff --git a/include/sysemu/char.h b/include/sysemu/char.h
index 92ae57e..fa21535 100644
--- a/include/sysemu/char.h
+++ b/include/sysemu/char.h
@@ -404,10 +404,20 @@ bool qemu_chr_fe_init(CharBackend *b, Chardev *s, Error 
**errp);
  *
  * Returns the driver associated with a CharBackend or NULL if no
  * associated Chardev.
+ * Note: avoid this function as the driver should never be accessed directly,
+ *   especially by the frontends that support chardevice hotswap.
+ *   Consider qemu_chr_fe_backend_connected() to check for driver existence
  */
 Chardev *qemu_chr_fe_get_driver(CharBackend *be);
 
 /**
+ * @qemu_chr_fe_backend_connected:
+ *
+ * Returns true if there is a chardevice associated with @be.
+ */
+bool qemu_chr_fe_backend_connected(CharBackend *be);
+
+/**
  * @qemu_chr_fe_deinit:
  *
  * Dissociate the CharBackend from the Chardev.
-- 
2.7.4




[Qemu-devel] [PATCH v1 10/13] qcow2-cluster: slightly refactor handle_dependencies()

2017-05-19 Thread Anton Nefedov
  - assert the alignment on return if the allocation has to stop
(at the start of a running allocation)
  - make use of const specifiers for local variables

Signed-off-by: Anton Nefedov 
Signed-off-by: Denis V. Lunev 
---
 block/qcow2-cluster.c | 20 +++-
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 4204db9..03d6f7e 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -900,15 +900,15 @@ out:
  * Check if there already is an AIO write request in flight which allocates
  * the same cluster. In this case we need to wait until the previous
  * request has completed and updated the L2 table accordingly.
- *
  * Returns:
  *   0   if there was no dependency. *cur_bytes indicates the number of
  *   bytes from guest_offset that can be read before the next
- *   dependency must be processed (or the request is complete)
+ *   dependency must be processed (or the request is complete).
+ *   *m is not modified
  *
- *   -EAGAIN if we had to wait for another request, previously gathered
- *   information on cluster allocation may be invalid now. The caller
- *   must start over anyway, so consider *cur_bytes undefined.
+ *   -EAGAIN if we had to wait for another request. The caller
+ *   must start over, so consider *cur_bytes undefined.
+ *   *m is not modified
  */
 static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
 uint64_t *cur_bytes, QCowL2Meta **m)
@@ -919,10 +919,10 @@ static int handle_dependencies(BlockDriverState *bs, 
uint64_t guest_offset,
 
 QLIST_FOREACH(old_alloc, &s->cluster_allocs, next_in_flight) {
 
-uint64_t start = guest_offset;
-uint64_t end = start + bytes;
-uint64_t old_start = l2meta_cow_start(old_alloc);
-uint64_t old_end = l2meta_cow_end(old_alloc);
+const uint64_t start = guest_offset;
+const uint64_t end = start + bytes;
+const uint64_t old_start = l2meta_cow_start(old_alloc);
+const uint64_t old_end = l2meta_cow_end(old_alloc);
 
 if (end <= old_start || start >= old_end) {
 /* No intersection */
@@ -939,6 +939,8 @@ static int handle_dependencies(BlockDriverState *bs, 
uint64_t guest_offset,
  * and deal with requests depending on them before starting to
  * gather new ones. Not worth the trouble. */
 if (bytes == 0 && *m) {
+/* start must be cluster aligned at this point */
+assert(start == start_of_cluster(s, start));
 *cur_bytes = 0;
 return 0;
 }
-- 
2.7.4




[Qemu-devel] [PATCH v1 08/13] qcow2: handle_prealloc(): find out if area zeroed by earlier preallocation

2017-05-19 Thread Anton Nefedov
Signed-off-by: Anton Nefedov 
Signed-off-by: Denis V. Lunev 
---
 block/qcow2-cluster.c | 2 ++
 block/qcow2.c | 8 +++-
 block/qcow2.h | 4 
 3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index b2879b9..25210cd 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -1143,6 +1143,7 @@ static int handle_alloc(BlockDriverState *bs, uint64_t 
guest_offset,
 uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
 {
 BDRVQcow2State *s = bs->opaque;
+const uint64_t old_data_end = s->data_end;
 int l2_index;
 uint64_t *l2_table;
 uint64_t entry;
@@ -1264,6 +1265,7 @@ static int handle_alloc(BlockDriverState *bs, uint64_t 
guest_offset,
 .alloc_offset   = alloc_cluster_offset,
 .offset = start_of_cluster(s, guest_offset),
 .nb_clusters= nb_clusters,
+.clusters_are_trailing = alloc_cluster_offset >= old_data_end,
 
 .keep_old_clusters  = keep_old_clusters,
 
diff --git a/block/qcow2.c b/block/qcow2.c
index 503f0dc..97a66a0 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1684,7 +1684,13 @@ restart:
 
 if (end <= bdrv_getlength(file)) {
 /* No need to care, file size will not be changed */
-return false;
+
+/* We're safe to assume that the area is zeroes if the area
+ * was allocated at the end of data (s->data_end).
+ * In this case, the only way for file length to be bigger is that
+ * the area was preallocated by another request.
+ */
+return m->clusters_are_trailing;
 }
 
 meta = g_alloca(sizeof(*meta));
diff --git a/block/qcow2.h b/block/qcow2.h
index e28c54a..2fd8510 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -333,6 +333,10 @@ typedef struct QCowL2Meta
 /** Do not free the old clusters */
 bool keep_old_clusters;
 
+/** True if the area is allocated after the end of data area
+ *  (i.e. >= s->data_end), which means that it is zeroed */
+bool clusters_are_trailing;
+
 /**
  * Requests that overlap with this allocation and wait to be restarted
  * when the allocating request has completed.
-- 
2.7.4




[Qemu-devel] [PATCH v1 06/13] qcow2: truncate preallocated space

2017-05-19 Thread Anton Nefedov
From: "Denis V. Lunev" 

This could be done after calculation of the end of data and metadata in
the qcow2 image.

Signed-off-by: Denis V. Lunev 
Signed-off-by: Anton Nefedov 
---
 block/qcow2-cluster.c  | 9 +
 block/qcow2-refcount.c | 7 +++
 block/qcow2.c  | 8 
 block/qcow2.h  | 3 +++
 4 files changed, 27 insertions(+)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index a4b6d40..b2879b9 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -1957,3 +1957,12 @@ fail:
 g_free(l1_table);
 return ret;
 }
+
+void qcow2_update_data_end(BlockDriverState *bs, uint64_t off)
+{
+BDRVQcow2State *s = bs->opaque;
+
+if (s->data_end < off) {
+s->data_end = off;
+}
+}
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 873a1d2..8156466 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -744,6 +744,9 @@ static int QEMU_WARN_UNUSED_RESULT 
update_refcount(BlockDriverState *bs,
 ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
 if (ret < 0) {
 goto fail;
+} else {
+qcow2_update_data_end(bs, s->refcount_table_offset +
+s->refcount_table_size * sizeof(uint64_t));
 }
 }
 old_table_index = table_index;
@@ -865,6 +868,8 @@ retry:
 s->free_cluster_index - 1 > (INT64_MAX >> s->cluster_bits))
 {
 return -EFBIG;
+} else {
+qcow2_update_data_end(bs, s->free_cluster_index << s->cluster_bits);
 }
 
 #ifdef DEBUG_ALLOC2
@@ -929,6 +934,8 @@ int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, 
uint64_t offset,
 
 if (ret < 0) {
 return ret;
+} else {
+qcow2_update_data_end(bs, offset + (nb_clusters << s->cluster_bits));
 }
 
 return i;
diff --git a/block/qcow2.c b/block/qcow2.c
index 07c1706..7b4359b 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1192,6 +1192,8 @@ static int qcow2_do_open(BlockDriverState *bs, QDict 
*options, int flags,
 }
 }
 
+s->data_end = bdrv_getlength(bs->file->bs);
+
 #ifdef DEBUG_ALLOC
 {
 BdrvCheckResult result = {0};
@@ -1948,12 +1950,18 @@ static int qcow2_inactivate(BlockDriverState *bs)
 static void qcow2_close(BlockDriverState *bs)
 {
 BDRVQcow2State *s = bs->opaque;
+
 qemu_vfree(s->l1_table);
 /* else pre-write overlap checks in cache_destroy may crash */
 s->l1_table = NULL;
 
 if (!(s->flags & BDRV_O_INACTIVE)) {
 qcow2_inactivate(bs);
+
+/* truncate preallocated space */
+if (!bs->read_only && s->data_end < bdrv_getlength(bs->file->bs)) {
+bdrv_truncate(bs->file, s->data_end, NULL);
+}
 }
 
 cache_clean_timer_del(bs);
diff --git a/block/qcow2.h b/block/qcow2.h
index a0d222d..e28c54a 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -297,6 +297,7 @@ typedef struct BDRVQcow2State {
 char *image_backing_format;
 
 uint64_t prealloc_size;
+uint64_t data_end;
 } BDRVQcow2State;
 
 typedef struct Qcow2COWRegion {
@@ -607,4 +608,6 @@ int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache 
*c, uint64_t offset,
 void **table);
 void qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table);
 
+void qcow2_update_data_end(BlockDriverState *bs, uint64_t off);
+
 #endif
-- 
2.7.4




[Qemu-devel] [PATCH v1 12/13] qcow2: allow concurrent unaligned writes to the same clusters

2017-05-19 Thread Anton Nefedov
If COW area of a write request to unallocated cluster is empty,
concurrent write requests can be allowed with a little bit of
extra synchronization; so they don't have to wait until L2 is filled.

Let qcow2_cluster.c::handle_dependencies() do the most of the job:
  if there is an in-flight request to the same cluster,
  and the current request wants to write in its COW area,
  and its COW area is marked empty,
  - steal the allocated offset and write concurrently. Let the original
request update L2 later when it likes.

This gives an improvement for parallel misaligned writes to
unallocated clusters with no backing data:

HDD fio over xfs iodepth=4:
  seqwrite 4k:   18400 -> 22800 IOPS ( x1.24 )
  seqwrite 68k:   1600 ->  2300 IOPS ( x1.44 )

Signed-off-by: Anton Nefedov 
Signed-off-by: Denis V. Lunev 
---
 block/qcow2-cluster.c | 169 +++---
 block/qcow2.c |  28 -
 block/qcow2.h |  12 +++-
 3 files changed, 181 insertions(+), 28 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index c0974e8..7cffdd4 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -898,20 +898,32 @@ out:
 
 /*
  * Check if there already is an AIO write request in flight which allocates
- * the same cluster. In this case we need to wait until the previous
- * request has completed and updated the L2 table accordingly.
+ * the same cluster.
+ * In this case, check if that request has explicitly allowed to write
+ * in its COW area(s).
+ *   If yes - fill the meta to point to the same cluster.
+ *   If no  - we need to wait until the previous request has completed and
+ *updated the L2 table accordingly or
+ *has allowed writing in its COW area(s).
  * Returns:
  *   0   if there was no dependency. *cur_bytes indicates the number of
  *   bytes from guest_offset that can be read before the next
  *   dependency must be processed (or the request is complete).
- *   *m is not modified
+ *   *m, *host_offset are not modified
+ *
+ *   1   if there is a dependency but it is possible to write concurrently
+ *   *m is filled accordingly,
+ *   *cur_bytes may have decreased and describes
+ * the length of the area that can be written to,
+ *   *host_offset contains the starting host image offset to write to
  *
  *   -EAGAIN if we had to wait for another request. The caller
- *   must start over, so consider *cur_bytes undefined.
+ *   must start over, so consider *cur_bytes and *host_offset 
undefined.
  *   *m is not modified
  */
 static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
-uint64_t *cur_bytes, QCowL2Meta **m)
+   uint64_t *host_offset, uint64_t *cur_bytes,
+   QCowL2Meta **m)
 {
 BDRVQcow2State *s = bs->opaque;
 QCowL2Meta *old_alloc;
@@ -924,7 +936,7 @@ static int handle_dependencies(BlockDriverState *bs, 
uint64_t guest_offset,
 const uint64_t old_start = l2meta_cow_start(old_alloc);
 const uint64_t old_end = l2meta_cow_end(old_alloc);
 
-if (end <= old_start || start >= old_end) {
+if (end <= old_start || start >= old_end || old_alloc->piggybacked) {
 /* No intersection */
 continue;
 }
@@ -936,21 +948,95 @@ static int handle_dependencies(BlockDriverState *bs, 
uint64_t guest_offset,
 continue;
 }
 
-/* Stop if already an l2meta exists. After yielding, it wouldn't
- * be valid any more, so we'd have to clean up the old L2Metas
- * and deal with requests depending on them before starting to
- * gather new ones. Not worth the trouble. */
-if (*m) {
+/* offsets of the cluster we're intersecting in */
+const uint64_t cluster_start = start_of_cluster(s, start);
+const uint64_t cluster_end = cluster_start + s->cluster_size;
+
+const uint64_t old_data_start = old_start
++ old_alloc->cow_start.nb_bytes;
+const uint64_t old_data_end = old_alloc->offset
++ old_alloc->cow_end.offset;
+
+const bool conflict_in_data_area =
+end > old_data_start && start < old_data_end;
+const bool conflict_in_old_cow_start =
+/* 1). new write request area is before the old */
+start < old_data_start
+&& /* 2). old request did not allow writing in its cow area */
+!old_alloc->cow_start.reduced;
+const bool conflict_in_old_cow_end =
+/* 1). new write request area is after the old */
+start > old_data_start
+&& /* 2). old request did not allow writing in its cow area */
+!old_alloc->cow_end.reduced;
+
+if (conflict_in_data_a

[Qemu-devel] [PATCH v1 13/13] iotest 046: test simultaneous cluster write error case

2017-05-19 Thread Anton Nefedov
Signed-off-by: Anton Nefedov 
---
 tests/qemu-iotests/046 | 38 +-
 tests/qemu-iotests/046.out | 23 +++
 2 files changed, 60 insertions(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/046 b/tests/qemu-iotests/046
index f2ebecf..c210b55 100755
--- a/tests/qemu-iotests/046
+++ b/tests/qemu-iotests/046
@@ -29,7 +29,8 @@ status=1  # failure is the default!
 
 _cleanup()
 {
-   _cleanup_test_img
+_cleanup_test_img
+rm "$TEST_DIR/blkdebug.conf"
 }
 trap "_cleanup; exit \$status" 0 1 2 3 15
 
@@ -188,6 +189,37 @@ overlay_io | $QEMU_IO blkdebug::"$TEST_IMG" | 
_filter_qemu_io |\
sed -e 's/bytes at offset [0-9]*/bytes at offset XXX/g'
 
 echo
+echo "== Concurrency error case =="
+
+# 1. 1st request allocated the cluster, stop before it writes and updates L2
+# 2. 2nd request starts at the same cluster must complete write and start
+#waiting for the 1st to update L2
+# 3. Resume the 1st request to make it fail (injected error)
+# 4. 2nd request must wake and fail as well
+#1 cluster will end up leaked
+cat > "$TEST_DIR/blkdebug.conf" <

[Qemu-devel] [PATCH v1 02/13] qcow2: is_zero_sectors(): return true if area is outside of backing file

2017-05-19 Thread Anton Nefedov
in such case, bdrv_get_block_status() shall return 0, *nr == 0

iotest 154 updated accordingly: write-zeroes tail alignment can be detected
as zeroes now, so pwrite_zeroes succeeds

Signed-off-by: Anton Nefedov 
---
 block/qcow2.c  | 6 --
 tests/qemu-iotests/154.out | 4 ++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index 2e6a0ec..b885dfc 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -2482,7 +2482,7 @@ static bool is_zero_sectors(BlockDriverState *bs, int64_t 
start,
 int64_t res;
 
 if (start + count > bs->total_sectors) {
-count = bs->total_sectors - start;
+count = start < bs->total_sectors ? bs->total_sectors - start : 0;
 }
 
 if (!count) {
@@ -2490,7 +2490,9 @@ static bool is_zero_sectors(BlockDriverState *bs, int64_t 
start,
 }
 res = bdrv_get_block_status_above(bs, NULL, start, count,
   &nr, &file);
-return res >= 0 && (res & BDRV_BLOCK_ZERO) && nr == count;
+return res >= 0
+&& (((res & BDRV_BLOCK_ZERO) && nr == count)
+|| nr == 0);
 }
 
 static coroutine_fn int qcow2_co_pwrite_zeroes(BlockDriverState *bs,
diff --git a/tests/qemu-iotests/154.out b/tests/qemu-iotests/154.out
index d8485ee..259340e 100644
--- a/tests/qemu-iotests/154.out
+++ b/tests/qemu-iotests/154.out
@@ -322,7 +322,7 @@ wrote 1024/1024 bytes at offset 134218240
 1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 2048/2048 bytes allocated at offset 128 MiB
 [{ "start": 0, "length": 134217728, "depth": 1, "zero": true, "data": false},
-{ "start": 134217728, "length": 2048, "depth": 0, "zero": false, "data": true, 
"offset": OFFSET}]
+{ "start": 134217728, "length": 2048, "depth": 0, "zero": true, "data": false}]
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134219776 
backing_file=TEST_DIR/t.IMGFMT.base
 wrote 2048/2048 bytes at offset 134217728
 2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
@@ -348,7 +348,7 @@ wrote 1024/1024 bytes at offset 134218240
 1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 2048/2048 bytes allocated at offset 128 MiB
 [{ "start": 0, "length": 134217728, "depth": 1, "zero": true, "data": false},
-{ "start": 134217728, "length": 2048, "depth": 0, "zero": false, "data": true, 
"offset": OFFSET}]
+{ "start": 134217728, "length": 2048, "depth": 0, "zero": true, "data": false}]
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134219776 
backing_file=TEST_DIR/t.IMGFMT.base
 wrote 2048/2048 bytes at offset 134217728
 2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-- 
2.7.4




[Qemu-devel] [PATCH v1 11/13] qcow2-cluster: make handle_dependencies() logic easier to follow

2017-05-19 Thread Anton Nefedov
Avoid complicated nested conditions; return or continue asap instead.
The logic is not changed.

Signed-off-by: Anton Nefedov 
Signed-off-by: Denis V. Lunev 
---
 block/qcow2-cluster.c | 45 ++---
 1 file changed, 22 insertions(+), 23 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 03d6f7e..c0974e8 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -926,32 +926,31 @@ static int handle_dependencies(BlockDriverState *bs, 
uint64_t guest_offset,
 
 if (end <= old_start || start >= old_end) {
 /* No intersection */
-} else {
-if (start < old_start) {
-/* Stop at the start of a running allocation */
-bytes = old_start - start;
-} else {
-bytes = 0;
-}
+continue;
+}
 
-/* Stop if already an l2meta exists. After yielding, it wouldn't
- * be valid any more, so we'd have to clean up the old L2Metas
- * and deal with requests depending on them before starting to
- * gather new ones. Not worth the trouble. */
-if (bytes == 0 && *m) {
-/* start must be cluster aligned at this point */
-assert(start == start_of_cluster(s, start));
-*cur_bytes = 0;
-return 0;
-}
+if (start < old_start) {
+/* Stop at the start of a running allocation */
+bytes = old_start - start;
+/* ..if there is no other conflict, keep checking */
+continue;
+}
 
-if (bytes == 0) {
-/* Wait for the dependency to complete. We need to recheck
- * the free/allocated clusters when we continue. */
-qemu_co_queue_wait(&old_alloc->dependent_requests, &s->lock);
-return -EAGAIN;
-}
+/* Stop if already an l2meta exists. After yielding, it wouldn't
+ * be valid any more, so we'd have to clean up the old L2Metas
+ * and deal with requests depending on them before starting to
+ * gather new ones. Not worth the trouble. */
+if (*m) {
+/* start must be cluster aligned at this point */
+assert(start == start_of_cluster(s, start));
+*cur_bytes = 0;
+return 0;
 }
+
+/* Wait for the dependency to complete. We need to recheck
+ * the free/allocated clusters when we continue. */
+qemu_co_queue_wait(&old_alloc->dependent_requests, &s->lock);
+return -EAGAIN;
 }
 
 /* Make sure that existing clusters and new allocations are only used up to
-- 
2.7.4




[Qemu-devel] [PATCH v1 05/13] qcow2: set inactive flag

2017-05-19 Thread Anton Nefedov
Qcow2State and BlockDriverState flags have to be in sync

Signed-off-by: Anton Nefedov 
---
 block/qcow2.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/block/qcow2.c b/block/qcow2.c
index 6e7ce96..07c1706 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1939,6 +1939,7 @@ static int qcow2_inactivate(BlockDriverState *bs)
 
 if (result == 0) {
 qcow2_mark_clean(bs);
+s->flags |= BDRV_O_INACTIVE;
 }
 
 return result;
-- 
2.7.4




[Qemu-devel] [PATCH v1 07/13] qcow2: check space leak at the end of the image

2017-05-19 Thread Anton Nefedov
From: Pavel Butsykin 

Preallocating memory in the image may remain unused after the fall, for
the qcow2_check adds the ability to identify and fix it, so as not to
store extra memory on the host.

Signed-off-by: Pavel Butsykin 
Signed-off-by: Denis V. Lunev 
Signed-off-by: Anton Nefedov 
---
 block/qcow2.c  |  31 +++
 tests/qemu-iotests/026.out | 104 -
 tests/qemu-iotests/026.out.nocache | 104 -
 tests/qemu-iotests/029.out |   5 +-
 tests/qemu-iotests/060.out |  10 +++-
 tests/qemu-iotests/061.out |   5 +-
 tests/qemu-iotests/066.out |   5 +-
 tests/qemu-iotests/098.out |   7 ++-
 tests/qemu-iotests/108.out |   5 +-
 tests/qemu-iotests/112.out |   5 +-
 10 files changed, 225 insertions(+), 56 deletions(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index 7b4359b..503f0dc 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -322,6 +322,32 @@ int qcow2_mark_consistent(BlockDriverState *bs)
 return 0;
 }
 
+static int qcow2_check_extra_preallocation(BlockDriverState *bs,
+BdrvCheckResult *res, BdrvCheckMode fix)
+{
+BDRVQcow2State *s = bs->opaque;
+uint64_t img_size = bdrv_getlength(bs->file->bs);
+
+if (res->image_end_offset < img_size) {
+uint64_t count =
+DIV_ROUND_UP(img_size - res->image_end_offset, s->cluster_size);
+fprintf(stderr, "%s space leaked at the end of the image %jd\n",
+fix & BDRV_FIX_LEAKS ? "Repairing" : "ERROR",
+img_size - res->image_end_offset);
+res->leaks += count;
+if (fix & BDRV_FIX_LEAKS) {
+int ret = bdrv_truncate(bs->file, res->image_end_offset, NULL);
+if (ret < 0) {
+res->check_errors++;
+return ret;
+}
+res->leaks_fixed += count;
+}
+}
+
+return 0;
+}
+
 static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result,
BdrvCheckMode fix)
 {
@@ -330,6 +356,11 @@ static int qcow2_check(BlockDriverState *bs, 
BdrvCheckResult *result,
 return ret;
 }
 
+ret = qcow2_check_extra_preallocation(bs, result, fix);
+if (ret < 0) {
+return ret;
+}
+
 if (fix && result->check_errors == 0 && result->corruptions == 0) {
 ret = qcow2_mark_clean(bs);
 if (ret < 0) {
diff --git a/tests/qemu-iotests/026.out b/tests/qemu-iotests/026.out
index 86a50a2..e8cf348 100644
--- a/tests/qemu-iotests/026.out
+++ b/tests/qemu-iotests/026.out
@@ -5,7 +5,10 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
 
 Event: l1_update; errno: 5; imm: off; once: on; write
 write failed: Input/output error
-No errors were found on the image.
+ERROR space leaked at the end of the image 1024
+
+1 leaked clusters were found on the image.
+This means waste of disk space, but no harm to data.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
 
 Event: l1_update; errno: 5; imm: off; once: on; write -b
@@ -33,7 +36,10 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
 
 Event: l1_update; errno: 28; imm: off; once: on; write
 write failed: No space left on device
-No errors were found on the image.
+ERROR space leaked at the end of the image 1024
+
+1 leaked clusters were found on the image.
+This means waste of disk space, but no harm to data.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
 
 Event: l1_update; errno: 28; imm: off; once: on; write -b
@@ -181,7 +187,10 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
 
 Event: l2_alloc_write; errno: 5; imm: off; once: on; write
 write failed: Input/output error
-No errors were found on the image.
+ERROR space leaked at the end of the image 1024
+
+1 leaked clusters were found on the image.
+This means waste of disk space, but no harm to data.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
 
 Event: l2_alloc_write; errno: 5; imm: off; once: on; write -b
@@ -207,7 +216,10 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
 
 Event: l2_alloc_write; errno: 28; imm: off; once: on; write
 write failed: No space left on device
-No errors were found on the image.
+ERROR space leaked at the end of the image 1024
+
+1 leaked clusters were found on the image.
+This means waste of disk space, but no harm to data.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
 
 Event: l2_alloc_write; errno: 28; imm: off; once: on; write -b
@@ -468,20 +480,27 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
 
 Event: refblock_alloc_hookup; errno: 28; imm: off; once: on; write
 write failed: No space left on device
-No errors were found on the image.
+ERROR space leaked a

[Qemu-devel] [PATCH v1 01/13] qcow2: alloc space for COW in one chunk

2017-05-19 Thread Anton Nefedov
From: "Denis V. Lunev" 

Currently each single write operation can result in 3 write operations
if guest offsets are not cluster aligned. One write is performed for the
real payload and two for COW-ed areas. Thus the data possibly lays
non-contiguously on the host filesystem. This will reduce further
sequential read performance significantly.

The patch allocates the space in the file with cluster granularity,
ensuring
  1. better host offset locality
  2. less space allocation operations
 (which can be expensive on distributed storages)

Signed-off-by: Denis V. Lunev 
Signed-off-by: Anton Nefedov 
---
 block/qcow2.c | 32 +++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index a8d61f0..2e6a0ec 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1575,6 +1575,32 @@ fail:
 return ret;
 }
 
+static void handle_alloc_space(BlockDriverState *bs, QCowL2Meta *l2meta)
+{
+BDRVQcow2State *s = bs->opaque;
+BlockDriverState *file = bs->file->bs;
+QCowL2Meta *m;
+int ret;
+
+for (m = l2meta; m != NULL; m = m->next) {
+uint64_t bytes = m->nb_clusters << s->cluster_bits;
+
+if (m->cow_start.nb_bytes == 0 && m->cow_end.nb_bytes == 0) {
+continue;
+}
+
+/* try to alloc host space in one chunk for better locality */
+ret = file->drv->bdrv_co_pwrite_zeroes(file, m->alloc_offset, bytes, 
0);
+
+if (ret != 0) {
+continue;
+}
+
+file->total_sectors = MAX(file->total_sectors,
+  (m->alloc_offset + bytes) / 
BDRV_SECTOR_SIZE);
+}
+}
+
 static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset,
  uint64_t bytes, QEMUIOVector *qiov,
  int flags)
@@ -1656,8 +1682,12 @@ static coroutine_fn int 
qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset,
 if (ret < 0) {
 goto fail;
 }
-
 qemu_co_mutex_unlock(&s->lock);
+
+if (bs->file->bs->drv->bdrv_co_pwrite_zeroes != NULL) {
+handle_alloc_space(bs, l2meta);
+}
+
 BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
 trace_qcow2_writev_data(qemu_coroutine_self(),
 cluster_offset + offset_in_cluster);
-- 
2.7.4




[Qemu-devel] [PATCH v1 09/13] qcow2: fix misleading comment about L2 linking

2017-05-19 Thread Anton Nefedov
Signed-off-by: Anton Nefedov 
Signed-off-by: Denis V. Lunev 
---
 block/qcow2-cluster.c | 10 --
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 25210cd..4204db9 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -827,12 +827,10 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, 
QCowL2Meta *m)
 
 assert(l2_index + m->nb_clusters <= s->l2_size);
 for (i = 0; i < m->nb_clusters; i++) {
-/* if two concurrent writes happen to the same unallocated cluster
- * each write allocates separate cluster and writes data concurrently.
- * The first one to complete updates l2 table with pointer to its
- * cluster the second one has to do RMW (which is done above by
- * perform_cow()), update l2 table with its cluster pointer and free
- * old cluster. This is what this loop does */
+/* handle_dependencies() protects from normal cluster allocation
+ * collision; still L2 entry might be !0 in case of zero or compressed
+ * cluster reusage or writing over the snapshot
+ */
 if (l2_table[l2_index + i] != 0) {
 old_cluster[j++] = l2_table[l2_index + i];
 }
-- 
2.7.4




[Qemu-devel] [PATCH v1 04/13] qcow2: preallocation at image expand

2017-05-19 Thread Anton Nefedov
From: "Denis V. Lunev" 

This patch adds image preallocation at expand to provide better locality
of QCOW2 image file and optimize this procedure for some distributed
storages where this procedure is slow.

Image expand requests have to be suspended until the allocation is
performed which is done via special QCowL2Meta.
This meta is invisible to handle_dependencies() code.
This is the main reason for also calling preallocation before metadata
write: it might intersect with preallocation triggered by another IO,
and has to yield

Signed-off-by: Denis V. Lunev 
Signed-off-by: Anton Nefedov 
---
 block/qcow2-cache.c|   3 +
 block/qcow2-cluster.c  |   5 ++
 block/qcow2-refcount.c |  14 +
 block/qcow2.c  | 151 +
 block/qcow2.h  |   5 ++
 5 files changed, 178 insertions(+)

diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c
index 1d25147..aa9da5f 100644
--- a/block/qcow2-cache.c
+++ b/block/qcow2-cache.c
@@ -204,6 +204,9 @@ static int qcow2_cache_entry_flush(BlockDriverState *bs, 
Qcow2Cache *c, int i)
 return ret;
 }
 
+/* check and preallocate extra space if touching a fresh metadata cluster 
*/
+qcow2_handle_prealloc(bs, c->entries[i].offset, s->cluster_size);
+
 if (c == s->refcount_block_cache) {
 BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_UPDATE_PART);
 } else if (c == s->l2_table_cache) {
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index cf18dee..a4b6d40 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -108,6 +108,9 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t 
min_size,
 goto fail;
 }
 
+qcow2_handle_prealloc(bs, new_l1_table_offset,
+  QEMU_ALIGN_UP(new_l1_size2, s->cluster_size));
+
 BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE);
 for(i = 0; i < s->l1_size; i++)
 new_l1_table[i] = cpu_to_be64(new_l1_table[i]);
@@ -1820,6 +1823,8 @@ static int expand_zero_clusters_in_l1(BlockDriverState 
*bs, uint64_t *l1_table,
 goto fail;
 }
 
+qcow2_handle_prealloc(bs, offset, s->cluster_size);
+
 ret = bdrv_pwrite_zeroes(bs->file, offset, s->cluster_size, 0);
 if (ret < 0) {
 if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 7c06061..873a1d2 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -547,6 +547,8 @@ static int alloc_refcount_block(BlockDriverState *bs,
 }
 
 /* Write refcount blocks to disk */
+qcow2_handle_prealloc(bs, meta_offset, blocks_clusters * s->cluster_size);
+
 BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
 ret = bdrv_pwrite_sync(bs->file, meta_offset, new_blocks,
 blocks_clusters * s->cluster_size);
@@ -561,6 +563,10 @@ static int alloc_refcount_block(BlockDriverState *bs,
 cpu_to_be64s(&new_table[i]);
 }
 
+qcow2_handle_prealloc(bs, table_offset,
+  QEMU_ALIGN_UP(table_size * sizeof(uint64_t),
+s->cluster_size));
+
 BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
 ret = bdrv_pwrite_sync(bs->file, table_offset, new_table,
 table_size * sizeof(uint64_t));
@@ -2104,6 +2110,8 @@ write_refblocks:
 goto fail;
 }
 
+qcow2_handle_prealloc(bs, refblock_offset, s->cluster_size);
+
 /* The size of *refcount_table is always cluster-aligned, therefore the
  * write operation will not overflow */
 on_disk_refblock = (void *)((char *) *refcount_table +
@@ -2158,6 +2166,8 @@ write_refblocks:
 }
 
 assert(reftable_size < INT_MAX / sizeof(uint64_t));
+qcow2_handle_prealloc(bs, reftable_offset,
+  reftable_size * sizeof(uint64_t));
 ret = bdrv_pwrite(bs->file, reftable_offset, on_disk_reftable,
   reftable_size * sizeof(uint64_t));
 if (ret < 0) {
@@ -2845,6 +2855,10 @@ int qcow2_change_refcount_order(BlockDriverState *bs, 
int refcount_order,
 cpu_to_be64s(&new_reftable[i]);
 }
 
+qcow2_handle_prealloc(bs, new_reftable_offset,
+  QEMU_ALIGN_UP(new_reftable_size * sizeof(uint64_t),
+s->cluster_size));
+
 ret = bdrv_pwrite(bs->file, new_reftable_offset, new_reftable,
   new_reftable_size * sizeof(uint64_t));
 
diff --git a/block/qcow2.c b/block/qcow2.c
index b438f22..6e7ce96 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -464,6 +464,11 @@ static QemuOptsList qcow2_runtime_opts = {
 .type = QEMU_OPT_NUMBER,
 .help = "Clean unused cache entries after this time (in seconds)",
 },
+{
+.name = QCOW2_OPT_PRE

[Qemu-devel] [PATCH v1 03/13] qcow2: do not COW the empty areas

2017-05-19 Thread Anton Nefedov
If COW area of the newly allocated cluster is zeroes, there is no reason
to write zero sectors in perform_cow() again now as whole clusters are
zeroed out in single chunks by handle_alloc_space().

Introduce QCowL2Meta field "reduced", since the existing fields
(offset and nb_bytes) still has to keep other write requests from
simultaneous writing in the area

iotest 060:
write to the discarded cluster does not trigger COW anymore.
so, break on write_aio event instead, will work for the test
(but write won't fail anymore, so update reference output)

iotest 066:
cluster-alignment areas that were not really COWed are now detected
as zeroes, hence the initial write has to be exactly the same size for
the maps to match

performance tests: ===

qemu-io,
  results in seconds to complete (less is better)
  random write 4k to empty image, no backing
HDD
  64k cluster
128M over 128M image:   160 -> 160 ( x1  )
128M over   2G image:86 ->  84 ( x1  )
128M over   8G image:40 ->  29 ( x1.4 )
  1M cluster
 32M over   8G image:58 ->  23 ( x2.5 )

SSD
  64k cluster
  2G over   2G image:71 ->  38 (  x1.9 )
512M over   8G image:85 ->   8 ( x10.6 )
  1M cluster
128M over  32G image:   314 ->   2 ( x157  )

  - improvement grows bigger the bigger the cluster size,
  - first data portions to the fresh image benefit the most
  (more chance to hit an unallocated cluster)
  - SSD improvement is close to the IO length reduction rate
  (e.g. writing only 4k instead of 64k) gives theoretical x16
  and practical x10 improvement)

fio tests over xfs, empty image (cluster 64k), no backing,

  first megabytes of random writes:
randwrite 4k, size=8g:

  HDD (io_size=128m) :  730 ->  1050 IOPS ( x1.45)
  SSD (io_size=512m) : 1500 ->  7000 IOPS ( x4.7 )

  random writes io_size==image_size:
randwrite 4k, size=2g io_size=2g:
   HDD   : 200 IOPS (no difference)
   SSD   : 7500 ->  9500 IOPS ( x1.3 )

  sequential write:
seqwrite 4k, size=4g, iodepth=4
   SSD   : 7000 -> 18000 IOPS ( x2.6 )

  - numbers are similar to qemu-io tests, slightly less improvement
  (damped by fs?)

Signed-off-by: Anton Nefedov 
Signed-off-by: Denis V. Lunev 
---
 block/qcow2-cluster.c  |  4 +++-
 block/qcow2.c  | 23 +++
 block/qcow2.h  |  4 
 tests/qemu-iotests/060 |  2 +-
 tests/qemu-iotests/060.out |  3 ++-
 tests/qemu-iotests/066 |  2 +-
 tests/qemu-iotests/066.out |  4 ++--
 7 files changed, 36 insertions(+), 6 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 347d94b..cf18dee 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -758,7 +758,7 @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m, 
Qcow2COWRegion *r)
 BDRVQcow2State *s = bs->opaque;
 int ret;
 
-if (r->nb_bytes == 0) {
+if (r->nb_bytes == 0 || r->reduced) {
 return 0;
 }
 
@@ -1267,10 +1267,12 @@ static int handle_alloc(BlockDriverState *bs, uint64_t 
guest_offset,
 .cow_start = {
 .offset = 0,
 .nb_bytes   = offset_into_cluster(s, guest_offset),
+.reduced= false,
 },
 .cow_end = {
 .offset = nb_bytes,
 .nb_bytes   = avail_bytes - nb_bytes,
+.reduced= false,
 },
 };
 qemu_co_queue_init(&(*m)->dependent_requests);
diff --git a/block/qcow2.c b/block/qcow2.c
index b885dfc..b438f22 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -64,6 +64,9 @@ typedef struct {
 #define  QCOW2_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA
 #define  QCOW2_EXT_MAGIC_FEATURE_TABLE 0x6803f857
 
+static bool is_zero_sectors(BlockDriverState *bs, int64_t start,
+uint32_t count);
+
 static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename)
 {
 const QCowHeader *cow_header = (const void *)buf;
@@ -1575,6 +1578,25 @@ fail:
 return ret;
 }
 
+static void handle_cow_reduce(BlockDriverState *bs, QCowL2Meta *m)
+{
+if (bs->encrypted) {
+return;
+}
+if (!m->cow_start.reduced && m->cow_start.nb_bytes != 0 &&
+is_zero_sectors(bs,
+(m->offset + m->cow_start.offset) >> BDRV_SECTOR_BITS,
+m->cow_start.nb_bytes >> BDRV_SECTOR_BITS)) {
+m->cow_start.reduced = true;
+}
+if (!m->cow_end.reduced && m->cow_end.nb_bytes != 0 &&
+is_zero_sectors(bs,
+(m->offset + m->cow_end.offset) >> BDRV_SECTOR_BITS,
+m->cow_end.nb_bytes >> BDRV_SECTOR_BITS)) {
+m->cow_end.reduced = true;
+}
+}
+
 static void handle_alloc_space(BlockDriverState *bs, QCowL2Met

[Qemu-devel] [PATCH v1 00/13] qcow2: space preallocation and COW improvements

2017-05-19 Thread Anton Nefedov
This pull request is to address a few performance problems of qcow2 format:

  1. non cluster-aligned write requests (to unallocated clusters) explicitly
pad data with zeroes if there is no backing data. This can be avoided
and the whole clusters are preallocated and zeroed in a single
efficient write_zeroes() operation, also providing better host file
continuity

  2. moreover, efficient write_zeroes() operation can be used to preallocate
space megabytes ahead which gives noticeable improvement on some storage
types (e.g. distributed storages where space allocation operation is
expensive)

  3. preallocating/zeroing the clusters in advance makes possible to enable
simultaneous writes to the same unallocated cluster, which is beneficial
for parallel sequential write operations which are not cluster-aligned

Performance test results are added to commit messages (see patch 3, 12)

Anton Nefedov (9):
  qcow2: is_zero_sectors(): return true if area is outside of backing
file
  qcow2: do not COW the empty areas
  qcow2: set inactive flag
  qcow2: handle_prealloc(): find out if area zeroed by earlier
preallocation
  qcow2: fix misleading comment about L2 linking
  qcow2-cluster: slightly refactor handle_dependencies()
  qcow2-cluster: make handle_dependencies() logic easier to follow
  qcow2: allow concurrent unaligned writes to the same clusters
  iotest 046: test simultaneous cluster write error case

Denis V. Lunev (3):
  qcow2: alloc space for COW in one chunk
  qcow2: preallocation at image expand
  qcow2: truncate preallocated space

Pavel Butsykin (1):
  qcow2: check space leak at the end of the image

 block/qcow2-cache.c|   3 +
 block/qcow2-cluster.c  | 216 +++-
 block/qcow2-refcount.c |  21 +++
 block/qcow2.c  | 286 -
 block/qcow2.h  |  26 
 tests/qemu-iotests/026.out | 104 ++
 tests/qemu-iotests/026.out.nocache | 104 ++
 tests/qemu-iotests/029.out |   5 +-
 tests/qemu-iotests/046 |  38 -
 tests/qemu-iotests/046.out |  23 +++
 tests/qemu-iotests/060 |   2 +-
 tests/qemu-iotests/060.out |  13 +-
 tests/qemu-iotests/061.out |   5 +-
 tests/qemu-iotests/066 |   2 +-
 tests/qemu-iotests/066.out |   9 +-
 tests/qemu-iotests/098.out |   7 +-
 tests/qemu-iotests/108.out |   5 +-
 tests/qemu-iotests/112.out |   5 +-
 tests/qemu-iotests/154.out |   4 +-
 19 files changed, 769 insertions(+), 109 deletions(-)

-- 
2.7.4




[Qemu-devel] [PATCH v3 1/1] qemu-img: wait for convert coroutines to complete

2017-04-26 Thread Anton Nefedov
On error path (like i/o error in one of the coroutines), it's required to
  - wait for coroutines completion before cleaning the common structures
  - reenter dependent coroutines so they ever finish

Introduced in 2d9187bc65.

Signed-off-by: Anton Nefedov 
---
 qemu-img.c | 26 +++---
 1 file changed, 11 insertions(+), 15 deletions(-)

diff --git a/qemu-img.c b/qemu-img.c
index bbe1574..8c50379 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -1761,13 +1761,13 @@ static void coroutine_fn convert_co_do_copy(void 
*opaque)
 qemu_co_mutex_lock(&s->lock);
 if (s->ret != -EINPROGRESS || s->sector_num >= s->total_sectors) {
 qemu_co_mutex_unlock(&s->lock);
-goto out;
+break;
 }
 n = convert_iteration_sectors(s, s->sector_num);
 if (n < 0) {
 qemu_co_mutex_unlock(&s->lock);
 s->ret = n;
-goto out;
+break;
 }
 /* save current sector and allocation status to local variables */
 sector_num = s->sector_num;
@@ -1792,7 +1792,6 @@ static void coroutine_fn convert_co_do_copy(void *opaque)
 error_report("error while reading sector %" PRId64
  ": %s", sector_num, strerror(-ret));
 s->ret = ret;
-goto out;
 }
 } else if (!s->min_sparse && status == BLK_ZERO) {
 status = BLK_DATA;
@@ -1801,22 +1800,20 @@ static void coroutine_fn convert_co_do_copy(void 
*opaque)
 
 if (s->wr_in_order) {
 /* keep writes in order */
-while (s->wr_offs != sector_num) {
-if (s->ret != -EINPROGRESS) {
-goto out;
-}
+while (s->wr_offs != sector_num && s->ret == -EINPROGRESS) {
 s->wait_sector_num[index] = sector_num;
 qemu_coroutine_yield();
 }
 s->wait_sector_num[index] = -1;
 }
 
-ret = convert_co_write(s, sector_num, n, buf, status);
-if (ret < 0) {
-error_report("error while writing sector %" PRId64
- ": %s", sector_num, strerror(-ret));
-s->ret = ret;
-goto out;
+if (s->ret == -EINPROGRESS) {
+ret = convert_co_write(s, sector_num, n, buf, status);
+if (ret < 0) {
+error_report("error while writing sector %" PRId64
+ ": %s", sector_num, strerror(-ret));
+s->ret = ret;
+}
 }
 
 if (s->wr_in_order) {
@@ -1837,7 +1834,6 @@ static void coroutine_fn convert_co_do_copy(void *opaque)
 }
 }
 
-out:
 qemu_vfree(buf);
 s->co[index] = NULL;
 s->running_coroutines--;
@@ -1899,7 +1895,7 @@ static int convert_do_copy(ImgConvertState *s)
 qemu_coroutine_enter(s->co[i]);
 }
 
-while (s->ret == -EINPROGRESS) {
+while (s->running_coroutines) {
 main_loop_wait(false);
 }
 
-- 
2.7.4




Re: [Qemu-devel] [Qemu-stable] [PATCH v2 1/1] qemu-img: wait for convert coroutines to complete

2017-04-25 Thread Anton Nefedov



On 04/25/2017 12:55 PM, Peter Lieven wrote:

Am 24.04.2017 um 22:13 schrieb Anton Nefedov:


On 24/04/2017 21:16, Peter Lieven wrote:




Am 24.04.2017 um 18:27 schrieb Anton Nefedov
:


On 04/21/2017 03:37 PM, Peter Lieven wrote:

Am 21.04.2017 um 14:19 schrieb Anton Nefedov:

On 04/21/2017 01:44 PM, Peter Lieven wrote:

Am 21.04.2017 um 12:04 schrieb Anton Nefedov:
On error path (like i/o error in one of the coroutines), it's
required to
 - wait for coroutines completion before cleaning the common
structures
 - reenter dependent coroutines so they ever finish

Introduced in 2d9187bc65.

Signed-off-by: Anton Nefedov 
---
[..]




And what if we error out in the read path? Wouldn't be something
like this easier?


diff --git a/qemu-img.c b/qemu-img.c
index 22f559a..4ff1085 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -1903,6 +1903,16 @@ static int convert_do_copy(ImgConvertState
*s)
main_loop_wait(false);
}

+/* on error path we need to enter all coroutines that are still
+ * running before cleaning up common structures */
+if (s->ret) {
+for (i = 0; i < s->num_coroutines; i++) {
+ if (s->co[i]) {
+ qemu_coroutine_enter(s->co[i]);
+ }
+}
+}
+
if (s->compressed && !s->ret) {
/* signal EOF to align */
ret = blk_pwrite_compressed(s->target, 0, NULL, 0);


Peter



seemed a bit too daring to me to re-enter every coroutine
potentially including the ones that yielded waiting for I/O
completion.
If that's ok - that is for sure easier :)


I think we should enter every coroutine that is still running and
have it terminate correctly. It was my mistake that I have not
done this in the original patch. Can you check if the above fixes
your test cases that triggered the bug?



hi, sorry I'm late with the answer

this segfaults in bdrv_close(). Looks like it tries to finish some
i/o which coroutine we have already entered and terminated?

(gdb) run
Starting program: /vz/anefedov/qemu-build/us/./qemu-img convert -O
[..]



Peter



/Anton



it seems that this is a bit tricky, can you share how your test case
works?

thanks,
peter



how I tested today was basically

diff --git a/qemu-img.c b/qemu-img.c
index 4425aaa..3d2d506 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -1788,6 +1788,10 @@ static void coroutine_fn
convert_co_do_copy(void *opaque)

 if (status == BLK_DATA) {
 ret = convert_co_read(s, sector_num, n, buf);
+const uint64_t fsector = 10*1024*1024/512;
+if (sector_num <= fsector && fsector < sector_num+n) {
+ret = -EIO;
+}
 if (ret < 0) {
 error_report("error while reading sector %" PRId64
  ": %s", sector_num, strerror(-ret));



I ended up with sth similar to your approach. Can you check this?


Thanks,

Peter


diff --git a/qemu-img.c b/qemu-img.c
index b94fc11..ed9ce9e 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -1761,13 +1761,13 @@ static void coroutine_fn convert_co_do_copy(void
*opaque)
 qemu_co_mutex_lock(&s->lock);
 if (s->ret != -EINPROGRESS || s->sector_num >= s->total_sectors) {
 qemu_co_mutex_unlock(&s->lock);
-goto out;
+break;
 }
 n = convert_iteration_sectors(s, s->sector_num);
 if (n < 0) {
 qemu_co_mutex_unlock(&s->lock);
 s->ret = n;
-goto out;
+break;
 }
 /* save current sector and allocation status to local variables */
 sector_num = s->sector_num;
@@ -1792,7 +1792,6 @@ static void coroutine_fn convert_co_do_copy(void
*opaque)
 error_report("error while reading sector %" PRId64
  ": %s", sector_num, strerror(-ret));
 s->ret = ret;
-goto out;
 }
 } else if (!s->min_sparse && status == BLK_ZERO) {
 status = BLK_DATA;
@@ -1801,22 +1800,20 @@ static void coroutine_fn convert_co_do_copy(void
*opaque)

 if (s->wr_in_order) {
 /* keep writes in order */
-while (s->wr_offs != sector_num) {
-if (s->ret != -EINPROGRESS) {
-goto out;
-}
+while (s->wr_offs != sector_num && s->ret == -EINPROGRESS) {
 s->wait_sector_num[index] = sector_num;
 qemu_coroutine_yield();
 }
 s->wait_sector_num[index] = -1;
 }

-ret = convert_co_write(s, sector_num, n, buf, status);
-if (ret < 0) {
-error_report("error while writing sector %" PRId64
- ": %s", sector_num, strerror(-ret));
-s->ret = ret;
- 

Re: [Qemu-devel] [Qemu-stable] [PATCH v2 1/1] qemu-img: wait for convert coroutines to complete

2017-04-24 Thread Anton Nefedov


On 24/04/2017 21:16, Peter Lieven wrote:




Am 24.04.2017 um 18:27 schrieb Anton Nefedov :


On 04/21/2017 03:37 PM, Peter Lieven wrote:

Am 21.04.2017 um 14:19 schrieb Anton Nefedov:

On 04/21/2017 01:44 PM, Peter Lieven wrote:

Am 21.04.2017 um 12:04 schrieb Anton Nefedov:
On error path (like i/o error in one of the coroutines), it's required to
 - wait for coroutines completion before cleaning the common structures
 - reenter dependent coroutines so they ever finish

Introduced in 2d9187bc65.

Signed-off-by: Anton Nefedov 
---
[..]




And what if we error out in the read path? Wouldn't be something like this 
easier?


diff --git a/qemu-img.c b/qemu-img.c
index 22f559a..4ff1085 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -1903,6 +1903,16 @@ static int convert_do_copy(ImgConvertState *s)
main_loop_wait(false);
}

+/* on error path we need to enter all coroutines that are still
+ * running before cleaning up common structures */
+if (s->ret) {
+for (i = 0; i < s->num_coroutines; i++) {
+ if (s->co[i]) {
+ qemu_coroutine_enter(s->co[i]);
+ }
+}
+}
+
if (s->compressed && !s->ret) {
/* signal EOF to align */
ret = blk_pwrite_compressed(s->target, 0, NULL, 0);


Peter



seemed a bit too daring to me to re-enter every coroutine potentially including 
the ones that yielded waiting for I/O completion.
If that's ok - that is for sure easier :)


I think we should enter every coroutine that is still running and have it 
terminate correctly. It was my mistake that I have not
done this in the original patch. Can you check if the above fixes your test 
cases that triggered the bug?



hi, sorry I'm late with the answer

this segfaults in bdrv_close(). Looks like it tries to finish some i/o which 
coroutine we have already entered and terminated?

(gdb) run
Starting program: /vz/anefedov/qemu-build/us/./qemu-img convert -O qcow2 
./harddisk.hdd.c ./harddisk.hdd
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
[New Thread 0x7fffeac2d700 (LWP 436020)]
[New Thread 0x7fffe4ed6700 (LWP 436021)]
qemu-img: error while reading sector 20480: Input/output error
qemu-img: error while writing sector 19712: Operation now in progress

Program received signal SIGSEGV, Segmentation fault.
aio_co_wake (co=0x0) at /mnt/code/us-qemu/util/async.c:454
454 ctx = atomic_read(&co->ctx);
(gdb) bt
#0  aio_co_wake (co=0x0) at /mnt/code/us-qemu/util/async.c:454
/* [Anton]: thread_pool_co_cb () here */
#1  0x55634629 in thread_pool_completion_bh (opaque=0x55cfe020) at 
/mnt/code/us-qemu/util/thread-pool.c:189
#2  0x55633b31 in aio_bh_call (bh=0x55cfe0f0) at 
/mnt/code/us-qemu/util/async.c:90
#3  aio_bh_poll (ctx=ctx@entry=0x55cee6d0) at 
/mnt/code/us-qemu/util/async.c:118
#4  0x55636f14 in aio_poll (ctx=ctx@entry=0x55cee6d0, 
blocking=) at /mnt/code/us-qemu/util/aio-posix.c:682
#5  0x555c52d4 in bdrv_drain_recurse (bs=bs@entry=0x55d22560) at 
/mnt/code/us-qemu/block/io.c:164
#6  0x555c5aed in bdrv_drained_begin (bs=bs@entry=0x55d22560) at 
/mnt/code/us-qemu/block/io.c:248
#7  0x55581443 in bdrv_close (bs=0x55d22560) at 
/mnt/code/us-qemu/block.c:2909
#8  bdrv_delete (bs=0x55d22560) at /mnt/code/us-qemu/block.c:3100
#9  bdrv_unref (bs=0x55d22560) at /mnt/code/us-qemu/block.c:4087
#10 0x555baf44 in blk_remove_bs (blk=blk@entry=0x55d22380) at 
/mnt/code/us-qemu/block/block-backend.c:552
#11 0x555bb173 in blk_delete (blk=0x55d22380) at 
/mnt/code/us-qemu/block/block-backend.c:238
#12 blk_unref (blk=blk@entry=0x55d22380) at 
/mnt/code/us-qemu/block/block-backend.c:282
#13 0x5557a22c in img_convert (argc=, argv=) at /mnt/code/us-qemu/qemu-img.c:2359
#14 0x55574189 in main (argc=5, argv=0x7fffe4a0) at 
/mnt/code/us-qemu/qemu-img.c:4464



Peter



/Anton



it seems that this is a bit tricky, can you share how your test case works?

thanks,
peter



how I tested today was basically

diff --git a/qemu-img.c b/qemu-img.c
index 4425aaa..3d2d506 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -1788,6 +1788,10 @@ static void coroutine_fn convert_co_do_copy(void 
*opaque)


 if (status == BLK_DATA) {
 ret = convert_co_read(s, sector_num, n, buf);
+const uint64_t fsector = 10*1024*1024/512;
+if (sector_num <= fsector && fsector < sector_num+n) {
+ret = -EIO;
+}
 if (ret < 0) {
 error_report("error while reading sector %" PRId64
  ": %s", sector_num, strerror(-ret));




Re: [Qemu-devel] [PATCH v2 1/1] qemu-img: wait for convert coroutines to complete

2017-04-24 Thread Anton Nefedov

On 04/21/2017 03:37 PM, Peter Lieven wrote:

Am 21.04.2017 um 14:19 schrieb Anton Nefedov:

On 04/21/2017 01:44 PM, Peter Lieven wrote:

Am 21.04.2017 um 12:04 schrieb Anton Nefedov:

On error path (like i/o error in one of the coroutines), it's required to
  - wait for coroutines completion before cleaning the common structures
  - reenter dependent coroutines so they ever finish

Introduced in 2d9187bc65.

Signed-off-by: Anton Nefedov 
---
[..]




And what if we error out in the read path? Wouldn't be something like this 
easier?


diff --git a/qemu-img.c b/qemu-img.c
index 22f559a..4ff1085 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -1903,6 +1903,16 @@ static int convert_do_copy(ImgConvertState *s)
 main_loop_wait(false);
 }

+/* on error path we need to enter all coroutines that are still
+ * running before cleaning up common structures */
+if (s->ret) {
+for (i = 0; i < s->num_coroutines; i++) {
+ if (s->co[i]) {
+ qemu_coroutine_enter(s->co[i]);
+ }
+}
+}
+
 if (s->compressed && !s->ret) {
 /* signal EOF to align */
 ret = blk_pwrite_compressed(s->target, 0, NULL, 0);


Peter



seemed a bit too daring to me to re-enter every coroutine potentially including 
the ones that yielded waiting for I/O completion.
If that's ok - that is for sure easier :)


I think we should enter every coroutine that is still running and have it 
terminate correctly. It was my mistake that I have not
done this in the original patch. Can you check if the above fixes your test 
cases that triggered the bug?



hi, sorry I'm late with the answer

this segfaults in bdrv_close(). Looks like it tries to finish some i/o 
which coroutine we have already entered and terminated?


(gdb) run
Starting program: /vz/anefedov/qemu-build/us/./qemu-img convert -O qcow2 
./harddisk.hdd.c ./harddisk.hdd

[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
[New Thread 0x7fffeac2d700 (LWP 436020)]
[New Thread 0x7fffe4ed6700 (LWP 436021)]
qemu-img: error while reading sector 20480: Input/output error
qemu-img: error while writing sector 19712: Operation now in progress

Program received signal SIGSEGV, Segmentation fault.
aio_co_wake (co=0x0) at /mnt/code/us-qemu/util/async.c:454
454 ctx = atomic_read(&co->ctx);
(gdb) bt
#0  aio_co_wake (co=0x0) at /mnt/code/us-qemu/util/async.c:454
/* [Anton]: thread_pool_co_cb () here */
#1  0x55634629 in thread_pool_completion_bh 
(opaque=0x55cfe020) at /mnt/code/us-qemu/util/thread-pool.c:189
#2  0x55633b31 in aio_bh_call (bh=0x55cfe0f0) at 
/mnt/code/us-qemu/util/async.c:90
#3  aio_bh_poll (ctx=ctx@entry=0x55cee6d0) at 
/mnt/code/us-qemu/util/async.c:118
#4  0x55636f14 in aio_poll (ctx=ctx@entry=0x55cee6d0, 
blocking=) at /mnt/code/us-qemu/util/aio-posix.c:682
#5  0x555c52d4 in bdrv_drain_recurse 
(bs=bs@entry=0x55d22560) at /mnt/code/us-qemu/block/io.c:164
#6  0x555c5aed in bdrv_drained_begin 
(bs=bs@entry=0x55d22560) at /mnt/code/us-qemu/block/io.c:248
#7  0x55581443 in bdrv_close (bs=0x55d22560) at 
/mnt/code/us-qemu/block.c:2909

#8  bdrv_delete (bs=0x55d22560) at /mnt/code/us-qemu/block.c:3100
#9  bdrv_unref (bs=0x55d22560) at /mnt/code/us-qemu/block.c:4087
#10 0x555baf44 in blk_remove_bs (blk=blk@entry=0x55d22380) 
at /mnt/code/us-qemu/block/block-backend.c:552
#11 0x555bb173 in blk_delete (blk=0x55d22380) at 
/mnt/code/us-qemu/block/block-backend.c:238
#12 blk_unref (blk=blk@entry=0x55d22380) at 
/mnt/code/us-qemu/block/block-backend.c:282
#13 0x5557a22c in img_convert (argc=, 
argv=) at /mnt/code/us-qemu/qemu-img.c:2359
#14 0x55574189 in main (argc=5, argv=0x7fffe4a0) at 
/mnt/code/us-qemu/qemu-img.c:4464




Peter



/Anton



[Qemu-devel] [PATCH v2 1/1] qemu-img: wait for convert coroutines to complete

2017-04-21 Thread Anton Nefedov
On error path (like i/o error in one of the coroutines), it's required to
  - wait for coroutines completion before cleaning the common structures
  - reenter dependent coroutines so they ever finish

Introduced in 2d9187bc65.

Signed-off-by: Anton Nefedov 
---
 qemu-img.c | 44 +++-
 1 file changed, 27 insertions(+), 17 deletions(-)

diff --git a/qemu-img.c b/qemu-img.c
index b220cf7..24950c1 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -1735,6 +1735,27 @@ static int coroutine_fn convert_co_write(ImgConvertState 
*s, int64_t sector_num,
 return 0;
 }
 
+static void coroutine_fn convert_reenter_waiting(ImgConvertState *s,
+ uint64_t wr_offs)
+{
+int i;
+if (s->wr_in_order) {
+/* reenter the coroutine that might have waited
+ * for the write to complete */
+for (i = 0; i < s->num_coroutines; i++) {
+if (s->co[i] && s->wait_sector_num[i] == wr_offs) {
+/*
+ * A -> B -> A cannot occur because A has
+ * s->wait_sector_num[i] == -1 during A -> B.  Therefore
+ * B will never enter A during this time window.
+ */
+qemu_coroutine_enter(s->co[i]);
+break;
+}
+}
+}
+}
+
 static void coroutine_fn convert_co_do_copy(void *opaque)
 {
 ImgConvertState *s = opaque;
@@ -1792,6 +1813,7 @@ static void coroutine_fn convert_co_do_copy(void *opaque)
 error_report("error while reading sector %" PRId64
  ": %s", sector_num, strerror(-ret));
 s->ret = ret;
+convert_reenter_waiting(s, sector_num + n);
 goto out;
 }
 } else if (!s->min_sparse && status == BLK_ZERO) {
@@ -1803,6 +1825,7 @@ static void coroutine_fn convert_co_do_copy(void *opaque)
 /* keep writes in order */
 while (s->wr_offs != sector_num) {
 if (s->ret != -EINPROGRESS) {
+convert_reenter_waiting(s, sector_num + n);
 goto out;
 }
 s->wait_sector_num[index] = sector_num;
@@ -1816,25 +1839,12 @@ static void coroutine_fn convert_co_do_copy(void 
*opaque)
 error_report("error while writing sector %" PRId64
  ": %s", sector_num, strerror(-ret));
 s->ret = ret;
+convert_reenter_waiting(s, sector_num + n);
 goto out;
 }
 
-if (s->wr_in_order) {
-/* reenter the coroutine that might have waited
- * for this write to complete */
-s->wr_offs = sector_num + n;
-for (i = 0; i < s->num_coroutines; i++) {
-if (s->co[i] && s->wait_sector_num[i] == s->wr_offs) {
-/*
- * A -> B -> A cannot occur because A has
- * s->wait_sector_num[i] == -1 during A -> B.  Therefore
- * B will never enter A during this time window.
- */
-qemu_coroutine_enter(s->co[i]);
-break;
-}
-}
-}
+s->wr_offs = sector_num + n;
+convert_reenter_waiting(s, s->wr_offs);
 }
 
 out:
@@ -1899,7 +1909,7 @@ static int convert_do_copy(ImgConvertState *s)
 qemu_coroutine_enter(s->co[i]);
 }
 
-while (s->ret == -EINPROGRESS) {
+while (s->running_coroutines) {
 main_loop_wait(false);
 }
 
-- 
2.7.4




Re: [Qemu-devel] [PATCH v2 1/1] qemu-img: wait for convert coroutines to complete

2017-04-21 Thread Anton Nefedov

On 04/21/2017 01:44 PM, Peter Lieven wrote:

Am 21.04.2017 um 12:04 schrieb Anton Nefedov:

On error path (like i/o error in one of the coroutines), it's required to
  - wait for coroutines completion before cleaning the common structures
  - reenter dependent coroutines so they ever finish

Introduced in 2d9187bc65.

Signed-off-by: Anton Nefedov 
---
[..]




And what if we error out in the read path? Wouldn't be something like this 
easier?


diff --git a/qemu-img.c b/qemu-img.c
index 22f559a..4ff1085 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -1903,6 +1903,16 @@ static int convert_do_copy(ImgConvertState *s)
 main_loop_wait(false);
 }

+/* on error path we need to enter all coroutines that are still
+ * running before cleaning up common structures */
+if (s->ret) {
+for (i = 0; i < s->num_coroutines; i++) {
+ if (s->co[i]) {
+ qemu_coroutine_enter(s->co[i]);
+ }
+}
+}
+
 if (s->compressed && !s->ret) {
 /* signal EOF to align */
 ret = blk_pwrite_compressed(s->target, 0, NULL, 0);


Peter



seemed a bit too daring to me to re-enter every coroutine potentially 
including the ones that yielded waiting for I/O completion.

If that's ok - that is for sure easier :)

Or maybe some intermediate variant, as you suggest but only enter the 
"wait_sector" coroutines (and main_loop_wait for the rest)?

Or do not even re-enter them, like just

-while (s->ret == -EINPROGRESS) {
+while (s->running_coroutines != s->waiting_coroutines) {
 main_loop_wait(false);
 }


/Anton



Re: [Qemu-devel] [PATCH 1/1] qemu-img: wait for convert coroutines to complete

2017-04-21 Thread Anton Nefedov

On 04/21/2017 12:18 PM, Peter Lieven wrote:

Am 18.04.2017 um 12:27 schrieb Denis V. Lunev:

From: Anton Nefedov 

We should wait for other coroutines on error path, i.e. one of coroutines
terminates with i/o error, before cleaning the common structures. In the
other case we would crash in a lot of different places. This behaviour
was introduced by commit 2d9187bc65.

Signed-off-by: Anton Nefedov 
Signed-off-by: Denis V. Lunev 
CC: Peter Lieven 
CC: Kevin Wolf 
CC: Max Reitz 


Cc: qemu-sta...@nongnu.org

This should go into 2.9.1

Peter



Actually I'm afraid the patch is incorrect.. The erroneous coroutine 
bails out without reentering its possible dependent coroutine so it (and 
the following dependents) will never finish.


I'll send out the 2nd version soon

/Anton



Re: [Qemu-devel] [PATCH] io: fully parse & validate HTTP headers for websocket protocol handshake

2017-02-28 Thread Anton Nefedov

On 02/28/2017 03:07 PM, Daniel P. Berrange wrote:

The current websockets protocol handshake code is very relaxed, just
doing crude string searching across the HTTP header data. This causes
it to both reject valid connections and fail to reject invalid
connections. For example, according to the RFC 6455 it:

 - MUST reject any method other than "GET"
 - MUST reject any HTTP version less than "HTTP/1.1"
 - MUST reject Connection header without "Upgrade" listed
 - MUST reject Upgrade header which is not 'websocket'
 - MUST reject missing Host header
 - MUST treat HTTP header names as case insensitive

To do all this validation correctly requires that we fully parse the
HTTP headers, populating a data structure containing the header
fields.

After this change, we also reject any path other than '/'

Signed-off-by: Daniel P. Berrange 
---


thanks, this works with the client that used to fail
(tornado python lib http://www.tornadoweb.org/en/stable/)


/Anton



Re: [Qemu-devel] [PATCH 3/3] vl: pass CPUState to qemu_system_guest_panicked

2017-02-22 Thread Anton Nefedov

hi,

On 02/21/2017 01:42 PM, Paolo Bonzini wrote:

qemu_system_guest_panicked was already using current_cpu implicitly,
so it makes sense for it to receive a CPUState.  This lets the
function call cpu_get_crash_info and free the result.



my motivation against this was that as qemu_system_guest_panicked() is 
also used by pvpanic device, if it's ever going to pass any information, 
GuestPanicInformation struct could be reused.

Though I guess it doesnt matter much now

/Anton



[Qemu-devel] [PATCH 0/1] qmp-events: fix GUEST_PANICKED description formatting

2017-02-17 Thread Anton Nefedov
This patch is a follow-up for:

http://lists.nongnu.org/archive/html/qemu-devel/2017-02/msg03647.html




[Qemu-devel] [PATCH] qmp-events: fix GUEST_PANICKED description formatting

2017-02-16 Thread Anton Nefedov
also remove a useless NULL check in the event reporting code

Signed-off-by: Anton Nefedov 
---
 qapi/event.json |  4 ++--
 vl.c| 22 ++
 2 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/qapi/event.json b/qapi/event.json
index 970ff02..e02852c 100644
--- a/qapi/event.json
+++ b/qapi/event.json
@@ -488,9 +488,9 @@
 #
 # @action: action that has been taken, currently always "pause"
 #
-# @info: optional information about a panic
+# @info: #optional information about a panic (since 2.9)
 #
-# Since: 1.5 (@info since 2.9)
+# Since: 1.5
 #
 # Example:
 #
diff --git a/vl.c b/vl.c
index 903c46d..f410e03 100644
--- a/vl.c
+++ b/vl.c
@@ -1710,6 +1710,15 @@ void qemu_system_reset(bool report)
 void qemu_system_guest_panicked(GuestPanicInformation *info)
 {
 qemu_log_mask(LOG_GUEST_ERROR, "Guest crashed\n");
+if (info && info->type == GUEST_PANIC_INFORMATION_KIND_HYPER_V) {
+qemu_log_mask(LOG_GUEST_ERROR, "HV crash parameters: (%#"PRIx64
+  " %#"PRIx64" %#"PRIx64" %#"PRIx64" %#"PRIx64")\n",
+  info->u.hyper_v.data->arg1,
+  info->u.hyper_v.data->arg2,
+  info->u.hyper_v.data->arg3,
+  info->u.hyper_v.data->arg4,
+  info->u.hyper_v.data->arg5);
+}
 
 if (current_cpu) {
 current_cpu->crash_occurred = true;
@@ -1723,18 +1732,7 @@ void qemu_system_guest_panicked(GuestPanicInformation 
*info)
 qemu_system_shutdown_request();
 }
 
-if (info) {
-if (info->type == GUEST_PANIC_INFORMATION_KIND_HYPER_V) {
-qemu_log_mask(LOG_GUEST_ERROR, "HV crash parameters: (%#"PRIx64
-  " %#"PRIx64" %#"PRIx64" %#"PRIx64" %#"PRIx64")\n",
-  info->u.hyper_v.data->arg1,
-  info->u.hyper_v.data->arg2,
-  info->u.hyper_v.data->arg3,
-  info->u.hyper_v.data->arg4,
-  info->u.hyper_v.data->arg5);
-}
-qapi_free_GuestPanicInformation(info);
-}
+qapi_free_GuestPanicInformation(info);
 }
 
 void qemu_system_reset_request(void)
-- 
2.7.4




[Qemu-devel] [PATCH] qmp-events: fix GUEST_PANICKED description formatting

2017-02-16 Thread Anton Nefedov
Signed-off-by: Anton Nefedov 
---
 qapi/event.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/qapi/event.json b/qapi/event.json
index 970ff02..e02852c 100644
--- a/qapi/event.json
+++ b/qapi/event.json
@@ -488,9 +488,9 @@
 #
 # @action: action that has been taken, currently always "pause"
 #
-# @info: optional information about a panic
+# @info: #optional information about a panic (since 2.9)
 #
-# Since: 1.5 (@info since 2.9)
+# Since: 1.5
 #
 # Example:
 #
-- 
2.7.4




Re: [Qemu-devel] [PATCH 1/1] mirror: restore offset after zeroing out the image

2017-01-31 Thread Anton Nefedov

On 01/30/2017 08:33 PM, Denis V. Lunev wrote:

On 01/30/2017 08:16 PM, Eric Blake wrote:

On 01/30/2017 04:22 AM, Denis V. Lunev wrote:

If explicit zeroing out before mirroring is required for the target image,
it moves the block job offset counter to EOF, then offset and len counters
count the image size twice.

There is no harm but confusing stats (e.g. for 1G image the completion
counter starts from 1G and increases to 2G)

The patch fixed that problem by resetting the offset counter.

Counters are explicitly documented NOT tied to disk length; they are
merely estimates of proportional completion.  I'm not sure if this makes
the numbers jump backwards from the observer's viewpoint, but if you can
ever spot 1g/1g right before rewinding to 0g/1g (where pre-patch could
see 1g/2g), then that is a reason to not take this patch.


FWIW you'll actually see 1g/0g first (if you happen to catch that tiny 
phase :)



On the other
hand, your argument that the pre-patch behavior progressing towards 2g
has a very fast progression from 0-1g/2g, and then a much slower
1g-2g/2g, which makes the estimate of percent completion skewed, while a
newer progression of 0-1g/1g is more realistic, may have some merit.

I'm not sold on this patch yet, but stronger arguments in the commit
message may sway me.


+++ b/tests/qemu-iotests/094.out
@@ -3,9 +3,9 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864
 Formatting 'TEST_DIR/source.IMGFMT', fmt=IMGFMT size=67108864
 {"return": {}}
 {"return": {}}
-{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "src", 
"len": 67108864, "offset": 67108864, "speed": 0, "type": "mirror"}}
+{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "BLOCK_JOB_READY", "data": {"device": "src", 
"len": 0, "offset": 0, "speed": 0, "type": "mirror"}}
 {"return": {}}

This part of the change is scary - a ready event showing 0/0 HAS been
known to confuse libvirt in the past.  Qemu should NEVER advertise a
ready event with 0/0, it should at least be 1/1 (because of the number
of clients that have workarounds to deal with older qemu behavior on 0/0
and which might misbehave if we ever issue that again).


I think this 094.out modification is not needed and actually breaks the 
test; apologies





-{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "src", 
"len": 67108864, "offset": 67108864, "speed": 0, "type": "mirror"}}
+{"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "src", 
"len": 0, "offset": 0, "speed": 0, "type": "mirror"}}

So NACK to the patch as currently written, but not necessarily to the
idea if you can give better progress numbers and never reach the state
of BLOCK_JOB_READY and BLOCK_JOB_COMPLETED with a 0/0 length/offset.


ok. fair enough. Thank you for the review.

Will it be better to (somehow) skip progressing below using
some condition during mirror_dirty_init() stage?

static void mirror_iteration_done(MirrorOp *op, int ret)
{
.

if (ret >= 0) {
if (s->cow_bitmap) {
bitmap_set(s->cow_bitmap, chunk_num, nb_chunks);
}
s->common.offset += (uint64_t)op->nb_sectors * BDRV_SECTOR_SIZE;
< specifically this progressing
}


Isn't this going to look quite too invasive for such cause

--
best regards,
Anton



<    1   2   3   4   5