from:"Dr. David Alan Gilbert \(git\)"

[PATCH 091/104] libvhost-user: Fix some memtable remap cases

2019-12-12 Thread Dr. David Alan Gilbert (git)

From: "Dr. David Alan Gilbert" 

If a new setmemtable command comes in once the vhost threads are
running, it will remap the guest's address space and the threads
will now be looking in the wrong place.

Fortunately we're running this command under lock, so we can
update the queue mappings so that threads will look in the new,
correct place.

Note: This doesn't fix things that the threads might be doing
without a lock (e.g. a readv/writev!)  That's for another time.

Signed-off-by: Dr. David Alan Gilbert 
---
 contrib/libvhost-user/libvhost-user.c | 33 ---
 contrib/libvhost-user/libvhost-user.h |  3 +++
 2 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/contrib/libvhost-user/libvhost-user.c 
b/contrib/libvhost-user/libvhost-user.c
index 63e41062a4..b89bf18501 100644
--- a/contrib/libvhost-user/libvhost-user.c
+++ b/contrib/libvhost-user/libvhost-user.c
@@ -564,6 +564,21 @@ vu_reset_device_exec(VuDev *dev, VhostUserMsg *vmsg)
 return false;
 }
 
+static bool
+map_ring(VuDev *dev, VuVirtq *vq)
+{
+vq->vring.desc = qva_to_va(dev, vq->vra.desc_user_addr);
+vq->vring.used = qva_to_va(dev, vq->vra.used_user_addr);
+vq->vring.avail = qva_to_va(dev, vq->vra.avail_user_addr);
+
+DPRINT("Setting virtq addresses:\n");
+DPRINT("vring_desc  at %p\n", vq->vring.desc);
+DPRINT("vring_used  at %p\n", vq->vring.used);
+DPRINT("vring_avail at %p\n", vq->vring.avail);
+
+return !(vq->vring.desc && vq->vring.used && vq->vring.avail);
+}
+
 static bool
 vu_set_mem_table_exec_postcopy(VuDev *dev, VhostUserMsg *vmsg)
 {
@@ -767,6 +782,14 @@ vu_set_mem_table_exec(VuDev *dev, VhostUserMsg *vmsg)
 close(vmsg->fds[i]);
 }
 
+for (i = 0; i < dev->max_queues; i++) {
+if (dev->vq[i].vring.desc) {
+if (map_ring(dev, &dev->vq[i])) {
+vu_panic(dev, "remaping queue %d during setmemtable", i);
+}
+}
+}
+
 return false;
 }
 
@@ -853,18 +876,12 @@ vu_set_vring_addr_exec(VuDev *dev, VhostUserMsg *vmsg)
 DPRINT("avail_user_addr:  0x%016" PRIx64 "\n", vra->avail_user_addr);
 DPRINT("log_guest_addr:   0x%016" PRIx64 "\n", vra->log_guest_addr);
 
+vq->vra = *vra;
 vq->vring.flags = vra->flags;
-vq->vring.desc = qva_to_va(dev, vra->desc_user_addr);
-vq->vring.used = qva_to_va(dev, vra->used_user_addr);
-vq->vring.avail = qva_to_va(dev, vra->avail_user_addr);
 vq->vring.log_guest_addr = vra->log_guest_addr;
 
-DPRINT("Setting virtq addresses:\n");
-DPRINT("vring_desc  at %p\n", vq->vring.desc);
-DPRINT("vring_used  at %p\n", vq->vring.used);
-DPRINT("vring_avail at %p\n", vq->vring.avail);
 
-if (!(vq->vring.desc && vq->vring.used && vq->vring.avail)) {
+if (map_ring(dev, vq)) {
 vu_panic(dev, "Invalid vring_addr message");
 return false;
 }
diff --git a/contrib/libvhost-user/libvhost-user.h 
b/contrib/libvhost-user/libvhost-user.h
index 1844b6f8d4..5cb7708559 100644
--- a/contrib/libvhost-user/libvhost-user.h
+++ b/contrib/libvhost-user/libvhost-user.h
@@ -327,6 +327,9 @@ typedef struct VuVirtq {
 int err_fd;
 unsigned int enable;
 bool started;
+
+/* Guest addresses of our ring */
+struct vhost_vring_addr vra;
 } VuVirtq;
 
 enum VuWatchCondtion {
-- 
2.23.0

[PATCH 095/104] virtiofsd: convert more fprintf and perror to use fuse log infra

2019-12-12 Thread Dr. David Alan Gilbert (git)

From: Eryu Guan 

Signed-off-by: Eryu Guan 
---
 tools/virtiofsd/fuse_signals.c | 6 +-
 tools/virtiofsd/helper.c   | 9 ++---
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/tools/virtiofsd/fuse_signals.c b/tools/virtiofsd/fuse_signals.c
index 10a6f88088..edabf24e0d 100644
--- a/tools/virtiofsd/fuse_signals.c
+++ b/tools/virtiofsd/fuse_signals.c
@@ -11,6 +11,7 @@
 #include "fuse_i.h"
 #include "fuse_lowlevel.h"
 
+#include 
 #include 
 #include 
 #include 
@@ -46,12 +47,15 @@ static int set_one_signal_handler(int sig, void 
(*handler)(int), int remove)
 sa.sa_flags = 0;
 
 if (sigaction(sig, NULL, &old_sa) == -1) {
-perror("fuse: cannot get old signal handler");
+fuse_log(FUSE_LOG_ERR, "fuse: cannot get old signal handler: %s\n",
+ strerror(errno));
 return -1;
 }
 
 if (old_sa.sa_handler == (remove ? handler : SIG_DFL) &&
 sigaction(sig, &sa, NULL) == -1) {
+fuse_log(FUSE_LOG_ERR, "fuse: cannot set signal handler: %s\n",
+ strerror(errno));
 perror("fuse: cannot set signal handler");
 return -1;
 }
diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c
index 7b28507a38..bcb8c05063 100644
--- a/tools/virtiofsd/helper.c
+++ b/tools/virtiofsd/helper.c
@@ -200,7 +200,8 @@ int fuse_daemonize(int foreground)
 char completed;
 
 if (pipe(waiter)) {
-perror("fuse_daemonize: pipe");
+fuse_log(FUSE_LOG_ERR, "fuse_daemonize: pipe: %s\n",
+ strerror(errno));
 return -1;
 }
 
@@ -210,7 +211,8 @@ int fuse_daemonize(int foreground)
  */
 switch (fork()) {
 case -1:
-perror("fuse_daemonize: fork");
+fuse_log(FUSE_LOG_ERR, "fuse_daemonize: fork: %s\n",
+ strerror(errno));
 return -1;
 case 0:
 break;
@@ -220,7 +222,8 @@ int fuse_daemonize(int foreground)
 }
 
 if (setsid() == -1) {
-perror("fuse_daemonize: setsid");
+fuse_log(FUSE_LOG_ERR, "fuse_daemonize: setsid: %s\n",
+ strerror(errno));
 return -1;
 }
 
-- 
2.23.0

[PATCH 097/104] virtiofsd: Fix data corruption with O_APPEND write in writeback mode

2019-12-12 Thread Dr. David Alan Gilbert (git)

From: Misono Tomohiro 

When writeback mode is enabled (-o writeback), O_APPEND handling is
done in the kernel. Therefore virtiofsd clears the O_APPEND flag when
opening a file. Otherwise the O_APPEND flag takes precedence over
pwrite() and the written data may be corrupted.

Currently clearing O_APPEND flag is done in lo_open(), but we also
need the same operation in lo_create(). So, factor out the flag
update operation in lo_open() to update_open_flags() and call it
in both lo_open() and lo_create().

This fixes the failure of xfstest generic/069 in writeback mode
(which tests O_APPEND write data integrity).

Signed-off-by: Misono Tomohiro 
---
 tools/virtiofsd/passthrough_ll.c | 66 
 1 file changed, 33 insertions(+), 33 deletions(-)

diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c
index 6b3d396b6f..1bf251a91d 100644
--- a/tools/virtiofsd/passthrough_ll.c
+++ b/tools/virtiofsd/passthrough_ll.c
@@ -1676,6 +1676,37 @@ static void lo_releasedir(fuse_req_t req, fuse_ino_t ino,
 fuse_reply_err(req, 0);
 }
 
+static void update_open_flags(int writeback, struct fuse_file_info *fi)
+{
+/*
+ * With writeback cache, kernel may send read requests even
+ * when userspace opened write-only
+ */
+if (writeback && (fi->flags & O_ACCMODE) == O_WRONLY) {
+fi->flags &= ~O_ACCMODE;
+fi->flags |= O_RDWR;
+}
+
+/*
+ * With writeback cache, O_APPEND is handled by the kernel.
+ * This breaks atomicity (since the file may change in the
+ * underlying filesystem, so that the kernel's idea of the
+ * end of the file isn't accurate anymore). In this example,
+ * we just accept that. A more rigorous filesystem may want
+ * to return an error here
+ */
+if (writeback && (fi->flags & O_APPEND)) {
+fi->flags &= ~O_APPEND;
+}
+
+/*
+ * O_DIRECT in guest should not necessarily mean bypassing page
+ * cache on host as well. If somebody needs that behavior, it
+ * probably should be a configuration knob in daemon.
+ */
+fi->flags &= ~O_DIRECT;
+}
+
 static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name,
   mode_t mode, struct fuse_file_info *fi)
 {
@@ -1705,12 +1736,7 @@ static void lo_create(fuse_req_t req, fuse_ino_t parent, 
const char *name,
 goto out;
 }
 
-/*
- * O_DIRECT in guest should not necessarily mean bypassing page
- * cache on host as well. If somebody needs that behavior, it
- * probably should be a configuration knob in daemon.
- */
-fi->flags &= ~O_DIRECT;
+update_open_flags(lo->writeback, fi);
 
 fd = openat(parent_inode->fd, name, (fi->flags | O_CREAT) & ~O_NOFOLLOW,
 mode);
@@ -1920,33 +1946,7 @@ static void lo_open(fuse_req_t req, fuse_ino_t ino, 
struct fuse_file_info *fi)
 fuse_log(FUSE_LOG_DEBUG, "lo_open(ino=%" PRIu64 ", flags=%d)\n", ino,
  fi->flags);
 
-/*
- * With writeback cache, kernel may send read requests even
- * when userspace opened write-only
- */
-if (lo->writeback && (fi->flags & O_ACCMODE) == O_WRONLY) {
-fi->flags &= ~O_ACCMODE;
-fi->flags |= O_RDWR;
-}
-
-/*
- * With writeback cache, O_APPEND is handled by the kernel.
- * This breaks atomicity (since the file may change in the
- * underlying filesystem, so that the kernel's idea of the
- * end of the file isn't accurate anymore). In this example,
- * we just accept that. A more rigorous filesystem may want
- * to return an error here
- */
-if (lo->writeback && (fi->flags & O_APPEND)) {
-fi->flags &= ~O_APPEND;
-}
-
-/*
- * O_DIRECT in guest should not necessarily mean bypassing page
- * cache on host as well. If somebody needs that behavior, it
- * probably should be a configuration knob in daemon.
- */
-fi->flags &= ~O_DIRECT;
+update_open_flags(lo->writeback, fi);
 
 sprintf(buf, "%i", lo_fd(req, ino));
 fd = openat(lo->proc_self_fd, buf, fi->flags & ~O_NOFOLLOW);
-- 
2.23.0

[PATCH 093/104] virtiofsd: introduce inode refcount to prevent use-after-free

2019-12-12 Thread Dr. David Alan Gilbert (git)

From: Stefan Hajnoczi 

If thread A is using an inode it must not be deleted by thread B when
processing a FUSE_FORGET request.

The FUSE protocol itself already has a counter called nlookup that is
used in FUSE_FORGET messages.  We cannot trust this counter since the
untrusted client can manipulate it via FUSE_FORGET messages.

Introduce a new refcount to keep inodes alive for the required lifespan.
lo_inode_put() must be called to release a reference.  FUSE's nlookup
counter holds exactly one reference so that the inode stays alive as
long as the client still wants to remember it.

Note that the lo_inode->is_symlink field is moved to avoid creating a
hole in the struct due to struct field alignment.

Signed-off-by: Stefan Hajnoczi 
---
 tools/virtiofsd/passthrough_ll.c | 168 ++-
 1 file changed, 145 insertions(+), 23 deletions(-)

diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c
index b19c9ee328..8f4ab8351c 100644
--- a/tools/virtiofsd/passthrough_ll.c
+++ b/tools/virtiofsd/passthrough_ll.c
@@ -99,7 +99,13 @@ struct lo_key {
 
 struct lo_inode {
 int fd;
-bool is_symlink;
+
+/*
+ * Atomic reference count for this object.  The nlookup field holds a
+ * reference and release it when nlookup reaches 0.
+ */
+gint refcount;
+
 struct lo_key key;
 
 /*
@@ -118,6 +124,8 @@ struct lo_inode {
 fuse_ino_t fuse_ino;
 pthread_mutex_t plock_mutex;
 GHashTable *posix_locks; /* protected by lo_inode->plock_mutex */
+
+bool is_symlink;
 };
 
 struct lo_cred {
@@ -473,6 +481,23 @@ static ssize_t lo_add_inode_mapping(fuse_req_t req, struct 
lo_inode *inode)
 return elem - lo_data(req)->ino_map.elems;
 }
 
+static void lo_inode_put(struct lo_data *lo, struct lo_inode **inodep)
+{
+struct lo_inode *inode = *inodep;
+
+if (!inode) {
+return;
+}
+
+*inodep = NULL;
+
+if (g_atomic_int_dec_and_test(&inode->refcount)) {
+close(inode->fd);
+free(inode);
+}
+}
+
+/* Caller must release refcount using lo_inode_put() */
 static struct lo_inode *lo_inode(fuse_req_t req, fuse_ino_t ino)
 {
 struct lo_data *lo = lo_data(req);
@@ -480,6 +505,9 @@ static struct lo_inode *lo_inode(fuse_req_t req, fuse_ino_t 
ino)
 
 pthread_mutex_lock(&lo->mutex);
 elem = lo_map_get(&lo->ino_map, ino);
+if (elem) {
+g_atomic_int_inc(&elem->inode->refcount);
+}
 pthread_mutex_unlock(&lo->mutex);
 
 if (!elem) {
@@ -489,10 +517,23 @@ static struct lo_inode *lo_inode(fuse_req_t req, 
fuse_ino_t ino)
 return elem->inode;
 }
 
+/*
+ * TODO Remove this helper and force callers to hold an inode refcount until
+ * they are done with the fd.  This will be done in a later patch to make
+ * review easier.
+ */
 static int lo_fd(fuse_req_t req, fuse_ino_t ino)
 {
 struct lo_inode *inode = lo_inode(req, ino);
-return inode ? inode->fd : -1;
+int fd;
+
+if (!inode) {
+return -1;
+}
+
+fd = inode->fd;
+lo_inode_put(lo_data(req), &inode);
+return fd;
 }
 
 static void lo_init(void *userdata, struct fuse_conn_info *conn)
@@ -547,6 +588,10 @@ static void lo_getattr(fuse_req_t req, fuse_ino_t ino,
 fuse_reply_attr(req, &buf, lo->timeout);
 }
 
+/*
+ * Increments parent->nlookup and caller must release refcount using
+ * lo_inode_put(&parent).
+ */
 static int lo_parent_and_name(struct lo_data *lo, struct lo_inode *inode,
   char path[PATH_MAX], struct lo_inode **parent)
 {
@@ -584,6 +629,7 @@ retry:
 p = &lo->root;
 pthread_mutex_lock(&lo->mutex);
 p->nlookup++;
+g_atomic_int_inc(&p->refcount);
 pthread_mutex_unlock(&lo->mutex);
 } else {
 *last = '\0';
@@ -665,6 +711,7 @@ fallback:
 if (res != -1) {
 res = utimensat(parent->fd, path, tv, AT_SYMLINK_NOFOLLOW);
 unref_inode_lolocked(lo, parent, 1);
+lo_inode_put(lo, &parent);
 }
 
 return res;
@@ -782,11 +829,13 @@ static void lo_setattr(fuse_req_t req, fuse_ino_t ino, 
struct stat *attr,
 goto out_err;
 }
 }
+lo_inode_put(lo, &inode);
 
 return lo_getattr(req, ino, fi);
 
 out_err:
 saverr = errno;
+lo_inode_put(lo, &inode);
 fuse_reply_err(req, saverr);
 }
 
@@ -803,6 +852,7 @@ static struct lo_inode *lo_find(struct lo_data *lo, struct 
stat *st)
 if (p) {
 assert(p->nlookup > 0);
 p->nlookup++;
+g_atomic_int_inc(&p->refcount);
 }
 pthread_mutex_unlock(&lo->mutex);
 
@@ -822,6 +872,10 @@ static void posix_locks_value_destroy(gpointer data)
 free(plock);
 }
 
+/*
+ * Increments nlookup and caller must release refcount using
+ * lo_inode_put(&parent).
+ */
 static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name,
 struct fuse_entry_param *e)
 {
@@ -829,7 +883,8 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, 
const char *name,

[PATCH 098/104] virtiofsd: add definition of fuse_buf_writev()

2019-12-12 Thread Dr. David Alan Gilbert (git)

From: piaojun 

Define fuse_buf_writev(), which uses pwritev and writev to improve I/O
bandwidth. In particular, source bufs with 0 size should be skipped, as
their memory is not *block_size* aligned, which would cause writev to
fail in direct I/O mode.

Signed-off-by: Jun Piao 
Suggested-by: Stefan Hajnoczi 
---
 tools/virtiofsd/buffer.c | 39 +++
 1 file changed, 39 insertions(+)

diff --git a/tools/virtiofsd/buffer.c b/tools/virtiofsd/buffer.c
index f59d8d72eb..ae420c70c4 100644
--- a/tools/virtiofsd/buffer.c
+++ b/tools/virtiofsd/buffer.c
@@ -13,6 +13,7 @@
 #include "fuse_lowlevel.h"
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -32,6 +33,44 @@ size_t fuse_buf_size(const struct fuse_bufvec *bufv)
 return size;
 }
 
+__attribute__((unused))
+static ssize_t fuse_buf_writev(fuse_req_t req,
+   struct fuse_buf *out_buf,
+   struct fuse_bufvec *in_buf)
+{
+ssize_t res, i, j;
+size_t iovcnt = in_buf->count;
+struct iovec *iov;
+int fd = out_buf->fd;
+
+iov = calloc(iovcnt, sizeof(struct iovec));
+if (!iov) {
+return -ENOMEM;
+}
+
+for (i = 0, j = 0; i < iovcnt; i++) {
+/* Skip the buf with 0 size */
+if (in_buf->buf[i].size) {
+iov[j].iov_base = in_buf->buf[i].mem;
+iov[j].iov_len = in_buf->buf[i].size;
+j++;
+}
+}
+
+if (out_buf->flags & FUSE_BUF_FD_SEEK) {
+res = pwritev(fd, iov, iovcnt, out_buf->pos);
+} else {
+res = writev(fd, iov, iovcnt);
+}
+
+if (res == -1) {
+res = -errno;
+}
+
+free(iov);
+return res;
+}
+
 static size_t min_size(size_t s1, size_t s2)
 {
 return s1 < s2 ? s1 : s2;
-- 
2.23.0

[PATCH 100/104] virtiofsd: process requests in a thread pool

2019-12-12 Thread Dr. David Alan Gilbert (git)

From: Stefan Hajnoczi 

Introduce a thread pool so that fv_queue_thread() just pops
VuVirtqElements and hands them to the thread pool.  For the time being
only one worker thread is allowed since passthrough_ll.c is not
thread-safe yet.  Future patches will lift this restriction so that
multiple FUSE requests can be processed in parallel.

The main new concept is struct FVRequest, which contains both
VuVirtqElement and struct fuse_chan.  We now have fv_VuDev for a device,
fv_QueueInfo for a virtqueue, and FVRequest for a request.  Some of
fv_QueueInfo's fields are moved into FVRequest because they are
per-request.  The name FVRequest conforms to QEMU coding style and I
expect the struct fv_* types will be renamed in a future refactoring.

This patch series is not optimal.  fbuf reuse is dropped so each request
does malloc(se->bufsize), but there is no clean and cheap way to keep
this with a thread pool.  The vq_lock mutex is held for longer than
necessary, especially during the eventfd_write() syscall.  Performance
can be improved in the future.

prctl(2) had to be added to the seccomp whitelist because glib invokes
it.

Signed-off-by: Stefan Hajnoczi 
---
 tools/virtiofsd/fuse_virtio.c | 361 +++---
 1 file changed, 202 insertions(+), 159 deletions(-)

diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c
index 2c1e524852..b696ac3135 100644
--- a/tools/virtiofsd/fuse_virtio.c
+++ b/tools/virtiofsd/fuse_virtio.c
@@ -22,6 +22,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -37,17 +38,28 @@
 struct fv_VuDev;
 struct fv_QueueInfo {
 pthread_t thread;
+/*
+ * This lock protects the VuVirtq preventing races between
+ * fv_queue_thread() and fv_queue_worker().
+ */
+pthread_mutex_t vq_lock;
+
 struct fv_VuDev *virtio_dev;
 
 /* Our queue index, corresponds to array position */
 int qidx;
 int kick_fd;
 int kill_fd; /* For killing the thread */
+};
 
-/* The element for the command currently being processed */
-VuVirtqElement *qe;
+/* A FUSE request */
+typedef struct {
+VuVirtqElement elem;
+struct fuse_chan ch;
+
+/* Used to complete requests that involve no reply */
 bool reply_sent;
-};
+} FVRequest;
 
 /*
  * We pass the dev element into libvhost-user
@@ -191,8 +203,11 @@ static void copy_iov(struct iovec *src_iov, int src_count,
 int virtio_send_msg(struct fuse_session *se, struct fuse_chan *ch,
 struct iovec *iov, int count)
 {
-VuVirtqElement *elem;
-VuVirtq *q;
+FVRequest *req = container_of(ch, FVRequest, ch);
+struct fv_QueueInfo *qi = ch->qi;
+VuDev *dev = &se->virtio_dev->dev;
+VuVirtq *q = vu_get_queue(dev, qi->qidx);
+VuVirtqElement *elem = &req->elem;
 int ret = 0;
 
 assert(count >= 1);
@@ -205,11 +220,7 @@ int virtio_send_msg(struct fuse_session *se, struct 
fuse_chan *ch,
 
 /* unique == 0 is notification, which we don't support */
 assert(out->unique);
-/* For virtio we always have ch */
-assert(ch);
-assert(!ch->qi->reply_sent);
-elem = ch->qi->qe;
-q = &ch->qi->virtio_dev->dev.vq[ch->qi->qidx];
+assert(!req->reply_sent);
 
 /* The 'in' part of the elem is to qemu */
 unsigned int in_num = elem->in_num;
@@ -236,9 +247,15 @@ int virtio_send_msg(struct fuse_session *se, struct 
fuse_chan *ch,
 }
 
 copy_iov(iov, count, in_sg, in_num, tosend_len);
-vu_queue_push(&se->virtio_dev->dev, q, elem, tosend_len);
-vu_queue_notify(&se->virtio_dev->dev, q);
-ch->qi->reply_sent = true;
+
+pthread_rwlock_rdlock(&qi->virtio_dev->vu_dispatch_rwlock);
+pthread_mutex_lock(&qi->vq_lock);
+vu_queue_push(dev, q, elem, tosend_len);
+vu_queue_notify(dev, q);
+pthread_mutex_unlock(&qi->vq_lock);
+pthread_rwlock_unlock(&qi->virtio_dev->vu_dispatch_rwlock);
+
+req->reply_sent = true;
 
 err:
 return ret;
@@ -254,9 +271,12 @@ int virtio_send_data_iov(struct fuse_session *se, struct 
fuse_chan *ch,
  struct iovec *iov, int count, struct fuse_bufvec *buf,
  size_t len)
 {
+FVRequest *req = container_of(ch, FVRequest, ch);
+struct fv_QueueInfo *qi = ch->qi;
+VuDev *dev = &se->virtio_dev->dev;
+VuVirtq *q = vu_get_queue(dev, qi->qidx);
+VuVirtqElement *elem = &req->elem;
 int ret = 0;
-VuVirtqElement *elem;
-VuVirtq *q;
 
 assert(count >= 1);
 assert(iov[0].iov_len >= sizeof(struct fuse_out_header));
@@ -275,11 +295,7 @@ int virtio_send_data_iov(struct fuse_session *se, struct 
fuse_chan *ch,
 /* unique == 0 is notification which we don't support */
 assert(out->unique);
 
-/* For virtio we always have ch */
-assert(ch);
-assert(!ch->qi->reply_sent);
-elem = ch->qi->qe;
-q = &ch->qi->virtio_dev->dev.vq[ch->qi->qidx];
+assert(!req->reply_sent);
 
 /* The 'in' part of the elem is to qemu */
 unsigned int in_num = e

[PULL 1/2] vhost-user-fs: remove "vhostfd" property

2019-12-13 Thread Dr. David Alan Gilbert (git)

From: Marc-André Lureau 

The property doesn't make much sense for a vhost-user device.

Signed-off-by: Marc-André Lureau 
Message-Id: <20191116112016.14872-1-marcandre.lur...@redhat.com>
Reviewed-by: Stefan Hajnoczi 
Signed-off-by: Dr. David Alan Gilbert 
---
 hw/virtio/vhost-user-fs.c | 1 -
 include/hw/virtio/vhost-user-fs.h | 1 -
 2 files changed, 2 deletions(-)

diff --git a/hw/virtio/vhost-user-fs.c b/hw/virtio/vhost-user-fs.c
index f0df7f4746..ca0b7fc9de 100644
--- a/hw/virtio/vhost-user-fs.c
+++ b/hw/virtio/vhost-user-fs.c
@@ -263,7 +263,6 @@ static Property vuf_properties[] = {
 DEFINE_PROP_UINT16("num-request-queues", VHostUserFS,
conf.num_request_queues, 1),
 DEFINE_PROP_UINT16("queue-size", VHostUserFS, conf.queue_size, 128),
-DEFINE_PROP_STRING("vhostfd", VHostUserFS, conf.vhostfd),
 DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/include/hw/virtio/vhost-user-fs.h 
b/include/hw/virtio/vhost-user-fs.h
index 539885b458..9ff1bdb7cf 100644
--- a/include/hw/virtio/vhost-user-fs.h
+++ b/include/hw/virtio/vhost-user-fs.h
@@ -28,7 +28,6 @@ typedef struct {
 char *tag;
 uint16_t num_request_queues;
 uint16_t queue_size;
-char *vhostfd;
 } VHostUserFSConf;
 
 typedef struct {
-- 
2.23.0

[PULL 0/2] virtiofs queue

2019-12-13 Thread Dr. David Alan Gilbert (git)

From: "Dr. David Alan Gilbert" 

The following changes since commit b0ca999a43a22b38158a33d3f5881648bb4f:

  Update version for v4.2.0 release (2019-12-12 16:45:57 +)

are available in the Git repository at:

  git://github.com/dagrh/qemu.git tags/pull-virtiofs-20191213a

for you to fetch changes up to 366844f3d1329c6423dd752891a28ccb3ee8fddd:

  virtio-fs: fix MSI-X nvectors calculation (2019-12-13 10:53:57 +)


virtiofs pull 2019-12-13: Minor fixes and cleanups

Cleanup from Marc-André and MSI-X fix from Stefan.


Marc-André Lureau (1):
  vhost-user-fs: remove "vhostfd" property

Stefan Hajnoczi (1):
  virtio-fs: fix MSI-X nvectors calculation

 hw/virtio/vhost-user-fs-pci.c | 3 ++-
 hw/virtio/vhost-user-fs.c | 1 -
 include/hw/virtio/vhost-user-fs.h | 1 -
 3 files changed, 2 insertions(+), 3 deletions(-)

[PULL 2/2] virtio-fs: fix MSI-X nvectors calculation

2019-12-13 Thread Dr. David Alan Gilbert (git)

From: Stefan Hajnoczi 

The following MSI-X vectors are required:
 * VIRTIO Configuration Change
 * hiprio virtqueue
 * requests virtqueues

Fix the calculation to reserve enough MSI-X vectors.  Otherwise guest
drivers fall back to a sub-optimal configuration where all virtqueues
share a single vector.

This change does not break live migration compatibility since
vhost-user-fs-pci devices are not migratable yet.

Reported-by: Vivek Goyal 
Signed-off-by: Stefan Hajnoczi 
Message-Id: <20191209110759.35227-1-stefa...@redhat.com>
Reviewed-by: Dr. David Alan Gilbert 
Reviewed-by: Michael S. Tsirkin 
Signed-off-by: Dr. David Alan Gilbert 
---
 hw/virtio/vhost-user-fs-pci.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/hw/virtio/vhost-user-fs-pci.c b/hw/virtio/vhost-user-fs-pci.c
index 933a3f265b..e3a649d4a6 100644
--- a/hw/virtio/vhost-user-fs-pci.c
+++ b/hw/virtio/vhost-user-fs-pci.c
@@ -40,7 +40,8 @@ static void vhost_user_fs_pci_realize(VirtIOPCIProxy 
*vpci_dev, Error **errp)
 DeviceState *vdev = DEVICE(&dev->vdev);
 
 if (vpci_dev->nvectors == DEV_NVECTORS_UNSPECIFIED) {
-vpci_dev->nvectors = dev->vdev.conf.num_request_queues + 1;
+/* Also reserve config change and hiprio queue vectors */
+vpci_dev->nvectors = dev->vdev.conf.num_request_queues + 2;
 }
 
 qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus));
-- 
2.23.0

[PATCH 0/2] rcu_read auto macro use

2019-12-13 Thread Dr. David Alan Gilbert (git)

From: "Dr. David Alan Gilbert" 

Hi,
  A couple more uses of the rcu_read macros; in qsp and
hyperv (neither of which list maintainers, so I guess
best through RCU).

The hyperv case saves a temporary.
The qsp case uses an rcu_read_lock around the lifetime
of a snapshot and carefully comments that; but now
it's automatic.

[Hyperv not tested]

Dave

Dr. David Alan Gilbert (2):
  hyperv: Use auto rcu_read macros
  qsp: Use WITH_RCU_READ_LOCK_GUARD

 hw/hyperv/hyperv.c | 22 +-
 util/qsp.c | 22 ++
 2 files changed, 19 insertions(+), 25 deletions(-)

-- 
2.23.0

[PATCH 2/2] qsp: Use WITH_RCU_READ_LOCK_GUARD

2019-12-13 Thread Dr. David Alan Gilbert (git)

From: "Dr. David Alan Gilbert" 

The automatic rcu read lock maintenance works quite
nicely in this case where it previously relied on a comment to
delimit the lifetime and now has a block.

Signed-off-by: Dr. David Alan Gilbert 
---
 util/qsp.c | 22 ++
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/util/qsp.c b/util/qsp.c
index 62265417fd..7d5147f1b2 100644
--- a/util/qsp.c
+++ b/util/qsp.c
@@ -598,7 +598,6 @@ static void qsp_ht_delete(void *p, uint32_t h, void *htp)
 
 static void qsp_mktree(GTree *tree, bool callsite_coalesce)
 {
-QSPSnapshot *snap;
 struct qht ht, coalesce_ht;
 struct qht *htp;
 
@@ -610,20 +609,19 @@ static void qsp_mktree(GTree *tree, bool 
callsite_coalesce)
  * We must remain in an RCU read-side critical section until we're done
  * with the snapshot.
  */
-rcu_read_lock();
-snap = atomic_rcu_read(&qsp_snapshot);
+WITH_RCU_READ_LOCK_GUARD() {
+QSPSnapshot *snap = atomic_rcu_read(&qsp_snapshot);
 
-/* Aggregate all results from the global hash table into a local one */
-qht_init(&ht, qsp_entry_no_thread_cmp, QSP_INITIAL_SIZE,
- QHT_MODE_AUTO_RESIZE | QHT_MODE_RAW_MUTEXES);
-qht_iter(&qsp_ht, qsp_aggregate, &ht);
+/* Aggregate all results from the global hash table into a local one */
+qht_init(&ht, qsp_entry_no_thread_cmp, QSP_INITIAL_SIZE,
+ QHT_MODE_AUTO_RESIZE | QHT_MODE_RAW_MUTEXES);
+qht_iter(&qsp_ht, qsp_aggregate, &ht);
 
-/* compute the difference wrt the snapshot, if any */
-if (snap) {
-qsp_diff(&snap->ht, &ht);
+/* compute the difference wrt the snapshot, if any */
+if (snap) {
+qsp_diff(&snap->ht, &ht);
+}
 }
-/* done with the snapshot; RCU can reclaim it */
-rcu_read_unlock();
 
 htp = &ht;
 if (callsite_coalesce) {
-- 
2.23.0

[PATCH 1/2] hyperv: Use auto rcu_read macros

2019-12-13 Thread Dr. David Alan Gilbert (git)

From: "Dr. David Alan Gilbert" 

Use RCU_READ_LOCK_GUARD and WITH_RCU_READ_LOCK_GUARD
to replace the manual rcu_read_(un)lock calls.

Signed-off-by: Dr. David Alan Gilbert 
---
 hw/hyperv/hyperv.c | 22 +-
 1 file changed, 9 insertions(+), 13 deletions(-)

diff --git a/hw/hyperv/hyperv.c b/hw/hyperv/hyperv.c
index 6ebf31c310..da8ce82725 100644
--- a/hw/hyperv/hyperv.c
+++ b/hw/hyperv/hyperv.c
@@ -546,14 +546,14 @@ uint16_t hyperv_hcall_post_message(uint64_t param, bool 
fast)
 }
 
 ret = HV_STATUS_INVALID_CONNECTION_ID;
-rcu_read_lock();
-QLIST_FOREACH_RCU(mh, &msg_handlers, link) {
-if (mh->conn_id == (msg->connection_id & HV_CONNECTION_ID_MASK)) {
-ret = mh->handler(msg, mh->data);
-break;
+WITH_RCU_READ_LOCK_GUARD() {
+QLIST_FOREACH_RCU(mh, &msg_handlers, link) {
+if (mh->conn_id == (msg->connection_id & HV_CONNECTION_ID_MASK)) {
+ret = mh->handler(msg, mh->data);
+break;
+}
 }
 }
-rcu_read_unlock();
 
 unmap:
 cpu_physical_memory_unmap(msg, len, 0, 0);
@@ -619,7 +619,6 @@ int hyperv_set_event_flag_handler(uint32_t conn_id, 
EventNotifier *notifier)
 
 uint16_t hyperv_hcall_signal_event(uint64_t param, bool fast)
 {
-uint16_t ret;
 EventFlagHandler *handler;
 
 if (unlikely(!fast)) {
@@ -645,15 +644,12 @@ uint16_t hyperv_hcall_signal_event(uint64_t param, bool 
fast)
 return HV_STATUS_INVALID_HYPERCALL_INPUT;
 }
 
-ret = HV_STATUS_INVALID_CONNECTION_ID;
-rcu_read_lock();
+RCU_READ_LOCK_GUARD();
 QLIST_FOREACH_RCU(handler, &event_flag_handlers, link) {
 if (handler->conn_id == param) {
 event_notifier_set(handler->notifier);
-ret = 0;
-break;
+return 0;
 }
 }
-rcu_read_unlock();
-return ret;
+return HV_STATUS_INVALID_CONNECTION_ID;
 }
-- 
2.23.0

[PATCH] usbredir: Prevent recursion in usbredir_write

2019-12-18 Thread Dr. David Alan Gilbert (git)

From: "Dr. David Alan Gilbert" 

I've got a case where usbredir_write manages to call back into itself
via spice; this patch makes the recursive write fail (returning 0 bytes);
this seems to avoid the deadlock I was previously seeing.

I can't say I fully understand the interaction of usbredir and spice;
but there are a few similar guards in spice and usbredir
to catch other cases, especially ones also related to
spice_server_char_device_wakeup.

This case seems to be triggered by repeated migration+repeated
reconnection of the viewer; but my debugging suggests the migration
finished before this hits.

The backtrace of the hang looks like:
  reds_handle_ticket
  reds_handle_other_links
  reds_channel_do_link
  red_channel_connect
  spicevmc_connect
  usbredir_create_parser
  usbredirparser_do_write
  usbredir_write
  qemu_chr_fe_write
  qemu_chr_write
  qemu_chr_write_buffer
  spice_chr_write
  spice_server_char_device_wakeup
  red_char_device_wakeup
  red_char_device_write_to_device
  vmc_write
  usbredirparser_do_write
  usbredir_write
  qemu_chr_fe_write
  qemu_chr_write
  qemu_chr_write_buffer
  qemu_mutex_lock_impl

and we fail as we land in qemu_chr_write_buffer's lock
twice.

Bug: https://bugzilla.redhat.com/show_bug.cgi?id=1752320

Signed-off-by: Dr. David Alan Gilbert 
---
 hw/usb/redirect.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/hw/usb/redirect.c b/hw/usb/redirect.c
index e0f5ca6f81..97f2c3a7da 100644
--- a/hw/usb/redirect.c
+++ b/hw/usb/redirect.c
@@ -113,6 +113,7 @@ struct USBRedirDevice {
 /* Properties */
 CharBackend cs;
 bool enable_streams;
+bool in_write;
 uint8_t debug;
 int32_t bootindex;
 char *filter_str;
@@ -290,6 +291,13 @@ static int usbredir_write(void *priv, uint8_t *data, int 
count)
 return 0;
 }
 
+/* Recursion check */
+if (dev->in_write) {
+DPRINTF("usbredir_write recursion\n");
+return 0;
+}
+dev->in_write = true;
+
 r = qemu_chr_fe_write(&dev->cs, data, count);
 if (r < count) {
 if (!dev->watch) {
@@ -300,6 +308,7 @@ static int usbredir_write(void *priv, uint8_t *data, int 
count)
 r = 0;
 }
 }
+dev->in_write = false;
 return r;
 }
 
-- 
2.23.0

[PATCH] kvm: Reallocate dirty_bmap when we change a slot

2019-11-21 Thread Dr. David Alan Gilbert (git)

From: "Dr. David Alan Gilbert" 

kvm_set_phys_mem can be called to reallocate a slot by something the
guest does (e.g. writing to PAM and other chipset registers).
This can happen in the middle of a migration, and if we're unlucky
it can now happen between the split 'sync' and 'clear'; the clear
asserts if there's no bmap to clear.   Recreate the bmap whenever
we change the slot, keeping the clear path happy.

Typically this is triggered by the guest rebooting during a migrate.

Corresponds to:
https://bugzilla.redhat.com/show_bug.cgi?id=1772774
https://bugzilla.redhat.com/show_bug.cgi?id=1771032

Signed-off-by: Dr. David Alan Gilbert 
---
 accel/kvm/kvm-all.c | 44 +---
 1 file changed, 29 insertions(+), 15 deletions(-)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 140b0bd8f6..dd56f61420 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -515,6 +515,27 @@ static int 
kvm_get_dirty_pages_log_range(MemoryRegionSection *section,
 
 #define ALIGN(x, y)  (((x)+(y)-1) & ~((y)-1))
 
+/* Allocate the dirty bitmap for a slot  */
+static void kvm_memslot_init_dirty_bitmap(KVMSlot *mem)
+{
+/*
+ * XXX bad kernel interface alert
+ * For dirty bitmap, kernel allocates array of size aligned to
+ * bits-per-long.  But for case when the kernel is 64bits and
+ * the userspace is 32bits, userspace can't align to the same
+ * bits-per-long, since sizeof(long) is different between kernel
+ * and user space.  This way, userspace will provide buffer which
+ * may be 4 bytes less than the kernel will use, resulting in
+ * userspace memory corruption (which is not detectable by valgrind
+ * too, in most cases).
+ * So for now, let's align to 64 instead of HOST_LONG_BITS here, in
+ * a hope that sizeof(long) won't become >8 any time soon.
+ */
+hwaddr bitmap_size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS),
+/*HOST_LONG_BITS*/ 64) / 8;
+mem->dirty_bmap = g_malloc0(bitmap_size);
+}
+
 /**
  * kvm_physical_sync_dirty_bitmap - Sync dirty bitmap from kernel space
  *
@@ -547,23 +568,9 @@ static int 
kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml,
 goto out;
 }
 
-/* XXX bad kernel interface alert
- * For dirty bitmap, kernel allocates array of size aligned to
- * bits-per-long.  But for case when the kernel is 64bits and
- * the userspace is 32bits, userspace can't align to the same
- * bits-per-long, since sizeof(long) is different between kernel
- * and user space.  This way, userspace will provide buffer which
- * may be 4 bytes less than the kernel will use, resulting in
- * userspace memory corruption (which is not detectable by valgrind
- * too, in most cases).
- * So for now, let's align to 64 instead of HOST_LONG_BITS here, in
- * a hope that sizeof(long) won't become >8 any time soon.
- */
 if (!mem->dirty_bmap) {
-hwaddr bitmap_size = ALIGN(((mem->memory_size) >> 
TARGET_PAGE_BITS),
-/*HOST_LONG_BITS*/ 64) / 8;
 /* Allocate on the first log_sync, once and for all */
-mem->dirty_bmap = g_malloc0(bitmap_size);
+kvm_memslot_init_dirty_bitmap(mem);
 }
 
 d.dirty_bitmap = mem->dirty_bmap;
@@ -1064,6 +1071,13 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
 mem->ram = ram;
 mem->flags = kvm_mem_flags(mr);
 
+if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
+/*
+ * Reallocate the bmap; it means it doesn't disappear in
+ * middle of a migrate.
+ */
+kvm_memslot_init_dirty_bitmap(mem);
+}
 err = kvm_set_user_memory_region(kml, mem, true);
 if (err) {
 fprintf(stderr, "%s: error registering slot: %s\n", __func__,
-- 
2.23.0

[PATCH] vmstate-static-checker: Fix for current python

2019-11-21 Thread Dr. David Alan Gilbert (git)

From: "Dr. David Alan Gilbert" 

Python 3.7.5 on f31 doesn't seem to like the old type=file syntax
on argparse.

Signed-off-by: Dr. David Alan Gilbert 
---
 scripts/vmstate-static-checker.py | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/scripts/vmstate-static-checker.py 
b/scripts/vmstate-static-checker.py
index 21dbdccf3e..9f912dd870 100755
--- a/scripts/vmstate-static-checker.py
+++ b/scripts/vmstate-static-checker.py
@@ -379,9 +379,11 @@ def main():
 help_text = "Parse JSON-formatted vmstate dumps from QEMU in files SRC and 
DEST.  Checks whether migration from SRC to DEST QEMU versions would break 
based on the VMSTATE information contained within the JSON outputs.  The JSON 
output is created from a QEMU invocation with the -dump-vmstate parameter and a 
filename argument to it.  Other parameters to QEMU do not matter, except the -M 
(machine type) parameter."
 
 parser = argparse.ArgumentParser(description=help_text)
-parser.add_argument('-s', '--src', type=file, required=True,
+parser.add_argument('-s', '--src', type=argparse.FileType('r'),
+required=True,
 help='json dump from src qemu')
-parser.add_argument('-d', '--dest', type=file, required=True,
+parser.add_argument('-d', '--dest', type=argparse.FileType('r'),
+required=True,
 help='json dump from dest qemu')
 parser.add_argument('--reverse', required=False, default=False,
 action='store_true',
-- 
2.23.0

[PATCH] ci: Use libcap-ng

2019-11-29 Thread Dr. David Alan Gilbert (git)

From: "Dr. David Alan Gilbert" 

We currently enable libcap-dev in build-clang to pick up the 9p proxy
helper.  Paolo's patch changes that to use libcap-ng, so switch to using
it.  This also means we'll be testing the scsi pr manager and the bridge
helper.

Signed-off-by: Dr. David Alan Gilbert 
---
 .gitlab-ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index be57c6a454..62a9609798 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -59,7 +59,7 @@ build-user:
 
 build-clang:
  script:
- - apt-get install -y -qq clang libsdl2-dev libattr1-dev libcap-dev
+ - apt-get install -y -qq clang libsdl2-dev libattr1-dev libcap-ng-dev
   xfslibs-dev libiscsi-dev libnfs-dev libseccomp-dev gnutls-dev librbd-dev
  - ./configure --cc=clang --cxx=clang++ --enable-werror
   --target-list="alpha-softmmu arm-softmmu m68k-softmmu mips64-softmmu
-- 
2.23.0

[Qemu-devel] [PATCH 0/2] Fix MemoryRegionSection alignment and comparison

2019-08-13 Thread Dr. David Alan Gilbert (git)

From: "Dr. David Alan Gilbert" 

This fixes a symptom I've seen on vhost-user on aarch64 where the
daemon would be falsely notified of memory region changes that didn't
exist.
The underlying problem was me memcmp'ing MemoryRegionSections even
though they had padding in.

(Discovered while getting virtiofs working on aarch)

Dave


Dr. David Alan Gilbert (2):
  memory: Align and add helper for comparing MemoryRegionSections
  vhost: Fix memory region section comparison

 hw/virtio/vhost.c |  9 +++--
 include/exec/memory.h | 14 +-
 2 files changed, 20 insertions(+), 3 deletions(-)

-- 
2.21.0

[Qemu-devel] [PATCH 2/2] vhost: Fix memory region section comparison

2019-08-13 Thread Dr. David Alan Gilbert (git)

From: "Dr. David Alan Gilbert" 

Using memcmp to compare structures wasn't safe,
as I found out on ARM when I was getting false miscompares.

Use the helper function for comparing the MRSs.

Fixes: ade6d081fc33948e56e6

Signed-off-by: Dr. David Alan Gilbert 
---
 hw/virtio/vhost.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index bc899fc60e..2ef4bc720f 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -451,8 +451,13 @@ static void vhost_commit(MemoryListener *listener)
 changed = true;
 } else {
 /* Same size, lets check the contents */
-changed = n_old_sections && memcmp(dev->mem_sections, old_sections,
- n_old_sections * sizeof(old_sections[0])) != 0;
+for (int i = 0; i < n_old_sections; i++) {
+if (!MemoryRegionSection_eq(&old_sections[i],
+&dev->mem_sections[i])) {
+changed = true;
+break;
+}
+}
 }
 
 trace_vhost_commit(dev->started, changed);
-- 
2.21.0

[Qemu-devel] [PATCH 1/2] memory: Align and add helper for comparing MemoryRegionSections

2019-08-13 Thread Dr. David Alan Gilbert (git)

From: "Dr. David Alan Gilbert" 

MemoryRegionSection includes an Int128 'size' field;
on some platforms the compiler causes an alignment of this to
a 128bit boundary, leaving 8 bytes of dead space.
This deadspace can be filled with junk.

Move the size field to the top avoiding unnecessary alignment
and provide an 'eq' routine to safely compare MRS's.

Signed-off-by: Dr. David Alan Gilbert 
---
 include/exec/memory.h | 14 +-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/include/exec/memory.h b/include/exec/memory.h
index 606250172a..ce62e847bd 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -487,15 +487,27 @@ static inline FlatView 
*address_space_to_flatview(AddressSpace *as)
  * @nonvolatile: this section is non-volatile
  */
 struct MemoryRegionSection {
+Int128 size;
 MemoryRegion *mr;
 FlatView *fv;
 hwaddr offset_within_region;
-Int128 size;
 hwaddr offset_within_address_space;
 bool readonly;
 bool nonvolatile;
 };
 
+static inline bool MemoryRegionSection_eq(MemoryRegionSection *a,
+  MemoryRegionSection *b)
+{
+return a->mr == b->mr &&
+   a->fv == b->fv &&
+   a->offset_within_region == b->offset_within_region &&
+   a->offset_within_address_space == b->offset_within_address_space &&
+   int128_eq(a->size, b->size) &&
+   a->readonly == b->readonly &&
+   a->nonvolatile == b->nonvolatile;
+}
+
 /**
  * memory_region_init: Initialize a memory region
  *
-- 
2.21.0

[Qemu-devel] [PATCH v2 2/3] memory: Provide an equality function for MemoryRegionSections

2019-08-14 Thread Dr. David Alan Gilbert (git)

From: "Dr. David Alan Gilbert" 

Provide a comparison function that checks all the fields are the same.

Signed-off-by: Dr. David Alan Gilbert 
Reviewed-by: Philippe Mathieu-Daudé 
---
 include/exec/memory.h | 12 
 1 file changed, 12 insertions(+)

diff --git a/include/exec/memory.h b/include/exec/memory.h
index a74a58c289..ce62e847bd 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -496,6 +496,18 @@ struct MemoryRegionSection {
 bool nonvolatile;
 };
 
+static inline bool MemoryRegionSection_eq(MemoryRegionSection *a,
+  MemoryRegionSection *b)
+{
+return a->mr == b->mr &&
+   a->fv == b->fv &&
+   a->offset_within_region == b->offset_within_region &&
+   a->offset_within_address_space == b->offset_within_address_space &&
+   int128_eq(a->size, b->size) &&
+   a->readonly == b->readonly &&
+   a->nonvolatile == b->nonvolatile;
+}
+
 /**
  * memory_region_init: Initialize a memory region
  *
-- 
2.21.0

[Qemu-devel] [PATCH v2 0/3] Fix MemoryRegionSection alignment and comparison

2019-08-14 Thread Dr. David Alan Gilbert (git)

From: "Dr. David Alan Gilbert" 

This fixes a symptom I've seen on vhost-user on aarch64 where the
daemon would be falsely notified of memory region changes that didn't
exist.
The underlying problem was me memcmp'ing MemoryRegionSections even
though they had padding in.

(Discovered while getting virtiofs working on aarch)

Dave

v2
  Split 1st patch and fix spelling [Philippe's review]

Dr. David Alan Gilbert (3):
  memory: Align MemoryRegionSections fields
  memory: Provide an equality function for MemoryRegionSections
  vhost: Fix memory region section comparison

 hw/virtio/vhost.c |  9 +++--
 include/exec/memory.h | 14 +-
 2 files changed, 20 insertions(+), 3 deletions(-)

-- 
2.21.0

[Qemu-devel] [PATCH v2 1/3] memory: Align MemoryRegionSections fields

2019-08-14 Thread Dr. David Alan Gilbert (git)

From: "Dr. David Alan Gilbert" 

MemoryRegionSection includes an Int128 'size' field;
on some platforms the compiler causes an alignment of this to
a 128bit boundary, leaving 8 bytes of dead space.
This deadspace can be filled with junk.

Move the size field to the top avoiding unnecessary alignment.

Signed-off-by: Dr. David Alan Gilbert 
Reviewed-by: Philippe Mathieu-Daudé 
---
 include/exec/memory.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/exec/memory.h b/include/exec/memory.h
index 606250172a..a74a58c289 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -487,10 +487,10 @@ static inline FlatView 
*address_space_to_flatview(AddressSpace *as)
  * @nonvolatile: this section is non-volatile
  */
 struct MemoryRegionSection {
+Int128 size;
 MemoryRegion *mr;
 FlatView *fv;
 hwaddr offset_within_region;
-Int128 size;
 hwaddr offset_within_address_space;
 bool readonly;
 bool nonvolatile;
-- 
2.21.0

[Qemu-devel] [PATCH v2 3/3] vhost: Fix memory region section comparison

2019-08-14 Thread Dr. David Alan Gilbert (git)

From: "Dr. David Alan Gilbert" 

Using memcmp to compare structures wasn't safe,
as I found out on ARM when I was getting false miscompares.

Use the helper function for comparing the MRSs.

Fixes: ade6d081fc33948e56e6

Signed-off-by: Dr. David Alan Gilbert 
---
 hw/virtio/vhost.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index bc899fc60e..2ef4bc720f 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -451,8 +451,13 @@ static void vhost_commit(MemoryListener *listener)
 changed = true;
 } else {
 /* Same size, lets check the contents */
-changed = n_old_sections && memcmp(dev->mem_sections, old_sections,
- n_old_sections * sizeof(old_sections[0])) != 0;
+for (int i = 0; i < n_old_sections; i++) {
+if (!MemoryRegionSection_eq(&old_sections[i],
+&dev->mem_sections[i])) {
+changed = true;
+break;
+}
+}
 }
 
 trace_vhost_commit(dev->started, changed);
-- 
2.21.0

[Qemu-devel] [PULL 05/33] migration/postcopy: break the loop when there is no more page to discard

2019-08-15 Thread Dr. David Alan Gilbert (git)

From: Wei Yang 

When one is equal to or bigger than end, it means there is no page to
discard. Just break the loop in this case instead of processing it.

No functional change, just refactor it a little.

Signed-off-by: Wei Yang 
Message-Id: <20190627020822.15485-3-richardw.y...@linux.intel.com>
Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Dr. David Alan Gilbert 
---
 migration/ram.c | 26 +-
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 68bc11c9e7..8a97dadec4 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -2864,23 +2864,23 @@ static int postcopy_send_discard_bm_ram(MigrationState 
*ms,
 
 for (current = 0; current < end; ) {
 unsigned long one = find_next_bit(unsentmap, end, current);
+unsigned long zero, discard_length;
 
-if (one < end) {
-unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
-unsigned long discard_length;
+if (one >= end) {
+break;
+}
 
-if (zero >= end) {
-discard_length = end - one;
-} else {
-discard_length = zero - one;
-}
-if (discard_length) {
-postcopy_discard_send_range(ms, pds, one, discard_length);
-}
-current = one + discard_length;
+zero = find_next_zero_bit(unsentmap, end, one + 1);
+
+if (zero >= end) {
+discard_length = end - one;
 } else {
-current = one;
+discard_length = zero - one;
+}
+if (discard_length) {
+postcopy_discard_send_range(ms, pds, one, discard_length);
 }
+current = one + discard_length;
 }
 
 return 0;
-- 
2.21.0

[Qemu-devel] [PULL 02/33] hw/net: fix vmxnet3 live migration

2019-08-15 Thread Dr. David Alan Gilbert (git)

From: Marcel Apfelbaum 

At some point vmxnet3 live migration stopped working and git-bisect
didn't help finding a working version.
The issue is the PCI configuration space is not being migrated
successfully and MSIX remains masked at destination.

Remove the migration differentiation between PCI and PCIe since
the logic resides now inside VMSTATE_PCI_DEVICE.
Remove also the VMXNET3_COMPAT_FLAG_DISABLE_PCIE based differentiation
since it is decided at 'realize' time whether the device is PCI or PCIe,
so the above macro is enough.

Use the opportunity to move to the standard VMSTATE_MSIX
instead of the deprecated SaveVMHandlers.

Signed-off-by: Marcel Apfelbaum 
Message-Id: <20190705010711.23277-1-marcel.apfelb...@gmail.com>
Tested-by: Sukrit Bhatnagar 
Reviewed-by: Dmitry Fleytman 
Signed-off-by: Dr. David Alan Gilbert 
---
 hw/net/vmxnet3.c | 52 ++--
 1 file changed, 2 insertions(+), 50 deletions(-)

diff --git a/hw/net/vmxnet3.c b/hw/net/vmxnet3.c
index 10d01d0058..8b17548b02 100644
--- a/hw/net/vmxnet3.c
+++ b/hw/net/vmxnet3.c
@@ -2141,21 +2141,6 @@ vmxnet3_cleanup_msi(VMXNET3State *s)
 msi_uninit(d);
 }
 
-static void
-vmxnet3_msix_save(QEMUFile *f, void *opaque)
-{
-PCIDevice *d = PCI_DEVICE(opaque);
-msix_save(d, f);
-}
-
-static int
-vmxnet3_msix_load(QEMUFile *f, void *opaque, int version_id)
-{
-PCIDevice *d = PCI_DEVICE(opaque);
-msix_load(d, f);
-return 0;
-}
-
 static const MemoryRegionOps b0_ops = {
 .read = vmxnet3_io_bar0_read,
 .write = vmxnet3_io_bar0_write,
@@ -2176,11 +2161,6 @@ static const MemoryRegionOps b1_ops = {
 },
 };
 
-static SaveVMHandlers savevm_vmxnet3_msix = {
-.save_state = vmxnet3_msix_save,
-.load_state = vmxnet3_msix_load,
-};
-
 static uint64_t vmxnet3_device_serial_num(VMXNET3State *s)
 {
 uint64_t dsn_payload;
@@ -2203,7 +2183,6 @@ static uint64_t vmxnet3_device_serial_num(VMXNET3State *s)
 
 static void vmxnet3_pci_realize(PCIDevice *pci_dev, Error **errp)
 {
-DeviceState *dev = DEVICE(pci_dev);
 VMXNET3State *s = VMXNET3(pci_dev);
 int ret;
 
@@ -2249,8 +2228,6 @@ static void vmxnet3_pci_realize(PCIDevice *pci_dev, Error 
**errp)
 pcie_dev_ser_num_init(pci_dev, VMXNET3_DSN_OFFSET,
   vmxnet3_device_serial_num(s));
 }
-
-register_savevm_live(dev, "vmxnet3-msix", -1, 1, &savevm_vmxnet3_msix, s);
 }
 
 static void vmxnet3_instance_init(Object *obj)
@@ -2440,29 +2417,6 @@ static const VMStateDescription 
vmstate_vmxnet3_int_state = {
 }
 };
 
-static bool vmxnet3_vmstate_need_pcie_device(void *opaque)
-{
-VMXNET3State *s = VMXNET3(opaque);
-
-return !(s->compat_flags & VMXNET3_COMPAT_FLAG_DISABLE_PCIE);
-}
-
-static bool vmxnet3_vmstate_test_pci_device(void *opaque, int version_id)
-{
-return !vmxnet3_vmstate_need_pcie_device(opaque);
-}
-
-static const VMStateDescription vmstate_vmxnet3_pcie_device = {
-.name = "vmxnet3/pcie",
-.version_id = 1,
-.minimum_version_id = 1,
-.needed = vmxnet3_vmstate_need_pcie_device,
-.fields = (VMStateField[]) {
-VMSTATE_PCI_DEVICE(parent_obj, VMXNET3State),
-VMSTATE_END_OF_LIST()
-}
-};
-
 static const VMStateDescription vmstate_vmxnet3 = {
 .name = "vmxnet3",
 .version_id = 1,
@@ -2470,9 +2424,8 @@ static const VMStateDescription vmstate_vmxnet3 = {
 .pre_save = vmxnet3_pre_save,
 .post_load = vmxnet3_post_load,
 .fields = (VMStateField[]) {
-VMSTATE_STRUCT_TEST(parent_obj, VMXNET3State,
-vmxnet3_vmstate_test_pci_device, 0,
-vmstate_pci_device, PCIDevice),
+VMSTATE_PCI_DEVICE(parent_obj, VMXNET3State),
+VMSTATE_MSIX(parent_obj, VMXNET3State),
 VMSTATE_BOOL(rx_packets_compound, VMXNET3State),
 VMSTATE_BOOL(rx_vlan_stripping, VMXNET3State),
 VMSTATE_BOOL(lro_supported, VMXNET3State),
@@ -2508,7 +2461,6 @@ static const VMStateDescription vmstate_vmxnet3 = {
 },
 .subsections = (const VMStateDescription*[]) {
 &vmxstate_vmxnet3_mcast_list,
-&vmstate_vmxnet3_pcie_device,
 NULL
 }
 };
-- 
2.21.0

[Qemu-devel] [PULL 00/33] migration queue

2019-08-15 Thread Dr. David Alan Gilbert (git)

From: "Dr. David Alan Gilbert" 

The following changes since commit f28ed74fd116491e31329044d140fde4aa23b2a0:

  Update version for v4.1.0-rc5 release (2019-08-13 15:38:38 +0100)

are available in the Git repository at:

  git://github.com/dagrh/qemu.git tags/pull-migration-20190814a

for you to fetch changes up to 7dd59d01ddcc4a4ba0c44c2cc9e3b35c79aa7a29:

  migration: add some multifd traces (2019-08-14 17:33:14 +0100)


Migration pull 2019-08-15

Marcel's vmxnet3 live migration fix (that breaks vmxnet3 compatibility
but makes it work)

Error description improvements from Yury.

Multifd fixes from Ivan and Juan.

A load of small cleanups from Wei.

A small cleanup from Marc-André for a future patch.


Ivan Ren (4):
  migration: always initialise ram_counters for a new migration
  migration: add qemu_file_update_transfer interface
  migration: add speed limit for multifd migration
  migration: update ram_counters for multifd sync packet

Juan Quintela (3):
  migration: Add traces for multifd terminate threads
  migration: Make global sem_sync semaphore by channel
  migration: add some multifd traces

Marc-André Lureau (1):
  qemu-file: move qemu_{get,put}_counted_string() declarations

Marcel Apfelbaum (1):
  hw/net: fix vmxnet3 live migration

Wei Yang (23):
  migration: consolidate time info into populate_time_info
  migration/postcopy: the valid condition is one less then end
  migration/postcopy: break the loop when there is no more page to discard
  migration/postcopy: discard_length must not be 0
  migration/postcopy: reduce one operation to calculate fixup_start_addr
  migration/postcopy: do_fixup is true when host_offset is non-zero
  migration/savevm: flush file for iterable_only case
  migration/savevm: split qemu_savevm_state_complete_precopy() into two 
parts
  migration/savevm: move non SaveStateEntry condition check out of iteration
  migration/postcopy: PostcopyState is already set in 
loadvm_postcopy_handle_advise()
  migration/postcopy: start_postcopy could be true only when 
migrate_postcopy() return true
  migration: use migration_in_postcopy() to check POSTCOPY_ACTIVE
  migration: just pass RAMBlock is enough
  migration: equation is more proper than and to check LOADVM_QUIT
  migration: return -EINVAL directly when version_id mismatch
  migration: extract ram_load_precopy
  migration/postcopy: make PostcopyDiscardState a static variable
  migration/postcopy: simplify calculation of run_start and fixup_start_addr
  migration/postcopy: use QEMU_IS_ALIGNED to replace host_offset
  hmp: Remove migration capabilities from "info migrate"
  migration: remove unused field bytes_xfer
  migration: rename migration_bitmap_sync_range to 
ramblock_sync_dirty_bitmap
  migration/postcopy: use mis->bh instead of allocating a QEMUBH

Yury Kotov (1):
  migration: Add error_desc for file channel errors

 hw/net/vmxnet3.c|  52 +
 include/migration/qemu-file-types.h |   4 +
 migration/migration.c   |  79 -
 migration/migration.h   |   1 -
 migration/postcopy-ram.c|  70 +---
 migration/postcopy-ram.h|  13 +--
 migration/qemu-file-channel.c   |  30 ++---
 migration/qemu-file.c   |  68 +--
 migration/qemu-file.h   |  20 ++--
 migration/ram.c | 217 ++--
 migration/rdma.c|   6 +-
 migration/savevm.c  |  96 ++--
 migration/trace-events  |   6 +
 monitor/hmp-cmds.c  |  14 ---
 14 files changed, 348 insertions(+), 328 deletions(-)

[Qemu-devel] [PULL 14/33] migration: use migration_in_postcopy() to check POSTCOPY_ACTIVE

2019-08-15 Thread Dr. David Alan Gilbert (git)

From: Wei Yang 

Use common helper function to check the state.

Signed-off-by: Wei Yang 
Message-Id: <20190719071129.11880-1-richardw.y...@linux.intel.com>
Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Dr. David Alan Gilbert 
---
 migration/rdma.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/migration/rdma.c b/migration/rdma.c
index 3036221ee8..0e73e759ca 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -3140,7 +3140,7 @@ static size_t qemu_rdma_save_page(QEMUFile *f, void 
*opaque,
 
 CHECK_ERROR_STATE();
 
-if (migrate_get_current()->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
+if (migration_in_postcopy()) {
 rcu_read_unlock();
 return RAM_SAVE_CONTROL_NOT_SUPP;
 }
@@ -3775,7 +3775,7 @@ static int qemu_rdma_registration_start(QEMUFile *f, void 
*opaque,
 
 CHECK_ERROR_STATE();
 
-if (migrate_get_current()->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
+if (migration_in_postcopy()) {
 rcu_read_unlock();
 return 0;
 }
@@ -3810,7 +3810,7 @@ static int qemu_rdma_registration_stop(QEMUFile *f, void 
*opaque,
 
 CHECK_ERROR_STATE();
 
-if (migrate_get_current()->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
+if (migration_in_postcopy()) {
 rcu_read_unlock();
 return 0;
 }
-- 
2.21.0

[Qemu-devel] [PULL 01/33] migration: Add error_desc for file channel errors

2019-08-15 Thread Dr. David Alan Gilbert (git)

From: Yury Kotov 

Currently, there is no information about the error if an outgoing migration
fails because of file channel errors.
Example (QMP session):
-> { "execute": "migrate", "arguments": { "uri": "exec:head -c 1" }}
<- { "return": {} }
...
-> { "execute": "query-migrate" }
<- { "return": { "status": "failed" }} // There is no error description

And even in the QEMU's output there is nothing.

This patch
1) Adds errp to most of the QEMUFileOps
2) Adds qemu_file_get_error_obj/qemu_file_set_error_obj
3) And finally uses qemu_file_get_error_obj in migration.c

And now, the status for the mentioned fail will be:
-> { "execute": "query-migrate" }
<- { "return": { "status": "failed",
 "error-desc": "Unable to write to command: Broken pipe" }}

Signed-off-by: Yury Kotov 
Message-Id: <20190422103420.15686-1-yury-ko...@yandex-team.ru>
Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Dr. David Alan Gilbert 
---
 migration/migration.c | 10 --
 migration/qemu-file-channel.c | 30 +
 migration/qemu-file.c | 63 ---
 migration/qemu-file.h | 15 ++---
 migration/savevm.c|  6 ++--
 5 files changed, 88 insertions(+), 36 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index 8a607fe1e2..28342969ea 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -2963,6 +2963,7 @@ static MigThrError migration_detect_error(MigrationState 
*s)
 {
 int ret;
 int state = s->state;
+Error *local_error = NULL;
 
 if (state == MIGRATION_STATUS_CANCELLING ||
 state == MIGRATION_STATUS_CANCELLED) {
@@ -2971,13 +2972,18 @@ static MigThrError 
migration_detect_error(MigrationState *s)
 }
 
 /* Try to detect any file errors */
-ret = qemu_file_get_error(s->to_dst_file);
-
+ret = qemu_file_get_error_obj(s->to_dst_file, &local_error);
 if (!ret) {
 /* Everything is fine */
+assert(!local_error);
 return MIG_THR_ERR_NONE;
 }
 
+if (local_error) {
+migrate_set_error(s, local_error);
+error_free(local_error);
+}
+
 if (state == MIGRATION_STATUS_POSTCOPY_ACTIVE && ret == -EIO) {
 /*
  * For postcopy, we allow the network to be down for a
diff --git a/migration/qemu-file-channel.c b/migration/qemu-file-channel.c
index 8e639eb496..c382ea2d78 100644
--- a/migration/qemu-file-channel.c
+++ b/migration/qemu-file-channel.c
@@ -33,7 +33,8 @@
 static ssize_t channel_writev_buffer(void *opaque,
  struct iovec *iov,
  int iovcnt,
- int64_t pos)
+ int64_t pos,
+ Error **errp)
 {
 QIOChannel *ioc = QIO_CHANNEL(opaque);
 ssize_t done = 0;
@@ -47,7 +48,7 @@ static ssize_t channel_writev_buffer(void *opaque,
 
 while (nlocal_iov > 0) {
 ssize_t len;
-len = qio_channel_writev(ioc, local_iov, nlocal_iov, NULL);
+len = qio_channel_writev(ioc, local_iov, nlocal_iov, errp);
 if (len == QIO_CHANNEL_ERR_BLOCK) {
 if (qemu_in_coroutine()) {
 qio_channel_yield(ioc, G_IO_OUT);
@@ -57,7 +58,6 @@ static ssize_t channel_writev_buffer(void *opaque,
 continue;
 }
 if (len < 0) {
-/* XXX handle Error objects */
 done = -EIO;
 goto cleanup;
 }
@@ -75,13 +75,14 @@ static ssize_t channel_writev_buffer(void *opaque,
 static ssize_t channel_get_buffer(void *opaque,
   uint8_t *buf,
   int64_t pos,
-  size_t size)
+  size_t size,
+  Error **errp)
 {
 QIOChannel *ioc = QIO_CHANNEL(opaque);
 ssize_t ret;
 
 do {
-ret = qio_channel_read(ioc, (char *)buf, size, NULL);
+ret = qio_channel_read(ioc, (char *)buf, size, errp);
 if (ret < 0) {
 if (ret == QIO_CHANNEL_ERR_BLOCK) {
 if (qemu_in_coroutine()) {
@@ -90,7 +91,6 @@ static ssize_t channel_get_buffer(void *opaque,
 qio_channel_wait(ioc, G_IO_IN);
 }
 } else {
-/* XXX handle Error * object */
 return -EIO;
 }
 }
@@ -100,18 +100,20 @@ static ssize_t channel_get_buffer(void *opaque,
 }
 
 
-static int channel_close(void *opaque)
+static int channel_close(void *opaque, Error **errp)
 {
+int ret;
 QIOChannel *ioc = QIO_CHANNEL(opaque);
-qio_channel_close(ioc, NULL);
+ret = qio_channel_close(ioc, errp);
 object_unref(OBJECT(ioc));
-return 0;
+return ret;
 }
 
 
 static int channel_shutdown(void *opaque,
 bool rd,
-bool wr)
+bool wr,

[Qemu-devel] [PULL 04/33] migration/postcopy: the valid condition is one less then end

2019-08-15 Thread Dr. David Alan Gilbert (git)

From: Wei Yang 

If one equals end, it means we have gone through the whole bitmap.

Use a stricter check to skip an unnecessary condition.

Signed-off-by: Wei Yang 
Message-Id: <20190627020822.15485-2-richardw.y...@linux.intel.com>
Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Dr. David Alan Gilbert 
---
 migration/ram.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/migration/ram.c b/migration/ram.c
index 889148dd84..68bc11c9e7 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -2865,7 +2865,7 @@ static int postcopy_send_discard_bm_ram(MigrationState 
*ms,
 for (current = 0; current < end; ) {
 unsigned long one = find_next_bit(unsentmap, end, current);
 
-if (one <= end) {
+if (one < end) {
 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
 unsigned long discard_length;
 
-- 
2.21.0

[Qemu-devel] [PULL 03/33] migration: consolidate time info into populate_time_info

2019-08-15 Thread Dr. David Alan Gilbert (git)

From: Wei Yang 

Consolidate filling in the time information into its own function for
better readability.

Signed-off-by: Wei Yang 
Message-Id: <20190716005411.4156-1-richardw.y...@linux.intel.com>
Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Dr. David Alan Gilbert 
---
 migration/migration.c | 40 ++--
 1 file changed, 22 insertions(+), 18 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index 28342969ea..7c66da3a83 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -823,6 +823,25 @@ bool migration_is_setup_or_active(int state)
 }
 }
 
+static void populate_time_info(MigrationInfo *info, MigrationState *s)
+{
+info->has_status = true;
+info->has_setup_time = true;
+info->setup_time = s->setup_time;
+if (s->state == MIGRATION_STATUS_COMPLETED) {
+info->has_total_time = true;
+info->total_time = s->total_time;
+info->has_downtime = true;
+info->downtime = s->downtime;
+} else {
+info->has_total_time = true;
+info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) -
+   s->start_time;
+info->has_expected_downtime = true;
+info->expected_downtime = s->expected_downtime;
+}
+}
+
 static void populate_ram_info(MigrationInfo *info, MigrationState *s)
 {
 info->has_ram = true;
@@ -908,16 +927,8 @@ static void fill_source_migration_info(MigrationInfo *info)
 case MIGRATION_STATUS_DEVICE:
 case MIGRATION_STATUS_POSTCOPY_PAUSED:
 case MIGRATION_STATUS_POSTCOPY_RECOVER:
- /* TODO add some postcopy stats */
-info->has_status = true;
-info->has_total_time = true;
-info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME)
-- s->start_time;
-info->has_expected_downtime = true;
-info->expected_downtime = s->expected_downtime;
-info->has_setup_time = true;
-info->setup_time = s->setup_time;
-
+/* TODO add some postcopy stats */
+populate_time_info(info, s);
 populate_ram_info(info, s);
 populate_disk_info(info);
 break;
@@ -926,14 +937,7 @@ static void fill_source_migration_info(MigrationInfo *info)
 /* TODO: display COLO specific information (checkpoint info etc.) */
 break;
 case MIGRATION_STATUS_COMPLETED:
-info->has_status = true;
-info->has_total_time = true;
-info->total_time = s->total_time;
-info->has_downtime = true;
-info->downtime = s->downtime;
-info->has_setup_time = true;
-info->setup_time = s->setup_time;
-
+populate_time_info(info, s);
 populate_ram_info(info, s);
 break;
 case MIGRATION_STATUS_FAILED:
-- 
2.21.0

[Qemu-devel] [PULL 19/33] migration/postcopy: make PostcopyDiscardState a static variable

2019-08-15 Thread Dr. David Alan Gilbert (git)

From: Wei Yang 

In postcopy-ram.c, we provide three functions to discard certain
RAMBlock range:

  * postcopy_discard_send_init()
  * postcopy_discard_send_range()
  * postcopy_discard_send_finish()

Currently, we allocate/deallocate PostcopyDiscardState for each RAMBlock
on sending discard information to destination. This is not necessary and
the same data area could be reused for each RAMBlock.

This patch defines PostcopyDiscardState as a static variable. By doing so:

  1) avoid memory allocation and deallocation to the system
  2) avoid potential failure of memory allocation
  3) hide some details for their users

Signed-off-by: Wei Yang 

Message-Id: <20190724010721.2146-1-richardw.y...@linux.intel.com>
Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Dr. David Alan Gilbert 
---
 migration/postcopy-ram.c | 70 +---
 migration/postcopy-ram.h | 13 +++-
 migration/ram.c  | 30 +++--
 3 files changed, 46 insertions(+), 67 deletions(-)

diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c
index 9faacacc9e..2cb1a69752 100644
--- a/migration/postcopy-ram.c
+++ b/migration/postcopy-ram.c
@@ -1377,22 +1377,16 @@ void 
postcopy_fault_thread_notify(MigrationIncomingState *mis)
  *   asking to discard individual ranges.
  *
  * @ms: The current migration state.
- * @offset: the bitmap offset of the named RAMBlock in the migration
- *   bitmap.
+ * @offset: the bitmap offset of the named RAMBlock in the migration bitmap.
  * @name: RAMBlock that discards will operate on.
- *
- * returns: a new PDS.
  */
-PostcopyDiscardState *postcopy_discard_send_init(MigrationState *ms,
- const char *name)
+static PostcopyDiscardState pds = {0};
+void postcopy_discard_send_init(MigrationState *ms, const char *name)
 {
-PostcopyDiscardState *res = g_malloc0(sizeof(PostcopyDiscardState));
-
-if (res) {
-res->ramblock_name = name;
-}
-
-return res;
+pds.ramblock_name = name;
+pds.cur_entry = 0;
+pds.nsentwords = 0;
+pds.nsentcmds = 0;
 }
 
 /**
@@ -1401,30 +1395,29 @@ PostcopyDiscardState 
*postcopy_discard_send_init(MigrationState *ms,
  *   be sent later.
  *
  * @ms: Current migration state.
- * @pds: Structure initialised by postcopy_discard_send_init().
  * @start,@length: a range of pages in the migration bitmap in the
  *   RAM block passed to postcopy_discard_send_init() (length=1 is one page)
  */
-void postcopy_discard_send_range(MigrationState *ms, PostcopyDiscardState *pds,
-unsigned long start, unsigned long length)
+void postcopy_discard_send_range(MigrationState *ms, unsigned long start,
+ unsigned long length)
 {
 size_t tp_size = qemu_target_page_size();
 /* Convert to byte offsets within the RAM block */
-pds->start_list[pds->cur_entry] = start  * tp_size;
-pds->length_list[pds->cur_entry] = length * tp_size;
-trace_postcopy_discard_send_range(pds->ramblock_name, start, length);
-pds->cur_entry++;
-pds->nsentwords++;
+pds.start_list[pds.cur_entry] = start  * tp_size;
+pds.length_list[pds.cur_entry] = length * tp_size;
+trace_postcopy_discard_send_range(pds.ramblock_name, start, length);
+pds.cur_entry++;
+pds.nsentwords++;
 
-if (pds->cur_entry == MAX_DISCARDS_PER_COMMAND) {
+if (pds.cur_entry == MAX_DISCARDS_PER_COMMAND) {
 /* Full set, ship it! */
 qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
-  pds->ramblock_name,
-  pds->cur_entry,
-  pds->start_list,
-  pds->length_list);
-pds->nsentcmds++;
-pds->cur_entry = 0;
+  pds.ramblock_name,
+  pds.cur_entry,
+  pds.start_list,
+  pds.length_list);
+pds.nsentcmds++;
+pds.cur_entry = 0;
 }
 }
 
@@ -1433,24 +1426,21 @@ void postcopy_discard_send_range(MigrationState *ms, 
PostcopyDiscardState *pds,
  * bitmap code. Sends any outstanding discard messages, frees the PDS
  *
  * @ms: Current migration state.
- * @pds: Structure initialised by postcopy_discard_send_init().
  */
-void postcopy_discard_send_finish(MigrationState *ms, PostcopyDiscardState 
*pds)
+void postcopy_discard_send_finish(MigrationState *ms)
 {
 /* Anything unsent? */
-if (pds->cur_entry) {
+if (pds.cur_entry) {
 qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
-  pds->ramblock_name,
-  pds->cur_entry,
-  pds->start_list,
-

[Qemu-devel] [PULL 09/33] migration/savevm: flush file for iterable_only case

2019-08-15 Thread Dr. David Alan Gilbert (git)

From: Wei Yang 

It would be proper to flush the file even for the iterable_only case.

Signed-off-by: Wei Yang 
Message-Id: <20190709140924.13291-2-richardw.y...@linux.intel.com>
Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Dr. David Alan Gilbert 
---
 migration/savevm.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/migration/savevm.c b/migration/savevm.c
index a2a5f89b75..0bfdceefcc 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -1292,7 +1292,7 @@ int qemu_savevm_state_complete_precopy(QEMUFile *f, bool 
iterable_only,
 }
 
 if (iterable_only) {
-return 0;
+goto flush;
 }
 
 vmdesc = qjson_new();
@@ -1353,6 +1353,7 @@ int qemu_savevm_state_complete_precopy(QEMUFile *f, bool 
iterable_only,
 }
 qjson_destroy(vmdesc);
 
+flush:
 qemu_fflush(f);
 return 0;
 }
-- 
2.21.0

[Qemu-devel] [PULL 06/33] migration/postcopy: discard_length must not be 0

2019-08-15 Thread Dr. David Alan Gilbert (git)

From: Wei Yang 

Since we break the loop when there is no more page to discard, we are
sure the following process would find some page to discard.

It is not necessary to check it again.

Signed-off-by: Wei Yang 
Message-Id: <20190627020822.15485-4-richardw.y...@linux.intel.com>
Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Dr. David Alan Gilbert 
---
 migration/ram.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 8a97dadec4..4bb5e24459 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -2877,9 +2877,7 @@ static int postcopy_send_discard_bm_ram(MigrationState 
*ms,
 } else {
 discard_length = zero - one;
 }
-if (discard_length) {
-postcopy_discard_send_range(ms, pds, one, discard_length);
-}
+postcopy_discard_send_range(ms, pds, one, discard_length);
 current = one + discard_length;
 }
 
-- 
2.21.0

[Qemu-devel] [PULL 20/33] migration/postcopy: simplify calculation of run_start and fixup_start_addr

2019-08-15 Thread Dr. David Alan Gilbert (git)

From: Wei Yang 

The purpose of the calculation is to find a HostPage which is partially
dirty.

  * fixup_start_addr points to the start of the HostPage to discard
  * run_start points to the next HostPage to check

While in the middle stage, there would be two cases for run_start:

  * aligned with HostPage means this is not partially dirty
  * not aligned means this is partially dirty

When it is aligned, no work and calculation is necessary. run_start
already points to the start of next HostPage and is ready to continue.

When it is not aligned, the calculation could be simplified with:

  * fixup_start_addr = QEMU_ALIGN_DOWN(run_start, host_ratio)
  * run_start = QEMU_ALIGN_UP(run_start, host_ratio)

By doing so, run_start always points to the next HostPage to check.
fixup_start_addr always points to the HostPage to discard.

Signed-off-by: Wei Yang 
Message-Id: <20190806004648.8659-2-richardw.y...@linux.intel.com>
Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Dr. David Alan Gilbert 
---
 migration/ram.c | 34 +++---
 1 file changed, 7 insertions(+), 27 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index f428639af5..d2184c3cfc 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -2955,7 +2955,6 @@ static void postcopy_chunk_hostpages_pass(MigrationState 
*ms, bool unsent_pass,
 }
 
 while (run_start < pages) {
-unsigned long fixup_start_addr;
 unsigned long host_offset;
 
 /*
@@ -2963,45 +2962,26 @@ static void 
postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
  * page, then we need to fixup this host page.
  */
 host_offset = run_start % host_ratio;
-if (host_offset) {
-fixup_start_addr = run_start - host_offset;
-/*
- * This host page has gone, the next loop iteration starts
- * from after the fixup
- */
-run_start = fixup_start_addr + host_ratio;
-} else {
+if (!host_offset) {
 /* Find the end of this run */
-unsigned long run_end;
 if (unsent_pass) {
-run_end = find_next_bit(unsentmap, pages, run_start + 1);
+run_start = find_next_bit(unsentmap, pages, run_start + 1);
 } else {
-run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
+run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
 }
 /*
  * If the end isn't at the start of a host page, then the
  * run doesn't finish at the end of a host page
  * and we need to discard.
  */
-host_offset = run_end % host_ratio;
-if (host_offset) {
-fixup_start_addr = run_end - host_offset;
-/*
- * This host page has gone, the next loop iteration starts
- * from after the fixup
- */
-run_start = fixup_start_addr + host_ratio;
-} else {
-/*
- * No discards on this iteration, next loop starts from
- * next sent/dirty page
- */
-run_start = run_end + 1;
-}
+host_offset = run_start % host_ratio;
 }
 
 if (host_offset) {
 unsigned long page;
+unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
+ host_ratio);
+run_start = QEMU_ALIGN_UP(run_start, host_ratio);
 
 /* Tell the destination to discard this page */
 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
-- 
2.21.0

[Qemu-devel] [PULL 08/33] migration/postcopy: do_fixup is true when host_offset is non-zero

2019-08-15 Thread Dr. David Alan Gilbert (git)

From: Wei Yang 

This means it is not necessary to spare an extra variable to hold this
condition. Using host_offset directly is fine.

Signed-off-by: Wei Yang 
Message-Id: <20190710050814.31344-3-richardw.y...@linux.intel.com>
Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Dr. David Alan Gilbert 
---
 migration/ram.c | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index da399f2c8a..255f289bbb 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -2962,7 +2962,6 @@ static void postcopy_chunk_hostpages_pass(MigrationState 
*ms, bool unsent_pass,
 }
 
 while (run_start < pages) {
-bool do_fixup = false;
 unsigned long fixup_start_addr;
 unsigned long host_offset;
 
@@ -2972,7 +2971,6 @@ static void postcopy_chunk_hostpages_pass(MigrationState 
*ms, bool unsent_pass,
  */
 host_offset = run_start % host_ratio;
 if (host_offset) {
-do_fixup = true;
 fixup_start_addr = run_start - host_offset;
 /*
  * This host page has gone, the next loop iteration starts
@@ -2994,7 +2992,6 @@ static void postcopy_chunk_hostpages_pass(MigrationState 
*ms, bool unsent_pass,
  */
 host_offset = run_end % host_ratio;
 if (host_offset) {
-do_fixup = true;
 fixup_start_addr = run_end - host_offset;
 /*
  * This host page has gone, the next loop iteration starts
@@ -3010,7 +3007,7 @@ static void postcopy_chunk_hostpages_pass(MigrationState 
*ms, bool unsent_pass,
 }
 }
 
-if (do_fixup) {
+if (host_offset) {
 unsigned long page;
 
 /* Tell the destination to discard this page */
-- 
2.21.0

[Qemu-devel] [PULL 07/33] migration/postcopy: reduce one operation to calculate fixup_start_addr

2019-08-15 Thread Dr. David Alan Gilbert (git)

From: Wei Yang 

Calculate run_start in the same way as is done for run_end, which saves one
operation.

Signed-off-by: Wei Yang 
Message-Id: <20190710050814.31344-2-richardw.y...@linux.intel.com>
Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Dr. David Alan Gilbert 
---
 migration/ram.c | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 4bb5e24459..da399f2c8a 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -2973,10 +2973,12 @@ static void 
postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
 host_offset = run_start % host_ratio;
 if (host_offset) {
 do_fixup = true;
-run_start -= host_offset;
-fixup_start_addr = run_start;
-/* For the next pass */
-run_start = run_start + host_ratio;
+fixup_start_addr = run_start - host_offset;
+/*
+ * This host page has gone, the next loop iteration starts
+ * from after the fixup
+ */
+run_start = fixup_start_addr + host_ratio;
 } else {
 /* Find the end of this run */
 unsigned long run_end;
-- 
2.21.0

[Qemu-devel] [PULL 11/33] migration/savevm: move non SaveStateEntry condition check out of iteration

2019-08-15 Thread Dr. David Alan Gilbert (git)

From: Wei Yang 

in_postcopy and iterable_only are not SaveStateEntry specific, so it would
be more proper to check them outside of the iteration.

Signed-off-by: Wei Yang 
Message-Id: <20190709140924.13291-4-richardw.y...@linux.intel.com>
Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Dr. David Alan Gilbert 
---
 migration/savevm.c | 13 ++---
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/migration/savevm.c b/migration/savevm.c
index 63545a3026..69a827a92f 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -1247,8 +1247,7 @@ void qemu_savevm_state_complete_postcopy(QEMUFile *f)
 }
 
 static
-int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy,
-bool iterable_only)
+int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy)
 {
 SaveStateEntry *se;
 int ret;
@@ -1257,7 +1256,6 @@ int qemu_savevm_state_complete_precopy_iterable(QEMUFile 
*f, bool in_postcopy,
 if (!se->ops ||
 (in_postcopy && se->ops->has_postcopy &&
  se->ops->has_postcopy(se->opaque)) ||
-(in_postcopy && !iterable_only) ||
 !se->ops->save_live_complete_precopy) {
 continue;
 }
@@ -1369,10 +1367,11 @@ int qemu_savevm_state_complete_precopy(QEMUFile *f, 
bool iterable_only,
 
 cpu_synchronize_all_states();
 
-ret = qemu_savevm_state_complete_precopy_iterable(f, in_postcopy,
-  iterable_only);
-if (ret) {
-return ret;
+if (!in_postcopy || iterable_only) {
+ret = qemu_savevm_state_complete_precopy_iterable(f, in_postcopy);
+if (ret) {
+return ret;
+}
 }
 
 if (iterable_only) {
-- 
2.21.0

[Qemu-devel] [PULL 21/33] migration/postcopy: use QEMU_IS_ALIGNED to replace host_offset

2019-08-15 Thread Dr. David Alan Gilbert (git)

From: Wei Yang 

Use QEMU_IS_ALIGNED for the check, it would be more consistent with
other align calculations.

Signed-off-by: Wei Yang 
Message-Id: <20190806004648.8659-3-richardw.y...@linux.intel.com>
Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Dr. David Alan Gilbert 
---
 migration/ram.c | 7 ++-
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index d2184c3cfc..eee68a7991 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -2955,14 +2955,12 @@ static void 
postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
 }
 
 while (run_start < pages) {
-unsigned long host_offset;
 
 /*
  * If the start of this run of pages is in the middle of a host
  * page, then we need to fixup this host page.
  */
-host_offset = run_start % host_ratio;
-if (!host_offset) {
+if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
 /* Find the end of this run */
 if (unsent_pass) {
 run_start = find_next_bit(unsentmap, pages, run_start + 1);
@@ -2974,10 +2972,9 @@ static void postcopy_chunk_hostpages_pass(MigrationState 
*ms, bool unsent_pass,
  * run doesn't finish at the end of a host page
  * and we need to discard.
  */
-host_offset = run_start % host_ratio;
 }
 
-if (host_offset) {
+if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
 unsigned long page;
 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
  host_ratio);
-- 
2.21.0

[Qemu-devel] [PULL 12/33] migration/postcopy: PostcopyState is already set in loadvm_postcopy_handle_advise()

2019-08-15 Thread Dr. David Alan Gilbert (git)

From: Wei Yang 

PostcopyState is already set to ADVISE at the beginning of
loadvm_postcopy_handle_advise().

Remove the redundant set.

Signed-off-by: Wei Yang 
Message-Id: <20190711080816.6405-1-richardw.y...@linux.intel.com>
Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Dr. David Alan Gilbert 
---
 migration/savevm.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/migration/savevm.c b/migration/savevm.c
index 69a827a92f..eed5e551da 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -1648,8 +1648,6 @@ static int 
loadvm_postcopy_handle_advise(MigrationIncomingState *mis,
 return -1;
 }
 
-postcopy_state_set(POSTCOPY_INCOMING_ADVISE);
-
 return 0;
 }
 
-- 
2.21.0

[Qemu-devel] [PULL 15/33] migration: just pass RAMBlock is enough

2019-08-15 Thread Dr. David Alan Gilbert (git)

From: Wei Yang 

RAMBlock->used_length is always passed to migration_bitmap_sync_range(),
which could be retrieved from RAMBlock.

Suggested-by: Paolo Bonzini 
Signed-off-by: Wei Yang 
Message-Id: <20190718012547.16373-1-richardw.y...@linux.intel.com>
Reviewed-by: Peter Xu 
Reviewed-by: Paolo Bonzini 
Signed-off-by: Dr. David Alan Gilbert 
---
 migration/ram.c | 9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 255f289bbb..97f241d6d9 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -1748,11 +1748,10 @@ static inline bool 
migration_bitmap_clear_dirty(RAMState *rs,
 }
 
 /* Called with RCU critical section */
-static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
-ram_addr_t length)
+static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb)
 {
 rs->migration_dirty_pages +=
-cpu_physical_memory_sync_dirty_bitmap(rb, 0, length,
+cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length,
   &rs->num_dirty_pages_period);
 }
 
@@ -1841,7 +1840,7 @@ static void migration_bitmap_sync(RAMState *rs)
 qemu_mutex_lock(&rs->bitmap_mutex);
 rcu_read_lock();
 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
-migration_bitmap_sync_range(rs, block, block->used_length);
+migration_bitmap_sync_range(rs, block);
 }
 ram_counters.remaining = ram_bytes_remaining();
 rcu_read_unlock();
@@ -4293,7 +4292,7 @@ static void colo_flush_ram_cache(void)
 memory_global_dirty_log_sync();
 rcu_read_lock();
 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
-migration_bitmap_sync_range(ram_state, block, block->used_length);
+migration_bitmap_sync_range(ram_state, block);
 }
 rcu_read_unlock();
 
-- 
2.21.0

[Qemu-devel] [PULL 10/33] migration/savevm: split qemu_savevm_state_complete_precopy() into two parts

2019-08-15 Thread Dr. David Alan Gilbert (git)

From: Wei Yang 

This is a preparation patch for further cleanup.

No functional change, just wrap two major parts of
qemu_savevm_state_complete_precopy() into functions.

Signed-off-by: Wei Yang 
Message-Id: <20190709140924.13291-3-richardw.y...@linux.intel.com>
Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Dr. David Alan Gilbert 
---
 migration/savevm.c | 66 ++
 1 file changed, 49 insertions(+), 17 deletions(-)

diff --git a/migration/savevm.c b/migration/savevm.c
index 0bfdceefcc..63545a3026 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -1246,23 +1246,12 @@ void qemu_savevm_state_complete_postcopy(QEMUFile *f)
 qemu_fflush(f);
 }
 
-int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only,
-   bool inactivate_disks)
+static
+int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy,
+bool iterable_only)
 {
-QJSON *vmdesc;
-int vmdesc_len;
 SaveStateEntry *se;
 int ret;
-bool in_postcopy = migration_in_postcopy();
-Error *local_err = NULL;
-
-if (precopy_notify(PRECOPY_NOTIFY_COMPLETE, &local_err)) {
-error_report_err(local_err);
-}
-
-trace_savevm_state_complete_precopy();
-
-cpu_synchronize_all_states();
 
 QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
 if (!se->ops ||
@@ -1291,9 +1280,18 @@ int qemu_savevm_state_complete_precopy(QEMUFile *f, bool 
iterable_only,
 }
 }
 
-if (iterable_only) {
-goto flush;
-}
+return 0;
+}
+
+static
+int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f,
+bool in_postcopy,
+bool inactivate_disks)
+{
+QJSON *vmdesc;
+int vmdesc_len;
+SaveStateEntry *se;
+int ret;
 
 vmdesc = qjson_new();
 json_prop_int(vmdesc, "page_size", qemu_target_page_size());
@@ -1353,6 +1351,40 @@ int qemu_savevm_state_complete_precopy(QEMUFile *f, bool 
iterable_only,
 }
 qjson_destroy(vmdesc);
 
+return 0;
+}
+
+int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only,
+   bool inactivate_disks)
+{
+int ret;
+Error *local_err = NULL;
+bool in_postcopy = migration_in_postcopy();
+
+if (precopy_notify(PRECOPY_NOTIFY_COMPLETE, &local_err)) {
+error_report_err(local_err);
+}
+
+trace_savevm_state_complete_precopy();
+
+cpu_synchronize_all_states();
+
+ret = qemu_savevm_state_complete_precopy_iterable(f, in_postcopy,
+  iterable_only);
+if (ret) {
+return ret;
+}
+
+if (iterable_only) {
+goto flush;
+}
+
+ret = qemu_savevm_state_complete_precopy_non_iterable(f, in_postcopy,
+  inactivate_disks);
+if (ret) {
+return ret;
+}
+
 flush:
 qemu_fflush(f);
 return 0;
-- 
2.21.0

[Qemu-devel] [PULL 22/33] hmp: Remove migration capabilities from "info migrate"

2019-08-15 Thread Dr. David Alan Gilbert (git)

From: Wei Yang 

With the growth of migration capabilities, it is not proper to display
them in "info migrate". Users are recommended to use "info
migrate_capabilities" to list them.

Signed-off-by: Wei Yang 
Suggested-by: Dr. David Alan Gilbert 

Message-Id: <20190806003645.8426-1-richardw.y...@linux.intel.com>
Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Dr. David Alan Gilbert 
---
 monitor/hmp-cmds.c | 14 --
 1 file changed, 14 deletions(-)

diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c
index 5ca3ebe942..35788c0645 100644
--- a/monitor/hmp-cmds.c
+++ b/monitor/hmp-cmds.c
@@ -220,24 +220,11 @@ static char *SocketAddress_to_str(SocketAddress *addr)
 void hmp_info_migrate(Monitor *mon, const QDict *qdict)
 {
 MigrationInfo *info;
-MigrationCapabilityStatusList *caps, *cap;
 
 info = qmp_query_migrate(NULL);
-caps = qmp_query_migrate_capabilities(NULL);
 
 migration_global_dump(mon);
 
-/* do not display parameters during setup */
-if (info->has_status && caps) {
-monitor_printf(mon, "capabilities: ");
-for (cap = caps; cap; cap = cap->next) {
-monitor_printf(mon, "%s: %s ",
-   MigrationCapability_str(cap->value->capability),
-   cap->value->state ? "on" : "off");
-}
-monitor_printf(mon, "\n");
-}
-
 if (info->has_status) {
 monitor_printf(mon, "Migration status: %s",
MigrationStatus_str(info->status));
@@ -370,7 +357,6 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict)
 monitor_printf(mon, "]\n");
 }
 qapi_free_MigrationInfo(info);
-qapi_free_MigrationCapabilityStatusList(caps);
 }
 
 void hmp_info_migrate_capabilities(Monitor *mon, const QDict *qdict)
-- 
2.21.0

[Qemu-devel] [PULL 18/33] migration: extract ram_load_precopy

2019-08-15 Thread Dr. David Alan Gilbert (git)

From: Wei Yang 

After this cleanup, it is clear to the reader that there are two cases in
ram_load:

  * precopy
  * postcopy

And it is not necessary to check postcopy_running on each iteration for
precopy.

Signed-off-by: Wei Yang 
Reviewed-by: Dr. David Alan Gilbert 

Message-Id: <20190725002023.2335-3-richardw.y...@linux.intel.com>
Signed-off-by: Dr. David Alan Gilbert 
---
 migration/ram.c | 73 +++--
 1 file changed, 46 insertions(+), 27 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 6a75aedc91..a44e9c0abc 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -4318,40 +4318,26 @@ static void colo_flush_ram_cache(void)
 trace_colo_flush_ram_cache_end();
 }
 
-static int ram_load(QEMUFile *f, void *opaque, int version_id)
+/**
+ * ram_load_precopy: load pages in precopy case
+ *
+ * Returns 0 for success or -errno in case of error
+ *
+ * Called in precopy mode by ram_load().
+ * rcu_read_lock is taken prior to this being called.
+ *
+ * @f: QEMUFile where to send the data
+ */
+static int ram_load_precopy(QEMUFile *f)
 {
-int flags = 0, ret = 0, invalid_flags = 0;
-static uint64_t seq_iter;
-int len = 0;
-/*
- * If system is running in postcopy mode, page inserts to host memory must
- * be atomic
- */
-bool postcopy_running = postcopy_is_running();
+int flags = 0, ret = 0, invalid_flags = 0, len = 0;
 /* ADVISE is earlier, it shows the source has the postcopy capability on */
 bool postcopy_advised = postcopy_is_advised();
-
-seq_iter++;
-
-if (version_id != 4) {
-return -EINVAL;
-}
-
 if (!migrate_use_compression()) {
 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
 }
-/* This RCU critical section can be very long running.
- * When RCU reclaims in the code start to become numerous,
- * it will be necessary to reduce the granularity of this
- * critical section.
- */
-rcu_read_lock();
-
-if (postcopy_running) {
-ret = ram_load_postcopy(f);
-}
 
-while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
+while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
 ram_addr_t addr, total_ram_bytes;
 void *host = NULL;
 uint8_t ch;
@@ -4502,6 +4488,39 @@ static int ram_load(QEMUFile *f, void *opaque, int 
version_id)
 }
 }
 
+return ret;
+}
+
+static int ram_load(QEMUFile *f, void *opaque, int version_id)
+{
+int ret = 0;
+static uint64_t seq_iter;
+/*
+ * If system is running in postcopy mode, page inserts to host memory must
+ * be atomic
+ */
+bool postcopy_running = postcopy_is_running();
+
+seq_iter++;
+
+if (version_id != 4) {
+return -EINVAL;
+}
+
+/*
+ * This RCU critical section can be very long running.
+ * When RCU reclaims in the code start to become numerous,
+ * it will be necessary to reduce the granularity of this
+ * critical section.
+ */
+rcu_read_lock();
+
+if (postcopy_running) {
+ret = ram_load_postcopy(f);
+} else {
+ret = ram_load_precopy(f);
+}
+
 ret |= wait_for_decompress_done();
 rcu_read_unlock();
 trace_ram_load_complete(ret, seq_iter);
-- 
2.21.0

[Qemu-devel] [PULL 26/33] migration: add speed limit for multifd migration

2019-08-15 Thread Dr. David Alan Gilbert (git)

From: Ivan Ren 

Limit the speed of multifd migration through the common speed limitation
of the QEMUFile.

Signed-off-by: Ivan Ren 
Message-Id: <1564464816-21804-3-git-send-email-ivan...@tencent.com>
Reviewed-by: Wei Yang 
Reviewed-by: Juan Quintela 
Signed-off-by: Dr. David Alan Gilbert 
---
 migration/ram.c | 22 --
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index eee68a7991..1179519345 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -922,7 +922,7 @@ struct {
  * false.
  */
 
-static int multifd_send_pages(void)
+static int multifd_send_pages(RAMState *rs)
 {
 int i;
 static int next_channel;
@@ -954,6 +954,7 @@ static int multifd_send_pages(void)
 multifd_send_state->pages = p->pages;
 p->pages = pages;
 transferred = ((uint64_t) pages->used) * TARGET_PAGE_SIZE + p->packet_len;
+qemu_file_update_transfer(rs->f, transferred);
 ram_counters.multifd_bytes += transferred;
 ram_counters.transferred += transferred;;
 qemu_mutex_unlock(&p->mutex);
@@ -962,7 +963,7 @@ static int multifd_send_pages(void)
 return 1;
 }
 
-static int multifd_queue_page(RAMBlock *block, ram_addr_t offset)
+static int multifd_queue_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
 {
 MultiFDPages_t *pages = multifd_send_state->pages;
 
@@ -981,12 +982,12 @@ static int multifd_queue_page(RAMBlock *block, ram_addr_t 
offset)
 }
 }
 
-if (multifd_send_pages() < 0) {
+if (multifd_send_pages(rs) < 0) {
 return -1;
 }
 
 if (pages->block != block) {
-return  multifd_queue_page(block, offset);
+return  multifd_queue_page(rs, block, offset);
 }
 
 return 1;
@@ -1054,7 +1055,7 @@ void multifd_save_cleanup(void)
 multifd_send_state = NULL;
 }
 
-static void multifd_send_sync_main(void)
+static void multifd_send_sync_main(RAMState *rs)
 {
 int i;
 
@@ -1062,7 +1063,7 @@ static void multifd_send_sync_main(void)
 return;
 }
 if (multifd_send_state->pages->used) {
-if (multifd_send_pages() < 0) {
+if (multifd_send_pages(rs) < 0) {
 error_report("%s: multifd_send_pages fail", __func__);
 return;
 }
@@ -1083,6 +1084,7 @@ static void multifd_send_sync_main(void)
 p->packet_num = multifd_send_state->packet_num++;
 p->flags |= MULTIFD_FLAG_SYNC;
 p->pending_job++;
+qemu_file_update_transfer(rs->f, p->packet_len);
 qemu_mutex_unlock(&p->mutex);
 qemu_sem_post(&p->sem);
 }
@@ -2078,7 +2080,7 @@ static int ram_save_page(RAMState *rs, PageSearchStatus 
*pss, bool last_stage)
 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
  ram_addr_t offset)
 {
-if (multifd_queue_page(block, offset) < 0) {
+if (multifd_queue_page(rs, block, offset) < 0) {
 return -1;
 }
 ram_counters.normal++;
@@ -3447,7 +3449,7 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
 
-multifd_send_sync_main();
+multifd_send_sync_main(*rsp);
 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
 qemu_fflush(f);
 
@@ -3535,7 +3537,7 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
 
 out:
-multifd_send_sync_main();
+multifd_send_sync_main(rs);
 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
 qemu_fflush(f);
 ram_counters.transferred += 8;
@@ -3594,7 +3596,7 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
 
 rcu_read_unlock();
 
-multifd_send_sync_main();
+multifd_send_sync_main(rs);
 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
 qemu_fflush(f);
 
-- 
2.21.0

[Qemu-devel] [PULL 13/33] migration/postcopy: start_postcopy could be true only when migrate_postcopy() return true

2019-08-15 Thread Dr. David Alan Gilbert (git)

From: Wei Yang 

There is only one place that sets start_postcopy to true,
qmp_migrate_start_postcopy(), which makes sure start_postcopy can only be
set to true when migrate_postcopy() returns true.

So start_postcopy being true implies that migrate_postcopy() is true.

Signed-off-by: Wei Yang 
Message-Id: <20190718083747.5859-1-richardw.y...@linux.intel.com>
Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Dr. David Alan Gilbert 
---
 migration/migration.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index 7c66da3a83..8331e62831 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -3103,8 +3103,7 @@ static MigIterateState 
migration_iteration_run(MigrationState *s)
 
 if (pending_size && pending_size >= s->threshold_size) {
 /* Still a significant amount to transfer */
-if (migrate_postcopy() && !in_postcopy &&
-pend_pre <= s->threshold_size &&
+if (!in_postcopy && pend_pre <= s->threshold_size &&
 atomic_read(&s->start_postcopy)) {
 if (postcopy_start(s)) {
 error_report("%s: postcopy failed to start", __func__);
-- 
2.21.0

[Qemu-devel] [PULL 23/33] migration: remove unused field bytes_xfer

2019-08-15 Thread Dr. David Alan Gilbert (git)

From: Wei Yang 

MigrationState->bytes_xfer is only set to 0 in migrate_init().

Remove this unnecessary field.

Signed-off-by: Wei Yang 
Message-Id: <20190402003106.17614-1-richardw.y...@linux.intel.com>
Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Dr. David Alan Gilbert 
---
 migration/migration.c | 1 -
 migration/migration.h | 1 -
 2 files changed, 2 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index 8331e62831..12b8e5dbe5 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1699,7 +1699,6 @@ void migrate_init(MigrationState *s)
  * parameters/capabilities that the user set, and
  * locks.
  */
-s->bytes_xfer = 0;
 s->cleanup_bh = 0;
 s->to_dst_file = NULL;
 s->state = MIGRATION_STATUS_NONE;
diff --git a/migration/migration.h b/migration/migration.h
index 1fdd7b21fd..5bc60709db 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -132,7 +132,6 @@ struct MigrationState
 DeviceState parent_obj;
 
 /*< public >*/
-size_t bytes_xfer;
 QemuThread thread;
 QEMUBH *cleanup_bh;
 QEMUFile *to_dst_file;
-- 
2.21.0

[Qemu-devel] [PULL 25/33] migration: add qemu_file_update_transfer interface

2019-08-15 Thread Dr. David Alan Gilbert (git)

From: Ivan Ren 

Add qemu_file_update_transfer to just update bytes_xfer for speed
limitation. This will be used for further migration features such as
multifd migration.

Signed-off-by: Ivan Ren 
Reviewed-by: Wei Yang 
Reviewed-by: Juan Quintela 
Message-Id: <1564464816-21804-2-git-send-email-ivan...@tencent.com>
Signed-off-by: Dr. David Alan Gilbert 
---
 migration/qemu-file.c | 5 +
 migration/qemu-file.h | 1 +
 2 files changed, 6 insertions(+)

diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index c04a7a891b..e33c46764f 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -654,6 +654,11 @@ void qemu_file_reset_rate_limit(QEMUFile *f)
 f->bytes_xfer = 0;
 }
 
+void qemu_file_update_transfer(QEMUFile *f, int64_t len)
+{
+f->bytes_xfer += len;
+}
+
 void qemu_put_be16(QEMUFile *f, unsigned int v)
 {
 qemu_put_byte(f, v >> 8);
diff --git a/migration/qemu-file.h b/migration/qemu-file.h
index eb886db65f..d064940b8c 100644
--- a/migration/qemu-file.h
+++ b/migration/qemu-file.h
@@ -150,6 +150,7 @@ int qemu_peek_byte(QEMUFile *f, int offset);
 void qemu_file_skip(QEMUFile *f, int size);
 void qemu_update_position(QEMUFile *f, size_t size);
 void qemu_file_reset_rate_limit(QEMUFile *f);
+void qemu_file_update_transfer(QEMUFile *f, int64_t len);
 void qemu_file_set_rate_limit(QEMUFile *f, int64_t new_rate);
 int64_t qemu_file_get_rate_limit(QEMUFile *f);
 int qemu_file_get_error_obj(QEMUFile *f, Error **errp);
-- 
2.21.0

[Qemu-devel] [PULL 16/33] migration: equation is more proper than and to check LOADVM_QUIT

2019-08-15 Thread Dr. David Alan Gilbert (git)

From: Wei Yang 

LOADVM_QUIT allows a command to quit all layers of nested loadvm loops,
while the current return value check is not quite proper even though it
works now.

Current return value check "ret & LOADVM_QUIT" would return true if
bit[0] is 1. This would be true when ret is -1 which is used to indicate
an error of handling a command.

Since there is only one place that returns LOADVM_QUIT and no other
combination of return values, using "ret == LOADVM_QUIT" would be more
proper.

Signed-off-by: Wei Yang 
Message-Id: <20190718064257.29218-1-richardw.y...@linux.intel.com>
Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Dr. David Alan Gilbert 
---
 migration/savevm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/migration/savevm.c b/migration/savevm.c
index eed5e551da..412768216c 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -2437,7 +2437,7 @@ retry:
 case QEMU_VM_COMMAND:
 ret = loadvm_process_command(f);
 trace_qemu_loadvm_state_section_command(ret);
-if ((ret < 0) || (ret & LOADVM_QUIT)) {
+if ((ret < 0) || (ret == LOADVM_QUIT)) {
 goto out;
 }
 break;
-- 
2.21.0

[Qemu-devel] [PULL 33/33] migration: add some multifd traces

2019-08-15 Thread Dr. David Alan Gilbert (git)

From: Juan Quintela 

Signed-off-by: Juan Quintela 
Message-Id: <20190814020218.1868-6-quint...@redhat.com>
Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Dr. David Alan Gilbert 
---
 migration/ram.c| 3 +++
 migration/trace-events | 4 
 2 files changed, 7 insertions(+)

diff --git a/migration/ram.c b/migration/ram.c
index c7aa3d9a2c..35552c090b 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -1170,6 +1170,7 @@ static void *multifd_send_thread(void *opaque)
 
 out:
 if (local_err) {
+trace_multifd_send_error(p->id);
 multifd_send_terminate_threads(local_err);
 }
 
@@ -1200,6 +1201,7 @@ static void multifd_new_send_channel_async(QIOTask *task, 
gpointer opaque)
 QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
 Error *local_err = NULL;
 
+trace_multifd_new_send_channel_async(p->id);
 if (qio_task_propagate_error(task, &local_err)) {
 migrate_set_error(migrate_get_current(), local_err);
 multifd_save_cleanup();
@@ -1486,6 +1488,7 @@ bool multifd_recv_new_channel(QIOChannel *ioc, Error 
**errp)
 atomic_read(&multifd_recv_state->count));
 return false;
 }
+trace_multifd_recv_new_channel(id);
 
 p = &multifd_recv_state->params[id];
 if (p->c != NULL) {
diff --git a/migration/trace-events b/migration/trace-events
index 886ce70ca0..00ffcd5930 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -81,14 +81,18 @@ migration_bitmap_sync_start(void) ""
 migration_bitmap_sync_end(uint64_t dirty_pages) "dirty_pages %" PRIu64
 migration_bitmap_clear_dirty(char *str, uint64_t start, uint64_t size, 
unsigned long page) "rb %s start 0x%"PRIx64" size 0x%"PRIx64" page 0x%lx"
 migration_throttle(void) ""
+multifd_new_send_channel_async(uint8_t id) "channel %d"
 multifd_recv(uint8_t id, uint64_t packet_num, uint32_t used, uint32_t flags, 
uint32_t next_packet_size) "channel %d packet_num %" PRIu64 " pages %d flags 
0x%x next packet size %d"
+multifd_recv_new_channel(uint8_t id) "channel %d"
 multifd_recv_sync_main(long packet_num) "packet num %ld"
 multifd_recv_sync_main_signal(uint8_t id) "channel %d"
 multifd_recv_sync_main_wait(uint8_t id) "channel %d"
 multifd_recv_terminate_threads(bool error) "error %d"
 multifd_recv_thread_end(uint8_t id, uint64_t packets, uint64_t pages) "channel 
%d packets %" PRIu64 " pages %" PRIu64
 multifd_recv_thread_start(uint8_t id) "%d"
+multifd_save_setup_wait(uint8_t id) "%d"
 multifd_send(uint8_t id, uint64_t packet_num, uint32_t used, uint32_t flags, 
uint32_t next_packet_size) "channel %d packet_num %" PRIu64 " pages %d flags 
0x%x next packet size %d"
+multifd_send_error(uint8_t id) "channel %d"
 multifd_send_sync_main(long packet_num) "packet num %ld"
 multifd_send_sync_main_signal(uint8_t id) "channel %d"
 multifd_send_sync_main_wait(uint8_t id) "channel %d"
-- 
2.21.0

[Qemu-devel] [PULL 29/33] migration/postcopy: use mis->bh instead of allocating a QEMUBH

2019-08-15 Thread Dr. David Alan Gilbert (git)

From: Wei Yang 

The incoming side of a migration quits either in precopy or in postcopy,
so it is safe to use mis->bh for both instead of allocating a dedicated
QEMUBH for postcopy.

Signed-off-by: Wei Yang 
Reviewed-by: Dr. David Alan Gilbert 

Message-Id: <20190805053146.32326-1-richardw.y...@linux.intel.com>
Signed-off-by: Dr. David Alan Gilbert 
---
 migration/savevm.c | 17 -
 1 file changed, 4 insertions(+), 13 deletions(-)

diff --git a/migration/savevm.c b/migration/savevm.c
index 1ac15301ad..6369a4ff7a 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -1866,16 +1866,10 @@ static int 
loadvm_postcopy_handle_listen(MigrationIncomingState *mis)
 return 0;
 }
 
-
-typedef struct {
-QEMUBH *bh;
-} HandleRunBhData;
-
 static void loadvm_postcopy_handle_run_bh(void *opaque)
 {
 Error *local_err = NULL;
-HandleRunBhData *data = opaque;
-MigrationIncomingState *mis = migration_incoming_get_current();
+MigrationIncomingState *mis = opaque;
 
 /* TODO we should move all of this lot into postcopy_ram.c or a shared code
  * in migration.c
@@ -1907,15 +1901,13 @@ static void loadvm_postcopy_handle_run_bh(void *opaque)
 runstate_set(RUN_STATE_PAUSED);
 }
 
-qemu_bh_delete(data->bh);
-g_free(data);
+qemu_bh_delete(mis->bh);
 }
 
 /* After all discards we can start running and asking for pages */
 static int loadvm_postcopy_handle_run(MigrationIncomingState *mis)
 {
 PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_RUNNING);
-HandleRunBhData *data;
 
 trace_loadvm_postcopy_handle_run();
 if (ps != POSTCOPY_INCOMING_LISTENING) {
@@ -1923,9 +1915,8 @@ static int 
loadvm_postcopy_handle_run(MigrationIncomingState *mis)
 return -1;
 }
 
-data = g_new(HandleRunBhData, 1);
-data->bh = qemu_bh_new(loadvm_postcopy_handle_run_bh, data);
-qemu_bh_schedule(data->bh);
+mis->bh = qemu_bh_new(loadvm_postcopy_handle_run_bh, mis);
+qemu_bh_schedule(mis->bh);
 
 /* We need to finish reading the stream from the package
  * and also stop reading anything more from the stream that loaded the
-- 
2.21.0

[Qemu-devel] [PULL 17/33] migration: return -EINVAL directly when version_id mismatch

2019-08-15 Thread Dr. David Alan Gilbert (git)

From: Wei Yang 

It is not reasonable to continue when the version_id does not match.

Signed-off-by: Wei Yang 
Message-Id: <20190722075339.25121-2-richardw.y...@linux.intel.com>
Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Dr. David Alan Gilbert 
---
 migration/ram.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/migration/ram.c b/migration/ram.c
index 97f241d6d9..6a75aedc91 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -4334,7 +4334,7 @@ static int ram_load(QEMUFile *f, void *opaque, int 
version_id)
 seq_iter++;
 
 if (version_id != 4) {
-ret = -EINVAL;
+return -EINVAL;
 }
 
 if (!migrate_use_compression()) {
-- 
2.21.0

[Qemu-devel] [PULL 32/33] migration: Make global sem_sync semaphore by channel

2019-08-15 Thread Dr. David Alan Gilbert (git)

From: Juan Quintela 

This makes it easier to debug things, because when you wait for all
threads to arrive at that semaphore, you know which one you are waiting for.

Signed-off-by: Juan Quintela 
Message-Id: <20190814020218.1868-3-quint...@redhat.com>
Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Dr. David Alan Gilbert 
---
 migration/ram.c | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index b542929a7c..c7aa3d9a2c 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -661,6 +661,8 @@ typedef struct {
 uint64_t num_packets;
 /* pages sent through this channel */
 uint64_t num_pages;
+/* syncs main thread and channels */
+QemuSemaphore sem_sync;
 }  MultiFDSendParams;
 
 typedef struct {
@@ -896,8 +898,6 @@ struct {
 MultiFDSendParams *params;
 /* array of pages to sent */
 MultiFDPages_t *pages;
-/* syncs main thread and channels */
-QemuSemaphore sem_sync;
 /* global number of generated multifd packets */
 uint64_t packet_num;
 /* send channels ready */
@@ -1039,6 +1039,7 @@ void multifd_save_cleanup(void)
 p->c = NULL;
 qemu_mutex_destroy(&p->mutex);
 qemu_sem_destroy(&p->sem);
+qemu_sem_destroy(&p->sem_sync);
 g_free(p->name);
 p->name = NULL;
 multifd_pages_clear(p->pages);
@@ -1048,7 +1049,6 @@ void multifd_save_cleanup(void)
 p->packet = NULL;
 }
 qemu_sem_destroy(&multifd_send_state->channels_ready);
-qemu_sem_destroy(&multifd_send_state->sem_sync);
 g_free(multifd_send_state->params);
 multifd_send_state->params = NULL;
 multifd_pages_clear(multifd_send_state->pages);
@@ -1096,7 +1096,7 @@ static void multifd_send_sync_main(RAMState *rs)
 MultiFDSendParams *p = &multifd_send_state->params[i];
 
 trace_multifd_send_sync_main_wait(p->id);
-qemu_sem_wait(&multifd_send_state->sem_sync);
+qemu_sem_wait(&p->sem_sync);
 }
 trace_multifd_send_sync_main(multifd_send_state->packet_num);
 }
@@ -1156,7 +1156,7 @@ static void *multifd_send_thread(void *opaque)
 qemu_mutex_unlock(&p->mutex);
 
 if (flags & MULTIFD_FLAG_SYNC) {
-qemu_sem_post(&multifd_send_state->sem_sync);
+qemu_sem_post(&p->sem_sync);
 }
 qemu_sem_post(&multifd_send_state->channels_ready);
 } else if (p->quit) {
@@ -1179,7 +1179,7 @@ out:
  */
 if (ret != 0) {
 if (flags & MULTIFD_FLAG_SYNC) {
-qemu_sem_post(&multifd_send_state->sem_sync);
+qemu_sem_post(&p->sem_sync);
 }
 qemu_sem_post(&multifd_send_state->channels_ready);
 }
@@ -1225,7 +1225,6 @@ int multifd_save_setup(void)
 multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
 multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
 multifd_send_state->pages = multifd_pages_init(page_count);
-qemu_sem_init(&multifd_send_state->sem_sync, 0);
 qemu_sem_init(&multifd_send_state->channels_ready, 0);
 
 for (i = 0; i < thread_count; i++) {
@@ -1233,6 +1232,7 @@ int multifd_save_setup(void)
 
 qemu_mutex_init(&p->mutex);
 qemu_sem_init(&p->sem, 0);
+qemu_sem_init(&p->sem_sync, 0);
 p->quit = false;
 p->pending_job = 0;
 p->id = i;
-- 
2.21.0

[Qemu-devel] [PULL 24/33] migration: always initialise ram_counters for a new migration

2019-08-15 Thread Dr. David Alan Gilbert (git)

From: Ivan Ren 

This patch fixes a multifd migration bug in the migration speed calculation.
The problem can be reproduced as follows:
1. start a VM and put it under heavy memory write stress to prevent it
   from being migrated to the destination successfully
2. begin a migration with multifd
3. migrate for a long time [actually, this can be measured by transferred bytes]
4. migrate cancel
5. begin a new migration with multifd; the migration will run directly into
   the migration_completion phase

The reason is as follows:

Migration updates the bandwidth and s->threshold_size in the function
migration_update_counters after BUFFER_DELAY time:

current_bytes = migration_total_bytes(s);
transferred = current_bytes - s->iteration_initial_bytes;
time_spent = current_time - s->iteration_start_time;
bandwidth = (double)transferred / time_spent;
s->threshold_size = bandwidth * s->parameters.downtime_limit;

In multifd migration, the migration_total_bytes function returns
qemu_ftell(s->to_dst_file) + ram_counters.multifd_bytes.
s->iteration_initial_bytes is initialized to 0 at every new migration,
but ram_counters is a global variable, so data from previous migrations
accumulates in it. If ram_counters.multifd_bytes is big enough, it may cause
pending_size >= s->threshold_size to become false in migration_iteration_run
after the first migration_update_counters.

Signed-off-by: Ivan Ren 
Reviewed-by: Juan Quintela 
Reviewed-by: Wei Yang 
Suggested-by: Wei Yang 
Message-Id: <1564741121-1840-1-git-send-email-ivan...@tencent.com>
Signed-off-by: Dr. David Alan Gilbert 
---
 migration/migration.c | 25 +++--
 migration/savevm.c|  1 +
 2 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index 12b8e5dbe5..c49e9dc035 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1911,6 +1911,11 @@ static bool migrate_prepare(MigrationState *s, bool blk, 
bool blk_inc,
 }
 
 migrate_init(s);
+/*
+ * set ram_counters memory to zero for a
+ * new migration
+ */
+memset(&ram_counters, 0, sizeof(ram_counters));
 
 return true;
 }
@@ -3034,6 +3039,17 @@ static void migration_calculate_complete(MigrationState 
*s)
 }
 }
 
+static void update_iteration_initial_status(MigrationState *s)
+{
+/*
+ * Update these three fields at the same time to avoid mismatch info lead
+ * wrong speed calculation.
+ */
+s->iteration_start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+s->iteration_initial_bytes = migration_total_bytes(s);
+s->iteration_initial_pages = ram_get_total_transferred_pages();
+}
+
 static void migration_update_counters(MigrationState *s,
   int64_t current_time)
 {
@@ -3069,9 +3085,7 @@ static void migration_update_counters(MigrationState *s,
 
 qemu_file_reset_rate_limit(s->to_dst_file);
 
-s->iteration_start_time = current_time;
-s->iteration_initial_bytes = current_bytes;
-s->iteration_initial_pages = ram_get_total_transferred_pages();
+update_iteration_initial_status(s);
 
 trace_migrate_transferred(transferred, time_spent,
   bandwidth, s->threshold_size);
@@ -3194,7 +3208,7 @@ static void *migration_thread(void *opaque)
 rcu_register_thread();
 
 object_ref(OBJECT(s));
-s->iteration_start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+update_iteration_initial_status(s);
 
 qemu_savevm_state_header(s->to_dst_file);
 
@@ -3259,8 +3273,7 @@ static void *migration_thread(void *opaque)
  * the local variables. This is important to avoid
  * breaking transferred_bytes and bandwidth calculation
  */
-s->iteration_start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
-s->iteration_initial_bytes = 0;
+update_iteration_initial_status(s);
 }
 
 current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
diff --git a/migration/savevm.c b/migration/savevm.c
index 412768216c..1ac15301ad 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -1456,6 +1456,7 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp)
 }
 
 migrate_init(ms);
+memset(&ram_counters, 0, sizeof(ram_counters));
 ms->to_dst_file = f;
 
 qemu_mutex_unlock_iothread();
-- 
2.21.0

[Qemu-devel] [PULL 27/33] migration: update ram_counters for multifd sync packet

2019-08-15 Thread Dr. David Alan Gilbert (git)

From: Ivan Ren 

Multifd sync sends MULTIFD_FLAG_SYNC flag info to the destination; add
these bytes to the ram_counters record.

Signed-off-by: Ivan Ren 
Suggested-by: Wei Yang 
Message-Id: <1564464816-21804-4-git-send-email-ivan...@tencent.com>
Reviewed-by: Juan Quintela 
Signed-off-by: Dr. David Alan Gilbert 
---
 migration/ram.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/migration/ram.c b/migration/ram.c
index 1179519345..30f13ffbdd 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -1085,6 +1085,8 @@ static void multifd_send_sync_main(RAMState *rs)
 p->flags |= MULTIFD_FLAG_SYNC;
 p->pending_job++;
 qemu_file_update_transfer(rs->f, p->packet_len);
+ram_counters.multifd_bytes += p->packet_len;
+ram_counters.transferred += p->packet_len;
 qemu_mutex_unlock(&p->mutex);
 qemu_sem_post(&p->sem);
 }
-- 
2.21.0

[Qemu-devel] [PULL 28/33] migration: rename migration_bitmap_sync_range to ramblock_sync_dirty_bitmap

2019-08-15 Thread Dr. David Alan Gilbert (git)

From: Wei Yang 

Rename for better understanding of the code.

Suggested-by: Paolo Bonzini 
Signed-off-by: Wei Yang 
Message-Id: <20190808033155.30162-1-richardw.y...@linux.intel.com>
Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Dr. David Alan Gilbert 
---
 migration/ram.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 30f13ffbdd..9e6cc1e685 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -1752,7 +1752,7 @@ static inline bool migration_bitmap_clear_dirty(RAMState 
*rs,
 }
 
 /* Called with RCU critical section */
-static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb)
+static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
 {
 rs->migration_dirty_pages +=
 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length,
@@ -1844,7 +1844,7 @@ static void migration_bitmap_sync(RAMState *rs)
 qemu_mutex_lock(&rs->bitmap_mutex);
 rcu_read_lock();
 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
-migration_bitmap_sync_range(rs, block);
+ramblock_sync_dirty_bitmap(rs, block);
 }
 ram_counters.remaining = ram_bytes_remaining();
 rcu_read_unlock();
@@ -4265,7 +4265,7 @@ static void colo_flush_ram_cache(void)
 memory_global_dirty_log_sync();
 rcu_read_lock();
 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
-migration_bitmap_sync_range(ram_state, block);
+ramblock_sync_dirty_bitmap(ram_state, block);
 }
 rcu_read_unlock();
 
-- 
2.21.0

[Qemu-devel] [PULL 30/33] qemu-file: move qemu_{get, put}_counted_string() declarations

2019-08-15 Thread Dr. David Alan Gilbert (git)

From: Marc-André Lureau 

Move migration helpers for strings under include/, so they can be used
outside of migration/

Signed-off-by: Marc-André Lureau 
Reviewed-by: Juan Quintela 
Message-Id: <20190808150325.21939-2-marcandre.lur...@redhat.com>
Signed-off-by: Dr. David Alan Gilbert 
---
 include/migration/qemu-file-types.h | 4 
 migration/qemu-file.h   | 4 
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/migration/qemu-file-types.h 
b/include/migration/qemu-file-types.h
index c0a1988155..2867e3da84 100644
--- a/include/migration/qemu-file-types.h
+++ b/include/migration/qemu-file-types.h
@@ -161,6 +161,10 @@ static inline void qemu_get_sbe64s(QEMUFile *f, int64_t 
*pv)
 qemu_get_be64s(f, (uint64_t *)pv);
 }
 
+size_t qemu_get_counted_string(QEMUFile *f, char buf[256]);
+
+void qemu_put_counted_string(QEMUFile *f, const char *name);
+
 int qemu_file_rate_limit(QEMUFile *f);
 
 #endif
diff --git a/migration/qemu-file.h b/migration/qemu-file.h
index d064940b8c..b6303dbeef 100644
--- a/migration/qemu-file.h
+++ b/migration/qemu-file.h
@@ -161,8 +161,6 @@ QEMUFile *qemu_file_get_return_path(QEMUFile *f);
 void qemu_fflush(QEMUFile *f);
 void qemu_file_set_blocking(QEMUFile *f, bool block);
 
-size_t qemu_get_counted_string(QEMUFile *f, char buf[256]);
-
 void ram_control_before_iterate(QEMUFile *f, uint64_t flags);
 void ram_control_after_iterate(QEMUFile *f, uint64_t flags);
 void ram_control_load_hook(QEMUFile *f, uint64_t flags, void *data);
@@ -181,6 +179,4 @@ size_t ram_control_save_page(QEMUFile *f, ram_addr_t 
block_offset,
  ram_addr_t offset, size_t size,
  uint64_t *bytes_sent);
 
-void qemu_put_counted_string(QEMUFile *f, const char *name);
-
 #endif
-- 
2.21.0

[Qemu-devel] [PULL 31/33] migration: Add traces for multifd terminate threads

2019-08-15 Thread Dr. David Alan Gilbert (git)

From: Juan Quintela 

Signed-off-by: Juan Quintela 
Message-Id: <20190814020218.1868-2-quint...@redhat.com>
Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Dr. David Alan Gilbert 
---
 migration/ram.c| 4 
 migration/trace-events | 2 ++
 2 files changed, 6 insertions(+)

diff --git a/migration/ram.c b/migration/ram.c
index 9e6cc1e685..b542929a7c 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -997,6 +997,8 @@ static void multifd_send_terminate_threads(Error *err)
 {
 int i;
 
+trace_multifd_send_terminate_threads(err != NULL);
+
 if (err) {
 MigrationState *s = migrate_get_current();
 migrate_set_error(s, err);
@@ -1258,6 +1260,8 @@ static void multifd_recv_terminate_threads(Error *err)
 {
 int i;
 
+trace_multifd_recv_terminate_threads(err != NULL);
+
 if (err) {
 MigrationState *s = migrate_get_current();
 migrate_set_error(s, err);
diff --git a/migration/trace-events b/migration/trace-events
index d8e54c367a..886ce70ca0 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -85,12 +85,14 @@ multifd_recv(uint8_t id, uint64_t packet_num, uint32_t 
used, uint32_t flags, uin
 multifd_recv_sync_main(long packet_num) "packet num %ld"
 multifd_recv_sync_main_signal(uint8_t id) "channel %d"
 multifd_recv_sync_main_wait(uint8_t id) "channel %d"
+multifd_recv_terminate_threads(bool error) "error %d"
 multifd_recv_thread_end(uint8_t id, uint64_t packets, uint64_t pages) "channel 
%d packets %" PRIu64 " pages %" PRIu64
 multifd_recv_thread_start(uint8_t id) "%d"
 multifd_send(uint8_t id, uint64_t packet_num, uint32_t used, uint32_t flags, 
uint32_t next_packet_size) "channel %d packet_num %" PRIu64 " pages %d flags 
0x%x next packet size %d"
 multifd_send_sync_main(long packet_num) "packet num %ld"
 multifd_send_sync_main_signal(uint8_t id) "channel %d"
 multifd_send_sync_main_wait(uint8_t id) "channel %d"
+multifd_send_terminate_threads(bool error) "error %d"
 multifd_send_thread_end(uint8_t id, uint64_t packets, uint64_t pages) "channel 
%d packets %" PRIu64 " pages %"  PRIu64
 multifd_send_thread_start(uint8_t id) "%d"
 ram_discard_range(const char *rbname, uint64_t start, size_t len) "%s: start: 
%" PRIx64 " %zx"
-- 
2.21.0

[Qemu-devel] [PATCH 1/2] virtio: add vhost-user-fs base device

2019-08-16 Thread Dr. David Alan Gilbert (git)

From: "Dr. David Alan Gilbert" 

The virtio-fs virtio device provides shared file system access using
the FUSE protocol carried over virtio.
The actual file server is implemented in an external vhost-user-fs device
backend process.

Signed-off-by: Stefan Hajnoczi 
Signed-off-by: Sebastien Boeuf 
Signed-off-by: Dr. David Alan Gilbert 
---
 configure   |  13 +
 hw/virtio/Makefile.objs |   1 +
 hw/virtio/vhost-user-fs.c   | 297 
 include/hw/virtio/vhost-user-fs.h   |  45 +++
 include/standard-headers/linux/virtio_fs.h  |  41 +++
 include/standard-headers/linux/virtio_ids.h |   1 +
 6 files changed, 398 insertions(+)
 create mode 100644 hw/virtio/vhost-user-fs.c
 create mode 100644 include/hw/virtio/vhost-user-fs.h
 create mode 100644 include/standard-headers/linux/virtio_fs.h

diff --git a/configure b/configure
index 714e7fb6a1..e7e33ee783 100755
--- a/configure
+++ b/configure
@@ -382,6 +382,7 @@ vhost_crypto=""
 vhost_scsi=""
 vhost_vsock=""
 vhost_user=""
+vhost_user_fs=""
 kvm="no"
 hax="no"
 hvf="no"
@@ -1316,6 +1317,10 @@ for opt do
   ;;
   --enable-vhost-vsock) vhost_vsock="yes"
   ;;
+  --disable-vhost-user-fs) vhost_user_fs="no"
+  ;;
+  --enable-vhost-user-fs) vhost_user_fs="yes"
+  ;;
   --disable-opengl) opengl="no"
   ;;
   --enable-opengl) opengl="yes"
@@ -2269,6 +2274,10 @@ test "$vhost_crypto" = "" && vhost_crypto=$vhost_user
 if test "$vhost_crypto" = "yes" && test "$vhost_user" = "no"; then
   error_exit "--enable-vhost-crypto requires --enable-vhost-user"
 fi
+test "$vhost_user_fs" = "" && vhost_user_fs=$vhost_user
+if test "$vhost_user_fs" = "yes" && test "$vhost_user" = "no"; then
+  error_exit "--enable-vhost-user-fs requires --enable-vhost-user"
+fi
 
 # OR the vhost-kernel and vhost-user values for simplicity
 if test "$vhost_net" = ""; then
@@ -6425,6 +6434,7 @@ echo "vhost-crypto support $vhost_crypto"
 echo "vhost-scsi support $vhost_scsi"
 echo "vhost-vsock support $vhost_vsock"
 echo "vhost-user support $vhost_user"
+echo "vhost-user-fs support $vhost_user_fs"
 echo "Trace backends$trace_backends"
 if have_backend "simple"; then
 echo "Trace output file $trace_file-"
@@ -6921,6 +6931,9 @@ fi
 if test "$vhost_user" = "yes" ; then
   echo "CONFIG_VHOST_USER=y" >> $config_host_mak
 fi
+if test "$vhost_user_fs" = "yes" ; then
+  echo "CONFIG_VHOST_USER_FS=y" >> $config_host_mak
+fi
 if test "$blobs" = "yes" ; then
   echo "INSTALL_BLOBS=yes" >> $config_host_mak
 fi
diff --git a/hw/virtio/Makefile.objs b/hw/virtio/Makefile.objs
index 964ce78607..47ffbf22c4 100644
--- a/hw/virtio/Makefile.objs
+++ b/hw/virtio/Makefile.objs
@@ -11,6 +11,7 @@ common-obj-$(CONFIG_VIRTIO_PCI) += virtio-pci.o
 common-obj-$(CONFIG_VIRTIO_MMIO) += virtio-mmio.o
 obj-$(CONFIG_VIRTIO_BALLOON) += virtio-balloon.o
 obj-$(CONFIG_VIRTIO_CRYPTO) += virtio-crypto.o
+obj-$(CONFIG_VHOST_USER_FS) += vhost-user-fs.o
 obj-$(call land,$(CONFIG_VIRTIO_CRYPTO),$(CONFIG_VIRTIO_PCI)) += 
virtio-crypto-pci.o
 obj-$(CONFIG_VIRTIO_PMEM) += virtio-pmem.o
 common-obj-$(call land,$(CONFIG_VIRTIO_PMEM),$(CONFIG_VIRTIO_PCI)) += 
virtio-pmem-pci.o
diff --git a/hw/virtio/vhost-user-fs.c b/hw/virtio/vhost-user-fs.c
new file mode 100644
index 00..2753c2c07a
--- /dev/null
+++ b/hw/virtio/vhost-user-fs.c
@@ -0,0 +1,297 @@
+/*
+ * Vhost-user filesystem virtio device
+ *
+ * Copyright 2018 Red Hat, Inc.
+ *
+ * Authors:
+ *  Stefan Hajnoczi 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * (at your option) any later version.  See the COPYING file in the
+ * top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include 
+#include "standard-headers/linux/virtio_fs.h"
+#include "qapi/error.h"
+#include "hw/virtio/virtio-bus.h"
+#include "hw/virtio/virtio-access.h"
+#include "qemu/error-report.h"
+#include "hw/virtio/vhost-user-fs.h"
+#include "monitor/monitor.h"
+
+static void vuf_get_config(VirtIODevice *vdev, uint8_t *config)
+{
+VHostUserFS *fs = VHOST_USER_FS(vdev);
+struct virtio_fs_config fscfg = {};
+
+memcpy((char *)fscfg.tag, fs->conf.tag,
+   MIN(strlen(fs->conf.tag) + 1, sizeof(fscfg.tag)));
+
+virtio_stl_p(vdev, &fscfg.num_queues, fs->conf.num_queues);
+
+memcpy(config, &fscfg, sizeof(fscfg));
+}
+
+static void vuf_start(VirtIODevice *vdev)
+{
+VHostUserFS *fs = VHOST_USER_FS(vdev);
+BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
+VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+int ret;
+int i;
+
+if (!k->set_guest_notifiers) {
+error_report("binding does not support guest notifiers");
+return;
+}
+
+ret = vhost_dev_enable_notifiers(&fs->vhost_dev, vdev);
+if (ret < 0) {
+error_report("Error enabling host notifiers: %d", -ret);
+return;
+}
+
+ret = k->set_guest_notifiers(qbus->parent, fs->vhost_dev.nvqs, true);
+if (ret < 0) {
+error_report("Error binding guest notifier: %

[Qemu-devel] [PATCH 0/2] Add virtio-fs (experimental)

2019-08-16 Thread Dr. David Alan Gilbert (git)

From: "Dr. David Alan Gilbert" 

Hi,
  This pair of patches adds the core of the virtio-fs support to qemu;
it's marked experimental since the kernel patch and spec changes aren't
in yet; but they're bubbling along.

  While the spec change is still in progress, the ID number is already
reserved.

  A future set of patches will add the optional DAX mapping support.

  The actual qemu change is pretty minimal, since it's really only
a virtio device with some queues.

Some links:
  Mailing list: https://www.redhat.com/mailman/listinfo/virtio-fs
  Dev tree: Including filesystem daemon: https://gitlab.com/virtio-fs/qemu
  kernel: https://gitlab.com/virtio-fs/linux
  virtio spec changes: 
https://lists.oasis-open.org/archives/virtio-dev/201908/msg00056.html

Dr. David Alan Gilbert (2):
  virtio: add vhost-user-fs base device
  virtio: add vhost-user-fs-pci device

 configure   |  13 +
 hw/virtio/Makefile.objs |   2 +
 hw/virtio/vhost-user-fs-pci.c   |  79 ++
 hw/virtio/vhost-user-fs.c   | 297 
 include/hw/virtio/vhost-user-fs.h   |  45 +++
 include/standard-headers/linux/virtio_fs.h  |  41 +++
 include/standard-headers/linux/virtio_ids.h |   1 +
 7 files changed, 478 insertions(+)
 create mode 100644 hw/virtio/vhost-user-fs-pci.c
 create mode 100644 hw/virtio/vhost-user-fs.c
 create mode 100644 include/hw/virtio/vhost-user-fs.h
 create mode 100644 include/standard-headers/linux/virtio_fs.h

-- 
2.21.0

[Qemu-devel] [PATCH 2/2] virtio: add vhost-user-fs-pci device

2019-08-16 Thread Dr. David Alan Gilbert (git)

From: "Dr. David Alan Gilbert" 

Add the PCI version of vhost-user-fs.

Launch QEMU like this:

  qemu -chardev socket,path=/tmp/vhost-fs.sock,id=chr0
   -device x-vhost-user-fs-pci,tag=myfs,chardev=chr0

Signed-off-by: Stefan Hajnoczi 
Signed-off-by: Sebastien Boeuf 
Signed-off-by: Dr. David Alan Gilbert 
---
 hw/virtio/Makefile.objs   |  1 +
 hw/virtio/vhost-user-fs-pci.c | 79 +++
 2 files changed, 80 insertions(+)
 create mode 100644 hw/virtio/vhost-user-fs-pci.c

diff --git a/hw/virtio/Makefile.objs b/hw/virtio/Makefile.objs
index 47ffbf22c4..e2f70fbb89 100644
--- a/hw/virtio/Makefile.objs
+++ b/hw/virtio/Makefile.objs
@@ -15,6 +15,7 @@ obj-$(CONFIG_VHOST_USER_FS) += vhost-user-fs.o
 obj-$(call land,$(CONFIG_VIRTIO_CRYPTO),$(CONFIG_VIRTIO_PCI)) += 
virtio-crypto-pci.o
 obj-$(CONFIG_VIRTIO_PMEM) += virtio-pmem.o
 common-obj-$(call land,$(CONFIG_VIRTIO_PMEM),$(CONFIG_VIRTIO_PCI)) += 
virtio-pmem-pci.o
+obj-$(call land,$(CONFIG_VHOST_USER_FS),$(CONFIG_VIRTIO_PCI)) += 
vhost-user-fs-pci.o
 obj-$(CONFIG_VHOST_VSOCK) += vhost-vsock.o
 
 ifeq ($(CONFIG_VIRTIO_PCI),y)
diff --git a/hw/virtio/vhost-user-fs-pci.c b/hw/virtio/vhost-user-fs-pci.c
new file mode 100644
index 00..07e295fd44
--- /dev/null
+++ b/hw/virtio/vhost-user-fs-pci.c
@@ -0,0 +1,79 @@
+/*
+ * Vhost-user filesystem virtio device PCI glue
+ *
+ * Copyright 2018-2019 Red Hat, Inc.
+ *
+ * Authors:
+ *  Dr. David Alan Gilbert 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * (at your option) any later version.  See the COPYING file in the
+ * top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "hw/virtio/vhost-user-fs.h"
+#include "virtio-pci.h"
+
+struct VHostUserFSPCI {
+VirtIOPCIProxy parent_obj;
+VHostUserFS vdev;
+};
+
+typedef struct VHostUserFSPCI VHostUserFSPCI;
+
+#define TYPE_VHOST_USER_FS_PCI "vhost-user-fs-pci-base"
+
+#define VHOST_USER_FS_PCI(obj) \
+OBJECT_CHECK(VHostUserFSPCI, (obj), TYPE_VHOST_USER_FS_PCI)
+
+static Property vhost_user_fs_pci_properties[] = {
+DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, 4),
+DEFINE_PROP_END_OF_LIST(),
+};
+
+static void vhost_user_fs_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
+{
+VHostUserFSPCI *dev = VHOST_USER_FS_PCI(vpci_dev);
+DeviceState *vdev = DEVICE(&dev->vdev);
+
+qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus));
+object_property_set_bool(OBJECT(vdev), true, "realized", errp);
+}
+
+static void vhost_user_fs_pci_class_init(ObjectClass *klass, void *data)
+{
+DeviceClass *dc = DEVICE_CLASS(klass);
+VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
+PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
+k->realize = vhost_user_fs_pci_realize;
+set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
+dc->props = vhost_user_fs_pci_properties;
+pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
+pcidev_k->device_id = 0; /* Set by virtio-pci based on virtio id */
+pcidev_k->revision = 0x00;
+pcidev_k->class_id = PCI_CLASS_STORAGE_OTHER;
+}
+
+static void vhost_user_fs_pci_instance_init(Object *obj)
+{
+VHostUserFSPCI *dev = VHOST_USER_FS_PCI(obj);
+
+virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+TYPE_VHOST_USER_FS);
+}
+
+static const VirtioPCIDeviceTypeInfo vhost_user_fs_pci_info = {
+.base_name = TYPE_VHOST_USER_FS_PCI,
+.non_transitional_name = "x-vhost-user-fs-pci",
+.instance_size = sizeof(VHostUserFSPCI),
+.instance_init = vhost_user_fs_pci_instance_init,
+.class_init= vhost_user_fs_pci_class_init,
+};
+
+static void vhost_user_fs_pci_register(void)
+{
+virtio_pci_types_register(&vhost_user_fs_pci_info);
+}
+
+type_init(vhost_user_fs_pci_register);
-- 
2.21.0

[Qemu-devel] [for 4.1 PATCH] Revert "hw: report invalid disable-legacy|modern usage for virtio-1-only devs"

2019-07-29 Thread Dr. David Alan Gilbert (git)

From: "Dr. David Alan Gilbert" 

This reverts commit f2784eed306449c3d04a71a05ed6463b8289aedf
since that accidentally removes the PCIe capabilities from virtio
devices because virtio_pci_dc_realize is called before the new 'mode'
flag is set.
I keep the expanded hw_compat entry because we've lost the ability to
do 'optional'.

Signed-off-by: Dr. David Alan Gilbert 
---
 hw/display/virtio-gpu-pci.c   |  4 +---
 hw/display/virtio-vga.c   |  4 +---
 hw/virtio/virtio-crypto-pci.c |  4 +---
 hw/virtio/virtio-input-pci.c  |  4 +---
 hw/virtio/virtio-pci.c| 26 ++
 hw/virtio/virtio-pci.h| 31 ++-
 6 files changed, 20 insertions(+), 53 deletions(-)

diff --git a/hw/display/virtio-gpu-pci.c b/hw/display/virtio-gpu-pci.c
index d6f01b4a98..e4c7eb6193 100644
--- a/hw/display/virtio-gpu-pci.c
+++ b/hw/display/virtio-gpu-pci.c
@@ -33,9 +33,7 @@ static void virtio_gpu_pci_base_realize(VirtIOPCIProxy 
*vpci_dev, Error **errp)
 Error *local_error = NULL;
 
 qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus));
-if (!virtio_pci_force_virtio_1(vpci_dev, errp)) {
-return;
-}
+virtio_pci_force_virtio_1(vpci_dev);
 object_property_set_bool(OBJECT(vdev), true, "realized", &local_error);
 
 if (local_error) {
diff --git a/hw/display/virtio-vga.c b/hw/display/virtio-vga.c
index 416e7fec87..79a145e284 100644
--- a/hw/display/virtio-vga.c
+++ b/hw/display/virtio-vga.c
@@ -137,9 +137,7 @@ static void virtio_vga_base_realize(VirtIOPCIProxy 
*vpci_dev, Error **errp)
 
 /* init virtio bits */
 qdev_set_parent_bus(DEVICE(g), BUS(&vpci_dev->bus));
-if (!virtio_pci_force_virtio_1(vpci_dev, errp)) {
-return;
-}
+virtio_pci_force_virtio_1(vpci_dev);
 object_property_set_bool(OBJECT(g), true, "realized", &err);
 if (err) {
 error_propagate(errp, err);
diff --git a/hw/virtio/virtio-crypto-pci.c b/hw/virtio/virtio-crypto-pci.c
index c8a2317a10..91d4446080 100644
--- a/hw/virtio/virtio-crypto-pci.c
+++ b/hw/virtio/virtio-crypto-pci.c
@@ -53,9 +53,7 @@ static void virtio_crypto_pci_realize(VirtIOPCIProxy 
*vpci_dev, Error **errp)
 }
 
 qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus));
-if (!virtio_pci_force_virtio_1(vpci_dev, errp)) {
-return;
-}
+virtio_pci_force_virtio_1(vpci_dev);
 object_property_set_bool(OBJECT(vdev), true, "realized", errp);
 object_property_set_link(OBJECT(vcrypto),
  OBJECT(vcrypto->vdev.conf.cryptodev), "cryptodev",
diff --git a/hw/virtio/virtio-input-pci.c b/hw/virtio/virtio-input-pci.c
index 1c40292abc..ad7774e93e 100644
--- a/hw/virtio/virtio-input-pci.c
+++ b/hw/virtio/virtio-input-pci.c
@@ -49,9 +49,7 @@ static void virtio_input_pci_realize(VirtIOPCIProxy 
*vpci_dev, Error **errp)
 DeviceState *vdev = DEVICE(&vinput->vdev);
 
 qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus));
-if (!virtio_pci_force_virtio_1(vpci_dev, errp)) {
-return;
-}
+virtio_pci_force_virtio_1(vpci_dev);
 object_property_set_bool(OBJECT(vdev), true, "realized", errp);
 }
 
diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index ce928f2429..f6d2223e78 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -1723,22 +1723,16 @@ static void virtio_pci_realize(PCIDevice *pci_dev, 
Error **errp)
/* PCI BAR regions must be powers of 2 */
pow2ceil(proxy->notify.offset + proxy->notify.size));
 
-if ((proxy->disable_legacy == ON_OFF_AUTO_ON) ||
-((proxy->disable_legacy == ON_OFF_AUTO_AUTO) && pcie_port)) {
-if (proxy->disable_modern) {
-error_setg(errp, "device cannot work as neither modern nor "
-   "legacy mode is enabled");
-error_append_hint(errp, "Set either disable-modern or "
-  "disable-legacy to off\n");
-return;
-}
-proxy->mode = VIRTIO_PCI_MODE_MODERN;
-} else {
-if (proxy->disable_modern) {
-proxy->mode = VIRTIO_PCI_MODE_LEGACY;
-} else {
-proxy->mode = VIRTIO_PCI_MODE_TRANSITIONAL;
-}
+if (proxy->disable_legacy == ON_OFF_AUTO_AUTO) {
+proxy->disable_legacy = pcie_port ? ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF;
+}
+
+if (!virtio_pci_modern(proxy) && !virtio_pci_legacy(proxy)) {
+error_setg(errp, "device cannot work as neither modern nor legacy mode"
+   " is enabled");
+error_append_hint(errp, "Set either disable-modern or disable-legacy"
+  " to off\n");
+return;
 }
 
 if (pcie_port && pci_is_express(pci_dev)) {
diff --git a/hw/virtio/virtio-pci.h b/hw/virtio/virtio-pci.h
index 619d9098c1..292275acb1 100644
--- a/hw/virtio/virtio-pci.h
+++ b/hw/virtio/virtio-pci.h
@@ -15,7 +15,6 @@
 #ifndef QEMU_VIRTIO_PCI_H
 #define QEMU_VIRTIO_PCI_H
 
-#include "qapi/error.h"
 #include "hw/pci/msi.h"
 #include "hw/

[Qemu-devel] [PATCH v2 1/2] Revert "Revert "globals: Allow global properties to be optional""

2019-07-29 Thread Dr. David Alan Gilbert (git)

From: "Dr. David Alan Gilbert" 

This reverts commit 8fa70dbd8bb478d9483c1da3e9976a2d86b3f9a0.

Because we're about to revert its neighbour and thus use an optional
again.

Signed-off-by: Dr. David Alan Gilbert 
---
 include/hw/qdev-core.h | 3 +++
 qom/object.c   | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/include/hw/qdev-core.h b/include/hw/qdev-core.h
index e157fc4acd..136df7774c 100644
--- a/include/hw/qdev-core.h
+++ b/include/hw/qdev-core.h
@@ -252,6 +252,8 @@ struct PropertyInfo {
 /**
  * GlobalProperty:
  * @used: Set to true if property was used when initializing a device.
+ * @optional: If set to true, GlobalProperty will be skipped without errors
+ *if the property doesn't exist.
  *
  * An error is fatal for non-hotplugged devices, when the global is applied.
  */
@@ -260,6 +262,7 @@ typedef struct GlobalProperty {
 const char *property;
 const char *value;
 bool used;
+bool optional;
 } GlobalProperty;
 
 static inline void
diff --git a/qom/object.c b/qom/object.c
index 3966a3d461..147727 100644
--- a/qom/object.c
+++ b/qom/object.c
@@ -386,6 +386,9 @@ void object_apply_global_props(Object *obj, const GPtrArray 
*props, Error **errp
 if (object_dynamic_cast(obj, p->driver) == NULL) {
 continue;
 }
+if (p->optional && !object_property_find(obj, p->property, NULL)) {
+continue;
+}
 p->used = true;
 object_property_parse(obj, p->value, p->property, &err);
 if (err != NULL) {
-- 
2.21.0

[Qemu-devel] [For 4.1 PATCH v2 0/2] Reversions to fix PCIe in virtio

2019-07-29 Thread Dr. David Alan Gilbert (git)

From: "Dr. David Alan Gilbert" 

Revert a couple of patches that break PCIe capabilities in virtio
devices. The 'optional' revert is just reverted to make the main
reversion trivial.

Symptom:
  Loss of PCIe capabilities in virtio devices hung off PCIe bridges

Signed-off-by: Dr. David Alan Gilbert 


Dr. David Alan Gilbert (2):
  Revert "Revert "globals: Allow global properties to be optional""
  Revert "hw: report invalid disable-legacy|modern usage for
virtio-1-only devs"

 hw/core/machine.c | 23 +++
 hw/display/virtio-gpu-pci.c   |  4 +---
 hw/display/virtio-vga.c   |  4 +---
 hw/virtio/virtio-crypto-pci.c |  4 +---
 hw/virtio/virtio-input-pci.c  |  4 +---
 hw/virtio/virtio-pci.c| 26 ++
 hw/virtio/virtio-pci.h| 31 ++-
 include/hw/qdev-core.h|  3 +++
 qom/object.c  |  3 +++
 9 files changed, 29 insertions(+), 73 deletions(-)

-- 
2.21.0

[Qemu-devel] [PATCH v2 2/2] Revert "hw: report invalid disable-legacy|modern usage for virtio-1-only devs"

2019-07-29 Thread Dr. David Alan Gilbert (git)

From: "Dr. David Alan Gilbert" 

This reverts commit f2784eed306449c3d04a71a05ed6463b8289aedf
since that accidentally removes the PCIe capabilities from virtio
devices because virtio_pci_dc_realize is called before the new 'mode'
flag is set.

Signed-off-by: Dr. David Alan Gilbert 
---
 hw/core/machine.c | 23 +++
 hw/display/virtio-gpu-pci.c   |  4 +---
 hw/display/virtio-vga.c   |  4 +---
 hw/virtio/virtio-crypto-pci.c |  4 +---
 hw/virtio/virtio-input-pci.c  |  4 +---
 hw/virtio/virtio-pci.c| 26 ++
 hw/virtio/virtio-pci.h| 31 ++-
 7 files changed, 23 insertions(+), 73 deletions(-)

diff --git a/hw/core/machine.c b/hw/core/machine.c
index c58a8e594e..c4a2ab2282 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -115,26 +115,9 @@ const size_t hw_compat_2_7_len = 
G_N_ELEMENTS(hw_compat_2_7);
 
 GlobalProperty hw_compat_2_6[] = {
 { "virtio-mmio", "format_transport_address", "off" },
-/*
- * don't include devices which are modern-only
- * ie keyboard, mouse, tablet, gpu, vga & crypto
- */
-{ "virtio-9p-pci", "disable-modern", "on" },
-{ "virtio-9p-pci", "disable-legacy", "off" },
-{ "virtio-balloon-pci", "disable-modern", "on" },
-{ "virtio-balloon-pci", "disable-legacy", "off" },
-{ "virtio-blk-pci", "disable-modern", "on" },
-{ "virtio-blk-pci", "disable-legacy", "off" },
-{ "virtio-input-host-pci", "disable-modern", "on" },
-{ "virtio-input-host-pci", "disable-legacy", "off" },
-{ "virtio-net-pci", "disable-modern", "on" },
-{ "virtio-net-pci", "disable-legacy", "off" },
-{ "virtio-rng-pci", "disable-modern", "on" },
-{ "virtio-rng-pci", "disable-legacy", "off" },
-{ "virtio-scsi-pci", "disable-modern", "on" },
-{ "virtio-scsi-pci", "disable-legacy", "off" },
-{ "virtio-serial-pci", "disable-modern", "on" },
-{ "virtio-serial-pci", "disable-legacy", "off" },
+/* Optional because not all virtio-pci devices support legacy mode */
+{ "virtio-pci", "disable-modern", "on",  .optional = true },
+{ "virtio-pci", "disable-legacy", "off", .optional = true },
 };
 const size_t hw_compat_2_6_len = G_N_ELEMENTS(hw_compat_2_6);
 
diff --git a/hw/display/virtio-gpu-pci.c b/hw/display/virtio-gpu-pci.c
index d6f01b4a98..e4c7eb6193 100644
--- a/hw/display/virtio-gpu-pci.c
+++ b/hw/display/virtio-gpu-pci.c
@@ -33,9 +33,7 @@ static void virtio_gpu_pci_base_realize(VirtIOPCIProxy 
*vpci_dev, Error **errp)
 Error *local_error = NULL;
 
 qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus));
-if (!virtio_pci_force_virtio_1(vpci_dev, errp)) {
-return;
-}
+virtio_pci_force_virtio_1(vpci_dev);
 object_property_set_bool(OBJECT(vdev), true, "realized", &local_error);
 
 if (local_error) {
diff --git a/hw/display/virtio-vga.c b/hw/display/virtio-vga.c
index 416e7fec87..79a145e284 100644
--- a/hw/display/virtio-vga.c
+++ b/hw/display/virtio-vga.c
@@ -137,9 +137,7 @@ static void virtio_vga_base_realize(VirtIOPCIProxy 
*vpci_dev, Error **errp)
 
 /* init virtio bits */
 qdev_set_parent_bus(DEVICE(g), BUS(&vpci_dev->bus));
-if (!virtio_pci_force_virtio_1(vpci_dev, errp)) {
-return;
-}
+virtio_pci_force_virtio_1(vpci_dev);
 object_property_set_bool(OBJECT(g), true, "realized", &err);
 if (err) {
 error_propagate(errp, err);
diff --git a/hw/virtio/virtio-crypto-pci.c b/hw/virtio/virtio-crypto-pci.c
index c8a2317a10..91d4446080 100644
--- a/hw/virtio/virtio-crypto-pci.c
+++ b/hw/virtio/virtio-crypto-pci.c
@@ -53,9 +53,7 @@ static void virtio_crypto_pci_realize(VirtIOPCIProxy 
*vpci_dev, Error **errp)
 }
 
 qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus));
-if (!virtio_pci_force_virtio_1(vpci_dev, errp)) {
-return;
-}
+virtio_pci_force_virtio_1(vpci_dev);
 object_property_set_bool(OBJECT(vdev), true, "realized", errp);
 object_property_set_link(OBJECT(vcrypto),
  OBJECT(vcrypto->vdev.conf.cryptodev), "cryptodev",
diff --git a/hw/virtio/virtio-input-pci.c b/hw/virtio/virtio-input-pci.c
index 1c40292abc..ad7774e93e 100644
--- a/hw/virtio/virtio-input-pci.c
+++ b/hw/virtio/virtio-input-pci.c
@@ -49,9 +49,7 @@ static void virtio_input_pci_realize(VirtIOPCIProxy 
*vpci_dev, Error **errp)
 DeviceState *vdev = DEVICE(&vinput->vdev);
 
 qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus));
-if (!virtio_pci_force_virtio_1(vpci_dev, errp)) {
-return;
-}
+virtio_pci_force_virtio_1(vpci_dev);
 object_property_set_bool(OBJECT(vdev), true, "realized", errp);
 }
 
diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index ce928f2429..f6d2223e78 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -1723,22 +1723,16 @@ static void virtio_pci_realize(PCIDevice *pci_dev, 
Error **errp)
/* PCI BAR regions must be powers of 2 */
pow2ceil(pro

[Qemu-devel] [PATCH 2/2] pcie_root_port: Disable ACS on older machines

2019-07-30 Thread Dr. David Alan Gilbert (git)

From: "Dr. David Alan Gilbert" 

ACS got added in 4.0 unconditionally,  that broke older<->4.0 migration
where there was a PCIe root port.
Fix this by turning it off for 3.1 and older machines; note this
fixes compatibility for older QEMUs but breaks compatibility with 4.0
for older machine types.

machine type   source qemu   dest qemu
   3.1            3.1         4.0        broken
   3.1            3.1         4.1rc2     broken
   3.1            3.1         4.1+this   OK ++
   3.1            4.0         4.1rc2     OK
   3.1            4.0         4.1+this   broken --
   4.0            4.0         4.1rc2     OK
   4.0            4.0         4.1+this   OK

So we gain and lose; the consensus seems to be treat this as a
fix for older machine types.

Signed-off-by: Dr. David Alan Gilbert 
---
 hw/core/machine.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hw/core/machine.c b/hw/core/machine.c
index c58a8e594e..26a5f30e6d 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -52,6 +52,7 @@ GlobalProperty hw_compat_3_1[] = {
 { "virtio-blk-device", "discard", "false" },
 { "virtio-blk-device", "write-zeroes", "false" },
 { "virtio-balloon-device", "qemu-4-0-config-size", "false" },
+{ "pcie-root-port-base", "disable-acs", "true" }, /* Added in 4.1 */
 };
 const size_t hw_compat_3_1_len = G_N_ELEMENTS(hw_compat_3_1);
 
-- 
2.21.0

[Qemu-devel] [for 4.1 PATCH 0/2] Disable PCIe ACS on older machines

2019-07-30 Thread Dr. David Alan Gilbert (git)

From: "Dr. David Alan Gilbert" 

PCIe ACS (Access Control services) got added in 4.0 and broke
migration to and from 3.1 and earlier.  Fix it here
for older machine types, at the cost of breaking that compatibility
with 4.0.

Signed-off-by: Dr. David Alan Gilbert 


Dr. David Alan Gilbert (2):
  pcie_root_port: Allow ACS to be disabled
  pcie_root_port: Disable ACS on older machines

 hw/core/machine.c  | 1 +
 hw/pci-bridge/pcie_root_port.c | 3 ++-
 include/hw/pci/pcie_port.h | 2 ++
 3 files changed, 5 insertions(+), 1 deletion(-)

-- 
2.21.0

[Qemu-devel] [PATCH 1/2] pcie_root_port: Allow ACS to be disabled

2019-07-30 Thread Dr. David Alan Gilbert (git)

From: "Dr. David Alan Gilbert" 

ACS was added in 4.0 unconditionally, this breaks migration
compatibility.
Allow ACS to be disabled by adding a property that's
checked by pcie_root_port.

Unfortunately pcie-root-port doesn't have any instance data,
so there's nowhere for that flag to live, so stuff it into
PCIESlot.

Signed-off-by: Dr. David Alan Gilbert 
---
 hw/pci-bridge/pcie_root_port.c | 3 ++-
 include/hw/pci/pcie_port.h | 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/hw/pci-bridge/pcie_root_port.c b/hw/pci-bridge/pcie_root_port.c
index 09019ca05d..1d8a778709 100644
--- a/hw/pci-bridge/pcie_root_port.c
+++ b/hw/pci-bridge/pcie_root_port.c
@@ -111,7 +111,7 @@ static void rp_realize(PCIDevice *d, Error **errp)
 pcie_aer_root_init(d);
 rp_aer_vector_update(d);
 
-if (rpc->acs_offset) {
+if (rpc->acs_offset && !s->disable_acs) {
 pcie_acs_init(d, rpc->acs_offset);
 }
 return;
@@ -145,6 +145,7 @@ static void rp_exit(PCIDevice *d)
 static Property rp_props[] = {
 DEFINE_PROP_BIT(COMPAT_PROP_PCP, PCIDevice, cap_present,
 QEMU_PCIE_SLTCAP_PCP_BITNR, true),
+DEFINE_PROP_BOOL("disable-acs", PCIESlot, disable_acs, false),
 DEFINE_PROP_END_OF_LIST()
 };
 
diff --git a/include/hw/pci/pcie_port.h b/include/hw/pci/pcie_port.h
index 09586f4641..7515430087 100644
--- a/include/hw/pci/pcie_port.h
+++ b/include/hw/pci/pcie_port.h
@@ -53,6 +53,8 @@ struct PCIESlot {
 PCIExpLinkSpeed speed;
 PCIExpLinkWidth width;
 
+/* Disable ACS (really for a pcie_root_port) */
+booldisable_acs;
 QLIST_ENTRY(PCIESlot) next;
 };
 
-- 
2.21.0

[PATCH 0/4] virtiofsd coverity fixes

2020-02-04 Thread Dr. David Alan Gilbert (git)

From: "Dr. David Alan Gilbert" 

Hi,
  This is a set of fixes that fixes things that coverity pointed out.
Only the last one (the NULL check in do_read) is probably important.

Dave

Dr. David Alan Gilbert (4):
  virtiofsd: Remove fuse_req_getgroups
  virtiofsd: fv_create_listen_socket error path socket leak
  virtiofsd: load_capng missing unlock
  virtiofsd: do_read missing NULL check

 tools/virtiofsd/fuse.h   | 20 
 tools/virtiofsd/fuse_lowlevel.c  | 81 ++--
 tools/virtiofsd/fuse_lowlevel.h  | 21 -
 tools/virtiofsd/fuse_virtio.c|  2 +
 tools/virtiofsd/passthrough_ll.c |  1 +
 5 files changed, 7 insertions(+), 118 deletions(-)

-- 
2.24.1

[PATCH 3/4] virtiofsd: load_capng missing unlock

2020-02-04 Thread Dr. David Alan Gilbert (git)

From: "Dr. David Alan Gilbert" 

Missing unlock in error path.

Fixes: Coverity CID 1413123
Signed-off-by: Dr. David Alan Gilbert 
---
 tools/virtiofsd/passthrough_ll.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c
index e6f2399efc..c635fc8820 100644
--- a/tools/virtiofsd/passthrough_ll.c
+++ b/tools/virtiofsd/passthrough_ll.c
@@ -232,6 +232,7 @@ static int load_capng(void)
  */
 cap.saved = capng_save_state();
 if (!cap.saved) {
+pthread_mutex_unlock(&cap.mutex);
 fuse_log(FUSE_LOG_ERR, "capng_save_state (thread)\n");
 return -EINVAL;
 }
-- 
2.24.1

[PATCH 2/4] virtiofsd: fv_create_listen_socket error path socket leak

2020-02-04 Thread Dr. David Alan Gilbert (git)

From: "Dr. David Alan Gilbert" 

If we fail when bringing up the socket we can leak the listen_fd;
in practice the daemon will exit so it's not really a problem.

Fixes: Coverity CID 1413121
Signed-off-by: Dr. David Alan Gilbert 
---
 tools/virtiofsd/fuse_virtio.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c
index 80a6e929df..dd1c605dbf 100644
--- a/tools/virtiofsd/fuse_virtio.c
+++ b/tools/virtiofsd/fuse_virtio.c
@@ -916,6 +916,7 @@ static int fv_create_listen_socket(struct fuse_session *se)
 old_umask = umask(0077);
 if (bind(listen_sock, (struct sockaddr *)&un, addr_len) == -1) {
 fuse_log(FUSE_LOG_ERR, "vhost socket bind: %m\n");
+close(listen_sock);
 umask(old_umask);
 return -1;
 }
@@ -923,6 +924,7 @@ static int fv_create_listen_socket(struct fuse_session *se)
 
 if (listen(listen_sock, 1) == -1) {
 fuse_log(FUSE_LOG_ERR, "vhost socket listen: %m\n");
+close(listen_sock);
 return -1;
 }
 
-- 
2.24.1

[PATCH 1/4] virtiofsd: Remove fuse_req_getgroups

2020-02-04 Thread Dr. David Alan Gilbert (git)

From: "Dr. David Alan Gilbert" 

Remove fuse_req_getgroups that's unused in virtiofsd; it came in
from libfuse but we don't actually use it.  It was called from
fuse_getgroups which we previously removed (but had left its header
in).

Coverity had complained about null termination in it, but removing
it is the easiest answer.

Fixes: Coverity CID: 1413117 (String not null terminated)
Signed-off-by: Dr. David Alan Gilbert 
---
 tools/virtiofsd/fuse.h  | 20 -
 tools/virtiofsd/fuse_lowlevel.c | 77 -
 tools/virtiofsd/fuse_lowlevel.h | 21 -
 3 files changed, 118 deletions(-)

diff --git a/tools/virtiofsd/fuse.h b/tools/virtiofsd/fuse.h
index 7a4c713559..aba13fef2d 100644
--- a/tools/virtiofsd/fuse.h
+++ b/tools/virtiofsd/fuse.h
@@ -1006,26 +1006,6 @@ void fuse_exit(struct fuse *f);
  */
 struct fuse_context *fuse_get_context(void);
 
-/**
- * Get the current supplementary group IDs for the current request
- *
- * Similar to the getgroups(2) system call, except the return value is
- * always the total number of group IDs, even if it is larger than the
- * specified size.
- *
- * The current fuse kernel module in linux (as of 2.6.30) doesn't pass
- * the group list to userspace, hence this function needs to parse
- * "/proc/$TID/task/$TID/status" to get the group IDs.
- *
- * This feature may not be supported on all operating systems.  In
- * such a case this function will return -ENOSYS.
- *
- * @param size size of given array
- * @param list array of group IDs to be filled in
- * @return the total number of supplementary group IDs or -errno on failure
- */
-int fuse_getgroups(int size, gid_t list[]);
-
 /**
  * Check if the current request has already been interrupted
  *
diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c
index de2e2e0c65..01c418aade 100644
--- a/tools/virtiofsd/fuse_lowlevel.c
+++ b/tools/virtiofsd/fuse_lowlevel.c
@@ -2667,83 +2667,6 @@ int fuse_lowlevel_is_virtio(struct fuse_session *se)
 return !!se->virtio_dev;
 }
 
-#ifdef linux
-int fuse_req_getgroups(fuse_req_t req, int size, gid_t list[])
-{
-char *buf;
-size_t bufsize = 1024;
-char path[128];
-int ret;
-int fd;
-unsigned long pid = req->ctx.pid;
-char *s;
-
-sprintf(path, "/proc/%lu/task/%lu/status", pid, pid);
-
-retry:
-buf = malloc(bufsize);
-if (buf == NULL) {
-return -ENOMEM;
-}
-
-ret = -EIO;
-fd = open(path, O_RDONLY);
-if (fd == -1) {
-goto out_free;
-}
-
-ret = read(fd, buf, bufsize);
-close(fd);
-if (ret < 0) {
-ret = -EIO;
-goto out_free;
-}
-
-if ((size_t)ret == bufsize) {
-free(buf);
-bufsize *= 4;
-goto retry;
-}
-
-ret = -EIO;
-s = strstr(buf, "\nGroups:");
-if (s == NULL) {
-goto out_free;
-}
-
-s += 8;
-ret = 0;
-while (1) {
-char *end;
-unsigned long val = strtoul(s, &end, 0);
-if (end == s) {
-break;
-}
-
-s = end;
-if (ret < size) {
-list[ret] = val;
-}
-ret++;
-}
-
-out_free:
-free(buf);
-return ret;
-}
-#else /* linux */
-/*
- * This is currently not implemented on other than Linux...
- */
-int fuse_req_getgroups(fuse_req_t req, int size, gid_t list[])
-{
-(void)req;
-(void)size;
-(void)list;
-return -ENOSYS;
-}
-#endif
-
 void fuse_session_exit(struct fuse_session *se)
 {
 se->exited = 1;
diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h
index 138041e5f1..8f6d705b5c 100644
--- a/tools/virtiofsd/fuse_lowlevel.h
+++ b/tools/virtiofsd/fuse_lowlevel.h
@@ -1704,27 +1704,6 @@ void *fuse_req_userdata(fuse_req_t req);
  */
 const struct fuse_ctx *fuse_req_ctx(fuse_req_t req);
 
-/**
- * Get the current supplementary group IDs for the specified request
- *
- * Similar to the getgroups(2) system call, except the return value is
- * always the total number of group IDs, even if it is larger than the
- * specified size.
- *
- * The current fuse kernel module in linux (as of 2.6.30) doesn't pass
- * the group list to userspace, hence this function needs to parse
- * "/proc/$TID/task/$TID/status" to get the group IDs.
- *
- * This feature may not be supported on all operating systems.  In
- * such a case this function will return -ENOSYS.
- *
- * @param req request handle
- * @param size size of given array
- * @param list array of group IDs to be filled in
- * @return the total number of supplementary group IDs or -errno on failure
- */
-int fuse_req_getgroups(fuse_req_t req, int size, gid_t list[]);
-
 /**
  * Callback function for an interrupt
  *
-- 
2.24.1

[PATCH 4/4] virtiofsd: do_read missing NULL check

2020-02-04 Thread Dr. David Alan Gilbert (git)

From: "Dr. David Alan Gilbert" 

Missing a NULL check if the argument fetch fails.

Fixes: Coverity CID 1413119
Signed-off-by: Dr. David Alan Gilbert 
---
 tools/virtiofsd/fuse_lowlevel.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c
index 01c418aade..704c0369b2 100644
--- a/tools/virtiofsd/fuse_lowlevel.c
+++ b/tools/virtiofsd/fuse_lowlevel.c
@@ -1116,6 +1116,10 @@ static void do_read(fuse_req_t req, fuse_ino_t nodeid,
 struct fuse_file_info fi;
 
 arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
+if (!arg) {
+fuse_reply_err(req, EINVAL);
+return;
+}
 
 memset(&fi, 0, sizeof(fi));
 fi.fh = arg->fh;
-- 
2.24.1

[PULL 0/5] virtiofs queue

2020-02-10 Thread Dr. David Alan Gilbert (git)

From: "Dr. David Alan Gilbert" 

The following changes since commit 2b8a51cdb3e8d15a5c35de7a2e76a813ae7358f0:

  Merge remote-tracking branch 'remotes/gkurz/tags/9p-next-2020-02-08' into 
staging (2020-02-10 16:07:29 +)

are available in the Git repository at:

  https://gitlab.com/dagrh/qemu.git tags/pull-virtiofs-20200210

for you to fetch changes up to 6a7e2bbee5fa5f167959f05319550d2a49a6b8bb:

  docs: add virtiofsd(1) man page (2020-02-10 17:25:52 +)


virtiofsd pull 2020-02-10

Coverity fixes and a reworked man page.


Dr. David Alan Gilbert (4):
  virtiofsd: Remove fuse_req_getgroups
  virtiofsd: fv_create_listen_socket error path socket leak
  virtiofsd: load_capng missing unlock
  virtiofsd: do_read missing NULL check

Stefan Hajnoczi (1):
  docs: add virtiofsd(1) man page

 MAINTAINERS  |   1 +
 Makefile |   9 ++-
 docs/interop/conf.py |   5 +-
 docs/interop/index.rst   |   1 +
 docs/interop/virtiofsd.rst   | 120 +++
 tools/virtiofsd/fuse.h   |  20 ---
 tools/virtiofsd/fuse_lowlevel.c  |  81 ++
 tools/virtiofsd/fuse_lowlevel.h  |  21 ---
 tools/virtiofsd/fuse_virtio.c|   2 +
 tools/virtiofsd/passthrough_ll.c |   1 +
 10 files changed, 141 insertions(+), 120 deletions(-)
 create mode 100644 docs/interop/virtiofsd.rst

[PULL 2/5] virtiofsd: fv_create_listen_socket error path socket leak

2020-02-10 Thread Dr. David Alan Gilbert (git)

From: "Dr. David Alan Gilbert" 

If we fail when bringing up the socket we can leak the listen_fd;
in practice the daemon will exit so it's not really a problem.

Fixes: Coverity CID 1413121
Signed-off-by: Dr. David Alan Gilbert 
Reviewed-by: Philippe Mathieu-Daudé 
Reviewed-by: Stefan Hajnoczi 
---
 tools/virtiofsd/fuse_virtio.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c
index 80a6e929df..dd1c605dbf 100644
--- a/tools/virtiofsd/fuse_virtio.c
+++ b/tools/virtiofsd/fuse_virtio.c
@@ -916,6 +916,7 @@ static int fv_create_listen_socket(struct fuse_session *se)
 old_umask = umask(0077);
 if (bind(listen_sock, (struct sockaddr *)&un, addr_len) == -1) {
 fuse_log(FUSE_LOG_ERR, "vhost socket bind: %m\n");
+close(listen_sock);
 umask(old_umask);
 return -1;
 }
@@ -923,6 +924,7 @@ static int fv_create_listen_socket(struct fuse_session *se)
 
 if (listen(listen_sock, 1) == -1) {
 fuse_log(FUSE_LOG_ERR, "vhost socket listen: %m\n");
+close(listen_sock);
 return -1;
 }
 
-- 
2.24.1

[PULL 3/5] virtiofsd: load_capng missing unlock

2020-02-10 Thread Dr. David Alan Gilbert (git)

From: "Dr. David Alan Gilbert" 

Missing unlock in error path.

Fixes: Coverity CID 1413123
Signed-off-by: Dr. David Alan Gilbert 
Reviewed-by: Philippe Mathieu-Daudé 
Reviewed-by: Stefan Hajnoczi 
---
 tools/virtiofsd/passthrough_ll.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c
index e6f2399efc..c635fc8820 100644
--- a/tools/virtiofsd/passthrough_ll.c
+++ b/tools/virtiofsd/passthrough_ll.c
@@ -232,6 +232,7 @@ static int load_capng(void)
  */
 cap.saved = capng_save_state();
 if (!cap.saved) {
+pthread_mutex_unlock(&cap.mutex);
 fuse_log(FUSE_LOG_ERR, "capng_save_state (thread)\n");
 return -EINVAL;
 }
-- 
2.24.1

[PULL 5/5] docs: add virtiofsd(1) man page

2020-02-10 Thread Dr. David Alan Gilbert (git)

From: Stefan Hajnoczi 

Document the virtiofsd(1) program and its command-line options.  This
man page is a rST conversion of the original texi documentation that I
wrote.

Reviewed-by: Liam Merwick 
Signed-off-by: Stefan Hajnoczi 
Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Dr. David Alan Gilbert 
---
 MAINTAINERS|   1 +
 Makefile   |   9 ++-
 docs/interop/conf.py   |   5 +-
 docs/interop/index.rst |   1 +
 docs/interop/virtiofsd.rst | 120 +
 5 files changed, 134 insertions(+), 2 deletions(-)
 create mode 100644 docs/interop/virtiofsd.rst

diff --git a/MAINTAINERS b/MAINTAINERS
index ce46c0a552..c7717df720 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1613,6 +1613,7 @@ S: Supported
 F: tools/virtiofsd/*
 F: hw/virtio/vhost-user-fs*
 F: include/hw/virtio/vhost-user-fs.h
+F: docs/interop/virtiofsd.rst
 
 virtio-input
 M: Gerd Hoffmann 
diff --git a/Makefile b/Makefile
index 461d40bea6..f0e1a2fc1d 100644
--- a/Makefile
+++ b/Makefile
@@ -348,6 +348,9 @@ DOCS=qemu-doc.html qemu-doc.txt qemu.1
 DOCS+=$(MANUAL_BUILDDIR)/interop/qemu-img.1
 DOCS+=$(MANUAL_BUILDDIR)/interop/qemu-nbd.8
 DOCS+=$(MANUAL_BUILDDIR)/interop/qemu-ga.8
+ifeq ($(CONFIG_LINUX)$(CONFIG_SECCOMP)$(CONFIG_LIBCAP_NG),yyy)
+DOCS+=$(MANUAL_BUILDDIR)/interop/virtiofsd.1
+endif
 DOCS+=$(MANUAL_BUILDDIR)/system/qemu-block-drivers.7
 DOCS+=docs/interop/qemu-qmp-ref.html docs/interop/qemu-qmp-ref.txt 
docs/interop/qemu-qmp-ref.7
 DOCS+=docs/interop/qemu-ga-ref.html docs/interop/qemu-ga-ref.txt 
docs/interop/qemu-ga-ref.7
@@ -861,6 +864,9 @@ ifdef CONFIG_VIRTFS
$(INSTALL_DIR) "$(DESTDIR)$(mandir)/man1"
$(INSTALL_DATA) $(MANUAL_BUILDDIR)/interop/virtfs-proxy-helper.1 
"$(DESTDIR)$(mandir)/man1"
 endif
+ifeq ($(CONFIG_LINUX)$(CONFIG_SECCOMP)$(CONFIG_LIBCAP_NG),yyy)
+   $(INSTALL_DATA) docs/interop/virtiofsd.1 "$(DESTDIR)$(mandir)/man1"
+endif
 
 install-datadir:
$(INSTALL_DIR) "$(DESTDIR)$(qemu_datadir)"
@@ -1051,7 +1057,8 @@ $(MANUAL_BUILDDIR)/system/index.html: $(call 
manual-deps,system)
$(call build-manual,system,html)
 
 $(call define-manpage-rule,interop,\
-   qemu-ga.8 qemu-img.1 qemu-nbd.8 qemu-trace-stap.1 
virtfs-proxy-helper.1,\
+   qemu-ga.8 qemu-img.1 qemu-nbd.8 qemu-trace-stap.1\
+   virtiofsd.1 virtfs-proxy-helper.1,\
$(SRC_PATH/qemu-img-cmds.hx))
 
 $(call define-manpage-rule,system,qemu-block-drivers.7)
diff --git a/docs/interop/conf.py b/docs/interop/conf.py
index b0f322207c..b3cda17042 100644
--- a/docs/interop/conf.py
+++ b/docs/interop/conf.py
@@ -27,5 +27,8 @@ man_pages = [
  [], 1),
 ('virtfs-proxy-helper', 'virtfs-proxy-helper',
  u'QEMU 9p virtfs proxy filesystem helper',
- ['M. Mohan Kumar'], 1)
+ ['M. Mohan Kumar'], 1),
+('virtiofsd', 'virtiofsd', u'QEMU virtio-fs shared file system daemon',
+ ['Stefan Hajnoczi ',
+  'Masayoshi Mizuma '], 1),
 ]
diff --git a/docs/interop/index.rst b/docs/interop/index.rst
index 3b763b1eeb..e8455b4270 100644
--- a/docs/interop/index.rst
+++ b/docs/interop/index.rst
@@ -24,3 +24,4 @@ Contents:
vhost-user
vhost-user-gpu
virtfs-proxy-helper
+   virtiofsd
diff --git a/docs/interop/virtiofsd.rst b/docs/interop/virtiofsd.rst
new file mode 100644
index 00..378594c422
--- /dev/null
+++ b/docs/interop/virtiofsd.rst
@@ -0,0 +1,120 @@
+QEMU virtio-fs shared file system daemon
+
+
+Synopsis
+
+
+**virtiofsd** [*OPTIONS*]
+
+Description
+---
+
+Share a host directory tree with a guest through a virtio-fs device.  This
+program is a vhost-user backend that implements the virtio-fs device.  Each
+virtio-fs device instance requires its own virtiofsd process.
+
+This program is designed to work with QEMU's ``--device vhost-user-fs-pci``
+but should work with any virtual machine monitor (VMM) that supports
+vhost-user.  See the Examples section below.
+
+This program must be run as the root user.  Upon startup the program will
+switch into a new file system namespace with the shared directory tree as its
+root.  This prevents "file system escapes" due to symlinks and other file
+system objects that might lead to files outside the shared directory.  The
+program also sandboxes itself using seccomp(2) to prevent ptrace(2) and other
+vectors that could allow an attacker to compromise the system after gaining
+control of the virtiofsd process.
+
+Options
+---
+
+.. program:: virtiofsd
+
+.. option:: -h, --help
+
+  Print help.
+
+.. option:: -V, --version
+
+  Print version.
+
+.. option:: -d
+
+  Enable debug output.
+
+.. option:: --syslog
+
+  Print log messages to syslog instead of stderr.
+
+.. option:: -o OPTION
+
+  * debug -
+Enable debug output.
+
+  * flock|no_flock -
+Enable/disable flock.  The default is ``no_flock``.
+
+  * log_level=LEVEL -
+Print only log messages matching LEVEL or more severe.  LEVEL is one of
+``err``, ``warn``, ``info`

[PULL 4/5] virtiofsd: do_read missing NULL check

2020-02-10 Thread Dr. David Alan Gilbert (git)

From: "Dr. David Alan Gilbert" 

Missing a NULL check if the argument fetch fails.

Fixes: Coverity CID 1413119
Signed-off-by: Dr. David Alan Gilbert 
Reviewed-by: Philippe Mathieu-Daudé 
Reviewed-by: Stefan Hajnoczi 
---
 tools/virtiofsd/fuse_lowlevel.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c
index 01c418aade..704c0369b2 100644
--- a/tools/virtiofsd/fuse_lowlevel.c
+++ b/tools/virtiofsd/fuse_lowlevel.c
@@ -1116,6 +1116,10 @@ static void do_read(fuse_req_t req, fuse_ino_t nodeid,
 struct fuse_file_info fi;
 
 arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
+if (!arg) {
+fuse_reply_err(req, EINVAL);
+return;
+}
 
 memset(&fi, 0, sizeof(fi));
 fi.fh = arg->fh;
-- 
2.24.1

[PULL 1/5] virtiofsd: Remove fuse_req_getgroups

2020-02-10 Thread Dr. David Alan Gilbert (git)

From: "Dr. David Alan Gilbert" 

Remove fuse_req_getgroups that's unused in virtiofsd; it came in
from libfuse but we don't actually use it.  It was called from
fuse_getgroups which we previously removed (but had left its header
in).

Coverity had complained about null termination in it, but removing
it is the easiest answer.

Fixes: Coverity CID: 1413117 (String not null terminated)
Signed-off-by: Dr. David Alan Gilbert 
Reviewed-by: Philippe Mathieu-Daudé 
Reviewed-by: Stefan Hajnoczi 
---
 tools/virtiofsd/fuse.h  | 20 -
 tools/virtiofsd/fuse_lowlevel.c | 77 -
 tools/virtiofsd/fuse_lowlevel.h | 21 -
 3 files changed, 118 deletions(-)

diff --git a/tools/virtiofsd/fuse.h b/tools/virtiofsd/fuse.h
index 7a4c713559..aba13fef2d 100644
--- a/tools/virtiofsd/fuse.h
+++ b/tools/virtiofsd/fuse.h
@@ -1006,26 +1006,6 @@ void fuse_exit(struct fuse *f);
  */
 struct fuse_context *fuse_get_context(void);
 
-/**
- * Get the current supplementary group IDs for the current request
- *
- * Similar to the getgroups(2) system call, except the return value is
- * always the total number of group IDs, even if it is larger than the
- * specified size.
- *
- * The current fuse kernel module in linux (as of 2.6.30) doesn't pass
- * the group list to userspace, hence this function needs to parse
- * "/proc/$TID/task/$TID/status" to get the group IDs.
- *
- * This feature may not be supported on all operating systems.  In
- * such a case this function will return -ENOSYS.
- *
- * @param size size of given array
- * @param list array of group IDs to be filled in
- * @return the total number of supplementary group IDs or -errno on failure
- */
-int fuse_getgroups(int size, gid_t list[]);
-
 /**
  * Check if the current request has already been interrupted
  *
diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c
index de2e2e0c65..01c418aade 100644
--- a/tools/virtiofsd/fuse_lowlevel.c
+++ b/tools/virtiofsd/fuse_lowlevel.c
@@ -2667,83 +2667,6 @@ int fuse_lowlevel_is_virtio(struct fuse_session *se)
 return !!se->virtio_dev;
 }
 
-#ifdef linux
-int fuse_req_getgroups(fuse_req_t req, int size, gid_t list[])
-{
-char *buf;
-size_t bufsize = 1024;
-char path[128];
-int ret;
-int fd;
-unsigned long pid = req->ctx.pid;
-char *s;
-
-sprintf(path, "/proc/%lu/task/%lu/status", pid, pid);
-
-retry:
-buf = malloc(bufsize);
-if (buf == NULL) {
-return -ENOMEM;
-}
-
-ret = -EIO;
-fd = open(path, O_RDONLY);
-if (fd == -1) {
-goto out_free;
-}
-
-ret = read(fd, buf, bufsize);
-close(fd);
-if (ret < 0) {
-ret = -EIO;
-goto out_free;
-}
-
-if ((size_t)ret == bufsize) {
-free(buf);
-bufsize *= 4;
-goto retry;
-}
-
-ret = -EIO;
-s = strstr(buf, "\nGroups:");
-if (s == NULL) {
-goto out_free;
-}
-
-s += 8;
-ret = 0;
-while (1) {
-char *end;
-unsigned long val = strtoul(s, &end, 0);
-if (end == s) {
-break;
-}
-
-s = end;
-if (ret < size) {
-list[ret] = val;
-}
-ret++;
-}
-
-out_free:
-free(buf);
-return ret;
-}
-#else /* linux */
-/*
- * This is currently not implemented on other than Linux...
- */
-int fuse_req_getgroups(fuse_req_t req, int size, gid_t list[])
-{
-(void)req;
-(void)size;
-(void)list;
-return -ENOSYS;
-}
-#endif
-
 void fuse_session_exit(struct fuse_session *se)
 {
 se->exited = 1;
diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h
index 138041e5f1..8f6d705b5c 100644
--- a/tools/virtiofsd/fuse_lowlevel.h
+++ b/tools/virtiofsd/fuse_lowlevel.h
@@ -1704,27 +1704,6 @@ void *fuse_req_userdata(fuse_req_t req);
  */
 const struct fuse_ctx *fuse_req_ctx(fuse_req_t req);
 
-/**
- * Get the current supplementary group IDs for the specified request
- *
- * Similar to the getgroups(2) system call, except the return value is
- * always the total number of group IDs, even if it is larger than the
- * specified size.
- *
- * The current fuse kernel module in linux (as of 2.6.30) doesn't pass
- * the group list to userspace, hence this function needs to parse
- * "/proc/$TID/task/$TID/status" to get the group IDs.
- *
- * This feature may not be supported on all operating systems.  In
- * such a case this function will return -ENOSYS.
- *
- * @param req request handle
- * @param size size of given array
- * @param list array of group IDs to be filled in
- * @return the total number of supplementary group IDs or -errno on failure
- */
-int fuse_req_getgroups(fuse_req_t req, int size, gid_t list[]);
-
 /**
  * Callback function for an interrupt
  *
-- 
2.24.1

[PATCH] migration/rdma: rdma_accept_incoming_migration fix error handling

2020-02-10 Thread Dr. David Alan Gilbert (git)

From: "Dr. David Alan Gilbert" 

rdma_accept_incoming_migration is called from an fd handler and
can't return an Error * anywhere.
Currently it's leaking Errors in errp/local_err - there's
no point putting them in there unless we can report them.

Turn most into fprintf's, and the last into an error_reportf_err
where it's coming up from another function.

Signed-off-by: Dr. David Alan Gilbert 
---
 migration/rdma.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/migration/rdma.c b/migration/rdma.c
index 2379b8345b..f67161c98f 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -3980,13 +3980,13 @@ static void rdma_accept_incoming_migration(void *opaque)
 RDMAContext *rdma = opaque;
 int ret;
 QEMUFile *f;
-Error *local_err = NULL, **errp = &local_err;
+Error *local_err = NULL;
 
 trace_qemu_rdma_accept_incoming_migration();
 ret = qemu_rdma_accept(rdma);
 
 if (ret) {
-ERROR(errp, "RDMA Migration initialization failed!");
+fprintf(stderr, "RDMA ERROR: Migration initialization failed");
 return;
 }
 
@@ -3998,13 +3998,16 @@ static void rdma_accept_incoming_migration(void *opaque)
 
 f = qemu_fopen_rdma(rdma, "rb");
 if (f == NULL) {
-ERROR(errp, "could not qemu_fopen_rdma!");
+fprintf(stderr, "RDMA ERROR: could not qemu_fopen_rdma");
 qemu_rdma_cleanup(rdma);
 return;
 }
 
 rdma->migration_started_on_destination = 1;
-migration_fd_process_incoming(f, errp);
+migration_fd_process_incoming(f, &local_err);
+if (local_err) {
+error_reportf_err(local_err, "RDMA ERROR:");
+}
 }
 
 void rdma_start_incoming_migration(const char *host_port, Error **errp)
-- 
2.24.1

[PATCH] tests/migration: Add some slack to auto converge

2020-02-10 Thread Dr. David Alan Gilbert (git)

From: "Dr. David Alan Gilbert" 

There's an assert in autoconverge that checks that we quit the
iteration when we go below the expected threshold.  Philippe
saw a case where this assert fired with the measured value
slightly over the threshold. (about 3k out of a few million).

I can think of two reasons:
  a) Rounding errors
  b) That after we make the decision to quit iteration we do one
more sync and that sees a few more dirty pages.

So add 1% slack to the assertion, that should cover a and
most cases of b, probably all we'll see for the test.

Signed-off-by: Dr. David Alan Gilbert 
---
 tests/qtest/migration-test.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
index cf27ebbc9d..a78ac0c7da 100644
--- a/tests/qtest/migration-test.c
+++ b/tests/qtest/migration-test.c
@@ -1237,7 +1237,8 @@ static void test_migrate_auto_converge(void)
 g_assert_cmpint(percentage, <=, max_pct);
 
 remaining = read_ram_property_int(from, "remaining");
-g_assert_cmpint(remaining, <, expected_threshold);
+g_assert_cmpint(remaining, <,
+(expected_threshold + expected_threshold / 100));
 
 migrate_continue(from, "pre-switchover");
 
-- 
2.24.1

[PATCH 3/3] virtiofsd: Swing deprecated message to removed-features

2023-01-18 Thread Dr. David Alan Gilbert (git)

From: "Dr. David Alan Gilbert" 

Move the deprecation message, since it's now gone.

Signed-off-by: Dr. David Alan Gilbert 
---
 docs/about/deprecated.rst   | 18 --
 docs/about/removed-features.rst | 13 +
 2 files changed, 13 insertions(+), 18 deletions(-)

diff --git a/docs/about/deprecated.rst b/docs/about/deprecated.rst
index 9f1bbc495d..8543fa3285 100644
--- a/docs/about/deprecated.rst
+++ b/docs/about/deprecated.rst
@@ -325,24 +325,6 @@ versions, aliases will point to newer CPU model versions
 depending on the machine type, so management software must
 resolve CPU model aliases before starting a virtual machine.
 
-Tools
--
-
-virtiofsd
-'
-
-There is a new Rust implementation of ``virtiofsd`` at
-``https://gitlab.com/virtio-fs/virtiofsd``;
-since this is now marked stable, new development should be done on that
-rather than the existing C version in the QEMU tree.
-The C version will still accept fixes and patches that
-are already in development for the moment, but will eventually
-be deleted from this tree.
-New deployments should use the Rust version, and existing systems
-should consider moving to it.  The command line and feature set
-is very close and moving should be simple.
-
-
 QEMU guest agent
 
 
diff --git a/docs/about/removed-features.rst b/docs/about/removed-features.rst
index 6c3aa5097f..9b0a212cfe 100644
--- a/docs/about/removed-features.rst
+++ b/docs/about/removed-features.rst
@@ -865,3 +865,16 @@ The VXHS code did not compile since v2.12.0. It was 
removed in 5.1.
 The corresponding upstream server project is no longer maintained.
 Users are recommended to switch to an alternative distributed block
 device driver such as RBD.
+
+Tools
+-
+
+virtiofsd (removed in 8.0)
+''
+
+There is a newer Rust implementation of ``virtiofsd`` at
+``https://gitlab.com/virtio-fs/virtiofsd``; this has been
+stable for some time and is now widely used.
+The command line and feature set is very close to the removed
+C implementation.
+
-- 
2.39.0

[PATCH 1/3] virtiofsd: Remove build and docs glue

2023-01-18 Thread Dr. David Alan Gilbert (git)

From: "Dr. David Alan Gilbert" 

Remove all the virtiofsd build and docs infrastructure.

Signed-off-by: Dr. David Alan Gilbert 
---
 MAINTAINERS|  2 --
 docs/conf.py   |  4 
 docs/meson.build   |  1 -
 docs/tools/index.rst   |  1 -
 meson.build|  1 -
 meson_options.txt  |  2 --
 .../ci/org.centos/stream/8/x86_64/configure|  2 --
 scripts/coverity-scan/COMPONENTS.md|  3 ---
 scripts/meson-buildoptions.sh  |  3 ---
 tools/meson.build  | 13 -
 tools/virtiofsd/50-qemu-virtiofsd.json.in  |  5 -
 tools/virtiofsd/meson.build| 18 --
 12 files changed, 55 deletions(-)
 delete mode 100644 tools/virtiofsd/50-qemu-virtiofsd.json.in
 delete mode 100644 tools/virtiofsd/meson.build

diff --git a/MAINTAINERS b/MAINTAINERS
index 0fe50d01e3..4f8ab04dba 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2093,10 +2093,8 @@ virtiofs
 M: Dr. David Alan Gilbert 
 M: Stefan Hajnoczi 
 S: Supported
-F: tools/virtiofsd/*
 F: hw/virtio/vhost-user-fs*
 F: include/hw/virtio/vhost-user-fs.h
-F: docs/tools/virtiofsd.rst
 L: virtio...@redhat.com
 
 virtio-input
diff --git a/docs/conf.py b/docs/conf.py
index e33cf3d381..b2b4c166e1 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -290,10 +290,6 @@
 ('tools/virtfs-proxy-helper', 'virtfs-proxy-helper',
  'QEMU 9p virtfs proxy filesystem helper',
  ['M. Mohan Kumar'], 1),
-('tools/virtiofsd', 'virtiofsd',
- 'QEMU virtio-fs shared file system daemon',
- ['Stefan Hajnoczi ',
-  'Masayoshi Mizuma '], 1),
 ]
 man_make_section_directory = False
 
diff --git a/docs/meson.build b/docs/meson.build
index 9136fed3b7..bbcdccce68 100644
--- a/docs/meson.build
+++ b/docs/meson.build
@@ -48,7 +48,6 @@ if build_docs
 'qemu-storage-daemon.1': (have_tools ? 'man1' : ''),
 'qemu-trace-stap.1': (stap.found() ? 'man1' : ''),
 'virtfs-proxy-helper.1': (have_virtfs_proxy_helper ? 'man1' : ''),
-'virtiofsd.1': (have_virtiofsd ? 'man1' : ''),
 'qemu.1': 'man1',
 'qemu-block-drivers.7': 'man7',
 'qemu-cpu-models.7': 'man7'
diff --git a/docs/tools/index.rst b/docs/tools/index.rst
index 1edd5a8054..641550111c 100644
--- a/docs/tools/index.rst
+++ b/docs/tools/index.rst
@@ -14,4 +14,3 @@ command line utilities and other standalone programs.
qemu-pr-helper
qemu-trace-stap
virtfs-proxy-helper
-   virtiofsd
diff --git a/meson.build b/meson.build
index 58d8cd68a6..2f1bf88c9a 100644
--- a/meson.build
+++ b/meson.build
@@ -3860,7 +3860,6 @@ if have_block
   summary_info += {'Block whitelist (ro)': 
get_option('block_drv_ro_whitelist')}
   summary_info += {'Use block whitelist in tools': 
get_option('block_drv_whitelist_in_tools')}
   summary_info += {'VirtFS support':have_virtfs}
-  summary_info += {'build virtiofs daemon': have_virtiofsd}
   summary_info += {'Live block migration': 
config_host_data.get('CONFIG_LIVE_BLOCK_MIGRATION')}
   summary_info += {'replication support': 
config_host_data.get('CONFIG_REPLICATION')}
   summary_info += {'bochs support': get_option('bochs').allowed()}
diff --git a/meson_options.txt b/meson_options.txt
index 559a571b6b..0c9666437c 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -268,8 +268,6 @@ option('vhost_user_blk_server', type: 'feature', value: 
'auto',
description: 'build vhost-user-blk server')
 option('virtfs', type: 'feature', value: 'auto',
description: 'virtio-9p support')
-option('virtiofsd', type: 'feature', value: 'auto',
-   description: 'build virtiofs daemon (virtiofsd)')
 option('libvduse', type: 'feature', value: 'auto',
description: 'build VDUSE Library')
 option('vduse_blk_export', type: 'feature', value: 'auto',
diff --git a/scripts/ci/org.centos/stream/8/x86_64/configure 
b/scripts/ci/org.centos/stream/8/x86_64/configure
index 75882faa9c..54e9043674 100755
--- a/scripts/ci/org.centos/stream/8/x86_64/configure
+++ b/scripts/ci/org.centos/stream/8/x86_64/configure
@@ -137,7 +137,6 @@
 --disable-vhost-vdpa \
 --disable-virglrenderer \
 --disable-virtfs \
---disable-virtiofsd \
 --disable-vnc \
 --disable-vnc-jpeg \
 --disable-png \
@@ -190,7 +189,6 @@
 --enable-tpm \
 --enable-trace-backends=dtrace \
 --enable-usb-redir \
---enable-virtiofsd \
 --enable-vhost-kernel \
 --enable-vhost-net \
 --enable-vhost-user \
diff --git a/scripts/coverity-scan/COMPONENTS.md 
b/scripts/coverity-scan/COMPONENTS.md
index 0e6ab4936e..639dcee45a 100644
--- a/scripts/coverity-scan/COMPONENTS.md
+++ b/scripts/coverity-scan/COMPONENTS.md
@@ -132,9 +132,6 @@ util
 xen
   ~ (/qemu)?(.*/xen.*)
 
-virtiofsd
-  ~ (/qemu)?(/tools/virtiofsd/.*)
-
 (headers)
   ~ (/qemu)?(/include/.*)
 
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
index

[PATCH 0/3] Remove C virtiofsd

2023-01-18 Thread Dr. David Alan Gilbert (git)

From: "Dr. David Alan Gilbert" 

We deprecated the C virtiofsd in commit 34deee7b6a1418f3d62a
in v7.0 in favour of the Rust implementation at

  https://gitlab.com/virtio-fs/virtiofsd

since then, the Rust version has had more development and
has held up well.  It's time to say goodbye to the C version
that got us going.

The only thing I've not cleaned up here is
  tests/avocado/virtiofs_submounts.py

which I guess needs to figure out where the virtiofsd implementation
is and use it; suggestions welcome.

Dave


Dr. David Alan Gilbert (3):
  virtiofsd: Remove build and docs glue
  virtiofsd: Remove source
  virtiofsd: Swing deprecated message to removed-features

 MAINTAINERS   |2 -
 docs/about/deprecated.rst |   18 -
 docs/about/removed-features.rst   |   13 +
 docs/conf.py  |4 -
 docs/meson.build  |1 -
 docs/tools/index.rst  |1 -
 docs/tools/virtiofsd.rst  |  403 --
 meson.build   |1 -
 meson_options.txt |2 -
 .../ci/org.centos/stream/8/x86_64/configure   |2 -
 scripts/coverity-scan/COMPONENTS.md   |3 -
 scripts/meson-buildoptions.sh |3 -
 tools/meson.build |   13 -
 tools/virtiofsd/50-qemu-virtiofsd.json.in |5 -
 tools/virtiofsd/buffer.c  |  350 --
 tools/virtiofsd/fuse_common.h |  837 ---
 tools/virtiofsd/fuse_i.h  |  107 -
 tools/virtiofsd/fuse_log.c|   40 -
 tools/virtiofsd/fuse_log.h|   75 -
 tools/virtiofsd/fuse_lowlevel.c   | 2732 --
 tools/virtiofsd/fuse_lowlevel.h   | 1988 
 tools/virtiofsd/fuse_misc.h   |   59 -
 tools/virtiofsd/fuse_opt.c|  446 --
 tools/virtiofsd/fuse_opt.h|  272 -
 tools/virtiofsd/fuse_signals.c|   93 -
 tools/virtiofsd/fuse_virtio.c | 1081 
 tools/virtiofsd/fuse_virtio.h |   33 -
 tools/virtiofsd/helper.c  |  409 --
 tools/virtiofsd/meson.build   |   18 -
 tools/virtiofsd/passthrough_helpers.h |   51 -
 tools/virtiofsd/passthrough_ll.c  | 4521 -
 tools/virtiofsd/passthrough_seccomp.c |  182 -
 tools/virtiofsd/passthrough_seccomp.h |   14 -
 33 files changed, 13 insertions(+), 13766 deletions(-)
 delete mode 100644 docs/tools/virtiofsd.rst
 delete mode 100644 tools/virtiofsd/50-qemu-virtiofsd.json.in
 delete mode 100644 tools/virtiofsd/buffer.c
 delete mode 100644 tools/virtiofsd/fuse_common.h
 delete mode 100644 tools/virtiofsd/fuse_i.h
 delete mode 100644 tools/virtiofsd/fuse_log.c
 delete mode 100644 tools/virtiofsd/fuse_log.h
 delete mode 100644 tools/virtiofsd/fuse_lowlevel.c
 delete mode 100644 tools/virtiofsd/fuse_lowlevel.h
 delete mode 100644 tools/virtiofsd/fuse_misc.h
 delete mode 100644 tools/virtiofsd/fuse_opt.c
 delete mode 100644 tools/virtiofsd/fuse_opt.h
 delete mode 100644 tools/virtiofsd/fuse_signals.c
 delete mode 100644 tools/virtiofsd/fuse_virtio.c
 delete mode 100644 tools/virtiofsd/fuse_virtio.h
 delete mode 100644 tools/virtiofsd/helper.c
 delete mode 100644 tools/virtiofsd/meson.build
 delete mode 100644 tools/virtiofsd/passthrough_helpers.h
 delete mode 100644 tools/virtiofsd/passthrough_ll.c
 delete mode 100644 tools/virtiofsd/passthrough_seccomp.c
 delete mode 100644 tools/virtiofsd/passthrough_seccomp.h

-- 
2.39.0

[PATCH] MAINTAINERS: Remove and change David Gilbert maintainer entries

2023-03-30 Thread Dr. David Alan Gilbert (git)

From: "Dr. David Alan Gilbert" 

I'm leaving Red Hat next week, so clean up the maintainer entries.

'virtiofs' is just the device code now, so is pretty small, and
Stefan is still a maintainer there.

'migration' still has Juan.

For 'HMP' I'll swing that over to my personal email.

Signed-off-by: Dr. David Alan Gilbert 
---
 MAINTAINERS | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index ef45b5e71e..f0f7fb3746 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2119,7 +2119,6 @@ T: git https://github.com/borntraeger/qemu.git s390-next
 L: qemu-s3...@nongnu.org
 
 virtiofs
-M: Dr. David Alan Gilbert 
 M: Stefan Hajnoczi 
 S: Supported
 F: hw/virtio/vhost-user-fs*
@@ -2863,7 +2862,7 @@ F: tests/unit/test-rcu-*.c
 F: util/rcu.c
 
 Human Monitor (HMP)
-M: Dr. David Alan Gilbert 
+M: Dr. David Alan Gilbert 
 S: Maintained
 F: monitor/monitor-internal.h
 F: monitor/misc.c
@@ -3136,7 +3135,6 @@ F: scripts/checkpatch.pl
 
 Migration
 M: Juan Quintela 
-M: Dr. David Alan Gilbert 
 S: Maintained
 F: hw/core/vmstate-if.c
 F: include/hw/vmstate-if.h
-- 
2.39.2

[PULL 02/33] QIOChannelSocket: Introduce assert and reduce ifdefs to improve readability

2022-06-22 Thread Dr. David Alan Gilbert (git)

From: Leonardo Bras 

During implementation of MSG_ZEROCOPY feature, a lot of #ifdefs were
introduced, particularly at qio_channel_socket_writev().

Rewrite some of those changes so it's easier to read.

Also, introduce an assert to help detect incorrect zero-copy usage when
it's disabled at build time.

Signed-off-by: Leonardo Bras 
Reviewed-by: Daniel P. Berrangé 
Reviewed-by: Juan Quintela 
Reviewed-by: Peter Xu 
Signed-off-by: Juan Quintela 
Signed-off-by: Dr. David Alan Gilbert 
  dgilbert: Fixed up thinko'd g_assert_unreachable->g_assert_not_reached
---
 io/channel-socket.c | 14 +-
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/io/channel-socket.c b/io/channel-socket.c
index dc9c165de1..b8c13dba7c 100644
--- a/io/channel-socket.c
+++ b/io/channel-socket.c
@@ -578,11 +578,17 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
 memcpy(CMSG_DATA(cmsg), fds, fdsize);
 }
 
-#ifdef QEMU_MSG_ZEROCOPY
 if (flags & QIO_CHANNEL_WRITE_FLAG_ZERO_COPY) {
+#ifdef QEMU_MSG_ZEROCOPY
 sflags = MSG_ZEROCOPY;
-}
+#else
+/*
+ * We expect QIOChannel class entry point to have
+ * blocked this code path already
+ */
+g_assert_not_reached();
 #endif
+}
 
  retry:
 ret = sendmsg(sioc->fd, &msg, sflags);
@@ -592,15 +598,13 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
 return QIO_CHANNEL_ERR_BLOCK;
 case EINTR:
 goto retry;
-#ifdef QEMU_MSG_ZEROCOPY
 case ENOBUFS:
-if (sflags & MSG_ZEROCOPY) {
+if (flags & QIO_CHANNEL_WRITE_FLAG_ZERO_COPY) {
 error_setg_errno(errp, errno,
  "Process can't lock enough memory for using 
MSG_ZEROCOPY");
 return -1;
 }
 break;
-#endif
 }
 
 error_setg_errno(errp, errno,
-- 
2.36.1

[PULL 01/33] migration: Remove RDMA_UNREGISTRATION_EXAMPLE

2022-06-22 Thread Dr. David Alan Gilbert (git)

From: Juan Quintela 

Nobody has ever shown up to unregister individual pages, and another
set of patches written by Daniel P. Berrangé
just removes the qemu_rdma_signal_unregister() function needed here.

Signed-off-by: Juan Quintela 
Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Dr. David Alan Gilbert 
---
 migration/rdma.c | 41 -
 1 file changed, 41 deletions(-)

diff --git a/migration/rdma.c b/migration/rdma.c
index 672d1958a9..8504152f39 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -1370,30 +1370,6 @@ const char *print_wrid(int wrid)
 return wrid_desc[wrid];
 }
 
-/*
- * RDMA requires memory registration (mlock/pinning), but this is not good for
- * overcommitment.
- *
- * In preparation for the future where LRU information or workload-specific
- * writable writable working set memory access behavior is available to QEMU
- * it would be nice to have in place the ability to UN-register/UN-pin
- * particular memory regions from the RDMA hardware when it is determine that
- * those regions of memory will likely not be accessed again in the near 
future.
- *
- * While we do not yet have such information right now, the following
- * compile-time option allows us to perform a non-optimized version of this
- * behavior.
- *
- * By uncommenting this option, you will cause *all* RDMA transfers to be
- * unregistered immediately after the transfer completes on both sides of the
- * connection. This has no effect in 'rdma-pin-all' mode, only regular mode.
- *
- * This will have a terrible impact on migration performance, so until future
- * workload information or LRU information is available, do not attempt to use
- * this feature except for basic testing.
- */
-/* #define RDMA_UNREGISTRATION_EXAMPLE */
-
 /*
  * Perform a non-optimized memory unregistration after every transfer
  * for demonstration purposes, only if pin-all is not requested.
@@ -1571,18 +1547,6 @@ static uint64_t qemu_rdma_poll(RDMAContext *rdma, struct 
ibv_cq *cq,
 if (rdma->nb_sent > 0) {
 rdma->nb_sent--;
 }
-
-if (!rdma->pin_all) {
-/*
- * FYI: If one wanted to signal a specific chunk to be unregistered
- * using LRU or workload-specific information, this is the function
- * you would call to do so. That chunk would then get 
asynchronously
- * unregistered later.
- */
-#ifdef RDMA_UNREGISTRATION_EXAMPLE
-qemu_rdma_signal_unregister(rdma, index, chunk, wc.wr_id);
-#endif
-}
 } else {
 trace_qemu_rdma_poll_other(print_wrid(wr_id), wr_id, rdma->nb_sent);
 }
@@ -2137,11 +2101,6 @@ retry:
 
 chunk_end = ram_chunk_end(block, chunk + chunks);
 
-if (!rdma->pin_all) {
-#ifdef RDMA_UNREGISTRATION_EXAMPLE
-qemu_rdma_unregister_waiting(rdma);
-#endif
-}
 
 while (test_bit(chunk, block->transit_bitmap)) {
 (void)count;
-- 
2.36.1

[PULL 03/33] QIOChannelSocket: Fix zero-copy send so socket flush works

2022-06-22 Thread Dr. David Alan Gilbert (git)

From: Leonardo Bras 

Somewhere between v6 and v7 of the zero-copy-send patchset a crucial
part of the flushing mechanism went missing: incrementing zero_copy_queued.

Without that, the flushing interface becomes a no-op, and there is no
guarantee the buffer is really sent.

This can go as bad as causing a corruption in RAM during migration.

Fixes: 2bc58ffc2926 ("QIOChannelSocket: Implement io_writev zero copy flag & 
io_flush for CONFIG_LINUX")
Reported-by: 徐闯 
Signed-off-by: Leonardo Bras 
Reviewed-by: Daniel P. Berrangé 
Reviewed-by: Peter Xu 
Reviewed-by: Juan Quintela 
Signed-off-by: Juan Quintela 
Signed-off-by: Dr. David Alan Gilbert 
---
 io/channel-socket.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/io/channel-socket.c b/io/channel-socket.c
index b8c13dba7c..4466bb1cd4 100644
--- a/io/channel-socket.c
+++ b/io/channel-socket.c
@@ -611,6 +611,11 @@ static ssize_t qio_channel_socket_writev(QIOChannel *ioc,
  "Unable to write to socket");
 return -1;
 }
+
+if (flags & QIO_CHANNEL_WRITE_FLAG_ZERO_COPY) {
+sioc->zero_copy_queued++;
+}
+
 return ret;
 }
 #else /* WIN32 */
-- 
2.36.1

[PULL 13/33] migration: introduce a QIOChannel impl for BlockDriverState VMState

2022-06-22 Thread Dr. David Alan Gilbert (git)

From: Daniel P. Berrangé 

Introduce a QIOChannelBlock class that exposes the BlockDriverState
VMState region for I/O.

This is kept in the migration/ directory rather than io/, to avoid
a mutual dependency between block/ <-> io/ directories. Also the
VMState should only be used by the migration code.

Signed-off-by: Daniel P. Berrangé 
Reviewed-by: Juan Quintela 
Signed-off-by: Juan Quintela 
Signed-off-by: Dr. David Alan Gilbert 
  dgilbert: Fixed coding style in qio_channel_block_close
---
 migration/channel-block.c | 195 ++
 migration/channel-block.h |  59 
 migration/meson.build |   1 +
 3 files changed, 255 insertions(+)
 create mode 100644 migration/channel-block.c
 create mode 100644 migration/channel-block.h

diff --git a/migration/channel-block.c b/migration/channel-block.c
new file mode 100644
index 00..c55c8c93ce
--- /dev/null
+++ b/migration/channel-block.c
@@ -0,0 +1,195 @@
+/*
+ * QEMU I/O channels block driver
+ *
+ * Copyright (c) 2022 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see .
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "migration/channel-block.h"
+#include "qapi/error.h"
+#include "block/block.h"
+#include "trace.h"
+
+QIOChannelBlock *
+qio_channel_block_new(BlockDriverState *bs)
+{
+QIOChannelBlock *ioc;
+
+ioc = QIO_CHANNEL_BLOCK(object_new(TYPE_QIO_CHANNEL_BLOCK));
+
+bdrv_ref(bs);
+ioc->bs = bs;
+
+return ioc;
+}
+
+
+static void
+qio_channel_block_finalize(Object *obj)
+{
+QIOChannelBlock *ioc = QIO_CHANNEL_BLOCK(obj);
+
+g_clear_pointer(&ioc->bs, bdrv_unref);
+}
+
+
+static ssize_t
+qio_channel_block_readv(QIOChannel *ioc,
+const struct iovec *iov,
+size_t niov,
+int **fds,
+size_t *nfds,
+Error **errp)
+{
+QIOChannelBlock *bioc = QIO_CHANNEL_BLOCK(ioc);
+QEMUIOVector qiov;
+int ret;
+
+qemu_iovec_init_external(&qiov, (struct iovec *)iov, niov);
+ret = bdrv_readv_vmstate(bioc->bs, &qiov, bioc->offset);
+if (ret < 0) {
+return ret;
+}
+
+bioc->offset += qiov.size;
+return qiov.size;
+}
+
+
+static ssize_t
+qio_channel_block_writev(QIOChannel *ioc,
+ const struct iovec *iov,
+ size_t niov,
+ int *fds,
+ size_t nfds,
+ int flags,
+ Error **errp)
+{
+QIOChannelBlock *bioc = QIO_CHANNEL_BLOCK(ioc);
+QEMUIOVector qiov;
+int ret;
+
+qemu_iovec_init_external(&qiov, (struct iovec *)iov, niov);
+ret = bdrv_writev_vmstate(bioc->bs, &qiov, bioc->offset);
+if (ret < 0) {
+return ret;
+}
+
+bioc->offset += qiov.size;
+return qiov.size;
+}
+
+
+static int
+qio_channel_block_set_blocking(QIOChannel *ioc,
+   bool enabled,
+   Error **errp)
+{
+if (!enabled) {
+error_setg(errp, "Non-blocking mode not supported for block devices");
+return -1;
+}
+return 0;
+}
+
+
+static off_t
+qio_channel_block_seek(QIOChannel *ioc,
+   off_t offset,
+   int whence,
+   Error **errp)
+{
+QIOChannelBlock *bioc = QIO_CHANNEL_BLOCK(ioc);
+
+switch (whence) {
+case SEEK_SET:
+bioc->offset = offset;
+break;
+case SEEK_CUR:
+bioc->offset += whence;
+break;
+case SEEK_END:
+error_setg(errp, "Size of VMstate region is unknown");
+return (off_t)-1;
+default:
+g_assert_not_reached();
+}
+
+return bioc->offset;
+}
+
+
+static int
+qio_channel_block_close(QIOChannel *ioc,
+Error **errp)
+{
+QIOChannelBlock *bioc = QIO_CHANNEL_BLOCK(ioc);
+int rv = bdrv_flush(bioc->bs);
+
+if (rv < 0) {
+error_setg_errno(errp, -rv,
+ "Unable to flush VMState");
+return -1;
+}
+
+g_clear_pointer(&bioc->bs, bdrv_unref);
+bioc->offset = 0;
+
+return 0;
+}
+
+
+static void
+qio_channel_block_set_aio_fd_handler(QIOChannel *ioc,
+ AioContext *ctx,
+ IOHandler *io_read,
+

[PULL 10/33] migration: rename qemu_ftell to qemu_file_total_transferred

2022-06-22 Thread Dr. David Alan Gilbert (git)

From: Daniel P. Berrangé 

The name 'ftell' gives the misleading impression that the QEMUFile
objects are seekable. This is not the case, as in general we just
have an opaque stream. The users of this method are only interested
in the total bytes processed. This switches to a new name that
reflects the intended usage.

Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Daniel P. Berrangé 
Reviewed-by: Juan Quintela 
Signed-off-by: Juan Quintela 
Signed-off-by: Dr. David Alan Gilbert 
   dgilbert: Wrapped long line
---
 migration/block.c | 10 +-
 migration/migration.c |  3 ++-
 migration/qemu-file.c |  4 ++--
 migration/qemu-file.h | 33 +++--
 migration/savevm.c|  6 +++---
 migration/vmstate.c   |  5 +++--
 6 files changed, 46 insertions(+), 15 deletions(-)

diff --git a/migration/block.c b/migration/block.c
index 077a413325..823453c977 100644
--- a/migration/block.c
+++ b/migration/block.c
@@ -756,8 +756,8 @@ static int block_save_setup(QEMUFile *f, void *opaque)
 static int block_save_iterate(QEMUFile *f, void *opaque)
 {
 int ret;
-int64_t last_ftell = qemu_ftell(f);
-int64_t delta_ftell;
+int64_t last_bytes = qemu_file_total_transferred(f);
+int64_t delta_bytes;
 
 trace_migration_block_save("iterate", block_mig_state.submitted,
block_mig_state.transferred);
@@ -809,10 +809,10 @@ static int block_save_iterate(QEMUFile *f, void *opaque)
 }
 
 qemu_put_be64(f, BLK_MIG_FLAG_EOS);
-delta_ftell = qemu_ftell(f) - last_ftell;
-if (delta_ftell > 0) {
+delta_bytes = qemu_file_total_transferred(f) - last_bytes;
+if (delta_bytes > 0) {
 return 1;
-} else if (delta_ftell < 0) {
+} else if (delta_bytes < 0) {
 return -1;
 } else {
 return 0;
diff --git a/migration/migration.c b/migration/migration.c
index 5863af1b13..6d56eb1617 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -3539,7 +3539,8 @@ static MigThrError migration_detect_error(MigrationState 
*s)
 /* How many bytes have we transferred since the beginning of the migration */
 static uint64_t migration_total_bytes(MigrationState *s)
 {
-return qemu_ftell(s->to_dst_file) + ram_counters.multifd_bytes;
+return qemu_file_total_transferred(s->to_dst_file) +
+ram_counters.multifd_bytes;
 }
 
 static void migration_calculate_complete(MigrationState *s)
diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index eabc2d7c6e..7ee9b5bf05 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -657,7 +657,7 @@ int qemu_get_byte(QEMUFile *f)
 return result;
 }
 
-int64_t qemu_ftell_fast(QEMUFile *f)
+int64_t qemu_file_total_transferred_fast(QEMUFile *f)
 {
 int64_t ret = f->total_transferred;
 int i;
@@ -669,7 +669,7 @@ int64_t qemu_ftell_fast(QEMUFile *f)
 return ret;
 }
 
-int64_t qemu_ftell(QEMUFile *f)
+int64_t qemu_file_total_transferred(QEMUFile *f)
 {
 qemu_fflush(f);
 return f->total_transferred;
diff --git a/migration/qemu-file.h b/migration/qemu-file.h
index 3f36d4dc8c..05f6aef903 100644
--- a/migration/qemu-file.h
+++ b/migration/qemu-file.h
@@ -124,8 +124,37 @@ QEMUFile *qemu_fopen_ops(void *opaque, const QEMUFileOps 
*ops, bool has_ioc);
 void qemu_file_set_hooks(QEMUFile *f, const QEMUFileHooks *hooks);
 int qemu_get_fd(QEMUFile *f);
 int qemu_fclose(QEMUFile *f);
-int64_t qemu_ftell(QEMUFile *f);
-int64_t qemu_ftell_fast(QEMUFile *f);
+
+/*
+ * qemu_file_total_transferred:
+ *
+ * Report the total number of bytes transferred with
+ * this file.
+ *
+ * For writable files, any pending buffers will be
+ * flushed, so the reported value will be equal to
+ * the number of bytes transferred on the wire.
+ *
+ * For readable files, the reported value will be
+ * equal to the number of bytes transferred on the
+ * wire.
+ *
+ * Returns: the total bytes transferred
+ */
+int64_t qemu_file_total_transferred(QEMUFile *f);
+
+/*
+ * qemu_file_total_transferred_fast:
+ *
+ * As qemu_file_total_transferred except for writable
+ * files, where no flush is performed and the reported
+ * amount will include the size of any queued buffers,
+ * on top of the amount actually transferred.
+ *
+ * Returns: the total bytes transferred and queued
+ */
+int64_t qemu_file_total_transferred_fast(QEMUFile *f);
+
 /*
  * put_buffer without copying the buffer.
  * The buffer should be available till it is sent asynchronously.
diff --git a/migration/savevm.c b/migration/savevm.c
index d9076897b8..75d05f1a84 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -916,9 +916,9 @@ static void vmstate_save_old_style(QEMUFile *f, 
SaveStateEntry *se,
 {
 int64_t old_offset, size;
 
-old_offset = qemu_ftell_fast(f);
+old_offset = qemu_file_total_transferred_fast(f);
 se->ops->save_state(f, se->opaque);
-size = qemu_ftell_fast(f) - old_offset;
+size = qemu_file_total_transferred_fast(f) - old_offset;
 
 if (vmdesc) {
 json_w

[PULL 04/33] migration: Change zero_copy_send from migration parameter to migration capability

2022-06-22 Thread Dr. David Alan Gilbert (git)

From: Leonardo Bras 

When originally implemented, zero_copy_send was designed as a Migration
parameter.

But taking into account how it is supposed to work, and the
difference between a capability and a parameter, it only makes sense
that zero-copy-send would work better as a capability.

Taking into account how recently the change got merged, it was decided
that it's still time to make it right, and convert zero_copy_send into
a Migration capability.

Signed-off-by: Leonardo Bras 
Reviewed-by: Juan Quintela 
Acked-by: Markus Armbruster 
Acked-by: Peter Xu 
Signed-off-by: Juan Quintela 
Signed-off-by: Dr. David Alan Gilbert 
  dgilbert: always define the capability, even on non-Linux but error if
set; avoids build problems with the capability
---
 migration/migration.c | 58 +++
 monitor/hmp-cmds.c|  6 -
 qapi/migration.json   | 33 +++-
 3 files changed, 34 insertions(+), 63 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index 31739b2af9..5863af1b13 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -163,7 +163,8 @@ INITIALIZE_MIGRATE_CAPS_SET(check_caps_background_snapshot,
 MIGRATION_CAPABILITY_COMPRESS,
 MIGRATION_CAPABILITY_XBZRLE,
 MIGRATION_CAPABILITY_X_COLO,
-MIGRATION_CAPABILITY_VALIDATE_UUID);
+MIGRATION_CAPABILITY_VALIDATE_UUID,
+MIGRATION_CAPABILITY_ZERO_COPY_SEND);
 
 /* When we add fault tolerance, we could have several
migrations at once.  For now we don't need to add
@@ -910,10 +911,6 @@ MigrationParameters *qmp_query_migrate_parameters(Error 
**errp)
 params->multifd_zlib_level = s->parameters.multifd_zlib_level;
 params->has_multifd_zstd_level = true;
 params->multifd_zstd_level = s->parameters.multifd_zstd_level;
-#ifdef CONFIG_LINUX
-params->has_zero_copy_send = true;
-params->zero_copy_send = s->parameters.zero_copy_send;
-#endif
 params->has_xbzrle_cache_size = true;
 params->xbzrle_cache_size = s->parameters.xbzrle_cache_size;
 params->has_max_postcopy_bandwidth = true;
@@ -1275,6 +1272,24 @@ static bool migrate_caps_check(bool *cap_list,
 }
 }
 
+#ifdef CONFIG_LINUX
+if (cap_list[MIGRATION_CAPABILITY_ZERO_COPY_SEND] &&
+(!cap_list[MIGRATION_CAPABILITY_MULTIFD] ||
+ migrate_use_compression() ||
+ migrate_use_tls())) {
+error_setg(errp,
+   "Zero copy only available for non-compressed non-TLS 
multifd migration");
+return false;
+}
+#else
+if (cap_list[MIGRATION_CAPABILITY_ZERO_COPY_SEND]) {
+error_setg(errp,
+   "Zero copy currently only available on Linux");
+return false;
+}
+#endif
+
+
 /* incoming side only */
 if (runstate_check(RUN_STATE_INMIGRATE) &&
 !migrate_multi_channels_is_allowed() &&
@@ -1497,16 +1512,6 @@ static bool migrate_params_check(MigrationParameters 
*params, Error **errp)
 error_prepend(errp, "Invalid mapping given for block-bitmap-mapping: 
");
 return false;
 }
-#ifdef CONFIG_LINUX
-if (params->zero_copy_send &&
-(!migrate_use_multifd() ||
- params->multifd_compression != MULTIFD_COMPRESSION_NONE ||
- (params->tls_creds && *params->tls_creds))) {
-error_setg(errp,
-   "Zero copy only available for non-compressed non-TLS 
multifd migration");
-return false;
-}
-#endif
 return true;
 }
 
@@ -1580,11 +1585,6 @@ static void 
migrate_params_test_apply(MigrateSetParameters *params,
 if (params->has_multifd_compression) {
 dest->multifd_compression = params->multifd_compression;
 }
-#ifdef CONFIG_LINUX
-if (params->has_zero_copy_send) {
-dest->zero_copy_send = params->zero_copy_send;
-}
-#endif
 if (params->has_xbzrle_cache_size) {
 dest->xbzrle_cache_size = params->xbzrle_cache_size;
 }
@@ -1697,11 +1697,6 @@ static void migrate_params_apply(MigrateSetParameters 
*params, Error **errp)
 if (params->has_multifd_compression) {
 s->parameters.multifd_compression = params->multifd_compression;
 }
-#ifdef CONFIG_LINUX
-if (params->has_zero_copy_send) {
-s->parameters.zero_copy_send = params->zero_copy_send;
-}
-#endif
 if (params->has_xbzrle_cache_size) {
 s->parameters.xbzrle_cache_size = params->xbzrle_cache_size;
 xbzrle_cache_resize(params->xbzrle_cache_size, errp);
@@ -2593,7 +2588,7 @@ bool migrate_use_zero_copy_send(void)
 
 s = migrate_get_current();
 
-return s->parameters.zero_copy_send;
+return s->enabled_capabilities[MIGRATION_CAPABILITY_ZERO_COPY_SEND];
 }
 #endif
 
@@ -4249,10 +4244,6 @@ static Property migration_properties[] = {
 DEFINE_PROP_UINT8("multifd-zstd-level", MigrationState,
   parameters.multifd_zstd_level,
   DEFAULT_MIGRATE_MULTIFD_ZSTD_LEVEL),
-#ifdef CONFIG_LINUX
-DEFINE_PROP_BOOL("zero_copy_sen

[PULL 00/33] migration queue

2022-06-22 Thread Dr. David Alan Gilbert (git)

From: "Dr. David Alan Gilbert" 

The following changes since commit 2b049d2c8dc01de750410f8f1a4eac498c04c723:

  Merge tag 'pull-aspeed-20220622' of https://github.com/legoater/qemu into 
staging (2022-06-22 07:27:06 -0700)

are available in the Git repository at:

  https://gitlab.com/dagrh/qemu.git tags/pull-migration-20220622b

for you to fetch changes up to 9c6eb6dc3785a280b504195d308da082641af2a7:

  tests: Add dirty page rate limit test (2022-06-22 19:33:43 +0100)


Migration pull 2022-06-22

Compared to Juan's pull:
  a) Hopefully fixed non-Linux builds
(Local build test on mingw64 works
Note: the zero-copy capability is now
defined on non-Linux systems)
  b) Added Hyman's series - it had been
on queue for a while (sorry for the delay)
  c) Fixed up a whole bunch of check-patch failures
- please use it!

In today's migration PULL request:
- Daniel Berrangé - qemufileops cleanup
- Leonardo Bras  - cleanups for zero copy
- Juan Quintela  - RDMA cleanups
- Hyman Huang - per-vcpu dirty ring work

Signed-off-by: Dr. David Alan Gilbert 


Daniel P. Berrangé (21):
  io: add a QIOChannelNull equivalent to /dev/null
  migration: switch to use QIOChannelNull for dummy channel
  migration: remove unreachble RDMA code in save_hook impl
  migration: rename rate limiting fields in QEMUFile
  migration: rename 'pos' field in QEMUFile to 'bytes_processed'
  migration: rename qemu_ftell to qemu_file_total_transferred
  migration: rename qemu_update_position to qemu_file_credit_transfer
  migration: rename qemu_file_update_transfer to qemu_file_acct_rate_limit
  migration: introduce a QIOChannel impl for BlockDriverState VMState
  migration: convert savevm to use QIOChannelBlock for VMState
  migration: stop passing 'opaque' parameter to QEMUFile hooks
  migration: hardcode assumption that QEMUFile is backed with QIOChannel
  migration: introduce new constructors for QEMUFile
  migration: remove unused QEMUFileGetFD typedef / qemu_get_fd method
  migration: remove the QEMUFileOps 'shut_down' callback
  migration: remove the QEMUFileOps 'set_blocking' callback
  migration: remove the QEMUFileOps 'close' callback
  migration: remove the QEMUFileOps 'get_buffer' callback
  migration: remove the QEMUFileOps 'writev_buffer' callback
  migration: remove the QEMUFileOps 'get_return_path' callback
  migration: remove the QEMUFileOps abstraction

Hyman Huang (8):
  accel/kvm/kvm-all: Refactor per-vcpu dirty ring reaping
  cpus: Introduce cpu_list_generation_id
  migration/dirtyrate: Refactor dirty page rate calculation
  softmmu/dirtylimit: Implement vCPU dirtyrate calculation periodically
  accel/kvm/kvm-all: Introduce kvm_dirty_ring_size function
  softmmu/dirtylimit: Implement virtual CPU throttle
  softmmu/dirtylimit: Implement dirty page rate limit
  tests: Add dirty page rate limit test

Juan Quintela (1):
  migration: Remove RDMA_UNREGISTRATION_EXAMPLE

Leonardo Bras (3):
  QIOChannelSocket: Introduce assert and reduce ifdefs to improve 
readability
  QIOChannelSocket: Fix zero-copy send so socket flush works
  migration: Change zero_copy_send from migration parameter to migration 
capability

 accel/kvm/kvm-all.c   |  46 ++-
 accel/stubs/kvm-stub.c|   5 +
 cpus-common.c |   8 +
 hmp-commands-info.hx  |  13 +
 hmp-commands.hx   |  32 ++
 include/exec/cpu-common.h |   1 +
 include/exec/memory.h |   5 +-
 include/hw/core/cpu.h |   6 +
 include/io/channel-null.h |  55 
 include/monitor/hmp.h |   3 +
 include/sysemu/dirtylimit.h   |  37 +++
 include/sysemu/dirtyrate.h|  28 ++
 include/sysemu/kvm.h  |   2 +
 io/channel-null.c | 237 +++
 io/channel-socket.c   |  19 +-
 io/meson.build|   1 +
 io/trace-events   |   3 +
 migration/block.c |  10 +-
 migration/channel-block.c | 195 +
 migration/channel-block.h |  59 
 migration/channel.c   |   4 +-
 migration/colo.c  |   5 +-
 migration/dirtyrate.c | 227 --
 migration/dirtyrate.h |   7 +-
 migration/meson.build |   2 +-
 migration/migration.c |  68 ++---
 migration/multifd.c   |   4 +-
 migration/qemu-file-channel.c | 194 
 migration/qemu-file-channel.h |  32 --
 migration/qemu-file.c | 193 ++--
 migration/qemu-file.h | 125 
 migration/ram.c   |   8 +-
 migration/rdma.c  | 185 +++-
 migration/savevm.c|  55 +---
 migrat

[PULL 12/33] migration: rename qemu_file_update_transfer to qemu_file_acct_rate_limit

2022-06-22 Thread Dr. David Alan Gilbert (git)

From: Daniel P. Berrangé 

The qemu_file_update_transfer name doesn't give a clear guide on what
its purpose is, and how it differs from the qemu_file_credit_transfer
method. The latter is specifically for accumulating the total migration
traffic, while the former is specifically for accounting in the rate
limit calculations. The new name gives better guidance on its usage.

Signed-off-by: Daniel P. Berrangé 
Reviewed-by: Dr. David Alan Gilbert 
Reviewed-by: Juan Quintela 
Signed-off-by: Juan Quintela 
Signed-off-by: Dr. David Alan Gilbert 
---
 migration/multifd.c   | 4 ++--
 migration/qemu-file.c | 2 +-
 migration/qemu-file.h | 9 -
 3 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/migration/multifd.c b/migration/multifd.c
index 9282ab6aa4..684c014c86 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -435,7 +435,7 @@ static int multifd_send_pages(QEMUFile *f)
 p->pages = pages;
 transferred = ((uint64_t) pages->num) * qemu_target_page_size()
 + p->packet_len;
-qemu_file_update_transfer(f, transferred);
+qemu_file_acct_rate_limit(f, transferred);
 ram_counters.multifd_bytes += transferred;
 ram_counters.transferred += transferred;
 qemu_mutex_unlock(&p->mutex);
@@ -610,7 +610,7 @@ int multifd_send_sync_main(QEMUFile *f)
 p->packet_num = multifd_send_state->packet_num++;
 p->flags |= MULTIFD_FLAG_SYNC;
 p->pending_job++;
-qemu_file_update_transfer(f, p->packet_len);
+qemu_file_acct_rate_limit(f, p->packet_len);
 ram_counters.multifd_bytes += p->packet_len;
 ram_counters.transferred += p->packet_len;
 qemu_mutex_unlock(&p->mutex);
diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index f73b010d39..7fe0d9fa30 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -704,7 +704,7 @@ void qemu_file_reset_rate_limit(QEMUFile *f)
 f->rate_limit_used = 0;
 }
 
-void qemu_file_update_transfer(QEMUFile *f, int64_t len)
+void qemu_file_acct_rate_limit(QEMUFile *f, int64_t len)
 {
 f->rate_limit_used += len;
 }
diff --git a/migration/qemu-file.h b/migration/qemu-file.h
index d96f5f7118..901f2cf697 100644
--- a/migration/qemu-file.h
+++ b/migration/qemu-file.h
@@ -188,7 +188,14 @@ void qemu_file_skip(QEMUFile *f, int size);
  */
 void qemu_file_credit_transfer(QEMUFile *f, size_t size);
 void qemu_file_reset_rate_limit(QEMUFile *f);
-void qemu_file_update_transfer(QEMUFile *f, int64_t len);
+/*
+ * qemu_file_acct_rate_limit:
+ *
+ * Report on a number of bytes the have been transferred
+ * out of band from the main file object I/O methods, and
+ * need to be applied to the rate limiting calcuations
+ */
+void qemu_file_acct_rate_limit(QEMUFile *f, int64_t len);
 void qemu_file_set_rate_limit(QEMUFile *f, int64_t new_rate);
 int64_t qemu_file_get_rate_limit(QEMUFile *f);
 int qemu_file_get_error_obj(QEMUFile *f, Error **errp);
-- 
2.36.1

[PULL 05/33] io: add a QIOChannelNull equivalent to /dev/null

2022-06-22 Thread Dr. David Alan Gilbert (git)

From: Daniel P. Berrangé 

This is for code which needs a portable equivalent to a QIOChannelFile
connected to /dev/null.

Signed-off-by: Daniel P. Berrangé 
Reviewed-by: Juan Quintela 
Signed-off-by: Juan Quintela 
Signed-off-by: Dr. David Alan Gilbert 
---
 include/io/channel-null.h |  55 +++
 io/channel-null.c | 237 ++
 io/meson.build|   1 +
 io/trace-events   |   3 +
 tests/unit/meson.build|   1 +
 tests/unit/test-io-channel-null.c |  95 
 6 files changed, 392 insertions(+)
 create mode 100644 include/io/channel-null.h
 create mode 100644 io/channel-null.c
 create mode 100644 tests/unit/test-io-channel-null.c

diff --git a/include/io/channel-null.h b/include/io/channel-null.h
new file mode 100644
index 00..f6d54e63cf
--- /dev/null
+++ b/include/io/channel-null.h
@@ -0,0 +1,55 @@
+/*
+ * QEMU I/O channels null driver
+ *
+ * Copyright (c) 2022 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see .
+ *
+ */
+
+#ifndef QIO_CHANNEL_FILE_H
+#define QIO_CHANNEL_FILE_H
+
+#include "io/channel.h"
+#include "qom/object.h"
+
+#define TYPE_QIO_CHANNEL_NULL "qio-channel-null"
+OBJECT_DECLARE_SIMPLE_TYPE(QIOChannelNull, QIO_CHANNEL_NULL)
+
+
+/**
+ * QIOChannelNull:
+ *
+ * The QIOChannelNull object provides a channel implementation
+ * that discards all writes and returns EOF for all reads.
+ */
+
+struct QIOChannelNull {
+QIOChannel parent;
+bool closed;
+};
+
+
+/**
+ * qio_channel_null_new:
+ *
+ * Create a new IO channel object that discards all writes
+ * and returns EOF for all reads.
+ *
+ * Returns: the new channel object
+ */
+QIOChannelNull *
+qio_channel_null_new(void);
+
+#endif /* QIO_CHANNEL_NULL_H */
diff --git a/io/channel-null.c b/io/channel-null.c
new file mode 100644
index 00..75e3781507
--- /dev/null
+++ b/io/channel-null.c
@@ -0,0 +1,237 @@
+/*
+ * QEMU I/O channels null driver
+ *
+ * Copyright (c) 2022 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see .
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "io/channel-null.h"
+#include "io/channel-watch.h"
+#include "qapi/error.h"
+#include "trace.h"
+#include "qemu/iov.h"
+
+typedef struct QIOChannelNullSource QIOChannelNullSource;
+struct QIOChannelNullSource {
+GSource parent;
+QIOChannel *ioc;
+GIOCondition condition;
+};
+
+
+QIOChannelNull *
+qio_channel_null_new(void)
+{
+QIOChannelNull *ioc;
+
+ioc = QIO_CHANNEL_NULL(object_new(TYPE_QIO_CHANNEL_NULL));
+
+trace_qio_channel_null_new(ioc);
+
+return ioc;
+}
+
+
+static void
+qio_channel_null_init(Object *obj)
+{
+QIOChannelNull *ioc = QIO_CHANNEL_NULL(obj);
+ioc->closed = false;
+}
+
+
+static ssize_t
+qio_channel_null_readv(QIOChannel *ioc,
+   const struct iovec *iov,
+   size_t niov,
+   int **fds G_GNUC_UNUSED,
+   size_t *nfds G_GNUC_UNUSED,
+   Error **errp)
+{
+QIOChannelNull *nioc = QIO_CHANNEL_NULL(ioc);
+
+if (nioc->closed) {
+error_setg_errno(errp, EINVAL,
+ "Channel is closed");
+return -1;
+}
+
+return 0;
+}
+
+
+static ssize_t
+qio_channel_null_writev(QIOChannel *ioc,
+const struct iovec *iov,
+size_t niov,
+int *fds G_GNUC_UNUSED,
+size_t nfds G_GNUC_UNUSED,
+int flags G_GNUC_UNUSED,
+Error **errp)
+{
+QIOChannelNull *nioc = QIO_CHANNEL_NULL(ioc);
+
+if (nioc->closed) {
+error_setg_errno(errp, EINVAL,
+ "Chan

[PULL 08/33] migration: rename rate limiting fields in QEMUFile

2022-06-22 Thread Dr. David Alan Gilbert (git)

From: Daniel P. Berrangé 

This renames the following QEMUFile fields

 * bytes_xfer -> rate_limit_used
 * xfer_limit -> rate_limit_max

The intent is to make it clear that 'bytes_xfer' is specifically related
to rate limiting of data and applies to data queued, which need not have
been transferred on the wire yet if a flush hasn't taken place.

Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Daniel P. Berrangé 
Reviewed-by: Juan Quintela 
Signed-off-by: Juan Quintela 
Signed-off-by: Dr. David Alan Gilbert 
---
 migration/qemu-file.c | 30 +++---
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index 1479cddad9..03f0b13a55 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -39,8 +39,16 @@ struct QEMUFile {
 const QEMUFileHooks *hooks;
 void *opaque;
 
-int64_t bytes_xfer;
-int64_t xfer_limit;
+/*
+ * Maximum amount of data in bytes to transfer during one
+ * rate limiting time window
+ */
+int64_t rate_limit_max;
+/*
+ * Total amount of data in bytes queued for transfer
+ * during this rate limiting time window
+ */
+int64_t rate_limit_used;
 
 int64_t pos; /* start of buffer when writing, end of buffer
 when reading */
@@ -304,7 +312,7 @@ size_t ram_control_save_page(QEMUFile *f, ram_addr_t 
block_offset,
 int ret = f->hooks->save_page(f, f->opaque, block_offset,
   offset, size, bytes_sent);
 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
-f->bytes_xfer += size;
+f->rate_limit_used += size;
 }
 
 if (ret != RAM_SAVE_CONTROL_DELAYED &&
@@ -457,7 +465,7 @@ void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, 
size_t size,
 return;
 }
 
-f->bytes_xfer += size;
+f->rate_limit_used += size;
 add_to_iovec(f, buf, size, may_free);
 }
 
@@ -475,7 +483,7 @@ void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, 
size_t size)
 l = size;
 }
 memcpy(f->buf + f->buf_index, buf, l);
-f->bytes_xfer += l;
+f->rate_limit_used += l;
 add_buf_to_iovec(f, l);
 if (qemu_file_get_error(f)) {
 break;
@@ -492,7 +500,7 @@ void qemu_put_byte(QEMUFile *f, int v)
 }
 
 f->buf[f->buf_index] = v;
-f->bytes_xfer++;
+f->rate_limit_used++;
 add_buf_to_iovec(f, 1);
 }
 
@@ -674,7 +682,7 @@ int qemu_file_rate_limit(QEMUFile *f)
 if (qemu_file_get_error(f)) {
 return 1;
 }
-if (f->xfer_limit > 0 && f->bytes_xfer > f->xfer_limit) {
+if (f->rate_limit_max > 0 && f->rate_limit_used > f->rate_limit_max) {
 return 1;
 }
 return 0;
@@ -682,22 +690,22 @@ int qemu_file_rate_limit(QEMUFile *f)
 
 int64_t qemu_file_get_rate_limit(QEMUFile *f)
 {
-return f->xfer_limit;
+return f->rate_limit_max;
 }
 
 void qemu_file_set_rate_limit(QEMUFile *f, int64_t limit)
 {
-f->xfer_limit = limit;
+f->rate_limit_max = limit;
 }
 
 void qemu_file_reset_rate_limit(QEMUFile *f)
 {
-f->bytes_xfer = 0;
+f->rate_limit_used = 0;
 }
 
 void qemu_file_update_transfer(QEMUFile *f, int64_t len)
 {
-f->bytes_xfer += len;
+f->rate_limit_used += len;
 }
 
 void qemu_put_be16(QEMUFile *f, unsigned int v)
-- 
2.36.1

[PULL 06/33] migration: switch to use QIOChannelNull for dummy channel

2022-06-22 Thread Dr. David Alan Gilbert (git)

From: Daniel P. Berrangé 

This removes one further custom impl of QEMUFile, in favour of a
QIOChannel based impl.

Reviewed-by: Eric Blake 
Signed-off-by: Daniel P. Berrangé 
Reviewed-by: Juan Quintela 
Signed-off-by: Juan Quintela 
Signed-off-by: Dr. David Alan Gilbert 
---
 migration/ram.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 5f5e37f64d..89082716d6 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -32,11 +32,13 @@
 #include "qemu/bitmap.h"
 #include "qemu/madvise.h"
 #include "qemu/main-loop.h"
+#include "io/channel-null.h"
 #include "xbzrle.h"
 #include "ram.h"
 #include "migration.h"
 #include "migration/register.h"
 #include "migration/misc.h"
+#include "migration/qemu-file-channel.h"
 #include "qemu-file.h"
 #include "postcopy-ram.h"
 #include "page_cache.h"
@@ -457,8 +459,6 @@ static QemuThread *compress_threads;
  */
 static QemuMutex comp_done_lock;
 static QemuCond comp_done_cond;
-/* The empty QEMUFileOps will be used by file in CompressParam */
-static const QEMUFileOps empty_ops = { };
 
 static QEMUFile *decomp_file;
 static DecompressParam *decomp_param;
@@ -569,7 +569,8 @@ static int compress_threads_save_setup(void)
 /* comp_param[i].file is just used as a dummy buffer to save data,
  * set its ops to empty.
  */
-comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops, false);
+comp_param[i].file = qemu_fopen_channel_output(
+QIO_CHANNEL(qio_channel_null_new()));
 comp_param[i].done = true;
 comp_param[i].quit = false;
 qemu_mutex_init(&comp_param[i].mutex);
-- 
2.36.1

[PULL 26/33] accel/kvm/kvm-all: Refactor per-vcpu dirty ring reaping

2022-06-22 Thread Dr. David Alan Gilbert (git)

From: Hyman Huang(黄勇) 

Add an optional 'CPUState' argument to kvm_dirty_ring_reap so
that it can cover the single-vCPU dirty-ring-reaping scenario.

Signed-off-by: Hyman Huang(黄勇) 
Reviewed-by: Peter Xu 
Message-Id: 

Signed-off-by: Dr. David Alan Gilbert 
---
 accel/kvm/kvm-all.c | 23 +--
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index ba3210b1c1..672ed004ab 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -757,17 +757,20 @@ static uint32_t kvm_dirty_ring_reap_one(KVMState *s, 
CPUState *cpu)
 }
 
 /* Must be with slots_lock held */
-static uint64_t kvm_dirty_ring_reap_locked(KVMState *s)
+static uint64_t kvm_dirty_ring_reap_locked(KVMState *s, CPUState* cpu)
 {
 int ret;
-CPUState *cpu;
 uint64_t total = 0;
 int64_t stamp;
 
 stamp = get_clock();
 
-CPU_FOREACH(cpu) {
-total += kvm_dirty_ring_reap_one(s, cpu);
+if (cpu) {
+total = kvm_dirty_ring_reap_one(s, cpu);
+} else {
+CPU_FOREACH(cpu) {
+total += kvm_dirty_ring_reap_one(s, cpu);
+}
 }
 
 if (total) {
@@ -788,7 +791,7 @@ static uint64_t kvm_dirty_ring_reap_locked(KVMState *s)
  * Currently for simplicity, we must hold BQL before calling this.  We can
  * consider to drop the BQL if we're clear with all the race conditions.
  */
-static uint64_t kvm_dirty_ring_reap(KVMState *s)
+static uint64_t kvm_dirty_ring_reap(KVMState *s, CPUState *cpu)
 {
 uint64_t total;
 
@@ -808,7 +811,7 @@ static uint64_t kvm_dirty_ring_reap(KVMState *s)
  * reset below.
  */
 kvm_slots_lock();
-total = kvm_dirty_ring_reap_locked(s);
+total = kvm_dirty_ring_reap_locked(s, cpu);
 kvm_slots_unlock();
 
 return total;
@@ -855,7 +858,7 @@ static void kvm_dirty_ring_flush(void)
  * vcpus out in a synchronous way.
  */
 kvm_cpu_synchronize_kick_all();
-kvm_dirty_ring_reap(kvm_state);
+kvm_dirty_ring_reap(kvm_state, NULL);
 trace_kvm_dirty_ring_flush(1);
 }
 
@@ -1399,7 +1402,7 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
  * Not easy.  Let's cross the fingers until it's fixed.
  */
 if (kvm_state->kvm_dirty_ring_size) {
-kvm_dirty_ring_reap_locked(kvm_state);
+kvm_dirty_ring_reap_locked(kvm_state, NULL);
 } else {
 kvm_slot_get_dirty_log(kvm_state, mem);
 }
@@ -1471,7 +1474,7 @@ static void *kvm_dirty_ring_reaper_thread(void *data)
 r->reaper_state = KVM_DIRTY_RING_REAPER_REAPING;
 
 qemu_mutex_lock_iothread();
-kvm_dirty_ring_reap(s);
+kvm_dirty_ring_reap(s, NULL);
 qemu_mutex_unlock_iothread();
 
 r->reaper_iteration++;
@@ -2967,7 +2970,7 @@ int kvm_cpu_exec(CPUState *cpu)
  */
 trace_kvm_dirty_ring_full(cpu->cpu_index);
 qemu_mutex_lock_iothread();
-kvm_dirty_ring_reap(kvm_state);
+kvm_dirty_ring_reap(kvm_state, NULL);
 qemu_mutex_unlock_iothread();
 ret = 0;
 break;
-- 
2.36.1

[PULL 07/33] migration: remove unreachble RDMA code in save_hook impl

2022-06-22 Thread Dr. David Alan Gilbert (git)

From: Daniel P. Berrangé 

The QEMUFile 'save_hook' callback has a 'size_t size' parameter.

The RDMA impl of this has logic that takes different actions
depending on whether the value is zero or non-zero. It has
commented out logic that would have taken further actions
if the value was negative.

The only place where the 'save_hook' callback is invoked is
the ram_control_save_page() method, which passes 'size'
through from its caller. The only caller of this method is
in turn control_save_page(). This method unconditionally
passes the 'TARGET_PAGE_SIZE' constant for the 'size' parameter.

IOW, the only scenario for 'size' that can execute in the
qemu_rdma_save_page method is 'size > 0'. The remaining code
has been unreachable since RDMA support was first introduced
9 years ago.

Reviewed-by: Eric Blake 
Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Daniel P. Berrangé 
Reviewed-by: Juan Quintela 
Signed-off-by: Juan Quintela 
Signed-off-by: Dr. David Alan Gilbert 
---
 migration/rdma.c | 120 +--
 1 file changed, 21 insertions(+), 99 deletions(-)

diff --git a/migration/rdma.c b/migration/rdma.c
index 8504152f39..c5fa4a408a 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -1462,34 +1462,6 @@ static uint64_t qemu_rdma_make_wrid(uint64_t wr_id, 
uint64_t index,
 return result;
 }
 
-/*
- * Set bit for unregistration in the next iteration.
- * We cannot transmit right here, but will unpin later.
- */
-static void qemu_rdma_signal_unregister(RDMAContext *rdma, uint64_t index,
-uint64_t chunk, uint64_t wr_id)
-{
-if (rdma->unregistrations[rdma->unregister_next] != 0) {
-error_report("rdma migration: queue is full");
-} else {
-RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);
-
-if (!test_and_set_bit(chunk, block->unregister_bitmap)) {
-trace_qemu_rdma_signal_unregister_append(chunk,
- rdma->unregister_next);
-
-rdma->unregistrations[rdma->unregister_next++] =
-qemu_rdma_make_wrid(wr_id, index, chunk);
-
-if (rdma->unregister_next == RDMA_SIGNALED_SEND_MAX) {
-rdma->unregister_next = 0;
-}
-} else {
-trace_qemu_rdma_signal_unregister_already(chunk);
-}
-}
-}
-
 /*
  * Consult the connection manager to see a work request
  * (of any kind) has completed.
@@ -3237,23 +3209,7 @@ qio_channel_rdma_shutdown(QIOChannel *ioc,
  *Offset is an offset to be added to block_offset and used
  *to also lookup the corresponding RAMBlock.
  *
- *@size > 0 :
- *Initiate an transfer this size.
- *
- *@size == 0 :
- *A 'hint' or 'advice' that means that we wish to speculatively
- *and asynchronously unregister this memory. In this case, there is no
- *guarantee that the unregister will actually happen, for example,
- *if the memory is being actively transmitted. Additionally, the memory
- *may be re-registered at any future time if a write within the same
- *chunk was requested again, even if you attempted to unregister it
- *here.
- *
- *@size < 0 : TODO, not yet supported
- *Unregister the memory NOW. This means that the caller does not
- *expect there to be any future RDMA transfers and we just want to 
clean
- *things up. This is used in case the upper layer owns the memory and
- *cannot wait for qemu_fclose() to occur.
+ *@size : Number of bytes to transfer
  *
  *@bytes_sent : User-specificed pointer to indicate how many bytes were
  *  sent. Usually, this will not be more than a few bytes of
@@ -3282,61 +3238,27 @@ static size_t qemu_rdma_save_page(QEMUFile *f, void 
*opaque,
 
 qemu_fflush(f);
 
-if (size > 0) {
-/*
- * Add this page to the current 'chunk'. If the chunk
- * is full, or the page doesn't belong to the current chunk,
- * an actual RDMA write will occur and a new chunk will be formed.
- */
-ret = qemu_rdma_write(f, rdma, block_offset, offset, size);
-if (ret < 0) {
-error_report("rdma migration: write error! %d", ret);
-goto err;
-}
-
-/*
- * We always return 1 bytes because the RDMA
- * protocol is completely asynchronous. We do not yet know
- * whether an  identified chunk is zero or not because we're
- * waiting for other pages to potentially be merged with
- * the current chunk. So, we have to call qemu_update_position()
- * later on when the actual write occurs.
- */
-if (bytes_sent) {
-*bytes_sent = 1;
-}
-} else {
-uint64_t index, chunk;
-
-/* TODO: Change QEMUFileOps prototype to be signed: size_t => long
-if (size < 0) {
-ret

[PULL 16/33] migration: hardcode assumption that QEMUFile is backed with QIOChannel

2022-06-22 Thread Dr. David Alan Gilbert (git)

From: Daniel P. Berrangé 

The only callers of qemu_fopen_ops pass 'true' for the 'has_ioc'
parameter, so hardcode this assumption in QEMUFile, by passing in
the QIOChannel object as a non-opaque parameter.

Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Daniel P. Berrangé 
Reviewed-by: Juan Quintela 
Signed-off-by: Juan Quintela 
Signed-off-by: Dr. David Alan Gilbert 
   dgilbert: Fixed long line
---
 migration/qemu-file-channel.c |  4 ++--
 migration/qemu-file.c | 35 +--
 migration/qemu-file.h |  2 +-
 3 files changed, 20 insertions(+), 21 deletions(-)

diff --git a/migration/qemu-file-channel.c b/migration/qemu-file-channel.c
index bb5a5752df..ce8eced417 100644
--- a/migration/qemu-file-channel.c
+++ b/migration/qemu-file-channel.c
@@ -184,11 +184,11 @@ static const QEMUFileOps channel_output_ops = {
 QEMUFile *qemu_fopen_channel_input(QIOChannel *ioc)
 {
 object_ref(OBJECT(ioc));
-return qemu_fopen_ops(ioc, &channel_input_ops, true);
+return qemu_fopen_ops(ioc, &channel_input_ops);
 }
 
 QEMUFile *qemu_fopen_channel_output(QIOChannel *ioc)
 {
 object_ref(OBJECT(ioc));
-return qemu_fopen_ops(ioc, &channel_output_ops, true);
+return qemu_fopen_ops(ioc, &channel_output_ops);
 }
diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index cdcb6e1788..30e2160041 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -37,7 +37,7 @@
 struct QEMUFile {
 const QEMUFileOps *ops;
 const QEMUFileHooks *hooks;
-void *opaque;
+QIOChannel *ioc;
 
 /*
  * Maximum amount of data in bytes to transfer during one
@@ -65,8 +65,6 @@ struct QEMUFile {
 Error *last_error_obj;
 /* has the file has been shutdown */
 bool shutdown;
-/* Whether opaque points to a QIOChannel */
-bool has_ioc;
 };
 
 /*
@@ -81,7 +79,7 @@ int qemu_file_shutdown(QEMUFile *f)
 if (!f->ops->shut_down) {
 return -ENOSYS;
 }
-ret = f->ops->shut_down(f->opaque, true, true, NULL);
+ret = f->ops->shut_down(f->ioc, true, true, NULL);
 
 if (!f->last_error) {
 qemu_file_set_error(f, -EIO);
@@ -98,7 +96,7 @@ QEMUFile *qemu_file_get_return_path(QEMUFile *f)
 if (!f->ops->get_return_path) {
 return NULL;
 }
-return f->ops->get_return_path(f->opaque);
+return f->ops->get_return_path(f->ioc);
 }
 
 bool qemu_file_mode_is_not_valid(const char *mode)
@@ -113,15 +111,15 @@ bool qemu_file_mode_is_not_valid(const char *mode)
 return false;
 }
 
-QEMUFile *qemu_fopen_ops(void *opaque, const QEMUFileOps *ops, bool has_ioc)
+QEMUFile *qemu_fopen_ops(QIOChannel *ioc, const QEMUFileOps *ops)
 {
 QEMUFile *f;
 
 f = g_new0(QEMUFile, 1);
 
-f->opaque = opaque;
+f->ioc = ioc;
 f->ops = ops;
-f->has_ioc = has_ioc;
+
 return f;
 }
 
@@ -242,7 +240,7 @@ void qemu_fflush(QEMUFile *f)
 }
 if (f->iovcnt > 0) {
 expect = iov_size(f->iov, f->iovcnt);
-ret = f->ops->writev_buffer(f->opaque, f->iov, f->iovcnt,
+ret = f->ops->writev_buffer(f->ioc, f->iov, f->iovcnt,
 f->total_transferred, &local_error);
 
 qemu_iovec_release_ram(f);
@@ -358,7 +356,7 @@ static ssize_t qemu_fill_buffer(QEMUFile *f)
 return 0;
 }
 
-len = f->ops->get_buffer(f->opaque, f->buf + pending, f->total_transferred,
+len = f->ops->get_buffer(f->ioc, f->buf + pending, f->total_transferred,
  IO_BUF_SIZE - pending, &local_error);
 if (len > 0) {
 f->buf_size += len;
@@ -394,7 +392,7 @@ int qemu_fclose(QEMUFile *f)
 ret = qemu_file_get_error(f);
 
 if (f->ops->close) {
-int ret2 = f->ops->close(f->opaque, NULL);
+int ret2 = f->ops->close(f->ioc, NULL);
 if (ret >= 0) {
 ret = ret2;
 }
@@ -861,18 +859,19 @@ void qemu_put_counted_string(QEMUFile *f, const char *str)
 void qemu_file_set_blocking(QEMUFile *f, bool block)
 {
 if (f->ops->set_blocking) {
-f->ops->set_blocking(f->opaque, block, NULL);
+f->ops->set_blocking(f->ioc, block, NULL);
 }
 }
 
 /*
- * Return the ioc object if it's a migration channel.  Note: it can return NULL
- * for callers passing in a non-migration qemufile.  E.g. see qemu_fopen_bdrv()
- * and its usage in e.g. load_snapshot().  So we need to check against NULL
- * before using it.  If without the check, migration_incoming_state_destroy()
- * could fail for load_snapshot().
+ * qemu_file_get_ioc:
+ *
+ * Get the ioc object for the file, without incrementing
+ * the reference count.
+ *
+ * Returns: the ioc object
  */
 QIOChannel *qemu_file_get_ioc(QEMUFile *file)
 {
-return file->has_ioc ? QIO_CHANNEL(file->opaque) : NULL;
+return file->ioc;
 }
diff --git a/migration/qemu-file.h b/migration/qemu-file.h
index 277f1d5a62..3a1ecc0e34 100644
--- a/migration/qemu-file.h
+++ b/migration/qemu-file.h
@@ -118,7 +118,7 @@ typedef struct QEMUFileHooks {
 QEMURamS

[PULL 14/33] migration: convert savevm to use QIOChannelBlock for VMState

2022-06-22 Thread Dr. David Alan Gilbert (git)

From: Daniel P. Berrangé 

With this change, all QEMUFile usage is backed by QIOChannel at
last.

Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Daniel P. Berrangé 
Reviewed-by: Juan Quintela 
Signed-off-by: Juan Quintela 
Signed-off-by: Dr. David Alan Gilbert 
  dgilbert: Wrap long lines
---
 migration/savevm.c | 44 ++++++--------------------------------------
 1 file changed, 6 insertions(+), 38 deletions(-)

diff --git a/migration/savevm.c b/migration/savevm.c
index 75d05f1a84..3e9612121a 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -35,6 +35,7 @@
 #include "migration/misc.h"
 #include "migration/register.h"
 #include "migration/global_state.h"
+#include "migration/channel-block.h"
 #include "ram.h"
 #include "qemu-file-channel.h"
 #include "qemu-file.h"
@@ -130,48 +131,15 @@ static struct mig_cmd_args {
 /***/
 /* savevm/loadvm support */
 
-static ssize_t block_writev_buffer(void *opaque, struct iovec *iov, int iovcnt,
-   int64_t pos, Error **errp)
-{
-int ret;
-QEMUIOVector qiov;
-
-qemu_iovec_init_external(&qiov, iov, iovcnt);
-ret = bdrv_writev_vmstate(opaque, &qiov, pos);
-if (ret < 0) {
-return ret;
-}
-
-return qiov.size;
-}
-
-static ssize_t block_get_buffer(void *opaque, uint8_t *buf, int64_t pos,
-size_t size, Error **errp)
-{
-return bdrv_load_vmstate(opaque, buf, pos, size);
-}
-
-static int bdrv_fclose(void *opaque, Error **errp)
-{
-return bdrv_flush(opaque);
-}
-
-static const QEMUFileOps bdrv_read_ops = {
-.get_buffer = block_get_buffer,
-.close =  bdrv_fclose
-};
-
-static const QEMUFileOps bdrv_write_ops = {
-.writev_buffer  = block_writev_buffer,
-.close  = bdrv_fclose
-};
-
 static QEMUFile *qemu_fopen_bdrv(BlockDriverState *bs, int is_writable)
 {
 if (is_writable) {
-return qemu_fopen_ops(bs, &bdrv_write_ops, false);
+return qemu_fopen_channel_output(
+   QIO_CHANNEL(qio_channel_block_new(bs)));
+} else {
+return qemu_fopen_channel_input(
+   QIO_CHANNEL(qio_channel_block_new(bs)));
 }
-return qemu_fopen_ops(bs, &bdrv_read_ops, false);
 }
 
 
-- 
2.36.1

[PULL 28/33] migration/dirtyrate: Refactor dirty page rate calculation

2022-06-22 Thread Dr. David Alan Gilbert (git)

From: Hyman Huang(黄勇) 

Abstract out the dirty log change logic into the function
global_dirty_log_change.

Abstract out the dirty page rate calculation logic via
dirty-ring into the function vcpu_calculate_dirtyrate.

Abstract out the mathematical dirty page rate calculation
into do_calculate_dirtyrate, decoupling it from DirtyStat.

Rename set_sample_page_period to dirty_stat_wait, which
is easier to understand and will be reused in dirtylimit.

Handle the CPU hotplug/unplug scenario during measurement of
the dirty page rate.

Export the utility functions for use outside migration.

Signed-off-by: Hyman Huang(黄勇) 
Reviewed-by: Peter Xu 
Message-Id: 
<1d65b53c19cfc7dca0114422129515055fa18fb8.1652931128.git.huang...@chinatelecom.cn>
Signed-off-by: Dr. David Alan Gilbert 
---
 include/sysemu/dirtyrate.h |  28 +
 migration/dirtyrate.c  | 227 +++--
 migration/dirtyrate.h  |   7 +-
 3 files changed, 174 insertions(+), 88 deletions(-)
 create mode 100644 include/sysemu/dirtyrate.h

diff --git a/include/sysemu/dirtyrate.h b/include/sysemu/dirtyrate.h
new file mode 100644
index 0000000000..4d3b9a4902
--- /dev/null
+++ b/include/sysemu/dirtyrate.h
@@ -0,0 +1,28 @@
+/*
+ * dirty page rate helper functions
+ *
+ * Copyright (c) 2022 CHINA TELECOM CO.,LTD.
+ *
+ * Authors:
+ *  Hyman Huang(黄勇) 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef QEMU_DIRTYRATE_H
+#define QEMU_DIRTYRATE_H
+
+typedef struct VcpuStat {
+int nvcpu; /* number of vcpu */
+DirtyRateVcpu *rates; /* array of dirty rate for each vcpu */
+} VcpuStat;
+
+int64_t vcpu_calculate_dirtyrate(int64_t calc_time_ms,
+ VcpuStat *stat,
+ unsigned int flag,
+ bool one_shot);
+
+void global_dirty_log_change(unsigned int flag,
+ bool start);
+#endif
diff --git a/migration/dirtyrate.c b/migration/dirtyrate.c
index aace12a787..795fab5c37 100644
--- a/migration/dirtyrate.c
+++ b/migration/dirtyrate.c
@@ -46,7 +46,7 @@ static struct DirtyRateStat DirtyStat;
 static DirtyRateMeasureMode dirtyrate_mode =
 DIRTY_RATE_MEASURE_MODE_PAGE_SAMPLING;
 
-static int64_t set_sample_page_period(int64_t msec, int64_t initial_time)
+static int64_t dirty_stat_wait(int64_t msec, int64_t initial_time)
 {
 int64_t current_time;
 
@@ -60,6 +60,132 @@ static int64_t set_sample_page_period(int64_t msec, int64_t initial_time)
 return msec;
 }
 
+static inline void record_dirtypages(DirtyPageRecord *dirty_pages,
+ CPUState *cpu, bool start)
+{
+if (start) {
+dirty_pages[cpu->cpu_index].start_pages = cpu->dirty_pages;
+} else {
+dirty_pages[cpu->cpu_index].end_pages = cpu->dirty_pages;
+}
+}
+
+static int64_t do_calculate_dirtyrate(DirtyPageRecord dirty_pages,
+  int64_t calc_time_ms)
+{
+uint64_t memory_size_MB;
+uint64_t increased_dirty_pages =
+dirty_pages.end_pages - dirty_pages.start_pages;
+
+memory_size_MB = (increased_dirty_pages * TARGET_PAGE_SIZE) >> 20;
+
+return memory_size_MB * 1000 / calc_time_ms;
+}
+
+void global_dirty_log_change(unsigned int flag, bool start)
+{
+qemu_mutex_lock_iothread();
+if (start) {
+memory_global_dirty_log_start(flag);
+} else {
+memory_global_dirty_log_stop(flag);
+}
+qemu_mutex_unlock_iothread();
+}
+
+/*
+ * global_dirty_log_sync
+ * 1. sync dirty log from kvm
+ * 2. stop dirty tracking if needed.
+ */
+static void global_dirty_log_sync(unsigned int flag, bool one_shot)
+{
+qemu_mutex_lock_iothread();
+memory_global_dirty_log_sync();
+if (one_shot) {
+memory_global_dirty_log_stop(flag);
+}
+qemu_mutex_unlock_iothread();
+}
+
+static DirtyPageRecord *vcpu_dirty_stat_alloc(VcpuStat *stat)
+{
+CPUState *cpu;
+DirtyPageRecord *records;
+int nvcpu = 0;
+
+CPU_FOREACH(cpu) {
+nvcpu++;
+}
+
+stat->nvcpu = nvcpu;
+stat->rates = g_malloc0(sizeof(DirtyRateVcpu) * nvcpu);
+
+records = g_malloc0(sizeof(DirtyPageRecord) * nvcpu);
+
+return records;
+}
+
+static void vcpu_dirty_stat_collect(VcpuStat *stat,
+DirtyPageRecord *records,
+bool start)
+{
+CPUState *cpu;
+
+CPU_FOREACH(cpu) {
+record_dirtypages(records, cpu, start);
+}
+}
+
+int64_t vcpu_calculate_dirtyrate(int64_t calc_time_ms,
+ VcpuStat *stat,
+ unsigned int flag,
+ bool one_shot)
+{
+DirtyPageRecord *records;
+int64_t init_time_ms;
+int64_t duration;
+int64_t dirtyrate;
+int i = 0;
+unsigned int gen_id;
+
+retry:
+init_time_ms = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+
+cpu_list_lock();
+gen_id = cpu_

< 5 6 7 8 9 10 11 12 13 14 >

901 - 1000 of 3600 matches

Mail list logo